/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog;
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also separates MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	smp_wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	smp_wmb();
	mcelog.entry[entry].finished = 1;
	smp_wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
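/*
 * A minimal sketch of the consumer side of this protocol (mce_read below
 * is the real consumer), assuming the reader runs in process context and
 * that consume() stands in for whatever handles a record. A slot is only
 * safe to copy once its ->finished flag is seen set; the smp_rmb() pairs
 * with the smp_wmb() above so the payload is visible before the flag.
 *
 *	unsigned i, end = rcu_dereference(mcelog.next);
 *	for (i = 0; i < end; i++) {
 *		if (!mcelog.entry[i].finished)
 *			continue;
 *		smp_rmb();
 *		consume(&mcelog.entry[i]);
 *	}
 */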
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
{
	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}
/*
 * The actual machine check handler.
 */
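/*
 * Calling conventions, as used elsewhere in this file: regs == NULL means
 * we were not called from the exception handler (polling timer or boot-time
 * sweep). error_code -1 requests logging of leftover boot errors, -2 means
 * clear the banks silently without logging; only error_code >= 0 stamps
 * the record with a fresh TSC.
 */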
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? Assume that the bank
		   with uncorrectable errors did it, and that there is only
		   a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0).

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a slight
		   risk of deadlocking. If you don't want that, don't
		   set tolerant >= 2. */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}
static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}
static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n",
		       NR_BANKS);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers. */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
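/*
 * Note on the bank[] masks written above: each set bit in an MCi_CTL MSR
 * enables reporting of one error source for that bank, so the ~0UL default
 * enables everything; mce_cpu_quirks() below clears individual bits to
 * suppress known-bogus sources.
 */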
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* Disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
	}
}
static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	default:
		break;
	}
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}
/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;

		/* Wait at most two jiffies for a racing writer to set the
		   finished flag, then give up on the entry. */
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the
	   synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
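/*
 * A minimal sketch of a userspace consumer of this interface, assuming the
 * /dev/mcelog node exists with the minor number defined above (includes and
 * error handling omitted for brevity):
 *
 *	struct mce records[MCE_LOG_LEN];
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ssize_t n = read(fd, records, sizeof(records));
 *	if (n > 0)
 *		printf("%zd machine check events\n",
 *		       n / sizeof(struct mce));
 *	close(fd);
 *
 * Note the read must ask for the full buffer or it fails with EINVAL, and
 * a successful read clears the log.
 */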
static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd,
		     unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
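/*
 * Userspace discovers the record size, log length and the overflow flag
 * through these ioctls; a hedged sketch (fd as in the read example above,
 * error handling omitted):
 *
 *	int rlen, llen, flags;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &rlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &llen);
 *	ioctl(fd, MCE_GETCLEAR_FLAGS, &flags);
 *	if (flags & (1 << MCE_OVERFLOW))
 *		fprintf(stderr, "mcelog buffer overflowed, events lost\n");
 */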
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog"))
		mce_bootlog = 1;
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
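/*
 * Examples, matching the parsing above: booting with "nomce" or "mce=off"
 * disables machine checks, "mce=bootlog" enables logging of boot-time MCEs,
 * and a bare number such as "mce=2" sets the tolerance level.
 */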
/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others are re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
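/*
 * With the sysdev class named "machinecheck" and per-CPU ids assigned in
 * mce_create_device() below, these attributes should appear as (paths
 * assumed from the standard sysdev layout):
 *
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/bank0ctl ... bank4ctl
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/tolerant
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/check_interval
 *
 * Writes take effect immediately; all except tolerant also trigger
 * mce_restart().
 */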
/* Per CPU sysdev init. All of the CPUs still share the same ctl bank. */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;

	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	}
	return err;
}
#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
static __init int mce_init_device(void)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);