2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
8 #include <linux/init.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/sched.h>
12 #include <linux/string.h>
13 #include <linux/rcupdate.h>
14 #include <linux/kallsyms.h>
15 #include <linux/sysdev.h>
16 #include <linux/miscdevice.h>
18 #include <linux/capability.h>
19 #include <linux/cpu.h>
20 #include <linux/percpu.h>
21 #include <linux/ctype.h>
22 #include <asm/processor.h>
25 #include <asm/kdebug.h>
26 #include <asm/uaccess.h>
29 #define MISC_MCELOG_MINOR 227
34 static int mce_dont_init;
36 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
37 3: never panic or exit (for testing only) */
38 static int tolerant = 1;
40 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
41 static unsigned long console_logged;
42 static int notify_user;
44 static int mce_bootlog = 1;
47 * Lockless MCE logging infrastructure.
48 * This avoids deadlocks on printk locks without having to break locks. Also
49 * separate MCEs from kernel messages to avoid bogus bug reports.
52 struct mce_log mcelog = {
57 void mce_log(struct mce *mce)
63 entry = rcu_dereference(mcelog.next);
64 /* The rmb forces the compiler to reload next in each
68 /* When the buffer fills up discard new entries. Assume
69 that the earlier errors are the more interesting. */
70 if (entry >= MCE_LOG_LEN) {
71 set_bit(MCE_OVERFLOW, &mcelog.flags);
74 /* Old left over entry. Skip. */
75 if (mcelog.entry[entry].finished) {
83 if (cmpxchg(&mcelog.next, entry, next) == entry)
86 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
88 mcelog.entry[entry].finished = 1;
91 if (!test_and_set_bit(0, &console_logged))
95 static void print_mce(struct mce *m)
97 printk(KERN_EMERG "\n"
98 KERN_EMERG "HARDWARE ERROR\n"
100 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
101 m->cpu, m->mcgstatus, m->bank, m->status);
104 "RIP%s %02x:<%016Lx> ",
105 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
107 if (m->cs == __KERNEL_CS)
108 print_symbol("{%s}", m->rip);
111 printk(KERN_EMERG "TSC %Lx ", m->tsc);
113 printk("ADDR %Lx ", m->addr);
115 printk("MISC %Lx ", m->misc);
117 printk(KERN_EMERG "This is not a software problem!\n");
119 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
122 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
126 for (i = 0; i < MCE_LOG_LEN; i++) {
127 unsigned long tsc = mcelog.entry[i].tsc;
128 if (time_before(tsc, start))
130 print_mce(&mcelog.entry[i]);
131 if (backup && mcelog.entry[i].tsc == backup->tsc)
137 printk("Fake panic: %s\n", msg);
142 static int mce_available(struct cpuinfo_x86 *c)
144 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
147 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
149 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
157 /* Assume the RIP in the MSR is exact. Is this true? */
158 m->mcgstatus |= MCG_STATUS_EIPV;
159 rdmsrl(rip_msr, m->rip);
165 * The actual machine check handler
168 void do_machine_check(struct pt_regs * regs, long error_code)
170 struct mce m, panicm;
171 int nowayout = (tolerant < 1);
175 int panicm_found = 0;
177 atomic_inc(&mce_entry);
180 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
184 memset(&m, 0, sizeof(struct mce));
185 m.cpu = safe_smp_processor_id();
186 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
187 if (!(m.mcgstatus & MCG_STATUS_RIPV))
193 for (i = 0; i < banks; i++) {
202 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
203 if ((m.status & MCI_STATUS_VAL) == 0)
206 if (m.status & MCI_STATUS_EN) {
207 /* In theory _OVER could be a nowayout too, but
208 assume any overflowed errors were not fatal. */
209 nowayout |= !!(m.status & MCI_STATUS_PCC);
210 kill_it |= !!(m.status & MCI_STATUS_UC);
213 if (m.status & MCI_STATUS_MISCV)
214 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
215 if (m.status & MCI_STATUS_ADDRV)
216 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
218 mce_get_rip(&m, regs);
221 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
222 if (error_code != -2)
225 /* Did this bank cause the exception? */
226 /* Assume that the bank with uncorrectable errors did it,
227 and that there is only a single one. */
228 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
233 add_taint(TAINT_MACHINE_CHECK);
236 /* Never do anything final in the polling timer */
240 /* If we didn't find an uncorrectable error, pick
241 the last one (shouldn't happen, just being safe). */
245 mce_panic("Machine check", &panicm, mcestart);
249 if (m.mcgstatus & MCG_STATUS_RIPV)
250 user_space = panicm.rip && (panicm.cs & 3);
252 /* When the machine was in user space and the CPU didn't get
253 confused it's normally not necessary to panic, unless you
254 are paranoid (tolerant == 0)
256 RED-PEN could be more tolerant for MCEs in idle,
257 but most likely they occur at boot anyway, where
258 it is best to just halt the machine. */
259 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
260 (unsigned)current->pid <= 1)
261 mce_panic("Uncorrected machine check", &panicm, mcestart);
263 /* do_exit takes an awful lot of locks and has a
264 slight risk of deadlocking. If you don't want that
265 don't set tolerant >= 2 */
271 /* Last thing done in the machine check exception to clear state. */
272 wrmsrl(MSR_IA32_MCG_STATUS, 0);
274 atomic_dec(&mce_entry);
278 * Periodic polling timer for "silent" machine check errors.
281 static int check_interval = 5 * 60; /* 5 minutes */
282 static void mcheck_timer(void *data);
283 static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
285 static void mcheck_check_cpu(void *info)
287 if (mce_available(&current_cpu_data))
288 do_machine_check(NULL, 0);
291 static void mcheck_timer(void *data)
293 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
294 schedule_delayed_work(&mcheck_work, check_interval * HZ);
297 * It's ok to read stale data here for notify_user and
298 * console_logged as we'll simply get the updated versions
299 * on the next mcheck_timer execution and atomic operations
300 * on console_logged act as synchronization for notify_user
303 if (notify_user && console_logged) {
305 clear_bit(0, &console_logged);
306 printk(KERN_INFO "Machine check events logged\n");
311 static __init int periodic_mcheck_init(void)
314 schedule_delayed_work(&mcheck_work, check_interval*HZ);
317 __initcall(periodic_mcheck_init);
321 * Initialize Machine Checks for a CPU.
323 static void mce_init(void *dummy)
328 rdmsrl(MSR_IA32_MCG_CAP, cap);
330 if (banks > NR_BANKS) {
331 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
334 /* Use accurate RIP reporting if available. */
335 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
336 rip_msr = MSR_IA32_MCG_EIP;
338 /* Log the machine checks left over from the previous reset.
339 This also clears all registers */
340 do_machine_check(NULL, mce_bootlog ? -1 : -2);
342 set_in_cr4(X86_CR4_MCE);
345 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
347 for (i = 0; i < banks; i++) {
348 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
349 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
353 /* Add per CPU specific workarounds here */
354 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
356 /* This should be disabled by the BIOS, but isn't always */
357 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
358 /* disable GART TBL walk error reporting, which trips off
359 incorrectly with the IOMMU & 3ware & Cerberus. */
360 clear_bit(10, &bank[4]);
361 /* Lots of broken BIOS around that don't clear them
362 by default and leave crap in there. Don't log. */
368 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
370 switch (c->x86_vendor) {
371 case X86_VENDOR_INTEL:
372 mce_intel_feature_init(c);
375 mce_amd_feature_init(c);
383 * Called for each booted CPU to set up machine checks.
384 * Must be called with preempt off.
386 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
388 static cpumask_t mce_cpus = CPU_MASK_NONE;
393 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
402 * Character device to read and clear the MCE log.
405 static void collect_tscs(void *data)
407 unsigned long *cpu_tsc = (unsigned long *)data;
408 rdtscll(cpu_tsc[smp_processor_id()]);
411 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
413 unsigned long *cpu_tsc;
414 static DECLARE_MUTEX(mce_read_sem);
416 char __user *buf = ubuf;
419 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
424 next = rcu_dereference(mcelog.next);
426 /* Only supports full reads right now */
427 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
434 for (i = 0; i < next; i++) {
435 unsigned long start = jiffies;
436 while (!mcelog.entry[i].finished) {
437 if (!time_before(jiffies, start + 2)) {
438 memset(mcelog.entry + i,0, sizeof(struct mce));
444 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
445 buf += sizeof(struct mce);
448 memset(mcelog.entry, 0, next * sizeof(struct mce));
453 /* Collect entries that were still getting written before the synchronize. */
455 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
456 for (i = next; i < MCE_LOG_LEN; i++) {
457 if (mcelog.entry[i].finished &&
458 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
459 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
461 buf += sizeof(struct mce);
462 memset(&mcelog.entry[i], 0, sizeof(struct mce));
467 return err ? -EFAULT : buf - ubuf;
470 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
472 int __user *p = (int __user *)arg;
473 if (!capable(CAP_SYS_ADMIN))
476 case MCE_GET_RECORD_LEN:
477 return put_user(sizeof(struct mce), p);
478 case MCE_GET_LOG_LEN:
479 return put_user(MCE_LOG_LEN, p);
480 case MCE_GETCLEAR_FLAGS: {
483 flags = mcelog.flags;
484 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
485 return put_user(flags, p);
492 static struct file_operations mce_chrdev_ops = {
497 static struct miscdevice mce_log_device = {
504 * Old style boot options parsing. Only for compatibility.
507 static int __init mcheck_disable(char *str)
513 /* mce=off disables machine check. Note you can reenable it later
515 mce=TOLERANCELEVEL (number, see above)
516 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
517 mce=nobootlog Don't log MCEs from before booting. */
518 static int __init mcheck_enable(char *str)
522 if (!strcmp(str, "off"))
524 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
525 mce_bootlog = str[0] == 'b';
526 else if (isdigit(str[0]))
527 get_option(&str, &tolerant);
529 printk("mce= argument %s ignored. Please use /sys", str);
533 __setup("nomce", mcheck_disable);
534 __setup("mce", mcheck_enable);
540 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
541 Only one CPU is active at this time, the others get readded later using
543 static int mce_resume(struct sys_device *dev)
549 /* Reinit MCEs after user configuration changes */
550 static void mce_restart(void)
553 cancel_delayed_work(&mcheck_work);
554 /* Timer race is harmless here */
555 on_each_cpu(mce_init, NULL, 1, 1);
557 schedule_delayed_work(&mcheck_work, check_interval*HZ);
560 static struct sysdev_class mce_sysclass = {
561 .resume = mce_resume,
562 set_kset_name("machinecheck"),
565 DEFINE_PER_CPU(struct sys_device, device_mce);
567 /* Why are there no generic functions for this? */
568 #define ACCESSOR(name, var, start) \
569 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
570 return sprintf(buf, "%lx\n", (unsigned long)var); \
572 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
574 unsigned long new = simple_strtoul(buf, &end, 0); \
575 if (end == buf) return -EINVAL; \
580 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
582 ACCESSOR(bank0ctl,bank[0],mce_restart())
583 ACCESSOR(bank1ctl,bank[1],mce_restart())
584 ACCESSOR(bank2ctl,bank[2],mce_restart())
585 ACCESSOR(bank3ctl,bank[3],mce_restart())
586 ACCESSOR(bank4ctl,bank[4],mce_restart())
587 ACCESSOR(bank5ctl,bank[5],mce_restart())
588 static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
589 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
590 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
591 ACCESSOR(tolerant,tolerant,)
592 ACCESSOR(check_interval,check_interval,mce_restart())
594 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */
595 static __cpuinit int mce_create_device(unsigned int cpu)
599 if (!mce_available(&cpu_data[cpu]))
602 per_cpu(device_mce,cpu).id = cpu;
603 per_cpu(device_mce,cpu).cls = &mce_sysclass;
605 err = sysdev_register(&per_cpu(device_mce,cpu));
608 for (i = 0; i < banks; i++)
609 sysdev_create_file(&per_cpu(device_mce,cpu),
611 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
612 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
617 #ifdef CONFIG_HOTPLUG_CPU
618 static void mce_remove_device(unsigned int cpu)
622 for (i = 0; i < banks; i++)
623 sysdev_remove_file(&per_cpu(device_mce,cpu),
625 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
626 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
627 sysdev_unregister(&per_cpu(device_mce,cpu));
630 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
632 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
634 unsigned int cpu = (unsigned long)hcpu;
638 mce_create_device(cpu);
641 mce_remove_device(cpu);
647 static struct notifier_block mce_cpu_notifier = {
648 .notifier_call = mce_cpu_callback,
652 static __init int mce_init_device(void)
657 if (!mce_available(&boot_cpu_data))
659 err = sysdev_class_register(&mce_sysclass);
661 for_each_online_cpu(i) {
662 mce_create_device(i);
665 register_hotcpu_notifier(&mce_cpu_notifier);
666 misc_register(&mce_log_device);
670 device_initcall(mce_init_device);