/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog = 1;
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from other kernel messages to avoid bogus bug
 * reports.
 */

struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};
void mce_log(struct mce *mce)
{
        unsigned next, entry;

        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                /* The rmb forces the compiler to reload next in each
                   iteration */
                rmb();
                for (;;) {
                        /* When the buffer fills up discard new entries. Assume
                           that the earlier errors are the more interesting. */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW, &mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip. */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        /* Set finished last, so readers never see a half-written entry. */
        mcelog.entry[entry].finished = 1;
        wmb();

        if (!test_and_set_bit(0, &console_logged))
                notify_user = 1;
}
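#if 0
/*
 * Illustrative sketch, not compiled here: the same slot-claim protocol
 * mce_log() uses above, rewritten as a standalone userspace program with
 * GCC's __sync builtins standing in for the kernel's cmpxchg and memory
 * barriers. All names (demo_log, demo_entry, DEMO_LEN) are invented for
 * this example only.
 */
#include <stdio.h>

#define DEMO_LEN 32

struct demo_entry {
        int payload;
        volatile int finished;          /* set last, after the payload */
};

static struct demo_entry demo_buf[DEMO_LEN];
static volatile unsigned demo_next;

static int demo_log(int payload)
{
        unsigned entry, next;

        for (;;) {
                entry = demo_next;
                /* Skip slots that a slower writer already finished. */
                while (entry < DEMO_LEN && demo_buf[entry].finished)
                        entry++;
                if (entry >= DEMO_LEN)
                        return -1;      /* full: drop new, keep old errors */
                next = entry + 1;
                /* Claim the slot; retry if another writer raced us. */
                if (__sync_val_compare_and_swap(&demo_next, entry, next) == entry)
                        break;
        }
        demo_buf[entry].payload = payload;
        __sync_synchronize();           /* order payload before finished */
        demo_buf[entry].finished = 1;
        return entry;
}

int main(void)
{
        printf("logged at slot %d\n", demo_log(42));
        return 0;
}
#endif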
static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
        if (m->rip) {
                printk(KERN_EMERG
                       "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->rip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->rip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %Lx ", m->tsc);
        if (m->addr)
                printk("ADDR %Lx ", m->addr);
        if (m->misc)
                printk("MISC %Lx ", m->misc);
        printk("\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
        int i;

        oops_begin();
        for (i = 0; i < MCE_LOG_LEN; i++) {
                unsigned long tsc = mcelog.entry[i].tsc;
                if (time_before(tsc, start))
                        continue;
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)
                        backup = NULL;
        }
        if (backup)
                print_mce(backup);
        if (tolerant >= 3)
                printk("Fake panic: %s\n", msg);
        else
                panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
{
        return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
               test_bit(X86_FEATURE_MCA, &c->x86_capability);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->rip = regs->rip;
                m->cs = regs->cs;
        } else {
                m->rip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->rip);
                m->cs = 0;
        }
}
/*
 * The actual machine check handler
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
        struct mce m, panicm;
        int nowayout = (tolerant < 1);
        int kill_it = 0;
        u64 mcestart = 0;
        int i;
        int panicm_found = 0;

        if (regs)
                notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
        if (!banks)
                return;

        memset(&m, 0, sizeof(struct mce));
        m.cpu = hard_smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                kill_it = 1;

        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                if (m.status & MCI_STATUS_EN) {
                        /* In theory _OVER could be a nowayout too, but
                           assume any overflowed errors were not fatal. */
                        nowayout |= !!(m.status & MCI_STATUS_PCC);
                        kill_it |= !!(m.status & MCI_STATUS_UC);
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                if (error_code >= 0)
                        rdtscll(m.tsc);
                wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
                if (error_code != -2)
                        mce_log(&m);

                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
                   and that there is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }

                add_taint(TAINT_MACHINE_CHECK);
        }

        /* Never do anything final in the polling timer */
        if (!regs)
                goto out;

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
                panicm = m;
        if (nowayout)
                mce_panic("Machine check", &panicm, mcestart);
        if (kill_it) {
                int user_space = 0;

                if (m.mcgstatus & MCG_STATUS_RIPV)
                        user_space = panicm.rip && (panicm.cs & 3);

                /* When the machine was in user space and the CPU didn't get
                   confused it's normally not necessary to panic, unless you
                   are paranoid (tolerant == 0).

                   RED-PEN: could be more tolerant for MCEs in idle,
                   but most likely they occur at boot anyway, where
                   it is best to just halt the machine. */
                if ((!user_space && (panic_on_oops || tolerant < 2)) ||
                    (unsigned)current->pid <= 1)
                        mce_panic("Uncorrected machine check", &panicm, mcestart);

                /* do_exit takes an awful lot of locks and has a
                   slight risk of deadlocking. If you don't want that,
                   don't set tolerant >= 2. */
                if (tolerant < 3)
                        do_exit(SIGBUS);
        }

out:
        /* Last thing done in the machine check exception to clear state. */
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
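#if 0
/*
 * Illustrative sketch, not compiled here: how the architectural MCi_STATUS
 * bits tested in do_machine_check() above decode. The bit positions are
 * the documented x86 ones; the helper name decode_mci_status and the
 * sample value are invented for this example.
 */
#include <stdio.h>
#include <stdint.h>

#define MCI_VAL         (1ULL << 63)    /* register contains valid data */
#define MCI_OVER        (1ULL << 62)    /* a previous error was overwritten */
#define MCI_UC          (1ULL << 61)    /* uncorrected error */
#define MCI_EN          (1ULL << 60)    /* error reporting was enabled */
#define MCI_MISCV       (1ULL << 59)    /* MCi_MISC holds extra information */
#define MCI_ADDRV       (1ULL << 58)    /* MCi_ADDR holds the fault address */
#define MCI_PCC         (1ULL << 57)    /* processor context corrupt */

static void decode_mci_status(uint64_t status)
{
        if (!(status & MCI_VAL))
                return;                 /* nothing logged in this bank */
        printf("%s%s%s%s\n",
               (status & MCI_UC)   ? "uncorrected " : "corrected ",
               (status & MCI_PCC)  ? "context-corrupt " : "",
               (status & MCI_OVER) ? "overflowed " : "",
               (status & MCI_EN)   ? "(reporting enabled)" : "(reporting disabled)");
}

int main(void)
{
        /* Hypothetical status: valid, uncorrected, reporting enabled. */
        decode_mci_status(MCI_VAL | MCI_UC | MCI_EN);
        return 0;
}
#endif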
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
static void mcheck_check_cpu(void *info)
{
        if (mce_available(&current_cpu_data))
                do_machine_check(NULL, 0);
}
static void mcheck_timer(void *data)
{
        on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
        schedule_delayed_work(&mcheck_work, check_interval * HZ);

        /*
         * It's ok to read stale data here for notify_user and
         * console_logged as we'll simply get the updated versions
         * on the next mcheck_timer execution and atomic operations
         * on console_logged act as synchronization for notify_user
         * writes.
         */
        if (notify_user && console_logged) {
                notify_user = 0;
                clear_bit(0, &console_logged);
                printk(KERN_INFO "Machine check events logged\n");
        }
}
static __init int periodic_mcheck_init(void)
{
        if (check_interval)
                schedule_delayed_work(&mcheck_work, check_interval*HZ);
        return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
        u64 cap;
        int i;

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        banks = cap & 0xff;
        if (banks > NR_BANKS) {
                printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
                banks = NR_BANKS;
        }
        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        /* Log the machine checks left over from the previous reset.
           This also clears all the registers. */
        do_machine_check(NULL, mce_bootlog ? -1 : -2);

        set_in_cr4(X86_CR4_MCE);

        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}
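#if 0
/*
 * Illustrative sketch, not compiled here: the MCG_CAP fields mce_init()
 * consumes. Bank count lives in bits 7:0, MCG_CTL presence in bit 8,
 * extended-state presence in bit 9, and the extended register count in
 * bits 23:16. The cap value below is a made-up example, not read from
 * real hardware.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical: 5 banks, MCG_CTL present, 9 extended registers. */
        uint64_t cap = (9ULL << 16) | (1 << 9) | (1 << 8) | 5;

        printf("banks:     %llu\n", (unsigned long long)(cap & 0xff));
        printf("MCG_CTL:   %s\n", (cap & (1 << 8)) ? "present" : "absent");
        printf("ext state: %s\n", (cap & (1 << 9)) ? "present" : "absent");
        printf("ext count: %llu\n", (unsigned long long)((cap >> 16) & 0xff));
        return 0;
}
#endif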
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
                /* disable GART TBL walk error reporting, which trips off
                   incorrectly with the IOMMU & 3ware & Cerberus. */
                clear_bit(10, &bank[4]);
                /* Lots of broken BIOSes don't clear these banks by
                   default and leave stale data in them. Don't log it. */
                mce_bootlog = 0;
        }
}
static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
                break;
        default:
                break;
        }
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

        mce_cpu_quirks(c);

        if (mce_dont_init ||
            cpu_test_and_set(smp_processor_id(), mce_cpus) ||
            !mce_available(c))
                return;

        mce_init(NULL);
        mce_cpu_features(c);
}
/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;
        rdtscll(cpu_tsc[smp_processor_id()]);
}
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
        unsigned long *cpu_tsc;
        static DECLARE_MUTEX(mce_read_sem);
        unsigned next;
        char __user *buf = ubuf;
        int i, err;

        cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        down(&mce_read_sem);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                up(&mce_read_sem);
                kfree(cpu_tsc);
                return -EINVAL;
        }

        err = 0;
        for (i = 0; i < next; i++) {
                unsigned long start = jiffies;
                /* Give the writer a short grace period to finish the
                   entry, then give up on it. */
                while (!mcelog.entry[i].finished) {
                        if (!time_before(jiffies, start + 2)) {
                                memset(mcelog.entry + i, 0, sizeof(struct mce));
                                break;
                        }
                        cpu_relax();
                }
                smp_rmb();
                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
                buf += sizeof(struct mce);
        }

        memset(mcelog.entry, 0, next * sizeof(struct mce));
        mcelog.next = 0;

        synchronize_sched();

        /* Collect entries that were still getting written before the synchronize. */
        on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        up(&mce_read_sem);
        kfree(cpu_tsc);
        return err ? -EFAULT : buf - ubuf;
}
static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;
                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}
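#if 0
/*
 * Illustrative sketch, not compiled here: how a userspace consumer such
 * as an mcelog daemon might drive this device. Assumes the MCE_* ioctls
 * and struct mce definitions exported by the kernel's <asm/mce.h>; error
 * handling is abbreviated.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mce.h>

int main(void)
{
        int fd, recsz = 0, loglen = 0;
        char *buf;
        ssize_t n;

        fd = open("/dev/mcelog", O_RDONLY);
        if (fd < 0)
                return 1;
        ioctl(fd, MCE_GET_RECORD_LEN, &recsz);
        ioctl(fd, MCE_GET_LOG_LEN, &loglen);

        /* mce_read() only accepts full-buffer reads, so size accordingly. */
        buf = malloc(recsz * loglen);
        if (!buf)
                return 1;
        n = read(fd, buf, recsz * loglen);
        if (n > 0)
                printf("got %zd bytes (%zd records)\n", n, n / recsz);
        free(buf);
        close(fd);
        return 0;
}
#endif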
static struct file_operations mce_chrdev_ops = {
        .read = mce_read,
        .ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   through the sysfs files below.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
        if (*str == '=')
                str++;
        if (!strcmp(str, "off"))
                mce_dont_init = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                mce_bootlog = str[0] == 'b';
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else
                printk("mce= argument %s ignored. Please use /sys\n", str);
        return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
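/*
 * Example command lines (illustrative), matching the parser above:
 *   mce=off       disable machine check handling entirely
 *   mce=2         set tolerant level 2 (try to avoid panics)
 *   mce=bootlog   log machine checks left over from before boot
 *   mce=nobootlog suppress those leftover logs
 */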
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
        mce_init(NULL);
        return 0;
}
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        if (check_interval)
                cancel_delayed_work(&mcheck_work);
        /* Timer race is harmless here */
        on_each_cpu(mce_init, NULL, 1, 1);
        if (check_interval)
                schedule_delayed_work(&mcheck_work, check_interval*HZ);
}
static struct sysdev_class mce_sysclass = {
        .resume = mce_resume,
        set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
                return sprintf(buf, "%lx\n", (unsigned long)var); \
        } \
        static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
                char *end; \
                unsigned long new = simple_strtoul(buf, &end, 0); \
                if (end == buf) return -EINVAL; \
                var = new; \
                start; \
                return end-buf; \
        } \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
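#if 0
/*
 * For reference, ACCESSOR(tolerant, tolerant, ) below expands to roughly
 * the following (reconstructed by hand, modulo whitespace):
 */
static ssize_t show_tolerant(struct sys_device *s, char *buf)
{
        return sprintf(buf, "%lx\n", (unsigned long)tolerant);
}
static ssize_t set_tolerant(struct sys_device *s, const char *buf, size_t siz)
{
        char *end;
        unsigned long new = simple_strtoul(buf, &end, 0);
        if (end == buf)
                return -EINVAL;
        tolerant = new;
        ;                       /* empty "start" hook for this accessor */
        return end - buf;
}
static SYSDEV_ATTR(tolerant, 0644, show_tolerant, set_tolerant);
#endif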
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
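/*
 * The resulting sysfs files can be poked from a shell, e.g. (paths
 * illustrative, one machinecheckN directory per CPU):
 *   cat /sys/devices/system/machinecheck/machinecheck0/tolerant
 *   echo 0x1f > /sys/devices/system/machinecheck/machinecheck0/bank4ctl
 * Writes to the bank*ctl and check_interval files trigger mce_restart().
 */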
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;

        if (!mce_available(&cpu_data[cpu]))
                return -EIO;

        per_cpu(device_mce,cpu).id = cpu;
        per_cpu(device_mce,cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(device_mce,cpu));

        if (!err) {
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
        }
        return err;
}
#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
        sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        switch (action) {
        case CPU_ONLINE:
                mce_create_device(cpu);
                break;
#ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
                mce_remove_device(cpu);
                break;
#endif
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};
static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;
        err = sysdev_class_register(&mce_sysclass);

        for_each_online_cpu(i) {
                mce_create_device(i);
        }

        register_cpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);
        return err;
}

device_initcall(mce_init_device);