/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>
#include <asm/smp.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6

atomic_t mce_entry;

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog = 1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
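
/*
 * Logging protocol, in short: a writer first reserves a free slot by
 * advancing mcelog.next with cmpxchg(), then copies its record in, and
 * only then sets ->finished behind a wmb().  Readers must treat
 * ->finished as the "record is complete" flag and may observe reserved
 * but still unfinished slots at any time; mce_read() below copes by
 * waiting briefly and then skipping such entries.
 */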

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG
	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/* Run the user-supplied trigger program when new events were logged.
   Called from the polling path, i.e. process context. */
static void do_mce_trigger(void)
{
	static atomic_t mce_logged;
	int events = atomic_read(&mce_events);
	if (events != atomic_read(&mce_logged) && trigger[0]) {
		/* Small race window, but should be harmless. */
		atomic_set(&mce_logged, events);
		call_usermodehelper(trigger, trigger_argv, NULL, -1);
	}
}

/*
 * The actual machine check handler
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	atomic_inc(&mce_entry);

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
	if (!banks)
		goto out2;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs) {
		/* Normal interrupt context here. Call trigger for any new
		   events. */
		do_mce_trigger();
		goto out;
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
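
/*
 * Note on the i*4 strides above: architecturally each MCE bank owns four
 * consecutive MSRs starting at MSR_IA32_MC0_CTL (CTL, STATUS, ADDR, MISC),
 * so bank i's status register is MSR_IA32_MC0_STATUS + i*4, and likewise
 * for the other three.
 */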

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	struct mce m;

	memset(&m, 0, sizeof(m));
	m.cpu = cpu;
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	rdtscll(m.tsc);
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);

static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(struct work_struct *work)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}

static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n",
		       NR_BANKS);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
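
/*
 * MCG_CAP layout relied on above: bits 0-7 hold the bank count, bit 8
 * (MCG_CTL_P) advertises MSR_IA32_MCG_CTL, bit 9 (MCG_EXT_P) advertises
 * the extended state MSRs, and bits 16-23 give their count.  The ">= 9"
 * check makes sure MCG_EIP is among them before trusting rip_msr.
 */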

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
		/* Lots of broken BIOS around that don't clear them
		   by default and leave crap in there. Don't log. */
		mce_bootlog = 0;
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;

		/* Wait briefly for a reserved slot to be finished by its
		   writer, then give up and drop the half-written entry. */
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
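
/*
 * Illustrative userspace sketch (not part of this driver): how a log
 * consumer such as mcelog(8) might drive this interface.  The sizes are
 * queried first because mce_read() only accepts full-log reads:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int rlen = 0, llen = 0;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &rlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &llen);
 *	char *buf = malloc(rlen * llen);
 *	ssize_t n = read(fd, buf, rlen * llen);
 *
 * A successful read returns n/rlen complete records and clears them from
 * the kernel buffer; MCE_GETCLEAR_FLAGS reports (and resets) overflow.
 */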

static const struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
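
/*
 * Example boot command lines accepted by the parsers above (illustrative):
 *
 *	mce=off		disable machine checks (can be re-enabled via /sys)
 *	mce=2		set tolerant=2, i.e. try to avoid panics
 *	mce=bootlog	log MCEs left over from before boot, even on AMD
 *	nomce		same effect as mce=off
 */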

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) {	\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf)						\
			return -EINVAL;					\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/* TBD should generate these dynamically based on number of available banks */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())

static ssize_t show_trigger(struct sys_device *s, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (p)
		*p = 0;	/* strip the trailing newline from sysfs writes */
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
	&attr_tolerant, &attr_check_interval, &attr_trigger,
	NULL
};
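
/*
 * These attributes appear per CPU under the "machinecheck" sysdev class,
 * typically as /sys/devices/system/machinecheck/machinecheckN/ (path
 * assumed from the class name registered above).  Illustrative usage:
 *
 *	echo 2 > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *	echo /sbin/mce-handler > /sys/devices/system/machinecheck/machinecheck0/trigger
 *
 * where /sbin/mce-handler stands in for whatever program should run on
 * new events.  Writes to bankNctl and check_interval additionally take
 * effect through mce_restart().
 */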

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		for (i = 0; mce_attributes[i]; i++)
			sysdev_create_file(&per_cpu(device_mce,cpu),
					   mce_attributes[i]);
	}
	return err;
}

static void mce_remove_device(unsigned int cpu)
{
	int i;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce,cpu),
				   mce_attributes[i]);
	sysdev_unregister(&per_cpu(device_mce,cpu));
	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);