2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
8 #include <linux/init.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/sched.h>
12 #include <linux/string.h>
13 #include <linux/rcupdate.h>
14 #include <linux/kallsyms.h>
15 #include <linux/sysdev.h>
16 #include <linux/miscdevice.h>
18 #include <linux/capability.h>
19 #include <linux/cpu.h>
20 #include <linux/percpu.h>
21 #include <linux/ctype.h>
22 #include <asm/processor.h>
25 #include <asm/kdebug.h>
26 #include <asm/uaccess.h>
29 #define MISC_MCELOG_MINOR 227
32 static int mce_dont_init;
34 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
35 3: never panic or exit (for testing only) */
36 static int tolerant = 1;
38 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
39 static unsigned long console_logged;
40 static int notify_user;
42 static int mce_bootlog = 1;
45 * Lockless MCE logging infrastructure.
46 * This avoids deadlocks on printk locks without having to break locks. Also
47 * separates MCEs from kernel messages to avoid bogus bug reports.
50 struct mce_log mcelog = {
55 void mce_log(struct mce *mce)
61 entry = rcu_dereference(mcelog.next);
62 /* The rmb forces the compiler to reload next in each
66 /* When the buffer fills up discard new entries. Assume
67 that the earlier errors are the more interesting. */
68 if (entry >= MCE_LOG_LEN) {
69 set_bit(MCE_OVERFLOW, &mcelog.flags);
72 /* Old left over entry. Skip. */
73 if (mcelog.entry[entry].finished) {
81 if (cmpxchg(&mcelog.next, entry, next) == entry)
84 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
86 mcelog.entry[entry].finished = 1;
89 if (!test_and_set_bit(0, &console_logged))
93 static void print_mce(struct mce *m)
95 printk(KERN_EMERG "\n"
96 KERN_EMERG "HARDWARE ERROR\n"
98 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
99 m->cpu, m->mcgstatus, m->bank, m->status);
102 "RIP%s %02x:<%016Lx> ",
103 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
105 if (m->cs == __KERNEL_CS)
106 print_symbol("{%s}", m->rip);
109 printk(KERN_EMERG "TSC %Lx ", m->tsc);
111 printk("ADDR %Lx ", m->addr);
113 printk("MISC %Lx ", m->misc);
115 printk(KERN_EMERG "This is not a software problem!\n");
117 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
120 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
124 for (i = 0; i < MCE_LOG_LEN; i++) {
125 unsigned long tsc = mcelog.entry[i].tsc;
126 if (time_before(tsc, start))
128 print_mce(&mcelog.entry[i]);
129 if (backup && mcelog.entry[i].tsc == backup->tsc)
135 printk("Fake panic: %s\n", msg);
140 static int mce_available(struct cpuinfo_x86 *c)
142 return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
143 test_bit(X86_FEATURE_MCA, &c->x86_capability);
146 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
148 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
156 /* Assume the RIP in the MSR is exact. Is this true? */
157 m->mcgstatus |= MCG_STATUS_EIPV;
158 rdmsrl(rip_msr, m->rip);
164 * The actual machine check handler
167 void do_machine_check(struct pt_regs * regs, long error_code)
169 struct mce m, panicm;
170 int nowayout = (tolerant < 1);
174 int panicm_found = 0;
177 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
181 memset(&m, 0, sizeof(struct mce));
182 m.cpu = safe_smp_processor_id();
183 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
184 if (!(m.mcgstatus & MCG_STATUS_RIPV))
190 for (i = 0; i < banks; i++) {
199 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
200 if ((m.status & MCI_STATUS_VAL) == 0)
203 if (m.status & MCI_STATUS_EN) {
204 /* In theory _OVER could be a nowayout too, but
205 assume any overflowed errors were not fatal. */
206 nowayout |= !!(m.status & MCI_STATUS_PCC);
207 kill_it |= !!(m.status & MCI_STATUS_UC);
210 if (m.status & MCI_STATUS_MISCV)
211 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
212 if (m.status & MCI_STATUS_ADDRV)
213 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
215 mce_get_rip(&m, regs);
218 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
219 if (error_code != -2)
222 /* Did this bank cause the exception? */
223 /* Assume that the bank with uncorrectable errors did it,
224 and that there is only a single one. */
225 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
230 add_taint(TAINT_MACHINE_CHECK);
233 /* Never do anything final in the polling timer */
237 /* If we didn't find an uncorrectable error, pick
238 the last one (shouldn't happen, just being safe). */
242 mce_panic("Machine check", &panicm, mcestart);
246 if (m.mcgstatus & MCG_STATUS_RIPV)
247 user_space = panicm.rip && (panicm.cs & 3);
249 /* When the machine was in user space and the CPU didn't get
250 confused it's normally not necessary to panic, unless you
251 are paranoid (tolerant == 0)
253 RED-PEN could be more tolerant for MCEs in idle,
254 but most likely they occur at boot anyways, where
255 it is best to just halt the machine. */
256 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
257 (unsigned)current->pid <= 1)
258 mce_panic("Uncorrected machine check", &panicm, mcestart);
260 /* do_exit takes an awful lot of locks and has a
261 slight risk of deadlocking. If you don't want that
262 don't set tolerant >= 2 */
268 /* Last thing done in the machine check exception to clear state. */
269 wrmsrl(MSR_IA32_MCG_STATUS, 0);
273 * Periodic polling timer for "silent" machine check errors.
276 static int check_interval = 5 * 60; /* 5 minutes */
277 static void mcheck_timer(void *data);
278 static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
280 static void mcheck_check_cpu(void *info)
282 if (mce_available(&current_cpu_data))
283 do_machine_check(NULL, 0);
286 static void mcheck_timer(void *data)
288 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
289 schedule_delayed_work(&mcheck_work, check_interval * HZ);
292 * It's ok to read stale data here for notify_user and
293 * console_logged as we'll simply get the updated versions
294 * on the next mcheck_timer execution and atomic operations
295 * on console_logged act as synchronization for notify_user
298 if (notify_user && console_logged) {
300 clear_bit(0, &console_logged);
301 printk(KERN_INFO "Machine check events logged\n");
306 static __init int periodic_mcheck_init(void)
309 schedule_delayed_work(&mcheck_work, check_interval*HZ);
312 __initcall(periodic_mcheck_init);
316 * Initialize Machine Checks for a CPU.
318 static void mce_init(void *dummy)
323 rdmsrl(MSR_IA32_MCG_CAP, cap);
325 if (banks > NR_BANKS) {
326 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
329 /* Use accurate RIP reporting if available. */
330 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
331 rip_msr = MSR_IA32_MCG_EIP;
333 /* Log the machine checks left over from the previous reset.
334 This also clears all registers */
335 do_machine_check(NULL, mce_bootlog ? -1 : -2);
337 set_in_cr4(X86_CR4_MCE);
340 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
342 for (i = 0; i < banks; i++) {
343 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
344 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
348 /* Add per CPU specific workarounds here */
349 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
351 /* This should be disabled by the BIOS, but isn't always */
352 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
353 /* disable GART TBL walk error reporting, which trips off
354 incorrectly with the IOMMU & 3ware & Cerberus. */
355 clear_bit(10, &bank[4]);
356 /* Lots of broken BIOS around that don't clear them
357 by default and leave crap in there. Don't log. */
363 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
365 switch (c->x86_vendor) {
366 case X86_VENDOR_INTEL:
367 mce_intel_feature_init(c);
370 mce_amd_feature_init(c);
378 * Called for each booted CPU to set up machine checks.
379 * Must be called with preempt off.
381 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
383 static cpumask_t mce_cpus = CPU_MASK_NONE;
388 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
397 * Character device to read and clear the MCE log.
400 static void collect_tscs(void *data)
402 unsigned long *cpu_tsc = (unsigned long *)data;
403 rdtscll(cpu_tsc[smp_processor_id()]);
406 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
408 unsigned long *cpu_tsc;
409 static DECLARE_MUTEX(mce_read_sem);
411 char __user *buf = ubuf;
414 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
419 next = rcu_dereference(mcelog.next);
421 /* Only supports full reads right now */
422 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
429 for (i = 0; i < next; i++) {
430 unsigned long start = jiffies;
431 while (!mcelog.entry[i].finished) {
432 if (!time_before(jiffies, start + 2)) {
433 memset(mcelog.entry + i,0, sizeof(struct mce));
439 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
440 buf += sizeof(struct mce);
443 memset(mcelog.entry, 0, next * sizeof(struct mce));
448 /* Collect entries that were still getting written before the synchronize. */
450 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
451 for (i = next; i < MCE_LOG_LEN; i++) {
452 if (mcelog.entry[i].finished &&
453 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
454 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
456 buf += sizeof(struct mce);
457 memset(&mcelog.entry[i], 0, sizeof(struct mce));
462 return err ? -EFAULT : buf - ubuf;
465 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
467 int __user *p = (int __user *)arg;
468 if (!capable(CAP_SYS_ADMIN))
471 case MCE_GET_RECORD_LEN:
472 return put_user(sizeof(struct mce), p);
473 case MCE_GET_LOG_LEN:
474 return put_user(MCE_LOG_LEN, p);
475 case MCE_GETCLEAR_FLAGS: {
478 flags = mcelog.flags;
479 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
480 return put_user(flags, p);
487 static struct file_operations mce_chrdev_ops = {
492 static struct miscdevice mce_log_device = {
499 * Old style boot options parsing. Only for compatibility.
502 static int __init mcheck_disable(char *str)
508 /* mce=off disables machine check. Note you can reenable it later
510 mce=TOLERANCELEVEL (number, see above)
511 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
512 mce=nobootlog Don't log MCEs from before booting. */
513 static int __init mcheck_enable(char *str)
517 if (!strcmp(str, "off"))
519 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
520 mce_bootlog = str[0] == 'b';
521 else if (isdigit(str[0]))
522 get_option(&str, &tolerant);
524 printk("mce= argument %s ignored. Please use /sys", str);
528 __setup("nomce", mcheck_disable);
529 __setup("mce", mcheck_enable);
535 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
536 Only one CPU is active at this time, the others get readded later using
538 static int mce_resume(struct sys_device *dev)
544 /* Reinit MCEs after user configuration changes */
545 static void mce_restart(void)
548 cancel_delayed_work(&mcheck_work);
549 /* Timer race is harmless here */
550 on_each_cpu(mce_init, NULL, 1, 1);
552 schedule_delayed_work(&mcheck_work, check_interval*HZ);
555 static struct sysdev_class mce_sysclass = {
556 .resume = mce_resume,
557 set_kset_name("machinecheck"),
560 static DEFINE_PER_CPU(struct sys_device, device_mce);
562 /* Why are there no generic functions for this? */
563 #define ACCESSOR(name, var, start) \
564 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
565 return sprintf(buf, "%lx\n", (unsigned long)var); \
567 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
569 unsigned long new = simple_strtoul(buf, &end, 0); \
570 if (end == buf) return -EINVAL; \
575 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
577 ACCESSOR(bank0ctl,bank[0],mce_restart())
578 ACCESSOR(bank1ctl,bank[1],mce_restart())
579 ACCESSOR(bank2ctl,bank[2],mce_restart())
580 ACCESSOR(bank3ctl,bank[3],mce_restart())
581 ACCESSOR(bank4ctl,bank[4],mce_restart())
582 ACCESSOR(bank5ctl,bank[5],mce_restart())
583 static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
584 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
585 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
586 ACCESSOR(tolerant,tolerant,)
587 ACCESSOR(check_interval,check_interval,mce_restart())
589 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */
590 static __cpuinit int mce_create_device(unsigned int cpu)
594 if (!mce_available(&cpu_data[cpu]))
597 per_cpu(device_mce,cpu).id = cpu;
598 per_cpu(device_mce,cpu).cls = &mce_sysclass;
600 err = sysdev_register(&per_cpu(device_mce,cpu));
603 for (i = 0; i < banks; i++)
604 sysdev_create_file(&per_cpu(device_mce,cpu),
606 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
607 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
612 #ifdef CONFIG_HOTPLUG_CPU
613 static __cpuinit void mce_remove_device(unsigned int cpu)
617 for (i = 0; i < banks; i++)
618 sysdev_remove_file(&per_cpu(device_mce,cpu),
620 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
621 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
622 sysdev_unregister(&per_cpu(device_mce,cpu));
626 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
628 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
630 unsigned int cpu = (unsigned long)hcpu;
634 mce_create_device(cpu);
636 #ifdef CONFIG_HOTPLUG_CPU
638 mce_remove_device(cpu);
645 static struct notifier_block mce_cpu_notifier = {
646 .notifier_call = mce_cpu_callback,
649 static __init int mce_init_device(void)
654 if (!mce_available(&boot_cpu_data))
656 err = sysdev_class_register(&mce_sysclass);
658 for_each_online_cpu(i) {
659 mce_create_device(i);
662 register_cpu_notifier(&mce_cpu_notifier);
663 misc_register(&mce_log_device);
667 device_initcall(mce_init_device);