/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also separates MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* When the buffer fills up discard new entries. Assume
		   that the earlier errors are the more interesting. */
		if (entry >= MCE_LOG_LEN) {
			set_bit(MCE_OVERFLOW, &mcelog.flags);
			return;
		}
		/* Old left over entry. Skip. */
		if (mcelog.entry[entry].finished) {
			entry++;
			continue;
		}
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}

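/*
 * Sketch for illustration, not part of the original file: mce_log() is a
 * claim-then-publish ring. A writer reserves slot 'entry' by advancing
 * mcelog.next with cmpxchg, fills the record, and only then sets
 * ->finished behind a wmb(). Readers must treat ->finished as the
 * validity flag before touching the payload, roughly:
 *
 *	if (mcelog.entry[i].finished) {
 *		rmb();				// pairs with the writer's wmb()
 *		consume(&mcelog.entry[i]);	// 'consume' is hypothetical
 *	}
 */
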
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}
 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}

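/*
 * Sketch for illustration, not part of the original file: the i*4 stride
 * used for the bank MSRs above reflects the architectural MCA layout,
 * where each bank owns four consecutive MSRs (CTL, STATUS, ADDR, MISC)
 * starting at MSR_IA32_MC0_CTL. A helper computing a bank register
 * address would be:
 */
#if 0	/* illustration only */
static inline u32 mce_bank_msr(int bank, int reg)
{
	/* reg: 0 = CTL, 1 = STATUS, 2 = ADDR, 3 = MISC */
	return MSR_IA32_MC0_CTL + 4*bank + reg;
}
#endif
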
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}

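/*
 * Note (editorial inference from the code above): the poll path enters
 * do_machine_check() with regs == NULL and error_code 0. The !regs
 * bail-out in the handler ("never do anything final in the polling
 * timer") means polled errors are only logged, never escalated to a
 * panic or signal from timer context.
 */
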
static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers. */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

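/*
 * Editorial note, inferred from the checks in mce_init() above: the
 * MCG_CAP fields used there decode as
 *
 *	banks   = cap & 0xff;		bits 0-7:   bank count
 *	ctl_p   = cap & (1 << 8);	bit 8:      MCG_CTL present
 *	ext_p   = cap & (1 << 9);	bit 9:      extended registers present
 *	ext_cnt = (cap >> 16) & 0xff;	bits 16-23: extended register count
 *
 * so the RIP MSR is only trusted when the extended register set is
 * present and reports at least 9 registers.
 */
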
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

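/*
 * Sketch for illustration, not part of the original file: a minimal
 * user-space drain of /dev/mcelog built on the ioctls above. Assumes
 * struct mce and the MCE_* ioctl numbers come from <asm/mce.h>; error
 * handling is elided.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <asm/mce.h>
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int reclen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &reclen);
 *	struct mce log[MCE_LOG_LEN];
 *	// mce_read() rejects anything but a full-log-sized read
 *	ssize_t n = read(fd, log, sizeof log);
 *	int i;
 *	for (i = 0; i < n / reclen; i++)
 *		;	// decode log[i] here
 */
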
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog"))
		mce_bootlog = 1;
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

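/*
 * Editorial examples, derived from the parsing above: on the kernel
 * command line,
 *
 *	nomce		disable machine checks entirely
 *	mce=off		same, via the mce= option
 *	mce=bootlog	log MCEs left over from before booting
 *	mce=2		set the tolerance level to 2
 */
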
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())

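/*
 * Editorial note: each ACCESSOR() use above expands to a show_/set_ pair
 * plus a SYSDEV_ATTR declaration; with the sysdev class registered below
 * the files should appear as e.g.
 * /sys/devices/system/machinecheck/machinecheck0/tolerant. For reference,
 * ACCESSOR(tolerant,tolerant,) expands roughly to:
 *
 *	static ssize_t show_tolerant(struct sys_device *s, char *buf) {
 *		return sprintf(buf, "%lx\n", (unsigned long)tolerant);
 *	}
 *	static ssize_t set_tolerant(struct sys_device *s, const char *buf,
 *				    size_t siz) { ... }
 *	static SYSDEV_ATTR(tolerant, 0644, show_tolerant, set_tolerant);
 */
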
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;

	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	}
	return err;
}

#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);