/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6

static int mce_dont_init;
/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog = 1;
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
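/*
 * Summary of the lockless protocol above (descriptive note, not part of
 * the original file): a writer reserves a slot by advancing mcelog.next
 * with cmpxchg, fills the entry, and only then sets ->finished behind a
 * wmb().  Readers (see mce_read below) must therefore check ->finished
 * and pair it with smp_rmb() before trusting an entry's contents.
 */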
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG
	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
{
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}
/*
 * The actual machine check handler.
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = safe_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0).

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that,
		   don't set tolerant >= 2. */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
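/*
 * Descriptive note (not in the original file) on the error_code
 * convention used by do_machine_check, as read off the code above and
 * the callers below:
 *   >= 0  real #MC exception or periodic poll; entries are timestamped
 *         and logged.
 *   -1    boot-time scan of errors left over from before the reset;
 *         entries are logged without a timestamp.
 *   -2    boot-time scan; the bank registers are cleared but nothing
 *         is logged (see mce_init and the mce=nobootlog option).
 */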
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}

static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;	/* MCG_CAP[7:0]: number of banks */
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers. */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
		/* Lots of broken BIOSes around that don't clear them
		   by default and leave garbage in there. Don't log. */
		mce_bootlog = 0;
	}
}
static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}
/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;

		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				/* Writer never finished this entry:
				   discard it instead of spinning forever. */
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				break;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
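/*
 * Illustrative sketch only (not part of the original file): a minimal
 * userspace reader for /dev/mcelog.  It assumes struct mce and
 * MCE_LOG_LEN are visible via the kernel's <asm/mce.h> headers; real
 * consumers should use the mcelog(8) utility instead.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <asm/mce.h>

int main(void)
{
	struct mce buf[MCE_LOG_LEN];
	int fd = open("/dev/mcelog", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return 1;
	/* mce_read only supports full reads: ask for the whole log. */
	n = read(fd, buf, sizeof(buf));
	if (n > 0)
		printf("%d records\n", (int)(n / sizeof(struct mce)));
	close(fd);
	return 0;
}
#endif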
static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
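/*
 * Illustrative sketch only (not part of the original file): querying the
 * record size and reading-and-clearing the flags from userspace.  The
 * MCE_* ioctl numbers come from the kernel's <asm/mce.h>; the caller
 * must have CAP_SYS_ADMIN (run as root) or mce_ioctl returns -EPERM.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mce.h>

int main(void)
{
	int fd = open("/dev/mcelog", O_RDONLY);
	int reclen = 0, flags = 0;

	if (fd < 0)
		return 1;
	if (ioctl(fd, MCE_GET_RECORD_LEN, &reclen) == 0)
		printf("record length: %d\n", reclen);
	/* Flags are returned and atomically reset to 0 in one call. */
	if (ioctl(fd, MCE_GETCLEAR_FLAGS, &flags) == 0 &&
	    (flags & (1 << MCE_OVERFLOW)))
		printf("log overflowed since last read\n");
	close(fd);
	return 0;
}
#endif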
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
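/*
 * Example boot parameters accepted by mcheck_enable above:
 *   mce=off        disable machine check handling (same as "nomce")
 *   mce=2          set tolerant=2 (try to avoid panics)
 *   mce=bootlog    log MCEs left over from before booting
 *   mce=nobootlog  suppress that boot-time log
 */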
/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) {		   \
		return sprintf(buf, "%lx\n", (unsigned long)var);		   \
	}									   \
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end;							   \
		unsigned long new = simple_strtoul(buf, &end, 0);		   \
		if (end == buf) return -EINVAL;					   \
		var = new;							   \
		start;								   \
		return end-buf;							   \
	}									   \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(bank5ctl, bank[5], mce_restart())
static struct sysdev_attribute *bank_attributes[NR_BANKS] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl };
ACCESSOR(tolerant, tolerant,)
ACCESSOR(check_interval, check_interval, mce_restart())
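/*
 * The accessors above appear under sysfs, one directory per CPU (path
 * assumed from the "machinecheck" sysdev class name), e.g.:
 *
 *   echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4ctl
 *   echo 2 > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *
 * Writing a bankNctl or check_interval value triggers mce_restart(),
 * which reinitializes machine checks on all CPUs.
 */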
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));
	if (!err) {
		for (i = 0; i < banks; i++)
			sysdev_create_file(&per_cpu(device_mce,cpu),
				bank_attributes[i]);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	}
	return err;
}
#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce,cpu),
			bank_attributes[i]);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);