2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
8 #include <linux/init.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/sched.h>
12 #include <linux/string.h>
13 #include <linux/rcupdate.h>
14 #include <linux/kallsyms.h>
15 #include <linux/sysdev.h>
16 #include <linux/miscdevice.h>
18 #include <linux/capability.h>
19 #include <linux/cpu.h>
20 #include <linux/percpu.h>
21 #include <linux/ctype.h>
22 #include <asm/processor.h>
25 #include <asm/kdebug.h>
26 #include <asm/uaccess.h>
29 #define MISC_MCELOG_MINOR 227
34 static int mce_dont_init;
36 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
37 3: never panic or exit (for testing only) */
38 static int tolerant = 1;
40 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
41 static unsigned long console_logged;
42 static int notify_user;
44 static int mce_bootlog = 1;
47 * Lockless MCE logging infrastructure.
48 * This avoids deadlocks on printk locks without having to break locks. Also
49 * separate MCEs from kernel messages to avoid bogus bug reports.
52 struct mce_log mcelog = {
57 void mce_log(struct mce *mce)
63 entry = rcu_dereference(mcelog.next);
64 /* The rmb forces the compiler to reload next in each
68 /* When the buffer fills up discard new entries. Assume
69 that the earlier errors are the more interesting. */
70 if (entry >= MCE_LOG_LEN) {
71 set_bit(MCE_OVERFLOW, &mcelog.flags);
74 /* Old left over entry. Skip. */
75 if (mcelog.entry[entry].finished) {
83 if (cmpxchg(&mcelog.next, entry, next) == entry)
86 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
88 mcelog.entry[entry].finished = 1;
91 if (!test_and_set_bit(0, &console_logged))
95 static void print_mce(struct mce *m)
97 printk(KERN_EMERG "\n"
98 KERN_EMERG "HARDWARE ERROR\n"
100 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
101 m->cpu, m->mcgstatus, m->bank, m->status);
104 "RIP%s %02x:<%016Lx> ",
105 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
107 if (m->cs == __KERNEL_CS)
108 print_symbol("{%s}", m->rip);
111 printk(KERN_EMERG "TSC %Lx ", m->tsc);
113 printk("ADDR %Lx ", m->addr);
115 printk("MISC %Lx ", m->misc);
117 printk(KERN_EMERG "This is not a software problem!\n");
119 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
122 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
126 for (i = 0; i < MCE_LOG_LEN; i++) {
127 unsigned long tsc = mcelog.entry[i].tsc;
128 if (time_before(tsc, start))
130 print_mce(&mcelog.entry[i]);
131 if (backup && mcelog.entry[i].tsc == backup->tsc)
137 printk("Fake panic: %s\n", msg);
142 static int mce_available(struct cpuinfo_x86 *c)
144 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
147 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
149 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
157 /* Assume the RIP in the MSR is exact. Is this true? */
158 m->mcgstatus |= MCG_STATUS_EIPV;
159 rdmsrl(rip_msr, m->rip);
165 * The actual machine check handler
168 void do_machine_check(struct pt_regs * regs, long error_code)
170 struct mce m, panicm;
171 int nowayout = (tolerant < 1);
175 int panicm_found = 0;
177 atomic_inc(&mce_entry);
180 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
184 memset(&m, 0, sizeof(struct mce));
185 m.cpu = smp_processor_id();
186 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
187 if (!(m.mcgstatus & MCG_STATUS_RIPV))
193 for (i = 0; i < banks; i++) {
202 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
203 if ((m.status & MCI_STATUS_VAL) == 0)
206 if (m.status & MCI_STATUS_EN) {
207 /* In theory _OVER could be a nowayout too, but
208 assume any overflowed errors were not fatal. */
209 nowayout |= !!(m.status & MCI_STATUS_PCC);
210 kill_it |= !!(m.status & MCI_STATUS_UC);
213 if (m.status & MCI_STATUS_MISCV)
214 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
215 if (m.status & MCI_STATUS_ADDRV)
216 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
218 mce_get_rip(&m, regs);
221 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
222 if (error_code != -2)
225 /* Did this bank cause the exception? */
226 /* Assume that the bank with uncorrectable errors did it,
227 and that there is only a single one. */
228 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
233 add_taint(TAINT_MACHINE_CHECK);
236 /* Never do anything final in the polling timer */
240 /* If we didn't find an uncorrectable error, pick
241 the last one (shouldn't happen, just being safe). */
245 mce_panic("Machine check", &panicm, mcestart);
249 if (m.mcgstatus & MCG_STATUS_RIPV)
250 user_space = panicm.rip && (panicm.cs & 3);
252 /* When the machine was in user space and the CPU didn't get
253 confused it's normally not necessary to panic, unless you
254 are paranoid (tolerant == 0)
256 RED-PEN could be more tolerant for MCEs in idle,
257 but most likely they occur at boot anyways, where
258 it is best to just halt the machine. */
259 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
260 (unsigned)current->pid <= 1)
261 mce_panic("Uncorrected machine check", &panicm, mcestart);
263 /* do_exit takes an awful lot of locks and has a
264 slight risk of deadlocking. If you don't want that
265 don't set tolerant >= 2 */
271 /* Last thing done in the machine check exception to clear state. */
272 wrmsrl(MSR_IA32_MCG_STATUS, 0);
274 atomic_dec(&mce_entry);
277 #ifdef CONFIG_X86_MCE_INTEL
279 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
280 * @cpu: The CPU on which the event occurred.
281 * @status: Event status information
283 * This function should be called by the thermal interrupt after the
284 * event has been processed and the decision was made to log the event
287 * The status parameter will be saved to the 'status' field of 'struct mce'
288 * and historically has been the register value of the
289 * MSR_IA32_THERMAL_STATUS (Intel) msr.
291 void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
295 memset(&m, 0, sizeof(m));
297 m.bank = MCE_THERMAL_BANK;
302 #endif /* CONFIG_X86_MCE_INTEL */
305 * Periodic polling timer for "silent" machine check errors.
308 static int check_interval = 5 * 60; /* 5 minutes */
309 static void mcheck_timer(void *data);
310 static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
312 static void mcheck_check_cpu(void *info)
314 if (mce_available(&current_cpu_data))
315 do_machine_check(NULL, 0);
318 static void mcheck_timer(void *data)
320 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
321 schedule_delayed_work(&mcheck_work, check_interval * HZ);
324 * It's ok to read stale data here for notify_user and
325 * console_logged as we'll simply get the updated versions
326 * on the next mcheck_timer execution and atomic operations
327 * on console_logged act as synchronization for notify_user
330 if (notify_user && console_logged) {
332 clear_bit(0, &console_logged);
333 printk(KERN_INFO "Machine check events logged\n");
338 static __init int periodic_mcheck_init(void)
341 schedule_delayed_work(&mcheck_work, check_interval*HZ);
344 __initcall(periodic_mcheck_init);
348 * Initialize Machine Checks for a CPU.
350 static void mce_init(void *dummy)
355 rdmsrl(MSR_IA32_MCG_CAP, cap);
357 if (banks > NR_BANKS) {
358 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
361 /* Use accurate RIP reporting if available. */
362 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
363 rip_msr = MSR_IA32_MCG_EIP;
365 /* Log the machine checks left over from the previous reset.
366 This also clears all registers */
367 do_machine_check(NULL, mce_bootlog ? -1 : -2);
369 set_in_cr4(X86_CR4_MCE);
372 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
374 for (i = 0; i < banks; i++) {
375 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
376 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
380 /* Add per CPU specific workarounds here */
381 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
383 /* This should be disabled by the BIOS, but isn't always */
384 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
385 /* disable GART TBL walk error reporting, which trips off
386 incorrectly with the IOMMU & 3ware & Cerberus. */
387 clear_bit(10, &bank[4]);
388 /* Lots of broken BIOS around that don't clear them
389 by default and leave crap in there. Don't log. */
395 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
397 switch (c->x86_vendor) {
398 case X86_VENDOR_INTEL:
399 mce_intel_feature_init(c);
402 mce_amd_feature_init(c);
410 * Called for each booted CPU to set up machine checks.
411 * Must be called with preempt off.
413 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
415 static cpumask_t mce_cpus = CPU_MASK_NONE;
420 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
429 * Character device to read and clear the MCE log.
432 static void collect_tscs(void *data)
434 unsigned long *cpu_tsc = (unsigned long *)data;
435 rdtscll(cpu_tsc[smp_processor_id()]);
438 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
440 unsigned long *cpu_tsc;
441 static DECLARE_MUTEX(mce_read_sem);
443 char __user *buf = ubuf;
446 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
451 next = rcu_dereference(mcelog.next);
453 /* Only supports full reads right now */
454 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
461 for (i = 0; i < next; i++) {
462 unsigned long start = jiffies;
463 while (!mcelog.entry[i].finished) {
464 if (!time_before(jiffies, start + 2)) {
465 memset(mcelog.entry + i,0, sizeof(struct mce));
471 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
472 buf += sizeof(struct mce);
475 memset(mcelog.entry, 0, next * sizeof(struct mce));
480 /* Collect entries that were still getting written before the synchronize. */
482 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
483 for (i = next; i < MCE_LOG_LEN; i++) {
484 if (mcelog.entry[i].finished &&
485 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
486 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
488 buf += sizeof(struct mce);
489 memset(&mcelog.entry[i], 0, sizeof(struct mce));
494 return err ? -EFAULT : buf - ubuf;
497 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
499 int __user *p = (int __user *)arg;
500 if (!capable(CAP_SYS_ADMIN))
503 case MCE_GET_RECORD_LEN:
504 return put_user(sizeof(struct mce), p);
505 case MCE_GET_LOG_LEN:
506 return put_user(MCE_LOG_LEN, p);
507 case MCE_GETCLEAR_FLAGS: {
510 flags = mcelog.flags;
511 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
512 return put_user(flags, p);
519 static struct file_operations mce_chrdev_ops = {
524 static struct miscdevice mce_log_device = {
531 * Old style boot options parsing. Only for compatibility.
534 static int __init mcheck_disable(char *str)
540 /* mce=off disables machine check. Note you can reenable it later
542 mce=TOLERANCELEVEL (number, see above)
543 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
544 mce=nobootlog Don't log MCEs from before booting. */
545 static int __init mcheck_enable(char *str)
549 if (!strcmp(str, "off"))
551 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
552 mce_bootlog = str[0] == 'b';
553 else if (isdigit(str[0]))
554 get_option(&str, &tolerant);
556 printk("mce= argument %s ignored. Please use /sys", str);
560 __setup("nomce", mcheck_disable);
561 __setup("mce", mcheck_enable);
567 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
568 Only one CPU is active at this time, the others get readded later using
570 static int mce_resume(struct sys_device *dev)
576 /* Reinit MCEs after user configuration changes */
577 static void mce_restart(void)
580 cancel_delayed_work(&mcheck_work);
581 /* Timer race is harmless here */
582 on_each_cpu(mce_init, NULL, 1, 1);
584 schedule_delayed_work(&mcheck_work, check_interval*HZ);
587 static struct sysdev_class mce_sysclass = {
588 .resume = mce_resume,
589 set_kset_name("machinecheck"),
592 DEFINE_PER_CPU(struct sys_device, device_mce);
594 /* Why are there no generic functions for this? */
595 #define ACCESSOR(name, var, start) \
596 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
597 return sprintf(buf, "%lx\n", (unsigned long)var); \
599 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
601 unsigned long new = simple_strtoul(buf, &end, 0); \
602 if (end == buf) return -EINVAL; \
607 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
609 ACCESSOR(bank0ctl,bank[0],mce_restart())
610 ACCESSOR(bank1ctl,bank[1],mce_restart())
611 ACCESSOR(bank2ctl,bank[2],mce_restart())
612 ACCESSOR(bank3ctl,bank[3],mce_restart())
613 ACCESSOR(bank4ctl,bank[4],mce_restart())
614 ACCESSOR(bank5ctl,bank[5],mce_restart())
615 static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
616 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
617 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
618 ACCESSOR(tolerant,tolerant,)
619 ACCESSOR(check_interval,check_interval,mce_restart())
621 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */
622 static __cpuinit int mce_create_device(unsigned int cpu)
626 if (!mce_available(&cpu_data[cpu]))
629 per_cpu(device_mce,cpu).id = cpu;
630 per_cpu(device_mce,cpu).cls = &mce_sysclass;
632 err = sysdev_register(&per_cpu(device_mce,cpu));
635 for (i = 0; i < banks; i++)
636 sysdev_create_file(&per_cpu(device_mce,cpu),
638 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
639 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
644 #ifdef CONFIG_HOTPLUG_CPU
645 static void mce_remove_device(unsigned int cpu)
649 for (i = 0; i < banks; i++)
650 sysdev_remove_file(&per_cpu(device_mce,cpu),
652 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
653 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
654 sysdev_unregister(&per_cpu(device_mce,cpu));
657 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
659 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
661 unsigned int cpu = (unsigned long)hcpu;
665 mce_create_device(cpu);
668 mce_remove_device(cpu);
674 static struct notifier_block mce_cpu_notifier = {
675 .notifier_call = mce_cpu_callback,
679 static __init int mce_init_device(void)
684 if (!mce_available(&boot_cpu_data))
686 err = sysdev_class_register(&mce_sysclass);
688 for_each_online_cpu(i) {
689 mce_create_device(i);
692 register_hotcpu_notifier(&mce_cpu_notifier);
693 misc_register(&mce_log_device);
697 device_initcall(mce_init_device);