git.oblomov.eu Git - linux-2.6/blob - arch/x86_64/kernel/mce.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  */
   7
   8 #include <linux/init.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/sched.h>
  12 #include <linux/string.h>
  13 #include <linux/rcupdate.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/sysdev.h>
  16 #include <linux/miscdevice.h>
  17 #include <linux/fs.h>
  18 #include <linux/cpu.h>
  19 #include <linux/percpu.h>
  20 #include <asm/processor.h>
  21 #include <asm/msr.h>
  22 #include <asm/mce.h>
  23 #include <asm/kdebug.h>
  24 #include <asm/uaccess.h>
  25
/* Minor number under which the "mcelog" misc device is registered. */
#define MISC_MCELOG_MINOR 227
/* Size of bank[]; mce_init() never uses more banks than this. */
#define NR_BANKS 5

/* Non-zero makes mcheck_init() skip MCE setup; set by "nomce" and "mce=off". */
static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
/* Number of banks in use; taken from MCG_CAP in mce_init(). */
static int banks;
/* Per-bank values written to the MCi_CTL MSRs by mce_init(); a zero entry
   makes do_machine_check() skip that bank. */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
/* Bit 0 is set by mce_log() and cleared by mcheck_timer() once the
   "events logged" notice has been printed. */
static unsigned long console_logged;
/* Set by mce_log() when it was the one to set console_logged bit 0. */
static int notify_user;
/* When non-zero, MSR number that mce_get_rip() reads the RIP from. */
static int rip_msr;
/* Set by "mce=bootlog"; selects whether mce_init() logs left over events. */
static int mce_bootlog;
  40
  41 /*
  42  * Lockless MCE logging infrastructure.
  43  * This avoids deadlocks on printk locks without having to break locks. Also
  44  * separate MCEs from kernel messages to avoid bogus bug reports.
  45  */
  46
/* The global MCE log buffer shared by mce_log(), mce_panic(), mce_read()
   and mce_ioctl().
   NOTE(review): the two positional initializers are presumably the
   signature and length members of struct mce_log -- confirm in <asm/mce.h>. */
struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};
  51
  52 void mce_log(struct mce *mce)
  53 {
  54         unsigned next, entry;
  55         mce->finished = 0;
  56         smp_wmb();
  57         for (;;) {
  58                 entry = rcu_dereference(mcelog.next);
  59                 /* When the buffer fills up discard new entries. Assume
  60                    that the earlier errors are the more interesting. */
  61                 if (entry >= MCE_LOG_LEN) {
  62                         set_bit(MCE_OVERFLOW, &mcelog.flags);
  63                         return;
  64                 }
  65                 /* Old left over entry. Skip. */
  66                 if (mcelog.entry[entry].finished)
  67                         continue;
  68                 smp_rmb();
  69                 next = entry + 1;
  70                 if (cmpxchg(&mcelog.next, entry, next) == entry)
  71                         break;
  72         }
  73         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
  74         smp_wmb();
  75         mcelog.entry[entry].finished = 1;
  76         smp_wmb();
  77
  78         if (!test_and_set_bit(0, &console_logged))
  79                 notify_user = 1;
  80 }
  81
  82 static void print_mce(struct mce *m)
  83 {
  84         printk(KERN_EMERG "\n"
  85                KERN_EMERG
  86                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
  87                m->cpu, m->mcgstatus, m->bank, m->status);
  88         if (m->rip) {
  89                 printk(KERN_EMERG
  90                        "RIP%s %02x:<%016Lx> ",
  91                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
  92                        m->cs, m->rip);
  93                 if (m->cs == __KERNEL_CS)
  94                         print_symbol("{%s}", m->rip);
  95                 printk("\n");
  96         }
  97         printk(KERN_EMERG "TSC %Lx ", m->tsc);
  98         if (m->addr)
  99                 printk("ADDR %Lx ", m->addr);
 100         if (m->misc)
 101                 printk("MISC %Lx ", m->misc);
 102         printk("\n");
 103 }
 104
 105 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 106 {
 107         int i;
 108         oops_begin();
 109         for (i = 0; i < MCE_LOG_LEN; i++) {
 110                 unsigned long tsc = mcelog.entry[i].tsc;
 111                 if (time_before(tsc, start))
 112                         continue;
 113                 print_mce(&mcelog.entry[i]);
 114                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 115                         backup = NULL;
 116         }
 117         if (backup)
 118                 print_mce(backup);
 119         if (tolerant >= 3)
 120                 printk("Fake panic: %s\n", msg);
 121         else
 122                 panic(msg);
 123 }
 124
 125 static int mce_available(struct cpuinfo_x86 *c)
 126 {
 127         return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
 128                test_bit(X86_FEATURE_MCA, &c->x86_capability);
 129 }
 130
 131 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 132 {
 133         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 134                 m->rip = regs->rip;
 135                 m->cs = regs->cs;
 136         } else {
 137                 m->rip = 0;
 138                 m->cs = 0;
 139         }
 140         if (rip_msr) {
 141                 /* Assume the RIP in the MSR is exact. Is this true? */
 142                 m->mcgstatus |= MCG_STATUS_EIPV;
 143                 rdmsrl(rip_msr, m->rip);
 144                 m->cs = 0;
 145         }
 146 }
 147
/*
 * The actual machine check handler.
 *
 * regs:       register frame of the interrupted context, or NULL when
 *             called from the polling path (mcheck_check_cpu) or from
 *             mce_init(); with NULL nothing final (panic/exit) is done.
 * error_code: a value >= 0 gives each record a TSC timestamp; -2 suppresses
 *             logging of the records found.  The bank status registers are
 *             cleared in every case.
 *
 * Scans all enabled banks, logs every valid status, and then decides from
 * the collected PCC/UC/RIPV bits and 'tolerant' whether to panic, kill the
 * current task, or simply return.
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
        struct mce m, panicm;
        /* tolerant == 0 means every machine check is fatal. */
        int nowayout = (tolerant < 1);
        int kill_it = 0;
        u64 mcestart = 0;
        int i;
        int panicm_found = 0;

        if (regs)
                notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
        /* Nothing to scan before mce_init() has set the bank count. */
        if (!banks)
                return;

        memset(&m, 0, sizeof(struct mce));
        m.cpu = hard_smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        /* Without RIPV the interrupted context cannot be resumed. */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                kill_it = 1;

        /* Start stamp; mce_panic() only prints log entries not older than it. */
        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                /* Bank disabled through its control value. */
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                if (m.status & MCI_STATUS_EN) {
                        /* In theory _OVER could be a nowayout too, but
                           assume any overflowed errors were not fatal. */
                        nowayout |= !!(m.status & MCI_STATUS_PCC);
                        kill_it |= !!(m.status & MCI_STATUS_UC);
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                /* Negative error codes (used by mce_init) leave tsc at 0. */
                if (error_code >= 0)
                        rdtscll(m.tsc);
                /* Clear the bank status now that it has been captured. */
                wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
                if (error_code != -2)
                        mce_log(&m);

                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
                   and that there is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }

                tainted |= TAINT_MACHINE_CHECK;
        }

        /* Never do anything final in the polling timer */
        if (!regs)
                goto out;

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
                panicm = m;
        if (nowayout)
                mce_panic("Machine check", &panicm, mcestart);
        if (kill_it) {
                int user_space = 0;

                /* The saved cs/rip are only meaningful when RIPV is set. */
                if (m.mcgstatus & MCG_STATUS_RIPV)
                        user_space = panicm.rip && (panicm.cs & 3);

                /* When the machine was in user space and the CPU didn't get
                   confused it's normally not necessary to panic, unless you
                   are paranoid (tolerant == 0)

                   RED-PEN could be more tolerant for MCEs in idle,
                   but most likely they occur at boot anyways, where
                   it is best to just halt the machine. */
                if ((!user_space && (panic_on_oops || tolerant < 2)) ||
                    (unsigned)current->pid <= 1)
                        mce_panic("Uncorrected machine check", &panicm, mcestart);

                /* do_exit takes an awful lot of locks and has a
                   slight risk of deadlocking. If you don't want that
                   don't set tolerant >= 2 */
                if (tolerant < 3)
                        do_exit(SIGBUS);
        }

 out:
        /* Last thing done in the machine check exception to clear state. */
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
 256
/*
 * Periodic polling timer for "silent" machine check errors.
 */

/* Polling period in seconds; 0 keeps periodic_mcheck_init() and
   mce_restart() from scheduling the work. Writable through sysfs. */
static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
/* Delayed work item that runs mcheck_timer(). */
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
 264
 265 static void mcheck_check_cpu(void *info)
 266 {
 267         if (mce_available(&current_cpu_data))
 268                 do_machine_check(NULL, 0);
 269 }
 270
 271 static void mcheck_timer(void *data)
 272 {
 273         on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 274         schedule_delayed_work(&mcheck_work, check_interval * HZ);
 275
 276         /*
 277          * It's ok to read stale data here for notify_user and
 278          * console_logged as we'll simply get the updated versions
 279          * on the next mcheck_timer execution and atomic operations
 280          * on console_logged act as synchronization for notify_user
 281          * writes.
 282          */
 283         if (notify_user && console_logged) {
 284                 notify_user = 0;
 285                 clear_bit(0, &console_logged);
 286                 printk(KERN_INFO "Machine check events logged\n");
 287         }
 288 }
 289
 290
 291 static __init int periodic_mcheck_init(void)
 292 {
 293         if (check_interval)
 294                 schedule_delayed_work(&mcheck_work, check_interval*HZ);
 295         return 0;
 296 }
 297 __initcall(periodic_mcheck_init);
 298
 299
 300 /*
 301  * Initialize Machine Checks for a CPU.
 302  */
 303 static void mce_init(void *dummy)
 304 {
 305         u64 cap;
 306         int i;
 307
 308         rdmsrl(MSR_IA32_MCG_CAP, cap);
 309         banks = cap & 0xff;
 310         if (banks > NR_BANKS) {
 311                 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
 312                 banks = NR_BANKS;
 313         }
 314         /* Use accurate RIP reporting if available. */
 315         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 316                 rip_msr = MSR_IA32_MCG_EIP;
 317
 318         /* Log the machine checks left over from the previous reset.
 319            This also clears all registers */
 320         do_machine_check(NULL, mce_bootlog ? -1 : -2);
 321
 322         set_in_cr4(X86_CR4_MCE);
 323
 324         if (cap & MCG_CTL_P)
 325                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 326
 327         for (i = 0; i < banks; i++) {
 328                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 329                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 330         }
 331 }
 332
 333 /* Add per CPU specific workarounds here */
 334 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 335 {
 336         /* This should be disabled by the BIOS, but isn't always */
 337         if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
 338                 /* disable GART TBL walk error reporting, which trips off
 339                    incorrectly with the IOMMU & 3ware & Cerberus. */
 340                 clear_bit(10, &bank[4]);
 341         }
 342 }
 343
 344 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 345 {
 346         switch (c->x86_vendor) {
 347         case X86_VENDOR_INTEL:
 348                 mce_intel_feature_init(c);
 349                 break;
 350         default:
 351                 break;
 352         }
 353 }
 354
 355 /*
 356  * Called for each booted CPU to set up machine checks.
 357  * Must be called with preempt off.
 358  */
 359 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 360 {
 361         static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
 362
 363         mce_cpu_quirks(c);
 364
 365         if (mce_dont_init ||
 366             cpu_test_and_set(smp_processor_id(), mce_cpus) ||
 367             !mce_available(c))
 368                 return;
 369
 370         mce_init(NULL);
 371         mce_cpu_features(c);
 372 }
 373
 374 /*
 375  * Character device to read and clear the MCE log.
 376  */
 377
 378 static void collect_tscs(void *data)
 379 {
 380         unsigned long *cpu_tsc = (unsigned long *)data;
 381         rdtscll(cpu_tsc[smp_processor_id()]);
 382 }
 383
/*
 * read() handler of the mcelog device: copy the finished log entries to
 * user space and clear them from the log.
 *
 * Only full reads are supported: *off must be 0 and usize must be large
 * enough for MCE_LOG_LEN records, otherwise -EINVAL is returned.
 * Returns the number of bytes copied, -ENOMEM when the scratch array
 * cannot be allocated, or -EFAULT when any copy_to_user() failed.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
        unsigned long *cpu_tsc;
        /* Held for the whole read, so concurrent readers are serialized. */
        static DECLARE_MUTEX(mce_read_sem);
        unsigned next;
        char __user *buf = ubuf;
        int i, err;

        /* Scratch array for one TSC sample per CPU; not zero-initialized. */
        cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        down(&mce_read_sem);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                up(&mce_read_sem);
                kfree(cpu_tsc);
                return -EINVAL;
        }

        err = 0;
        /* First pass: everything below the 'next' index sampled above. */
        for (i = 0; i < next; i++) {
                /* Entry claimed by a writer but not yet completed: skip it. */
                if (!mcelog.entry[i].finished)
                        continue;
                smp_rmb();
                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
                buf += sizeof(struct mce);
        }

        /* Clear the slots just scanned and restart the log at slot 0. */
        memset(mcelog.entry, 0, next * sizeof(struct mce));
        mcelog.next = 0;

        synchronize_sched();

        /* Collect entries that were still getting written before the synchronize. */

        on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
        /*
         * NOTE(review): entry .cpu is filled from hard_smp_processor_id() in
         * do_machine_check(), while cpu_tsc[] is filled per
         * smp_processor_id(), has NR_CPUS slots and is not zeroed -- confirm
         * that both numberings agree and that .cpu always indexes a slot
         * that was written.
         */
        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        up(&mce_read_sem);
        kfree(cpu_tsc);
        /* copy_to_user() results were OR-ed together; non-zero means a fault. */
        return err ? -EFAULT : buf - ubuf;
}
 436
 437 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
 438 {
 439         int __user *p = (int __user *)arg;
 440         if (!capable(CAP_SYS_ADMIN))
 441                 return -EPERM;
 442         switch (cmd) {
 443         case MCE_GET_RECORD_LEN:
 444                 return put_user(sizeof(struct mce), p);
 445         case MCE_GET_LOG_LEN:
 446                 return put_user(MCE_LOG_LEN, p);
 447         case MCE_GETCLEAR_FLAGS: {
 448                 unsigned flags;
 449                 do {
 450                         flags = mcelog.flags;
 451                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 452                 return put_user(flags, p);
 453         }
 454         default:
 455                 return -ENOTTY;
 456         }
 457 }
 458
/* File operations of the mcelog character device; only read and ioctl are
   provided. */
static struct file_operations mce_chrdev_ops = {
        .read = mce_read,
        .ioctl = mce_ioctl,
};
 463
 464 static struct miscdevice mce_log_device = {
 465         MISC_MCELOG_MINOR,
 466         "mcelog",
 467         &mce_chrdev_ops,
 468 };
 469
 470 /*
 471  * Old style boot options parsing. Only for compatibility.
 472  */
 473
 474 static int __init mcheck_disable(char *str)
 475 {
 476         mce_dont_init = 1;
 477         return 0;
 478 }
 479
 480 /* mce=off disables machine check. Note you can reenable it later
 481    using sysfs.
 482    mce=bootlog Log MCEs from before booting. Disabled by default to work
 483    around buggy BIOS that leave bogus MCEs.  */
 484 static int __init mcheck_enable(char *str)
 485 {
 486         if (*str == '=')
 487                 str++;
 488         if (!strcmp(str, "off"))
 489                 mce_dont_init = 1;
 490         else if (!strcmp(str, "bootlog"))
 491                 mce_bootlog = 1;
 492         else
 493                 printk("mce= argument %s ignored. Please use /sys", str);
 494         return 0;
 495 }
 496
/* Boot command line hooks for "nomce" and "mce[=...]". */
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
 499
 500 /*
 501  * Sysfs support
 502  */
 503
 504 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
 505 static int mce_resume(struct sys_device *dev)
 506 {
 507         on_each_cpu(mce_init, NULL, 1, 1);
 508         return 0;
 509 }
 510
 511 /* Reinit MCEs after user configuration changes */
 512 static void mce_restart(void)
 513 {
 514         if (check_interval)
 515                 cancel_delayed_work(&mcheck_work);
 516         /* Timer race is harmless here */
 517         on_each_cpu(mce_init, NULL, 1, 1);
 518         if (check_interval)
 519                 schedule_delayed_work(&mcheck_work, check_interval*HZ);
 520 }
 521
/* sysdev class "machinecheck"; its resume hook reinitializes MCE state. */
static struct sysdev_class mce_sysclass = {
        .resume = mce_resume,
        set_kset_name("machinecheck"),
};
 526
/* One sysdev per CPU, registered by mce_create_device(). */
static DEFINE_PER_CPU(struct sys_device, device_mce);
 528
/* Why are there no generic functions for this? */
/*
 * ACCESSOR(name, var, start) defines a sysdev attribute 'attr_<name>'
 * (mode 0644) together with its handlers:
 *   show_<name>() prints 'var' as hex followed by a newline;
 *   set_<name>()  parses a number with simple_strtoul() (base auto-detected),
 *                 returns -EINVAL if nothing could be parsed, otherwise
 *                 stores it in 'var', executes the statement 'start' and
 *                 returns the number of characters consumed.
 */
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s, char *buf) {                    \
                return sprintf(buf, "%lx\n", (unsigned long)var);                  \
        }                                                                          \
        static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
                char *end;                                                         \
                unsigned long new = simple_strtoul(buf, &end, 0);                  \
                if (end == buf) return -EINVAL;                                    \
                var = new;                                                         \
                start;                                                             \
                return end-buf;                                                    \
        }                                                                          \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 543
/* Per-bank control values; a write reprograms all CPUs via mce_restart(). */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
/* Changing 'tolerant' needs no reinitialization, hence the empty action. */
ACCESSOR(tolerant,tolerant,)
/* A new polling interval takes effect through mce_restart(). */
ACCESSOR(check_interval,check_interval,mce_restart())
 551
 552 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 553 static __cpuinit int mce_create_device(unsigned int cpu)
 554 {
 555         int err;
 556         if (!mce_available(&cpu_data[cpu]))
 557                 return -EIO;
 558
 559         per_cpu(device_mce,cpu).id = cpu;
 560         per_cpu(device_mce,cpu).cls = &mce_sysclass;
 561
 562         err = sysdev_register(&per_cpu(device_mce,cpu));
 563
 564         if (!err) {
 565                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
 566                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
 567                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
 568                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
 569                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
 570                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
 571                 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 572         }
 573         return err;
 574 }
 575
 576 #ifdef CONFIG_HOTPLUG_CPU
 577 static __cpuinit void mce_remove_device(unsigned int cpu)
 578 {
 579         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
 580         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
 581         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
 582         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
 583         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
 584         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
 585         sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
 586         sysdev_unregister(&per_cpu(device_mce,cpu));
 587 }
 588 #endif
 589
 590 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 591 static __cpuinit int
 592 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 593 {
 594         unsigned int cpu = (unsigned long)hcpu;
 595
 596         switch (action) {
 597         case CPU_ONLINE:
 598                 mce_create_device(cpu);
 599                 break;
 600 #ifdef CONFIG_HOTPLUG_CPU
 601         case CPU_DEAD:
 602                 mce_remove_device(cpu);
 603                 break;
 604 #endif
 605         }
 606         return NOTIFY_OK;
 607 }
 608
/* CPU notifier block; registered in mce_init_device(). */
static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};
 612
 613 static __init int mce_init_device(void)
 614 {
 615         int err;
 616         int i = 0;
 617
 618         if (!mce_available(&boot_cpu_data))
 619                 return -EIO;
 620         err = sysdev_class_register(&mce_sysclass);
 621
 622         for_each_online_cpu(i) {
 623                 mce_create_device(i);
 624         }
 625
 626         register_cpu_notifier(&mce_cpu_notifier);
 627         misc_register(&mce_log_device);
 628         return err;
 629 }
 630
 631 device_initcall(mce_init_device);