/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also separates MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* When the buffer fills up discard new entries. Assume
		   that the earlier errors are the more interesting. */
		if (entry >= MCE_LOG_LEN) {
			set_bit(MCE_OVERFLOW, &mcelog.flags);
			return;
		}
		/* Old left over entry. Skip. */
		if (mcelog.entry[entry].finished) {
			entry++;
			continue;
		}
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}

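/*
 * Sketch for illustration, not part of the original file: mce_log() is a
 * claim-then-publish ring. A writer reserves slot 'entry' by advancing
 * mcelog.next with cmpxchg, fills the record, and only then sets
 * ->finished behind a wmb(). Readers must treat ->finished as the
 * validity flag before touching the payload, roughly:
 *
 *	if (mcelog.entry[i].finished) {
 *		rmb();				// pairs with the writer's wmb()
 *		consume(&mcelog.entry[i]);	// 'consume' is hypothetical
 *	}
 */
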
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}
 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}

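/*
 * Sketch for illustration, not part of the original file: the i*4 stride
 * used for the bank MSRs above reflects the architectural MCA layout,
 * where each bank owns four consecutive MSRs (CTL, STATUS, ADDR, MISC)
 * starting at MSR_IA32_MC0_CTL. A helper computing a bank register
 * address would be:
 */
#if 0	/* illustration only */
static inline u32 mce_bank_msr(int bank, int reg)
{
	/* reg: 0 = CTL, 1 = STATUS, 2 = ADDR, 3 = MISC */
	return MSR_IA32_MC0_CTL + 4*bank + reg;
}
#endif
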
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}

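/*
 * Note (editorial inference from the code above): the poll path enters
 * do_machine_check() with regs == NULL and error_code 0. The !regs
 * bail-out in the handler ("never do anything final in the polling
 * timer") means polled errors are only logged, never escalated to a
 * panic or signal from timer context.
 */
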
static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers. */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

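/*
 * Editorial note, inferred from the checks in mce_init() above: the
 * MCG_CAP fields used there decode as
 *
 *	banks   = cap & 0xff;		bits 0-7:   bank count
 *	ctl_p   = cap & (1 << 8);	bit 8:      MCG_CTL present
 *	ext_p   = cap & (1 << 9);	bit 9:      extended registers present
 *	ext_cnt = (cap >> 16) & 0xff;	bits 16-23: extended register count
 *
 * so the RIP MSR is only trusted when the extended register set is
 * present and reports at least 9 registers.
 */
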
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

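/*
 * Sketch for illustration, not part of the original file: a minimal
 * user-space drain of /dev/mcelog built on the ioctls above. Assumes
 * struct mce and the MCE_* ioctl numbers come from <asm/mce.h>; error
 * handling is elided.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <asm/mce.h>
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int reclen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &reclen);
 *	struct mce log[MCE_LOG_LEN];
 *	// mce_read() rejects anything but a full-log-sized read
 *	ssize_t n = read(fd, log, sizeof log);
 *	int i;
 *	for (i = 0; i < n / reclen; i++)
 *		;	// decode log[i] here
 */
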
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog"))
		mce_bootlog = 1;
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

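/*
 * Editorial examples, derived from the parsing above: on the kernel
 * command line,
 *
 *	nomce		disable machine checks entirely
 *	mce=off		same, via the mce= option
 *	mce=bootlog	log MCEs left over from before booting
 *	mce=2		set the tolerance level to 2
 */
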
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())

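/*
 * Editorial note: each ACCESSOR() use above expands to a show_/set_ pair
 * plus a SYSDEV_ATTR declaration; with the sysdev class registered below
 * the files should appear as e.g.
 * /sys/devices/system/machinecheck/machinecheck0/tolerant. For reference,
 * ACCESSOR(tolerant,tolerant,) expands roughly to:
 *
 *	static ssize_t show_tolerant(struct sys_device *s, char *buf) {
 *		return sprintf(buf, "%lx\n", (unsigned long)tolerant);
 *	}
 *	static ssize_t set_tolerant(struct sys_device *s, const char *buf,
 *				    size_t siz) { ... }
 *	static SYSDEV_ATTR(tolerant, 0644, show_tolerant, set_tolerant);
 */
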
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;

	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	}
	return err;
}

#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);