git.oblomov.eu Git - linux-2.6/blob - arch/x86/kernel/mce_64.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  */
   7
   8 #include <linux/init.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/sched.h>
  12 #include <linux/string.h>
  13 #include <linux/rcupdate.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/sysdev.h>
  16 #include <linux/miscdevice.h>
  17 #include <linux/fs.h>
  18 #include <linux/capability.h>
  19 #include <linux/cpu.h>
  20 #include <linux/percpu.h>
  21 #include <linux/poll.h>
  22 #include <linux/thread_info.h>
  23 #include <linux/ctype.h>
  24 #include <linux/kmod.h>
  25 #include <linux/kdebug.h>
  26 #include <asm/processor.h>
  27 #include <asm/msr.h>
  28 #include <asm/mce.h>
  29 #include <asm/uaccess.h>
  30 #include <asm/smp.h>
  31 #include <asm/idle.h>
  32
/* Minor number used for the "mcelog" misc character device. */
#define MISC_MCELOG_MINOR 227
/* Size of bank[]; mce_init() clamps the hardware bank count to this. */
#define NR_BANKS 6

/* Incremented on entry to do_machine_check() and decremented on exit. */
atomic_t mce_entry;

/* Set by "nomce" / "mce=off"; makes mcheck_init() skip MCE setup. */
static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
/* Number of banks scanned; derived from MSR_IA32_MCG_CAP in mce_init(). */
static int banks;
/*
 * Per-bank values written to the MCi_CTL MSRs by mce_init().  A zero entry
 * makes do_machine_check() skip that bank.
 */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
/* Bit 0 is set by mce_log() and consumed by mce_notify_user(). */
static unsigned long notify_user;
/* If non-zero, MSR that mce_get_rip() reads the RIP from. */
static int rip_msr;
/* Non-zero: mce_init() logs errors found at init time (mce=bootlog). */
static int mce_bootlog = 1;
/* Incremented once per mce_log() call. */
static atomic_t mce_events;

/*
 * Program started through call_usermodehelper() by mce_notify_user();
 * an empty string disables it.  Set through the sysfs "trigger" attribute.
 */
static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

/* Woken by mce_notify_user(); mce_poll() waits on it. */
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  59
  60 /*
  61  * Lockless MCE logging infrastructure.
  62  * This avoids deadlocks on printk locks without having to break locks. Also
  63  * separate MCEs from kernel messages to avoid bogus bug reports.
  64  */
  65
/*
 * The global log buffer read through the mcelog character device.
 * Positional initializers: presumably the signature and length members of
 * struct mce_log (declared in <asm/mce.h>, not visible here) - confirm.
 */
struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};
  70
/*
 * Append one record to the global mcelog buffer without taking a lock.
 *
 * @mce: record to copy into the log; its 'finished' field is cleared here.
 *
 * A slot index is claimed with cmpxchg() on mcelog.next, the record is
 * copied in, and only afterwards is the slot's 'finished' flag set.  When no
 * slot below MCE_LOG_LEN is available, MCE_OVERFLOW is set in mcelog.flags
 * and the record is dropped.  On success bit 0 of notify_user is set so that
 * mce_notify_user() will wake readers.
 */
void mce_log(struct mce *mce)
{
        unsigned next, entry;
        atomic_inc(&mce_events);
        /* Clear 'finished' in the source so the memcpy() below cannot
           mark the slot complete before the explicit store does. */
        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                for (;;) {
                        /* When the buffer fills up discard new entries. Assume
                           that the earlier errors are the more interesting. */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW, &mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip. */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                /* Claim slot 'entry'; retry from the top if another
                   writer moved mcelog.next in the meantime. */
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        /* Publish the slot only after its contents have been written. */
        mcelog.entry[entry].finished = 1;
        wmb();

        set_bit(0, &notify_user);
}
 105
 106 static void print_mce(struct mce *m)
 107 {
 108         printk(KERN_EMERG "\n"
 109                KERN_EMERG "HARDWARE ERROR\n"
 110                KERN_EMERG
 111                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 112                m->cpu, m->mcgstatus, m->bank, m->status);
 113         if (m->rip) {
 114                 printk(KERN_EMERG
 115                        "RIP%s %02x:<%016Lx> ",
 116                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 117                        m->cs, m->rip);
 118                 if (m->cs == __KERNEL_CS)
 119                         print_symbol("{%s}", m->rip);
 120                 printk("\n");
 121         }
 122         printk(KERN_EMERG "TSC %Lx ", m->tsc);
 123         if (m->addr)
 124                 printk("ADDR %Lx ", m->addr);
 125         if (m->misc)
 126                 printk("MISC %Lx ", m->misc);
 127         printk("\n");
 128         printk(KERN_EMERG "This is not a software problem!\n");
 129         printk(KERN_EMERG
 130     "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 131 }
 132
 133 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 134 {
 135         int i;
 136
 137         oops_begin();
 138         for (i = 0; i < MCE_LOG_LEN; i++) {
 139                 unsigned long tsc = mcelog.entry[i].tsc;
 140                 if (time_before(tsc, start))
 141                         continue;
 142                 print_mce(&mcelog.entry[i]);
 143                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 144                         backup = NULL;
 145         }
 146         if (backup)
 147                 print_mce(backup);
 148         panic(msg);
 149 }
 150
 151 static int mce_available(struct cpuinfo_x86 *c)
 152 {
 153         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 154 }
 155
 156 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 157 {
 158         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 159                 m->rip = regs->rip;
 160                 m->cs = regs->cs;
 161         } else {
 162                 m->rip = 0;
 163                 m->cs = 0;
 164         }
 165         if (rip_msr) {
 166                 /* Assume the RIP in the MSR is exact. Is this true? */
 167                 m->mcgstatus |= MCG_STATUS_EIPV;
 168                 rdmsrl(rip_msr, m->rip);
 169                 m->cs = 0;
 170         }
 171 }
 172
/*
 * The actual machine check handler.
 *
 * @regs:       register state at the time of the exception, or NULL when
 *              called from the poller (mcheck_check_cpu()) or from
 *              mce_init().  With NULL regs the banks are only scanned,
 *              logged and cleared; no panic or kill decision is made.
 * @error_code: >= 0 stamps each record with the current TSC; -2 suppresses
 *              logging (mce_init() passes -1 or -2 depending on
 *              mce_bootlog).
 *
 * Scans every enabled bank, logs valid entries through mce_log(), then
 * decides between panic, killing the current task, or just notifying
 * user space, according to 'tolerant'.  All bank status MSRs and
 * MCG_STATUS are cleared on the way out.
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
        struct mce m, panicm;
        u64 mcestart = 0;
        int i;
        int panicm_found = 0;
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE.  If tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;

        atomic_inc(&mce_entry);

        if (regs)
                notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
        /* banks == 0: nothing to scan or clear (mce_init() sets banks). */
        if (!banks)
                goto out2;

        memset(&m, 0, sizeof(struct mce));
        m.cpu = smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        /* if the restart IP is not valid, we're done for */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                no_way_out = 1;

        /* Remember when we started so mce_panic() can pick this event's records. */
        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                /* Bank disabled through its control value. */
                if (!bank[i])
                        continue;

                /* 'm' is reused for every bank; reset the per-bank fields. */
                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                if (m.status & MCI_STATUS_EN) {
                        /* if PCC was set, there's no way out */
                        no_way_out |= !!(m.status & MCI_STATUS_PCC);
                        /*
                         * If this error was uncorrectable and there was
                         * an overflow, we're in trouble.  If no overflow,
                         * we might get away with just killing a task.
                         */
                        if (m.status & MCI_STATUS_UC) {
                                if (tolerant < 1 || m.status & MCI_STATUS_OVER)
                                        no_way_out = 1;
                                kill_it = 1;
                        }
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                /* Negative error codes (init-time scan) leave the TSC at 0. */
                if (error_code >= 0)
                        rdtscll(m.tsc);
                /* -2 means "clear only, do not log". */
                if (error_code != -2)
                        mce_log(&m);

                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
                   and that there is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }

                add_taint(TAINT_MACHINE_CHECK);
        }

        /* Never do anything final in the polling timer */
        if (!regs)
                goto out;

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
                panicm = m;

        /*
         * If we have decided that we just CAN'T continue, and the user
         *  has not set tolerant to an insane level, give up and die.
         */
        if (no_way_out && tolerant < 3)
                mce_panic("Machine check", &panicm, mcestart);

        /*
         * If the error seems to be unrecoverable, something should be
         * done.  Try to kill as little as possible.  If we can kill just
         * one task, do that.  If the user has set the tolerance very
         * high, don't try to do anything at all.
         */
        if (kill_it && tolerant < 3) {
                int user_space = 0;

                /*
                 * If the EIPV bit is set, it means the saved IP is the
                 * instruction which caused the MCE.
                 */
                if (m.mcgstatus & MCG_STATUS_EIPV)
                        user_space = panicm.rip && (panicm.cs & 3);

                /*
                 * If we know that the error was in user space, send a
                 * SIGBUS.  Otherwise, panic if tolerance is low.
                 *
                 * do_exit() takes an awful lot of locks and has a slight
                 * risk of deadlocking.
                 */
                if (user_space) {
                        do_exit(SIGBUS);
                } else if (panic_on_oops || tolerant < 2) {
                        mce_panic("Uncorrected machine check",
                                &panicm, mcestart);
                }
        }

        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);

 out:
        /* the last thing we do is clear state */
        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
        atomic_dec(&mce_entry);
}
 319
 320 #ifdef CONFIG_X86_MCE_INTEL
 321 /***
 322  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 323  * @cpu: The CPU on which the event occured.
 324  * @status: Event status information
 325  *
 326  * This function should be called by the thermal interrupt after the
 327  * event has been processed and the decision was made to log the event
 328  * further.
 329  *
 330  * The status parameter will be saved to the 'status' field of 'struct mce'
 331  * and historically has been the register value of the
 332  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 333  */
 334 void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 335 {
 336         struct mce m;
 337
 338         memset(&m, 0, sizeof(m));
 339         m.cpu = cpu;
 340         m.bank = MCE_THERMAL_BANK;
 341         m.status = status;
 342         rdtscll(m.tsc);
 343         mce_log(&m);
 344 }
 345 #endif /* CONFIG_X86_MCE_INTEL */
 346
 347 /*
 348  * Periodic polling timer for "silent" machine check errors.  If the
 349  * poller finds an MCE, poll 2x faster.  When the poller finds no more
 350  * errors, poll 2x slower (up to check_interval seconds).
 351  */
 352
/* Upper bound of the polling period, in seconds; settable through sysfs. */
static int check_interval = 5 * 60; /* 5 minutes */
/* Delay until the next poll; halved/doubled by mcheck_timer(). */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
/* Delayed work item that runs mcheck_timer(). */
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
 357
 358 static void mcheck_check_cpu(void *info)
 359 {
 360         if (mce_available(&current_cpu_data))
 361                 do_machine_check(NULL, 0);
 362 }
 363
 364 static void mcheck_timer(struct work_struct *work)
 365 {
 366         on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 367
 368         /*
 369          * Alert userspace if needed.  If we logged an MCE, reduce the
 370          * polling interval, otherwise increase the polling interval.
 371          */
 372         if (mce_notify_user()) {
 373                 next_interval = max(next_interval/2, HZ/100);
 374         } else {
 375                 next_interval = min(next_interval*2,
 376                                 (int)round_jiffies_relative(check_interval*HZ));
 377         }
 378
 379         schedule_delayed_work(&mcheck_work, next_interval);
 380 }
 381
 382 /*
 383  * This is only called from process context.  This is where we do
 384  * anything we need to alert userspace about new MCEs.  This is called
 385  * directly from the poller and also from entry.S and idle, thanks to
 386  * TIF_MCE_NOTIFY.
 387  */
 388 int mce_notify_user(void)
 389 {
 390         clear_thread_flag(TIF_MCE_NOTIFY);
 391         if (test_and_clear_bit(0, &notify_user)) {
 392                 static unsigned long last_print;
 393                 unsigned long now = jiffies;
 394
 395                 wake_up_interruptible(&mce_wait);
 396                 if (trigger[0])
 397                         call_usermodehelper(trigger, trigger_argv, NULL,
 398                                                 UMH_NO_WAIT);
 399
 400                 if (time_after_eq(now, last_print + (check_interval*HZ))) {
 401                         last_print = now;
 402                         printk(KERN_INFO "Machine check events logged\n");
 403                 }
 404
 405                 return 1;
 406         }
 407         return 0;
 408 }
 409
 410 /* see if the idle task needs to notify userspace */
 411 static int
 412 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
 413 {
 414         /* IDLE_END should be safe - interrupts are back on */
 415         if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
 416                 mce_notify_user();
 417
 418         return NOTIFY_OK;
 419 }
 420
/* Registered with idle_notifier_register() in periodic_mcheck_init(). */
static struct notifier_block mce_idle_notifier = {
        .notifier_call = mce_idle_callback,
};
 424
 425 static __init int periodic_mcheck_init(void)
 426 {
 427         next_interval = check_interval * HZ;
 428         if (next_interval)
 429                 schedule_delayed_work(&mcheck_work,
 430                                       round_jiffies_relative(next_interval));
 431         idle_notifier_register(&mce_idle_notifier);
 432         return 0;
 433 }
 434 __initcall(periodic_mcheck_init);
 435
 436
 437 /*
 438  * Initialize Machine Checks for a CPU.
 439  */
 440 static void mce_init(void *dummy)
 441 {
 442         u64 cap;
 443         int i;
 444
 445         rdmsrl(MSR_IA32_MCG_CAP, cap);
 446         banks = cap & 0xff;
 447         if (banks > NR_BANKS) {
 448                 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
 449                 banks = NR_BANKS;
 450         }
 451         /* Use accurate RIP reporting if available. */
 452         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 453                 rip_msr = MSR_IA32_MCG_EIP;
 454
 455         /* Log the machine checks left over from the previous reset.
 456            This also clears all registers */
 457         do_machine_check(NULL, mce_bootlog ? -1 : -2);
 458
 459         set_in_cr4(X86_CR4_MCE);
 460
 461         if (cap & MCG_CTL_P)
 462                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 463
 464         for (i = 0; i < banks; i++) {
 465                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 466                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 467         }
 468 }
 469
 470 /* Add per CPU specific workarounds here */
 471 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 472 {
 473         /* This should be disabled by the BIOS, but isn't always */
 474         if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
 475                 /* disable GART TBL walk error reporting, which trips off
 476                    incorrectly with the IOMMU & 3ware & Cerberus. */
 477                 clear_bit(10, &bank[4]);
 478                 /* Lots of broken BIOS around that don't clear them
 479                    by default and leave crap in there. Don't log. */
 480                 mce_bootlog = 0;
 481         }
 482
 483 }
 484
 485 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 486 {
 487         switch (c->x86_vendor) {
 488         case X86_VENDOR_INTEL:
 489                 mce_intel_feature_init(c);
 490                 break;
 491         case X86_VENDOR_AMD:
 492                 mce_amd_feature_init(c);
 493                 break;
 494         default:
 495                 break;
 496         }
 497 }
 498
 499 /*
 500  * Called for each booted CPU to set up machine checks.
 501  * Must be called with preempt off.
 502  */
 503 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 504 {
 505         static cpumask_t mce_cpus = CPU_MASK_NONE;
 506
 507         mce_cpu_quirks(c);
 508
 509         if (mce_dont_init ||
 510             cpu_test_and_set(smp_processor_id(), mce_cpus) ||
 511             !mce_available(c))
 512                 return;
 513
 514         mce_init(NULL);
 515         mce_cpu_features(c);
 516 }
 517
 518 /*
 519  * Character device to read and clear the MCE log.
 520  */
 521
/* Protects open_count and open_exclu in mce_open()/mce_release(). */
static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;  /* #times opened */
static int open_exclu;  /* already open exclusive? */
 525
 526 static int mce_open(struct inode *inode, struct file *file)
 527 {
 528         spin_lock(&mce_state_lock);
 529
 530         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 531                 spin_unlock(&mce_state_lock);
 532                 return -EBUSY;
 533         }
 534
 535         if (file->f_flags & O_EXCL)
 536                 open_exclu = 1;
 537         open_count++;
 538
 539         spin_unlock(&mce_state_lock);
 540
 541         return nonseekable_open(inode, file);
 542 }
 543
 544 static int mce_release(struct inode *inode, struct file *file)
 545 {
 546         spin_lock(&mce_state_lock);
 547
 548         open_count--;
 549         open_exclu = 0;
 550
 551         spin_unlock(&mce_state_lock);
 552
 553         return 0;
 554 }
 555
 556 static void collect_tscs(void *data)
 557 {
 558         unsigned long *cpu_tsc = (unsigned long *)data;
 559         rdtscll(cpu_tsc[smp_processor_id()]);
 560 }
 561
/*
 * read() handler for the mcelog device: copy all logged records to user
 * space and clear the log.
 *
 * Only whole-log reads are supported: *off must be 0 and @usize must be at
 * least MCE_LOG_LEN * sizeof(struct mce), otherwise -EINVAL is returned.
 * Returns the number of bytes copied, -ENOMEM if the per-CPU TSC scratch
 * array cannot be allocated, or -EFAULT if any copy_to_user() failed.
 * Readers are serialized by a function-local semaphore.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
        unsigned long *cpu_tsc;
        static DECLARE_MUTEX(mce_read_sem);
        unsigned next;
        char __user *buf = ubuf;
        int i, err;

        /* Scratch array filled by collect_tscs(), one slot per possible CPU. */
        cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        down(&mce_read_sem);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                up(&mce_read_sem);
                kfree(cpu_tsc);
                return -EINVAL;
        }

        err = 0;
        for (i = 0; i < next; i++) {
                unsigned long start = jiffies;
                /* Slot claimed but not yet marked finished by mce_log():
                   wait up to two jiffies, then discard it. */
                while (!mcelog.entry[i].finished) {
                        if (time_after_eq(jiffies, start + 2)) {
                                memset(mcelog.entry + i,0, sizeof(struct mce));
                                goto timeout;
                        }
                        cpu_relax();
                }
                smp_rmb();
                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
                buf += sizeof(struct mce);
 timeout:
                ;
        }

        /* Everything below 'next' has been handed out (or dropped); reset the log. */
        memset(mcelog.entry, 0, next * sizeof(struct mce));
        mcelog.next = 0;

        synchronize_sched();

        /* Collect entries that were still getting written before the synchronize. */

        on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
        for (i = next; i < MCE_LOG_LEN; i++) {
                /* Only take finished entries stamped before the TSC just
                   sampled on the CPU that logged them. */
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        up(&mce_read_sem);
        kfree(cpu_tsc);
        return err ? -EFAULT : buf - ubuf;
}
 622
 623 static unsigned int mce_poll(struct file *file, poll_table *wait)
 624 {
 625         poll_wait(file, &mce_wait, wait);
 626         if (rcu_dereference(mcelog.next))
 627                 return POLLIN | POLLRDNORM;
 628         return 0;
 629 }
 630
 631 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
 632 {
 633         int __user *p = (int __user *)arg;
 634         if (!capable(CAP_SYS_ADMIN))
 635                 return -EPERM;
 636         switch (cmd) {
 637         case MCE_GET_RECORD_LEN:
 638                 return put_user(sizeof(struct mce), p);
 639         case MCE_GET_LOG_LEN:
 640                 return put_user(MCE_LOG_LEN, p);
 641         case MCE_GETCLEAR_FLAGS: {
 642                 unsigned flags;
 643                 do {
 644                         flags = mcelog.flags;
 645                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 646                 return put_user(flags, p);
 647         }
 648         default:
 649                 return -ENOTTY;
 650         }
 651 }
 652
/* File operations of the mcelog character device. */
static const struct file_operations mce_chrdev_ops = {
        .open = mce_open,
        .release = mce_release,
        .read = mce_read,
        .poll = mce_poll,
        .ioctl = mce_ioctl,
};
 660
 661 static struct miscdevice mce_log_device = {
 662         MISC_MCELOG_MINOR,
 663         "mcelog",
 664         &mce_chrdev_ops,
 665 };
 666
 667 static unsigned long old_cr4 __initdata;
 668
 669 void __init stop_mce(void)
 670 {
 671         old_cr4 = read_cr4();
 672         clear_in_cr4(X86_CR4_MCE);
 673 }
 674
 675 void __init restart_mce(void)
 676 {
 677         if (old_cr4 & X86_CR4_MCE)
 678                 set_in_cr4(X86_CR4_MCE);
 679 }
 680
 681 /*
 682  * Old style boot options parsing. Only for compatibility.
 683  */
 684
 685 static int __init mcheck_disable(char *str)
 686 {
 687         mce_dont_init = 1;
 688         return 1;
 689 }
 690
 691 /* mce=off disables machine check. Note you can reenable it later
 692    using sysfs.
 693    mce=TOLERANCELEVEL (number, see above)
 694    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 695    mce=nobootlog Don't log MCEs from before booting. */
 696 static int __init mcheck_enable(char *str)
 697 {
 698         if (!strcmp(str, "off"))
 699                 mce_dont_init = 1;
 700         else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
 701                 mce_bootlog = str[0] == 'b';
 702         else if (isdigit(str[0]))
 703                 get_option(&str, &tolerant);
 704         else
 705                 printk("mce= argument %s ignored. Please use /sys", str);
 706         return 1;
 707 }
 708
 709 __setup("nomce", mcheck_disable);
 710 __setup("mce=", mcheck_enable);
 711
 712 /*
 713  * Sysfs support
 714  */
 715
 716 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 717    Only one CPU is active at this time, the others get readded later using
 718    CPU hotplug. */
 719 static int mce_resume(struct sys_device *dev)
 720 {
 721         mce_init(NULL);
 722         return 0;
 723 }
 724
 725 /* Reinit MCEs after user configuration changes */
 726 static void mce_restart(void)
 727 {
 728         if (next_interval)
 729                 cancel_delayed_work(&mcheck_work);
 730         /* Timer race is harmless here */
 731         on_each_cpu(mce_init, NULL, 1, 1);
 732         next_interval = check_interval * HZ;
 733         if (next_interval)
 734                 schedule_delayed_work(&mcheck_work,
 735                                       round_jiffies_relative(next_interval));
 736 }
 737
/* sysdev class "machinecheck"; its resume hook re-initializes MCE state. */
static struct sysdev_class mce_sysclass = {
        .resume = mce_resume,
        set_kset_name("machinecheck"),
};

/* One sysdev per CPU; set up in mce_create_device(). */
DEFINE_PER_CPU(struct sys_device, device_mce);
 744
/* Why are there no generic functions for this? */
/*
 * ACCESSOR(name, var, start) generates show_<name>(), set_<name>() and the
 * sysdev attribute attr_<name> (mode 0644) for the variable 'var'.
 *
 *  - show prints 'var' as hexadecimal ("%lx") followed by a newline.
 *  - set parses the input with simple_strtoul() (base 0), returns -EINVAL
 *    if no digits were consumed, stores the value, executes the statement
 *    'start' (may be empty) and returns the number of characters parsed.
 *
 * NOTE(review): show emits hex without a "0x" prefix while set
 * auto-detects the base, so a value read back and written unchanged is
 * not guaranteed to round-trip.
 */
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s, char *buf) {                    \
                return sprintf(buf, "%lx\n", (unsigned long)var);                  \
        }                                                                          \
        static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
                char *end;                                                         \
                unsigned long new = simple_strtoul(buf, &end, 0);                  \
                if (end == buf) return -EINVAL;                                    \
                var = new;                                                         \
                start;                                                             \
                return end-buf;                                                    \
        }                                                                          \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 759
/* TBD should generate these dynamically based on number of available banks */
/* One attribute per bank[] slot; writing any of them calls mce_restart(). */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
 767
 768 static ssize_t show_trigger(struct sys_device *s, char *buf)
 769 {
 770         strcpy(buf, trigger);
 771         strcat(buf, "\n");
 772         return strlen(trigger) + 1;
 773 }
 774
 775 static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
 776 {
 777         char *p;
 778         int len;
 779         strncpy(trigger, buf, sizeof(trigger));
 780         trigger[sizeof(trigger)-1] = 0;
 781         len = strlen(trigger);
 782         p = strchr(trigger, '\n');
 783         if (*p) *p = 0;
 784         return len;
 785 }
 786
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
/* Changing 'tolerant' needs no re-initialization, hence the empty action. */
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
/* NULL-terminated list of the attributes created for every CPU's sysdev. */
static struct sysdev_attribute *mce_attributes[] = {
        &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
        &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
        &attr_tolerant, &attr_check_interval, &attr_trigger,
        NULL
};
 796
 797 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 798 static __cpuinit int mce_create_device(unsigned int cpu)
 799 {
 800         int err;
 801         int i;
 802         if (!mce_available(&cpu_data[cpu]))
 803                 return -EIO;
 804
 805         memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 806         per_cpu(device_mce,cpu).id = cpu;
 807         per_cpu(device_mce,cpu).cls = &mce_sysclass;
 808
 809         err = sysdev_register(&per_cpu(device_mce,cpu));
 810         if (err)
 811                 return err;
 812
 813         for (i = 0; mce_attributes[i]; i++) {
 814                 err = sysdev_create_file(&per_cpu(device_mce,cpu),
 815                                          mce_attributes[i]);
 816                 if (err)
 817                         goto error;
 818         }
 819
 820         return 0;
 821 error:
 822         while (i--) {
 823                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 824                                    mce_attributes[i]);
 825         }
 826         sysdev_unregister(&per_cpu(device_mce,cpu));
 827
 828         return err;
 829 }
 830
 831 static void mce_remove_device(unsigned int cpu)
 832 {
 833         int i;
 834
 835         for (i = 0; mce_attributes[i]; i++)
 836                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 837                         mce_attributes[i]);
 838         sysdev_unregister(&per_cpu(device_mce,cpu));
 839 }
 840
 841 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 842 static int
 843 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 844 {
 845         unsigned int cpu = (unsigned long)hcpu;
 846         int err = 0;
 847
 848         switch (action) {
 849         case CPU_UP_PREPARE:
 850         case CPU_UP_PREPARE_FROZEN:
 851                 err = mce_create_device(cpu);
 852                 break;
 853         case CPU_UP_CANCELED:
 854         case CPU_UP_CANCELED_FROZEN:
 855         case CPU_DEAD:
 856         case CPU_DEAD_FROZEN:
 857                 mce_remove_device(cpu);
 858                 break;
 859         }
 860         return err ? NOTIFY_BAD : NOTIFY_OK;
 861 }
 862
/* Registered with register_hotcpu_notifier() in mce_init_device(). */
static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};
 866
 867 static __init int mce_init_device(void)
 868 {
 869         int err;
 870         int i = 0;
 871
 872         if (!mce_available(&boot_cpu_data))
 873                 return -EIO;
 874         err = sysdev_class_register(&mce_sysclass);
 875         if (err)
 876                 return err;
 877
 878         for_each_online_cpu(i) {
 879                 err = mce_create_device(i);
 880                 if (err)
 881                         return err;
 882         }
 883
 884         register_hotcpu_notifier(&mce_cpu_notifier);
 885         misc_register(&mce_log_device);
 886         return err;
 887 }
 888
 889 device_initcall(mce_init_device);