arch/x86_64/kernel/mce.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  */
   7
   8 #include <linux/init.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/sched.h>
  12 #include <linux/string.h>
  13 #include <linux/rcupdate.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/sysdev.h>
  16 #include <linux/miscdevice.h>
  17 #include <linux/fs.h>
  18 #include <linux/capability.h>
  19 #include <linux/cpu.h>
  20 #include <linux/percpu.h>
  21 #include <linux/poll.h>
  22 #include <linux/thread_info.h>
  23 #include <linux/ctype.h>
  24 #include <linux/kmod.h>
  25 #include <linux/kdebug.h>
  26 #include <asm/processor.h>
  27 #include <asm/msr.h>
  28 #include <asm/mce.h>
  29 #include <asm/uaccess.h>
  30 #include <asm/smp.h>
  31 #include <asm/idle.h>
  32
/* Minor number used when registering the mcelog misc device. */
#define MISC_MCELOG_MINOR 227
/* Upper bound on the number of machine check banks handled by this file. */
#define NR_BANKS 6

/* Incremented on entry to do_machine_check() and decremented on exit. */
atomic_t mce_entry;

/* When non-zero, mcheck_init() returns without initializing anything. */
static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
/* Number of banks in use; set by mce_init() and capped at NR_BANKS. */
static int banks;
/*
 * Per-bank value written to the bank's CTL MSR by mce_init().  A value of
 * zero makes do_machine_check() skip that bank.
 */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
/* Bit 0 is set by mce_log() and consumed by mce_notify_user(). */
static unsigned long notify_user;
/* If non-zero, the MSR from which mce_get_rip() reads the RIP. */
static int rip_msr;
/* Non-zero: records found by mce_init() at initialization are logged. */
static int mce_bootlog = 1;
/* Incremented once for every call to mce_log(). */
static atomic_t mce_events;

/*
 * Program started through call_usermodehelper() by mce_notify_user();
 * nothing is started while the first byte is zero.
 */
static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

/* Woken by mce_notify_user(); mce_poll() registers on it. */
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  59
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */
  65
/*
 * The global machine check log.  The two positional initializers are
 * MCE_LOG_SIGNATURE and MCE_LOG_LEN; all remaining members start out zero.
 */
struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};
  70
/*
 * Append one machine check record to the global log without taking locks.
 *
 * @mce: record to store; its 'finished' field is cleared before it is copied.
 *
 * A slot is claimed by advancing mcelog.next with cmpxchg().  The record is
 * then copied into the slot and the slot is marked finished.  When no free
 * slot is left the record is dropped and MCE_OVERFLOW is set in
 * mcelog.flags.  After a successful store bit 0 of notify_user is set.
 */
void mce_log(struct mce *mce)
{
        unsigned next, entry;
        atomic_inc(&mce_events);
        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                /* The rmb forces the compiler to reload next in each
                    iteration */
                rmb();
                for (;;) {
                        /* When the buffer fills up discard new entries. Assume
                           that the earlier errors are the more interesting. */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW, &mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip. */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                /* Claim the slot; retry from the top if mcelog.next moved. */
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        /* Make the record contents visible before marking it finished. */
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        set_bit(0, &notify_user);
}
 108
 109 static void print_mce(struct mce *m)
 110 {
 111         printk(KERN_EMERG "\n"
 112                KERN_EMERG "HARDWARE ERROR\n"
 113                KERN_EMERG
 114                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 115                m->cpu, m->mcgstatus, m->bank, m->status);
 116         if (m->rip) {
 117                 printk(KERN_EMERG
 118                        "RIP%s %02x:<%016Lx> ",
 119                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 120                        m->cs, m->rip);
 121                 if (m->cs == __KERNEL_CS)
 122                         print_symbol("{%s}", m->rip);
 123                 printk("\n");
 124         }
 125         printk(KERN_EMERG "TSC %Lx ", m->tsc);
 126         if (m->addr)
 127                 printk("ADDR %Lx ", m->addr);
 128         if (m->misc)
 129                 printk("MISC %Lx ", m->misc);
 130         printk("\n");
 131         printk(KERN_EMERG "This is not a software problem!\n");
 132         printk(KERN_EMERG
 133     "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 134 }
 135
 136 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 137 {
 138         int i;
 139
 140         oops_begin();
 141         for (i = 0; i < MCE_LOG_LEN; i++) {
 142                 unsigned long tsc = mcelog.entry[i].tsc;
 143                 if (time_before(tsc, start))
 144                         continue;
 145                 print_mce(&mcelog.entry[i]);
 146                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 147                         backup = NULL;
 148         }
 149         if (backup)
 150                 print_mce(backup);
 151         panic(msg);
 152 }
 153
 154 static int mce_available(struct cpuinfo_x86 *c)
 155 {
 156         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 157 }
 158
 159 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 160 {
 161         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 162                 m->rip = regs->rip;
 163                 m->cs = regs->cs;
 164         } else {
 165                 m->rip = 0;
 166                 m->cs = 0;
 167         }
 168         if (rip_msr) {
 169                 /* Assume the RIP in the MSR is exact. Is this true? */
 170                 m->mcgstatus |= MCG_STATUS_EIPV;
 171                 rdmsrl(rip_msr, m->rip);
 172                 m->cs = 0;
 173         }
 174 }
 175
/*
 * The actual machine check handler
 *
 * @regs:       register state of the interrupted context, or NULL when
 *              called from the polling timer or from mce_init().
 * @error_code: a negative value suppresses the per-record TSC stamp; -2
 *              additionally suppresses logging of the record.
 *
 * Every bank with a non-zero bank[] entry is read; valid records are
 * logged.  Only when @regs is non-NULL does the handler go on to decide
 * between panic, killing the current task with SIGBUS, or merely flagging
 * a userspace notification.  Unless no banks are configured, all bank
 * status registers and MCG_STATUS are cleared before returning.
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
        struct mce m, panicm;
        u64 mcestart = 0;
        int i;
        int panicm_found = 0;
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE.  If tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;

        atomic_inc(&mce_entry);

        if (regs)
                notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
        if (!banks)
                goto out2;

        memset(&m, 0, sizeof(struct mce));
        m.cpu = smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        /* if the restart IP is not valid, we're done for */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                no_way_out = 1;

        /* Timestamp used by mce_panic() to skip older log entries. */
        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                /* A zero control value means this bank is not examined. */
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                if (m.status & MCI_STATUS_EN) {
                        /* if PCC was set, there's no way out */
                        no_way_out |= !!(m.status & MCI_STATUS_PCC);
                        /*
                         * If this error was uncorrectable and there was
                         * an overflow, we're in trouble.  If no overflow,
                         * we might get away with just killing a task.
                         */
                        if (m.status & MCI_STATUS_UC) {
                                if (tolerant < 1 || m.status & MCI_STATUS_OVER)
                                        no_way_out = 1;
                                kill_it = 1;
                        }
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                /* Negative error codes leave the record's TSC at zero. */
                if (error_code >= 0)
                        rdtscll(m.tsc);
                /* -2 means: scan and clear, but do not log. */
                if (error_code != -2)
                        mce_log(&m);

                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
                   and that there is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }

                add_taint(TAINT_MACHINE_CHECK);
        }

        /* Never do anything final in the polling timer */
        if (!regs)
                goto out;

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
                panicm = m;

        /*
         * If we have decided that we just CAN'T continue, and the user
         *  has not set tolerant to an insane level, give up and die.
         */
        if (no_way_out && tolerant < 3)
                mce_panic("Machine check", &panicm, mcestart);

        /*
         * If the error seems to be unrecoverable, something should be
         * done.  Try to kill as little as possible.  If we can kill just
         * one task, do that.  If the user has set the tolerance very
         * high, don't try to do anything at all.
         */
        if (kill_it && tolerant < 3) {
                int user_space = 0;

                /*
                 * If the EIPV bit is set, it means the saved IP is the
                 * instruction which caused the MCE.
                 */
                /* NOTE(review): this tests m (last bank scanned), not panicm. */
                if (m.mcgstatus & MCG_STATUS_EIPV)
                        user_space = panicm.rip && (panicm.cs & 3);

                /*
                 * If we know that the error was in user space, send a
                 * SIGBUS.  Otherwise, panic if tolerance is low.
                 *
                 * do_exit() takes an awful lot of locks and has a slight
                 * risk of deadlocking.
                 */
                if (user_space) {
                        do_exit(SIGBUS);
                } else if (panic_on_oops || tolerant < 2) {
                        mce_panic("Uncorrected machine check",
                                &panicm, mcestart);
                }
        }

        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);

 out:
        /* the last thing we do is clear state */
        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
        atomic_dec(&mce_entry);
}
 322
 323 #ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 *
 * The record is logged under bank MCE_THERMAL_BANK and stamped with the
 * current TSC; all other fields are zero.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
        struct mce m;

        memset(&m, 0, sizeof(m));
        m.cpu = cpu;
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        rdtscll(m.tsc);
        mce_log(&m);
}
 348 #endif /* CONFIG_X86_MCE_INTEL */
 349
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 *
 * The interval is never reduced below HZ/100 jiffies.  When check_interval
 * is zero, periodic_mcheck_init() and mce_restart() do not schedule the
 * work at all.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
 360
 361 static void mcheck_check_cpu(void *info)
 362 {
 363         if (mce_available(&current_cpu_data))
 364                 do_machine_check(NULL, 0);
 365 }
 366
 367 static void mcheck_timer(struct work_struct *work)
 368 {
 369         on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 370
 371         /*
 372          * Alert userspace if needed.  If we logged an MCE, reduce the
 373          * polling interval, otherwise increase the polling interval.
 374          */
 375         if (mce_notify_user()) {
 376                 next_interval = max(next_interval/2, HZ/100);
 377         } else {
 378                 next_interval = min(next_interval*2, check_interval*HZ);
 379         }
 380
 381         schedule_delayed_work(&mcheck_work, next_interval);
 382 }
 383
 384 /*
 385  * This is only called from process context.  This is where we do
 386  * anything we need to alert userspace about new MCEs.  This is called
 387  * directly from the poller and also from entry.S and idle, thanks to
 388  * TIF_MCE_NOTIFY.
 389  */
 390 int mce_notify_user(void)
 391 {
 392         clear_thread_flag(TIF_MCE_NOTIFY);
 393         if (test_and_clear_bit(0, &notify_user)) {
 394                 static unsigned long last_print;
 395                 unsigned long now = jiffies;
 396
 397                 wake_up_interruptible(&mce_wait);
 398                 if (trigger[0])
 399                         call_usermodehelper(trigger, trigger_argv, NULL,
 400                                                 UMH_NO_WAIT);
 401
 402                 if (time_after_eq(now, last_print + (check_interval*HZ))) {
 403                         last_print = now;
 404                         printk(KERN_INFO "Machine check events logged\n");
 405                 }
 406
 407                 return 1;
 408         }
 409         return 0;
 410 }
 411
 412 /* see if the idle task needs to notify userspace */
 413 static int
 414 mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
 415 {
 416         /* IDLE_END should be safe - interrupts are back on */
 417         if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
 418                 mce_notify_user();
 419
 420         return NOTIFY_OK;
 421 }
 422
/* Registered with the idle notifier chain by periodic_mcheck_init(). */
static struct notifier_block mce_idle_notifier = {
        .notifier_call = mce_idle_callback,
};
 426
 427 static __init int periodic_mcheck_init(void)
 428 {
 429         next_interval = check_interval * HZ;
 430         if (next_interval)
 431                 schedule_delayed_work(&mcheck_work, next_interval);
 432         idle_notifier_register(&mce_idle_notifier);
 433         return 0;
 434 }
 435 __initcall(periodic_mcheck_init);
 436
 437
 438 /*
 439  * Initialize Machine Checks for a CPU.
 440  */
 441 static void mce_init(void *dummy)
 442 {
 443         u64 cap;
 444         int i;
 445
 446         rdmsrl(MSR_IA32_MCG_CAP, cap);
 447         banks = cap & 0xff;
 448         if (banks > NR_BANKS) {
 449                 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
 450                 banks = NR_BANKS;
 451         }
 452         /* Use accurate RIP reporting if available. */
 453         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 454                 rip_msr = MSR_IA32_MCG_EIP;
 455
 456         /* Log the machine checks left over from the previous reset.
 457            This also clears all registers */
 458         do_machine_check(NULL, mce_bootlog ? -1 : -2);
 459
 460         set_in_cr4(X86_CR4_MCE);
 461
 462         if (cap & MCG_CTL_P)
 463                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 464
 465         for (i = 0; i < banks; i++) {
 466                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 467                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 468         }
 469 }
 470
 471 /* Add per CPU specific workarounds here */
 472 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 473 {
 474         /* This should be disabled by the BIOS, but isn't always */
 475         if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
 476                 /* disable GART TBL walk error reporting, which trips off
 477                    incorrectly with the IOMMU & 3ware & Cerberus. */
 478                 clear_bit(10, &bank[4]);
 479                 /* Lots of broken BIOS around that don't clear them
 480                    by default and leave crap in there. Don't log. */
 481                 mce_bootlog = 0;
 482         }
 483
 484 }
 485
 486 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 487 {
 488         switch (c->x86_vendor) {
 489         case X86_VENDOR_INTEL:
 490                 mce_intel_feature_init(c);
 491                 break;
 492         case X86_VENDOR_AMD:
 493                 mce_amd_feature_init(c);
 494                 break;
 495         default:
 496                 break;
 497         }
 498 }
 499
 500 /*
 501  * Called for each booted CPU to set up machine checks.
 502  * Must be called with preempt off.
 503  */
 504 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 505 {
 506         static cpumask_t mce_cpus = CPU_MASK_NONE;
 507
 508         mce_cpu_quirks(c);
 509
 510         if (mce_dont_init ||
 511             cpu_test_and_set(smp_processor_id(), mce_cpus) ||
 512             !mce_available(c))
 513                 return;
 514
 515         mce_init(NULL);
 516         mce_cpu_features(c);
 517 }
 518
 519 /*
 520  * Character device to read and clear the MCE log.
 521  */
 522
/* Protects open_count and open_exclu in mce_open() and mce_release(). */
static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;  /* #times opened */
static int open_exclu;  /* already open exclusive? */
 526
 527 static int mce_open(struct inode *inode, struct file *file)
 528 {
 529         spin_lock(&mce_state_lock);
 530
 531         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 532                 spin_unlock(&mce_state_lock);
 533                 return -EBUSY;
 534         }
 535
 536         if (file->f_flags & O_EXCL)
 537                 open_exclu = 1;
 538         open_count++;
 539
 540         spin_unlock(&mce_state_lock);
 541
 542         return nonseekable_open(inode, file);
 543 }
 544
 545 static int mce_release(struct inode *inode, struct file *file)
 546 {
 547         spin_lock(&mce_state_lock);
 548
 549         open_count--;
 550         open_exclu = 0;
 551
 552         spin_unlock(&mce_state_lock);
 553
 554         return 0;
 555 }
 556
 557 static void collect_tscs(void *data)
 558 {
 559         unsigned long *cpu_tsc = (unsigned long *)data;
 560         rdtscll(cpu_tsc[smp_processor_id()]);
 561 }
 562
/*
 * read() handler for the mcelog device: copy the logged records to
 * userspace and clear the log.
 *
 * Only reads at offset 0 with room for the whole log are accepted;
 * anything else fails with -EINVAL.  Returns the number of bytes copied,
 * -ENOMEM if the per-CPU TSC scratch array cannot be allocated, or -EFAULT
 * if any copy_to_user() failed.  Concurrent readers are serialized by a
 * function-local semaphore.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
        unsigned long *cpu_tsc;
        static DECLARE_MUTEX(mce_read_sem);
        unsigned next;
        char __user *buf = ubuf;
        int i, err;

        cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        down(&mce_read_sem);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                up(&mce_read_sem);
                kfree(cpu_tsc);
                return -EINVAL;
        }

        err = 0;
        for (i = 0; i < next; i++) {
                unsigned long start = jiffies;
                /*
                 * The slot was claimed but may still be being written.
                 * Wait up to two jiffies; after that wipe it and skip it.
                 */
                while (!mcelog.entry[i].finished) {
                        if (time_after_eq(jiffies, start + 2)) {
                                memset(mcelog.entry + i,0, sizeof(struct mce));
                                goto timeout;
                        }
                        cpu_relax();
                }
                smp_rmb();
                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
                buf += sizeof(struct mce);
 timeout:
                ;
        }

        /* Clear everything copied so far and make the log empty again. */
        memset(mcelog.entry, 0, next * sizeof(struct mce));
        mcelog.next = 0;

        synchronize_sched();

        /* Collect entries that were still getting written before the synchronize. */

        on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
        for (i = next; i < MCE_LOG_LEN; i++) {
                /*
                 * Only take finished entries stamped before the TSC just
                 * sampled on the CPU that logged them.
                 */
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        up(&mce_read_sem);
        kfree(cpu_tsc);
        return err ? -EFAULT : buf - ubuf;
}
 623
 624 static unsigned int mce_poll(struct file *file, poll_table *wait)
 625 {
 626         poll_wait(file, &mce_wait, wait);
 627         if (rcu_dereference(mcelog.next))
 628                 return POLLIN | POLLRDNORM;
 629         return 0;
 630 }
 631
 632 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
 633 {
 634         int __user *p = (int __user *)arg;
 635         if (!capable(CAP_SYS_ADMIN))
 636                 return -EPERM;
 637         switch (cmd) {
 638         case MCE_GET_RECORD_LEN:
 639                 return put_user(sizeof(struct mce), p);
 640         case MCE_GET_LOG_LEN:
 641                 return put_user(MCE_LOG_LEN, p);
 642         case MCE_GETCLEAR_FLAGS: {
 643                 unsigned flags;
 644                 do {
 645                         flags = mcelog.flags;
 646                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 647                 return put_user(flags, p);
 648         }
 649         default:
 650                 return -ENOTTY;
 651         }
 652 }
 653
/* File operations of the mcelog character device. */
static const struct file_operations mce_chrdev_ops = {
        .open = mce_open,
        .release = mce_release,
        .read = mce_read,
        .poll = mce_poll,
        .ioctl = mce_ioctl,
};
 661
 662 static struct miscdevice mce_log_device = {
 663         MISC_MCELOG_MINOR,
 664         "mcelog",
 665         &mce_chrdev_ops,
 666 };
 667
 668 /*
 669  * Old style boot options parsing. Only for compatibility.
 670  */
 671
/* "nomce" boot option: prevent mcheck_init() from initializing anything. */
static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 1;
}
 677
 678 /* mce=off disables machine check. Note you can reenable it later
 679    using sysfs.
 680    mce=TOLERANCELEVEL (number, see above)
 681    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 682    mce=nobootlog Don't log MCEs from before booting. */
 683 static int __init mcheck_enable(char *str)
 684 {
 685         if (*str == '=')
 686                 str++;
 687         if (!strcmp(str, "off"))
 688                 mce_dont_init = 1;
 689         else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
 690                 mce_bootlog = str[0] == 'b';
 691         else if (isdigit(str[0]))
 692                 get_option(&str, &tolerant);
 693         else
 694                 printk("mce= argument %s ignored. Please use /sys", str);
 695         return 1;
 696 }
 697
/* Boot command line hooks: "nomce" and "mce[=...]". */
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
 700
 701 /*
 702  * Sysfs support
 703  */
 704
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug.  Re-runs mce_init() on the current CPU and returns 0. */
static int mce_resume(struct sys_device *dev)
{
        mce_init(NULL);
        return 0;
}
 713
 714 /* Reinit MCEs after user configuration changes */
 715 static void mce_restart(void)
 716 {
 717         if (next_interval)
 718                 cancel_delayed_work(&mcheck_work);
 719         /* Timer race is harmless here */
 720         on_each_cpu(mce_init, NULL, 1, 1);
 721         next_interval = check_interval * HZ;
 722         if (next_interval)
 723                 schedule_delayed_work(&mcheck_work, next_interval);
 724 }
 725
/* sysdev class "machinecheck"; its resume hook re-initializes MCE state. */
static struct sysdev_class mce_sysclass = {
        .resume = mce_resume,
        set_kset_name("machinecheck"),
};

/* One sysdev per CPU, registered by mce_create_device(). */
DEFINE_PER_CPU(struct sys_device, device_mce);
 732
/* Why are there no generic functions for this? */
/*
 * ACCESSOR(name, var, start) defines a sysdev attribute 'name' backed by
 * the variable 'var':
 *   show_<name>() prints var as hexadecimal followed by a newline;
 *   set_<name>()  parses a number with simple_strtoul() (base 0), returns
 *                 -EINVAL if nothing was parsed, otherwise stores it in
 *                 var, executes the statement 'start' and returns the
 *                 number of characters consumed.
 * The expansion already ends in a semicolon.
 */
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s, char *buf) {                    \
                return sprintf(buf, "%lx\n", (unsigned long)var);                  \
        }                                                                          \
        static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
                char *end;                                                         \
                unsigned long new = simple_strtoul(buf, &end, 0);                  \
                if (end == buf) return -EINVAL;                                    \
                var = new;                                                         \
                start;                                                             \
                return end-buf;                                                    \
        }                                                                          \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 747
/* TBD should generate these dynamically based on number of available banks */
/* One attribute per bank[] entry; a write re-initializes MCE on all CPUs. */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
 755
 756 static ssize_t show_trigger(struct sys_device *s, char *buf)
 757 {
 758         strcpy(buf, trigger);
 759         strcat(buf, "\n");
 760         return strlen(trigger) + 1;
 761 }
 762
 763 static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
 764 {
 765         char *p;
 766         int len;
 767         strncpy(trigger, buf, sizeof(trigger));
 768         trigger[sizeof(trigger)-1] = 0;
 769         len = strlen(trigger);
 770         p = strchr(trigger, '\n');
 771         if (*p) *p = 0;
 772         return len;
 773 }
 774
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
/* Writing 'tolerant' takes effect directly; no restart statement is run. */
ACCESSOR(tolerant,tolerant,)
/* Writing 'check_interval' restarts MCE handling, which re-arms the poller. */
ACCESSOR(check_interval,check_interval,mce_restart())
/* NULL-terminated list of the attributes created for every CPU's sysdev. */
static struct sysdev_attribute *mce_attributes[] = {
        &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
        &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
        &attr_tolerant, &attr_check_interval, &attr_trigger,
        NULL
};
 784
 785 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 786 static __cpuinit int mce_create_device(unsigned int cpu)
 787 {
 788         int err;
 789         int i;
 790         if (!mce_available(&cpu_data[cpu]))
 791                 return -EIO;
 792
 793         per_cpu(device_mce,cpu).id = cpu;
 794         per_cpu(device_mce,cpu).cls = &mce_sysclass;
 795
 796         err = sysdev_register(&per_cpu(device_mce,cpu));
 797
 798         if (!err) {
 799                 for (i = 0; mce_attributes[i]; i++)
 800                         sysdev_create_file(&per_cpu(device_mce,cpu),
 801                                 mce_attributes[i]);
 802         }
 803         return err;
 804 }
 805
 806 static void mce_remove_device(unsigned int cpu)
 807 {
 808         int i;
 809
 810         for (i = 0; mce_attributes[i]; i++)
 811                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 812                         mce_attributes[i]);
 813         sysdev_unregister(&per_cpu(device_mce,cpu));
 814         memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 815 }
 816
 817 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 818 static int
 819 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 820 {
 821         unsigned int cpu = (unsigned long)hcpu;
 822
 823         switch (action) {
 824         case CPU_ONLINE:
 825         case CPU_ONLINE_FROZEN:
 826                 mce_create_device(cpu);
 827                 break;
 828         case CPU_DEAD:
 829         case CPU_DEAD_FROZEN:
 830                 mce_remove_device(cpu);
 831                 break;
 832         }
 833         return NOTIFY_OK;
 834 }
 835
/* Registered as a CPU hotplug notifier by mce_init_device(). */
static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};
 839
 840 static __init int mce_init_device(void)
 841 {
 842         int err;
 843         int i = 0;
 844
 845         if (!mce_available(&boot_cpu_data))
 846                 return -EIO;
 847         err = sysdev_class_register(&mce_sysclass);
 848
 849         for_each_online_cpu(i) {
 850                 mce_create_device(i);
 851         }
 852
 853         register_hotcpu_notifier(&mce_cpu_notifier);
 854         misc_register(&mce_log_device);
 855         return err;
 856 }
 857
 858 device_initcall(mce_init_device);