arch/x86_64/kernel/mce.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  */
   7
   8 #include <linux/init.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/sched.h>
  12 #include <linux/string.h>
  13 #include <linux/rcupdate.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/sysdev.h>
  16 #include <linux/miscdevice.h>
  17 #include <linux/fs.h>
  18 #include <linux/capability.h>
  19 #include <linux/cpu.h>
  20 #include <linux/percpu.h>
  21 #include <linux/ctype.h>
  22 #include <linux/kmod.h>
  23 #include <linux/kdebug.h>
  24 #include <asm/processor.h>
  25 #include <asm/msr.h>
  26 #include <asm/mce.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/smp.h>
  29
  30 #define MISC_MCELOG_MINOR 227
  31 #define NR_BANKS 6
  32
  33 atomic_t mce_entry;
  34
  35 static int mce_dont_init;
  36
  37 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
  38    3: never panic or exit (for testing only) */
  39 static int tolerant = 1;
  40 static int banks;
  41 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
  42 static unsigned long console_logged;
  43 static int notify_user;
  44 static int rip_msr;
  45 static int mce_bootlog = 1;
  46 static atomic_t mce_events;
  47
  48 static char trigger[128];
  49 static char *trigger_argv[2] = { trigger, NULL };
  50
  51 /*
  52  * Lockless MCE logging infrastructure.
  53  * This avoids deadlocks on printk locks without having to break locks. Also
  54  * separate MCEs from kernel messages to avoid bogus bug reports.
  55  */
  56
  57 struct mce_log mcelog = {
  58         MCE_LOG_SIGNATURE,
  59         MCE_LOG_LEN,
  60 };
  61
  62 void mce_log(struct mce *mce)
  63 {
  64         unsigned next, entry;
  65         atomic_inc(&mce_events);
  66         mce->finished = 0;
  67         wmb();
  68         for (;;) {
  69                 entry = rcu_dereference(mcelog.next);
  70                 /* The rmb forces the compiler to reload next in each
  71                     iteration */
  72                 rmb();
  73                 for (;;) {
  74                         /* When the buffer fills up discard new entries. Assume
  75                            that the earlier errors are the more interesting. */
  76                         if (entry >= MCE_LOG_LEN) {
  77                                 set_bit(MCE_OVERFLOW, &mcelog.flags);
  78                                 return;
  79                         }
  80                         /* Old left over entry. Skip. */
  81                         if (mcelog.entry[entry].finished) {
  82                                 entry++;
  83                                 continue;
  84                         }
  85                         break;
  86                 }
  87                 smp_rmb();
  88                 next = entry + 1;
  89                 if (cmpxchg(&mcelog.next, entry, next) == entry)
  90                         break;
  91         }
  92         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
  93         wmb();
  94         mcelog.entry[entry].finished = 1;
  95         wmb();
  96
  97         if (!test_and_set_bit(0, &console_logged))
  98                 notify_user = 1;
  99 }
 100
 101 static void print_mce(struct mce *m)
 102 {
 103         printk(KERN_EMERG "\n"
 104                KERN_EMERG "HARDWARE ERROR\n"
 105                KERN_EMERG
 106                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 107                m->cpu, m->mcgstatus, m->bank, m->status);
 108         if (m->rip) {
 109                 printk(KERN_EMERG
 110                        "RIP%s %02x:<%016Lx> ",
 111                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 112                        m->cs, m->rip);
 113                 if (m->cs == __KERNEL_CS)
 114                         print_symbol("{%s}", m->rip);
 115                 printk("\n");
 116         }
 117         printk(KERN_EMERG "TSC %Lx ", m->tsc);
 118         if (m->addr)
 119                 printk("ADDR %Lx ", m->addr);
 120         if (m->misc)
 121                 printk("MISC %Lx ", m->misc);
 122         printk("\n");
 123         printk(KERN_EMERG "This is not a software problem!\n");
 124         printk(KERN_EMERG
 125     "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 126 }
 127
 128 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 129 {
 130         int i;
 131         oops_begin();
 132         for (i = 0; i < MCE_LOG_LEN; i++) {
 133                 unsigned long tsc = mcelog.entry[i].tsc;
 134                 if (time_before(tsc, start))
 135                         continue;
 136                 print_mce(&mcelog.entry[i]);
 137                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 138                         backup = NULL;
 139         }
 140         if (backup)
 141                 print_mce(backup);
 142         if (tolerant >= 3)
 143                 printk("Fake panic: %s\n", msg);
 144         else
 145                 panic(msg);
 146 }
 147
 148 static int mce_available(struct cpuinfo_x86 *c)
 149 {
 150         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 151 }
 152
 153 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 154 {
 155         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 156                 m->rip = regs->rip;
 157                 m->cs = regs->cs;
 158         } else {
 159                 m->rip = 0;
 160                 m->cs = 0;
 161         }
 162         if (rip_msr) {
 163                 /* Assume the RIP in the MSR is exact. Is this true? */
 164                 m->mcgstatus |= MCG_STATUS_EIPV;
 165                 rdmsrl(rip_msr, m->rip);
 166                 m->cs = 0;
 167         }
 168 }
 169
 170 static void do_mce_trigger(void)
 171 {
 172         static atomic_t mce_logged;
 173         int events = atomic_read(&mce_events);
 174         if (events != atomic_read(&mce_logged) && trigger[0]) {
 175                 /* Small race window, but should be harmless.  */
 176                 atomic_set(&mce_logged, events);
 177                 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 178         }
 179 }
 180
 181 /*
 182  * The actual machine check handler
 183  */
 184
 185 void do_machine_check(struct pt_regs * regs, long error_code)
 186 {
 187         struct mce m, panicm;
 188         int nowayout = (tolerant < 1);
 189         int kill_it = 0;
 190         u64 mcestart = 0;
 191         int i;
 192         int panicm_found = 0;
 193
 194         atomic_inc(&mce_entry);
 195
 196         if (regs)
 197                 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
 198         if (!banks)
 199                 goto out2;
 200
 201         memset(&m, 0, sizeof(struct mce));
 202         m.cpu = smp_processor_id();
 203         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 204         if (!(m.mcgstatus & MCG_STATUS_RIPV))
 205                 kill_it = 1;
 206
 207         rdtscll(mcestart);
 208         barrier();
 209
 210         for (i = 0; i < banks; i++) {
 211                 if (!bank[i])
 212                         continue;
 213
 214                 m.misc = 0;
 215                 m.addr = 0;
 216                 m.bank = i;
 217                 m.tsc = 0;
 218
 219                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 220                 if ((m.status & MCI_STATUS_VAL) == 0)
 221                         continue;
 222
 223                 if (m.status & MCI_STATUS_EN) {
 224                         /* In theory _OVER could be a nowayout too, but
 225                            assume any overflowed errors were no fatal. */
 226                         nowayout |= !!(m.status & MCI_STATUS_PCC);
 227                         kill_it |= !!(m.status & MCI_STATUS_UC);
 228                 }
 229
 230                 if (m.status & MCI_STATUS_MISCV)
 231                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 232                 if (m.status & MCI_STATUS_ADDRV)
 233                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 234
 235                 mce_get_rip(&m, regs);
 236                 if (error_code >= 0)
 237                         rdtscll(m.tsc);
 238                 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
 239                 if (error_code != -2)
 240                         mce_log(&m);
 241
 242                 /* Did this bank cause the exception? */
 243                 /* Assume that the bank with uncorrectable errors did it,
 244                    and that there is only a single one. */
 245                 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
 246                         panicm = m;
 247                         panicm_found = 1;
 248                 }
 249
 250                 add_taint(TAINT_MACHINE_CHECK);
 251         }
 252
 253         /* Never do anything final in the polling timer */
 254         if (!regs) {
 255                 /* Normal interrupt context here. Call trigger for any new
 256                    events. */
 257                 do_mce_trigger();
 258                 goto out;
 259         }
 260
 261         /* If we didn't find an uncorrectable error, pick
 262            the last one (shouldn't happen, just being safe). */
 263         if (!panicm_found)
 264                 panicm = m;
 265         if (nowayout)
 266                 mce_panic("Machine check", &panicm, mcestart);
 267         if (kill_it) {
 268                 int user_space = 0;
 269
 270                 if (m.mcgstatus & MCG_STATUS_RIPV)
 271                         user_space = panicm.rip && (panicm.cs & 3);
 272
 273                 /* When the machine was in user space and the CPU didn't get
 274                    confused it's normally not necessary to panic, unless you
 275                    are paranoid (tolerant == 0)
 276
 277                    RED-PEN could be more tolerant for MCEs in idle,
 278                    but most likely they occur at boot anyways, where
 279                    it is best to just halt the machine. */
 280                 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
 281                     (unsigned)current->pid <= 1)
 282                         mce_panic("Uncorrected machine check", &panicm, mcestart);
 283
 284                 /* do_exit takes an awful lot of locks and has as
 285                    slight risk of deadlocking. If you don't want that
 286                    don't set tolerant >= 2 */
 287                 if (tolerant < 3)
 288                         do_exit(SIGBUS);
 289         }
 290
 291  out:
 292         /* Last thing done in the machine check exception to clear state. */
 293         wrmsrl(MSR_IA32_MCG_STATUS, 0);
 294  out2:
 295         atomic_dec(&mce_entry);
 296 }
 297
 298 #ifdef CONFIG_X86_MCE_INTEL
 299 /***
 300  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 301  * @cpu: The CPU on which the event occured.
 302  * @status: Event status information
 303  *
 304  * This function should be called by the thermal interrupt after the
 305  * event has been processed and the decision was made to log the event
 306  * further.
 307  *
 308  * The status parameter will be saved to the 'status' field of 'struct mce'
 309  * and historically has been the register value of the
 310  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 311  */
 312 void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 313 {
 314         struct mce m;
 315
 316         memset(&m, 0, sizeof(m));
 317         m.cpu = cpu;
 318         m.bank = MCE_THERMAL_BANK;
 319         m.status = status;
 320         rdtscll(m.tsc);
 321         mce_log(&m);
 322 }
 323 #endif /* CONFIG_X86_MCE_INTEL */
 324
 325 /*
 326  * Periodic polling timer for "silent" machine check errors.  If the
 327  * poller finds an MCE, poll 2x faster.  When the poller finds no more
 328  * errors, poll 2x slower (up to check_interval seconds).
 329  */
 330
 331 static int check_interval = 5 * 60; /* 5 minutes */
 332 static int next_interval; /* in jiffies */
 333 static void mcheck_timer(struct work_struct *work);
 334 static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
 335
 336 static void mcheck_check_cpu(void *info)
 337 {
 338         if (mce_available(&current_cpu_data))
 339                 do_machine_check(NULL, 0);
 340 }
 341
 342 static void mcheck_timer(struct work_struct *work)
 343 {
 344         on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 345
 346         /*
 347          * It's ok to read stale data here for notify_user and
 348          * console_logged as we'll simply get the updated versions
 349          * on the next mcheck_timer execution and atomic operations
 350          * on console_logged act as synchronization for notify_user
 351          * writes.
 352          */
 353         if (notify_user && console_logged) {
 354                 static unsigned long last_print;
 355                 unsigned long now = jiffies;
 356
 357                 /* if we logged an MCE, reduce the polling interval */
 358                 next_interval = max(next_interval/2, HZ/100);
 359                 notify_user = 0;
 360                 clear_bit(0, &console_logged);
 361                 if (time_after_eq(now, last_print + (check_interval*HZ))) {
 362                         last_print = now;
 363                         printk(KERN_INFO "Machine check events logged\n");
 364                 }
 365         } else {
 366                 next_interval = min(next_interval*2, check_interval*HZ);
 367         }
 368
 369         schedule_delayed_work(&mcheck_work, next_interval);
 370 }
 371
 372
 373 static __init int periodic_mcheck_init(void)
 374 {
 375         next_interval = check_interval * HZ;
 376         if (next_interval)
 377                 schedule_delayed_work(&mcheck_work, next_interval);
 378         return 0;
 379 }
 380 __initcall(periodic_mcheck_init);
 381
 382
 383 /*
 384  * Initialize Machine Checks for a CPU.
 385  */
 386 static void mce_init(void *dummy)
 387 {
 388         u64 cap;
 389         int i;
 390
 391         rdmsrl(MSR_IA32_MCG_CAP, cap);
 392         banks = cap & 0xff;
 393         if (banks > NR_BANKS) {
 394                 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
 395                 banks = NR_BANKS;
 396         }
 397         /* Use accurate RIP reporting if available. */
 398         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 399                 rip_msr = MSR_IA32_MCG_EIP;
 400
 401         /* Log the machine checks left over from the previous reset.
 402            This also clears all registers */
 403         do_machine_check(NULL, mce_bootlog ? -1 : -2);
 404
 405         set_in_cr4(X86_CR4_MCE);
 406
 407         if (cap & MCG_CTL_P)
 408                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 409
 410         for (i = 0; i < banks; i++) {
 411                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 412                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 413         }
 414 }
 415
 416 /* Add per CPU specific workarounds here */
 417 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 418 {
 419         /* This should be disabled by the BIOS, but isn't always */
 420         if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
 421                 /* disable GART TBL walk error reporting, which trips off
 422                    incorrectly with the IOMMU & 3ware & Cerberus. */
 423                 clear_bit(10, &bank[4]);
 424                 /* Lots of broken BIOS around that don't clear them
 425                    by default and leave crap in there. Don't log. */
 426                 mce_bootlog = 0;
 427         }
 428
 429 }
 430
 431 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 432 {
 433         switch (c->x86_vendor) {
 434         case X86_VENDOR_INTEL:
 435                 mce_intel_feature_init(c);
 436                 break;
 437         case X86_VENDOR_AMD:
 438                 mce_amd_feature_init(c);
 439                 break;
 440         default:
 441                 break;
 442         }
 443 }
 444
 445 /*
 446  * Called for each booted CPU to set up machine checks.
 447  * Must be called with preempt off.
 448  */
 449 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 450 {
 451         static cpumask_t mce_cpus = CPU_MASK_NONE;
 452
 453         mce_cpu_quirks(c);
 454
 455         if (mce_dont_init ||
 456             cpu_test_and_set(smp_processor_id(), mce_cpus) ||
 457             !mce_available(c))
 458                 return;
 459
 460         mce_init(NULL);
 461         mce_cpu_features(c);
 462 }
 463
 464 /*
 465  * Character device to read and clear the MCE log.
 466  */
 467
 468 static DEFINE_SPINLOCK(mce_state_lock);
 469 static int open_count;  /* #times opened */
 470 static int open_exclu;  /* already open exclusive? */
 471
 472 static int mce_open(struct inode *inode, struct file *file)
 473 {
 474         spin_lock(&mce_state_lock);
 475
 476         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 477                 spin_unlock(&mce_state_lock);
 478                 return -EBUSY;
 479         }
 480
 481         if (file->f_flags & O_EXCL)
 482                 open_exclu = 1;
 483         open_count++;
 484
 485         spin_unlock(&mce_state_lock);
 486
 487         return 0;
 488 }
 489
 490 static int mce_release(struct inode *inode, struct file *file)
 491 {
 492         spin_lock(&mce_state_lock);
 493
 494         open_count--;
 495         open_exclu = 0;
 496
 497         spin_unlock(&mce_state_lock);
 498
 499         return 0;
 500 }
 501
 502 static void collect_tscs(void *data)
 503 {
 504         unsigned long *cpu_tsc = (unsigned long *)data;
 505         rdtscll(cpu_tsc[smp_processor_id()]);
 506 }
 507
 508 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
 509 {
 510         unsigned long *cpu_tsc;
 511         static DECLARE_MUTEX(mce_read_sem);
 512         unsigned next;
 513         char __user *buf = ubuf;
 514         int i, err;
 515
 516         cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
 517         if (!cpu_tsc)
 518                 return -ENOMEM;
 519
 520         down(&mce_read_sem);
 521         next = rcu_dereference(mcelog.next);
 522
 523         /* Only supports full reads right now */
 524         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 525                 up(&mce_read_sem);
 526                 kfree(cpu_tsc);
 527                 return -EINVAL;
 528         }
 529
 530         err = 0;
 531         for (i = 0; i < next; i++) {
 532                 unsigned long start = jiffies;
 533                 while (!mcelog.entry[i].finished) {
 534                         if (time_after_eq(jiffies, start + 2)) {
 535                                 memset(mcelog.entry + i,0, sizeof(struct mce));
 536                                 goto timeout;
 537                         }
 538                         cpu_relax();
 539                 }
 540                 smp_rmb();
 541                 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
 542                 buf += sizeof(struct mce);
 543  timeout:
 544                 ;
 545         }
 546
 547         memset(mcelog.entry, 0, next * sizeof(struct mce));
 548         mcelog.next = 0;
 549
 550         synchronize_sched();
 551
 552         /* Collect entries that were still getting written before the synchronize. */
 553
 554         on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
 555         for (i = next; i < MCE_LOG_LEN; i++) {
 556                 if (mcelog.entry[i].finished &&
 557                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
 558                         err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
 559                         smp_rmb();
 560                         buf += sizeof(struct mce);
 561                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
 562                 }
 563         }
 564         up(&mce_read_sem);
 565         kfree(cpu_tsc);
 566         return err ? -EFAULT : buf - ubuf;
 567 }
 568
 569 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
 570 {
 571         int __user *p = (int __user *)arg;
 572         if (!capable(CAP_SYS_ADMIN))
 573                 return -EPERM;
 574         switch (cmd) {
 575         case MCE_GET_RECORD_LEN:
 576                 return put_user(sizeof(struct mce), p);
 577         case MCE_GET_LOG_LEN:
 578                 return put_user(MCE_LOG_LEN, p);
 579         case MCE_GETCLEAR_FLAGS: {
 580                 unsigned flags;
 581                 do {
 582                         flags = mcelog.flags;
 583                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 584                 return put_user(flags, p);
 585         }
 586         default:
 587                 return -ENOTTY;
 588         }
 589 }
 590
 591 static const struct file_operations mce_chrdev_ops = {
 592         .open = mce_open,
 593         .release = mce_release,
 594         .read = mce_read,
 595         .ioctl = mce_ioctl,
 596 };
 597
 598 static struct miscdevice mce_log_device = {
 599         MISC_MCELOG_MINOR,
 600         "mcelog",
 601         &mce_chrdev_ops,
 602 };
 603
 604 /*
 605  * Old style boot options parsing. Only for compatibility.
 606  */
 607
 608 static int __init mcheck_disable(char *str)
 609 {
 610         mce_dont_init = 1;
 611         return 1;
 612 }
 613
 614 /* mce=off disables machine check. Note you can reenable it later
 615    using sysfs.
 616    mce=TOLERANCELEVEL (number, see above)
 617    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 618    mce=nobootlog Don't log MCEs from before booting. */
 619 static int __init mcheck_enable(char *str)
 620 {
 621         if (*str == '=')
 622                 str++;
 623         if (!strcmp(str, "off"))
 624                 mce_dont_init = 1;
 625         else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
 626                 mce_bootlog = str[0] == 'b';
 627         else if (isdigit(str[0]))
 628                 get_option(&str, &tolerant);
 629         else
 630                 printk("mce= argument %s ignored. Please use /sys", str);
 631         return 1;
 632 }
 633
 634 __setup("nomce", mcheck_disable);
 635 __setup("mce", mcheck_enable);
 636
 637 /*
 638  * Sysfs support
 639  */
 640
 641 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 642    Only one CPU is active at this time, the others get readded later using
 643    CPU hotplug. */
 644 static int mce_resume(struct sys_device *dev)
 645 {
 646         mce_init(NULL);
 647         return 0;
 648 }
 649
 650 /* Reinit MCEs after user configuration changes */
 651 static void mce_restart(void)
 652 {
 653         if (next_interval)
 654                 cancel_delayed_work(&mcheck_work);
 655         /* Timer race is harmless here */
 656         on_each_cpu(mce_init, NULL, 1, 1);
 657         next_interval = check_interval * HZ;
 658         if (next_interval)
 659                 schedule_delayed_work(&mcheck_work, next_interval);
 660 }
 661
 662 static struct sysdev_class mce_sysclass = {
 663         .resume = mce_resume,
 664         set_kset_name("machinecheck"),
 665 };
 666
 667 DEFINE_PER_CPU(struct sys_device, device_mce);
 668
 669 /* Why are there no generic functions for this? */
 670 #define ACCESSOR(name, var, start) \
 671         static ssize_t show_ ## name(struct sys_device *s, char *buf) {                    \
 672                 return sprintf(buf, "%lx\n", (unsigned long)var);                  \
 673         }                                                                          \
 674         static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
 675                 char *end;                                                         \
 676                 unsigned long new = simple_strtoul(buf, &end, 0);                  \
 677                 if (end == buf) return -EINVAL;                                    \
 678                 var = new;                                                         \
 679                 start;                                                             \
 680                 return end-buf;                                                    \
 681         }                                                                          \
 682         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 683
 684 /* TBD should generate these dynamically based on number of available banks */
 685 ACCESSOR(bank0ctl,bank[0],mce_restart())
 686 ACCESSOR(bank1ctl,bank[1],mce_restart())
 687 ACCESSOR(bank2ctl,bank[2],mce_restart())
 688 ACCESSOR(bank3ctl,bank[3],mce_restart())
 689 ACCESSOR(bank4ctl,bank[4],mce_restart())
 690 ACCESSOR(bank5ctl,bank[5],mce_restart())
 691
 692 static ssize_t show_trigger(struct sys_device *s, char *buf)
 693 {
 694         strcpy(buf, trigger);
 695         strcat(buf, "\n");
 696         return strlen(trigger) + 1;
 697 }
 698
 699 static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
 700 {
 701         char *p;
 702         int len;
 703         strncpy(trigger, buf, sizeof(trigger));
 704         trigger[sizeof(trigger)-1] = 0;
 705         len = strlen(trigger);
 706         p = strchr(trigger, '\n');
 707         if (*p) *p = 0;
 708         return len;
 709 }
 710
 711 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 712 ACCESSOR(tolerant,tolerant,)
 713 ACCESSOR(check_interval,check_interval,mce_restart())
 714 static struct sysdev_attribute *mce_attributes[] = {
 715         &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
 716         &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 717         &attr_tolerant, &attr_check_interval, &attr_trigger,
 718         NULL
 719 };
 720
 721 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 722 static __cpuinit int mce_create_device(unsigned int cpu)
 723 {
 724         int err;
 725         int i;
 726         if (!mce_available(&cpu_data[cpu]))
 727                 return -EIO;
 728
 729         per_cpu(device_mce,cpu).id = cpu;
 730         per_cpu(device_mce,cpu).cls = &mce_sysclass;
 731
 732         err = sysdev_register(&per_cpu(device_mce,cpu));
 733
 734         if (!err) {
 735                 for (i = 0; mce_attributes[i]; i++)
 736                         sysdev_create_file(&per_cpu(device_mce,cpu),
 737                                 mce_attributes[i]);
 738         }
 739         return err;
 740 }
 741
 742 static void mce_remove_device(unsigned int cpu)
 743 {
 744         int i;
 745
 746         for (i = 0; mce_attributes[i]; i++)
 747                 sysdev_remove_file(&per_cpu(device_mce,cpu),
 748                         mce_attributes[i]);
 749         sysdev_unregister(&per_cpu(device_mce,cpu));
 750         memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
 751 }
 752
 753 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 754 static int
 755 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 756 {
 757         unsigned int cpu = (unsigned long)hcpu;
 758
 759         switch (action) {
 760         case CPU_ONLINE:
 761         case CPU_ONLINE_FROZEN:
 762                 mce_create_device(cpu);
 763                 break;
 764         case CPU_DEAD:
 765         case CPU_DEAD_FROZEN:
 766                 mce_remove_device(cpu);
 767                 break;
 768         }
 769         return NOTIFY_OK;
 770 }
 771
 772 static struct notifier_block mce_cpu_notifier = {
 773         .notifier_call = mce_cpu_callback,
 774 };
 775
 776 static __init int mce_init_device(void)
 777 {
 778         int err;
 779         int i = 0;
 780
 781         if (!mce_available(&boot_cpu_data))
 782                 return -EIO;
 783         err = sysdev_class_register(&mce_sysclass);
 784
 785         for_each_online_cpu(i) {
 786                 mce_create_device(i);
 787         }
 788
 789         register_hotcpu_notifier(&mce_cpu_notifier);
 790         misc_register(&mce_log_device);
 791         return err;
 792 }
 793
 794 device_initcall(mce_init_device);