Merge branch 'x86/mce2' into x86/core

author Ingo Molnar <mingo@elte.hu>

Thu, 5 Mar 2009 20:49:25 +0000 (21:49 +0100)

committer Ingo Molnar <mingo@elte.hu>

Thu, 5 Mar 2009 20:49:25 +0000 (21:49 +0100)
author Ingo Molnar <mingo@elte.hu>
Thu, 5 Mar 2009 20:49:25 +0000 (21:49 +0100)
committer Ingo Molnar <mingo@elte.hu>
Thu, 5 Mar 2009 20:49:25 +0000 (21:49 +0100)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index f5cef3fbf9a5b21588b71add9f3abff0af847e99..31758378bcd2707f7dd2803ecd9a98d2d9a9a063 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -783,6 +783,11 @@ config X86_MCE_AMD
            Additional support for AMD specific MCE features such as
            the DRAM Error Threshold.
  
+config X86_MCE_THRESHOLD
+       depends on X86_MCE_AMD || X86_MCE_INTEL
+       bool
+       default y
+
  config X86_MCE_NONFATAL
         tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
         depends on X86_32 && X86_MCE
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h

index 63134e31e8b933acb973ee2e66f62a2f815bb326..bc9514fb3b13f70d75b11f037546eca8e43d07fd 100644 (file)
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
  #define                APIC_ESR_SENDILL        0x00020
  #define                APIC_ESR_RECVILL        0x00040
  #define                APIC_ESR_ILLREGA        0x00080
+#define        APIC_LVTCMCI    0x2f0
  #define        APIC_ICR        0x300
  #define                APIC_DEST_SELF          0x40000
  #define                APIC_DEST_ALLINC        0x80000
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h

index 32c6e17b960b7aed994078dc1d5c1fb4e547b2a4..563933e06a35fa48f6c828d3884f9b35886ab4eb 100644 (file)
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
   */
  
  #define MCG_CTL_P       (1UL<<8)   /* MCG_CAP register available */
+#define MCG_EXT_P       (1ULL<<9)   /* Extended registers available */
+#define MCG_CMCI_P      (1ULL<<10)  /* CMCI supported */
  
  #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
  #define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
  
  #include <asm/atomic.h>
  
+void mce_setup(struct mce *m);
  void mce_log(struct mce *m);
  DECLARE_PER_CPU(struct sys_device, device_mce);
  extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
  
+/*
+ * Supporting more than 128 banks would first require escaping the
+ * Linux-defined extended banks.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+
  #ifdef CONFIG_X86_MCE_INTEL
  void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
  #else
  static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
  #endif
  
  #ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
  static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
  #endif
  
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+extern int mce_available(struct cpuinfo_x86 *c);
+
+void mce_log_therm_throt_event(__u64 status);
  
  extern atomic_t mce_entry;
  
  extern void do_machine_check(struct pt_regs *, long);
+
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
+enum mcp_flags {
+       MCP_TIMESTAMP = (1 << 0),       /* log time stamp */
+       MCP_UC = (1 << 1),              /* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
  extern int mce_notify_user(void);
  
  #endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
  #else
  #define mcheck_init(c) do { } while (0)
  #endif
-extern void stop_mce(void);
-extern void restart_mce(void);
+
+extern void (*mce_threshold_vector)(void);
  
  #endif /* __KERNEL__ */
  #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h

index 358acc59ae044d421196c3beb8abcc3c6e58efc0..2dbd2314139e2426c511fe1dd7bc8b3a29392c67 100644 (file)
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
  #define MSR_IA32_MC0_ADDR              0x00000402
  #define MSR_IA32_MC0_MISC              0x00000403
  
+/* These are consecutive and not in the normal 4-register MCE bank block */
+#define MSR_IA32_MC0_CTL2              0x00000280
+#define CMCI_EN                        (1ULL << 30)
+#define CMCI_THRESHOLD_MASK            0xffffULL
+
  #define MSR_P6_PERFCTR0                        0x000000c1
  #define MSR_P6_PERFCTR1                        0x000000c2
  #define MSR_P6_EVNTSEL0                        0x00000186
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c

index 6907b8e85d52c580083d47971e4fa9192259c7f9..4c80f15574335d6b02bc93a7ed797c088c1da6c2 100644 (file)
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
            that might execute the to be patched code.
            Other CPUs are not running. */
         stop_nmi();
-#ifdef CONFIG_X86_MCE
-       stop_mce();
-#endif
+
+       /*
+        * Don't stop machine check exceptions while patching.
+        * MCEs only happen when something got corrupted and in this
+        * case we must do something about the corruption.
+        * Ignoring it is worse than an unlikely patching race.
+        * Also machine checks tend to be broadcast and if one CPU
+        * goes into machine check the others follow quickly, so we don't
+        * expect a machine check to cause undue problems during code
+        * patching.
+        */
  
         apply_alternatives(__alt_instructions, __alt_instructions_end);
  
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
                                 (unsigned long)__smp_locks_end);
  
         restart_nmi();
-#ifdef CONFIG_X86_MCE
-       restart_mce();
-#endif
  }
  
  /**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c

index f9cecdfd05c5523cb6298a27a3d03706e92fb21a..30909a258d0fbc92165857629f4c492499907430 100644 (file)
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
  #include <asm/idle.h>
  #include <asm/mtrr.h>
  #include <asm/smp.h>
+#include <asm/mce.h>
  
  unsigned int num_processors;
  
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
                 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
         }
  #endif
+#ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 6) {
+               v = apic_read(APIC_LVTCMCI);
+               if (!(v & APIC_LVT_MASKED))
+                       apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+       }
+#endif
+
         /*
          * Clean APIC state for other OSs:
          */
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
         apic_write(APIC_LVT1, value);
  
         preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+       /* Recheck CMCI information after local APIC is up on CPU #0 */
+       if (smp_processor_id() == 0)
+               cmci_recheck();
+#endif
  }
  
  void __cpuinit end_local_APIC_setup(void)
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile

index d7d2323bbb6976ffa603246bd06845d280150371..b2f89829bbe824d2cecfd74d8c09b2923406493b 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32)            += k7.o p4.o p5.o p6.o winchip.o
  obj-$(CONFIG_X86_MCE_INTEL)    += mce_intel_64.o
  obj-$(CONFIG_X86_MCE_AMD)      += mce_amd_64.o
  obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c

index dfaebce3633e3f9b8415e79f29ad9f012276825a..3552119b091da51e65ce933c9852eee694920b9b 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
         }
  }
  
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-       old_cr4 = read_cr4();
-       clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-       if (old_cr4 & X86_CR4_MCE)
-               set_in_cr4(X86_CR4_MCE);
-}
-
  static int __init mcheck_disable(char *str)
  {
         mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c

index fe79985ce0f2f6fb4ae23f3bc92a791994ed9f62..bfbd5323a63538bebed231801cbe22f6c5acfd26 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
   * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   * Rest from unknown author(s).
   * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
   */
  
  #include <linux/init.h>
@@ -24,6 +26,9 @@
  #include <linux/ctype.h>
  #include <linux/kmod.h>
  #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
  #include <asm/processor.h>
  #include <asm/msr.h>
  #include <asm/mce.h>
@@ -32,7 +37,6 @@
  #include <asm/idle.h>
  
  #define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
  
  atomic_t mce_entry;
  
@@ -47,7 +51,7 @@ static int mce_dont_init;
   */
  static int tolerant = 1;
  static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
  static unsigned long notify_user;
  static int rip_msr;
  static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
  
  static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  
+/* MCA banks polled by the periodic polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+       [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+       memset(m, 0, sizeof(struct mce));
+       m->cpu = smp_processor_id();
+       rdtscll(m->tsc);
+}
+
  /*
   * Lockless MCE logging infrastructure.
   * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
                         print_symbol("{%s}", m->ip);
                 printk("\n");
         }
-       printk(KERN_EMERG "TSC %Lx ", m->tsc);
+       printk(KERN_EMERG "TSC %llx ", m->tsc);
         if (m->addr)
-               printk("ADDR %Lx ", m->addr);
+               printk("ADDR %llx ", m->addr);
         if (m->misc)
-               printk("MISC %Lx ", m->misc);
+               printk("MISC %llx ", m->misc);
         printk("\n");
         printk(KERN_EMERG "This is not a software problem!\n");
         printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
         panic(msg);
  }
  
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
  {
+       if (mce_dont_init)
+               return 0;
         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
  }
  
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
  }
  
  /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+       struct mce m;
+       int i;
+
+       mce_setup(&m);
+
+       rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+       for (i = 0; i < banks; i++) {
+               if (!bank[i] || !test_bit(i, *b))
+                       continue;
+
+               m.misc = 0;
+               m.addr = 0;
+               m.bank = i;
+               m.tsc = 0;
+
+               barrier();
+               rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+               if (!(m.status & MCI_STATUS_VAL))
+                       continue;
+
+               /*
+                * Uncorrected events are handled by the exception handler
+                * when it is enabled. But when the exception is disabled log
+                * everything.
+                *
+                * TBD do the same check for MCI_STATUS_EN here?
+                */
+               if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+                       continue;
+
+               if (m.status & MCI_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+               if (m.status & MCI_STATUS_ADDRV)
+                       rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+               if (!(flags & MCP_TIMESTAMP))
+                       m.tsc = 0;
+               /*
+                * Don't get the IP here because it's unlikely to
+                * have anything to do with the actual error location.
+                */
+
+               mce_log(&m);
+               add_taint(TAINT_MACHINE_CHECK);
+
+               /*
+                * Clear state for this bank.
+                */
+               wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       }
+
+       /*
+        * Don't clear MCG_STATUS here because it's only defined for
+        * exceptions.
+        */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
   */
  void do_machine_check(struct pt_regs * regs, long error_code)
  {
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
          * error.
          */
         int kill_it = 0;
+       DECLARE_BITMAP(toclear, MAX_NR_BANKS);
  
         atomic_inc(&mce_entry);
  
-       if ((regs
-            && notify_die(DIE_NMI, "machine check", regs, error_code,
+       if (notify_die(DIE_NMI, "machine check", regs, error_code,
                            18, SIGKILL) == NOTIFY_STOP)
-           || !banks)
+               goto out2;
+       if (!banks)
                 goto out2;
  
-       memset(&m, 0, sizeof(struct mce));
-       m.cpu = smp_processor_id();
+       mce_setup(&m);
+
         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
         /* if the restart IP is not valid, we're done for */
         if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
         barrier();
  
         for (i = 0; i < banks; i++) {
-               if (i < NR_SYSFS_BANKS && !bank[i])
+               __clear_bit(i, toclear);
+               if (!bank[i])
                         continue;
  
                 m.misc = 0;
                 m.addr = 0;
                 m.bank = i;
-               m.tsc = 0;
  
                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                 if ((m.status & MCI_STATUS_VAL) == 0)
                         continue;
  
+               /*
+                * Corrected errors are handled by machine_check_poll().
+                * Leave them alone.
+                */
+               if ((m.status & MCI_STATUS_UC) == 0)
+                       continue;
+
+               /*
+                * Set taint even when machine check was not enabled.
+                */
+               add_taint(TAINT_MACHINE_CHECK);
+
+               __set_bit(i, toclear);
+
                 if (m.status & MCI_STATUS_EN) {
                         /* if PCC was set, there's no way out */
                         no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                                         no_way_out = 1;
                                 kill_it = 1;
                         }
+               } else {
+                       /*
+                        * Machine check event was not enabled. Clear, but
+                        * ignore.
+                        */
+                       continue;
                 }
  
                 if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
  
                 mce_get_rip(&m, regs);
-               if (error_code >= 0)
-                       rdtscll(m.tsc);
-               if (error_code != -2)
-                       mce_log(&m);
+               mce_log(&m);
  
                 /* Did this bank cause the exception? */
                 /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                         panicm = m;
                         panicm_found = 1;
                 }
-
-               add_taint(TAINT_MACHINE_CHECK);
         }
  
-       /* Never do anything final in the polling timer */
-       if (!regs)
-               goto out;
-
         /* If we didn't find an uncorrectable error, pick
            the last one (shouldn't happen, just being safe). */
         if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
         /* notify userspace ASAP */
         set_thread_flag(TIF_MCE_NOTIFY);
  
- out:
         /* the last thing we do is clear state */
-       for (i = 0; i < banks; i++)
-               wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       for (i = 0; i < banks; i++) {
+               if (test_bit(i, toclear))
+                       wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       }
         wrmsrl(MSR_IA32_MCG_STATUS, 0);
   out2:
         atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
   * and historically has been the register value of the
   * MSR_IA32_THERMAL_STATUS (Intel) msr.
   */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
  {
         struct mce m;
  
-       memset(&m, 0, sizeof(m));
-       m.cpu = cpu;
+       mce_setup(&m);
         m.bank = MCE_THERMAL_BANK;
         m.status = status;
-       rdtscll(m.tsc);
         mce_log(&m);
  }
  #endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
  
  static int check_interval = 5 * 60; /* 5 minutes */
  static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
  
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
  {
-       if (mce_available(&current_cpu_data))
-               do_machine_check(NULL, 0);
-}
+       struct timer_list *t = &per_cpu(mce_timer, data);
  
-static void mcheck_timer(struct work_struct *work)
-{
-       on_each_cpu(mcheck_check_cpu, NULL, 1);
+       WARN_ON(smp_processor_id() != data);
+
+       if (mce_available(&current_cpu_data))
+               machine_check_poll(MCP_TIMESTAMP,
+                               &__get_cpu_var(mce_poll_banks));
  
         /*
          * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
                                 (int)round_jiffies_relative(check_interval*HZ));
         }
  
-       schedule_delayed_work(&mcheck_work, next_interval);
+       t->expires = jiffies + next_interval;
+       add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+       call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
  }
  
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
  /*
- * This is only called from process context.  This is where we do
- * anything we need to alert userspace about new MCEs.  This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
   */
  int mce_notify_user(void)
  {
+       /* Not more than two messages every minute */
+       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
         clear_thread_flag(TIF_MCE_NOTIFY);
         if (test_and_clear_bit(0, &notify_user)) {
-               static unsigned long last_print;
-               unsigned long now = jiffies;
-
                 wake_up_interruptible(&mce_wait);
-               if (trigger[0])
-                       call_usermodehelper(trigger, trigger_argv, NULL,
-                                               UMH_NO_WAIT);
  
-               if (time_after_eq(now, last_print + (check_interval*HZ))) {
-                       last_print = now;
+               /*
+                * There is no risk of missing notifications because
+                * work_pending is always cleared before the function is
+                * executed.
+                */
+               if (trigger[0] && !work_pending(&mce_trigger_work))
+                       schedule_work(&mce_trigger_work);
+
+               if (__ratelimit(&ratelimit))
                         printk(KERN_INFO "Machine check events logged\n");
-               }
  
                 return 1;
         }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
  
  static __init int periodic_mcheck_init(void)
  {
-       next_interval = check_interval * HZ;
-       if (next_interval)
-               schedule_delayed_work(&mcheck_work,
-                                     round_jiffies_relative(next_interval));
-       idle_notifier_register(&mce_idle_notifier);
-       return 0;
+       idle_notifier_register(&mce_idle_notifier);
+       return 0;
  }
  __initcall(periodic_mcheck_init);
  
-
  /*
   * Initialize Machine Checks for a CPU.
   */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
  {
         u64 cap;
-       int i;
+       unsigned b;
  
         rdmsrl(MSR_IA32_MCG_CAP, cap);
-       banks = cap & 0xff;
-       if (banks > MCE_EXTENDED_BANK) {
-               banks = MCE_EXTENDED_BANK;
-               printk(KERN_INFO "MCE: warning: using only %d banks\n",
-                      MCE_EXTENDED_BANK);
+       b = cap & 0xff;
+       if (b > MAX_NR_BANKS) {
+               printk(KERN_WARNING
+                      "MCE: Using only %u machine check banks out of %u\n",
+                       MAX_NR_BANKS, b);
+               b = MAX_NR_BANKS;
         }
+
+       /* Don't support asymmetric configurations today */
+       WARN_ON(banks != 0 && b != banks);
+       banks = b;
+       if (!bank) {
+               bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+               if (!bank)
+                       return -ENOMEM;
+               memset(bank, 0xff, banks * sizeof(u64));
+       }
+
         /* Use accurate RIP reporting if available. */
         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                 rip_msr = MSR_IA32_MCG_EIP;
  
-       /* Log the machine checks left over from the previous reset.
-          This also clears all registers */
-       do_machine_check(NULL, mce_bootlog ? -1 : -2);
+       return 0;
+}
+
+static void mce_init(void *dummy)
+{
+       u64 cap;
+       int i;
+       mce_banks_t all_banks;
+
+       /*
+        * Log the machine checks left over from the previous reset.
+        */
+       bitmap_fill(all_banks, MAX_NR_BANKS);
+       machine_check_poll(MCP_UC, &all_banks);
  
         set_in_cr4(X86_CR4_MCE);
  
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
         if (cap & MCG_CTL_P)
                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
  
         for (i = 0; i < banks; i++) {
-               if (i < NR_SYSFS_BANKS)
-                       wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-               else
-                       wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+               wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
         }
  }
  
  /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
  {
         /* This should be disabled by the BIOS, but isn't always */
         if (c->x86_vendor == X86_VENDOR_AMD) {
-               if(c->x86 == 15)
+               if (c->x86 == 15 && banks > 4)
                         /* disable GART TBL walk error reporting, which trips off
                            incorrectly with the IOMMU & 3ware & Cerberus. */
-                       clear_bit(10, &bank[4]);
+                       clear_bit(10, (unsigned long *)&bank[4]);
                 if(c->x86 <= 17 && mce_bootlog < 0)
                         /* Lots of broken BIOS around that don't clear them
                            by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
         }
  }
  
+static void mce_init_timer(void)
+{
+       struct timer_list *t = &__get_cpu_var(mce_timer);
+
+       /* data race harmless because everyone sets to the same value */
+       if (!next_interval)
+               next_interval = check_interval * HZ;
+       if (!next_interval)
+               return;
+       setup_timer(t, mcheck_timer, smp_processor_id());
+       t->expires = round_jiffies_relative(jiffies + next_interval);
+       add_timer(t);
+}
+
  /*
   * Called for each booted CPU to set up machine checks.
   * Must be called with preempt off.
   */
  void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
  {
-       mce_cpu_quirks(c);
+       if (!mce_available(c))
+               return;
  
-       if (mce_dont_init ||
-           !mce_available(c))
+       if (mce_cap_init() < 0) {
+               mce_dont_init = 1;
                 return;
+       }
+       mce_cpu_quirks(c);
  
         mce_init(NULL);
         mce_cpu_features(c);
+       mce_init_timer();
  }
  
  /*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
  {
         unsigned long *cpu_tsc;
         static DEFINE_MUTEX(mce_read_mutex);
-       unsigned next;
+       unsigned prev, next;
         char __user *buf = ubuf;
         int i, err;
  
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
         }
  
         err = 0;
-       for (i = 0; i < next; i++) {
-               unsigned long start = jiffies;
-
-               while (!mcelog.entry[i].finished) {
-                       if (time_after_eq(jiffies, start + 2)) {
-                               memset(mcelog.entry + i,0, sizeof(struct mce));
-                               goto timeout;
+       prev = 0;
+       do {
+               for (i = prev; i < next; i++) {
+                       unsigned long start = jiffies;
+
+                       while (!mcelog.entry[i].finished) {
+                               if (time_after_eq(jiffies, start + 2)) {
+                                       memset(mcelog.entry + i, 0,
+                                              sizeof(struct mce));
+                                       goto timeout;
+                               }
+                               cpu_relax();
                         }
-                       cpu_relax();
+                       smp_rmb();
+                       err |= copy_to_user(buf, mcelog.entry + i,
+                                           sizeof(struct mce));
+                       buf += sizeof(struct mce);
+timeout:
+                       ;
                 }
-               smp_rmb();
-               err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-               buf += sizeof(struct mce);
- timeout:
-               ;
-       }
  
-       memset(mcelog.entry, 0, next * sizeof(struct mce));
-       mcelog.next = 0;
+               memset(mcelog.entry + prev, 0,
+                      (next - prev) * sizeof(struct mce));
+               prev = next;
+               next = cmpxchg(&mcelog.next, prev, 0);
+       } while (next != prev);
  
         synchronize_sched();
  
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
         &mce_chrdev_ops,
  };
  
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-       old_cr4 = read_cr4();
-       clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-       if (old_cr4 & X86_CR4_MCE)
-               set_in_cr4(X86_CR4_MCE);
-}
-
  /*
   * Old style boot options parsing. Only for compatibility.
   */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
         return 1;
  }
  
-/* mce=off disables machine check. Note you can re-enable it later
-   using sysfs.
+/* mce=off disables machine check.
     mce=TOLERANCELEVEL (number, see above)
     mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
     mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
   * Sysfs support
   */
  
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+       int i;
+
+       for (i = 0; i < banks; i++)
+               wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+       return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+       return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+       return mce_disable();
+}
+
  /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
     Only one CPU is active at this time, the others get readded later using
     CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
         return 0;
  }
  
+static void mce_cpu_restart(void *data)
+{
+       del_timer_sync(&__get_cpu_var(mce_timer));
+       if (mce_available(&current_cpu_data))
+               mce_init(NULL);
+       mce_init_timer();
+}
+
  /* Reinit MCEs after user configuration changes */
  static void mce_restart(void)
  {
-       if (next_interval)
-               cancel_delayed_work(&mcheck_work);
-       /* Timer race is harmless here */
-       on_each_cpu(mce_init, NULL, 1);
         next_interval = check_interval * HZ;
-       if (next_interval)
-               schedule_delayed_work(&mcheck_work,
-                                     round_jiffies_relative(next_interval));
+       on_each_cpu(mce_cpu_restart, NULL, 1);
  }
  
  static struct sysdev_class mce_sysclass = {
+       .suspend = mce_suspend,
+       .shutdown = mce_shutdown,
         .resume = mce_resume,
         .name = "machinecheck",
  };
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
         }                                                               \
         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
  
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+/*
+ * sysfs "show" handler shared by all bankN attributes.
+ *
+ * The bank number is recovered from the position of `attr` inside the
+ * bank_attrs array (attr - bank_attrs). The matching bank[] value is
+ * written to `buf` as hexadecimal followed by a newline.
+ *
+ * Returns the sprintf() result, i.e. the number of characters written.
+ * `s` is unused.
+ */
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+                        char *buf)
+{
+       u64 b = bank[attr - bank_attrs];
+       return sprintf(buf, "%llx\n", b);
+}
+
+/*
+ * sysfs "store" handler shared by all bankN attributes.
+ *
+ * Parses `buf` with simple_strtoull() (base argument 0), stores the value
+ * in the bank[] slot selected by the position of `attr` inside bank_attrs,
+ * and calls mce_restart() so the new setting is applied.
+ *
+ * Returns -EINVAL when no characters could be parsed, otherwise the
+ * number of characters consumed by the parse. `s` and `siz` are unused.
+ */
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+                       const char *buf, size_t siz)
+{
+       char *end;
+       u64 new = simple_strtoull(buf, &end, 0);
+       if (end == buf)
+               return -EINVAL;
+       bank[attr - bank_attrs] = new;
+       mce_restart();
+       return end-buf;
+}
  
  static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                                 char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
  static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
  ACCESSOR(check_interval,check_interval,mce_restart())
  static struct sysdev_attribute *mce_attributes[] = {
-       &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-       &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
         &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
         NULL
  };
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
                 if (err)
                         goto error;
         }
+       for (i = 0; i < banks; i++) {
+               err = sysdev_create_file(&per_cpu(device_mce, cpu),
+                                       &bank_attrs[i]);
+               if (err)
+                       goto error2;
+       }
         cpu_set(cpu, mce_device_initialized);
  
         return 0;
+error2:
+       while (--i >= 0) {
+               sysdev_remove_file(&per_cpu(device_mce, cpu),
+                                       &bank_attrs[i]);
+       }
  error:
-       while (i--) {
+       while (--i >= 0) {
                 sysdev_remove_file(&per_cpu(device_mce,cpu),
                                    mce_attributes[i]);
         }
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
         for (i = 0; mce_attributes[i]; i++)
                 sysdev_remove_file(&per_cpu(device_mce,cpu),
                         mce_attributes[i]);
+       for (i = 0; i < banks; i++)
+               sysdev_remove_file(&per_cpu(device_mce, cpu),
+                       &bank_attrs[i]);
         sysdev_unregister(&per_cpu(device_mce,cpu));
         cpu_clear(cpu, mce_device_initialized);
  }
  
+/*
+ * Make sure there are no machine checks on offlined CPUs.
+ *
+ * `h` points to the unsigned long hotplug action that triggered the call.
+ * Does nothing when machine checks are not available on this CPU.
+ * Otherwise calls cmci_clear() unless the action carries CPU_TASKS_FROZEN,
+ * and then writes 0 to the control MSR of every bank
+ * (MSR_IA32_MC0_CTL + i*4).
+ */
+static void mce_disable_cpu(void *h)
+{
+       int i;
+       unsigned long action = *(unsigned long *)h;
+
+       if (!mce_available(&current_cpu_data))
+               return;
+       if (!(action & CPU_TASKS_FROZEN))
+               cmci_clear();
+       for (i = 0; i < banks; i++)
+               wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+/*
+ * Counterpart of mce_disable_cpu(): turn machine checks back on for this
+ * CPU.
+ *
+ * `h` points to the unsigned long hotplug action that triggered the call.
+ * Does nothing when machine checks are not available on this CPU.
+ * Otherwise calls cmci_reenable() unless the action carries
+ * CPU_TASKS_FROZEN, and then restores every bank's control MSR
+ * (MSR_IA32_MC0_CTL + i*4) from the configured bank[i] value.
+ */
+static void mce_reenable_cpu(void *h)
+{
+       int i;
+       unsigned long action = *(unsigned long *)h;
+
+       if (!mce_available(&current_cpu_data))
+               return;
+       if (!(action & CPU_TASKS_FROZEN))
+               cmci_reenable();
+       for (i = 0; i < banks; i++)
+               wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
  static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
                                       unsigned long action, void *hcpu)
  {
         unsigned int cpu = (unsigned long)hcpu;
+       struct timer_list *t = &per_cpu(mce_timer, cpu);
  
         switch (action) {
         case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
                         threshold_cpu_callback(action, cpu);
                 mce_remove_device(cpu);
                 break;
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               del_timer_sync(t);
+               smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+               break;
+       case CPU_DOWN_FAILED:
+       case CPU_DOWN_FAILED_FROZEN:
+               t->expires = round_jiffies_relative(jiffies + next_interval);
+               add_timer_on(t, cpu);
+               smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+               break;
+       case CPU_POST_DEAD:
+               /* intentionally ignoring frozen here */
+               cmci_rediscover(cpu);
+               break;
         }
         return NOTIFY_OK;
  }
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
         .notifier_call = mce_cpu_callback,
  };
  
+/*
+ * Build the bank_attrs array: one sysdev attribute per bank, named
+ * "bank<N>", mode 0644, served by show_bank()/set_bank().
+ *
+ * Returns 0 on success. Returns -ENOMEM when either the array or one of
+ * the attribute names cannot be allocated; in the latter case everything
+ * allocated so far is freed again and bank_attrs is reset to NULL.
+ */
+static __init int mce_init_banks(void)
+{
+       int i;
+
+       bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+                               GFP_KERNEL);
+       if (!bank_attrs)
+               return -ENOMEM;
+
+       for (i = 0; i < banks; i++) {
+               struct sysdev_attribute *a = &bank_attrs[i];
+               a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+               if (!a->attr.name)
+                       goto nomem;
+               a->attr.mode = 0644;
+               a->show = show_bank;
+               a->store = set_bank;
+       }
+       return 0;
+
+nomem:
+       /* Entry i has no name yet; free only the names of entries below it. */
+       while (--i >= 0)
+               kfree(bank_attrs[i].attr.name);
+       kfree(bank_attrs);
+       bank_attrs = NULL;
+       return -ENOMEM;
+}
+
  static __init int mce_init_device(void)
  {
         int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
  
         if (!mce_available(&boot_cpu_data))
                 return -EIO;
+
+       err = mce_init_banks();
+       if (err)
+               return err;
+
         err = sysdev_class_register(&mce_sysclass);
         if (err)
                 return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c

index 9817506dd4698c6578646f04100836dd6ad435c8..c5a32f92d07ecc41b55b290f16fb7a0889c07c41 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
  
  static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
  
+static void amd_threshold_interrupt(void);
+
  /*
   * CPU Initialization
   */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                         tr.reset = 0;
                         tr.old_limit = 0;
                         threshold_restart_bank(&tr);
+
+                       mce_threshold_vector = amd_threshold_interrupt;
                 }
         }
  }
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
   * the interrupt goes off when error_count reaches threshold_limit.
   * the handler will simply log mcelog w/ software defined bank number.
   */
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
  {
         unsigned int bank, block;
         struct mce m;
         u32 low = 0, high = 0, address = 0;
  
-       ack_APIC_irq();
-       exit_idle();
-       irq_enter();
-
-       memset(&m, 0, sizeof(m));
-       rdtscll(m.tsc);
-       m.cpu = smp_processor_id();
+       mce_setup(&m);
  
         /* assume first bank caused it */
         for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
  
                         /* Log the machine check that caused the threshold
                            event. */
-                       do_machine_check(NULL, 0);
+                       machine_check_poll(MCP_TIMESTAMP,
+                                       &__get_cpu_var(mce_poll_banks));
  
                         if (high & MASK_OVERFLOW_HI) {
                                 rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
                                        + bank * NR_BLOCKS
                                        + block;
                                 mce_log(&m);
-                               goto out;
+                               return;
                         }
                 }
         }
-out:
-       inc_irq_stat(irq_threshold_count);
-       irq_exit();
  }
  
  /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c

index aa5e287c98e01334565a241d7b89e768c403985f..aaa7d97309387d5e99aa63cfa5e7b9e6ea3131e1 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
  /*
   * Intel specific MCE features.
   * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
   */
  
  #include <linux/init.h>
@@ -13,6 +15,7 @@
  #include <asm/hw_irq.h>
  #include <asm/idle.h>
  #include <asm/therm_throt.h>
+#include <asm/apic.h>
  
  asmlinkage void smp_thermal_interrupt(void)
  {
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
  
         rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
         if (therm_throt_process(msr_val & 1))
-               mce_log_therm_throt_event(smp_processor_id(), msr_val);
+               mce_log_therm_throt_event(msr_val);
  
         inc_irq_stat(irq_thermal_count);
         irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
         return;
  }
  
+/*
+ * Support for Intel Corrected Machine Check Interrupts (CMCI). This allows
+ * the CPU to raise an interrupt when a corrected machine check has happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+/*
+ * Check whether CMCI can be used on this CPU.
+ *
+ * Returns 0 without touching *banks when the boot CPU is not an Intel
+ * part, when there is no APIC, or when lapic_get_maxlvt() is below 6.
+ * Otherwise reads MSR_IA32_MCG_CAP, stores the bank count (low 8 bits,
+ * capped at MAX_NR_BANKS) in *banks and returns 1 if MCG_CMCI_P is set,
+ * 0 if it is not.
+ */
+static int cmci_supported(int *banks)
+{
+       u64 cap;
+
+       /*
+        * Vendor check is not strictly needed, but the initial
+        * initialization is vendor keyed and this
+        * makes sure none of the backdoors are entered otherwise.
+        */
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return 0;
+       if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+               return 0;
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
+       *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+       return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ *
+ * Only the banks recorded in this CPU's mce_banks_owned set are polled
+ * (with MCP_TIMESTAMP); afterwards mce_notify_user() is called.
+ * intel_init_cmci() installs this function as mce_threshold_vector.
+ */
+static void intel_threshold_interrupt(void)
+{
+       machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+       mce_notify_user();
+}
+
+/*
+ * Append one " <type>:<num>" entry to the per-CPU bank report line.
+ *
+ * The "CPU <n> MCA banks" header is printed only while *hdr is still 0;
+ * *hdr is then set to 1 so later calls only append entries. The caller is
+ * expected to terminate the line itself.
+ */
+static void print_update(char *type, int *hdr, int num)
+{
+       if (*hdr == 0)
+               printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+       *hdr = 1;
+       printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ *
+ * `banks` is the number of banks to probe. A nonzero `boot` forces a
+ * report line entry for every bank examined, not only for banks whose
+ * ownership changes.
+ *
+ * Banks already marked in this CPU's mce_banks_owned set are skipped.
+ * For each remaining bank, the whole probe runs under cmci_discover_lock:
+ *  - CMCI_EN already set in its CTL2 MSR: reported as "SHD" and removed
+ *    from this CPU's mce_poll_banks set.
+ *  - otherwise CMCI_EN and CMCI_THRESHOLD are written and read back; if
+ *    the enable bit sticks the bank is marked owned, reported as "CMCI"
+ *    and removed from mce_poll_banks.
+ *  - if the bit does not stick, only a WARN_ON checks that the bank is
+ *    still in mce_poll_banks.
+ */
+static void cmci_discover(int banks, int boot)
+{
+       unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+       int hdr = 0;
+       int i;
+
+       spin_lock(&cmci_discover_lock);
+       for (i = 0; i < banks; i++) {
+               u64 val;
+
+               if (test_bit(i, owned))
+                       continue;
+
+               rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+               /*
+                * Already owned by someone else?
+                * NOTE(review): the owned bit was found clear just above,
+                * so test_and_clear_bit() looks like it can only return 0
+                * here and the message depends on `boot` alone -- confirm.
+                */
+               if (val & CMCI_EN) {
+                       if (test_and_clear_bit(i, owned) || boot)
+                               print_update("SHD", &hdr, i);
+                       __clear_bit(i, __get_cpu_var(mce_poll_banks));
+                       continue;
+               }
+
+               val |= CMCI_EN | CMCI_THRESHOLD;
+               wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+               rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+               /* Did the enable bit stick? -- the bank supports CMCI */
+               if (val & CMCI_EN) {
+                       if (!test_and_set_bit(i, owned) || boot)
+                               print_update("CMCI", &hdr, i);
+                       __clear_bit(i, __get_cpu_var(mce_poll_banks));
+               } else {
+                       WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+               }
+       }
+       spin_unlock(&cmci_discover_lock);
+       /* Terminate the report line started by print_update(), if any. */
+       if (hdr)
+               printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ *
+ * Returns without doing anything when machine checks are not available on
+ * this CPU or CMCI is not supported. Otherwise polls the banks in this
+ * CPU's mce_banks_owned set with local interrupts disabled.
+ */
+void cmci_recheck(void)
+{
+       unsigned long flags;
+       int banks;
+
+       if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+               return;
+       local_irq_save(flags);
+       machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+       local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ *
+ * Returns without doing anything when CMCI is not supported. Otherwise,
+ * under cmci_discover_lock, clears CMCI_EN and the CMCI_THRESHOLD_MASK
+ * bits in the CTL2 MSR of every bank set in this CPU's mce_banks_owned
+ * and drops the bank from that set.
+ */
+void cmci_clear(void)
+{
+       int i;
+       int banks;
+       u64 val;
+
+       if (!cmci_supported(&banks))
+               return;
+       spin_lock(&cmci_discover_lock);
+       for (i = 0; i < banks; i++) {
+               if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+                       continue;
+               /* Disable CMCI */
+               rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+               val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+               wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+               __clear_bit(i, __get_cpu_var(mce_banks_owned));
+       }
+       spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down, cycle through all the other online CPUs and
+ * rerun bank discovery on each of them.
+ * Must run in process context.
+ *
+ * `dying` is the number of the CPU that went away; it is skipped.
+ *
+ * The current task's allowed-CPU mask is saved, the task is moved to each
+ * remaining online CPU in turn to run cmci_discover() there, and the
+ * saved mask is restored at the end. CPUs the task cannot be moved to
+ * are skipped. Returns silently when CMCI is not supported or the
+ * temporary cpumask cannot be allocated.
+ */
+void cmci_rediscover(int dying)
+{
+       int banks;
+       int cpu;
+       cpumask_var_t old;
+
+       if (!cmci_supported(&banks))
+               return;
+       if (!alloc_cpumask_var(&old, GFP_KERNEL))
+               return;
+       cpumask_copy(old, &current->cpus_allowed);
+
+       for_each_online_cpu (cpu) {
+               if (cpu == dying)
+                       continue;
+               if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+                       continue;
+               /* Recheck banks in case CPUs don't all have the same */
+               if (cmci_supported(&banks))
+                       cmci_discover(banks, 0);
+       }
+
+       set_cpus_allowed_ptr(current, old);
+       free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ *
+ * Reruns cmci_discover() in non-boot mode when CMCI is supported;
+ * does nothing otherwise.
+ */
+void cmci_reenable(void)
+{
+       int banks;
+       if (cmci_supported(&banks))
+               cmci_discover(banks, 0);
+}
+
+/*
+ * Set up CMCI on this CPU; does nothing when CMCI is not supported.
+ *
+ * Installs intel_threshold_interrupt() as mce_threshold_vector, runs bank
+ * discovery in boot mode, programs the APIC_LVTCMCI entry with
+ * THRESHOLD_APIC_VECTOR in fixed delivery mode, and finally polls the
+ * owned banks once via cmci_recheck().
+ */
+static __cpuinit void intel_init_cmci(void)
+{
+       int banks;
+
+       if (!cmci_supported(&banks))
+               return;
+
+       mce_threshold_vector = intel_threshold_interrupt;
+       cmci_discover(banks, 1);
+       /*
+        * For CPU #0 this runs with still disabled APIC, but that's
+        * ok because only the vector is set up. We still do another
+        * check for the banks later for CPU #0 just to make sure
+        * to not miss any events.
+        */
+       apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+       cmci_recheck();
+}
+
  void mce_intel_feature_init(struct cpuinfo_x86 *c)
  {
         intel_init_thermal(c);
+       intel_init_cmci();
  }
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c

new file mode 100644 (file)

index 0000000..23ee9e7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+
+/*
+ * Fallback handler: the initial value of mce_threshold_vector. It only
+ * logs that a threshold interrupt arrived although no vendor-specific
+ * handler has been installed.
+ */
+static void default_threshold_interrupt(void)
+{
+       printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+                        THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+/*
+ * Common threshold interrupt entry (presumably reached through
+ * THRESHOLD_APIC_VECTOR -- the vector setup is not part of this file).
+ *
+ * Leaves idle, enters interrupt context, counts the event in
+ * irq_threshold_count and dispatches to whatever handler is currently
+ * installed in mce_threshold_vector. The local APIC is acknowledged only
+ * after irq_exit().
+ */
+asmlinkage void mce_threshold_interrupt(void)
+{
+       exit_idle();
+       irq_enter();
+       inc_irq_stat(irq_threshold_count);
+       mce_threshold_vector();
+       irq_exit();
+       /* Ack only at the end to avoid potential reentry */
+       ack_APIC_irq();
+}
author	Ingo Molnar <mingo@elte.hu>
	Thu, 5 Mar 2009 20:49:25 +0000 (21:49 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Thu, 5 Mar 2009 20:49:25 +0000 (21:49 +0100)
arch/x86/Kconfig		patch \| blob \| history
arch/x86/include/asm/apicdef.h		patch \| blob \| history
arch/x86/include/asm/mce.h		patch \| blob \| history
arch/x86/include/asm/msr-index.h		patch \| blob \| history
arch/x86/kernel/alternative.c		patch \| blob \| history
arch/x86/kernel/apic/apic.c		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/Makefile		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/mce_32.c		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/mce_64.c		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/mce_amd_64.c		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/mce_intel_64.c		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/threshold.c	[new file with mode: 0644]	patch \| blob