From: Mike Travis <travis@sgi.com>
Date: Thu, 1 Jan 2009 01:34:16 +0000 (-0800)
Subject: Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux... 
X-Git-Tag: v2.6.29-rc1~521^2~11
X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=7eb19553369c46cc1fa64caf120cbcab1b597f7c;hp=-c;p=linux-2.6-omap-h63xx.git

Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-cpumask into merge-rr-cpumask

Conflicts:
	arch/x86/kernel/io_apic.c
	kernel/rcuclassic.c
	kernel/sched.c
	kernel/time/tick-sched.c

Signed-off-by: Mike Travis <travis@sgi.com>
[ mingo@elte.hu: backmerged typo fix for io_apic.c ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

7eb19553369c46cc1fa64caf120cbcab1b597f7c
diff --combined arch/ia64/include/asm/topology.h
index a3cc9f65f95,97ae7f50910..76a33a91ca6
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@@ -34,6 -34,7 +34,7 @@@
   * Returns a bitmask of CPUs on Node 'node'.
   */
  #define node_to_cpumask(node) (node_to_cpu_mask[node])
+ #define cpumask_of_node(node) (&node_to_cpu_mask[node])
  
  /*
   * Returns the number of the node containing Node 'nid'.
@@@ -45,7 -46,7 +46,7 @@@
  /*
   * Returns the number of the first CPU on Node 'node'.
   */
- #define node_to_first_cpu(node) (first_cpu(node_to_cpumask(node)))
+ #define node_to_first_cpu(node) (cpumask_first(cpumask_of_node(node)))
  
  /*
   * Determines the node for a given pci bus
@@@ -55,6 -56,7 +56,6 @@@
  void build_cpu_to_node_map(void);
  
  #define SD_CPU_INIT (struct sched_domain) {		\
 -	.span			= CPU_MASK_NONE,	\
  	.parent			= NULL,			\
  	.child			= NULL,			\
  	.groups			= NULL,			\
@@@ -79,6 -81,7 +80,6 @@@
  
  /* sched_domains SD_NODE_INIT for IA64 NUMA machines */
  #define SD_NODE_INIT (struct sched_domain) {		\
 -	.span			= CPU_MASK_NONE,	\
  	.parent			= NULL,			\
  	.child			= NULL,			\
  	.groups			= NULL,			\
@@@ -109,6 -112,8 +110,8 @@@
  #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
  #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
  #define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
+ #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
+ #define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
  #define smt_capable() 				(smp_num_siblings > 1)
  #endif
  
@@@ -119,6 -124,10 +122,10 @@@ extern void arch_fix_phys_package_id(in
  					node_to_cpumask(pcibus_to_node(bus)) \
  				)
  
+ #define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
+ 				 cpu_all_mask :				\
+ 				 cpumask_of_node(pcibus_to_node(bus)))
+ 
  #include <asm-generic/topology.h>
  
  #endif /* _ASM_IA64_TOPOLOGY_H */
diff --combined arch/mips/include/asm/mach-ip27/topology.h
index 1fb959f9898,c1c3f5b2f18..55d481569a1
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@@ -25,11 -25,13 +25,13 @@@ extern struct cpuinfo_ip27 sn_cpu_info[
  #define cpu_to_node(cpu)	(sn_cpu_info[(cpu)].p_nodeid)
  #define parent_node(node)	(node)
  #define node_to_cpumask(node)	(hub_data(node)->h_cpus)
- #define node_to_first_cpu(node)	(first_cpu(node_to_cpumask(node)))
+ #define cpumask_of_node(node)	(&hub_data(node)->h_cpus)
+ #define node_to_first_cpu(node)	(cpumask_first(cpumask_of_node(node)))
  struct pci_bus;
  extern int pcibus_to_node(struct pci_bus *);
  
  #define pcibus_to_cpumask(bus)	(cpu_online_map)
+ #define cpumask_of_pcibus(bus)	(cpu_online_mask)
  
  extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
  
@@@ -37,6 -39,7 +39,6 @@@
  
  /* sched_domains SD_NODE_INIT for SGI IP27 machines */
  #define SD_NODE_INIT (struct sched_domain) {		\
 -	.span			= CPU_MASK_NONE,	\
  	.parent			= NULL,			\
  	.child			= NULL,			\
  	.groups			= NULL,			\
diff --combined arch/powerpc/include/asm/topology.h
index 373fca394a5,236dae1cd29..375258559ae
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@@ -22,11 -22,11 +22,11 @@@ static inline cpumask_t node_to_cpumask
  	return numa_cpumask_lookup_table[node];
  }
  
+ #define cpumask_of_node(node) (&numa_cpumask_lookup_table[node])
+ 
  static inline int node_to_first_cpu(int node)
  {
- 	cpumask_t tmp;
- 	tmp = node_to_cpumask(node);
- 	return first_cpu(tmp);
+ 	return cpumask_first(cpumask_of_node(node));
  }
  
  int of_node_to_nid(struct device_node *device);
@@@ -46,8 -46,13 +46,12 @@@ static inline int pcibus_to_node(struc
  					node_to_cpumask(pcibus_to_node(bus)) \
  				)
  
+ #define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
+ 				 cpu_all_mask :				\
+ 				 cpumask_of_node(pcibus_to_node(bus)))
+ 
  /* sched_domains SD_NODE_INIT for PPC64 machines */
  #define SD_NODE_INIT (struct sched_domain) {		\
 -	.span			= CPU_MASK_NONE,	\
  	.parent			= NULL,			\
  	.child			= NULL,			\
  	.groups			= NULL,			\
@@@ -108,6 -113,8 +112,8 @@@ static inline void sysfs_remove_device_
  
  #define topology_thread_siblings(cpu)	(per_cpu(cpu_sibling_map, cpu))
  #define topology_core_siblings(cpu)	(per_cpu(cpu_core_map, cpu))
+ #define topology_thread_cpumask(cpu)	(&per_cpu(cpu_sibling_map, cpu))
+ #define topology_core_cpumask(cpu)	(&per_cpu(cpu_core_map, cpu))
  #define topology_core_id(cpu)		(cpu_to_core_id(cpu))
  #endif
  #endif
diff --combined arch/sh/include/asm/topology.h
index 279d9cc4a00,9aa160d0efe..066f0fba590
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@@ -5,6 -5,7 +5,6 @@@
  
  /* sched_domains SD_NODE_INIT for sh machines */
  #define SD_NODE_INIT (struct sched_domain) {		\
 -	.span			= CPU_MASK_NONE,	\
  	.parent			= NULL,			\
  	.child			= NULL,			\
  	.groups			= NULL,			\
@@@ -32,6 -33,7 +32,7 @@@
  #define parent_node(node)	((void)(node),0)
  
  #define node_to_cpumask(node)	((void)node, cpu_online_map)
+ #define cpumask_of_node(node)	((void)node, cpu_online_mask)
  #define node_to_first_cpu(node)	((void)(node),0)
  
  #define pcibus_to_node(bus)	((void)(bus), -1)
diff --combined arch/x86/Kconfig
index 0ca2eb7573c,0f44add3e0b..249d1e0824b
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -19,6 -19,8 +19,8 @@@ config X86_6
  config X86
  	def_bool y
  	select HAVE_AOUT if X86_32
+ 	select HAVE_READQ
+ 	select HAVE_WRITEQ
  	select HAVE_UNSTABLE_SCHED_CLOCK
  	select HAVE_IDE
  	select HAVE_OPROFILE
@@@ -90,6 -92,10 +92,10 @@@ config GENERIC_IOMA
  config GENERIC_BUG
  	def_bool y
  	depends on BUG
+ 	select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+ 
+ config GENERIC_BUG_RELATIVE_POINTERS
+ 	bool
  
  config GENERIC_HWEIGHT
  	def_bool y
@@@ -244,16 -250,19 +250,19 @@@ config X86_HAS_BOOT_CPU_I
  config SPARSE_IRQ
  	bool "Support sparse irq numbering"
  	depends on PCI_MSI || HT_IRQ
- 	default y
  	help
- 	  This enables support for sparse irq, esp for msi/msi-x. You may need
- 	  if you have lots of cards supports msi-x installed.
+ 	  This enables support for sparse irqs. This is useful for distro
+ 	  kernels that want to define a high CONFIG_NR_CPUS value but still
+ 	  want to have low kernel memory footprint on smaller machines.
  
- 	  If you don't know what to do here, say Y.
+ 	  ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+ 	    out the irq_desc[] array in a more NUMA-friendly way. )
+ 
+ 	  If you don't know what to do here, say N.
  
  config NUMA_MIGRATE_IRQ_DESC
  	bool "Move irq desc when changing irq smp_affinity"
- 	depends on SPARSE_IRQ && SMP
+ 	depends on SPARSE_IRQ && NUMA
  	default n
  	help
  	  This enables moving irq_desc to cpu/node that irq will use handled.
@@@ -264,21 -273,13 +273,13 @@@ config X86_FIND_SMP_CONFI
  	def_bool y
  	depends on X86_MPPARSE || X86_VOYAGER
  
- if ACPI
  config X86_MPPARSE
- 	def_bool y
- 	bool "Enable MPS table"
+ 	bool "Enable MPS table" if ACPI
+ 	default y
  	depends on X86_LOCAL_APIC
  	help
  	  For old smp systems that do not have proper acpi support. Newer systems
  	  (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
- endif
- 
- if !ACPI
- config X86_MPPARSE
- 	def_bool y
- 	depends on X86_LOCAL_APIC
- endif
  
  choice
  	prompt "Subarchitecture Type"
@@@ -500,7 -501,7 +501,7 @@@ config HPET_TIME
           The HPET provides a stable time base on SMP
           systems, unlike the TSC, but it is more expensive to access,
           as it is off-chip.  You can find the HPET spec at
-          <http://www.intel.com/hardwaredesign/hpetspec.htm>.
+          <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
  
           You can safely choose Y here.  However, HPET will only be
           activated if the platform and the BIOS support this feature.
@@@ -587,7 -588,7 +588,7 @@@ config AMD_IOMM
  
  # need this always selected by IOMMU for the VIA workaround
  config SWIOTLB
- 	bool
+ 	def_bool y if X86_64
  	help
  	  Support for software bounce buffers used on x86-64 systems
  	  which don't have a hardware IOMMU (e.g. the current generation
@@@ -600,20 -601,19 +601,20 @@@ config IOMMU_HELPE
  
  config MAXSMP
  	bool "Configure Maximum number of SMP Processors and NUMA Nodes"
 -	depends on X86_64 && SMP && BROKEN
 +	depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
 +	select CPUMASK_OFFSTACK
  	default n
  	help
  	  Configure maximum number of CPUS and NUMA Nodes for this architecture.
  	  If unsure, say N.
  
  config NR_CPUS
 -	int "Maximum number of CPUs (2-512)" if !MAXSMP
 -	range 2 512
 -	depends on SMP
 +	int "Maximum number of CPUs" if SMP && !MAXSMP
 +	range 2 512 if SMP && !MAXSMP
 +	default "1" if !SMP
  	default "4096" if MAXSMP
 -	default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
 -	default "8"
 +	default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
 +	default "8" if SMP
  	help
  	  This allows you to specify the maximum number of CPUs which this
  	  kernel will support.  The maximum supported value is 512 and the
@@@ -679,6 -679,30 +680,30 @@@ config X86_VISWS_API
  	def_bool y
  	depends on X86_32 && X86_VISWS
  
+ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ 	bool "Reroute for broken boot IRQs"
+ 	default n
+ 	depends on X86_IO_APIC
+ 	help
+ 	  This option enables a workaround that fixes a source of
+ 	  spurious interrupts. This is recommended when threaded
+ 	  interrupt handling is used on systems where the generation of
+ 	  superfluous "boot interrupts" cannot be disabled.
+ 
+ 	  Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
+ 	  entry in the chipset's IO-APIC is masked (as, e.g. the RT
+ 	  kernel does during interrupt handling). On chipsets where this
+ 	  boot IRQ generation cannot be disabled, this workaround keeps
+ 	  the original IRQ line masked so that only the equivalent "boot
+ 	  IRQ" is delivered to the CPUs. The workaround also tells the
+ 	  kernel to set up the IRQ handler on the boot IRQ line. In this
+ 	  way only one interrupt is delivered to the kernel. Otherwise
+ 	  the spurious second interrupt may cause the kernel to bring
+ 	  down (vital) interrupt lines.
+ 
+ 	  Only affects "broken" chipsets. Interrupt sharing may be
+ 	  increased on these systems.
+ 
  config X86_MCE
  	bool "Machine Check Exception"
  	depends on !X86_VOYAGER
@@@ -975,24 -999,37 +1000,37 @@@ config X86_PA
  config ARCH_PHYS_ADDR_T_64BIT
         def_bool X86_64 || X86_PAE
  
+ config DIRECT_GBPAGES
+ 	bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+ 	default y
+ 	depends on X86_64
+ 	help
+ 	  Allow the kernel linear mapping to use 1GB pages on CPUs that
+ 	  support it. This can improve the kernel's performance a tiny bit by
+ 	  reducing TLB pressure. If in doubt, say "Y".
+ 
  # Common NUMA Features
  config NUMA
- 	bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
+ 	bool "Numa Memory Allocation and Scheduler Support"
  	depends on SMP
  	depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
  	default n if X86_PC
  	default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
  	help
  	  Enable NUMA (Non Uniform Memory Access) support.
+ 
  	  The kernel will try to allocate memory used by a CPU on the
  	  local memory controller of the CPU and add some more
  	  NUMA awareness to the kernel.
  
- 	  For 32-bit this is currently highly experimental and should be only
- 	  used for kernel development. It might also cause boot failures.
- 	  For 64-bit this is recommended on all multiprocessor Opteron systems.
- 	  If the system is EM64T, you should say N unless your system is
- 	  EM64T NUMA.
+ 	  For 64-bit this is recommended if the system is Intel Core i7
+ 	  (or later), AMD Opteron, or EM64T NUMA.
+ 
+ 	  For 32-bit this is only needed on (rare) 32-bit-only platforms
+ 	  that support NUMA topologies, such as NUMAQ / Summit, or if you
+ 	  boot a 32-bit kernel on a 64-bit NUMA platform.
+ 
+ 	  Otherwise, you should say N.
  
  comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
  	depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
@@@ -1512,6 -1549,10 +1550,10 @@@ config ARCH_ENABLE_MEMORY_HOTPLU
  	def_bool y
  	depends on X86_64 || (X86_32 && HIGHMEM)
  
+ config ARCH_ENABLE_MEMORY_HOTREMOVE
+ 	def_bool y
+ 	depends on MEMORY_HOTPLUG
+ 
  config HAVE_ARCH_EARLY_PFN_TO_NID
  	def_bool X86_64
  	depends on NUMA
diff --combined arch/x86/include/asm/irq.h
index 4bb732e45a8,28e409fc73f..592688ed04d
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@@ -31,13 -31,9 +31,9 @@@ static inline int irq_canonicalize(int 
  # endif
  #endif
  
- #ifdef CONFIG_IRQBALANCE
- extern int irqbalance_disable(char *str);
- #endif
- 
  #ifdef CONFIG_HOTPLUG_CPU
  #include <linux/cpumask.h>
 -extern void fixup_irqs(cpumask_t map);
 +extern void fixup_irqs(void);
  #endif
  
  extern unsigned int do_IRQ(struct pt_regs *regs);
@@@ -46,6 -42,5 +42,6 @@@ extern void native_init_IRQ(void)
  
  /* Interrupt vector management */
  extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
 +extern int vector_used_by_percpu_irq(unsigned int vector);
  
  #endif /* _ASM_X86_IRQ_H */
diff --combined arch/x86/include/asm/topology.h
index 79e31e9dcdd,168203c0c31..4e2f2e0aab2
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@@ -61,13 -61,19 +61,19 @@@ static inline int cpu_to_node(int cpu
   *
   * Side note: this function creates the returned cpumask on the stack
   * so with a high NR_CPUS count, excessive stack space is used.  The
-  * node_to_cpumask_ptr function should be used whenever possible.
+  * cpumask_of_node function should be used whenever possible.
   */
  static inline cpumask_t node_to_cpumask(int node)
  {
  	return node_to_cpumask_map[node];
  }
  
+ /* Returns a bitmask of CPUs on Node 'node'. */
+ static inline const struct cpumask *cpumask_of_node(int node)
+ {
+ 	return &node_to_cpumask_map[node];
+ }
+ 
  #else /* CONFIG_X86_64 */
  
  /* Mappings between node number and cpus on that node. */
@@@ -82,7 -88,7 +88,7 @@@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_n
  #ifdef CONFIG_DEBUG_PER_CPU_MAPS
  extern int cpu_to_node(int cpu);
  extern int early_cpu_to_node(int cpu);
- extern const cpumask_t *_node_to_cpumask_ptr(int node);
+ extern const cpumask_t *cpumask_of_node(int node);
  extern cpumask_t node_to_cpumask(int node);
  
  #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
@@@ -103,7 -109,7 +109,7 @@@ static inline int early_cpu_to_node(in
  }
  
  /* Returns a pointer to the cpumask of CPUs on Node 'node'. */
- static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+ static inline const cpumask_t *cpumask_of_node(int node)
  {
  	return &node_to_cpumask_map[node];
  }
@@@ -116,12 -122,15 +122,15 @@@ static inline cpumask_t node_to_cpumask
  
  #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
  
- /* Replace default node_to_cpumask_ptr with optimized version */
+ /*
+  * Replace default node_to_cpumask_ptr with optimized version
+  * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+  */
  #define node_to_cpumask_ptr(v, node)		\
- 		const cpumask_t *v = _node_to_cpumask_ptr(node)
+ 		const cpumask_t *v = cpumask_of_node(node)
  
  #define node_to_cpumask_ptr_next(v, node)	\
- 			   v = _node_to_cpumask_ptr(node)
+ 			   v = cpumask_of_node(node)
  
  #endif /* CONFIG_X86_64 */
  
@@@ -187,7 -196,7 +196,7 @@@ extern int __node_distance(int, int)
  #define	cpu_to_node(cpu)	0
  #define	early_cpu_to_node(cpu)	0
  
- static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+ static inline const cpumask_t *cpumask_of_node(int node)
  {
  	return &cpu_online_map;
  }
@@@ -200,12 -209,15 +209,15 @@@ static inline int node_to_first_cpu(in
  	return first_cpu(cpu_online_map);
  }
  
- /* Replace default node_to_cpumask_ptr with optimized version */
+ /*
+  * Replace default node_to_cpumask_ptr with optimized version
+  * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+  */
  #define node_to_cpumask_ptr(v, node)		\
- 		const cpumask_t *v = _node_to_cpumask_ptr(node)
+ 		const cpumask_t *v = cpumask_of_node(node)
  
  #define node_to_cpumask_ptr_next(v, node)	\
- 			   v = _node_to_cpumask_ptr(node)
+ 			   v = cpumask_of_node(node)
  #endif
  
  #include <asm-generic/topology.h>
@@@ -214,20 -226,18 +226,20 @@@
  /* Returns the number of the first CPU on Node 'node'. */
  static inline int node_to_first_cpu(int node)
  {
- 	node_to_cpumask_ptr(mask, node);
- 	return first_cpu(*mask);
+ 	return cpumask_first(cpumask_of_node(node));
  }
  #endif
  
  extern cpumask_t cpu_coregroup_map(int cpu);
+ extern const struct cpumask *cpu_coregroup_mask(int cpu);
  
  #ifdef ENABLE_TOPO_DEFINES
  #define topology_physical_package_id(cpu)	(cpu_data(cpu).phys_proc_id)
  #define topology_core_id(cpu)			(cpu_data(cpu).cpu_core_id)
  #define topology_core_siblings(cpu)		(per_cpu(cpu_core_map, cpu))
  #define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
 +#define topology_core_cpumask(cpu)		(&per_cpu(cpu_core_map, cpu))
 +#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
  
  /* indicates that pointers to the topology cpumask_t maps are valid */
  #define arch_provides_topology_pointers		yes
diff --combined arch/x86/kernel/apic.c
index b9019271af6,6107b41da9a..6b7f824db16
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/module.h>
  #include <linux/dmi.h>
  #include <linux/dmar.h>
+ #include <linux/ftrace.h>
  
  #include <asm/atomic.h>
  #include <asm/smp.h>
@@@ -118,6 -119,8 +119,6 @@@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_o
  
  int first_system_vector = 0xfe;
  
 -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
 -
  /*
   * Debug level, exported for io_apic.c
   */
@@@ -139,7 -142,7 +140,7 @@@ static int lapic_next_event(unsigned lo
  			    struct clock_event_device *evt);
  static void lapic_timer_setup(enum clock_event_mode mode,
  			      struct clock_event_device *evt);
 -static void lapic_timer_broadcast(const struct cpumask *mask);
 +static void lapic_timer_broadcast(const cpumask_t *mask);
  static void apic_pm_activate(void);
  
  /*
@@@ -452,10 -455,10 +453,10 @@@ static void lapic_timer_setup(enum cloc
  /*
   * Local APIC timer broadcast function
   */
 -static void lapic_timer_broadcast(const struct cpumask *mask)
 +static void lapic_timer_broadcast(const cpumask_t *mask)
  {
  #ifdef CONFIG_SMP
 -	send_IPI_mask(*mask, LOCAL_TIMER_VECTOR);
 +	send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
  #endif
  }
  
@@@ -775,11 -778,7 +776,7 @@@ static void local_apic_timer_interrupt(
  	/*
  	 * the NMI deadlock-detector uses this.
  	 */
- #ifdef CONFIG_X86_64
- 	add_pda(apic_timer_irqs, 1);
- #else
- 	per_cpu(irq_stat, cpu).apic_timer_irqs++;
- #endif
+ 	inc_irq_stat(apic_timer_irqs);
  
  	evt->event_handler(evt);
  }
@@@ -792,7 -791,7 +789,7 @@@
   * [ if a single-CPU system runs an SMP kernel then we call the local
   *   interrupt as well. Thus we cannot inline the local irq ... ]
   */
- void smp_apic_timer_interrupt(struct pt_regs *regs)
+ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
  {
  	struct pt_regs *old_regs = set_irq_regs(regs);
  
@@@ -806,9 -805,7 +803,7 @@@
  	 * Besides, if we don't timer interrupts ignore the global
  	 * interrupt lock, which is the WrongThing (tm) to do.
  	 */
- #ifdef CONFIG_X86_64
  	exit_idle();
- #endif
  	irq_enter();
  	local_apic_timer_interrupt();
  	irq_exit();
@@@ -1666,9 -1663,7 +1661,7 @@@ void smp_spurious_interrupt(struct pt_r
  {
  	u32 v;
  
- #ifdef CONFIG_X86_64
  	exit_idle();
- #endif
  	irq_enter();
  	/*
  	 * Check if this really is a spurious interrupt and ACK it
@@@ -1679,14 -1674,11 +1672,11 @@@
  	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
  		ack_APIC_irq();
  
- #ifdef CONFIG_X86_64
- 	add_pda(irq_spurious_count, 1);
- #else
+ 	inc_irq_stat(irq_spurious_count);
+ 
  	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
  	pr_info("spurious APIC interrupt on CPU#%d, "
  		"should never happen.\n", smp_processor_id());
- 	__get_cpu_var(irq_stat).irq_spurious_count++;
- #endif
  	irq_exit();
  }
  
@@@ -1697,9 -1689,7 +1687,7 @@@ void smp_error_interrupt(struct pt_reg
  {
  	u32 v, v1;
  
- #ifdef CONFIG_X86_64
  	exit_idle();
- #endif
  	irq_enter();
  	/* First tickle the hardware, only then report what went on. -- REW */
  	v = apic_read(APIC_ESR);
@@@ -1817,32 -1807,28 +1805,32 @@@ void disconnect_bsp_APIC(int virt_wire_
  void __cpuinit generic_processor_info(int apicid, int version)
  {
  	int cpu;
 -	cpumask_t tmp_map;
  
  	/*
  	 * Validate version
  	 */
  	if (version == 0x0) {
  		pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
 -			"fixing up to 0x10. (tell your hw vendor)\n",
 -			version);
 +			   "fixing up to 0x10. (tell your hw vendor)\n",
 +				version);
  		version = 0x10;
  	}
  	apic_version[apicid] = version;
  
 -	if (num_processors >= NR_CPUS) {
 -		pr_warning("WARNING: NR_CPUS limit of %i reached."
 -			"  Processor ignored.\n", NR_CPUS);
 +	if (num_processors >= nr_cpu_ids) {
 +		int max = nr_cpu_ids;
 +		int thiscpu = max + disabled_cpus;
 +
 +		pr_warning(
 +			"ACPI: NR_CPUS/possible_cpus limit of %i reached."
 +			"  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
 +
 +		disabled_cpus++;
  		return;
  	}
  
  	num_processors++;
 -	cpus_complement(tmp_map, cpu_present_map);
 -	cpu = first_cpu(tmp_map);
 +	cpu = cpumask_next_zero(-1, cpu_present_mask);
  
  	physid_set(apicid, phys_cpu_present_map);
  	if (apicid == boot_cpu_physical_apicid) {
@@@ -1892,8 -1878,8 +1880,8 @@@
  	}
  #endif
  
 -	cpu_set(cpu, cpu_possible_map);
 -	cpu_set(cpu, cpu_present_map);
 +	set_cpu_possible(cpu, true);
 +	set_cpu_present(cpu, true);
  }
  
  #ifdef CONFIG_X86_64
@@@ -2095,7 -2081,7 +2083,7 @@@ __cpuinit int apic_is_clustered_box(voi
  	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
  	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
  
 -	for (i = 0; i < NR_CPUS; i++) {
 +	for (i = 0; i < nr_cpu_ids; i++) {
  		/* are we being called early in kernel startup? */
  		if (bios_cpu_apicid) {
  			id = bios_cpu_apicid[i];
diff --combined arch/x86/kernel/cpu/intel_cacheinfo.c
index 7bd00a56567,15cf14e9bf2..48533d77be7
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@@ -534,16 -534,31 +534,16 @@@ static void __cpuinit free_cache_attrib
  	per_cpu(cpuid4_info, cpu) = NULL;
  }
  
 -static int __cpuinit detect_cache_attributes(unsigned int cpu)
 +static void __cpuinit get_cpu_leaves(void *_retval)
  {
 -	struct _cpuid4_info	*this_leaf;
 -	unsigned long		j;
 -	int			retval;
 -	cpumask_t		oldmask;
 -
 -	if (num_cache_leaves == 0)
 -		return -ENOENT;
 -
 -	per_cpu(cpuid4_info, cpu) = kzalloc(
 -	    sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
 -	if (per_cpu(cpuid4_info, cpu) == NULL)
 -		return -ENOMEM;
 -
 -	oldmask = current->cpus_allowed;
 -	retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 -	if (retval)
 -		goto out;
 +	int j, *retval = _retval, cpu = smp_processor_id();
  
  	/* Do cpuid and store the results */
  	for (j = 0; j < num_cache_leaves; j++) {
 +		struct _cpuid4_info *this_leaf;
  		this_leaf = CPUID4_INFO_IDX(cpu, j);
 -		retval = cpuid4_cache_lookup(j, this_leaf);
 -		if (unlikely(retval < 0)) {
 +		*retval = cpuid4_cache_lookup(j, this_leaf);
 +		if (unlikely(*retval < 0)) {
  			int i;
  
  			for (i = 0; i < j; i++)
@@@ -552,21 -567,9 +552,21 @@@
  		}
  		cache_shared_cpu_map_setup(cpu, j);
  	}
 -	set_cpus_allowed_ptr(current, &oldmask);
 +}
 +
 +static int __cpuinit detect_cache_attributes(unsigned int cpu)
 +{
 +	int			retval;
 +
 +	if (num_cache_leaves == 0)
 +		return -ENOENT;
 +
 +	per_cpu(cpuid4_info, cpu) = kzalloc(
 +	    sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
 +	if (per_cpu(cpuid4_info, cpu) == NULL)
 +		return -ENOMEM;
  
 -out:
 +	smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
  	if (retval) {
  		kfree(per_cpu(cpuid4_info, cpu));
  		per_cpu(cpuid4_info, cpu) = NULL;
@@@ -641,20 -644,17 +641,17 @@@ static inline ssize_t show_shared_cpu_l
  	return show_shared_cpu_map_func(leaf, 1, buf);
  }
  
- static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
- 	switch(this_leaf->eax.split.type) {
- 	    case CACHE_TYPE_DATA:
+ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
+ {
+ 	switch (this_leaf->eax.split.type) {
+ 	case CACHE_TYPE_DATA:
  		return sprintf(buf, "Data\n");
- 		break;
- 	    case CACHE_TYPE_INST:
+ 	case CACHE_TYPE_INST:
  		return sprintf(buf, "Instruction\n");
- 		break;
- 	    case CACHE_TYPE_UNIFIED:
+ 	case CACHE_TYPE_UNIFIED:
  		return sprintf(buf, "Unified\n");
- 		break;
- 	    default:
+ 	default:
  		return sprintf(buf, "Unknown\n");
- 		break;
  	}
  }
  
diff --combined arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index a1de80f368f,748c8f9e7a0..a5a5e053037
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@@ -83,41 -83,34 +83,41 @@@ static DEFINE_PER_CPU(unsigned char, ba
   * CPU Initialization
   */
  
 +struct thresh_restart {
 +	struct threshold_block *b;
 +	int reset;
 +	u16 old_limit;
 +};
 +
  /* must be called with correct cpu affinity */
 -static void threshold_restart_bank(struct threshold_block *b,
 -				   int reset, u16 old_limit)
 +static long threshold_restart_bank(void *_tr)
  {
 +	struct thresh_restart *tr = _tr;
  	u32 mci_misc_hi, mci_misc_lo;
  
 -	rdmsr(b->address, mci_misc_lo, mci_misc_hi);
 +	rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
  
 -	if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
 -		reset = 1;	/* limit cannot be lower than err count */
 +	if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
 +		tr->reset = 1;	/* limit cannot be lower than err count */
  
 -	if (reset) {		/* reset err count and overflow bit */
 +	if (tr->reset) {		/* reset err count and overflow bit */
  		mci_misc_hi =
  		    (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
 -		    (THRESHOLD_MAX - b->threshold_limit);
 -	} else if (old_limit) {	/* change limit w/o reset */
 +		    (THRESHOLD_MAX - tr->b->threshold_limit);
 +	} else if (tr->old_limit) {	/* change limit w/o reset */
  		int new_count = (mci_misc_hi & THRESHOLD_MAX) +
 -		    (old_limit - b->threshold_limit);
 +		    (tr->old_limit - tr->b->threshold_limit);
  		mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
  		    (new_count & THRESHOLD_MAX);
  	}
  
 -	b->interrupt_enable ?
 +	tr->b->interrupt_enable ?
  	    (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
  	    (mci_misc_hi &= ~MASK_INT_TYPE_HI);
  
  	mci_misc_hi |= MASK_COUNT_EN_HI;
 -	wrmsr(b->address, mci_misc_lo, mci_misc_hi);
 +	wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
 +	return 0;
  }
  
  /* cpu init entry point, called from mce.c with preempt off */
@@@ -127,7 -120,6 +127,7 @@@ void __cpuinit mce_amd_feature_init(str
  	unsigned int cpu = smp_processor_id();
  	u8 lvt_off;
  	u32 low = 0, high = 0, address = 0;
 +	struct thresh_restart tr;
  
  	for (bank = 0; bank < NR_BANKS; ++bank) {
  		for (block = 0; block < NR_BLOCKS; ++block) {
@@@ -170,10 -162,7 +170,10 @@@
  			wrmsr(address, low, high);
  
  			threshold_defaults.address = address;
 -			threshold_restart_bank(&threshold_defaults, 0, 0);
 +			tr.b = &threshold_defaults;
 +			tr.reset = 0;
 +			tr.old_limit = 0;
 +			threshold_restart_bank(&tr);
  		}
  	}
  }
@@@ -248,7 -237,7 +248,7 @@@ asmlinkage void mce_threshold_interrupt
  		}
  	}
  out:
- 	add_pda(irq_threshold_count, 1);
+ 	inc_irq_stat(irq_threshold_count);
  	irq_exit();
  }
  
@@@ -262,6 -251,20 +262,6 @@@ struct threshold_attr 
  	ssize_t(*store) (struct threshold_block *, const char *, size_t count);
  };
  
 -static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
 -					   cpumask_t *newmask)
 -{
 -	*oldmask = current->cpus_allowed;
 -	cpus_clear(*newmask);
 -	cpu_set(cpu, *newmask);
 -	set_cpus_allowed_ptr(current, newmask);
 -}
 -
 -static void affinity_restore(const cpumask_t *oldmask)
 -{
 -	set_cpus_allowed_ptr(current, oldmask);
 -}
 -
  #define SHOW_FIELDS(name)                                           \
  static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
  {                                                                   \
@@@ -274,16 -277,15 +274,16 @@@ static ssize_t store_interrupt_enable(s
  				      const char *buf, size_t count)
  {
  	char *end;
 -	cpumask_t oldmask, newmask;
 +	struct thresh_restart tr;
  	unsigned long new = simple_strtoul(buf, &end, 0);
  	if (end == buf)
  		return -EINVAL;
  	b->interrupt_enable = !!new;
  
 -	affinity_set(b->cpu, &oldmask, &newmask);
 -	threshold_restart_bank(b, 0, 0);
 -	affinity_restore(&oldmask);
 +	tr.b = b;
 +	tr.reset = 0;
 +	tr.old_limit = 0;
 +	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
  
  	return end - buf;
  }
@@@ -292,7 -294,8 +292,7 @@@ static ssize_t store_threshold_limit(st
  				     const char *buf, size_t count)
  {
  	char *end;
 -	cpumask_t oldmask, newmask;
 -	u16 old;
 +	struct thresh_restart tr;
  	unsigned long new = simple_strtoul(buf, &end, 0);
  	if (end == buf)
  		return -EINVAL;
@@@ -300,36 -303,34 +300,36 @@@
  		new = THRESHOLD_MAX;
  	if (new < 1)
  		new = 1;
 -	old = b->threshold_limit;
 +	tr.old_limit = b->threshold_limit;
  	b->threshold_limit = new;
 +	tr.b = b;
 +	tr.reset = 0;
  
 -	affinity_set(b->cpu, &oldmask, &newmask);
 -	threshold_restart_bank(b, 0, old);
 -	affinity_restore(&oldmask);
 +	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
  
  	return end - buf;
  }
  
 -static ssize_t show_error_count(struct threshold_block *b, char *buf)
 +static long local_error_count(void *_b)
  {
 -	u32 high, low;
 -	cpumask_t oldmask, newmask;
 -	affinity_set(b->cpu, &oldmask, &newmask);
 +	struct threshold_block *b = _b;
 +	u32 low, high;
 +
  	rdmsr(b->address, low, high);
 -	affinity_restore(&oldmask);
 -	return sprintf(buf, "%x\n",
 -		       (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
 +	return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
 +}
 +
 +static ssize_t show_error_count(struct threshold_block *b, char *buf)
 +{
 +	return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
  }
  
  static ssize_t store_error_count(struct threshold_block *b,
  				 const char *buf, size_t count)
  {
 -	cpumask_t oldmask, newmask;
 -	affinity_set(b->cpu, &oldmask, &newmask);
 -	threshold_restart_bank(b, 1, 0);
 -	affinity_restore(&oldmask);
 +	struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
 +
 +	work_on_cpu(b->cpu, threshold_restart_bank, &tr);
  	return 1;
  }
  
@@@ -462,19 -463,12 +462,19 @@@ out_free
  	return err;
  }
  
 +static long local_allocate_threshold_blocks(void *_bank)
 +{
 +	unsigned int *bank = _bank;
 +
 +	return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
 +					 MSR_IA32_MC0_MISC + *bank * 4);
 +}
 +
  /* symlinks sibling shared banks to first core.  first core owns dir/files. */
  static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
  {
  	int i, err = 0;
  	struct threshold_bank *b = NULL;
 -	cpumask_t oldmask, newmask;
  	char name[32];
  
  	sprintf(name, "threshold_bank%i", bank);
@@@ -525,7 -519,11 +525,7 @@@
  
  	per_cpu(threshold_banks, cpu)[bank] = b;
  
 -	affinity_set(cpu, &oldmask, &newmask);
 -	err = allocate_threshold_blocks(cpu, bank, 0,
 -					MSR_IA32_MC0_MISC + bank * 4);
 -	affinity_restore(&oldmask);
 -
 +	err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
  	if (err)
  		goto out_free;
  
diff --combined arch/x86/kernel/genx2apic_uv_x.c
index 0e88be11227,dece1728973..b193e082f6c
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@@ -10,6 -10,7 +10,7 @@@
  
  #include <linux/kernel.h>
  #include <linux/threads.h>
+ #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/string.h>
  #include <linux/ctype.h>
@@@ -17,6 -18,9 +18,9 @@@
  #include <linux/sched.h>
  #include <linux/module.h>
  #include <linux/hardirq.h>
+ #include <linux/timer.h>
+ #include <linux/proc_fs.h>
+ #include <asm/current.h>
  #include <asm/smp.h>
  #include <asm/ipi.h>
  #include <asm/genapic.h>
@@@ -75,15 -79,16 +79,15 @@@ EXPORT_SYMBOL(sn_rtc_cycles_per_second)
  
  /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
  
 -static cpumask_t uv_target_cpus(void)
 +static const struct cpumask *uv_target_cpus(void)
  {
 -	return cpumask_of_cpu(0);
 +	return cpumask_of(0);
  }
  
 -static cpumask_t uv_vector_allocation_domain(int cpu)
 +static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
  {
 -	cpumask_t domain = CPU_MASK_NONE;
 -	cpu_set(cpu, domain);
 -	return domain;
 +	cpumask_clear(retmask);
 +	cpumask_set_cpu(cpu, retmask);
  }
  
  int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@@ -122,37 -127,28 +126,37 @@@ static void uv_send_IPI_one(int cpu, in
  	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
  }
  
 -static void uv_send_IPI_mask(cpumask_t mask, int vector)
 +static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
  {
  	unsigned int cpu;
  
 -	for_each_possible_cpu(cpu)
 -		if (cpu_isset(cpu, mask))
 +	for_each_cpu(cpu, mask)
 +		uv_send_IPI_one(cpu, vector);
 +}
 +
 +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 +{
 +	unsigned int cpu;
 +	unsigned int this_cpu = smp_processor_id();
 +
 +	for_each_cpu(cpu, mask)
 +		if (cpu != this_cpu)
  			uv_send_IPI_one(cpu, vector);
  }
  
  static void uv_send_IPI_allbutself(int vector)
  {
 -	cpumask_t mask = cpu_online_map;
 -
 -	cpu_clear(smp_processor_id(), mask);
 +	unsigned int cpu;
 +	unsigned int this_cpu = smp_processor_id();
  
 -	if (!cpus_empty(mask))
 -		uv_send_IPI_mask(mask, vector);
 +	for_each_online_cpu(cpu)
 +		if (cpu != this_cpu)
 +			uv_send_IPI_one(cpu, vector);
  }
  
  static void uv_send_IPI_all(int vector)
  {
 -	uv_send_IPI_mask(cpu_online_map, vector);
 +	uv_send_IPI_mask(cpu_online_mask, vector);
  }
  
  static int uv_apic_id_registered(void)
@@@ -164,7 -160,7 +168,7 @@@ static void uv_init_apic_ldr(void
  {
  }
  
 -static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
 +static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
  {
  	int cpu;
  
@@@ -172,30 -168,13 +176,30 @@@
  	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
  	 * May as well be the first.
  	 */
 -	cpu = first_cpu(cpumask);
 +	cpu = cpumask_first(cpumask);
  	if ((unsigned)cpu < nr_cpu_ids)
  		return per_cpu(x86_cpu_to_apicid, cpu);
  	else
  		return BAD_APICID;
  }
  
 +static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 +					      const struct cpumask *andmask)
 +{
 +	int cpu;
 +
 +	/*
 +	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
 +	 * May as well be the first.
 +	 */
 +	for_each_cpu_and(cpu, cpumask, andmask)
 +		if (cpumask_test_cpu(cpu, cpu_online_mask))
 +			break;
 +	if (cpu < nr_cpu_ids)
 +		return per_cpu(x86_cpu_to_apicid, cpu);
 +	return BAD_APICID;
 +}
 +
  static unsigned int get_apic_id(unsigned long x)
  {
  	unsigned int id;
@@@ -243,10 -222,8 +247,10 @@@ struct genapic apic_x2apic_uv_x = 
  	.send_IPI_all = uv_send_IPI_all,
  	.send_IPI_allbutself = uv_send_IPI_allbutself,
  	.send_IPI_mask = uv_send_IPI_mask,
 +	.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
  	.send_IPI_self = uv_send_IPI_self,
  	.cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
 +	.cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
  	.phys_pkg_id = phys_pkg_id,
  	.get_apic_id = get_apic_id,
  	.set_apic_id = set_apic_id,
@@@ -382,6 -359,103 +386,103 @@@ static __init void uv_rtc_init(void
  		sn_rtc_cycles_per_second = ticks_per_sec;
  }
  
+ /*
+  * percpu heartbeat timer
+  */
+ static void uv_heartbeat(unsigned long ignored)
+ {
+ 	struct timer_list *timer = &uv_hub_info->scir.timer;
+ 	unsigned char bits = uv_hub_info->scir.state;
+ 
+ 	/* flip heartbeat bit */
+ 	bits ^= SCIR_CPU_HEARTBEAT;
+ 
+ 	/* is this cpu idle? */
+ 	if (idle_cpu(raw_smp_processor_id()))
+ 		bits &= ~SCIR_CPU_ACTIVITY;
+ 	else
+ 		bits |= SCIR_CPU_ACTIVITY;
+ 
+ 	/* update system controller interface reg */
+ 	uv_set_scir_bits(bits);
+ 
+ 	/* enable next timer period */
+ 	mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+ }
+ 
+ static void __cpuinit uv_heartbeat_enable(int cpu)
+ {
+ 	if (!uv_cpu_hub_info(cpu)->scir.enabled) {
+ 		struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
+ 
+ 		uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
+ 		setup_timer(timer, uv_heartbeat, cpu);
+ 		timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
+ 		add_timer_on(timer, cpu);
+ 		uv_cpu_hub_info(cpu)->scir.enabled = 1;
+ 	}
+ 
+ 	/* check boot cpu */
+ 	if (!uv_cpu_hub_info(0)->scir.enabled)
+ 		uv_heartbeat_enable(0);
+ }
+ 
+ #ifdef CONFIG_HOTPLUG_CPU
+ static void __cpuinit uv_heartbeat_disable(int cpu)
+ {
+ 	if (uv_cpu_hub_info(cpu)->scir.enabled) {
+ 		uv_cpu_hub_info(cpu)->scir.enabled = 0;
+ 		del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
+ 	}
+ 	uv_set_cpu_scir_bits(cpu, 0xff);
+ }
+ 
+ /*
+  * cpu hotplug notifier
+  */
+ static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
+ 				       unsigned long action, void *hcpu)
+ {
+ 	long cpu = (long)hcpu;
+ 
+ 	switch (action) {
+ 	case CPU_ONLINE:
+ 		uv_heartbeat_enable(cpu);
+ 		break;
+ 	case CPU_DOWN_PREPARE:
+ 		uv_heartbeat_disable(cpu);
+ 		break;
+ 	default:
+ 		break;
+ 	}
+ 	return NOTIFY_OK;
+ }
+ 
+ static __init void uv_scir_register_cpu_notifier(void)
+ {
+ 	hotcpu_notifier(uv_scir_cpu_notify, 0);
+ }
+ 
+ #else /* !CONFIG_HOTPLUG_CPU */
+ 
+ static __init void uv_scir_register_cpu_notifier(void)
+ {
+ }
+ 
+ static __init int uv_init_heartbeat(void)
+ {
+ 	int cpu;
+ 
+ 	if (is_uv_system())
+ 		for_each_online_cpu(cpu)
+ 			uv_heartbeat_enable(cpu);
+ 	return 0;
+ }
+ 
+ late_initcall(uv_init_heartbeat);
+ 
+ #endif /* !CONFIG_HOTPLUG_CPU */
+ 
  /*
   * Called on each cpu to initialize the per_cpu UV data area.
   * 	ZZZ hotplug not supported yet
@@@ -455,7 -529,7 +556,7 @@@ void __init uv_system_init(void
  
  	uv_bios_init();
  	uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
- 			    &uv_coherency_id, &uv_region_size);
+ 			    &sn_coherency_id, &sn_region_size);
  	uv_rtc_init();
  
  	for_each_present_cpu(cpu) {
@@@ -466,8 -540,7 +567,7 @@@
  		uv_blade_info[blade].nr_possible_cpus++;
  
  		uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
- 		uv_cpu_hub_info(cpu)->lowmem_remap_top =
- 					lowmem_redir_base + lowmem_redir_size;
+ 		uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
  		uv_cpu_hub_info(cpu)->m_val = m_val;
  		uv_cpu_hub_info(cpu)->n_val = m_val;
  		uv_cpu_hub_info(cpu)->numa_blade_id = blade;
@@@ -477,7 -550,8 +577,8 @@@
  		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
  		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
  		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
- 		uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
+ 		uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
+ 		uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
  		uv_node_to_blade[nid] = blade;
  		uv_cpu_to_blade[cpu] = blade;
  		max_pnode = max(pnode, max_pnode);
@@@ -494,4 -568,6 +595,6 @@@
  	map_mmioh_high(max_pnode);
  
  	uv_cpu_init();
+ 	uv_scir_register_cpu_notifier();
+ 	proc_mkdir("sgi_uv", NULL);
  }
diff --combined arch/x86/kernel/io_apic.c
index 1cbf7c8d46e,e7745961ed3..3e070bb961d
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@@ -136,8 -136,8 +136,8 @@@ static struct irq_pin_list *get_one_fre
  
  struct irq_cfg {
  	struct irq_pin_list *irq_2_pin;
 -	cpumask_t domain;
 -	cpumask_t old_domain;
 +	cpumask_var_t domain;
 +	cpumask_var_t old_domain;
  	unsigned move_cleanup_count;
  	u8 vector;
  	u8 move_in_progress : 1;
@@@ -152,22 -152,22 +152,22 @@@ static struct irq_cfg irq_cfgx[] = 
  #else
  static struct irq_cfg irq_cfgx[NR_IRQS] = {
  #endif
 -	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
 -	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
 -	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
 -	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
 -	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
 -	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
 -	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
 -	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
 -	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
 -	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
 -	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
 -	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
 -	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
 -	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
 -	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
 -	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 +	[0]  = { .vector = IRQ0_VECTOR,  },
 +	[1]  = { .vector = IRQ1_VECTOR,  },
 +	[2]  = { .vector = IRQ2_VECTOR,  },
 +	[3]  = { .vector = IRQ3_VECTOR,  },
 +	[4]  = { .vector = IRQ4_VECTOR,  },
 +	[5]  = { .vector = IRQ5_VECTOR,  },
 +	[6]  = { .vector = IRQ6_VECTOR,  },
 +	[7]  = { .vector = IRQ7_VECTOR,  },
 +	[8]  = { .vector = IRQ8_VECTOR,  },
 +	[9]  = { .vector = IRQ9_VECTOR,  },
 +	[10] = { .vector = IRQ10_VECTOR, },
 +	[11] = { .vector = IRQ11_VECTOR, },
 +	[12] = { .vector = IRQ12_VECTOR, },
 +	[13] = { .vector = IRQ13_VECTOR, },
 +	[14] = { .vector = IRQ14_VECTOR, },
 +	[15] = { .vector = IRQ15_VECTOR, },
  };
  
  void __init arch_early_irq_init(void)
@@@ -183,10 -183,6 +183,10 @@@
  	for (i = 0; i < count; i++) {
  		desc = irq_to_desc(i);
  		desc->chip_data = &cfg[i];
 +		alloc_bootmem_cpumask_var(&cfg[i].domain);
 +		alloc_bootmem_cpumask_var(&cfg[i].old_domain);
 +		if (i < NR_IRQS_LEGACY)
 +			cpumask_setall(cfg[i].domain);
  	}
  }
  
@@@ -211,20 -207,6 +211,20 @@@ static struct irq_cfg *get_one_free_irq
  	node = cpu_to_node(cpu);
  
  	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
 +	if (cfg) {
 +		/* FIXME: needs alloc_cpumask_var_node() */
 +		if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
 +			kfree(cfg);
 +			cfg = NULL;
 +		} else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
 +			free_cpumask_var(cfg->domain);
 +			kfree(cfg);
 +			cfg = NULL;
 +		} else {
 +			cpumask_clear(cfg->domain);
 +			cpumask_clear(cfg->old_domain);
 +		}
 +	}
  	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
  
  	return cfg;
@@@ -347,14 -329,13 +347,14 @@@ void arch_free_chip_data(struct irq_des
  	}
  }
  
 -static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
 +static void
 +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
  {
  	struct irq_cfg *cfg = desc->chip_data;
  
  	if (!cfg->move_in_progress) {
  		/* it means that domain is not changed */
 -		if (!cpus_intersects(desc->affinity, mask))
 +		if (!cpumask_intersects(&desc->affinity, mask))
  			cfg->move_desc_pending = 1;
  	}
  }
@@@ -369,8 -350,7 +369,8 @@@ static struct irq_cfg *irq_cfg(unsigne
  #endif
  
  #ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
 -static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
 +static inline void
 +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
  {
  }
  #endif
@@@ -501,26 -481,6 +501,26 @@@ static void ioapic_mask_entry(int apic
  }
  
  #ifdef CONFIG_SMP
 +static void send_cleanup_vector(struct irq_cfg *cfg)
 +{
 +	cpumask_var_t cleanup_mask;
 +
 +	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
 +		unsigned int i;
 +		cfg->move_cleanup_count = 0;
 +		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
 +			cfg->move_cleanup_count++;
 +		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
 +			send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
 +	} else {
 +		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
 +		cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
 +		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 +		free_cpumask_var(cleanup_mask);
 +	}
 +	cfg->move_in_progress = 0;
 +}
 +
  static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
  {
  	int apic, pin;
@@@ -556,61 -516,48 +556,61 @@@
  	}
  }
  
 -static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 +static int
 +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
  
 -static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
 +/*
 + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
 + * of that, or returns BAD_APICID and leaves desc->affinity untouched.
 + */
 +static unsigned int
 +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
  {
  	struct irq_cfg *cfg;
 -	unsigned long flags;
 -	unsigned int dest;
 -	cpumask_t tmp;
  	unsigned int irq;
  
 -	cpus_and(tmp, mask, cpu_online_map);
 -	if (cpus_empty(tmp))
 -		return;
 +	if (!cpumask_intersects(mask, cpu_online_mask))
 +		return BAD_APICID;
  
  	irq = desc->irq;
  	cfg = desc->chip_data;
  	if (assign_irq_vector(irq, cfg, mask))
 -		return;
 +		return BAD_APICID;
  
 +	cpumask_and(&desc->affinity, cfg->domain, mask);
  	set_extra_move_desc(desc, mask);
 +	return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
 +}
  
 -	cpus_and(tmp, cfg->domain, mask);
 -	dest = cpu_mask_to_apicid(tmp);
 -	/*
 -	 * Only the high 8 bits are valid.
 -	 */
 -	dest = SET_APIC_LOGICAL_ID(dest);
 +static void
 +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 +{
 +	struct irq_cfg *cfg;
 +	unsigned long flags;
 +	unsigned int dest;
 +	unsigned int irq;
 +
 +	irq = desc->irq;
 +	cfg = desc->chip_data;
  
  	spin_lock_irqsave(&ioapic_lock, flags);
 -	__target_IO_APIC_irq(irq, dest, cfg);
 -	desc->affinity = mask;
 +	dest = set_desc_affinity(desc, mask);
 +	if (dest != BAD_APICID) {
 +		/* Only the high 8 bits are valid. */
 +		dest = SET_APIC_LOGICAL_ID(dest);
 +		__target_IO_APIC_irq(irq, dest, cfg);
 +	}
  	spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
 -static void set_ioapic_affinity_irq(unsigned int irq,
 -				    const struct cpumask *mask)
 +static void
 +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
  {
  	struct irq_desc *desc;
  
  	desc = irq_to_desc(irq);
  
 -	set_ioapic_affinity_irq_desc(desc, *mask);
 +	set_ioapic_affinity_irq_desc(desc, mask);
  }
  #endif /* CONFIG_SMP */
  
@@@ -1272,8 -1219,7 +1272,8 @@@ void unlock_vector_lock(void
  	spin_unlock(&vector_lock);
  }
  
 -static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 +static int
 +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
  {
  	/*
  	 * NOTE! The local APIC isn't very good at handling
@@@ -1288,49 -1234,49 +1288,49 @@@
  	 */
  	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
  	unsigned int old_vector;
 -	int cpu;
 +	int cpu, err;
 +	cpumask_var_t tmp_mask;
  
  	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
  		return -EBUSY;
  
 -	/* Only try and allocate irqs on cpus that are present */
 -	cpus_and(mask, mask, cpu_online_map);
 +	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
 +		return -ENOMEM;
  
  	old_vector = cfg->vector;
  	if (old_vector) {
 -		cpumask_t tmp;
 -		cpus_and(tmp, cfg->domain, mask);
 -		if (!cpus_empty(tmp))
 +		cpumask_and(tmp_mask, mask, cpu_online_mask);
 +		cpumask_and(tmp_mask, cfg->domain, tmp_mask);
 +		if (!cpumask_empty(tmp_mask)) {
 +			free_cpumask_var(tmp_mask);
  			return 0;
 +		}
  	}
  
 -	for_each_cpu_mask_nr(cpu, mask) {
 -		cpumask_t domain, new_mask;
 +	/* Only try and allocate irqs on cpus that are present */
 +	err = -ENOSPC;
 +	for_each_cpu_and(cpu, mask, cpu_online_mask) {
  		int new_cpu;
  		int vector, offset;
  
 -		domain = vector_allocation_domain(cpu);
 -		cpus_and(new_mask, domain, cpu_online_map);
 +		vector_allocation_domain(cpu, tmp_mask);
  
  		vector = current_vector;
  		offset = current_offset;
  next:
  		vector += 8;
  		if (vector >= first_system_vector) {
 -			/* If we run out of vectors on large boxen, must share them. */
 +			/* If out of vectors on large boxen, must share them. */
  			offset = (offset + 1) % 8;
  			vector = FIRST_DEVICE_VECTOR + offset;
  		}
  		if (unlikely(current_vector == vector))
  			continue;
 -#ifdef CONFIG_X86_64
 -		if (vector == IA32_SYSCALL_VECTOR)
 -			goto next;
 -#else
 -		if (vector == SYSCALL_VECTOR)
 +
 +		if (test_bit(vector, used_vectors))
  			goto next;
 -#endif
 -		for_each_cpu_mask_nr(new_cpu, new_mask)
 +
 +		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
  			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
  				goto next;
  		/* Found one! */
@@@ -1338,21 -1284,18 +1338,21 @@@
  		current_offset = offset;
  		if (old_vector) {
  			cfg->move_in_progress = 1;
 -			cfg->old_domain = cfg->domain;
 +			cpumask_copy(cfg->old_domain, cfg->domain);
  		}
 -		for_each_cpu_mask_nr(new_cpu, new_mask)
 +		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
  			per_cpu(vector_irq, new_cpu)[vector] = irq;
  		cfg->vector = vector;
 -		cfg->domain = domain;
 -		return 0;
 +		cpumask_copy(cfg->domain, tmp_mask);
 +		err = 0;
 +		break;
  	}
 -	return -ENOSPC;
 +	free_cpumask_var(tmp_mask);
 +	return err;
  }
  
 -static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 +static int
 +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
  {
  	int err;
  	unsigned long flags;
@@@ -1365,20 -1308,23 +1365,20 @@@
  
  static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
  {
 -	cpumask_t mask;
  	int cpu, vector;
  
  	BUG_ON(!cfg->vector);
  
  	vector = cfg->vector;
 -	cpus_and(mask, cfg->domain, cpu_online_map);
 -	for_each_cpu_mask_nr(cpu, mask)
 +	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
  		per_cpu(vector_irq, cpu)[vector] = -1;
  
  	cfg->vector = 0;
 -	cpus_clear(cfg->domain);
 +	cpumask_clear(cfg->domain);
  
  	if (likely(!cfg->move_in_progress))
  		return;
 -	cpus_and(mask, cfg->old_domain, cpu_online_map);
 -	for_each_cpu_mask_nr(cpu, mask) {
 +	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
  		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
  								vector++) {
  			if (per_cpu(vector_irq, cpu)[vector] != irq)
@@@ -1403,7 -1349,7 +1403,7 @@@ void __setup_vector_irq(int cpu
  		if (!desc)
  			continue;
  		cfg = desc->chip_data;
 -		if (!cpu_isset(cpu, cfg->domain))
 +		if (!cpumask_test_cpu(cpu, cfg->domain))
  			continue;
  		vector = cfg->vector;
  		per_cpu(vector_irq, cpu)[vector] = irq;
@@@ -1415,7 -1361,7 +1415,7 @@@
  			continue;
  
  		cfg = irq_cfg(irq);
 -		if (!cpu_isset(cpu, cfg->domain))
 +		if (!cpumask_test_cpu(cpu, cfg->domain))
  			per_cpu(vector_irq, cpu)[vector] = -1;
  	}
  }
@@@ -1551,17 -1497,18 +1551,17 @@@ static void setup_IO_APIC_irq(int apic
  {
  	struct irq_cfg *cfg;
  	struct IO_APIC_route_entry entry;
 -	cpumask_t mask;
 +	unsigned int dest;
  
  	if (!IO_APIC_IRQ(irq))
  		return;
  
  	cfg = desc->chip_data;
  
 -	mask = TARGET_CPUS;
 -	if (assign_irq_vector(irq, cfg, mask))
 +	if (assign_irq_vector(irq, cfg, TARGET_CPUS))
  		return;
  
 -	cpus_and(mask, cfg->domain, mask);
 +	dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
  
  	apic_printk(APIC_VERBOSE,KERN_DEBUG
  		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@@ -1571,7 -1518,8 +1571,7 @@@
  
  
  	if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
 -			       cpu_mask_to_apicid(mask), trigger, polarity,
 -			       cfg->vector)) {
 +			       dest, trigger, polarity, cfg->vector)) {
  		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
  		       mp_ioapics[apic].mp_apicid, pin);
  		__clear_irq_vector(irq, cfg);
@@@ -2293,7 -2241,7 +2293,7 @@@ static int ioapic_retrigger_irq(unsigne
  	unsigned long flags;
  
  	spin_lock_irqsave(&vector_lock, flags);
 -	send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
 +	send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
  	spin_unlock_irqrestore(&vector_lock, flags);
  
  	return 1;
@@@ -2342,17 -2290,18 +2342,17 @@@ static DECLARE_DELAYED_WORK(ir_migratio
   * as simple as edge triggered migration and we can do the irq migration
   * with a simple atomic update to IO-APIC RTE.
   */
 -static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
 +static void
 +migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
  {
  	struct irq_cfg *cfg;
 -	cpumask_t tmp, cleanup_mask;
  	struct irte irte;
  	int modify_ioapic_rte;
  	unsigned int dest;
  	unsigned long flags;
  	unsigned int irq;
  
 -	cpus_and(tmp, mask, cpu_online_map);
 -	if (cpus_empty(tmp))
 +	if (!cpumask_intersects(mask, cpu_online_mask))
  		return;
  
  	irq = desc->irq;
@@@ -2365,7 -2314,8 +2365,7 @@@
  
  	set_extra_move_desc(desc, mask);
  
 -	cpus_and(tmp, cfg->domain, mask);
 -	dest = cpu_mask_to_apicid(tmp);
 +	dest = cpu_mask_to_apicid_and(cfg->domain, mask);
  
  	modify_ioapic_rte = desc->status & IRQ_LEVEL;
  	if (modify_ioapic_rte) {
@@@ -2382,10 -2332,14 +2382,10 @@@
  	 */
  	modify_irte(irq, &irte);
  
 -	if (cfg->move_in_progress) {
 -		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 -		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 -		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 -		cfg->move_in_progress = 0;
 -	}
 +	if (cfg->move_in_progress)
 +		send_cleanup_vector(cfg);
  
 -	desc->affinity = mask;
 +	cpumask_copy(&desc->affinity, mask);
  }
  
  static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@@ -2407,11 -2361,11 +2407,11 @@@
  	}
  
  	/* everthing is clear. we have right of way */
 -	migrate_ioapic_irq_desc(desc, desc->pending_mask);
 +	migrate_ioapic_irq_desc(desc, &desc->pending_mask);
  
  	ret = 0;
  	desc->status &= ~IRQ_MOVE_PENDING;
 -	cpus_clear(desc->pending_mask);
 +	cpumask_clear(&desc->pending_mask);
  
  unmask:
  	unmask_IO_APIC_irq_desc(desc);
@@@ -2448,12 -2402,11 +2448,12 @@@ static void ir_irq_migration(struct wor
  /*
   * Migrates the IRQ destination in the process context.
   */
 -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
 +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 +					    const struct cpumask *mask)
  {
  	if (desc->status & IRQ_LEVEL) {
  		desc->status |= IRQ_MOVE_PENDING;
 -		desc->pending_mask = mask;
 +		cpumask_copy(&desc->pending_mask, mask);
  		migrate_irq_remapped_level_desc(desc);
  		return;
  	}
@@@ -2465,17 -2418,16 +2465,16 @@@ static void set_ir_ioapic_affinity_irq(
  {
  	struct irq_desc *desc = irq_to_desc(irq);
  
 -	set_ir_ioapic_affinity_irq_desc(desc, *mask);
 +	set_ir_ioapic_affinity_irq_desc(desc, mask);
  }
  #endif
  
  asmlinkage void smp_irq_move_cleanup_interrupt(void)
  {
  	unsigned vector, me;
+ 
  	ack_APIC_irq();
- #ifdef CONFIG_X86_64
  	exit_idle();
- #endif
  	irq_enter();
  
  	me = smp_processor_id();
@@@ -2497,7 -2449,7 +2496,7 @@@
  		if (!cfg->move_cleanup_count)
  			goto unlock;
  
 -		if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
 +		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
  			goto unlock;
  
  		__get_cpu_var(vector_irq)[vector] = -1;
@@@ -2520,7 -2472,7 +2519,7 @@@ static void irq_complete_move(struct ir
  		if (likely(!cfg->move_desc_pending))
  			return;
  
- 		/* domain is not change, but affinity is changed */
+ 		/* domain has not changed, but affinity did */
  		me = smp_processor_id();
  		if (cpu_isset(me, desc->affinity)) {
  			*descp = desc = move_irq_desc(desc, me);
@@@ -2534,14 -2486,20 +2533,14 @@@
  
  	vector = ~get_irq_regs()->orig_ax;
  	me = smp_processor_id();
 -	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 -		cpumask_t cleanup_mask;
 -
  #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
  		*descp = desc = move_irq_desc(desc, me);
  		/* get the new one */
  		cfg = desc->chip_data;
  #endif
  
 -		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 -		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 -		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 -		cfg->move_in_progress = 0;
 -	}
 +	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 +		send_cleanup_vector(cfg);
  }
  #else
  static inline void irq_complete_move(struct irq_desc **descp) {}
@@@ -3266,13 -3224,16 +3265,13 @@@ static int msi_compose_msg(struct pci_d
  	struct irq_cfg *cfg;
  	int err;
  	unsigned dest;
 -	cpumask_t tmp;
  
  	cfg = irq_cfg(irq);
 -	tmp = TARGET_CPUS;
 -	err = assign_irq_vector(irq, cfg, tmp);
 +	err = assign_irq_vector(irq, cfg, TARGET_CPUS);
  	if (err)
  		return err;
  
 -	cpus_and(tmp, cfg->domain, tmp);
 -	dest = cpu_mask_to_apicid(tmp);
 +	dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
  
  #ifdef CONFIG_INTR_REMAP
  	if (irq_remapped(irq)) {
@@@ -3332,12 -3293,19 +3331,12 @@@ static void set_msi_irq_affinity(unsign
  	struct irq_cfg *cfg;
  	struct msi_msg msg;
  	unsigned int dest;
 -	cpumask_t tmp;
  
 -	if (!cpumask_intersects(mask, cpu_online_mask))
 +	dest = set_desc_affinity(desc, mask);
 +	if (dest == BAD_APICID)
  		return;
  
  	cfg = desc->chip_data;
 -	if (assign_irq_vector(irq, cfg, *mask))
 -		return;
 -
 -	set_extra_move_desc(desc, *mask);
 -
 -	cpumask_and(&tmp, &cfg->domain, mask);
 -	dest = cpu_mask_to_apicid(tmp);
  
  	read_msi_msg_desc(desc, &msg);
  
@@@ -3347,27 -3315,37 +3346,27 @@@
  	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
  	write_msi_msg_desc(desc, &msg);
 -	cpumask_copy(&desc->affinity, mask);
  }
  #ifdef CONFIG_INTR_REMAP
  /*
   * Migrate the MSI irq to another cpumask. This migration is
   * done in the process context using interrupt-remapping hardware.
   */
 -static void ir_set_msi_irq_affinity(unsigned int irq,
 -				    const struct cpumask *mask)
 +static void
 +ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
  {
  	struct irq_desc *desc = irq_to_desc(irq);
 -	struct irq_cfg *cfg;
 +	struct irq_cfg *cfg = desc->chip_data;
  	unsigned int dest;
 -	cpumask_t tmp, cleanup_mask;
  	struct irte irte;
  
 -	if (!cpumask_intersects(mask, cpu_online_mask))
 -		return;
 -
  	if (get_irte(irq, &irte))
  		return;
  
 -	cfg = desc->chip_data;
 -	if (assign_irq_vector(irq, cfg, *mask))
 +	dest = set_desc_affinity(desc, mask);
 +	if (dest == BAD_APICID)
  		return;
  
 -	set_extra_move_desc(desc, *mask);
 -
 -	cpumask_and(&tmp, &cfg->domain, mask);
 -	dest = cpu_mask_to_apicid(tmp);
 -
  	irte.vector = cfg->vector;
  	irte.dest_id = IRTE_DEST(dest);
  
@@@ -3381,8 -3359,14 +3380,8 @@@
  	 * at the new destination. So, time to cleanup the previous
  	 * vector allocation.
  	 */
 -	if (cfg->move_in_progress) {
 -		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 -		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 -		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 -		cfg->move_in_progress = 0;
 -	}
 -
 -	cpumask_copy(&desc->affinity, mask);
 +	if (cfg->move_in_progress)
 +		send_cleanup_vector(cfg);
  }
  
  #endif
@@@ -3579,12 -3563,19 +3578,12 @@@ static void dmar_msi_set_affinity(unsig
  	struct irq_cfg *cfg;
  	struct msi_msg msg;
  	unsigned int dest;
 -	cpumask_t tmp;
  
 -	if (!cpumask_intersects(mask, cpu_online_mask))
 +	dest = set_desc_affinity(desc, mask);
 +	if (dest == BAD_APICID)
  		return;
  
  	cfg = desc->chip_data;
 -	if (assign_irq_vector(irq, cfg, *mask))
 -		return;
 -
 -	set_extra_move_desc(desc, *mask);
 -
 -	cpumask_and(&tmp, &cfg->domain, mask);
 -	dest = cpu_mask_to_apicid(tmp);
  
  	dmar_msi_read(irq, &msg);
  
@@@ -3594,6 -3585,7 +3593,6 @@@
  	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
  	dmar_msi_write(irq, &msg);
 -	cpumask_copy(&desc->affinity, mask);
  }
  
  #endif /* CONFIG_SMP */
@@@ -3633,12 -3625,19 +3632,12 @@@ static void hpet_msi_set_affinity(unsig
  	struct irq_cfg *cfg;
  	struct msi_msg msg;
  	unsigned int dest;
 -	cpumask_t tmp;
  
 -	if (!cpumask_intersects(mask, cpu_online_mask))
 +	dest = set_desc_affinity(desc, mask);
 +	if (dest == BAD_APICID)
  		return;
  
  	cfg = desc->chip_data;
 -	if (assign_irq_vector(irq, cfg, *mask))
 -		return;
 -
 -	set_extra_move_desc(desc, *mask);
 -
 -	cpumask_and(&tmp, &cfg->domain, mask);
 -	dest = cpu_mask_to_apicid(tmp);
  
  	hpet_msi_read(irq, &msg);
  
@@@ -3648,6 -3647,7 +3647,6 @@@
  	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
  	hpet_msi_write(irq, &msg);
 -	cpumask_copy(&desc->affinity, mask);
  }
  
  #endif /* CONFIG_SMP */
@@@ -3707,14 -3707,22 +3706,14 @@@ static void set_ht_irq_affinity(unsigne
  	struct irq_desc *desc = irq_to_desc(irq);
  	struct irq_cfg *cfg;
  	unsigned int dest;
 -	cpumask_t tmp;
  
 -	if (!cpumask_intersects(mask, cpu_online_mask))
 +	dest = set_desc_affinity(desc, mask);
 +	if (dest == BAD_APICID)
  		return;
  
  	cfg = desc->chip_data;
 -	if (assign_irq_vector(irq, cfg, *mask))
 -		return;
 -
 -	set_extra_move_desc(desc, *mask);
 -
 -	cpumask_and(&tmp, &cfg->domain, mask);
 -	dest = cpu_mask_to_apicid(tmp);
  
  	target_ht_irq(irq, dest, cfg->vector);
 -	cpumask_copy(&desc->affinity, mask);
  }
  
  #endif
@@@ -3734,14 -3742,17 +3733,14 @@@ int arch_setup_ht_irq(unsigned int irq
  {
  	struct irq_cfg *cfg;
  	int err;
 -	cpumask_t tmp;
  
  	cfg = irq_cfg(irq);
 -	tmp = TARGET_CPUS;
 -	err = assign_irq_vector(irq, cfg, tmp);
 +	err = assign_irq_vector(irq, cfg, TARGET_CPUS);
  	if (!err) {
  		struct ht_irq_msg msg;
  		unsigned dest;
  
 -		cpus_and(tmp, cfg->domain, tmp);
 -		dest = cpu_mask_to_apicid(tmp);
 +		dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
  
  		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
  
@@@ -3777,7 -3788,7 +3776,7 @@@
  int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
  		       unsigned long mmr_offset)
  {
 -	const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
 +	const struct cpumask *eligible_cpu = cpumask_of(cpu);
  	struct irq_cfg *cfg;
  	int mmr_pnode;
  	unsigned long mmr_value;
@@@ -3787,7 -3798,7 +3786,7 @@@
  
  	cfg = irq_cfg(irq);
  
 -	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 +	err = assign_irq_vector(irq, cfg, eligible_cpu);
  	if (err != 0)
  		return err;
  
@@@ -3806,7 -3817,7 +3805,7 @@@
  	entry->polarity = 0;
  	entry->trigger = 0;
  	entry->mask = 0;
 -	entry->dest = cpu_mask_to_apicid(*eligible_cpu);
 +	entry->dest = cpu_mask_to_apicid(eligible_cpu);
  
  	mmr_pnode = uv_blade_to_pnode(mmr_blade);
  	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@@ -4017,7 -4028,7 +4016,7 @@@ void __init setup_ioapic_dest(void
  	int pin, ioapic, irq, irq_entry;
  	struct irq_desc *desc;
  	struct irq_cfg *cfg;
 -	cpumask_t mask;
 +	const struct cpumask *mask;
  
  	if (skip_ioapic_setup == 1)
  		return;
@@@ -4048,7 -4059,7 +4047,7 @@@
  			 */
  			if (desc->status &
  			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
 -				mask = desc->affinity;
 +				mask = &desc->affinity;
  			else
  				mask = TARGET_CPUS;
  
diff --combined arch/x86/kernel/irq_64.c
index fca2991443f,54c69d47a77..6383d50f82e
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@@ -13,12 -13,12 +13,12 @@@
  #include <linux/seq_file.h>
  #include <linux/module.h>
  #include <linux/delay.h>
+ #include <linux/ftrace.h>
  #include <asm/uaccess.h>
  #include <asm/io_apic.h>
  #include <asm/idle.h>
  #include <asm/smp.h>
  
- #ifdef CONFIG_DEBUG_STACKOVERFLOW
  /*
   * Probabilistic stack overflow check:
   *
@@@ -28,26 -28,25 +28,25 @@@
   */
  static inline void stack_overflow_check(struct pt_regs *regs)
  {
+ #ifdef CONFIG_DEBUG_STACKOVERFLOW
  	u64 curbase = (u64)task_stack_page(current);
- 	static unsigned long warned = -60*HZ;
- 
- 	if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
- 	    regs->sp <  curbase + sizeof(struct thread_info) + 128 &&
- 	    time_after(jiffies, warned + 60*HZ)) {
- 		printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- 		       current->comm, curbase, regs->sp);
- 		show_stack(NULL,NULL);
- 		warned = jiffies;
- 	}
- }
+ 
+ 	WARN_ONCE(regs->sp >= curbase &&
+ 		  regs->sp <= curbase + THREAD_SIZE &&
+ 		  regs->sp <  curbase + sizeof(struct thread_info) +
+ 					sizeof(struct pt_regs) + 128,
+ 
+ 		  "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+ 			current->comm, curbase, regs->sp);
  #endif
+ }
  
  /*
   * do_IRQ handles all normal device IRQ's (the special
   * SMP cross-CPU interrupts have their own specific
   * handlers).
   */
- asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+ asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
  {
  	struct pt_regs *old_regs = set_irq_regs(regs);
  	struct irq_desc *desc;
@@@ -60,9 -59,7 +59,7 @@@
  	irq_enter();
  	irq = __get_cpu_var(vector_irq)[vector];
  
- #ifdef CONFIG_DEBUG_STACKOVERFLOW
  	stack_overflow_check(regs);
- #endif
  
  	desc = irq_to_desc(irq);
  	if (likely(desc))
@@@ -83,17 -80,16 +80,17 @@@
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
 -void fixup_irqs(cpumask_t map)
 +/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
 +void fixup_irqs(void)
  {
  	unsigned int irq;
  	static int warned;
  	struct irq_desc *desc;
  
  	for_each_irq_desc(irq, desc) {
 -		cpumask_t mask;
  		int break_affinity = 0;
  		int set_affinity = 1;
 +		const struct cpumask *affinity;
  
  		if (!desc)
  			continue;
@@@ -103,23 -99,23 +100,23 @@@
  		/* interrupt's are disabled at this point */
  		spin_lock(&desc->lock);
  
 +		affinity = &desc->affinity;
  		if (!irq_has_action(irq) ||
 -		    cpus_equal(desc->affinity, map)) {
 +		    cpumask_equal(affinity, cpu_online_mask)) {
  			spin_unlock(&desc->lock);
  			continue;
  		}
  
 -		cpus_and(mask, desc->affinity, map);
 -		if (cpus_empty(mask)) {
 +		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
  			break_affinity = 1;
 -			mask = map;
 +			affinity = cpu_all_mask;
  		}
  
  		if (desc->chip->mask)
  			desc->chip->mask(irq);
  
  		if (desc->chip->set_affinity)
 -			desc->chip->set_affinity(irq, &mask);
 +			desc->chip->set_affinity(irq, affinity);
  		else if (!(warned++))
  			set_affinity = 0;
  
diff --combined arch/x86/kernel/irqinit_32.c
index 61aa2a1004b,203384ed2b5..84723295f88
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@@ -110,18 -110,6 +110,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq
  	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
  };
  
 +int vector_used_by_percpu_irq(unsigned int vector)
 +{
 +	int cpu;
 +
 +	for_each_online_cpu(cpu) {
 +		if (per_cpu(vector_irq, cpu)[vector] != -1)
 +			return 1;
 +	}
 +
 +	return 0;
 +}
 +
  /* Overridden in paravirt.c */
  void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
  
@@@ -140,7 -128,7 +140,7 @@@ void __init native_init_IRQ(void
  	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
  		/* SYSCALL_VECTOR was reserved in trap_init. */
  		if (i != SYSCALL_VECTOR)
- 			set_intr_gate(i, interrupt[i]);
+ 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
  	}
  
  
@@@ -158,12 -146,10 +158,12 @@@
  	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
  
  	/* IPI for single call function */
 -	set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
 +	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
 +				 call_function_single_interrupt);
  
  	/* Low priority IPI to cleanup after moving an irq */
  	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 +	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
  #endif
  
  #ifdef CONFIG_X86_LOCAL_APIC
diff --combined arch/x86/kernel/irqinit_64.c
index 1020919efe1,6190e6ef546..31ebfe38e96
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@@ -23,41 -23,6 +23,6 @@@
  #include <asm/apic.h>
  #include <asm/i8259.h>
  
- /*
-  * Common place to define all x86 IRQ vectors
-  *
-  * This builds up the IRQ handler stubs using some ugly macros in irq.h
-  *
-  * These macros create the low-level assembly IRQ routines that save
-  * register context and call do_IRQ(). do_IRQ() then does all the
-  * operations that are needed to keep the AT (or SMP IOAPIC)
-  * interrupt-controller happy.
-  */
- 
- #define IRQ_NAME2(nr) nr##_interrupt(void)
- #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
- 
- /*
-  *	SMP has a few special interrupts for IPI messages
-  */
- 
- #define BUILD_IRQ(nr)				\
- 	asmlinkage void IRQ_NAME(nr);		\
- 	asm("\n.text\n.p2align\n"		\
- 	    "IRQ" #nr "_interrupt:\n\t"		\
- 	    "push $~(" #nr ") ; "		\
- 	    "jmp common_interrupt\n"		\
- 	    ".previous");
- 
- #define BI(x,y) \
- 	BUILD_IRQ(x##y)
- 
- #define BUILD_16_IRQS(x) \
- 	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
- 	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
- 	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
- 	BI(x,c) BI(x,d) BI(x,e) BI(x,f)
- 
  /*
   * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
   * (these are usually mapped to vectors 0x30-0x3f)
@@@ -73,37 -38,6 +38,6 @@@
   *
   * (these are usually mapped into the 0x30-0xff vector range)
   */
- 				      BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
- BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
- BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
- BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
- 
- #undef BUILD_16_IRQS
- #undef BI
- 
- 
- #define IRQ(x,y) \
- 	IRQ##x##y##_interrupt
- 
- #define IRQLIST_16(x) \
- 	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
- 	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
- 	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
- 	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
- 
- /* for the irq vectors */
- static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
- 					  IRQLIST_16(0x2), IRQLIST_16(0x3),
- 	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
- 	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
- 	IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
- };
- 
- #undef IRQ
- #undef IRQLIST_16
- 
- 
- 
  
  /*
   * IRQ2 is cascade interrupt to second interrupt controller
@@@ -135,18 -69,6 +69,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq
  	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
  };
  
 +int vector_used_by_percpu_irq(unsigned int vector)
 +{
 +	int cpu;
 +
 +	for_each_online_cpu(cpu) {
 +		if (per_cpu(vector_irq, cpu)[vector] != -1)
 +			return 1;
 +	}
 +
 +	return 0;
 +}
 +
  void __init init_ISA_irqs(void)
  {
  	int i;
@@@ -199,7 -121,6 +133,7 @@@ static void __init smp_intr_init(void
  
  	/* Low priority IPI to cleanup after moving an irq */
  	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 +	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
  #endif
  }
  
diff --combined arch/x86/kernel/setup_percpu.c
index 0b63b08e753,8e8b1193add..49f3f709ee1
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@@ -152,11 -152,6 +152,11 @@@ void __init setup_per_cpu_areas(void
  	old_size = PERCPU_ENOUGH_ROOM;
  	align = max_t(unsigned long, PAGE_SIZE, align);
  	size = roundup(old_size, align);
 +
 +	printk(KERN_INFO
 +		"NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 +		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 +
  	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
  			  size);
  
@@@ -173,24 -168,24 +173,24 @@@
  			       "cpu %d has no node %d or node-local memory\n",
  				cpu, node);
  			if (ptr)
 -				printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
 +				printk(KERN_DEBUG
 +					"per cpu data for cpu%d at %016lx\n",
  					 cpu, __pa(ptr));
  		}
  		else {
  			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
  							__pa(MAX_DMA_ADDRESS));
  			if (ptr)
 -				printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
 -					 cpu, node, __pa(ptr));
 +				printk(KERN_DEBUG
 +					"per cpu data for cpu%d on node%d "
 +					"at %016lx\n",
 +					cpu, node, __pa(ptr));
  		}
  #endif
  		per_cpu_offset(cpu) = ptr - __per_cpu_start;
  		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
  	}
  
 -	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
 -		NR_CPUS, nr_cpu_ids, nr_node_ids);
 -
  	/* Setup percpu data maps */
  	setup_per_cpu_maps();
  
@@@ -339,25 -334,25 +339,25 @@@ static const cpumask_t cpu_mask_none
  /*
   * Returns a pointer to the bitmask of CPUs on Node 'node'.
   */
- const cpumask_t *_node_to_cpumask_ptr(int node)
+ const cpumask_t *cpumask_of_node(int node)
  {
  	if (node_to_cpumask_map == NULL) {
  		printk(KERN_WARNING
- 			"_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
+ 			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
  			node);
  		dump_stack();
  		return (const cpumask_t *)&cpu_online_map;
  	}
  	if (node >= nr_node_ids) {
  		printk(KERN_WARNING
- 			"_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
+ 			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
  			node, nr_node_ids);
  		dump_stack();
  		return &cpu_mask_none;
  	}
  	return &node_to_cpumask_map[node];
  }
- EXPORT_SYMBOL(_node_to_cpumask_ptr);
+ EXPORT_SYMBOL(cpumask_of_node);
  
  /*
   * Returns a bitmask of CPUs on Node 'node'.
diff --combined arch/x86/kernel/smp.c
index 49ed667b06f,7e558db362c..beea2649a24
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@@ -118,22 -118,22 +118,22 @@@ static void native_smp_send_reschedule(
  		WARN_ON(1);
  		return;
  	}
 -	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
 +	send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
  }
  
  void native_send_call_func_single_ipi(int cpu)
  {
 -	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
 +	send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
  }
  
 -void native_send_call_func_ipi(cpumask_t mask)
 +void native_send_call_func_ipi(const struct cpumask *mask)
  {
  	cpumask_t allbutself;
  
  	allbutself = cpu_online_map;
  	cpu_clear(smp_processor_id(), allbutself);
  
 -	if (cpus_equal(mask, allbutself) &&
 +	if (cpus_equal(*mask, allbutself) &&
  	    cpus_equal(cpu_online_map, cpu_callout_map))
  		send_IPI_allbutself(CALL_FUNCTION_VECTOR);
  	else
@@@ -165,11 -165,7 +165,7 @@@ static void native_smp_send_stop(void
  void smp_reschedule_interrupt(struct pt_regs *regs)
  {
  	ack_APIC_irq();
- #ifdef CONFIG_X86_32
- 	__get_cpu_var(irq_stat).irq_resched_count++;
- #else
- 	add_pda(irq_resched_count, 1);
- #endif
+ 	inc_irq_stat(irq_resched_count);
  }
  
  void smp_call_function_interrupt(struct pt_regs *regs)
@@@ -177,11 -173,7 +173,7 @@@
  	ack_APIC_irq();
  	irq_enter();
  	generic_smp_call_function_interrupt();
- #ifdef CONFIG_X86_32
- 	__get_cpu_var(irq_stat).irq_call_count++;
- #else
- 	add_pda(irq_call_count, 1);
- #endif
+ 	inc_irq_stat(irq_call_count);
  	irq_exit();
  }
  
@@@ -190,11 -182,7 +182,7 @@@ void smp_call_function_single_interrupt
  	ack_APIC_irq();
  	irq_enter();
  	generic_smp_call_function_single_interrupt();
- #ifdef CONFIG_X86_32
- 	__get_cpu_var(irq_stat).irq_call_count++;
- #else
- 	add_pda(irq_call_count, 1);
- #endif
+ 	inc_irq_stat(irq_call_count);
  	irq_exit();
  }
  
diff --combined arch/x86/kernel/smpboot.c
index 1a9941b1115,c5392058cd0..9e177a4077e
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@@ -282,7 -282,7 +282,7 @@@ static int __cpuinitdata unsafe_smp
  /*
   * Activate a secondary processor.
   */
- static void __cpuinit start_secondary(void *unused)
+ notrace static void __cpuinit start_secondary(void *unused)
  {
  	/*
  	 * Don't put *anything* before cpu_init(), SMP booting is too
@@@ -496,7 -496,7 +496,7 @@@ void __cpuinit set_cpu_sibling_map(int 
  }
  
  /* maps the cpu to the sched domain representing multi-core */
- cpumask_t cpu_coregroup_map(int cpu)
+ const struct cpumask *cpu_coregroup_mask(int cpu)
  {
  	struct cpuinfo_x86 *c = &cpu_data(cpu);
  	/*
@@@ -504,9 -504,14 +504,14 @@@
  	 * And for power savings, we return cpu_core_map
  	 */
  	if (sched_mc_power_savings || sched_smt_power_savings)
- 		return per_cpu(cpu_core_map, cpu);
+ 		return &per_cpu(cpu_core_map, cpu);
  	else
- 		return c->llc_shared_map;
+ 		return &c->llc_shared_map;
+ }
+ 
+ cpumask_t cpu_coregroup_map(int cpu)
+ {
+ 	return *cpu_coregroup_mask(cpu);
  }
  
  static void impress_friends(void)
@@@ -1075,8 -1080,10 +1080,10 @@@ static int __init smp_sanity_check(unsi
  #endif
  
  	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- 		printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
- 				    "by the BIOS.\n", hard_smp_processor_id());
+ 		printk(KERN_WARNING
+ 			"weird, boot CPU (#%d) not listed by the BIOS.\n",
+ 			hard_smp_processor_id());
+ 
  		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
  	}
  
@@@ -1252,15 -1259,6 +1259,15 @@@ void __init native_smp_cpus_done(unsign
  	check_nmi_watchdog();
  }
  
 +static int __initdata setup_possible_cpus = -1;
 +static int __init _setup_possible_cpus(char *str)
 +{
 +	get_option(&str, &setup_possible_cpus);
 +	return 0;
 +}
 +early_param("possible_cpus", _setup_possible_cpus);
 +
 +
  /*
   * cpu_possible_map should be static, it cannot change as cpu's
   * are onlined, or offlined. The reason is per-cpu data-structures
@@@ -1273,7 -1271,7 +1280,7 @@@
   *
   * Three ways to find out the number of additional hotplug CPUs:
   * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
 - * - The user can overwrite it with additional_cpus=NUM
 + * - The user can override it with possible_cpus=NUM
   * - Otherwise don't reserve additional CPUs.
   * We do this because additional CPUs waste a lot of memory.
   * -AK
@@@ -1286,17 -1284,9 +1293,17 @@@ __init void prefill_possible_map(void
  	if (!num_processors)
  		num_processors = 1;
  
 -	possible = num_processors + disabled_cpus;
 -	if (possible > NR_CPUS)
 -		possible = NR_CPUS;
 +	if (setup_possible_cpus == -1)
 +		possible = num_processors + disabled_cpus;
 +	else
 +		possible = setup_possible_cpus;
 +
 +	if (possible > CONFIG_NR_CPUS) {
 +		printk(KERN_WARNING
 +			"%d Processors exceeds NR_CPUS limit of %d\n",
 +			possible, CONFIG_NR_CPUS);
 +		possible = CONFIG_NR_CPUS;
 +	}
  
  	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
  		possible, max_t(int, possible - num_processors, 0));
@@@ -1361,7 -1351,7 +1368,7 @@@ void cpu_disable_common(void
  	lock_vector_lock();
  	remove_cpu_from_maps(cpu);
  	unlock_vector_lock();
 -	fixup_irqs(cpu_online_map);
 +	fixup_irqs();
  }
  
  int native_cpu_disable(void)
diff --combined arch/x86/kernel/tlb_32.c
index 174ea90d1cb,8da059f949b..ce505464224
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@@ -34,9 -34,8 +34,8 @@@ static DEFINE_SPINLOCK(tlbstate_lock)
   */
  void leave_mm(int cpu)
  {
- 	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- 		BUG();
- 	cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+ 	BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
+ 	cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
  	load_cr3(swapper_pg_dir);
  }
  EXPORT_SYMBOL_GPL(leave_mm);
@@@ -104,8 -103,8 +103,8 @@@ void smp_invalidate_interrupt(struct pt
  		 * BUG();
  		 */
  
- 	if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
- 		if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+ 	if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
+ 		if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
  			if (flush_va == TLB_FLUSH_ALL)
  				local_flush_tlb();
  			else
@@@ -119,7 -118,7 +118,7 @@@
  	smp_mb__after_clear_bit();
  out:
  	put_cpu_no_resched();
- 	__get_cpu_var(irq_stat).irq_tlb_count++;
+ 	inc_irq_stat(irq_tlb_count);
  }
  
  void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@@ -164,7 -163,7 +163,7 @@@
  	 * We have to send the IPI only to
  	 * CPUs affected.
  	 */
 -	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
 +	send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
  
  	while (!cpus_empty(flush_cpumask))
  		/* nothing. lockup detection does not belong here */
@@@ -238,7 -237,7 +237,7 @@@ static void do_flush_tlb_all(void *info
  	unsigned long cpu = smp_processor_id();
  
  	__flush_tlb_all();
- 	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
+ 	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
  		leave_mm(cpu);
  }
  
diff --combined arch/x86/kernel/tlb_64.c
index de6f1bda0c5,29887d7081a..f8be6f1d2e4
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@@ -154,7 -154,7 +154,7 @@@ asmlinkage void smp_invalidate_interrup
  out:
  	ack_APIC_irq();
  	cpu_clear(cpu, f->flush_cpumask);
- 	add_pda(irq_tlb_count, 1);
+ 	inc_irq_stat(irq_tlb_count);
  }
  
  void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@@ -191,7 -191,7 +191,7 @@@
  	 * We have to send the IPI only to
  	 * CPUs affected.
  	 */
 -	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
 +	send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
  
  	while (!cpus_empty(f->flush_cpumask))
  		cpu_relax();
diff --combined arch/x86/kernel/traps.c
index 4a6dff39a47,141907ab6e2..2d1f4c7e405
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@@ -72,6 -72,9 +72,6 @@@
  
  #include "cpu/mcheck/mce.h"
  
 -DECLARE_BITMAP(used_vectors, NR_VECTORS);
 -EXPORT_SYMBOL_GPL(used_vectors);
 -
  asmlinkage int system_call(void);
  
  /* Do we ignore FPU interrupts ? */
@@@ -86,9 -89,6 +86,9 @@@ gate_desc idt_table[256
  	__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
  #endif
  
 +DECLARE_BITMAP(used_vectors, NR_VECTORS);
 +EXPORT_SYMBOL_GPL(used_vectors);
 +
  static int ignore_nmis;
  
  static inline void conditional_sti(struct pt_regs *regs)
@@@ -481,11 -481,7 +481,7 @@@ do_nmi(struct pt_regs *regs, long error
  {
  	nmi_enter();
  
- #ifdef CONFIG_X86_32
- 	{ int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
- #else
- 	add_pda(__nmi_count, 1);
- #endif
+ 	inc_irq_stat(__nmi_count);
  
  	if (!ignore_nmis)
  		default_do_nmi(regs);
@@@ -664,7 -660,7 +660,7 @@@ void math_error(void __user *ip
  {
  	struct task_struct *task;
  	siginfo_t info;
- 	unsigned short cwd, swd;
+ 	unsigned short cwd, swd, err;
  
  	/*
  	 * Save the info for the exception handler and clear the error.
@@@ -675,7 -671,6 +671,6 @@@
  	task->thread.error_code = 0;
  	info.si_signo = SIGFPE;
  	info.si_errno = 0;
- 	info.si_code = __SI_FAULT;
  	info.si_addr = ip;
  	/*
  	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@@ -689,34 -684,31 +684,31 @@@
  	 */
  	cwd = get_fpu_cwd(task);
  	swd = get_fpu_swd(task);
- 	switch (swd & ~cwd & 0x3f) {
- 	case 0x000: /* No unmasked exception */
+ 
+ 	err = swd & ~cwd & 0x3f;
+ 
  #ifdef CONFIG_X86_32
+ 	if (!err)
  		return;
  #endif
- 	default: /* Multiple exceptions */
- 		break;
- 	case 0x001: /* Invalid Op */
+ 
+ 	if (err & 0x001) {	/* Invalid op */
  		/*
  		 * swd & 0x240 == 0x040: Stack Underflow
  		 * swd & 0x240 == 0x240: Stack Overflow
  		 * User must clear the SF bit (0x40) if set
  		 */
  		info.si_code = FPE_FLTINV;
- 		break;
- 	case 0x002: /* Denormalize */
- 	case 0x010: /* Underflow */
- 		info.si_code = FPE_FLTUND;
- 		break;
- 	case 0x004: /* Zero Divide */
+ 	} else if (err & 0x004) { /* Divide by Zero */
  		info.si_code = FPE_FLTDIV;
- 		break;
- 	case 0x008: /* Overflow */
+ 	} else if (err & 0x008) { /* Overflow */
  		info.si_code = FPE_FLTOVF;
- 		break;
- 	case 0x020: /* Precision */
+ 	} else if (err & 0x012) { /* Denormal, Underflow */
+ 		info.si_code = FPE_FLTUND;
+ 	} else if (err & 0x020) { /* Precision */
  		info.si_code = FPE_FLTRES;
- 		break;
+ 	} else {
+ 		info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
  	}
  	force_sig_info(SIGFPE, &info, task);
  }
@@@ -949,7 -941,9 +941,7 @@@ dotraplinkage void do_iret_error(struc
  
  void __init trap_init(void)
  {
 -#ifdef CONFIG_X86_32
  	int i;
 -#endif
  
  #ifdef CONFIG_EISA
  	void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@@ -1006,15 -1000,11 +998,15 @@@
  	}
  
  	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
 +#endif
  
  	/* Reserve all the builtin and the syscall vector: */
  	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
  		set_bit(i, used_vectors);
  
 +#ifdef CONFIG_X86_64
 +	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 +#else
  	set_bit(SYSCALL_VECTOR, used_vectors);
  #endif
  	/*
diff --combined arch/x86/xen/mmu.c
index e59e53b11e2,773d68d3e91..503c240e26c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@@ -154,13 -154,13 +154,13 @@@ void xen_setup_mfn_list_list(void
  {
  	unsigned pfn, idx;
  
- 	for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
+ 	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
  		unsigned topidx = p2m_top_index(pfn);
  
  		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
  	}
  
- 	for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
+ 	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
  		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
  		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
  	}
@@@ -179,7 -179,7 +179,7 @@@ void __init xen_build_dynamic_phys_to_m
  	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
  	unsigned pfn;
  
- 	for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+ 	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
  		unsigned topidx = p2m_top_index(pfn);
  
  		p2m_top[topidx] = &mfn_list[pfn];
@@@ -207,7 -207,7 +207,7 @@@ static void alloc_p2m(unsigned long **p
  	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
  	BUG_ON(p == NULL);
  
- 	for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
+ 	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
  		p[i] = INVALID_P2M_ENTRY;
  
  	if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
@@@ -407,7 -407,8 +407,8 @@@ out
  		preempt_enable();
  }
  
- pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+ 				 unsigned long addr, pte_t *ptep)
  {
  	/* Just return the pte as-is.  We preserve the bits on commit */
  	return *ptep;
@@@ -878,7 -879,8 +879,8 @@@ static void __xen_pgd_pin(struct mm_str
  
  		if (user_pgd) {
  			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
- 			xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+ 			xen_do_pin(MMUEXT_PIN_L4_TABLE,
+ 				   PFN_DOWN(__pa(user_pgd)));
  		}
  	}
  #else /* CONFIG_X86_32 */
@@@ -993,7 -995,8 +995,8 @@@ static void __xen_pgd_unpin(struct mm_s
  		pgd_t *user_pgd = xen_get_user_pgd(pgd);
  
  		if (user_pgd) {
- 			xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+ 			xen_do_pin(MMUEXT_UNPIN_TABLE,
+ 				   PFN_DOWN(__pa(user_pgd)));
  			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
  		}
  	}
@@@ -1079,7 -1082,7 +1082,7 @@@ static void drop_other_mm_ref(void *inf
  
  static void xen_drop_mm_ref(struct mm_struct *mm)
  {
 -	cpumask_t mask;
 +	cpumask_var_t mask;
  	unsigned cpu;
  
  	if (current->active_mm == mm) {
@@@ -1091,16 -1094,7 +1094,16 @@@
  	}
  
  	/* Get the "official" set of cpus referring to our pagetable. */
 -	mask = mm->cpu_vm_mask;
 +	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
 +		for_each_online_cpu(cpu) {
 +			if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
 +			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
 +				continue;
 +			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
 +		}
 +		return;
 +	}
 +	cpumask_copy(mask, &mm->cpu_vm_mask);
  
  	/* It's possible that a vcpu may have a stale reference to our
  	   cr3, because its in lazy mode, and it hasn't yet flushed
@@@ -1109,12 -1103,11 +1112,12 @@@
  	   if needed. */
  	for_each_online_cpu(cpu) {
  		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 -			cpu_set(cpu, mask);
 +			cpumask_set_cpu(cpu, mask);
  	}
  
 -	if (!cpus_empty(mask))
 -		smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 +	if (!cpumask_empty(mask))
 +		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
 +	free_cpumask_var(mask);
  }
  #else
  static void xen_drop_mm_ref(struct mm_struct *mm)
diff --combined include/linux/sched.h
index e5f928a079e,8395e715809..158d53d0776
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -250,7 -250,7 +250,7 @@@ extern void init_idle_bootup_task(struc
  extern int runqueue_is_locked(void);
  extern void task_rq_unlock_wait(struct task_struct *p);
  
 -extern cpumask_t nohz_cpu_mask;
 +extern cpumask_var_t nohz_cpu_mask;
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
  extern int select_nohz_load_balancer(int cpu);
  #else
@@@ -571,12 -571,6 +571,6 @@@ struct signal_struct 
  	 */
  	struct rlimit rlim[RLIM_NLIMITS];
  
- 	/* keep the process-shared keyrings here so that they do the right
- 	 * thing in threads created with CLONE_THREAD */
- #ifdef CONFIG_KEYS
- 	struct key *session_keyring;	/* keyring inherited over fork */
- 	struct key *process_keyring;	/* keyring private to this process */
- #endif
  #ifdef CONFIG_BSD_PROCESS_ACCT
  	struct pacct_struct pacct;	/* per-process accounting information */
  #endif
@@@ -647,6 -641,7 +641,7 @@@ struct user_struct 
  	/* Hash table maintenance information */
  	struct hlist_node uidhash_node;
  	uid_t uid;
+ 	struct user_namespace *user_ns;
  
  #ifdef CONFIG_USER_SCHED
  	struct task_group *tg;
@@@ -664,6 -659,7 +659,7 @@@ extern struct user_struct *find_user(ui
  extern struct user_struct root_user;
  #define INIT_USER (&root_user)
  
+ 
  struct backing_dev_info;
  struct reclaim_state;
  
@@@ -671,8 -667,7 +667,7 @@@
  struct sched_info {
  	/* cumulative counters */
  	unsigned long pcount;	      /* # of times run on this cpu */
- 	unsigned long long cpu_time,  /* time spent on the cpu */
- 			   run_delay; /* time spent waiting on a runqueue */
+ 	unsigned long long run_delay; /* time spent waiting on a runqueue */
  
  	/* timestamps */
  	unsigned long long last_arrival,/* when we last ran on a cpu */
@@@ -763,51 -758,20 +758,51 @@@ enum cpu_idle_type 
  #define SD_SERIALIZE		1024	/* Only a single load balancing instance */
  #define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */
  
 -#define BALANCE_FOR_MC_POWER	\
 -	(sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
 +enum powersavings_balance_level {
 +	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
 +	POWERSAVINGS_BALANCE_BASIC,	/* Fill one thread/core/package
 +					 * first for long running threads
 +					 */
 +	POWERSAVINGS_BALANCE_WAKEUP,	/* Also bias task wakeups to semi-idle
 +					 * cpu package for power savings
 +					 */
 +	MAX_POWERSAVINGS_BALANCE_LEVELS
 +};
  
 -#define BALANCE_FOR_PKG_POWER	\
 -	((sched_mc_power_savings || sched_smt_power_savings) ?	\
 -	 SD_POWERSAVINGS_BALANCE : 0)
 +extern int sched_mc_power_savings, sched_smt_power_savings;
  
 -#define test_sd_parent(sd, flag)	((sd->parent &&		\
 -					 (sd->parent->flags & flag)) ? 1 : 0)
 +static inline int sd_balance_for_mc_power(void)
 +{
 +	if (sched_smt_power_savings)
 +		return SD_POWERSAVINGS_BALANCE;
  
 +	return 0;
 +}
 +
 +static inline int sd_balance_for_package_power(void)
 +{
 +	if (sched_mc_power_savings | sched_smt_power_savings)
 +		return SD_POWERSAVINGS_BALANCE;
 +
 +	return 0;
 +}
 +
 +/*
 + * Optimise SD flags for power savings:
 + * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
 + * Keep default SD flags if sched_{smt,mc}_power_savings=0
 + */
 +
 +static inline int sd_power_saving_flags(void)
 +{
 +	if (sched_mc_power_savings | sched_smt_power_savings)
 +		return SD_BALANCE_NEWIDLE;
 +
 +	return 0;
 +}
  
  struct sched_group {
  	struct sched_group *next;	/* Must be a circular list */
 -	cpumask_t cpumask;
  
  	/*
  	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@@ -820,15 -784,8 +815,15 @@@
  	 * (see include/linux/reciprocal_div.h)
  	 */
  	u32 reciprocal_cpu_power;
 +
 +	unsigned long cpumask[];
  };
  
 +static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
 +{
 +	return to_cpumask(sg->cpumask);
 +}
 +
  enum sched_domain_level {
  	SD_LV_NONE = 0,
  	SD_LV_SIBLING,
@@@ -852,6 -809,7 +847,6 @@@ struct sched_domain 
  	struct sched_domain *parent;	/* top domain must be null terminated */
  	struct sched_domain *child;	/* bottom domain must be null terminated */
  	struct sched_group *groups;	/* the balancing groups of the domain */
 -	cpumask_t span;			/* span of all CPUs in this domain */
  	unsigned long min_interval;	/* Minimum balance interval ms */
  	unsigned long max_interval;	/* Maximum balance interval ms */
  	unsigned int busy_factor;	/* less balancing by factor if busy */
@@@ -906,73 -864,25 +901,42 @@@
  #ifdef CONFIG_SCHED_DEBUG
  	char *name;
  #endif
 +
 +	/* span of all CPUs in this domain */
 +	unsigned long span[];
  };
  
 -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 +static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 +{
 +	return to_cpumask(sd->span);
 +}
 +
 +extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
  				    struct sched_domain_attr *dattr_new);
  extern int arch_reinit_sched_domains(void);
  
 +/* Test a flag in parent sched domain */
 +static inline int test_sd_parent(struct sched_domain *sd, int flag)
 +{
 +	if (sd->parent && (sd->parent->flags & flag))
 +		return 1;
 +
 +	return 0;
 +}
 +
  #else /* CONFIG_SMP */
  
  struct sched_domain_attr;
  
  static inline void
 -partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 +partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
  			struct sched_domain_attr *dattr_new)
  {
  }
  #endif	/* !CONFIG_SMP */
  
  struct io_context;			/* See blkdev.h */
- #define NGROUPS_SMALL		32
- #define NGROUPS_PER_BLOCK	((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
- struct group_info {
- 	int ngroups;
- 	atomic_t usage;
- 	gid_t small_block[NGROUPS_SMALL];
- 	int nblocks;
- 	gid_t *blocks[0];
- };
  
- /*
-  * get_group_info() must be called with the owning task locked (via task_lock())
-  * when task != current.  The reason being that the vast majority of callers are
-  * looking at current->group_info, which can not be changed except by the
-  * current task.  Changing current->group_info requires the task lock, too.
-  */
- #define get_group_info(group_info) do { \
- 	atomic_inc(&(group_info)->usage); \
- } while (0)
- 
- #define put_group_info(group_info) do { \
- 	if (atomic_dec_and_test(&(group_info)->usage)) \
- 		groups_free(group_info); \
- } while (0)
- 
- extern struct group_info *groups_alloc(int gidsetsize);
- extern void groups_free(struct group_info *group_info);
- extern int set_current_groups(struct group_info *group_info);
- extern int groups_search(struct group_info *group_info, gid_t grp);
- /* access the groups "array" with this macro */
- #define GROUP_AT(gi, i) \
-     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
  
  #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
  extern void prefetch_stack(struct task_struct *t);
@@@ -1016,7 -926,7 +980,7 @@@ struct sched_class 
  	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
  	void (*set_cpus_allowed)(struct task_struct *p,
 -				 const cpumask_t *newmask);
 +				 const struct cpumask *newmask);
  
  	void (*rq_online)(struct rq *rq);
  	void (*rq_offline)(struct rq *rq);
@@@ -1228,6 -1138,7 +1192,7 @@@ struct task_struct 
  	 * The buffer to hold the BTS data.
  	 */
  	void *bts_buffer;
+ 	size_t bts_size;
  #endif /* CONFIG_X86_PTRACE_BTS */
  
  	/* PID/PID hash table linkage. */
@@@ -1251,17 -1162,12 +1216,12 @@@
  	struct list_head cpu_timers[3];
  
  /* process credentials */
- 	uid_t uid,euid,suid,fsuid;
- 	gid_t gid,egid,sgid,fsgid;
- 	struct group_info *group_info;
- 	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
- 	struct user_struct *user;
- 	unsigned securebits;
- #ifdef CONFIG_KEYS
- 	unsigned char jit_keyring;	/* default keyring to attach requested keys to */
- 	struct key *request_key_auth;	/* assumed request_key authority */
- 	struct key *thread_keyring;	/* keyring private to this thread */
- #endif
+ 	const struct cred *real_cred;	/* objective and real subjective task
+ 					 * credentials (COW) */
+ 	const struct cred *cred;	/* effective (overridable) subjective task
+ 					 * credentials (COW) */
+ 	struct mutex cred_exec_mutex;	/* execve vs ptrace cred calculation mutex */
+ 
  	char comm[TASK_COMM_LEN]; /* executable name excluding path
  				     - access with [gs]et_task_comm (which lock
  				       it with task_lock())
@@@ -1298,9 -1204,6 +1258,6 @@@
  	int (*notifier)(void *priv);
  	void *notifier_data;
  	sigset_t *notifier_mask;
- #ifdef CONFIG_SECURITY
- 	void *security;
- #endif
  	struct audit_context *audit_context;
  #ifdef CONFIG_AUDITSYSCALL
  	uid_t loginuid;
@@@ -1676,12 -1579,12 +1633,12 @@@ extern cputime_t task_gtime(struct task
  
  #ifdef CONFIG_SMP
  extern int set_cpus_allowed_ptr(struct task_struct *p,
 -				const cpumask_t *new_mask);
 +				const struct cpumask *new_mask);
  #else
  static inline int set_cpus_allowed_ptr(struct task_struct *p,
 -				       const cpumask_t *new_mask)
 +				       const struct cpumask *new_mask)
  {
 -	if (!cpu_isset(0, *new_mask))
 +	if (!cpumask_test_cpu(0, new_mask))
  		return -EINVAL;
  	return 0;
  }
@@@ -1857,7 -1760,6 +1814,6 @@@ static inline struct user_struct *get_u
  	return u;
  }
  extern void free_uid(struct user_struct *);
- extern void switch_uid(struct user_struct *);
  extern void release_uids(struct user_namespace *ns);
  
  #include <asm/current.h>
@@@ -1876,9 -1778,6 +1832,6 @@@ extern void wake_up_new_task(struct tas
  extern void sched_fork(struct task_struct *p, int clone_flags);
  extern void sched_dead(struct task_struct *p);
  
- extern int in_group_p(gid_t);
- extern int in_egroup_p(gid_t);
- 
  extern void proc_caches_init(void);
  extern void flush_signals(struct task_struct *);
  extern void ignore_signals(struct task_struct *);
@@@ -2010,6 -1909,8 +1963,8 @@@ static inline unsigned long wait_task_i
  #define for_each_process(p) \
  	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
  
+ extern bool is_single_threaded(struct task_struct *);
+ 
  /*
   * Careful: do_each_thread/while_each_thread is a double loop so
   *          'break' will not work as expected - use goto instead.
@@@ -2294,8 -2195,10 +2249,8 @@@ __trace_special(void *__tr, void *__dat
  }
  #endif
  
 -extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
 -extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 -
 -extern int sched_mc_power_savings, sched_smt_power_savings;
 +extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 +extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
  
  extern void normalize_rt_tasks(void);
  
diff --combined kernel/rcuclassic.c
index c03ca3e6191,0ff9b05706a..6ec495f60ea
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@@ -63,14 -63,14 +63,14 @@@ static struct rcu_ctrlblk rcu_ctrlblk 
  	.completed = -300,
  	.pending = -300,
  	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
- 	.cpumask = CPU_MASK_NONE,
+ 	.cpumask = CPU_BITS_NONE,
  };
  static struct rcu_ctrlblk rcu_bh_ctrlblk = {
  	.cur = -300,
  	.completed = -300,
  	.pending = -300,
  	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
- 	.cpumask = CPU_MASK_NONE,
+ 	.cpumask = CPU_BITS_NONE,
  };
  
  DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@@ -85,7 -85,6 +85,6 @@@ static void force_quiescent_state(struc
  			struct rcu_ctrlblk *rcp)
  {
  	int cpu;
- 	cpumask_t cpumask;
  	unsigned long flags;
  
  	set_need_resched();
@@@ -96,10 -95,10 +95,10 @@@
  		 * Don't send IPI to itself. With irqs disabled,
  		 * rdp->cpu is the current cpu.
  		 *
- 		 * cpu_online_map is updated by the _cpu_down()
+ 		 * cpu_online_mask is updated by the _cpu_down()
  		 * using __stop_machine(). Since we're in irqs disabled
  		 * section, __stop_machine() is not exectuting, hence
- 		 * the cpu_online_map is stable.
+ 		 * the cpu_online_mask is stable.
  		 *
  		 * However,  a cpu might have been offlined _just_ before
  		 * we disabled irqs while entering here.
@@@ -107,13 -106,14 +106,14 @@@
  		 * notification, leading to the offlined cpu's bit
  		 * being set in the rcp->cpumask.
  		 *
- 		 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
+ 		 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
  		 * sending smp_reschedule() to an offlined CPU.
  		 */
- 		cpus_and(cpumask, rcp->cpumask, cpu_online_map);
- 		cpu_clear(rdp->cpu, cpumask);
- 		for_each_cpu_mask_nr(cpu, cpumask)
- 			smp_send_reschedule(cpu);
+ 		for_each_cpu_and(cpu,
+ 				  to_cpumask(rcp->cpumask), cpu_online_mask) {
+ 			if (cpu != rdp->cpu)
+ 				smp_send_reschedule(cpu);
+ 		}
  	}
  	spin_unlock_irqrestore(&rcp->lock, flags);
  }
@@@ -193,7 -193,7 +193,7 @@@ static void print_other_cpu_stall(struc
  
  	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
  	for_each_possible_cpu(cpu) {
- 		if (cpu_isset(cpu, rcp->cpumask))
+ 		if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
  			printk(" %d", cpu);
  	}
  	printk(" (detected by %d, t=%ld jiffies)\n",
@@@ -221,7 -221,8 +221,8 @@@ static void check_cpu_stall(struct rcu_
  	long delta;
  
  	delta = jiffies - rcp->jiffies_stall;
- 	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+ 	if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
+ 		delta >= 0) {
  
  		/* We haven't checked in, so go dump stack. */
  		print_cpu_stall(rcp);
@@@ -393,7 -394,8 +394,8 @@@ static void rcu_start_batch(struct rcu_
  		 * unnecessarily.
  		 */
  		smp_mb();
- 		cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
+ 		cpumask_andnot(to_cpumask(rcp->cpumask),
 -			       cpu_online_mask, &nohz_cpu_mask);
++			       cpu_online_mask, nohz_cpu_mask);
  
  		rcp->signaled = 0;
  	}
@@@ -406,8 -408,8 +408,8 @@@
   */
  static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
  {
- 	cpu_clear(cpu, rcp->cpumask);
- 	if (cpus_empty(rcp->cpumask)) {
+ 	cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
+ 	if (cpumask_empty(to_cpumask(rcp->cpumask))) {
  		/* batch completed ! */
  		rcp->completed = rcp->cur;
  		rcu_start_batch(rcp);
diff --combined kernel/sched.c
index 756d981d91a,f2095660efe..27ba1d642f0
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@@ -209,7 -209,6 +209,6 @@@ void init_rt_bandwidth(struct rt_bandwi
  	hrtimer_init(&rt_b->rt_period_timer,
  			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  	rt_b->rt_period_timer.function = sched_rt_period_timer;
- 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
  }
  
  static inline int rt_bandwidth_enabled(void)
@@@ -361,7 -360,9 +360,9 @@@ static inline struct task_group *task_g
  	struct task_group *tg;
  
  #ifdef CONFIG_USER_SCHED
- 	tg = p->user->tg;
+ 	rcu_read_lock();
+ 	tg = __task_cred(p)->user->tg;
+ 	rcu_read_unlock();
  #elif defined(CONFIG_CGROUP_SCHED)
  	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
  				struct task_group, css);
@@@ -497,26 -498,18 +498,26 @@@ struct rt_rq 
   */
  struct root_domain {
  	atomic_t refcount;
 -	cpumask_t span;
 -	cpumask_t online;
 +	cpumask_var_t span;
 +	cpumask_var_t online;
  
  	/*
  	 * The "RT overload" flag: it gets set if a CPU has more than
  	 * one runnable RT task.
  	 */
 -	cpumask_t rto_mask;
 +	cpumask_var_t rto_mask;
  	atomic_t rto_count;
  #ifdef CONFIG_SMP
  	struct cpupri cpupri;
  #endif
 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 +	/*
 +	 * Preferred wake up cpu nominated by sched_mc balance that will be
 +	 * used when most cpus are idle in the system indicating overall very
 +	 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
 +	 */
 +	unsigned int sched_mc_preferred_wakeup_cpu;
 +#endif
  };
  
  /*
@@@ -610,6 -603,8 +611,8 @@@ struct rq 
  #ifdef CONFIG_SCHEDSTATS
  	/* latency stats */
  	struct sched_info rq_sched_info;
+ 	unsigned long long rq_cpu_time;
+ 	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
  	/* sys_sched_yield() stats */
  	unsigned int yld_exp_empty;
@@@ -1143,7 -1138,6 +1146,6 @@@ static void init_rq_hrtick(struct rq *r
  
  	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  	rq->hrtick_timer.function = hrtick;
- 	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  }
  #else	/* CONFIG_SCHED_HRTICK */
  static inline void hrtick_clear(struct rq *rq)
@@@ -1520,7 -1514,7 +1522,7 @@@ static int tg_shares_up(struct task_gro
  	struct sched_domain *sd = data;
  	int i;
  
 -	for_each_cpu_mask(i, sd->span) {
 +	for_each_cpu(i, sched_domain_span(sd)) {
  		/*
  		 * If there are currently no tasks on the cpu pretend there
  		 * is one of average load so that when a new task gets to
@@@ -1541,7 -1535,7 +1543,7 @@@
  	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
  		shares = tg->shares;
  
 -	for_each_cpu_mask(i, sd->span)
 +	for_each_cpu(i, sched_domain_span(sd))
  		update_group_shares_cpu(tg, i, shares, rq_weight);
  
  	return 0;
@@@ -1871,6 -1865,8 +1873,8 @@@ void set_task_cpu(struct task_struct *p
  
  	clock_offset = old_rq->clock - new_rq->clock;
  
+ 	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+ 
  #ifdef CONFIG_SCHEDSTATS
  	if (p->se.wait_start)
  		p->se.wait_start -= clock_offset;
@@@ -2105,17 -2101,15 +2109,17 @@@ find_idlest_group(struct sched_domain *
  		int i;
  
  		/* Skip over this group if it has no CPUs allowed */
 -		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
 +		if (!cpumask_intersects(sched_group_cpus(group),
 +					&p->cpus_allowed))
  			continue;
  
 -		local_group = cpu_isset(this_cpu, group->cpumask);
 +		local_group = cpumask_test_cpu(this_cpu,
 +					       sched_group_cpus(group));
  
  		/* Tally up the load of all CPUs in the group */
  		avg_load = 0;
  
 -		for_each_cpu_mask_nr(i, group->cpumask) {
 +		for_each_cpu(i, sched_group_cpus(group)) {
  			/* Bias balancing toward cpus of our domain */
  			if (local_group)
  				load = source_load(i, load_idx);
@@@ -2147,14 -2141,17 +2151,14 @@@
   * find_idlest_cpu - find the idlest cpu among the cpus in group.
   */
  static int
 -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
 -		cpumask_t *tmp)
 +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
  	unsigned long load, min_load = ULONG_MAX;
  	int idlest = -1;
  	int i;
  
  	/* Traverse only the allowed CPUs */
 -	cpus_and(*tmp, group->cpumask, p->cpus_allowed);
 -
 -	for_each_cpu_mask_nr(i, *tmp) {
 +	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
  		load = weighted_cpuload(i);
  
  		if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2196,6 -2193,7 +2200,6 @@@ static int sched_balance_self(int cpu, 
  		update_shares(sd);
  
  	while (sd) {
 -		cpumask_t span, tmpmask;
  		struct sched_group *group;
  		int new_cpu, weight;
  
@@@ -2204,13 -2202,14 +2208,13 @@@
  			continue;
  		}
  
 -		span = sd->span;
  		group = find_idlest_group(sd, t, cpu);
  		if (!group) {
  			sd = sd->child;
  			continue;
  		}
  
 -		new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
 +		new_cpu = find_idlest_cpu(group, t, cpu);
  		if (new_cpu == -1 || new_cpu == cpu) {
  			/* Now try balancing at a lower domain level of cpu */
  			sd = sd->child;
@@@ -2219,10 -2218,10 +2223,10 @@@
  
  		/* Now try balancing at a lower domain level of new_cpu */
  		cpu = new_cpu;
 +		weight = cpumask_weight(sched_domain_span(sd));
  		sd = NULL;
 -		weight = cpus_weight(span);
  		for_each_domain(cpu, tmp) {
 -			if (weight <= cpus_weight(tmp->span))
 +			if (weight <= cpumask_weight(sched_domain_span(tmp)))
  				break;
  			if (tmp->flags & flag)
  				sd = tmp;
@@@ -2267,7 -2266,7 +2271,7 @@@ static int try_to_wake_up(struct task_s
  		cpu = task_cpu(p);
  
  		for_each_domain(this_cpu, sd) {
 -			if (cpu_isset(cpu, sd->span)) {
 +			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
  				update_shares(sd);
  				break;
  			}
@@@ -2277,6 -2276,7 +2281,7 @@@
  
  	smp_wmb();
  	rq = task_rq_lock(p, &flags);
+ 	update_rq_clock(rq);
  	old_state = p->state;
  	if (!(old_state & state))
  		goto out;
@@@ -2315,7 -2315,7 +2320,7 @@@
  	else {
  		struct sched_domain *sd;
  		for_each_domain(this_cpu, sd) {
 -			if (cpu_isset(cpu, sd->span)) {
 +			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
  				schedstat_inc(sd, ttwu_wake_remote);
  				break;
  			}
@@@ -2334,12 -2334,11 +2339,11 @@@ out_activate
  		schedstat_inc(p, se.nr_wakeups_local);
  	else
  		schedstat_inc(p, se.nr_wakeups_remote);
- 	update_rq_clock(rq);
  	activate_task(rq, p, 1);
  	success = 1;
  
  out_running:
- 	trace_sched_wakeup(rq, p);
+ 	trace_sched_wakeup(rq, p, success);
  	check_preempt_curr(rq, p, sync);
  
  	p->state = TASK_RUNNING;
@@@ -2472,7 -2471,7 +2476,7 @@@ void wake_up_new_task(struct task_struc
  		p->sched_class->task_new(rq, p);
  		inc_nr_running(rq);
  	}
- 	trace_sched_wakeup_new(rq, p);
+ 	trace_sched_wakeup_new(rq, p, 1);
  	check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
  	if (p->sched_class->task_wake_up)
@@@ -2847,11 -2846,10 +2851,10 @@@ static void sched_migrate_task(struct t
  	struct rq *rq;
  
  	rq = task_rq_lock(p, &flags);
 -	if (!cpu_isset(dest_cpu, p->cpus_allowed)
 +	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
  	    || unlikely(!cpu_active(dest_cpu)))
  		goto out;
  
- 	trace_sched_migrate_task(rq, p, dest_cpu);
  	/* force the process onto the specified CPU */
  	if (migrate_task(p, dest_cpu, &req)) {
  		/* Need to wait for migration thread (might exit: take ref). */
@@@ -2913,7 -2911,7 +2916,7 @@@ int can_migrate_task(struct task_struc
  	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
  	 * 3) are cache-hot on their current CPU.
  	 */
 -	if (!cpu_isset(this_cpu, p->cpus_allowed)) {
 +	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
  		schedstat_inc(p, se.nr_failed_migrations_affine);
  		return 0;
  	}
@@@ -3088,7 -3086,7 +3091,7 @@@ static int move_one_task(struct rq *thi
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
  		   unsigned long *imbalance, enum cpu_idle_type idle,
 -		   int *sd_idle, const cpumask_t *cpus, int *balance)
 +		   int *sd_idle, const struct cpumask *cpus, int *balance)
  {
  	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
  	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@@ -3124,11 -3122,10 +3127,11 @@@
  		unsigned long sum_avg_load_per_task;
  		unsigned long avg_load_per_task;
  
 -		local_group = cpu_isset(this_cpu, group->cpumask);
 +		local_group = cpumask_test_cpu(this_cpu,
 +					       sched_group_cpus(group));
  
  		if (local_group)
 -			balance_cpu = first_cpu(group->cpumask);
 +			balance_cpu = cpumask_first(sched_group_cpus(group));
  
  		/* Tally up the load of all CPUs in the group */
  		sum_weighted_load = sum_nr_running = avg_load = 0;
@@@ -3137,8 -3134,13 +3140,8 @@@
  		max_cpu_load = 0;
  		min_cpu_load = ~0UL;
  
 -		for_each_cpu_mask_nr(i, group->cpumask) {
 -			struct rq *rq;
 -
 -			if (!cpu_isset(i, *cpus))
 -				continue;
 -
 -			rq = cpu_rq(i);
 +		for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 +			struct rq *rq = cpu_rq(i);
  
  			if (*sd_idle && rq->nr_running)
  				*sd_idle = 0;
@@@ -3249,8 -3251,8 +3252,8 @@@
  		 */
  		if ((sum_nr_running < min_nr_running) ||
  		    (sum_nr_running == min_nr_running &&
 -		     first_cpu(group->cpumask) <
 -		     first_cpu(group_min->cpumask))) {
 +		     cpumask_first(sched_group_cpus(group)) >
 +		     cpumask_first(sched_group_cpus(group_min)))) {
  			group_min = group;
  			min_nr_running = sum_nr_running;
  			min_load_per_task = sum_weighted_load /
@@@ -3265,8 -3267,8 +3268,8 @@@
  		if (sum_nr_running <= group_capacity - 1) {
  			if (sum_nr_running > leader_nr_running ||
  			    (sum_nr_running == leader_nr_running &&
 -			     first_cpu(group->cpumask) >
 -			      first_cpu(group_leader->cpumask))) {
 +			     cpumask_first(sched_group_cpus(group)) <
 +			     cpumask_first(sched_group_cpus(group_leader)))) {
  				group_leader = group;
  				leader_nr_running = sum_nr_running;
  			}
@@@ -3392,10 -3394,6 +3395,10 @@@ out_balanced
  
  	if (this == group_leader && group_leader != group_min) {
  		*imbalance = min_load_per_task;
 +		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 +			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 +				cpumask_first(sched_group_cpus(group_leader));
 +		}
  		return group_min;
  	}
  #endif
@@@ -3409,16 -3407,16 +3412,16 @@@ ret
   */
  static struct rq *
  find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 -		   unsigned long imbalance, const cpumask_t *cpus)
 +		   unsigned long imbalance, const struct cpumask *cpus)
  {
  	struct rq *busiest = NULL, *rq;
  	unsigned long max_load = 0;
  	int i;
  
 -	for_each_cpu_mask_nr(i, group->cpumask) {
 +	for_each_cpu(i, sched_group_cpus(group)) {
  		unsigned long wl;
  
 -		if (!cpu_isset(i, *cpus))
 +		if (!cpumask_test_cpu(i, cpus))
  			continue;
  
  		rq = cpu_rq(i);
@@@ -3448,7 -3446,7 +3451,7 @@@
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
  			struct sched_domain *sd, enum cpu_idle_type idle,
 -			int *balance, cpumask_t *cpus)
 +			int *balance, struct cpumask *cpus)
  {
  	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
  	struct sched_group *group;
@@@ -3456,7 -3454,7 +3459,7 @@@
  	struct rq *busiest;
  	unsigned long flags;
  
 -	cpus_setall(*cpus);
 +	cpumask_setall(cpus);
  
  	/*
  	 * When power savings policy is enabled for the parent domain, idle
@@@ -3516,8 -3514,8 +3519,8 @@@ redo
  
  		/* All tasks on this runqueue were pinned by CPU affinity */
  		if (unlikely(all_pinned)) {
 -			cpu_clear(cpu_of(busiest), *cpus);
 -			if (!cpus_empty(*cpus))
 +			cpumask_clear_cpu(cpu_of(busiest), cpus);
 +			if (!cpumask_empty(cpus))
  				goto redo;
  			goto out_balanced;
  		}
@@@ -3534,8 -3532,7 +3537,8 @@@
  			/* don't kick the migration_thread, if the curr
  			 * task on busiest cpu can't be moved to this_cpu
  			 */
 -			if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
 +			if (!cpumask_test_cpu(this_cpu,
 +					      &busiest->curr->cpus_allowed)) {
  				spin_unlock_irqrestore(&busiest->lock, flags);
  				all_pinned = 1;
  				goto out_one_pinned;
@@@ -3610,7 -3607,7 +3613,7 @@@ out
   */
  static int
  load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
 -			cpumask_t *cpus)
 +			struct cpumask *cpus)
  {
  	struct sched_group *group;
  	struct rq *busiest = NULL;
@@@ -3619,7 -3616,7 +3622,7 @@@
  	int sd_idle = 0;
  	int all_pinned = 0;
  
 -	cpus_setall(*cpus);
 +	cpumask_setall(cpus);
  
  	/*
  	 * When power savings policy is enabled for the parent domain, idle
@@@ -3663,71 -3660,17 +3666,71 @@@ redo
  		double_unlock_balance(this_rq, busiest);
  
  		if (unlikely(all_pinned)) {
 -			cpu_clear(cpu_of(busiest), *cpus);
 -			if (!cpus_empty(*cpus))
 +			cpumask_clear_cpu(cpu_of(busiest), cpus);
 +			if (!cpumask_empty(cpus))
  				goto redo;
  		}
  	}
  
  	if (!ld_moved) {
 +		int active_balance = 0;
 +
  		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
  		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
  		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
  			return -1;
 +
 +		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
 +			return -1;
 +
 +		if (sd->nr_balance_failed++ < 2)
 +			return -1;
 +
 +		/*
 +		 * The only task running in a non-idle cpu can be moved to this
 +		 * cpu in an attempt to completely freeup the other CPU
 +		 * package. The same method used to move task in load_balance()
 +		 * have been extended for load_balance_newidle() to speedup
 +		 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
 +		 *
 +		 * The package power saving logic comes from
 +		 * find_busiest_group().  If there are no imbalance, then
 +		 * f_b_g() will return NULL.  However when sched_mc={1,2} then
 +		 * f_b_g() will select a group from which a running task may be
 +		 * pulled to this cpu in order to make the other package idle.
 +		 * If there is no opportunity to make a package idle and if
 +		 * there are no imbalance, then f_b_g() will return NULL and no
 +		 * action will be taken in load_balance_newidle().
 +		 *
 +		 * Under normal task pull operation due to imbalance, there
 +		 * will be more than one task in the source run queue and
 +		 * move_tasks() will succeed.  ld_moved will be true and this
 +		 * active balance code will not be triggered.
 +		 */
 +
 +		/* Lock busiest in correct order while this_rq is held */
 +		double_lock_balance(this_rq, busiest);
 +
 +		/*
 +		 * don't kick the migration_thread, if the curr
 +		 * task on busiest cpu can't be moved to this_cpu
 +		 */
- 		if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
++		if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
 +			double_unlock_balance(this_rq, busiest);
 +			all_pinned = 1;
 +			return ld_moved;
 +		}
 +
 +		if (!busiest->active_balance) {
 +			busiest->active_balance = 1;
 +			busiest->push_cpu = this_cpu;
 +			active_balance = 1;
 +		}
 +
 +		double_unlock_balance(this_rq, busiest);
 +		if (active_balance)
 +			wake_up_process(busiest->migration_thread);
 +
  	} else
  		sd->nr_balance_failed = 0;
  
@@@ -3753,10 -3696,7 +3756,10 @@@ static void idle_balance(int this_cpu, 
  	struct sched_domain *sd;
  	int pulled_task = 0;
  	unsigned long next_balance = jiffies + HZ;
 -	cpumask_t tmpmask;
 +	cpumask_var_t tmpmask;
 +
 +	if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
 +		return;
  
  	for_each_domain(this_cpu, sd) {
  		unsigned long interval;
@@@ -3767,7 -3707,7 +3770,7 @@@
  		if (sd->flags & SD_BALANCE_NEWIDLE)
  			/* If we've pulled tasks over stop searching: */
  			pulled_task = load_balance_newidle(this_cpu, this_rq,
 -							   sd, &tmpmask);
 +							   sd, tmpmask);
  
  		interval = msecs_to_jiffies(sd->balance_interval);
  		if (time_after(next_balance, sd->last_balance + interval))
@@@ -3782,7 -3722,6 +3785,7 @@@
  		 */
  		this_rq->next_balance = next_balance;
  	}
 +	free_cpumask_var(tmpmask);
  }
  
  /*
@@@ -3820,7 -3759,7 +3823,7 @@@ static void active_load_balance(struct 
  	/* Search for an sd spanning us and the target CPU. */
  	for_each_domain(target_cpu, sd) {
  		if ((sd->flags & SD_LOAD_BALANCE) &&
 -		    cpu_isset(busiest_cpu, sd->span))
 +		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
  				break;
  	}
  
@@@ -3839,9 -3778,10 +3842,9 @@@
  #ifdef CONFIG_NO_HZ
  static struct {
  	atomic_t load_balancer;
 -	cpumask_t cpu_mask;
 +	cpumask_var_t cpu_mask;
  } nohz ____cacheline_aligned = {
  	.load_balancer = ATOMIC_INIT(-1),
 -	.cpu_mask = CPU_MASK_NONE,
  };
  
  /*
@@@ -3869,7 -3809,7 +3872,7 @@@ int select_nohz_load_balancer(int stop_
  	int cpu = smp_processor_id();
  
  	if (stop_tick) {
 -		cpu_set(cpu, nohz.cpu_mask);
 +		cpumask_set_cpu(cpu, nohz.cpu_mask);
  		cpu_rq(cpu)->in_nohz_recently = 1;
  
  		/*
@@@ -3883,7 -3823,7 +3886,7 @@@
  		}
  
  		/* time for ilb owner also to sleep */
 -		if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
 +		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
  			if (atomic_read(&nohz.load_balancer) == cpu)
  				atomic_set(&nohz.load_balancer, -1);
  			return 0;
@@@ -3896,10 -3836,10 +3899,10 @@@
  		} else if (atomic_read(&nohz.load_balancer) == cpu)
  			return 1;
  	} else {
 -		if (!cpu_isset(cpu, nohz.cpu_mask))
 +		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
  			return 0;
  
 -		cpu_clear(cpu, nohz.cpu_mask);
 +		cpumask_clear_cpu(cpu, nohz.cpu_mask);
  
  		if (atomic_read(&nohz.load_balancer) == cpu)
  			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@@ -3927,11 -3867,7 +3930,11 @@@ static void rebalance_domains(int cpu, 
  	unsigned long next_balance = jiffies + 60*HZ;
  	int update_next_balance = 0;
  	int need_serialize;
 -	cpumask_t tmp;
 +	cpumask_var_t tmp;
 +
 +	/* Fails alloc?  Rebalancing probably not a priority right now. */
 +	if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
 +		return;
  
  	for_each_domain(cpu, sd) {
  		if (!(sd->flags & SD_LOAD_BALANCE))
@@@ -3956,7 -3892,7 +3959,7 @@@
  		}
  
  		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 -			if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
 +			if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
  				/*
  				 * We've pulled tasks over so either we're no
  				 * longer idle, or one of our SMT siblings is
@@@ -3990,8 -3926,6 +3993,8 @@@ out
  	 */
  	if (likely(update_next_balance))
  		rq->next_balance = next_balance;
 +
 +	free_cpumask_var(tmp);
  }
  
  /*
@@@ -4016,13 -3950,12 +4019,13 @@@ static void run_rebalance_domains(struc
  	 */
  	if (this_rq->idle_at_tick &&
  	    atomic_read(&nohz.load_balancer) == this_cpu) {
 -		cpumask_t cpus = nohz.cpu_mask;
  		struct rq *rq;
  		int balance_cpu;
  
 -		cpu_clear(this_cpu, cpus);
 -		for_each_cpu_mask_nr(balance_cpu, cpus) {
 +		for_each_cpu(balance_cpu, nohz.cpu_mask) {
 +			if (balance_cpu == this_cpu)
 +				continue;
 +
  			/*
  			 * If this cpu gets work to do, stop the load balancing
  			 * work being done for other cpus. Next load
@@@ -4060,7 -3993,7 +4063,7 @@@ static inline void trigger_load_balance
  		rq->in_nohz_recently = 0;
  
  		if (atomic_read(&nohz.load_balancer) == cpu) {
 -			cpu_clear(cpu, nohz.cpu_mask);
 +			cpumask_clear_cpu(cpu, nohz.cpu_mask);
  			atomic_set(&nohz.load_balancer, -1);
  		}
  
@@@ -4073,7 -4006,7 +4076,7 @@@
  			 * TBD: Traverse the sched domains and nominate
  			 * the nearest cpu in the nohz.cpu_mask.
  			 */
 -			int ilb = first_cpu(nohz.cpu_mask);
 +			int ilb = cpumask_first(nohz.cpu_mask);
  
  			if (ilb < nr_cpu_ids)
  				resched_cpu(ilb);
@@@ -4085,7 -4018,7 +4088,7 @@@
  	 * cpus with ticks stopped, is it time for that to stop?
  	 */
  	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
 -	    cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
 +	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
  		resched_cpu(cpu);
  		return;
  	}
@@@ -4095,7 -4028,7 +4098,7 @@@
  	 * someone else, then no need raise the SCHED_SOFTIRQ
  	 */
  	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
 -	    cpu_isset(cpu, nohz.cpu_mask))
 +	    cpumask_test_cpu(cpu, nohz.cpu_mask))
  		return;
  #endif
  	if (time_after_eq(jiffies, rq->next_balance))
@@@ -5187,6 -5120,22 +5190,22 @@@ __setscheduler(struct rq *rq, struct ta
  	set_load_weight(p);
  }
  
+ /*
+  * check the target process has a UID that matches the current process's
+  */
+ static bool check_same_owner(struct task_struct *p)
+ {
+ 	const struct cred *cred = current_cred(), *pcred;
+ 	bool match;
+ 
+ 	rcu_read_lock();
+ 	pcred = __task_cred(p);
+ 	match = (cred->euid == pcred->euid ||
+ 		 cred->euid == pcred->uid);
+ 	rcu_read_unlock();
+ 	return match;
+ }
+ 
  static int __sched_setscheduler(struct task_struct *p, int policy,
  				struct sched_param *param, bool user)
  {
@@@ -5246,8 -5195,7 +5265,7 @@@ recheck
  			return -EPERM;
  
  		/* can't change other user's priorities */
- 		if ((current->euid != p->euid) &&
- 		    (current->euid != p->uid))
+ 		if (!check_same_owner(p))
  			return -EPERM;
  	}
  
@@@ -5453,9 -5401,10 +5471,9 @@@ out_unlock
  	return retval;
  }
  
 -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  {
 -	cpumask_t cpus_allowed;
 -	cpumask_t new_mask = *in_mask;
 +	cpumask_var_t cpus_allowed, new_mask;
  	struct task_struct *p;
  	int retval;
  
@@@ -5477,58 -5426,45 +5495,57 @@@
  	get_task_struct(p);
  	read_unlock(&tasklist_lock);
  
 +	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
 +		retval = -ENOMEM;
 +		goto out_put_task;
 +	}
 +	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
 +		retval = -ENOMEM;
 +		goto out_free_cpus_allowed;
 +	}
  	retval = -EPERM;
- 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
- 			!capable(CAP_SYS_NICE))
+ 	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
  		goto out_unlock;
  
  	retval = security_task_setscheduler(p, 0, NULL);
  	if (retval)
  		goto out_unlock;
  
 -	cpuset_cpus_allowed(p, &cpus_allowed);
 -	cpus_and(new_mask, new_mask, cpus_allowed);
 +	cpuset_cpus_allowed(p, cpus_allowed);
 +	cpumask_and(new_mask, in_mask, cpus_allowed);
   again:
 -	retval = set_cpus_allowed_ptr(p, &new_mask);
 +	retval = set_cpus_allowed_ptr(p, new_mask);
  
  	if (!retval) {
 -		cpuset_cpus_allowed(p, &cpus_allowed);
 -		if (!cpus_subset(new_mask, cpus_allowed)) {
 +		cpuset_cpus_allowed(p, cpus_allowed);
 +		if (!cpumask_subset(new_mask, cpus_allowed)) {
  			/*
  			 * We must have raced with a concurrent cpuset
  			 * update. Just reset the cpus_allowed to the
  			 * cpuset's cpus_allowed
  			 */
 -			new_mask = cpus_allowed;
 +			cpumask_copy(new_mask, cpus_allowed);
  			goto again;
  		}
  	}
  out_unlock:
 +	free_cpumask_var(new_mask);
 +out_free_cpus_allowed:
 +	free_cpumask_var(cpus_allowed);
 +out_put_task:
  	put_task_struct(p);
  	put_online_cpus();
  	return retval;
  }
  
  static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 -			     cpumask_t *new_mask)
 +			     struct cpumask *new_mask)
  {
 -	if (len < sizeof(cpumask_t)) {
 -		memset(new_mask, 0, sizeof(cpumask_t));
 -	} else if (len > sizeof(cpumask_t)) {
 -		len = sizeof(cpumask_t);
 -	}
 +	if (len < cpumask_size())
 +		cpumask_clear(new_mask);
 +	else if (len > cpumask_size())
 +		len = cpumask_size();
 +
  	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
  }
  
@@@ -5541,20 -5477,17 +5558,20 @@@
  asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
  				      unsigned long __user *user_mask_ptr)
  {
 -	cpumask_t new_mask;
 +	cpumask_var_t new_mask;
  	int retval;
  
 -	retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
 -	if (retval)
 -		return retval;
 +	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
 +		return -ENOMEM;
  
 -	return sched_setaffinity(pid, &new_mask);
 +	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
 +	if (retval == 0)
 +		retval = sched_setaffinity(pid, new_mask);
 +	free_cpumask_var(new_mask);
 +	return retval;
  }
  
 -long sched_getaffinity(pid_t pid, cpumask_t *mask)
 +long sched_getaffinity(pid_t pid, struct cpumask *mask)
  {
  	struct task_struct *p;
  	int retval;
@@@ -5571,7 -5504,7 +5588,7 @@@
  	if (retval)
  		goto out_unlock;
  
 -	cpus_and(*mask, p->cpus_allowed, cpu_online_map);
 +	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
  
  out_unlock:
  	read_unlock(&tasklist_lock);
@@@ -5590,24 -5523,19 +5607,24 @@@ asmlinkage long sys_sched_getaffinity(p
  				      unsigned long __user *user_mask_ptr)
  {
  	int ret;
 -	cpumask_t mask;
 +	cpumask_var_t mask;
  
 -	if (len < sizeof(cpumask_t))
 +	if (len < cpumask_size())
  		return -EINVAL;
  
 -	ret = sched_getaffinity(pid, &mask);
 -	if (ret < 0)
 -		return ret;
 +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 +		return -ENOMEM;
  
 -	if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
 -		return -EFAULT;
 +	ret = sched_getaffinity(pid, mask);
 +	if (ret == 0) {
 +		if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
 +			ret = -EFAULT;
 +		else
 +			ret = cpumask_size();
 +	}
 +	free_cpumask_var(mask);
  
 -	return sizeof(cpumask_t);
 +	return ret;
  }
  
  /**
@@@ -5949,7 -5877,7 +5966,7 @@@ void __cpuinit init_idle(struct task_st
  	idle->se.exec_start = sched_clock();
  
  	idle->prio = idle->normal_prio = MAX_PRIO;
 -	idle->cpus_allowed = cpumask_of_cpu(cpu);
 +	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
  	__set_task_cpu(idle, cpu);
  
  	rq->curr = rq->idle = idle;
@@@ -5976,9 -5904,9 +5993,9 @@@
   * indicates which cpus entered this state. This is used
   * in the rcu update to wait only for active cpus. For system
   * which do not switch off the HZ timer nohz_cpu_mask should
 - * always be CPU_MASK_NONE.
 + * always be empty (no CPU bits set).
   */
 -cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
 +cpumask_var_t nohz_cpu_mask;
  
  /*
   * Increase the granularity value when there are more CPUs,
@@@ -6033,7 -5961,7 +6050,7 @@@ static inline void sched_init_granulari
   * task must not exit() & deallocate itself prematurely. The
   * call is not atomic; no spinlocks may be held.
   */
 -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
  	struct migration_req req;
  	unsigned long flags;
@@@ -6041,13 -5969,13 +6058,13 @@@
  	int ret = 0;
  
  	rq = task_rq_lock(p, &flags);
 -	if (!cpus_intersects(*new_mask, cpu_online_map)) {
 +	if (!cpumask_intersects(new_mask, cpu_online_mask)) {
  		ret = -EINVAL;
  		goto out;
  	}
  
  	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
 -		     !cpus_equal(p->cpus_allowed, *new_mask))) {
 +		     !cpumask_equal(&p->cpus_allowed, new_mask))) {
  		ret = -EINVAL;
  		goto out;
  	}
@@@ -6055,15 -5983,15 +6072,15 @@@
  	if (p->sched_class->set_cpus_allowed)
  		p->sched_class->set_cpus_allowed(p, new_mask);
  	else {
 -		p->cpus_allowed = *new_mask;
 -		p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
 +		cpumask_copy(&p->cpus_allowed, new_mask);
 +		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
  	}
  
  	/* Can the task run on the task's current CPU? If so, we're done */
 -	if (cpu_isset(task_cpu(p), *new_mask))
 +	if (cpumask_test_cpu(task_cpu(p), new_mask))
  		goto out;
  
 -	if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
 +	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
  		/* Need help from migration thread: drop lock and wait. */
  		task_rq_unlock(rq, &flags);
  		wake_up_process(rq->migration_thread);
@@@ -6105,7 -6033,7 +6122,7 @@@ static int __migrate_task(struct task_s
  	if (task_cpu(p) != src_cpu)
  		goto done;
  	/* Affinity changed (again). */
 -	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 +	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
  		goto fail;
  
  	on_rq = p->se.on_rq;
@@@ -6202,43 -6130,50 +6219,43 @@@ static int __migrate_task_irq(struct ta
   */
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
 -	unsigned long flags;
 -	cpumask_t mask;
 -	struct rq *rq;
  	int dest_cpu;
 +	/* FIXME: Use cpumask_of_node here. */
 +	cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
 +	const struct cpumask *nodemask = &_nodemask;
 +
 +again:
 +	/* Look for allowed, online CPU in same node. */
 +	for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
 +		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 +			goto move;
 +
 +	/* Any allowed, online CPU? */
 +	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
 +	if (dest_cpu < nr_cpu_ids)
 +		goto move;
 +
 +	/* No more Mr. Nice Guy. */
 +	if (dest_cpu >= nr_cpu_ids) {
 +		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
 +		dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
  
 -	do {
 -		/* On same node? */
 -		mask = node_to_cpumask(cpu_to_node(dead_cpu));
 -		cpus_and(mask, mask, p->cpus_allowed);
 -		dest_cpu = any_online_cpu(mask);
 -
 -		/* On any allowed CPU? */
 -		if (dest_cpu >= nr_cpu_ids)
 -			dest_cpu = any_online_cpu(p->cpus_allowed);
 -
 -		/* No more Mr. Nice Guy. */
 -		if (dest_cpu >= nr_cpu_ids) {
 -			cpumask_t cpus_allowed;
 -
 -			cpuset_cpus_allowed_locked(p, &cpus_allowed);
 -			/*
 -			 * Try to stay on the same cpuset, where the
 -			 * current cpuset may be a subset of all cpus.
 -			 * The cpuset_cpus_allowed_locked() variant of
 -			 * cpuset_cpus_allowed() will not block. It must be
 -			 * called within calls to cpuset_lock/cpuset_unlock.
 -			 */
 -			rq = task_rq_lock(p, &flags);
 -			p->cpus_allowed = cpus_allowed;
 -			dest_cpu = any_online_cpu(p->cpus_allowed);
 -			task_rq_unlock(rq, &flags);
 -
 -			/*
 -			 * Don't tell them about moving exiting tasks or
 -			 * kernel threads (both mm NULL), since they never
 -			 * leave kernel.
 -			 */
 -			if (p->mm && printk_ratelimit()) {
 -				printk(KERN_INFO "process %d (%s) no "
 -				       "longer affine to cpu%d\n",
 -					task_pid_nr(p), p->comm, dead_cpu);
 -			}
 +		/*
 +		 * Don't tell them about moving exiting tasks or
 +		 * kernel threads (both mm NULL), since they never
 +		 * leave kernel.
 +		 */
 +		if (p->mm && printk_ratelimit()) {
 +			printk(KERN_INFO "process %d (%s) no "
 +			       "longer affine to cpu%d\n",
 +			       task_pid_nr(p), p->comm, dead_cpu);
  		}
 -	} while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
 +	}
 +
 +move:
 +	/* It can have affinity changed while we were choosing. */
 +	if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
 +		goto again;
  }
  
  /*
@@@ -6250,7 -6185,7 +6267,7 @@@
   */
  static void migrate_nr_uninterruptible(struct rq *rq_src)
  {
 -	struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
 +	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
  	unsigned long flags;
  
  	local_irq_save(flags);
@@@ -6540,7 -6475,7 +6557,7 @@@ static void set_rq_online(struct rq *rq
  	if (!rq->online) {
  		const struct sched_class *class;
  
 -		cpu_set(rq->cpu, rq->rd->online);
 +		cpumask_set_cpu(rq->cpu, rq->rd->online);
  		rq->online = 1;
  
  		for_each_class(class) {
@@@ -6560,7 -6495,7 +6577,7 @@@ static void set_rq_offline(struct rq *r
  				class->rq_offline(rq);
  		}
  
 -		cpu_clear(rq->cpu, rq->rd->online);
 +		cpumask_clear_cpu(rq->cpu, rq->rd->online);
  		rq->online = 0;
  	}
  }
@@@ -6601,7 -6536,7 +6618,7 @@@ migration_call(struct notifier_block *n
  		rq = cpu_rq(cpu);
  		spin_lock_irqsave(&rq->lock, flags);
  		if (rq->rd) {
 -			BUG_ON(!cpu_isset(cpu, rq->rd->span));
 +			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  
  			set_rq_online(rq);
  		}
@@@ -6615,7 -6550,7 +6632,7 @@@
  			break;
  		/* Unbind it from offline cpu so it can run. Fall thru. */
  		kthread_bind(cpu_rq(cpu)->migration_thread,
 -			     any_online_cpu(cpu_online_map));
 +			     cpumask_any(cpu_online_mask));
  		kthread_stop(cpu_rq(cpu)->migration_thread);
  		cpu_rq(cpu)->migration_thread = NULL;
  		break;
@@@ -6665,7 -6600,7 +6682,7 @@@
  		rq = cpu_rq(cpu);
  		spin_lock_irqsave(&rq->lock, flags);
  		if (rq->rd) {
 -			BUG_ON(!cpu_isset(cpu, rq->rd->span));
 +			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  			set_rq_offline(rq);
  		}
  		spin_unlock_irqrestore(&rq->lock, flags);
@@@ -6704,13 -6639,13 +6721,13 @@@ early_initcall(migration_init)
  #ifdef CONFIG_SCHED_DEBUG
  
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 -				  cpumask_t *groupmask)
 +				  struct cpumask *groupmask)
  {
  	struct sched_group *group = sd->groups;
  	char str[256];
  
 -	cpulist_scnprintf(str, sizeof(str), &sd->span);
 -	cpus_clear(*groupmask);
 +	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
 +	cpumask_clear(groupmask);
  
  	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
  
@@@ -6724,11 -6659,11 +6741,11 @@@
  
  	printk(KERN_CONT "span %s level %s\n", str, sd->name);
  
 -	if (!cpu_isset(cpu, sd->span)) {
 +	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
  		printk(KERN_ERR "ERROR: domain->span does not contain "
  				"CPU%d\n", cpu);
  	}
 -	if (!cpu_isset(cpu, group->cpumask)) {
 +	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
  		printk(KERN_ERR "ERROR: domain->groups does not contain"
  				" CPU%d\n", cpu);
  	}
@@@ -6748,32 -6683,31 +6765,32 @@@
  			break;
  		}
  
 -		if (!cpus_weight(group->cpumask)) {
 +		if (!cpumask_weight(sched_group_cpus(group))) {
  			printk(KERN_CONT "\n");
  			printk(KERN_ERR "ERROR: empty group\n");
  			break;
  		}
  
 -		if (cpus_intersects(*groupmask, group->cpumask)) {
 +		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
  			printk(KERN_CONT "\n");
  			printk(KERN_ERR "ERROR: repeated CPUs\n");
  			break;
  		}
  
 -		cpus_or(*groupmask, *groupmask, group->cpumask);
 +		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
 -		cpulist_scnprintf(str, sizeof(str), &group->cpumask);
 +		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
  		printk(KERN_CONT " %s", str);
  
  		group = group->next;
  	} while (group != sd->groups);
  	printk(KERN_CONT "\n");
  
 -	if (!cpus_equal(sd->span, *groupmask))
 +	if (!cpumask_equal(sched_domain_span(sd), groupmask))
  		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
  
 -	if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
 +	if (sd->parent &&
 +	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
  		printk(KERN_ERR "ERROR: parent span is not a superset "
  			"of domain->span\n");
  	return 0;
@@@ -6781,7 -6715,7 +6798,7 @@@
  
  static void sched_domain_debug(struct sched_domain *sd, int cpu)
  {
 -	cpumask_t *groupmask;
 +	cpumask_var_t groupmask;
  	int level = 0;
  
  	if (!sd) {
@@@ -6791,7 -6725,8 +6808,7 @@@
  
  	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
  
 -	groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 -	if (!groupmask) {
 +	if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
  		printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
  		return;
  	}
@@@ -6804,7 -6739,7 +6821,7 @@@
  		if (!sd)
  			break;
  	}
 -	kfree(groupmask);
 +	free_cpumask_var(groupmask);
  }
  #else /* !CONFIG_SCHED_DEBUG */
  # define sched_domain_debug(sd, cpu) do { } while (0)
@@@ -6812,7 -6747,7 +6829,7 @@@
  
  static int sd_degenerate(struct sched_domain *sd)
  {
 -	if (cpus_weight(sd->span) == 1)
 +	if (cpumask_weight(sched_domain_span(sd)) == 1)
  		return 1;
  
  	/* Following flags need at least 2 groups */
@@@ -6843,7 -6778,7 +6860,7 @@@ sd_parent_degenerate(struct sched_domai
  	if (sd_degenerate(parent))
  		return 1;
  
 -	if (!cpus_equal(sd->span, parent->span))
 +	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
  		return 0;
  
  	/* Does parent contain flags not in child? */
@@@ -6867,16 -6802,6 +6884,16 @@@
  	return 1;
  }
  
 +static void free_rootdomain(struct root_domain *rd)
 +{
 +	cpupri_cleanup(&rd->cpupri);
 +
 +	free_cpumask_var(rd->rto_mask);
 +	free_cpumask_var(rd->online);
 +	free_cpumask_var(rd->span);
 +	kfree(rd);
 +}
 +
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
  	unsigned long flags;
@@@ -6886,63 -6811,38 +6903,63 @@@
  	if (rq->rd) {
  		struct root_domain *old_rd = rq->rd;
  
 -		if (cpu_isset(rq->cpu, old_rd->online))
 +		if (cpumask_test_cpu(rq->cpu, old_rd->online))
  			set_rq_offline(rq);
  
 -		cpu_clear(rq->cpu, old_rd->span);
 +		cpumask_clear_cpu(rq->cpu, old_rd->span);
  
  		if (atomic_dec_and_test(&old_rd->refcount))
 -			kfree(old_rd);
 +			free_rootdomain(old_rd);
  	}
  
  	atomic_inc(&rd->refcount);
  	rq->rd = rd;
  
 -	cpu_set(rq->cpu, rd->span);
 -	if (cpu_isset(rq->cpu, cpu_online_map))
 +	cpumask_set_cpu(rq->cpu, rd->span);
 +	if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
  		set_rq_online(rq);
  
  	spin_unlock_irqrestore(&rq->lock, flags);
  }
  
 -static void init_rootdomain(struct root_domain *rd)
 +/* Set up @rd; returns 0 or -ENOMEM. On error the caller still owns @rd. */
 +static int init_rootdomain(struct root_domain *rd, bool bootmem)
  {
  	memset(rd, 0, sizeof(*rd));
  
 -	cpus_clear(rd->span);
 -	cpus_clear(rd->online);
 +	if (bootmem) {
 +		alloc_bootmem_cpumask_var(&rd->span);
 +		alloc_bootmem_cpumask_var(&rd->online);
 +		alloc_bootmem_cpumask_var(&rd->rto_mask);
 +		cpupri_init(&rd->cpupri, true);
 +		return 0;
 +	}
 +
 +	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
 +		return -ENOMEM;
 +	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
 +		goto free_span;
 +	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 +		goto free_online;
 +
 +	if (cpupri_init(&rd->cpupri, false) != 0)
 +		goto free_rto_mask;
 +	return 0;
  
 -	cpupri_init(&rd->cpupri);
 +free_rto_mask:
 +	free_cpumask_var(rd->rto_mask);
 +free_online:
 +	free_cpumask_var(rd->online);
 +free_span:
 +	free_cpumask_var(rd->span);
 +	/* rd is freed by the caller; freeing it here too would double free. */
 +	return -ENOMEM;
  }
  
  static void init_defrootdomain(void)
  {
 -	init_rootdomain(&def_root_domain);
 +	init_rootdomain(&def_root_domain, true);
 +
  	atomic_set(&def_root_domain.refcount, 1);
  }
  
@@@ -6954,10 -6854,7 +6971,10 @@@ static struct root_domain *alloc_rootdo
  	if (!rd)
  		return NULL;
  
 -	init_rootdomain(rd);
 +	if (init_rootdomain(rd, false) != 0) {
 +		kfree(rd);
 +		return NULL;
 +	}
  
  	return rd;
  }
@@@ -6999,12 -6896,19 +7016,12 @@@ cpu_attach_domain(struct sched_domain *
  }
  
  /* cpus with isolated domains */
 -static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
 +static cpumask_var_t cpu_isolated_map;
  
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
  {
 -	static int __initdata ints[NR_CPUS];
 -	int i;
 -
 -	str = get_options(str, ARRAY_SIZE(ints), ints);
 -	cpus_clear(cpu_isolated_map);
 -	for (i = 1; i <= ints[0]; i++)
 -		if (ints[i] < NR_CPUS)
 -			cpu_set(ints[i], cpu_isolated_map);
 +	cpulist_parse(str, cpu_isolated_map);
  	return 1;
  }
  
@@@ -7013,43 -6917,42 +7030,43 @@@ __setup("isolcpus=", isolated_cpu_setup
  /*
   * init_sched_build_groups takes the cpumask we wish to span, and a pointer
   * to a function which identifies what group(along with sched group) a CPU
 - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
 - * (due to the fact that we keep track of groups covered with a cpumask_t).
 + * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
 + * (due to the fact that we keep track of groups covered with a struct cpumask).
   *
   * init_sched_build_groups will build a circular linked list of the groups
   * covered by the given span, and will set each group's ->cpumask correctly,
   * and ->cpu_power to 0.
   */
  static void
 -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 -			int (*group_fn)(int cpu, const cpumask_t *cpu_map,
 +init_sched_build_groups(const struct cpumask *span,
 +			const struct cpumask *cpu_map,
 +			int (*group_fn)(int cpu, const struct cpumask *cpu_map,
  					struct sched_group **sg,
 -					cpumask_t *tmpmask),
 -			cpumask_t *covered, cpumask_t *tmpmask)
 +					struct cpumask *tmpmask),
 +			struct cpumask *covered, struct cpumask *tmpmask)
  {
  	struct sched_group *first = NULL, *last = NULL;
  	int i;
  
 -	cpus_clear(*covered);
 +	cpumask_clear(covered);
  
 -	for_each_cpu_mask_nr(i, *span) {
 +	for_each_cpu(i, span) {
  		struct sched_group *sg;
  		int group = group_fn(i, cpu_map, &sg, tmpmask);
  		int j;
  
 -		if (cpu_isset(i, *covered))
 +		if (cpumask_test_cpu(i, covered))
  			continue;
  
 -		cpus_clear(sg->cpumask);
 +		cpumask_clear(sched_group_cpus(sg));
  		sg->__cpu_power = 0;
  
 -		for_each_cpu_mask_nr(j, *span) {
 +		for_each_cpu(j, span) {
  			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
  				continue;
  
 -			cpu_set(j, *covered);
 -			cpu_set(j, sg->cpumask);
 +			cpumask_set_cpu(j, covered);
 +			cpumask_set_cpu(j, sched_group_cpus(sg));
  		}
  		if (!first)
  			first = sg;
@@@ -7113,10 -7016,9 +7130,10 @@@ static int find_next_best_node(int node
   * should be one that prevents unnecessary balancing, but also spreads tasks
   * out optimally.
   */
 -static void sched_domain_node_span(int node, cpumask_t *span)
 +static void sched_domain_node_span(int node, struct cpumask *span)
  {
  	nodemask_t used_nodes;
 +	/* FIXME: use cpumask_of_node() */
  	node_to_cpumask_ptr(nodemask, node);
  	int i;
  
@@@ -7137,34 -7039,19 +7154,34 @@@
  
  int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  
 +/*
 + * The cpus mask in sched_group and sched_domain hangs off the end.
 + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
 + * for nr_cpu_ids < CONFIG_NR_CPUS.
 + */
 +struct static_sched_group {
 +	struct sched_group sg;
 +	DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
 +};
 +
 +struct static_sched_domain {
 +	struct sched_domain sd;
 +	DECLARE_BITMAP(span, CONFIG_NR_CPUS);
 +};
 +
  /*
   * SMT sched-domains:
   */
  #ifdef CONFIG_SCHED_SMT
 -static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
 +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
 +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
  
  static int
 -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 -		 cpumask_t *unused)
 +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
 +		 struct sched_group **sg, struct cpumask *unused)
  {
  	if (sg)
 -		*sg = &per_cpu(sched_group_cpus, cpu);
 +		*sg = &per_cpu(sched_group_cpus, cpu).sg;
  	return cpu;
  }
  #endif /* CONFIG_SCHED_SMT */
@@@ -7173,55 -7060,56 +7190,55 @@@
   * multi-core sched-domains:
   */
  #ifdef CONFIG_SCHED_MC
 -static DEFINE_PER_CPU(struct sched_domain, core_domains);
 -static DEFINE_PER_CPU(struct sched_group, sched_group_core);
 +static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
 +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
  #endif /* CONFIG_SCHED_MC */
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
 -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 -		  cpumask_t *mask)
 +cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 +		  struct sched_group **sg, struct cpumask *mask)
  {
  	int group;
  
 -	*mask = per_cpu(cpu_sibling_map, cpu);
 -	cpus_and(*mask, *mask, *cpu_map);
 -	group = first_cpu(*mask);
 +	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
 +	group = cpumask_first(mask);
  	if (sg)
 -		*sg = &per_cpu(sched_group_core, group);
 +		*sg = &per_cpu(sched_group_core, group).sg;
  	return group;
  }
  #elif defined(CONFIG_SCHED_MC)
  static int
 -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 -		  cpumask_t *unused)
 +cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 +		  struct sched_group **sg, struct cpumask *unused)
  {
  	if (sg)
 -		*sg = &per_cpu(sched_group_core, cpu);
 +		*sg = &per_cpu(sched_group_core, cpu).sg;
  	return cpu;
  }
  #endif
  
 -static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 -static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
 +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
  
  static int
 -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 -		  cpumask_t *mask)
 +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 +		  struct sched_group **sg, struct cpumask *mask)
  {
  	int group;
  #ifdef CONFIG_SCHED_MC
 -	*mask = *cpu_coregroup_mask(cpu);
 +	/* FIXME: Use cpu_coregroup_mask. */
 +	*mask = cpu_coregroup_map(cpu);
  	cpus_and(*mask, *mask, *cpu_map);
 -	group = first_cpu(*mask);
 +	group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
 -	*mask = per_cpu(cpu_sibling_map, cpu);
 -	cpus_and(*mask, *mask, *cpu_map);
 -	group = first_cpu(*mask);
 +	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
 +	group = cpumask_first(mask);
  #else
  	group = cpu;
  #endif
  	if (sg)
 -		*sg = &per_cpu(sched_group_phys, group);
 +		*sg = &per_cpu(sched_group_phys, group).sg;
  	return group;
  }
  
@@@ -7235,21 -7123,19 +7252,21 @@@ static DEFINE_PER_CPU(struct sched_doma
  static struct sched_group ***sched_group_nodes_bycpu;
  
  static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
 -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
 +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
  
 -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
 -				 struct sched_group **sg, cpumask_t *nodemask)
 +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
 +				 struct sched_group **sg,
 +				 struct cpumask *nodemask)
  {
  	int group;
 +	/* FIXME: use cpumask_of_node */
 +	node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
  
 -	*nodemask = node_to_cpumask(cpu_to_node(cpu));
 -	cpus_and(*nodemask, *nodemask, *cpu_map);
 -	group = first_cpu(*nodemask);
 +	cpumask_and(nodemask, pnodemask, cpu_map);
 +	group = cpumask_first(nodemask);
  
  	if (sg)
 -		*sg = &per_cpu(sched_group_allnodes, group);
 +		*sg = &per_cpu(sched_group_allnodes, group).sg;
  	return group;
  }
  
@@@ -7261,11 -7147,11 +7278,11 @@@ static void init_numa_sched_groups_powe
  	if (!sg)
  		return;
  	do {
 -		for_each_cpu_mask_nr(j, sg->cpumask) {
 +		for_each_cpu(j, sched_group_cpus(sg)) {
  			struct sched_domain *sd;
  
 -			sd = &per_cpu(phys_domains, j);
 -			if (j != first_cpu(sd->groups->cpumask)) {
 +			sd = &per_cpu(phys_domains, j).sd;
 +			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
  				/*
  				 * Only add "power" once for each
  				 * physical package.
@@@ -7282,12 -7168,11 +7299,12 @@@
  
  #ifdef CONFIG_NUMA
  /* Free memory allocated for various sched_group structures */
 -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 +static void free_sched_groups(const struct cpumask *cpu_map,
 +			      struct cpumask *nodemask)
  {
  	int cpu, i;
  
 -	for_each_cpu_mask_nr(cpu, *cpu_map) {
 +	for_each_cpu(cpu, cpu_map) {
  		struct sched_group **sched_group_nodes
  			= sched_group_nodes_bycpu[cpu];
  
@@@ -7296,11 -7181,10 +7313,11 @@@
  
  		for (i = 0; i < nr_node_ids; i++) {
  			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 +			/* FIXME: Use cpumask_of_node */
 +			node_to_cpumask_ptr(pnodemask, i);
  
 -			*nodemask = node_to_cpumask(i);
 -			cpus_and(*nodemask, *nodemask, *cpu_map);
 -			if (cpus_empty(*nodemask))
 +			cpus_and(*nodemask, *pnodemask, *cpu_map);
 +			if (cpumask_empty(nodemask))
  				continue;
  
  			if (sg == NULL)
@@@ -7318,8 -7202,7 +7335,8 @@@ next_sg
  	}
  }
  #else /* !CONFIG_NUMA */
 -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 +static void free_sched_groups(const struct cpumask *cpu_map,
 +			      struct cpumask *nodemask)
  {
  }
  #endif /* CONFIG_NUMA */
@@@ -7345,7 -7228,7 +7362,7 @@@ static void init_sched_groups_power(in
  
  	WARN_ON(!sd || !sd->groups);
  
 -	if (cpu != first_cpu(sd->groups->cpumask))
 +	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
  		return;
  
  	child = sd->child;
@@@ -7410,6 -7293,48 +7427,6 @@@ SD_INIT_FUNC(CPU
   SD_INIT_FUNC(MC)
  #endif
  
 -/*
 - * To minimize stack usage kmalloc room for cpumasks and share the
 - * space as the usage in build_sched_domains() dictates.  Used only
 - * if the amount of space is significant.
 - */
 -struct allmasks {
 -	cpumask_t tmpmask;			/* make this one first */
 -	union {
 -		cpumask_t nodemask;
 -		cpumask_t this_sibling_map;
 -		cpumask_t this_core_map;
 -	};
 -	cpumask_t send_covered;
 -
 -#ifdef CONFIG_NUMA
 -	cpumask_t domainspan;
 -	cpumask_t covered;
 -	cpumask_t notcovered;
 -#endif
 -};
 -
 -#if	NR_CPUS > 128
 -#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
 -static inline void sched_cpumask_alloc(struct allmasks **masks)
 -{
 -	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
 -}
 -static inline void sched_cpumask_free(struct allmasks *masks)
 -{
 -	kfree(masks);
 -}
 -#else
 -#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
 -static inline void sched_cpumask_alloc(struct allmasks **masks)
 -{ }
 -static inline void sched_cpumask_free(struct allmasks *masks)
 -{ }
 -#endif
 -
 -#define	SCHED_CPUMASK_VAR(v, a) 	cpumask_t *v = (cpumask_t *) \
 -			((unsigned long)(a) + offsetof(struct allmasks, v))
 -
  static int default_relax_domain_level = -1;
  
  static int __init setup_relax_domain_level(char *str)
@@@ -7449,38 -7374,17 +7466,38 @@@ static void set_domain_attribute(struc
   * Build sched domains for a given set of cpus and attach the sched domains
   * to the individual cpus
   */
 -static int __build_sched_domains(const cpumask_t *cpu_map,
 +static int __build_sched_domains(const struct cpumask *cpu_map,
  				 struct sched_domain_attr *attr)
  {
 -	int i;
 +	int i, err = -ENOMEM;
  	struct root_domain *rd;
 -	SCHED_CPUMASK_DECLARE(allmasks);
 -	cpumask_t *tmpmask;
 +	cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
 +		tmpmask;
  #ifdef CONFIG_NUMA
 +	cpumask_var_t domainspan, covered, notcovered;
  	struct sched_group **sched_group_nodes = NULL;
  	int sd_allnodes = 0;
  
 +	if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
 +		goto out;
 +	if (!alloc_cpumask_var(&covered, GFP_KERNEL))
 +		goto free_domainspan;
 +	if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
 +		goto free_covered;
 +#endif
 +
 +	if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
 +		goto free_notcovered;
 +	if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
 +		goto free_nodemask;
 +	if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
 +		goto free_this_sibling_map;
 +	if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
 +		goto free_this_core_map;
 +	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 +		goto free_send_covered;
 +
 +#ifdef CONFIG_NUMA
  	/*
  	 * Allocate the per-node list of sched groups
  	 */
@@@ -7488,37 -7392,54 +7505,37 @@@
  				    GFP_KERNEL);
  	if (!sched_group_nodes) {
  		printk(KERN_WARNING "Can not alloc sched group node list\n");
 -		return -ENOMEM;
 +		goto free_tmpmask;
  	}
  #endif
  
  	rd = alloc_rootdomain();
  	if (!rd) {
  		printk(KERN_WARNING "Cannot alloc root domain\n");
 -#ifdef CONFIG_NUMA
 -		kfree(sched_group_nodes);
 -#endif
 -		return -ENOMEM;
 +		goto free_sched_groups;
  	}
  
 -	/* get space for all scratch cpumask variables */
 -	sched_cpumask_alloc(&allmasks);
 -	if (!allmasks) {
 -		printk(KERN_WARNING "Cannot alloc cpumask array\n");
 -		kfree(rd);
  #ifdef CONFIG_NUMA
 -		kfree(sched_group_nodes);
 -#endif
 -		return -ENOMEM;
 -	}
 -
 -	tmpmask = (cpumask_t *)allmasks;
 -
 -
 -#ifdef CONFIG_NUMA
 -	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 +	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
  #endif
  
  	/*
  	 * Set up domains for cpus specified by the cpu_map.
  	 */
 -	for_each_cpu_mask_nr(i, *cpu_map) {
 +	for_each_cpu(i, cpu_map) {
  		struct sched_domain *sd = NULL, *p;
 -		SCHED_CPUMASK_VAR(nodemask, allmasks);
  
 +		/* FIXME: use cpumask_of_node */
  		*nodemask = node_to_cpumask(cpu_to_node(i));
  		cpus_and(*nodemask, *nodemask, *cpu_map);
  
  #ifdef CONFIG_NUMA
 -		if (cpus_weight(*cpu_map) >
 -				SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
 +		if (cpumask_weight(cpu_map) >
 +				SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
  			sd = &per_cpu(allnodes_domains, i);
  			SD_INIT(sd, ALLNODES);
  			set_domain_attribute(sd, attr);
 -			sd->span = *cpu_map;
 +			cpumask_copy(sched_domain_span(sd), cpu_map);
  			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
  			p = sd;
  			sd_allnodes = 1;
@@@ -7528,19 -7449,18 +7545,19 @@@
  		sd = &per_cpu(node_domains, i);
  		SD_INIT(sd, NODE);
  		set_domain_attribute(sd, attr);
 -		sched_domain_node_span(cpu_to_node(i), &sd->span);
 +		sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
  		sd->parent = p;
  		if (p)
  			p->child = sd;
 -		cpus_and(sd->span, sd->span, *cpu_map);
 +		cpumask_and(sched_domain_span(sd),
 +			    sched_domain_span(sd), cpu_map);
  #endif
  
  		p = sd;
 -		sd = &per_cpu(phys_domains, i);
 +		sd = &per_cpu(phys_domains, i).sd;
  		SD_INIT(sd, CPU);
  		set_domain_attribute(sd, attr);
 -		sd->span = *nodemask;
 +		cpumask_copy(sched_domain_span(sd), nodemask);
  		sd->parent = p;
  		if (p)
  			p->child = sd;
@@@ -7548,12 -7468,11 +7565,12 @@@
  
  #ifdef CONFIG_SCHED_MC
  		p = sd;
 -		sd = &per_cpu(core_domains, i);
 +		sd = &per_cpu(core_domains, i).sd;
  		SD_INIT(sd, MC);
  		set_domain_attribute(sd, attr);
 -		sd->span = *cpu_coregroup_mask(i);
 -		cpus_and(sd->span, sd->span, *cpu_map);
 +		*sched_domain_span(sd) = cpu_coregroup_map(i);
 +		cpumask_and(sched_domain_span(sd),
 +			    sched_domain_span(sd), cpu_map);
  		sd->parent = p;
  		p->child = sd;
  		cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@@ -7561,11 -7480,11 +7578,11 @@@
  
  #ifdef CONFIG_SCHED_SMT
  		p = sd;
 -		sd = &per_cpu(cpu_domains, i);
 +		sd = &per_cpu(cpu_domains, i).sd;
  		SD_INIT(sd, SIBLING);
  		set_domain_attribute(sd, attr);
 -		sd->span = per_cpu(cpu_sibling_map, i);
 -		cpus_and(sd->span, sd->span, *cpu_map);
 +		cpumask_and(sched_domain_span(sd),
 +			    &per_cpu(cpu_sibling_map, i), cpu_map);
  		sd->parent = p;
  		p->child = sd;
  		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@@ -7574,10 -7493,13 +7591,10 @@@
  
  #ifdef CONFIG_SCHED_SMT
  	/* Set up CPU (sibling) groups */
 -	for_each_cpu_mask_nr(i, *cpu_map) {
 -		SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
 -		SCHED_CPUMASK_VAR(send_covered, allmasks);
 -
 -		*this_sibling_map = per_cpu(cpu_sibling_map, i);
 -		cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
 -		if (i != first_cpu(*this_sibling_map))
 +	for_each_cpu(i, cpu_map) {
 +		cpumask_and(this_sibling_map,
 +			    &per_cpu(cpu_sibling_map, i), cpu_map);
 +		if (i != cpumask_first(this_sibling_map))
  			continue;
  
  		init_sched_build_groups(this_sibling_map, cpu_map,
@@@ -7588,11 -7510,13 +7605,11 @@@
  
  #ifdef CONFIG_SCHED_MC
  	/* Set up multi-core groups */
 -	for_each_cpu_mask_nr(i, *cpu_map) {
 -		SCHED_CPUMASK_VAR(this_core_map, allmasks);
 -		SCHED_CPUMASK_VAR(send_covered, allmasks);
 -
 -		*this_core_map = *cpu_coregroup_mask(i);
 +	for_each_cpu(i, cpu_map) {
 +		/* FIXME: Use cpu_coregroup_mask */
 +		*this_core_map = cpu_coregroup_map(i);
  		cpus_and(*this_core_map, *this_core_map, *cpu_map);
 -		if (i != first_cpu(*this_core_map))
 +		if (i != cpumask_first(this_core_map))
  			continue;
  
  		init_sched_build_groups(this_core_map, cpu_map,
@@@ -7603,10 -7527,12 +7620,10 @@@
  
  	/* Set up physical groups */
  	for (i = 0; i < nr_node_ids; i++) {
 -		SCHED_CPUMASK_VAR(nodemask, allmasks);
 -		SCHED_CPUMASK_VAR(send_covered, allmasks);
 -
 +		/* FIXME: Use cpumask_of_node */
  		*nodemask = node_to_cpumask(i);
  		cpus_and(*nodemask, *nodemask, *cpu_map);
 -		if (cpus_empty(*nodemask))
 +		if (cpumask_empty(nodemask))
  			continue;
  
  		init_sched_build_groups(nodemask, cpu_map,
@@@ -7617,6 -7543,8 +7634,6 @@@
  #ifdef CONFIG_NUMA
  	/* Set up node groups */
  	if (sd_allnodes) {
 -		SCHED_CPUMASK_VAR(send_covered, allmasks);
 -
  		init_sched_build_groups(cpu_map, cpu_map,
  					&cpu_to_allnodes_group,
  					send_covered, tmpmask);
@@@ -7625,58 -7553,58 +7642,58 @@@
  	for (i = 0; i < nr_node_ids; i++) {
  		/* Set up node groups */
  		struct sched_group *sg, *prev;
 -		SCHED_CPUMASK_VAR(nodemask, allmasks);
 -		SCHED_CPUMASK_VAR(domainspan, allmasks);
 -		SCHED_CPUMASK_VAR(covered, allmasks);
  		int j;
  
 +		/* FIXME: Use cpumask_of_node */
  		*nodemask = node_to_cpumask(i);
 -		cpus_clear(*covered);
 +		cpumask_clear(covered);
  
  		cpus_and(*nodemask, *nodemask, *cpu_map);
 -		if (cpus_empty(*nodemask)) {
 +		if (cpumask_empty(nodemask)) {
  			sched_group_nodes[i] = NULL;
  			continue;
  		}
  
  		sched_domain_node_span(i, domainspan);
 -		cpus_and(*domainspan, *domainspan, *cpu_map);
 +		cpumask_and(domainspan, domainspan, cpu_map);
  
 -		sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
 +		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
 +				  GFP_KERNEL, i);
  		if (!sg) {
  			printk(KERN_WARNING "Can not alloc domain group for "
  				"node %d\n", i);
  			goto error;
  		}
  		sched_group_nodes[i] = sg;
 -		for_each_cpu_mask_nr(j, *nodemask) {
 +		for_each_cpu(j, nodemask) {
  			struct sched_domain *sd;
  
  			sd = &per_cpu(node_domains, j);
  			sd->groups = sg;
  		}
  		sg->__cpu_power = 0;
 -		sg->cpumask = *nodemask;
 +		cpumask_copy(sched_group_cpus(sg), nodemask);
  		sg->next = sg;
 -		cpus_or(*covered, *covered, *nodemask);
 +		cpumask_or(covered, covered, nodemask);
  		prev = sg;
  
  		for (j = 0; j < nr_node_ids; j++) {
 -			SCHED_CPUMASK_VAR(notcovered, allmasks);
  			int n = (i + j) % nr_node_ids;
 +			/* FIXME: Use cpumask_of_node */
  			node_to_cpumask_ptr(pnodemask, n);
  
 -			cpus_complement(*notcovered, *covered);
 -			cpus_and(*tmpmask, *notcovered, *cpu_map);
 -			cpus_and(*tmpmask, *tmpmask, *domainspan);
 -			if (cpus_empty(*tmpmask))
 +			cpumask_complement(notcovered, covered);
 +			cpumask_and(tmpmask, notcovered, cpu_map);
 +			cpumask_and(tmpmask, tmpmask, domainspan);
 +			if (cpumask_empty(tmpmask))
  				break;
  
 -			cpus_and(*tmpmask, *tmpmask, *pnodemask);
 -			if (cpus_empty(*tmpmask))
 +			cpumask_and(tmpmask, tmpmask, pnodemask);
 +			if (cpumask_empty(tmpmask))
  				continue;
  
 -			sg = kmalloc_node(sizeof(struct sched_group),
 +			sg = kmalloc_node(sizeof(struct sched_group) +
 +					  cpumask_size(),
  					  GFP_KERNEL, i);
  			if (!sg) {
  				printk(KERN_WARNING
@@@ -7684,9 -7612,9 +7701,9 @@@
  				goto error;
  			}
  			sg->__cpu_power = 0;
 -			sg->cpumask = *tmpmask;
 +			cpumask_copy(sched_group_cpus(sg), tmpmask);
  			sg->next = prev->next;
 -			cpus_or(*covered, *covered, *tmpmask);
 +			cpumask_or(covered, covered, tmpmask);
  			prev->next = sg;
  			prev = sg;
  		}
@@@ -7695,22 -7623,22 +7712,22 @@@
  
  	/* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
 -	for_each_cpu_mask_nr(i, *cpu_map) {
 -		struct sched_domain *sd = &per_cpu(cpu_domains, i);
 +	for_each_cpu(i, cpu_map) {
 +		struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
  
  		init_sched_groups_power(i, sd);
  	}
  #endif
  #ifdef CONFIG_SCHED_MC
 -	for_each_cpu_mask_nr(i, *cpu_map) {
 -		struct sched_domain *sd = &per_cpu(core_domains, i);
 +	for_each_cpu(i, cpu_map) {
 +		struct sched_domain *sd = &per_cpu(core_domains, i).sd;
  
  		init_sched_groups_power(i, sd);
  	}
  #endif
  
 -	for_each_cpu_mask_nr(i, *cpu_map) {
 -		struct sched_domain *sd = &per_cpu(phys_domains, i);
 +	for_each_cpu(i, cpu_map) {
 +		struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
  
  		init_sched_groups_power(i, sd);
  	}
@@@ -7722,78 -7650,53 +7739,78 @@@
  	if (sd_allnodes) {
  		struct sched_group *sg;
  
 -		cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
 +		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
  								tmpmask);
  		init_numa_sched_groups_power(sg);
  	}
  #endif
  
  	/* Attach the domains */
 -	for_each_cpu_mask_nr(i, *cpu_map) {
 +	for_each_cpu(i, cpu_map) {
  		struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
 -		sd = &per_cpu(cpu_domains, i);
 +		sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
 -		sd = &per_cpu(core_domains, i);
 +		sd = &per_cpu(core_domains, i).sd;
  #else
 -		sd = &per_cpu(phys_domains, i);
 +		sd = &per_cpu(phys_domains, i).sd;
  #endif
  		cpu_attach_domain(sd, rd, i);
  	}
  
 -	sched_cpumask_free(allmasks);
 -	return 0;
 +	err = 0;
 +
 +free_tmpmask:
 +	free_cpumask_var(tmpmask);
 +free_send_covered:
 +	free_cpumask_var(send_covered);
 +free_this_core_map:
 +	free_cpumask_var(this_core_map);
 +free_this_sibling_map:
 +	free_cpumask_var(this_sibling_map);
 +free_nodemask:
 +	free_cpumask_var(nodemask);
 +free_notcovered:
 +#ifdef CONFIG_NUMA
 +	free_cpumask_var(notcovered);
 +free_covered:
 +	free_cpumask_var(covered);
 +free_domainspan:
 +	free_cpumask_var(domainspan);
 +out:
 +#endif
 +	return err;
 +
 +free_sched_groups:
 +#ifdef CONFIG_NUMA
 +	kfree(sched_group_nodes);
 +#endif
 +	goto free_tmpmask;
  
  #ifdef CONFIG_NUMA
  error:
  	free_sched_groups(cpu_map, tmpmask);
 -	sched_cpumask_free(allmasks);
 -	kfree(rd);
 -	return -ENOMEM;
 +	free_rootdomain(rd);
 +	goto free_tmpmask;
  #endif
  }
  
 -static int build_sched_domains(const cpumask_t *cpu_map)
 +static int build_sched_domains(const struct cpumask *cpu_map)
  {
  	return __build_sched_domains(cpu_map, NULL);
  }
  
 -static cpumask_t *doms_cur;	/* current sched domains */
 +static struct cpumask *doms_cur;	/* current sched domains */
  static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
  static struct sched_domain_attr *dattr_cur;
  				/* attribues of custom domains in 'doms_cur' */
  
  /*
   * Special case: If a kmalloc of a doms_cur partition (array of
 - * cpumask_t) fails, then fallback to a single sched domain,
 - * as determined by the single cpumask_t fallback_doms.
 + * cpumask) fails, then fallback to a single sched domain,
 + * as determined by the single cpumask fallback_doms.
   */
 -static cpumask_t fallback_doms;
 +static cpumask_var_t fallback_doms;
  
  /*
   * arch_update_cpu_topology lets virtualized architectures update the
@@@ -7810,16 -7713,16 +7827,16 @@@ int __attribute__((weak)) arch_update_c
   * For now this just excludes isolated cpus, but could be used to
   * exclude other special cases in the future.
   */
 -static int arch_init_sched_domains(const cpumask_t *cpu_map)
 +static int arch_init_sched_domains(const struct cpumask *cpu_map)
  {
  	int err;
  
  	arch_update_cpu_topology();
  	ndoms_cur = 1;
 -	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 +	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
  	if (!doms_cur)
 -		doms_cur = &fallback_doms;
 -	cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
 +		doms_cur = fallback_doms;
 +	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
  	dattr_cur = NULL;
  	err = build_sched_domains(doms_cur);
  	register_sched_domain_sysctl();
@@@ -7827,8 -7730,8 +7844,8 @@@
  	return err;
  }
  
 -static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
 -				       cpumask_t *tmpmask)
 +static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
 +				       struct cpumask *tmpmask)
  {
  	free_sched_groups(cpu_map, tmpmask);
  }
@@@ -7837,16 -7740,15 +7854,16 @@@
   * Detach sched domains from a group of cpus specified in cpu_map
   * These cpus will now be attached to the NULL domain
   */
 -static void detach_destroy_domains(const cpumask_t *cpu_map)
 +static void detach_destroy_domains(const struct cpumask *cpu_map)
  {
 -	cpumask_t tmpmask;
 +	/* Safe because hotplug lock held. */
 +	static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
  	int i;
  
 -	for_each_cpu_mask_nr(i, *cpu_map)
 +	for_each_cpu(i, cpu_map)
  		cpu_attach_domain(NULL, &def_root_domain, i);
  	synchronize_sched();
 -	arch_destroy_sched_domains(cpu_map, &tmpmask);
 +	arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
  }
  
  /* handle null as "default" */
@@@ -7871,7 -7773,7 +7888,7 @@@ static int dattrs_equal(struct sched_do
   * doms_new[] to the current sched domain partitioning, doms_cur[].
   * It destroys each deleted domain and builds each new domain.
   *
 - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
 + * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
   * The masks don't intersect (don't overlap.) We should setup one
   * sched domain for each mask. CPUs not in any of the cpumasks will
   * not be load balanced. If the same cpumask appears both in the
@@@ -7885,14 -7787,13 +7902,14 @@@
   * the single partition 'fallback_doms', it also forces the domains
   * to be rebuilt.
   *
 - * If doms_new == NULL it will be replaced with cpu_online_map.
 + * If doms_new == NULL it will be replaced with cpu_online_mask.
   * ndoms_new == 0 is a special case for destroying existing domains,
   * and it will not create the default domain.
   *
   * Call with hotplug lock held
   */
 -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 +/* FIXME: Change to struct cpumask *doms_new[] */
 +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
  			     struct sched_domain_attr *dattr_new)
  {
  	int i, j, n;
@@@ -7911,7 -7812,7 +7928,7 @@@
  	/* Destroy deleted domains */
  	for (i = 0; i < ndoms_cur; i++) {
  		for (j = 0; j < n && !new_topology; j++) {
 -			if (cpus_equal(doms_cur[i], doms_new[j])
 +			if (cpumask_equal(&doms_cur[i], &doms_new[j])
  			    && dattrs_equal(dattr_cur, i, dattr_new, j))
  				goto match1;
  		}
@@@ -7923,15 -7824,15 +7940,15 @@@ match1
  
  	if (doms_new == NULL) {
  		ndoms_cur = 0;
 -		doms_new = &fallback_doms;
 -		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
 +		doms_new = fallback_doms;
 +		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
  		WARN_ON_ONCE(dattr_new);
  	}
  
  	/* Build new domains */
  	for (i = 0; i < ndoms_new; i++) {
  		for (j = 0; j < ndoms_cur && !new_topology; j++) {
 -			if (cpus_equal(doms_new[i], doms_cur[j])
 +			if (cpumask_equal(&doms_new[i], &doms_cur[j])
  			    && dattrs_equal(dattr_new, i, dattr_cur, j))
  				goto match2;
  		}
@@@ -7943,7 -7844,7 +7960,7 @@@ match2
  	}
  
  	/* Remember the new sched domains */
 -	if (doms_cur != &fallback_doms)
 +	if (doms_cur != fallback_doms)
  		kfree(doms_cur);
  	kfree(dattr_cur);	/* kfree(NULL) is safe */
  	doms_cur = doms_new;
@@@ -7972,25 -7873,14 +7989,25 @@@ int arch_reinit_sched_domains(void
  static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  {
  	int ret;
 +	unsigned int level = 0;
  
 -	if (buf[0] != '0' && buf[0] != '1')
 +	if (sscanf(buf, "%u", &level) != 1)
 +		return -EINVAL;
 +
 +	/*
 +	 * level is unsigned and so never negative; don't check for
 +	 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
 +	 * What happens on a 0- or 1-byte write?
 +	 * Do we need to check count as well?
 +	 */
 +
 +	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
  		return -EINVAL;
  
  	if (smt)
 -		sched_smt_power_savings = (buf[0] == '1');
 +		sched_smt_power_savings = level;
  	else
 -		sched_mc_power_savings = (buf[0] == '1');
 +		sched_mc_power_savings = level;
  
  	ret = arch_reinit_sched_domains();
  
@@@ -8094,9 -7984,7 +8111,9 @@@ static int update_runtime(struct notifi
  
  void __init sched_init_smp(void)
  {
 -	cpumask_t non_isolated_cpus;
 +	cpumask_var_t non_isolated_cpus;
 +
 +	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
  
  #if defined(CONFIG_NUMA)
  	sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@@ -8105,10 -7993,10 +8122,10 @@@
  #endif
  	get_online_cpus();
  	mutex_lock(&sched_domains_mutex);
 -	arch_init_sched_domains(&cpu_online_map);
 -	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
 -	if (cpus_empty(non_isolated_cpus))
 -		cpu_set(smp_processor_id(), non_isolated_cpus);
 +	arch_init_sched_domains(cpu_online_mask);
 +	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 +	if (cpumask_empty(non_isolated_cpus))
 +		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
  	mutex_unlock(&sched_domains_mutex);
  	put_online_cpus();
  
@@@ -8123,13 -8011,9 +8140,13 @@@
  	init_hrtick();
  
  	/* Move init over to a non-isolated CPU */
 -	if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
 +	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
  		BUG();
  	sched_init_granularity();
 +	free_cpumask_var(non_isolated_cpus);
 +
 +	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 +	init_sched_rt_class();
  }
  #else
  void __init sched_init_smp(void)
@@@ -8444,15 -8328,6 +8461,15 @@@ void __init sched_init(void
  	 */
  	current->sched_class = &fair_sched_class;
  
 +	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
 +	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
 +#ifdef CONFIG_SMP
 +#ifdef CONFIG_NO_HZ
 +	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
 +#endif
 +	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 +#endif /* SMP */
 +
  	scheduler_running = 1;
  }
  
@@@ -9423,6 -9298,41 +9440,41 @@@ cpuacct_destroy(struct cgroup_subsys *s
  	kfree(ca);
  }
  
+ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+ {
+ 	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ 	u64 data;
+ 
+ #ifndef CONFIG_64BIT
+ 	/*
+ 	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ 	 */
+ 	spin_lock_irq(&cpu_rq(cpu)->lock);
+ 	data = *cpuusage;
+ 	spin_unlock_irq(&cpu_rq(cpu)->lock);
+ #else
+ 	data = *cpuusage;
+ #endif
+ 
+ 	return data;
+ }
+ 
+ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+ {
+ 	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ 
+ #ifndef CONFIG_64BIT
+ 	/*
+ 	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ 	 */
+ 	spin_lock_irq(&cpu_rq(cpu)->lock);
+ 	*cpuusage = val;
+ 	spin_unlock_irq(&cpu_rq(cpu)->lock);
+ #else
+ 	*cpuusage = val;
+ #endif
+ }
+ 
  /* return total cpu usage (in nanoseconds) of a group */
  static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
  {
@@@ -9430,17 -9340,8 +9482,8 @@@
  	u64 totalcpuusage = 0;
  	int i;
  
- 	for_each_possible_cpu(i) {
- 		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
- 
- 		/*
- 		 * Take rq->lock to make 64-bit addition safe on 32-bit
- 		 * platforms.
- 		 */
- 		spin_lock_irq(&cpu_rq(i)->lock);
- 		totalcpuusage += *cpuusage;
- 		spin_unlock_irq(&cpu_rq(i)->lock);
- 	}
+ 	for_each_present_cpu(i)
+ 		totalcpuusage += cpuacct_cpuusage_read(ca, i);
  
  	return totalcpuusage;
  }
@@@ -9457,23 -9358,39 +9500,39 @@@ static int cpuusage_write(struct cgrou
  		goto out;
  	}
  
- 	for_each_possible_cpu(i) {
- 		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+ 	for_each_present_cpu(i)
+ 		cpuacct_cpuusage_write(ca, i, 0);
  
- 		spin_lock_irq(&cpu_rq(i)->lock);
- 		*cpuusage = 0;
- 		spin_unlock_irq(&cpu_rq(i)->lock);
- 	}
  out:
  	return err;
  }
  
+ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+ 				   struct seq_file *m)
+ {
+ 	struct cpuacct *ca = cgroup_ca(cgroup);
+ 	u64 percpu;
+ 	int i;
+ 
+ 	for_each_present_cpu(i) {
+ 		percpu = cpuacct_cpuusage_read(ca, i);
+ 		seq_printf(m, "%llu ", (unsigned long long) percpu);
+ 	}
+ 	seq_printf(m, "\n");
+ 	return 0;
+ }
+ 
  static struct cftype files[] = {
  	{
  		.name = "usage",
  		.read_u64 = cpuusage_read,
  		.write_u64 = cpuusage_write,
  	},
+ 	{
+ 		.name = "usage_percpu",
+ 		.read_seq_string = cpuacct_percpu_seq_read,
+ 	},
+ 
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
diff --combined kernel/sched_fair.c
index 36b5e34fa99,5ad4440f0fc..56c0efe902a
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@ -492,6 -492,8 +492,8 @@@ static void update_curr(struct cfs_rq *
  	 * overflow on 32 bits):
  	 */
  	delta_exec = (unsigned long)(now - curr->exec_start);
+ 	if (!delta_exec)
+ 		return;
  
  	__update_curr(cfs_rq, curr, delta_exec);
  	curr->exec_start = now;
@@@ -1017,33 -1019,16 +1019,33 @@@ static void yield_task_fair(struct rq *
   * search starts with cpus closest then further out as needed,
   * so we always favor a closer, idle cpu.
   * Domains may include CPUs that are not usable for migration,
 - * hence we need to mask them out (cpu_active_map)
 + * hence we need to mask them out (cpu_active_mask)
   *
   * Returns the CPU we should wake onto.
   */
  #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
  static int wake_idle(int cpu, struct task_struct *p)
  {
 -	cpumask_t tmp;
  	struct sched_domain *sd;
  	int i;
 +	unsigned int chosen_wakeup_cpu;
 +	int this_cpu;
 +
 +	/*
 +	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
 +	 * are idle and this is not a kernel thread and this task's affinity
 +	 * allows it to be moved to preferred cpu, then just move!
 +	 */
 +
 +	this_cpu = smp_processor_id();
 +	chosen_wakeup_cpu =
 +		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
 +
 +	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
 +		idle_cpu(cpu) && idle_cpu(this_cpu) &&
 +		p->mm && !(p->flags & PF_KTHREAD) &&
 +		cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
 +		return chosen_wakeup_cpu;
  
  	/*
  	 * If it is idle, then it is the best cpu to run this task.
@@@ -1061,9 -1046,10 +1063,9 @@@
  		if ((sd->flags & SD_WAKE_IDLE)
  		    || ((sd->flags & SD_WAKE_IDLE_FAR)
  			&& !task_hot(p, task_rq(p)->clock, sd))) {
 -			cpus_and(tmp, sd->span, p->cpus_allowed);
 -			cpus_and(tmp, tmp, cpu_active_map);
 -			for_each_cpu_mask_nr(i, tmp) {
 -				if (idle_cpu(i)) {
 +			for_each_cpu_and(i, sched_domain_span(sd),
 +					 &p->cpus_allowed) {
 +				if (cpu_active(i) && idle_cpu(i)) {
  					if (i != task_cpu(p)) {
  						schedstat_inc(p,
  						       se.nr_wakeups_idle);
@@@ -1256,13 -1242,13 +1258,13 @@@ static int select_task_rq_fair(struct t
  	 * this_cpu and prev_cpu are present in:
  	 */
  	for_each_domain(this_cpu, sd) {
 -		if (cpu_isset(prev_cpu, sd->span)) {
 +		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
  			this_sd = sd;
  			break;
  		}
  	}
  
 -	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 +	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
  		goto out;
  
  	/*
@@@ -1361,12 -1347,11 +1363,11 @@@ static void check_preempt_wakeup(struc
  {
  	struct task_struct *curr = rq->curr;
  	struct sched_entity *se = &curr->se, *pse = &p->se;
+ 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
  
- 	if (unlikely(rt_prio(p->prio))) {
- 		struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ 	update_curr(cfs_rq);
  
- 		update_rq_clock(rq);
- 		update_curr(cfs_rq);
+ 	if (unlikely(rt_prio(p->prio))) {
  		resched_task(curr);
  		return;
  	}
diff --combined kernel/sched_rt.c
index 1bbd9901401,51d2af3e619..833b6d44483
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@@ -15,7 -15,7 +15,7 @@@ static inline void rt_set_overload(stru
  	if (!rq->online)
  		return;
  
 -	cpu_set(rq->cpu, rq->rd->rto_mask);
 +	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  	/*
  	 * Make sure the mask is visible before we set
  	 * the overload count. That is checked to determine
@@@ -34,7 -34,7 +34,7 @@@ static inline void rt_clear_overload(st
  
  	/* the order here really doesn't matter */
  	atomic_dec(&rq->rd->rto_count);
 -	cpu_clear(rq->cpu, rq->rd->rto_mask);
 +	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  }
  
  static void update_rt_migration(struct rq *rq)
@@@ -77,7 -77,7 +77,7 @@@ static inline u64 sched_rt_period(struc
  }
  
  #define for_each_leaf_rt_rq(rt_rq, rq) \
- 	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+ 	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
  
  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  {
@@@ -139,14 -139,14 +139,14 @@@ static int rt_se_boosted(struct sched_r
  }
  
  #ifdef CONFIG_SMP
 -static inline cpumask_t sched_rt_period_mask(void)
 +static inline const struct cpumask *sched_rt_period_mask(void)
  {
  	return cpu_rq(smp_processor_id())->rd->span;
  }
  #else
 -static inline cpumask_t sched_rt_period_mask(void)
 +static inline const struct cpumask *sched_rt_period_mask(void)
  {
 -	return cpu_online_map;
 +	return cpu_online_mask;
  }
  #endif
  
@@@ -212,9 -212,9 +212,9 @@@ static inline int rt_rq_throttled(struc
  	return rt_rq->rt_throttled;
  }
  
 -static inline cpumask_t sched_rt_period_mask(void)
 +static inline const struct cpumask *sched_rt_period_mask(void)
  {
 -	return cpu_online_map;
 +	return cpu_online_mask;
  }
  
  static inline
@@@ -241,11 -241,11 +241,11 @@@ static int do_balance_runtime(struct rt
  	int i, weight, more = 0;
  	u64 rt_period;
  
 -	weight = cpus_weight(rd->span);
 +	weight = cpumask_weight(rd->span);
  
  	spin_lock(&rt_b->rt_runtime_lock);
  	rt_period = ktime_to_ns(rt_b->rt_period);
 -	for_each_cpu_mask_nr(i, rd->span) {
 +	for_each_cpu(i, rd->span) {
  		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
  		s64 diff;
  
@@@ -324,7 -324,7 +324,7 @@@ static void __disable_runtime(struct r
  		/*
  		 * Greedy reclaim, take back as much as we can.
  		 */
 -		for_each_cpu_mask(i, rd->span) {
 +		for_each_cpu(i, rd->span) {
  			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
  			s64 diff;
  
@@@ -429,13 -429,13 +429,13 @@@ static inline int balance_runtime(struc
  static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
  {
  	int i, idle = 1;
 -	cpumask_t span;
 +	const struct cpumask *span;
  
  	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  		return 1;
  
  	span = sched_rt_period_mask();
 -	for_each_cpu_mask(i, span) {
 +	for_each_cpu(i, span) {
  		int enqueue = 0;
  		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
  		struct rq *rq = rq_of_rt_rq(rt_rq);
@@@ -805,20 -805,17 +805,20 @@@ static int select_task_rq_rt(struct tas
  
  static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  {
 -	cpumask_t mask;
 +	cpumask_var_t mask;
  
  	if (rq->curr->rt.nr_cpus_allowed == 1)
  		return;
  
 -	if (p->rt.nr_cpus_allowed != 1
 -	    && cpupri_find(&rq->rd->cpupri, p, &mask))
 +	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
  		return;
  
 -	if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
 -		return;
 +	if (p->rt.nr_cpus_allowed != 1
 +	    && cpupri_find(&rq->rd->cpupri, p, mask))
 +		goto free;
 +
 +	if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
 +		goto free;
  
  	/*
  	 * There appears to be other cpus that can accept
@@@ -827,8 -824,6 +827,8 @@@
  	 */
  	requeue_task_rt(rq, p, 1);
  	resched_task(rq->curr);
 +free:
 +	free_cpumask_var(mask);
  }
  
  #endif /* CONFIG_SMP */
@@@ -919,7 -914,7 +919,7 @@@ static void deactivate_task(struct rq *
  static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
  {
  	if (!task_running(rq, p) &&
 -	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
 +	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
  	    (p->rt.nr_cpus_allowed > 1))
  		return 1;
  	return 0;
@@@ -958,7 -953,7 +958,7 @@@ static struct task_struct *pick_next_hi
  	return next;
  }
  
 -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
 +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
  
  static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
  {
@@@ -978,7 -973,7 +978,7 @@@
  static int find_lowest_rq(struct task_struct *task)
  {
  	struct sched_domain *sd;
 -	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
 +	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
  	int this_cpu = smp_processor_id();
  	int cpu      = task_cpu(task);
  
@@@ -993,7 -988,7 +993,7 @@@
  	 * I guess we might want to change cpupri_find() to ignore those
  	 * in the first place.
  	 */
 -	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
 +	cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
  
  	/*
  	 * At this point we have built a mask of cpus representing the
@@@ -1003,7 -998,7 +1003,7 @@@
  	 * We prioritize the last cpu that the task executed on since
  	 * it is most likely cache-hot in that location.
  	 */
 -	if (cpu_isset(cpu, *lowest_mask))
 +	if (cpumask_test_cpu(cpu, lowest_mask))
  		return cpu;
  
  	/*
@@@ -1018,8 -1013,7 +1018,8 @@@
  			cpumask_t domain_mask;
  			int       best_cpu;
  
 -			cpus_and(domain_mask, sd->span, *lowest_mask);
 +			cpumask_and(&domain_mask, sched_domain_span(sd),
 +				    lowest_mask);
  
  			best_cpu = pick_optimal_cpu(this_cpu,
  						    &domain_mask);
@@@ -1060,8 -1054,8 +1060,8 @@@ static struct rq *find_lock_lowest_rq(s
  			 * Also make sure that it wasn't scheduled on its rq.
  			 */
  			if (unlikely(task_rq(task) != rq ||
 -				     !cpu_isset(lowest_rq->cpu,
 -						task->cpus_allowed) ||
 +				     !cpumask_test_cpu(lowest_rq->cpu,
 +						       &task->cpus_allowed) ||
  				     task_running(rq, task) ||
  				     !task->se.on_rq)) {
  
@@@ -1182,7 -1176,7 +1182,7 @@@ static int pull_rt_task(struct rq *this
  
  	next = pick_next_task_rt(this_rq);
  
 -	for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
 +	for_each_cpu(cpu, this_rq->rd->rto_mask) {
  		if (this_cpu == cpu)
  			continue;
  
@@@ -1311,9 -1305,9 +1311,9 @@@ move_one_task_rt(struct rq *this_rq, in
  }
  
  static void set_cpus_allowed_rt(struct task_struct *p,
 -				const cpumask_t *new_mask)
 +				const struct cpumask *new_mask)
  {
 -	int weight = cpus_weight(*new_mask);
 +	int weight = cpumask_weight(new_mask);
  
  	BUG_ON(!rt_task(p));
  
@@@ -1334,7 -1328,7 +1334,7 @@@
  		update_rt_migration(rq);
  	}
  
 -	p->cpus_allowed    = *new_mask;
 +	cpumask_copy(&p->cpus_allowed, new_mask);
  	p->rt.nr_cpus_allowed = weight;
  }
  
@@@ -1377,14 -1371,6 +1377,14 @@@ static void switched_from_rt(struct rq 
  	if (!rq->rt.rt_nr_running)
  		pull_rt_task(rq);
  }
 +
 +static inline void init_sched_rt_class(void)
 +{
 +	unsigned int i;
 +
 +	for_each_possible_cpu(i)
 +		alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
 +}
  #endif /* CONFIG_SMP */
  
  /*
@@@ -1555,4 -1541,3 +1555,4 @@@ static void print_rt_stats(struct seq_f
  	rcu_read_unlock();
  }
  #endif /* CONFIG_SCHED_DEBUG */
 +
diff --combined kernel/sched_stats.h
index 5fcf0e18458,b59fd9cdc1b..f2773b5d122
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@@ -31,7 -31,7 +31,7 @@@ static int show_schedstat(struct seq_fi
  		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
  		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
  		    rq->ttwu_count, rq->ttwu_local,
- 		    rq->rq_sched_info.cpu_time,
+ 		    rq->rq_cpu_time,
  		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
  
  		seq_printf(seq, "\n");
@@@ -42,8 -42,7 +42,8 @@@
  		for_each_domain(cpu, sd) {
  			enum cpu_idle_type itype;
  
 -			cpumask_scnprintf(mask_str, mask_len, &sd->span);
 +			cpumask_scnprintf(mask_str, mask_len,
 +					  sched_domain_span(sd));
  			seq_printf(seq, "domain%d %s", dcount++, mask_str);
  			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
  					itype++) {
@@@ -124,7 -123,7 +124,7 @@@ static inline voi
  rq_sched_info_depart(struct rq *rq, unsigned long long delta)
  {
  	if (rq)
- 		rq->rq_sched_info.cpu_time += delta;
+ 		rq->rq_cpu_time += delta;
  }
  
  static inline void
@@@ -237,7 -236,6 +237,6 @@@ static inline void sched_info_depart(st
  	unsigned long long delta = task_rq(t)->clock -
  					t->sched_info.last_arrival;
  
- 	t->sched_info.cpu_time += delta;
  	rq_sched_info_depart(task_rq(t), delta);
  
  	if (t->state == TASK_RUNNING)
diff --combined kernel/time/tick-sched.c
index 70f872c71f4,8f3fc2582d3..76a574bbef9
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@@ -144,7 -144,7 +144,7 @@@ void tick_nohz_update_jiffies(void
  	if (!ts->tick_stopped)
  		return;
  
 -	cpu_clear(cpu, nohz_cpu_mask);
 +	cpumask_clear_cpu(cpu, nohz_cpu_mask);
  	now = ktime_get();
  	ts->idle_waketime = now;
  
@@@ -247,7 -247,7 +247,7 @@@ void tick_nohz_stop_sched_tick(int inid
  	if (need_resched())
  		goto end;
  
- 	if (unlikely(local_softirq_pending())) {
+ 	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
  		static int ratelimit;
  
  		if (ratelimit < 10) {
@@@ -282,8 -282,31 +282,31 @@@
  	/* Schedule the tick, if we are at least one jiffie off */
  	if ((long)delta_jiffies >= 1) {
  
+ 		/*
+ 		* calculate the expiry time for the next timer wheel
+ 		* timer
+ 		*/
+ 		expires = ktime_add_ns(last_update, tick_period.tv64 *
+ 				   delta_jiffies);
+ 
+ 		/*
+ 		 * If this cpu is the one which updates jiffies, then
+ 		 * give up the assignment and let it be taken by the
+ 		 * cpu which runs the tick timer next, which might be
+ 		 * this cpu as well. If we don't drop this here the
+ 		 * jiffies might be stale and do_timer() never
+ 		 * invoked.
+ 		 */
+ 		if (cpu == tick_do_timer_cpu)
+ 			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ 
  		if (delta_jiffies > 1)
 -			cpu_set(cpu, nohz_cpu_mask);
 +			cpumask_set_cpu(cpu, nohz_cpu_mask);
+ 
+ 		/* Skip reprogram of event if it's not changed */
+ 		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+ 			goto out;
+ 
  		/*
  		 * nohz_stop_sched_tick can be called several times before
  		 * the nohz_restart_sched_tick is called. This happens when
@@@ -296,7 -319,7 +319,7 @@@
  				/*
  				 * sched tick not stopped!
  				 */
 -				cpu_clear(cpu, nohz_cpu_mask);
 +				cpumask_clear_cpu(cpu, nohz_cpu_mask);
  				goto out;
  			}
  
@@@ -306,17 -329,6 +329,6 @@@
  			rcu_enter_nohz();
  		}
  
- 		/*
- 		 * If this cpu is the one which updates jiffies, then
- 		 * give up the assignment and let it be taken by the
- 		 * cpu which runs the tick timer next, which might be
- 		 * this cpu as well. If we don't drop this here the
- 		 * jiffies might be stale and do_timer() never
- 		 * invoked.
- 		 */
- 		if (cpu == tick_do_timer_cpu)
- 			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- 
  		ts->idle_sleeps++;
  
  		/*
@@@ -332,12 -344,7 +344,7 @@@
  			goto out;
  		}
  
- 		/*
- 		 * calculate the expiry time for the next timer wheel
- 		 * timer
- 		 */
- 		expires = ktime_add_ns(last_update, tick_period.tv64 *
- 				       delta_jiffies);
+ 		/* Mark expiries */
  		ts->idle_expires = expires;
  
  		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
@@@ -354,7 -361,7 +361,7 @@@
  		 * softirq.
  		 */
  		tick_do_update_jiffies64(ktime_get());
 -		cpu_clear(cpu, nohz_cpu_mask);
 +		cpumask_clear_cpu(cpu, nohz_cpu_mask);
  	}
  	raise_softirq_irqoff(TIMER_SOFTIRQ);
  out:
@@@ -432,7 -439,7 +439,7 @@@ void tick_nohz_restart_sched_tick(void
  	select_nohz_load_balancer(0);
  	now = ktime_get();
  	tick_do_update_jiffies64(now);
 -	cpu_clear(cpu, nohz_cpu_mask);
 +	cpumask_clear_cpu(cpu, nohz_cpu_mask);
  
  	/*
  	 * We stopped the tick in idle. Update process times would miss the
@@@ -681,7 -688,6 +688,6 @@@ void tick_setup_sched_timer(void
  	 */
  	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
  	ts->sched_timer.function = tick_sched_timer;
- 	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  
  	/* Get the next period (per cpu) */
  	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());