From: Linus Torvalds
Date: Fri, 2 Jan 2009 19:44:09 +0000 (-0800)
Subject: Merge branch 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel...
X-Git-Tag: v2.6.29-rc1~538
X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=b840d79631c882786925303c2b0f4fefc31845ed;hp=-c;p=linux-2.6-omap-h63xx.git

Merge branch 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (66 commits)
  x86: export vector_used_by_percpu_irq
  x86: use logical apicid in x2apic_cluster's x2apic_cpu_mask_to_apicid_and()
  sched: nominate preferred wakeup cpu, fix
  x86: fix lguest used_vectors breakage, -v2
  x86: fix warning in arch/x86/kernel/io_apic.c
  sched: fix warning in kernel/sched.c
  sched: move test_sd_parent() to an SMP section of sched.h
  sched: add SD_BALANCE_NEWIDLE at MC and CPU level for sched_mc>0
  sched: activate active load balancing in new idle cpus
  sched: bias task wakeups to preferred semi-idle packages
  sched: nominate preferred wakeup cpu
  sched: favour lower logical cpu number for sched_mc balance
  sched: framework for sched_mc/smt_power_savings=N
  sched: convert BALANCE_FOR_xx_POWER to inline functions
  x86: use possible_cpus=NUM to extend the possible cpus allowed
  x86: fix cpu_mask_to_apicid_and to include cpu_online_mask
  x86: update io_apic.c to the new cpumask code
  x86: Introduce topology_core_cpumask()/topology_thread_cpumask()
  x86: xen: use smp_call_function_many()
  x86: use work_on_cpu in x86/kernel/cpu/mcheck/mce_amd_64.c
  ...

Fixed up trivial conflict in kernel/time/tick-sched.c manually

--- b840d79631c882786925303c2b0f4fefc31845ed
diff --combined arch/arm/kernel/smp.c
index 019237d2162,bd905c0a736..55fa7ff96a3
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@@ -33,16 -33,6 +33,6 @@@ #include #include - /* - * bitmask of present and online CPUs. - * The present bitmask indicates that the CPU is physically present. - * The online bitmask indicates that the CPU is up and running. - */ - cpumask_t cpu_possible_map; - EXPORT_SYMBOL(cpu_possible_map); - cpumask_t cpu_online_map; - EXPORT_SYMBOL(cpu_online_map); - /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core
@@@ -181,7 -171,7 +171,7 @@@ int __cpuexit __cpu_disable(void /* * Stop the local timer for this CPU. */ - local_timer_stop(cpu); + local_timer_stop(); /* * Flush user cache and TLB mappings, and then remove this CPU
@@@ -284,7 -274,7 +274,7 @@@ asmlinkage void __cpuinit secondary_sta /* * Setup local timer for this CPU. */ - local_timer_setup(cpu); + local_timer_setup(); calibrate_delay();
diff --combined arch/arm/mach-at91/at91rm9200_time.c
index d140eae53de,72f51d39202..1ff1bda0a89
--- a/arch/arm/mach-at91/at91rm9200_time.c
+++ b/arch/arm/mach-at91/at91rm9200_time.c
@@@ -141,15 -141,6 +141,15 @@@ clkevt32k_next_event(unsigned long delt /* Use "raw" primitives so we behave correctly on RT kernels. */ raw_local_irq_save(flags); + /* + * According to Thomas Gleixner irqs are already disabled here. Simply + * removing raw_local_irq_save above (and the matching + * raw_local_irq_restore) was not accepted. See + * http://thread.gmane.org/gmane.linux.ports.arm.kernel/41174 + * So for now (2008-11-20) just warn once if irqs were not disabled ... + */ + WARN_ON_ONCE(!raw_irqs_disabled_flags(flags)); + /* The alarm IRQ uses absolute time (now+delta), not the relative * time (delta) in our calling convention.
Like all clockevents * using such "match" hardware, we have a race to defend against. @@@ -178,7 -169,6 +178,6 @@@ static struct clock_event_device clkev .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 150, - .cpumask = CPU_MASK_CPU0, .set_next_event = clkevt32k_next_event, .set_mode = clkevt32k_mode, }; @@@ -206,7 -196,7 +205,7 @@@ void __init at91rm9200_timer_init(void clkevt.mult = div_sc(AT91_SLOW_CLOCK, NSEC_PER_SEC, clkevt.shift); clkevt.max_delta_ns = clockevent_delta2ns(AT91_ST_ALMV, &clkevt); clkevt.min_delta_ns = clockevent_delta2ns(2, &clkevt) + 1; - clkevt.cpumask = cpumask_of_cpu(0); + clkevt.cpumask = cpumask_of(0); clockevents_register_device(&clkevt); /* register clocksource */ diff --combined arch/arm/mach-pxa/time.c index 00162415851,bf3c9a4aad5..95656a72268 --- a/arch/arm/mach-pxa/time.c +++ b/arch/arm/mach-pxa/time.c @@@ -22,8 -22,8 +22,8 @@@ #include #include #include +#include #include -#include /* * This is PXA's sched_clock implementation. This has a resolution @@@ -122,7 -122,6 +122,6 @@@ static struct clock_event_device ckevt_ .features = CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 200, - .cpumask = CPU_MASK_CPU0, .set_next_event = pxa_osmr0_set_next_event, .set_mode = pxa_osmr0_set_mode, }; @@@ -150,11 -149,18 +149,11 @@@ static struct irqaction pxa_ost0_irq = static void __init pxa_timer_init(void) { - unsigned long clock_tick_rate; + unsigned long clock_tick_rate = get_clock_tick_rate(); OIER = 0; OSSR = OSSR_M0 | OSSR_M1 | OSSR_M2 | OSSR_M3; - if (cpu_is_pxa25x()) - clock_tick_rate = 3686400; - else if (machine_is_mainstone()) - clock_tick_rate = 3249600; - else - clock_tick_rate = 3250000; - set_oscr2ns_scale(clock_tick_rate); ckevt_pxa_osmr0.mult = @@@ -163,6 -169,7 +162,7 @@@ clockevent_delta2ns(0x7fffffff, &ckevt_pxa_osmr0); ckevt_pxa_osmr0.min_delta_ns = clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_pxa_osmr0) + 1; + ckevt_pxa_osmr0.cpumask = cpumask_of(0); cksrc_pxa_oscr0.mult = clocksource_hz2mult(clock_tick_rate, cksrc_pxa_oscr0.shift); diff --combined arch/arm/mach-realview/core.c index 5f1d55963ce,b07cb9b7adb..bd2aa4f1614 --- a/arch/arm/mach-realview/core.c +++ b/arch/arm/mach-realview/core.c @@@ -28,14 -28,11 +28,14 @@@ #include #include #include +#include +#include #include #include #include #include +#include #include #include @@@ -52,7 -49,7 +52,7 @@@ #define REALVIEW_REFCOUNTER (__io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_24MHz_OFFSET) -/* used by entry-macro.S */ +/* used by entry-macro.S and platsmp.c */ void __iomem *gic_cpu_base_addr; /* @@@ -127,29 -124,6 +127,29 @@@ int realview_flash_register(struct reso return platform_device_register(&realview_flash_device); } +static struct smc911x_platdata realview_smc911x_platdata = { + .flags = SMC911X_USE_32BIT, + .irq_flags = IRQF_SHARED, + .irq_polarity = 1, +}; + +static struct platform_device realview_eth_device = { + .name = "smc911x", + .id = 0, + .num_resources = 2, +}; + +int realview_eth_register(const char *name, struct resource *res) +{ + if (name) + realview_eth_device.name = name; + realview_eth_device.resource = res; + if (strcmp(realview_eth_device.name, "smc911x") == 0) + realview_eth_device.dev.platform_data = &realview_smc911x_platdata; + + return platform_device_register(&realview_eth_device); +} + static struct resource realview_i2c_resource = { .start = REALVIEW_I2C_BASE, .end = REALVIEW_I2C_BASE + SZ_4K - 1, @@@ -203,14 -177,9 +203,14 @@@ static const struct icst307_params real static void realview_oscvco_set(struct clk *clk, struct 
icst307_vco vco) { void __iomem *sys_lock = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_LOCK_OFFSET; - void __iomem *sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC4_OFFSET; + void __iomem *sys_osc; u32 val; + if (machine_is_realview_pb1176()) + sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC0_OFFSET; + else + sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC4_OFFSET; + val = readl(sys_osc) & ~0x7ffff; val |= vco.v | (vco.r << 9) | (vco.s << 16); @@@ -219,59 -188,12 +219,59 @@@ writel(0, sys_lock); } -struct clk realview_clcd_clk = { - .name = "CLCDCLK", +static struct clk oscvco_clk = { .params = &realview_oscvco_params, .setvco = realview_oscvco_set, }; +/* + * These are fixed clocks. + */ +static struct clk ref24_clk = { + .rate = 24000000, +}; + +static struct clk_lookup lookups[] = { + { /* UART0 */ + .dev_id = "dev:f1", + .clk = &ref24_clk, + }, { /* UART1 */ + .dev_id = "dev:f2", + .clk = &ref24_clk, + }, { /* UART2 */ + .dev_id = "dev:f3", + .clk = &ref24_clk, + }, { /* UART3 */ + .dev_id = "fpga:09", + .clk = &ref24_clk, + }, { /* KMI0 */ + .dev_id = "fpga:06", + .clk = &ref24_clk, + }, { /* KMI1 */ + .dev_id = "fpga:07", + .clk = &ref24_clk, + }, { /* MMC0 */ + .dev_id = "fpga:05", + .clk = &ref24_clk, + }, { /* EB:CLCD */ + .dev_id = "dev:20", + .clk = &oscvco_clk, + }, { /* PB:CLCD */ + .dev_id = "issp:20", + .clk = &oscvco_clk, + } +}; + +static int __init clk_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lookups); i++) + clkdev_add(&lookups[i]); + return 0; +} +arch_initcall(clk_init); + /* * CLCD support. */ @@@ -304,30 -226,7 +304,30 @@@ static struct clcd_panel vga = .width = -1, .height = -1, .tim2 = TIM2_BCD | TIM2_IPC, - .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1), + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), + .bpp = 16, +}; + +static struct clcd_panel xvga = { + .mode = { + .name = "XVGA", + .refresh = 60, + .xres = 1024, + .yres = 768, + .pixclock = 15748, + .left_margin = 152, + .right_margin = 48, + .upper_margin = 23, + .lower_margin = 3, + .hsync_len = 104, + .vsync_len = 4, + .sync = 0, + .vmode = FB_VMODE_NONINTERLACED, + }, + .width = -1, + .height = -1, + .tim2 = TIM2_BCD | TIM2_IPC, + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), .bpp = 16, }; @@@ -350,7 -249,7 +350,7 @@@ static struct clcd_panel sanyo_3_8_in .width = -1, .height = -1, .tim2 = TIM2_BCD, - .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1), + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), .bpp = 16, }; @@@ -373,7 -272,7 +373,7 @@@ static struct clcd_panel sanyo_2_5_in .width = -1, .height = -1, .tim2 = TIM2_IVS | TIM2_IHS | TIM2_IPC, - .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1), + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), .bpp = 16, }; @@@ -396,7 -295,7 +396,7 @@@ static struct clcd_panel epson_2_2_in .width = -1, .height = -1, .tim2 = TIM2_BCD | TIM2_IPC, - .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1), + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), .bpp = 16, }; @@@ -409,15 -308,9 +409,15 @@@ static struct clcd_panel *realview_clcd_panel(void) { void __iomem *sys_clcd = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_CLCD_OFFSET; - struct clcd_panel *panel = &vga; + struct clcd_panel *vga_panel; + struct clcd_panel *panel; u32 val; + if (machine_is_realview_eb()) + vga_panel = &vga; + else + vga_panel = &xvga; + val = readl(sys_clcd) & SYS_CLCD_ID_MASK; if (val == SYS_CLCD_ID_SANYO_3_8) panel = &sanyo_3_8_in; @@@ -426,11 -319,11 +426,11 @@@ else if (val == SYS_CLCD_ID_EPSON_2_2) panel = &epson_2_2_in; else if (val == SYS_CLCD_ID_VGA) - 
panel = &vga; + panel = vga_panel; else { printk(KERN_ERR "CLCD: unknown LCD panel ID 0x%08x, using VGA\n", val); - panel = &vga; + panel = vga_panel; } return panel; @@@ -465,18 -358,12 +465,18 @@@ static void realview_clcd_enable(struc writel(val, sys_clcd); } -static unsigned long framesize = SZ_1M; - static int realview_clcd_setup(struct clcd_fb *fb) { + unsigned long framesize; dma_addr_t dma; + if (machine_is_realview_eb()) + /* VGA, 16bpp */ + framesize = 640 * 480 * 2; + else + /* XVGA, 16bpp */ + framesize = 1024 * 768 * 2; + fb->panel = realview_clcd_panel(); fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, framesize, @@@ -624,7 -511,7 +624,7 @@@ static struct clock_event_device timer0 .set_mode = timer_set_mode, .set_next_event = timer_set_next_event, .rating = 300, - .cpumask = CPU_MASK_ALL, + .cpumask = cpu_all_mask, }; static void __init realview_clockevents_init(unsigned int timer_irq) @@@ -701,7 -588,7 +701,7 @@@ void __init realview_timer_init(unsigne * The dummy clock device has to be registered before the main device * so that the latter will broadcast the clock events */ - local_timer_setup(smp_processor_id()); + local_timer_setup(); #endif /* diff --combined arch/arm/mach-realview/localtimer.c index 9019ef2e561,504961ef343..67d6d9cc68b --- a/arch/arm/mach-realview/localtimer.c +++ b/arch/arm/mach-realview/localtimer.c @@@ -38,14 -38,18 +38,14 @@@ void local_timer_interrupt(void #ifdef CONFIG_LOCAL_TIMERS -#define TWD_BASE(cpu) (twd_base_addr + (cpu) * twd_size) - /* set up by the platform code */ -void __iomem *twd_base_addr; -unsigned int twd_size; +void __iomem *twd_base; static unsigned long mpcore_timer_rate; static void local_timer_set_mode(enum clock_event_mode mode, struct clock_event_device *clk) { - void __iomem *base = TWD_BASE(smp_processor_id()); unsigned long ctrl; switch(mode) { @@@ -64,16 -68,17 +64,16 @@@ ctrl = 0; } - __raw_writel(ctrl, base + TWD_TIMER_CONTROL); + __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); } static int local_timer_set_next_event(unsigned long evt, struct clock_event_device *unused) { - void __iomem *base = TWD_BASE(smp_processor_id()); - unsigned long ctrl = __raw_readl(base + TWD_TIMER_CONTROL); + unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL); - __raw_writel(evt, base + TWD_TIMER_COUNTER); - __raw_writel(ctrl | TWD_TIMER_CONTROL_ENABLE, base + TWD_TIMER_CONTROL); + __raw_writel(evt, twd_base + TWD_TIMER_COUNTER); + __raw_writel(ctrl | TWD_TIMER_CONTROL_ENABLE, twd_base + TWD_TIMER_CONTROL); return 0; } @@@ -86,16 -91,19 +86,16 @@@ */ int local_timer_ack(void) { - void __iomem *base = TWD_BASE(smp_processor_id()); - - if (__raw_readl(base + TWD_TIMER_INTSTAT)) { - __raw_writel(1, base + TWD_TIMER_INTSTAT); + if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) { + __raw_writel(1, twd_base + TWD_TIMER_INTSTAT); return 1; } return 0; } -static void __cpuinit twd_calibrate_rate(unsigned int cpu) +static void __cpuinit twd_calibrate_rate(void) { - void __iomem *base = TWD_BASE(cpu); unsigned long load, count; u64 waitjiffies; @@@ -116,15 -124,15 +116,15 @@@ waitjiffies += 5; /* enable, no interrupt or reload */ - __raw_writel(0x1, base + TWD_TIMER_CONTROL); + __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL); /* maximum value */ - __raw_writel(0xFFFFFFFFU, base + TWD_TIMER_COUNTER); + __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); while (get_jiffies_64() < waitjiffies) udelay(10); - count = __raw_readl(base + TWD_TIMER_COUNTER); + count = __raw_readl(twd_base + TWD_TIMER_COUNTER); mpcore_timer_rate = 
(0xFFFFFFFFU - count) * (HZ / 5); @@@ -134,19 -142,18 +134,19 @@@ load = mpcore_timer_rate / HZ; - __raw_writel(load, base + TWD_TIMER_LOAD); + __raw_writel(load, twd_base + TWD_TIMER_LOAD); } /* * Setup the local clock events for a CPU. */ -void __cpuinit local_timer_setup(unsigned int cpu) +void __cpuinit local_timer_setup(void) { + unsigned int cpu = smp_processor_id(); struct clock_event_device *clk = &per_cpu(local_clockevent, cpu); unsigned long flags; - twd_calibrate_rate(cpu); + twd_calibrate_rate(); clk->name = "local_timer"; clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; @@@ -154,7 -161,7 +154,7 @@@ clk->set_mode = local_timer_set_mode; clk->set_next_event = local_timer_set_next_event; clk->irq = IRQ_LOCALTIMER; - clk->cpumask = cpumask_of_cpu(cpu); + clk->cpumask = cpumask_of(cpu); clk->shift = 20; clk->mult = div_sc(mpcore_timer_rate, NSEC_PER_SEC, clk->shift); clk->max_delta_ns = clockevent_delta2ns(0xffffffff, clk); @@@ -171,9 -178,9 +171,9 @@@ /* * take a local timer down */ -void __cpuexit local_timer_stop(unsigned int cpu) +void __cpuexit local_timer_stop(void) { - __raw_writel(0, TWD_BASE(cpu) + TWD_TIMER_CONTROL); + __raw_writel(0, twd_base + TWD_TIMER_CONTROL); } #else /* CONFIG_LOCAL_TIMERS */ @@@ -183,9 -190,8 +183,9 @@@ static void dummy_timer_set_mode(enum c { } -void __cpuinit local_timer_setup(unsigned int cpu) +void __cpuinit local_timer_setup(void) { + unsigned int cpu = smp_processor_id(); struct clock_event_device *clk = &per_cpu(local_clockevent, cpu); clk->name = "dummy_timer"; @@@ -193,7 -199,7 +193,7 @@@ clk->rating = 200; clk->set_mode = dummy_timer_set_mode; clk->broadcast = smp_timer_broadcast; - clk->cpumask = cpumask_of_cpu(cpu); + clk->cpumask = cpumask_of(cpu); clockevents_register_device(clk); } diff --combined arch/arm/mach-sa1100/time.c index 8c5e727f3b7,1cac4ac0b4b..711c0295c66 --- a/arch/arm/mach-sa1100/time.c +++ b/arch/arm/mach-sa1100/time.c @@@ -2,8 -2,8 +2,8 @@@ * linux/arch/arm/mach-sa1100/time.c * * Copyright (C) 1998 Deborah Wallach. - * Twiddles (C) 1999 Hugo Fiennes - * + * Twiddles (C) 1999 Hugo Fiennes + * * 2000/03/29 (C) Nicolas Pitre * Rewritten: big cleanup, much simpler, better HZ accuracy. 
* @@@ -73,7 -73,6 +73,6 @@@ static struct clock_event_device ckevt_ .features = CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 200, - .cpumask = CPU_MASK_CPU0, .set_next_event = sa1100_osmr0_set_next_event, .set_mode = sa1100_osmr0_set_mode, }; @@@ -110,6 -109,7 +109,7 @@@ static void __init sa1100_timer_init(vo clockevent_delta2ns(0x7fffffff, &ckevt_sa1100_osmr0); ckevt_sa1100_osmr0.min_delta_ns = clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_sa1100_osmr0) + 1; + ckevt_sa1100_osmr0.cpumask = cpumask_of(0); cksrc_sa1100_oscr.mult = clocksource_hz2mult(CLOCK_TICK_RATE, cksrc_sa1100_oscr.shift); diff --combined arch/arm/mach-versatile/core.c index df25aa13850,a3f1933434e..1c43494f5c4 --- a/arch/arm/mach-versatile/core.c +++ b/arch/arm/mach-versatile/core.c @@@ -31,7 -31,6 +31,7 @@@ #include #include +#include #include #include #include @@@ -374,60 -373,22 +374,60 @@@ static const struct icst307_params vers static void versatile_oscvco_set(struct clk *clk, struct icst307_vco vco) { - void __iomem *sys_lock = __io_address(VERSATILE_SYS_BASE) + VERSATILE_SYS_LOCK_OFFSET; - void __iomem *sys_osc = __io_address(VERSATILE_SYS_BASE) + VERSATILE_SYS_OSCCLCD_OFFSET; + void __iomem *sys = __io_address(VERSATILE_SYS_BASE); + void __iomem *sys_lock = sys + VERSATILE_SYS_LOCK_OFFSET; u32 val; - val = readl(sys_osc) & ~0x7ffff; + val = readl(sys + clk->oscoff) & ~0x7ffff; val |= vco.v | (vco.r << 9) | (vco.s << 16); writel(0xa05f, sys_lock); - writel(val, sys_osc); + writel(val, sys + clk->oscoff); writel(0, sys_lock); } -static struct clk versatile_clcd_clk = { - .name = "CLCDCLK", +static struct clk osc4_clk = { .params = &versatile_oscvco_params, - .setvco = versatile_oscvco_set, + .oscoff = VERSATILE_SYS_OSCCLCD_OFFSET, + .setvco = versatile_oscvco_set, +}; + +/* + * These are fixed clocks. + */ +static struct clk ref24_clk = { + .rate = 24000000, +}; + +static struct clk_lookup lookups[] __initdata = { + { /* UART0 */ + .dev_id = "dev:f1", + .clk = &ref24_clk, + }, { /* UART1 */ + .dev_id = "dev:f2", + .clk = &ref24_clk, + }, { /* UART2 */ + .dev_id = "dev:f3", + .clk = &ref24_clk, + }, { /* UART3 */ + .dev_id = "fpga:09", + .clk = &ref24_clk, + }, { /* KMI0 */ + .dev_id = "fpga:06", + .clk = &ref24_clk, + }, { /* KMI1 */ + .dev_id = "fpga:07", + .clk = &ref24_clk, + }, { /* MMC0 */ + .dev_id = "fpga:05", + .clk = &ref24_clk, + }, { /* MMC1 */ + .dev_id = "fpga:0b", + .clk = &ref24_clk, + }, { /* CLCD */ + .dev_id = "dev:20", + .clk = &osc4_clk, + } }; /* @@@ -825,8 -786,7 +825,8 @@@ void __init versatile_init(void { int i; - clk_register(&versatile_clcd_clk); + for (i = 0; i < ARRAY_SIZE(lookups); i++) + clkdev_add(&lookups[i]); platform_device_register(&versatile_flash_device); platform_device_register(&versatile_i2c_device); @@@ -1005,7 -965,7 +1005,7 @@@ static void __init versatile_timer_init timer0_clockevent.min_delta_ns = clockevent_delta2ns(0xf, &timer0_clockevent); - timer0_clockevent.cpumask = cpumask_of_cpu(0); + timer0_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&timer0_clockevent); } diff --combined arch/powerpc/kernel/smp.c index 8ac3f721d23,d1165566f06..65484b2200b --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@@ -57,15 -57,12 +57,11 @@@ #define DBG(fmt...) 
#endif -int smp_hw_index[NR_CPUS]; struct thread_info *secondary_ti; - cpumask_t cpu_possible_map = CPU_MASK_NONE; - cpumask_t cpu_online_map = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_core_map) = CPU_MASK_NONE; - EXPORT_SYMBOL(cpu_online_map); - EXPORT_SYMBOL(cpu_possible_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); @@@ -122,65 -119,6 +118,65 @@@ void smp_message_recv(int msg } } +static irqreturn_t call_function_action(int irq, void *data) +{ + generic_smp_call_function_interrupt(); + return IRQ_HANDLED; +} + +static irqreturn_t reschedule_action(int irq, void *data) +{ + /* we just need the return path side effect of checking need_resched */ + return IRQ_HANDLED; +} + +static irqreturn_t call_function_single_action(int irq, void *data) +{ + generic_smp_call_function_single_interrupt(); + return IRQ_HANDLED; +} + +static irqreturn_t debug_ipi_action(int irq, void *data) +{ + smp_message_recv(PPC_MSG_DEBUGGER_BREAK); + return IRQ_HANDLED; +} + +static irq_handler_t smp_ipi_action[] = { + [PPC_MSG_CALL_FUNCTION] = call_function_action, + [PPC_MSG_RESCHEDULE] = reschedule_action, + [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action, + [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action, +}; + +const char *smp_ipi_name[] = { + [PPC_MSG_CALL_FUNCTION] = "ipi call function", + [PPC_MSG_RESCHEDULE] = "ipi reschedule", + [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single", + [PPC_MSG_DEBUGGER_BREAK] = "ipi debugger", +}; + +/* optional function to request ipi, for controllers with >= 4 ipis */ +int smp_request_message_ipi(int virq, int msg) +{ + int err; + + if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) { + return -EINVAL; + } +#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC) + if (msg == PPC_MSG_DEBUGGER_BREAK) { + return 1; + } +#endif + err = request_irq(virq, smp_ipi_action[msg], IRQF_DISABLED|IRQF_PERCPU, + smp_ipi_name[msg], 0); + WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n", + virq, smp_ipi_name[msg], err); + + return err; +} + void smp_send_reschedule(int cpu) { if (likely(smp_ops)) @@@ -466,7 -404,8 +462,7 @@@ out static struct device_node *cpu_to_l2cache(int cpu) { struct device_node *np; - const phandle *php; - phandle ph; + struct device_node *cache; if (!cpu_present(cpu)) return NULL; @@@ -475,11 -414,13 +471,11 @@@ if (np == NULL) return NULL; - php = of_get_property(np, "l2-cache", NULL); - if (php == NULL) - return NULL; - ph = *php; + cache = of_find_next_cache_node(np); + of_node_put(np); - return of_find_node_by_phandle(ph); + return cache; } /* Activate a secondary processor. */ diff --combined arch/powerpc/kernel/time.c index e1f3a514042,6f39d35d6f5..99f1ddd6858 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@@ -164,6 -164,8 +164,6 @@@ static u64 tb_to_ns_scale __read_mostly static unsigned tb_to_ns_shift __read_mostly; static unsigned long boot_tb __read_mostly; -static struct gettimeofday_struct do_gtod; - extern struct timezone sys_tz; static long timezone_offset; @@@ -413,9 -415,31 +413,9 @@@ void udelay(unsigned long usecs } EXPORT_SYMBOL(udelay); - -/* - * There are two copies of tb_to_xs and stamp_xsec so that no - * lock is needed to access and use these values in - * do_gettimeofday. We alternate the copies and as long as a - * reasonable time elapses between changes, there will never - * be inconsistent values. ntpd has a minimum of one minute - * between updates. 
- */ static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, u64 new_tb_to_xs) { - unsigned temp_idx; - struct gettimeofday_vars *temp_varp; - - temp_idx = (do_gtod.var_idx == 0); - temp_varp = &do_gtod.vars[temp_idx]; - - temp_varp->tb_to_xs = new_tb_to_xs; - temp_varp->tb_orig_stamp = new_tb_stamp; - temp_varp->stamp_xsec = new_stamp_xsec; - smp_mb(); - do_gtod.varp = temp_varp; - do_gtod.var_idx = temp_idx; - /* * tb_update_count is used to allow the userspace gettimeofday code * to assure itself that it sees a consistent view of the tb_to_xs and @@@ -432,7 -456,6 +432,7 @@@ vdso_data->tb_to_xs = new_tb_to_xs; vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; + vdso_data->stamp_xtime = xtime; smp_wmb(); ++(vdso_data->tb_update_count); } @@@ -491,7 -514,9 +491,7 @@@ static int __init iSeries_tb_recal(void tb_ticks_per_sec = new_tb_ticks_per_sec; calc_cputime_factors(); div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; - do_gtod.varp->tb_to_xs = tb_to_xs; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; vdso_data->tb_to_xs = tb_to_xs; } @@@ -844,7 -869,7 +844,7 @@@ static void register_decrementer_clocke struct clock_event_device *dec = &per_cpu(decrementers, cpu).event; *dec = decrementer_clockevent; - dec->cpumask = cpumask_of_cpu(cpu); + dec->cpumask = cpumask_of(cpu); printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n", dec->name, dec->mult, dec->shift, cpu); @@@ -963,6 -988,15 +963,6 @@@ void __init time_init(void sys_tz.tz_dsttime = 0; } - do_gtod.varp = &do_gtod.vars[0]; - do_gtod.var_idx = 0; - do_gtod.varp->tb_orig_stamp = tb_last_jiffy; - __get_cpu_var(last_jiffy) = tb_last_jiffy; - do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - do_gtod.varp->tb_to_xs = tb_to_xs; - do_gtod.tb_to_us = tb_to_us; - vdso_data->tb_orig_stamp = tb_last_jiffy; vdso_data->tb_update_count = 0; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; diff --combined arch/powerpc/platforms/pseries/xics.c index f7a69021b7b,424b335a71c..84e058f1e1c --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@@ -332,7 -332,7 +332,7 @@@ static void xics_eoi_lpar(unsigned int lpar_xirr_info_set((0xff << 24) | irq); } - static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) + static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) { unsigned int irq; int status; @@@ -579,7 -579,7 +579,7 @@@ static void xics_update_irq_servers(voi int i, j; struct device_node *np; u32 ilen; - const u32 *ireg, *isize; + const u32 *ireg; u32 hcpuid; /* Find the server numbers for the boot cpu. 
*/ @@@ -607,6 -607,11 +607,6 @@@ } } - /* get the bit size of server numbers */ - isize = of_get_property(np, "ibm,interrupt-server#-size", NULL); - if (isize) - interrupt_server_size = *isize; - of_node_put(np); } @@@ -677,7 -682,6 +677,7 @@@ void __init xics_init_IRQ(void struct device_node *np; u32 indx = 0; int found = 0; + const u32 *isize; ppc64_boot_msg(0x20, "XICS Init"); @@@ -697,26 -701,6 +697,26 @@@ if (found == 0) return; + /* get the bit size of server numbers */ + found = 0; + + for_each_compatible_node(np, NULL, "ibm,ppc-xics") { + isize = of_get_property(np, "ibm,interrupt-server#-size", NULL); + + if (!isize) + continue; + + if (!found) { + interrupt_server_size = *isize; + found = 1; + } else if (*isize != interrupt_server_size) { + printk(KERN_WARNING "XICS: " + "mismatched ibm,interrupt-server#-size\n"); + interrupt_server_size = max(*isize, + interrupt_server_size); + } + } + xics_update_irq_servers(); xics_init_host(); @@@ -744,18 -728,9 +744,18 @@@ static void xics_set_cpu_priority(unsig /* Have the calling processor join or leave the specified global queue */ static void xics_set_cpu_giq(unsigned int gserver, unsigned int join) { - int status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE, - (1UL << interrupt_server_size) - 1 - gserver, join); - WARN_ON(status < 0); + int index; + int status; + + if (!rtas_indicator_present(GLOBAL_INTERRUPT_QUEUE, NULL)) + return; + + index = (1UL << interrupt_server_size) - 1 - gserver; + + status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE, index, join); + + WARN(status < 0, "set-indicator(%d, %d, %u) returned %d\n", + GLOBAL_INTERRUPT_QUEUE, index, join, status); } void xics_setup_cpu(void) @@@ -870,7 -845,7 +870,7 @@@ void xics_migrate_irqs_away(void /* Reset affinity to all cpus */ irq_desc[virq].affinity = CPU_MASK_ALL; - desc->chip->set_affinity(virq, CPU_MASK_ALL); + desc->chip->set_affinity(virq, cpu_all_mask); unlock: spin_unlock_irqrestore(&desc->lock, flags); } diff --combined arch/powerpc/sysdev/mpic.c index c82babb7007,5d7f9f0c93c..3e0d89dcdba --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@@ -661,6 -661,17 +661,6 @@@ static inline void mpic_eoi(struct mpi (void)mpic_cpu_read(MPIC_INFO(CPU_WHOAMI)); } -#ifdef CONFIG_SMP -static irqreturn_t mpic_ipi_action(int irq, void *data) -{ - long ipi = (long)data; - - smp_message_recv(ipi); - - return IRQ_HANDLED; -} -#endif /* CONFIG_SMP */ - /* * Linux descriptor level callbacks */ @@@ -806,7 -817,7 +806,7 @@@ static void mpic_end_ipi(unsigned int i #endif /* CONFIG_SMP */ - void mpic_set_affinity(unsigned int irq, cpumask_t cpumask) + void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct mpic *mpic = mpic_from_irq(irq); unsigned int src = mpic_irq_to_hw(irq); @@@ -818,7 -829,7 +818,7 @@@ } else { cpumask_t tmp; - cpus_and(tmp, cpumask, cpu_online_map); + cpumask_and(&tmp, cpumask, cpu_online_mask); mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), mpic_physmask(cpus_addr(tmp)[0])); @@@ -1537,7 -1548,13 +1537,7 @@@ unsigned int mpic_get_mcirq(void void mpic_request_ipis(void) { struct mpic *mpic = mpic_primary; - long i, err; - static char *ipi_names[] = { - "IPI0 (call function)", - "IPI1 (reschedule)", - "IPI2 (call function single)", - "IPI3 (debugger break)", - }; + int i; BUG_ON(mpic == NULL); printk(KERN_INFO "mpic: requesting IPIs ... 
\n"); @@@ -1546,10 -1563,17 +1546,10 @@@ unsigned int vipi = irq_create_mapping(mpic->irqhost, mpic->ipi_vecs[0] + i); if (vipi == NO_IRQ) { - printk(KERN_ERR "Failed to map IPI %ld\n", i); - break; - } - err = request_irq(vipi, mpic_ipi_action, - IRQF_DISABLED|IRQF_PERCPU, - ipi_names[i], (void *)i); - if (err) { - printk(KERN_ERR "Request of irq %d for IPI %ld failed\n", - vipi, i); - break; + printk(KERN_ERR "Failed to map %s\n", smp_ipi_name[i]); + continue; } + smp_request_message_ipi(vipi, i); } } diff --combined arch/s390/Kconfig index 8152fefc97b,b4aa5869c7f..19577aeffd7 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@@ -43,9 -43,6 +43,9 @@@ config GENERIC_HWEIGH config GENERIC_TIME def_bool y +config GENERIC_TIME_VSYSCALL + def_bool y + config GENERIC_CLOCKEVENTS def_bool y @@@ -69,20 -66,16 +69,21 @@@ config PGST bool default y if KVM +config VIRT_CPU_ACCOUNTING + def_bool y + mainmenu "Linux Kernel Configuration" config S390 def_bool y + select USE_GENERIC_SMP_HELPERS if SMP + select HAVE_FUNCTION_TRACER select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK + select INIT_ALL_POSSIBLE source "init/Kconfig" @@@ -233,14 -226,6 +234,14 @@@ config MARCH_Z9_10 Class (z9 BC). The kernel will be slightly faster but will not work on older machines such as the z990, z890, z900, and z800. +config MARCH_Z10 + bool "IBM System z10" + help + Select this to enable optimizations for IBM System z10. The + kernel will be slightly faster but will not work on older + machines such as the z990, z890, z900, z800, z9-109, z9-ec + and z9-bc. + endchoice config PACK_STACK @@@ -359,6 -344,16 +360,6 @@@ config QDI If unsure, say Y. -config QDIO_DEBUG - bool "Extended debugging information" - depends on QDIO - help - Say Y here to get extended debugging output in - /sys/kernel/debug/s390dbf/qdio... - Warning: this option reduces the performance of the QDIO module. - - If unsure, say N. - config CHSC_SCH tristate "Support for CHSC subchannels" help @@@ -472,9 -467,22 +473,9 @@@ config PAGE_STATE hypervisor. The ESSA instruction is used to do the states changes between a page that has content and the unused state. -config VIRT_TIMER - bool "Virtual CPU timer support" - help - This provides a kernel interface for virtual CPU timers. - Default is disabled. - -config VIRT_CPU_ACCOUNTING - bool "Base user process accounting on virtual cpu timer" - depends on VIRT_TIMER - help - Select this option to use CPU timer deltas to do user - process accounting. - config APPLDATA_BASE bool "Linux - VM Monitor Stream, base infrastructure" - depends on PROC_FS && VIRT_TIMER=y + depends on PROC_FS help This provides a kernel interface for creating and updating z/VM APPLDATA monitor records. The monitor records are updated at certain time diff --combined arch/s390/kernel/smp.c index 6fc78541dc5,f03914b8ed2..3ed5c7a83c6 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@@ -20,9 -20,6 +20,9 @@@ * cpu_number_map in other architectures. 
*/ +#define KMSG_COMPONENT "cpu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include #include #include @@@ -55,12 -52,6 +55,6 @@@ struct _lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); - cpumask_t cpu_online_map = CPU_MASK_NONE; - EXPORT_SYMBOL(cpu_online_map); - - cpumask_t cpu_possible_map = CPU_MASK_ALL; - EXPORT_SYMBOL(cpu_possible_map); - static struct task_struct *current_set[NR_CPUS]; static u8 smp_cpu_type; @@@ -80,6 -71,159 +74,6 @@@ static DEFINE_PER_CPU(struct cpu, cpu_d static void smp_ext_bitcall(int, ec_bit_sig); -/* - * Structure and data for __smp_call_function_map(). This is designed to - * minimise static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - cpumask_t started; - cpumask_t finished; - int wait; -}; - -static struct call_data_struct *call_data; - -/* - * 'Call function' interrupt callback - */ -static void do_call_function(void) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - cpu_set(smp_processor_id(), call_data->started); - (*func)(info); - if (wait) - cpu_set(smp_processor_id(), call_data->finished);; -} - -static void __smp_call_function_map(void (*func) (void *info), void *info, - int wait, cpumask_t map) -{ - struct call_data_struct data; - int cpu, local = 0; - - /* - * Can deadlock when interrupts are disabled or if in wrong context. - */ - WARN_ON(irqs_disabled() || in_irq()); - - /* - * Check for local function call. We have to have the same call order - * as in on_each_cpu() because of machine_restart_smp(). - */ - if (cpu_isset(smp_processor_id(), map)) { - local = 1; - cpu_clear(smp_processor_id(), map); - } - - cpus_and(map, map, cpu_online_map); - if (cpus_empty(map)) - goto out; - - data.func = func; - data.info = info; - data.started = CPU_MASK_NONE; - data.wait = wait; - if (wait) - data.finished = CPU_MASK_NONE; - - call_data = &data; - - for_each_cpu_mask(cpu, map) - smp_ext_bitcall(cpu, ec_call_function); - - /* Wait for response */ - while (!cpus_equal(map, data.started)) - cpu_relax(); - if (wait) - while (!cpus_equal(map, data.finished)) - cpu_relax(); -out: - if (local) { - local_irq_disable(); - func(info); - local_irq_enable(); - } -} - -/* - * smp_call_function: - * @func: the function to run; this must be fast and non-blocking - * @info: an arbitrary pointer to pass to the function - * @wait: if true, wait (atomically) until function has completed on other CPUs - * - * Run a function on all other CPUs. - * - * You must not call this function with disabled interrupts, from a - * hardware interrupt handler or from a bottom half. - */ -int smp_call_function(void (*func) (void *info), void *info, int wait) -{ - cpumask_t map; - - spin_lock(&call_lock); - map = cpu_online_map; - cpu_clear(smp_processor_id(), map); - __smp_call_function_map(func, info, wait, map); - spin_unlock(&call_lock); - return 0; -} -EXPORT_SYMBOL(smp_call_function); - -/* - * smp_call_function_single: - * @cpu: the CPU where func should run - * @func: the function to run; this must be fast and non-blocking - * @info: an arbitrary pointer to pass to the function - * @wait: if true, wait (atomically) until function has completed on other CPUs - * - * Run a function on one processor. - * - * You must not call this function with disabled interrupts, from a - * hardware interrupt handler or from a bottom half. 
- */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int wait) -{ - spin_lock(&call_lock); - __smp_call_function_map(func, info, wait, cpumask_of_cpu(cpu)); - spin_unlock(&call_lock); - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); - -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. Must not include the current cpu. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, - int wait) -{ - spin_lock(&call_lock); - cpu_clear(smp_processor_id(), mask); - __smp_call_function_map(func, info, wait, mask); - spin_unlock(&call_lock); - return 0; -} -EXPORT_SYMBOL(smp_call_function_mask); - void smp_send_stop(void) { int cpu, rc; @@@ -121,10 -265,7 +115,10 @@@ static void do_ext_call_interrupt(__u1 bits = xchg(&S390_lowcore.ext_call_fast, 0); if (test_bit(ec_call_function, &bits)) - do_call_function(); + generic_smp_call_function_interrupt(); + + if (test_bit(ec_call_function_single, &bits)) + generic_smp_call_function_single_interrupt(); } /* @@@ -141,19 -282,6 +135,19 @@@ static void smp_ext_bitcall(int cpu, ec udelay(10); } +void arch_send_call_function_ipi(cpumask_t mask) +{ + int cpu; + + for_each_cpu_mask(cpu, mask) + smp_ext_bitcall(cpu, ec_call_function); +} + +void arch_send_call_function_single_ipi(int cpu) +{ + smp_ext_bitcall(cpu, ec_call_function_single); +} + #ifndef CONFIG_64BIT /* * this function sends a 'purge tlb' signal to another CPU. @@@ -254,8 -382,8 +248,8 @@@ static void __init smp_get_save_area(un if (ipl_info.type != IPL_TYPE_FCP_DUMP) return; if (cpu >= NR_CPUS) { - printk(KERN_WARNING "Registers for cpu %i not saved since dump " - "kernel was compiled with NR_CPUS=%i\n", cpu, NR_CPUS); + pr_warning("CPU %i exceeds the maximum %i and is excluded from " + "the dump\n", cpu, NR_CPUS - 1); return; } zfcpdump_save_areas[cpu] = kmalloc(sizeof(union save_area), GFP_KERNEL); @@@ -428,7 -556,7 +422,7 @@@ static void __init smp_detect_cpus(void } out: kfree(info); - printk(KERN_INFO "CPUs: %d configured, %d standby\n", c_cpus, s_cpus); + pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); get_online_cpus(); __smp_rescan_cpus(); put_online_cpus(); @@@ -444,17 -572,19 +438,17 @@@ int __cpuinit start_secondary(void *cpu preempt_disable(); /* Enable TOD clock interrupts on the secondary cpu. */ init_cpu_timer(); -#ifdef CONFIG_VIRT_TIMER /* Enable cpu timer interrupts on the secondary cpu. */ init_cpu_vtimer(); -#endif /* Enable pfault pseudo page faults on this cpu. 
*/ pfault_init(); /* call cpu notifiers */ notify_cpu_starting(smp_processor_id()); /* Mark this cpu as online */ - spin_lock(&call_lock); + ipi_call_lock(); cpu_set(smp_processor_id(), cpu_online_map); - spin_unlock(&call_lock); + ipi_call_unlock(); /* Switch on interrupts */ local_irq_enable(); /* Print info about this processor */ @@@ -503,15 -633,18 +497,15 @@@ static int __cpuinit smp_alloc_lowcore( save_area = get_zeroed_page(GFP_KERNEL); if (!save_area) - goto out_save_area; + goto out; lowcore->extended_save_area_addr = (u32) save_area; } #endif lowcore_ptr[cpu] = lowcore; return 0; -#ifndef CONFIG_64BIT -out_save_area: - free_page(panic_stack); -#endif out: + free_page(panic_stack); free_pages(async_stack, ASYNC_ORDER); free_pages((unsigned long) lowcore, lc_order); return -ENOMEM; @@@ -551,8 -684,12 +545,8 @@@ int __cpuinit __cpu_up(unsigned int cpu ccode = signal_processor_p((__u32)(unsigned long)(lowcore_ptr[cpu]), cpu, sigp_set_prefix); - if (ccode) { - printk("sigp_set_prefix failed for cpu %d " - "with condition code %d\n", - (int) cpu, (int) ccode); + if (ccode) return -EIO; - } idle = current_set[cpu]; cpu_lowcore = lowcore_ptr[cpu]; @@@ -635,7 -772,7 +629,7 @@@ void __cpu_die(unsigned int cpu while (!smp_cpu_not_running(cpu)) cpu_relax(); smp_free_lowcore(cpu); - printk(KERN_INFO "Processor %d spun down\n", cpu); + pr_info("Processor %d stopped\n", cpu); } void cpu_die(void) diff --combined arch/s390/kernel/time.c index 5be981a36c3,f5bd141c844..d649600df5b --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@@ -12,9 -12,6 +12,9 @@@ * Copyright (C) 1991, 1992, 1995 Linus Torvalds */ +#define KMSG_COMPONENT "time" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include #include #include @@@ -23,8 -20,6 +23,8 @@@ #include #include #include +#include +#include #include #include #include @@@ -41,7 -36,6 +41,7 @@@ #include #include #include +#include #include #include #include @@@ -160,7 -154,7 +160,7 @@@ void init_cpu_timer(void cd->min_delta_ns = 1; cd->max_delta_ns = LONG_MAX; cd->rating = 400; - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); cd->set_next_event = s390_next_event; cd->set_mode = s390_set_mode; @@@ -229,36 -223,6 +229,36 @@@ static struct clocksource clocksource_t }; +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +{ + if (clock != &clocksource_tod) + return; + + /* Make userspace gettimeofday spin until we're done. */ + ++vdso_data->tb_update_count; + smp_wmb(); + vdso_data->xtime_tod_stamp = clock->cycle_last; + vdso_data->xtime_clock_sec = xtime.tv_sec; + vdso_data->xtime_clock_nsec = xtime.tv_nsec; + vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; + smp_wmb(); + ++vdso_data->tb_update_count; +} + +extern struct timezone sys_tz; + +void update_vsyscall_tz(void) +{ + /* Make userspace gettimeofday spin until we're done. */ + ++vdso_data->tb_update_count; + smp_wmb(); + vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; + vdso_data->tz_dsttime = sys_tz.tz_dsttime; + smp_wmb(); + ++vdso_data->tb_update_count; +} + /* * Initialize the TOD clock and the CPU timer of * the boot cpu. @@@ -289,8 -253,10 +289,8 @@@ void __init time_init(void /* Enable TOD clock interrupts on the boot cpu. */ init_cpu_timer(); - -#ifdef CONFIG_VIRT_TIMER + /* Enable cpu timer interrupts on the boot cpu. 
*/ vtime_init(); -#endif } /* @@@ -322,8 -288,8 +322,8 @@@ static unsigned long long adjust_time(u } sched_clock_base_cc += delta; if (adjust.offset != 0) { - printk(KERN_NOTICE "etr: time adjusted by %li micro-seconds\n", - adjust.offset); + pr_notice("The ETR interface has adjusted the clock " + "by %li microseconds\n", adjust.offset); adjust.modes = ADJ_OFFSET_SINGLESHOT; do_adjtimex(&adjust); } @@@ -394,15 -360,6 +394,15 @@@ static void enable_sync_clock(void atomic_set_mask(0x80000000, sw_ptr); } +/* Single threaded workqueue used for etr and stp sync events */ +static struct workqueue_struct *time_sync_wq; + +static void __init time_init_wq(void) +{ + if (!time_sync_wq) + time_sync_wq = create_singlethread_workqueue("timesync"); +} + /* * External Time Reference (ETR) code. */ @@@ -468,7 -425,6 +468,7 @@@ static struct timer_list etr_timer static void etr_timeout(unsigned long dummy); static void etr_work_fn(struct work_struct *work); +static DEFINE_MUTEX(etr_work_mutex); static DECLARE_WORK(etr_work, etr_work_fn); /* @@@ -484,8 -440,8 +484,8 @@@ static void etr_reset(void etr_tolec = get_clock(); set_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags); } else if (etr_port0_online || etr_port1_online) { - printk(KERN_WARNING "Running on non ETR capable " - "machine, only local mode available.\n"); + pr_warning("The real or virtual hardware system does " + "not provide an ETR interface\n"); etr_port0_online = etr_port1_online = 0; } } @@@ -496,18 -452,17 +496,18 @@@ static int __init etr_init(void if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags)) return 0; + time_init_wq(); /* Check if this machine has the steai instruction. */ if (etr_steai(&aib, ETR_STEAI_STEPPING_PORT) == 0) etr_steai_available = 1; setup_timer(&etr_timer, etr_timeout, 0UL); if (etr_port0_online) { set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } if (etr_port1_online) { set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } return 0; } @@@ -534,7 -489,7 +534,7 @@@ void etr_switch_to_local(void if (test_bit(CLOCK_SYNC_ETR, &clock_sync_flags)) disable_sync_clock(NULL); set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } /* @@@ -550,7 -505,7 +550,7 @@@ void etr_sync_check(void if (test_bit(CLOCK_SYNC_ETR, &clock_sync_flags)) disable_sync_clock(NULL); set_bit(ETR_EVENT_SYNC_CHECK, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } /* @@@ -574,13 -529,13 +574,13 @@@ static void etr_timing_alert(struct etr * Both ports are not up-to-date now. */ set_bit(ETR_EVENT_PORT_ALERT, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } static void etr_timeout(unsigned long dummy) { set_bit(ETR_EVENT_UPDATE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } /* @@@ -687,16 -642,14 +687,16 @@@ static int etr_aib_follows(struct etr_a } struct clock_sync_data { + atomic_t cpus; int in_sync; unsigned long long fixup_cc; + int etr_port; + struct etr_aib *etr_aib; }; -static void clock_sync_cpu_start(void *dummy) +static void clock_sync_cpu(struct clock_sync_data *sync) { - struct clock_sync_data *sync = dummy; - + atomic_dec(&sync->cpus); enable_sync_clock(); /* * This looks like a busy wait loop but it isn't. 
etr_sync_cpus @@@ -722,35 -675,39 +722,35 @@@ fixup_clock_comparator(sync->fixup_cc); } -static void clock_sync_cpu_end(void *dummy) -{ -} - /* * Sync the TOD clock using the port refered to by aibp. This port * has to be enabled and the other port has to be disabled. The * last eacr update has to be more than 1.6 seconds in the past. */ -static int etr_sync_clock(struct etr_aib *aib, int port) +static int etr_sync_clock(void *data) { - struct etr_aib *sync_port; - struct clock_sync_data etr_sync; + static int first; unsigned long long clock, old_clock, delay, delta; - int follows; + struct clock_sync_data *etr_sync; + struct etr_aib *sync_port, *aib; + int port; int rc; - /* Check if the current aib is adjacent to the sync port aib. */ - sync_port = (port == 0) ? &etr_port0 : &etr_port1; - follows = etr_aib_follows(sync_port, aib, port); - memcpy(sync_port, aib, sizeof(*aib)); - if (!follows) - return -EAGAIN; + etr_sync = data; - /* - * Catch all other cpus and make them wait until we have - * successfully synced the clock. smp_call_function will - * return after all other cpus are in etr_sync_cpu_start. - */ - memset(&etr_sync, 0, sizeof(etr_sync)); - preempt_disable(); - smp_call_function(clock_sync_cpu_start, &etr_sync, 0); - local_irq_disable(); + if (xchg(&first, 1) == 1) { + /* Slave */ + clock_sync_cpu(etr_sync); + return 0; + } + + /* Wait until all other cpus entered the sync function. */ + while (atomic_read(&etr_sync->cpus) != 0) + cpu_relax(); + + port = etr_sync->etr_port; + aib = etr_sync->etr_aib; + sync_port = (port == 0) ? &etr_port0 : &etr_port1; enable_sync_clock(); /* Set clock to next OTE. */ @@@ -767,16 -724,16 +767,16 @@@ delay = (unsigned long long) (aib->edf2.etv - sync_port->edf2.etv) << 32; delta = adjust_time(old_clock, clock, delay); - etr_sync.fixup_cc = delta; + etr_sync->fixup_cc = delta; fixup_clock_comparator(delta); /* Verify that the clock is properly set. */ if (!etr_aib_follows(sync_port, aib, port)) { /* Didn't work. */ disable_sync_clock(NULL); - etr_sync.in_sync = -EAGAIN; + etr_sync->in_sync = -EAGAIN; rc = -EAGAIN; } else { - etr_sync.in_sync = 1; + etr_sync->in_sync = 1; rc = 0; } } else { @@@ -784,33 -741,12 +784,33 @@@ __ctl_clear_bit(0, 29); __ctl_clear_bit(14, 21); disable_sync_clock(NULL); - etr_sync.in_sync = -EAGAIN; + etr_sync->in_sync = -EAGAIN; rc = -EAGAIN; } - local_irq_enable(); - smp_call_function(clock_sync_cpu_end, NULL, 0); - preempt_enable(); + xchg(&first, 0); + return rc; +} + +static int etr_sync_clock_stop(struct etr_aib *aib, int port) +{ + struct clock_sync_data etr_sync; + struct etr_aib *sync_port; + int follows; + int rc; + + /* Check if the current aib is adjacent to the sync port aib. */ + sync_port = (port == 0) ? &etr_port0 : &etr_port1; + follows = etr_aib_follows(sync_port, aib, port); + memcpy(sync_port, aib, sizeof(*aib)); + if (!follows) + return -EAGAIN; + memset(&etr_sync, 0, sizeof(etr_sync)); + etr_sync.etr_aib = aib; + etr_sync.etr_port = port; + get_online_cpus(); + atomic_set(&etr_sync.cpus, num_online_cpus() - 1); + rc = stop_machine(etr_sync_clock, &etr_sync, &cpu_online_map); + put_online_cpus(); return rc; } @@@ -967,7 -903,7 +967,7 @@@ static void etr_update_eacr(struct etr_ } /* - * ETR tasklet. In this function you'll find the main logic. In + * ETR work. In this function you'll find the main logic. In * particular this is the only function that calls etr_update_eacr(), * it "controls" the etr control register. 
*/ @@@ -978,9 -914,6 +978,9 @@@ static void etr_work_fn(struct work_str struct etr_aib aib; int sync_port; + /* prevent multiple execution. */ + mutex_lock(&etr_work_mutex); + /* Create working copy of etr_eacr. */ eacr = etr_eacr; @@@ -996,7 -929,7 +996,7 @@@ del_timer_sync(&etr_timer); etr_update_eacr(eacr); clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags); - return; + goto out_unlock; } /* Store aib to get the current ETR status word. */ @@@ -1083,7 -1016,7 +1083,7 @@@ eacr.es || sync_port < 0) { etr_update_eacr(eacr); etr_set_tolec_timeout(now); - return; + goto out_unlock; } /* @@@ -1103,7 -1036,7 +1103,7 @@@ etr_update_eacr(eacr); set_bit(CLOCK_SYNC_ETR, &clock_sync_flags); if (now < etr_tolec + (1600000 << 12) || - etr_sync_clock(&aib, sync_port) != 0) { + etr_sync_clock_stop(&aib, sync_port) != 0) { /* Sync failed. Try again in 1/2 second. */ eacr.es = 0; etr_update_eacr(eacr); @@@ -1111,8 -1044,6 +1111,8 @@@ etr_set_sync_timeout(); } else etr_set_tolec_timeout(now); +out_unlock: + mutex_unlock(&etr_work_mutex); } /* @@@ -1194,13 -1125,13 +1194,13 @@@ static ssize_t etr_online_store(struct return count; /* Nothing to do. */ etr_port0_online = value; set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } else { if (etr_port1_online == value) return count; /* Nothing to do. */ etr_port1_online = value; set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } return count; } @@@ -1401,7 -1332,6 +1401,7 @@@ static struct stp_sstpi stp_info static void *stp_page; static void stp_work_fn(struct work_struct *work); +static DEFINE_MUTEX(stp_work_mutex); static DECLARE_WORK(stp_work, stp_work_fn); static int __init early_parse_stp(char *p) @@@ -1426,8 -1356,7 +1426,8 @@@ static void __init stp_reset(void if (rc == 0) set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags); else if (stp_online) { - printk(KERN_WARNING "Running on non STP capable machine.\n"); + pr_warning("The real or virtual hardware system does " + "not provide an STP interface\n"); free_bootmem((unsigned long) stp_page, PAGE_SIZE); stp_page = NULL; stp_online = 0; @@@ -1436,12 -1365,8 +1436,12 @@@ static int __init stp_init(void) { - if (test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags) && stp_online) - schedule_work(&stp_work); + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + return 0; + time_init_wq(); + if (!stp_online) + return 0; + queue_work(time_sync_wq, &stp_work); return 0; } @@@ -1458,7 -1383,7 +1458,7 @@@ arch_initcall(stp_init) static void stp_timing_alert(struct stp_irq_parm *intparm) { if (intparm->tsc || intparm->lac || intparm->tcpc) - schedule_work(&stp_work); + queue_work(time_sync_wq, &stp_work); } /* @@@ -1472,7 -1397,7 +1472,7 @@@ void stp_sync_check(void if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags)) return; disable_sync_clock(NULL); - schedule_work(&stp_work); + queue_work(time_sync_wq, &stp_work); } /* @@@ -1486,34 -1411,46 +1486,34 @@@ void stp_island_check(void if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags)) return; disable_sync_clock(NULL); - schedule_work(&stp_work); + queue_work(time_sync_wq, &stp_work); } -/* - * STP tasklet. Check for the STP state and take over the clock - * synchronization if the STP clock source is usable. 
- */ -static void stp_work_fn(struct work_struct *work) + +static int stp_sync_clock(void *data) { - struct clock_sync_data stp_sync; + static int first; unsigned long long old_clock, delta; + struct clock_sync_data *stp_sync; int rc; - if (!stp_online) { - chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000); - return; - } + stp_sync = data; - rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0); - if (rc) - return; + if (xchg(&first, 1) == 1) { + /* Slave */ + clock_sync_cpu(stp_sync); + return 0; + } - rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); - if (rc || stp_info.c == 0) - return; + /* Wait until all other cpus entered the sync function. */ + while (atomic_read(&stp_sync->cpus) != 0) + cpu_relax(); - /* - * Catch all other cpus and make them wait until we have - * successfully synced the clock. smp_call_function will - * return after all other cpus are in clock_sync_cpu_start. - */ - memset(&stp_sync, 0, sizeof(stp_sync)); - preempt_disable(); - smp_call_function(clock_sync_cpu_start, &stp_sync, 0); - local_irq_disable(); enable_sync_clock(); set_bit(CLOCK_SYNC_STP, &clock_sync_flags); if (test_and_clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags)) - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); rc = 0; if (stp_info.todoff[0] || stp_info.todoff[1] || @@@ -1532,49 -1469,16 +1532,49 @@@ } if (rc) { disable_sync_clock(NULL); - stp_sync.in_sync = -EAGAIN; + stp_sync->in_sync = -EAGAIN; clear_bit(CLOCK_SYNC_STP, &clock_sync_flags); if (etr_port0_online || etr_port1_online) - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } else - stp_sync.in_sync = 1; + stp_sync->in_sync = 1; + xchg(&first, 0); + return 0; +} + +/* + * STP work. Check for the STP state and take over the clock + * synchronization if the STP clock source is usable. + */ +static void stp_work_fn(struct work_struct *work) +{ + struct clock_sync_data stp_sync; + int rc; + + /* prevent multiple execution. 
*/ + mutex_lock(&stp_work_mutex); + + if (!stp_online) { + chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000); + goto out_unlock; + } + + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0); + if (rc) + goto out_unlock; + + rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + if (rc || stp_info.c == 0) + goto out_unlock; + + memset(&stp_sync, 0, sizeof(stp_sync)); + get_online_cpus(); + atomic_set(&stp_sync.cpus, num_online_cpus() - 1); + stop_machine(stp_sync_clock, &stp_sync, &cpu_online_map); + put_online_cpus(); - local_irq_enable(); - smp_call_function(clock_sync_cpu_end, NULL, 0); - preempt_enable(); +out_unlock: + mutex_unlock(&stp_work_mutex); } /* @@@ -1683,7 -1587,7 +1683,7 @@@ static ssize_t stp_online_store(struct if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) return -EOPNOTSUPP; stp_online = value; - schedule_work(&stp_work); + queue_work(time_sync_wq, &stp_work); return count; } diff --combined arch/sparc/kernel/irq_64.c index a3ea2bcb95d,4aaf18e83c8..cab8e028687 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@@ -312,7 -312,8 +312,8 @@@ static void sun4u_irq_enable(unsigned i } } - static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask) + static void sun4u_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { sun4u_irq_enable(virt_irq); } @@@ -362,7 -363,8 +363,8 @@@ static void sun4v_irq_enable(unsigned i ino, err); } - static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask) + static void sun4v_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { unsigned int ino = virt_irq_table[virt_irq].dev_ino; unsigned long cpuid = irq_choose_cpu(virt_irq); @@@ -429,7 -431,8 +431,8 @@@ static void sun4v_virq_enable(unsigned dev_handle, dev_ino, err); } - static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask) + static void sun4v_virt_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { unsigned long cpuid, dev_handle, dev_ino; int err; @@@ -775,69 -778,6 +778,69 @@@ void do_softirq(void local_irq_restore(flags); } +static void unhandled_perf_irq(struct pt_regs *regs) +{ + unsigned long pcr, pic; + + read_pcr(pcr); + read_pic(pic); + + write_pcr(0); + + printk(KERN_EMERG "CPU %d: Got unexpected perf counter IRQ.\n", + smp_processor_id()); + printk(KERN_EMERG "CPU %d: PCR[%016lx] PIC[%016lx]\n", + smp_processor_id(), pcr, pic); +} + +/* Almost a direct copy of the powerpc PMC code. */ +static DEFINE_SPINLOCK(perf_irq_lock); +static void *perf_irq_owner_caller; /* mostly for debugging */ +static void (*perf_irq)(struct pt_regs *regs) = unhandled_perf_irq; + +/* Invoked from level 15 PIL handler in trap table. 
*/ +void perfctr_irq(int irq, struct pt_regs *regs) +{ + clear_softint(1 << irq); + perf_irq(regs); +} + +int register_perfctr_intr(void (*handler)(struct pt_regs *)) +{ + int ret; + + if (!handler) + return -EINVAL; + + spin_lock(&perf_irq_lock); + if (perf_irq != unhandled_perf_irq) { + printk(KERN_WARNING "register_perfctr_intr: " + "perf IRQ busy (reserved by caller %p)\n", + perf_irq_owner_caller); + ret = -EBUSY; + goto out; + } + + perf_irq_owner_caller = __builtin_return_address(0); + perf_irq = handler; + + ret = 0; +out: + spin_unlock(&perf_irq_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_perfctr_intr); + +void release_perfctr_intr(void (*handler)(struct pt_regs *)) +{ + spin_lock(&perf_irq_lock); + perf_irq_owner_caller = NULL; + perf_irq = unhandled_perf_irq; + spin_unlock(&perf_irq_lock); +} +EXPORT_SYMBOL_GPL(release_perfctr_intr); + #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) { @@@ -851,7 -791,7 +854,7 @@@ !(irq_desc[irq].status & IRQ_PER_CPU)) { if (irq_desc[irq].chip->set_affinity) irq_desc[irq].chip->set_affinity(irq, - irq_desc[irq].affinity); + &irq_desc[irq].affinity); } spin_unlock_irqrestore(&irq_desc[irq].lock, flags); } diff --combined arch/sparc/kernel/of_device_64.c index 46e231f7c5c,df2efb7fc14..322046cdf85 --- a/arch/sparc/kernel/of_device_64.c +++ b/arch/sparc/kernel/of_device_64.c @@@ -780,7 -780,7 +780,7 @@@ out if (nid != -1) { cpumask_t numa_mask = node_to_cpumask(nid); - irq_set_affinity(irq, numa_mask); + irq_set_affinity(irq, &numa_mask); } return irq; @@@ -811,20 -811,20 +811,20 @@@ static struct of_device * __init scan_o irq = of_get_property(dp, "interrupts", &len); if (irq) { - memcpy(op->irqs, irq, len); op->num_irqs = len / 4; + + /* Prevent overrunning the op->irqs[] array. */ + if (op->num_irqs > PROMINTR_MAX) { + printk(KERN_WARNING "%s: Too many irqs (%d), " + "limiting to %d.\n", + dp->full_name, op->num_irqs, PROMINTR_MAX); + op->num_irqs = PROMINTR_MAX; + } + memcpy(op->irqs, irq, op->num_irqs * 4); } else { op->num_irqs = 0; } - /* Prevent overrunning the op->irqs[] array. 
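register_perfctr_intr() above makes the sparc64 level-15 performance-counter vector available to exactly one client at a time: the slot is protected by perf_irq_lock, a second caller gets -EBUSY, and __builtin_return_address(0) is remembered only so the "busy" warning can name the current owner. A client (the profiling driver sketched here is hypothetical) would use the pair roughly like this:

static void my_pmu_overflow(struct pt_regs *regs)
{
        /* read and re-arm the counters, hand a sample to the profiler */
}

static int my_pmu_start(void)
{
        int err = register_perfctr_intr(my_pmu_overflow);

        if (err)                /* -EBUSY: somebody else owns the vector */
                return err;

        /* from here on, perfctr_irq() will call my_pmu_overflow() */
        return 0;
}

static void my_pmu_stop(void)
{
        release_perfctr_intr(my_pmu_overflow);
}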
*/ - if (op->num_irqs > PROMINTR_MAX) { - printk(KERN_WARNING "%s: Too many irqs (%d), " - "limiting to %d.\n", - dp->full_name, op->num_irqs, PROMINTR_MAX); - op->num_irqs = PROMINTR_MAX; - } - build_device_resources(op, parent); for (i = 0; i < op->num_irqs; i++) op->irqs[i] = build_one_device_irq(op, parent, op->irqs[i]); diff --combined arch/sparc/kernel/pci_msi.c index 2e680f34f72,0d0cd815e83..0d0cd815e83 --- a/arch/sparc/kernel/pci_msi.c +++ b/arch/sparc/kernel/pci_msi.c @@@ -288,7 -288,7 +288,7 @@@ static int bringup_one_msi_queue(struc if (nid != -1) { cpumask_t numa_mask = node_to_cpumask(nid); - irq_set_affinity(irq, numa_mask); + irq_set_affinity(irq, &numa_mask); } err = request_irq(irq, sparc64_msiq_interrupt, 0, "MSIQ", diff --combined arch/sparc/kernel/smp_32.c index e396c1f17a9,1e5ac4e282e..1e5ac4e282e --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@@ -39,8 -39,6 +39,6 @@@ volatile unsigned long cpu_callin_map[N unsigned char boot_cpu_id = 0; unsigned char boot_cpu_id4 = 0; /* boot_cpu_id << 2 */ - cpumask_t cpu_online_map = CPU_MASK_NONE; - cpumask_t phys_cpu_present_map = CPU_MASK_NONE; cpumask_t smp_commenced_mask = CPU_MASK_NONE; /* The only guaranteed locking primitive available on all Sparc @@@ -334,7 -332,7 +332,7 @@@ void __init smp_setup_cpu_possible_map( instance = 0; while (!cpu_find_by_instance(instance, NULL, &mid)) { if (mid < NR_CPUS) { - cpu_set(mid, phys_cpu_present_map); + cpu_set(mid, cpu_possible_map); cpu_set(mid, cpu_present_map); } instance++; @@@ -354,7 -352,7 +352,7 @@@ void __init smp_prepare_boot_cpu(void current_thread_info()->cpu = cpuid; cpu_set(cpuid, cpu_online_map); - cpu_set(cpuid, phys_cpu_present_map); + cpu_set(cpuid, cpu_possible_map); } int __cpuinit __cpu_up(unsigned int cpu) diff --combined arch/sparc/kernel/smp_64.c index bfe99d82d45,a97b8822c22..46329799f34 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@@ -49,14 -49,10 +49,10 @@@ int sparc64_multi_core __read_mostly; - cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE; - cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE; cpumask_t cpu_core_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = CPU_MASK_NONE }; - EXPORT_SYMBOL(cpu_possible_map); - EXPORT_SYMBOL(cpu_online_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_SYMBOL(cpu_core_map); @@@ -163,7 -159,7 +159,7 @@@ static inline long get_delta (long *rt for (i = 0; i < NUM_ITERS; i++) { t0 = tick_ops->get_tick(); go[MASTER] = 1; - membar_storeload(); + membar_safe("#StoreLoad"); while (!(tm = go[SLAVE])) rmb(); go[SLAVE] = 0; @@@ -257,7 -253,7 +253,7 @@@ static void smp_synchronize_one_tick(in /* now let the client proceed into his loop */ go[MASTER] = 0; - membar_storeload(); + membar_safe("#StoreLoad"); spin_lock_irqsave(&itc_sync_lock, flags); { @@@ -267,7 -263,7 +263,7 @@@ go[MASTER] = 0; wmb(); go[SLAVE] = tick_ops->get_tick(); - membar_storeload(); + membar_safe("#StoreLoad"); } } spin_unlock_irqrestore(&itc_sync_lock, flags); @@@ -773,7 -769,7 +769,7 @@@ static void xcall_deliver(u64 data0, u6 /* Setup the initial cpu list. 
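Both of_device_64.c and pci_msi.c above keep steering a freshly built interrupt toward the CPUs of the device's home NUMA node; the only change is that irq_set_affinity() now wants the address of the mask rather than a copy of it. Pulled out into a helper (the wrapper name is made up, node_to_cpumask() and irq_set_affinity() are used exactly as above):

static void bind_irq_to_node(unsigned int irq, int nid)
{
        if (nid != -1) {
                cpumask_t numa_mask = node_to_cpumask(nid);

                /* new calling convention: pass a pointer, not the mask */
                irq_set_affinity(irq, &numa_mask);
        }
}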
*/ cnt = 0; - for_each_cpu_mask_nr(i, *mask) { + for_each_cpu(i, mask) { if (i == this_cpu || !cpu_online(i)) continue; cpu_list[cnt++] = i; @@@ -1122,6 -1118,7 +1118,6 @@@ void smp_capture(void smp_processor_id()); #endif penguins_are_doing_time = 1; - membar_storestore_loadstore(); atomic_inc(&smp_capture_registry); smp_cross_call(&xcall_capture, 0, 0, 0); while (atomic_read(&smp_capture_registry) != ncpus) @@@ -1141,13 -1138,13 +1137,13 @@@ void smp_release(void smp_processor_id()); #endif penguins_are_doing_time = 0; - membar_storeload_storestore(); + membar_safe("#StoreLoad"); atomic_dec(&smp_capture_registry); } } -/* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they - * can service tlb flush xcalls... +/* Imprisoned penguins run with %pil == PIL_NORMAL_MAX, but PSTATE_IE + * set, so they can service tlb flush xcalls... */ extern void prom_world(int); @@@ -1160,7 -1157,7 +1156,7 @@@ void smp_penguin_jailcell(int irq, stru __asm__ __volatile__("flushw"); prom_world(1); atomic_inc(&smp_capture_registry); - membar_storeload_storestore(); + membar_safe("#StoreLoad"); while (penguins_are_doing_time) rmb(); atomic_dec(&smp_capture_registry); diff --combined arch/sparc/kernel/sparc_ksyms_32.c index a4d45fc29b2,32d11a5fe3a..e1e97639231 --- a/arch/sparc/kernel/sparc_ksyms_32.c +++ b/arch/sparc/kernel/sparc_ksyms_32.c @@@ -61,6 -61,7 +61,6 @@@ extern void (*bzero_1page)(void *) extern void *__bzero(void *, size_t); extern void *__memscan_zero(void *, size_t); extern void *__memscan_generic(void *, int, size_t); -extern int __memcmp(const void *, const void *, __kernel_size_t); extern int __strncmp(const char *, const char *, __kernel_size_t); extern int __ashrdi3(int, int); @@@ -112,17 -113,15 +112,13 @@@ EXPORT_PER_CPU_SYMBOL(__cpu_data) #ifdef CONFIG_SMP /* IRQ implementation. */ EXPORT_SYMBOL(synchronize_irq); - - /* CPU online map and active count. 
*/ - EXPORT_SYMBOL(cpu_online_map); - EXPORT_SYMBOL(phys_cpu_present_map); #endif EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(rtc_lock); -#ifdef CONFIG_SUN_AUXIO EXPORT_SYMBOL(set_auxio); EXPORT_SYMBOL(get_auxio); -#endif EXPORT_SYMBOL(io_remap_pfn_range); #ifndef CONFIG_SMP @@@ -210,6 -209,7 +206,6 @@@ EXPORT_SYMBOL(bzero_1page) EXPORT_SYMBOL(__bzero); EXPORT_SYMBOL(__memscan_zero); EXPORT_SYMBOL(__memscan_generic); -EXPORT_SYMBOL(__memcmp); EXPORT_SYMBOL(__strncmp); EXPORT_SYMBOL(__memmove); diff --combined arch/sparc/kernel/time_64.c index 141da375909,9df8f095a8b..9df8f095a8b --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@@ -763,7 -763,7 +763,7 @@@ void __devinit setup_sparc64_timer(void sevt = &__get_cpu_var(sparc64_events); memcpy(sevt, &sparc64_clockevent, sizeof(*sevt)); - sevt->cpumask = cpumask_of_cpu(smp_processor_id()); + sevt->cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(sevt); } diff --combined arch/x86/Kconfig index 0f44add3e0b,0ca2eb7573c..249d1e0824b --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -19,8 -19,6 +19,8 @@@ config X86_6 config X86 def_bool y select HAVE_AOUT if X86_32 + select HAVE_READQ + select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE @@@ -92,10 -90,6 +92,10 @@@ config GENERIC_IOMA config GENERIC_BUG def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + +config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT def_bool y @@@ -250,19 -244,16 +250,19 @@@ config X86_HAS_BOOT_CPU_I config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ - default y help - This enables support for sparse irq, esp for msi/msi-x. You may need - if you have lots of cards supports msi-x installed. + This enables support for sparse irqs. This is useful for distro + kernels that want to define a high CONFIG_NR_CPUS value but still + want to have low kernel memory footprint on smaller machines. - If you don't know what to do here, say Y. + ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread + out the irq_desc[] array in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. config NUMA_MIGRATE_IRQ_DESC bool "Move irq desc when changing irq smp_affinity" - depends on SPARSE_IRQ && SMP + depends on SPARSE_IRQ && NUMA default n help This enables moving irq_desc to cpu/node that irq will use handled. @@@ -273,13 -264,21 +273,13 @@@ config X86_FIND_SMP_CONFI def_bool y depends on X86_MPPARSE || X86_VOYAGER -if ACPI config X86_MPPARSE - def_bool y - bool "Enable MPS table" + bool "Enable MPS table" if ACPI + default y depends on X86_LOCAL_APIC help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it -endif - -if !ACPI -config X86_MPPARSE - def_bool y - depends on X86_LOCAL_APIC -endif choice prompt "Subarchitecture Type" @@@ -501,7 -500,7 +501,7 @@@ config HPET_TIME The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, as it is off-chip. You can find the HPET spec at - . + . You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. @@@ -588,7 -587,7 +588,7 @@@ config AMD_IOMM # need this always selected by IOMMU for the VIA workaround config SWIOTLB - bool + def_bool y if X86_64 help Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. 
the current generation @@@ -601,19 -600,20 +601,20 @@@ config IOMMU_HELPE config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && BROKEN + depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + select CPUMASK_OFFSTACK default n help Configure maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. config NR_CPUS - int "Maximum number of CPUs (2-512)" if !MAXSMP - range 2 512 - depends on SMP + int "Maximum number of CPUs" if SMP && !MAXSMP + range 2 512 if SMP && !MAXSMP + default "1" if !SMP default "4096" if MAXSMP - default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 - default "8" + default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "8" if SMP help This allows you to specify the maximum number of CPUs which this kernel will support. The maximum supported value is 512 and the @@@ -679,30 -679,6 +680,30 @@@ config X86_VISWS_API def_bool y depends on X86_32 && X86_VISWS +config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC + help + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of + superfluous "boot interrupts" cannot be disabled. + + Some chipsets generate a legacy INTx "boot IRQ" when the IRQ + entry in the chipset's IO-APIC is masked (as, e.g. the RT + kernel does during interrupt handling). On chipsets where this + boot IRQ generation cannot be disabled, this workaround keeps + the original IRQ line masked so that only the equivalent "boot + IRQ" is delivered to the CPUs. The workaround also tells the + kernel to set up the IRQ handler on the boot IRQ line. In this + way only one interrupt is delivered to the kernel. Otherwise + the spurious second interrupt may cause the kernel to bring + down (vital) interrupt lines. + + Only affects "broken" chipsets. Interrupt sharing may be + increased on these systems. + config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER @@@ -999,37 -975,24 +1000,37 @@@ config X86_PA config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 + help + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". + # Common NUMA Features config NUMA - bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help Enable NUMA (Non Uniform Memory Access) support. + The kernel will try to allocate memory used by a CPU on the local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 32-bit this is currently highly experimental and should be only - used for kernel development. It might also cause boot failures. - For 64-bit this is recommended on all multiprocessor Opteron systems. - If the system is EM64T, you should say N unless your system is - EM64T NUMA. + For 64-bit this is recommended if the system is Intel Core i7 + (or later), AMD Opteron, or EM64T NUMA. 
+ + For 32-bit this is only needed on (rare) 32-bit-only platforms + that support NUMA topologies, such as NUMAQ / Summit, or if you + boot a 32-bit kernel on a 64-bit NUMA platform. + + Otherwise, you should say N. comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) @@@ -1549,10 -1512,6 +1550,10 @@@ config ARCH_ENABLE_MEMORY_HOTPLU def_bool y depends on X86_64 || (X86_32 && HIGHMEM) +config ARCH_ENABLE_MEMORY_HOTREMOVE + def_bool y + depends on MEMORY_HOTPLUG + config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA diff --combined arch/x86/include/asm/irq.h index 28e409fc73f,4bb732e45a8..592688ed04d --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@@ -31,9 -31,13 +31,9 @@@ static inline int irq_canonicalize(int # endif #endif -#ifdef CONFIG_IRQBALANCE -extern int irqbalance_disable(char *str); -#endif - #ifdef CONFIG_HOTPLUG_CPU #include - extern void fixup_irqs(cpumask_t map); + extern void fixup_irqs(void); #endif extern unsigned int do_IRQ(struct pt_regs *regs); @@@ -42,5 -46,6 +42,6 @@@ extern void native_init_IRQ(void) /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); + extern int vector_used_by_percpu_irq(unsigned int vector); #endif /* _ASM_X86_IRQ_H */ diff --combined arch/x86/kernel/apic.c index b5229affb95,b9019271af6..6b7f824db16 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@@ -30,7 -30,6 +30,7 @@@ #include #include #include +#include #include #include @@@ -119,8 -118,6 +119,6 @@@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_o int first_system_vector = 0xfe; - char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - /* * Debug level, exported for io_apic.c */ @@@ -142,7 -139,7 +140,7 @@@ static int lapic_next_event(unsigned lo struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); - static void lapic_timer_broadcast(cpumask_t mask); + static void lapic_timer_broadcast(const cpumask_t *mask); static void apic_pm_activate(void); /* @@@ -455,7 -452,7 +453,7 @@@ static void lapic_timer_setup(enum cloc /* * Local APIC timer broadcast function */ - static void lapic_timer_broadcast(cpumask_t mask) + static void lapic_timer_broadcast(const cpumask_t *mask) { #ifdef CONFIG_SMP send_IPI_mask(mask, LOCAL_TIMER_VECTOR); @@@ -471,7 -468,7 +469,7 @@@ static void __cpuinit setup_APIC_timer( struct clock_event_device *levt = &__get_cpu_var(lapic_events); memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); + levt->cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(levt); } @@@ -778,7 -775,11 +776,7 @@@ static void local_apic_timer_interrupt( /* * the NMI deadlock-detector uses this. */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif + inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); } @@@ -791,7 -792,7 +789,7 @@@ * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs *regs) +void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@@ -805,7 -806,9 +803,7 @@@ * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. 
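The setup_APIC_timer() change above is one instance of a conversion made by every clockevent driver in this merge (the sparc64 timer, HPET and the local APIC timer here): clock_event_device.cpumask is now a const struct cpumask pointer, and a per-CPU tick device simply points it at the static single-CPU mask returned by cpumask_of() instead of storing a whole cpumask_t. The registration boilerplate for such a per-CPU device reduces to (sketch; the template and per-CPU variable names are hypothetical):

static struct clock_event_device my_clockevent_template;       /* filled in elsewhere */
static DEFINE_PER_CPU(struct clock_event_device, my_events);

static void __cpuinit setup_my_percpu_tick(void)
{
        struct clock_event_device *evt = &__get_cpu_var(my_events);

        memcpy(evt, &my_clockevent_template, sizeof(*evt));
        /* no cpumask copy: just point at the constant per-CPU mask */
        evt->cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(evt);
}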
*/ -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@@ -1663,7 -1666,9 +1661,7 @@@ void smp_spurious_interrupt(struct pt_r { u32 v; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@@ -1674,11 -1679,14 +1672,11 @@@ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else + inc_irq_stat(irq_spurious_count); + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif irq_exit(); } @@@ -1689,7 -1697,9 +1687,7 @@@ void smp_error_interrupt(struct pt_reg { u32 v, v1; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@@ -1807,28 -1817,32 +1805,32 @@@ void disconnect_bsp_APIC(int virt_wire_ void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - cpumask_t tmp_map; /* * Validate version */ if (version == 0x0) { pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); + "fixing up to 0x10. (tell your hw vendor)\n", + version); version = 0x10; } apic_version[apicid] = version; - if (num_processors >= NR_CPUS) { - pr_warning("WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); + if (num_processors >= nr_cpu_ids) { + int max = nr_cpu_ids; + int thiscpu = max + disabled_cpus; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i reached." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; return; } num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); + cpu = cpumask_next_zero(-1, cpu_present_mask); physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { @@@ -1878,8 -1892,8 +1880,8 @@@ } #endif - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); } #ifdef CONFIG_X86_64 @@@ -2081,7 -2095,7 +2083,7 @@@ __cpuinit int apic_is_clustered_box(voi bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { /* are we being called early in kernel startup? 
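generic_processor_info() above also swaps the compile-time NR_CPUS bound for the runtime nr_cpu_ids (which possible_cpus= can lower), and replaces the cpus_complement()/first_cpu() dance with cpumask_next_zero() plus the new set_cpu_possible()/set_cpu_present() accessors. Condensed into a stand-alone helper (the function and counter names are made up, the calls are the ones used above):

static int claimed_cpus;

static int claim_next_cpu(void)
{
        int cpu;

        /* runtime limit instead of the compile-time NR_CPUS constant */
        if (claimed_cpus >= nr_cpu_ids)
                return -ENOSPC;

        /* lowest logical id that is not marked present yet */
        cpu = cpumask_next_zero(-1, cpu_present_mask);

        set_cpu_possible(cpu, true);
        set_cpu_present(cpu, true);
        claimed_cpus++;
        return cpu;
}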
*/ if (bios_cpu_apicid) { id = bios_cpu_apicid[i]; diff --combined arch/x86/kernel/cpu/intel_cacheinfo.c index 68b5d8681cb,fb7f946cb65..c6ecda64f5f --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@@ -534,31 -534,16 +534,16 @@@ static void __cpuinit free_cache_attrib per_cpu(cpuid4_info, cpu) = NULL; } - static int __cpuinit detect_cache_attributes(unsigned int cpu) + static void get_cpu_leaves(void *_retval) { - struct _cpuid4_info *this_leaf; - unsigned long j; - int retval; - cpumask_t oldmask; - - if (num_cache_leaves == 0) - return -ENOENT; - - per_cpu(cpuid4_info, cpu) = kzalloc( - sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) - return -ENOMEM; - - oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (retval) - goto out; + int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { + struct _cpuid4_info *this_leaf; this_leaf = CPUID4_INFO_IDX(cpu, j); - retval = cpuid4_cache_lookup(j, this_leaf); - if (unlikely(retval < 0)) { + *retval = cpuid4_cache_lookup(j, this_leaf); + if (unlikely(*retval < 0)) { int i; for (i = 0; i < j; i++) @@@ -567,9 -552,21 +552,21 @@@ } cache_shared_cpu_map_setup(cpu, j); } - set_cpus_allowed_ptr(current, &oldmask); + } + + static int __cpuinit detect_cache_attributes(unsigned int cpu) + { + int retval; + + if (num_cache_leaves == 0) + return -ENOENT; + + per_cpu(cpuid4_info, cpu) = kzalloc( + sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); + if (per_cpu(cpuid4_info, cpu) == NULL) + return -ENOMEM; - out: + smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { kfree(per_cpu(cpuid4_info, cpu)); per_cpu(cpuid4_info, cpu) = NULL; @@@ -626,8 -623,8 +623,8 @@@ static ssize_t show_shared_cpu_map_func cpumask_t *mask = &this_leaf->shared_cpu_map; n = type? 
- cpulist_scnprintf(buf, len-2, *mask): - cpumask_scnprintf(buf, len-2, *mask); + cpulist_scnprintf(buf, len-2, mask) : + cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; buf[n] = '\0'; } @@@ -644,17 -641,20 +641,17 @@@ static inline ssize_t show_shared_cpu_l return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { - switch(this_leaf->eax.split.type) { - case CACHE_TYPE_DATA: +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +{ + switch (this_leaf->eax.split.type) { + case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); - break; - case CACHE_TYPE_INST: + case CACHE_TYPE_INST: return sprintf(buf, "Instruction\n"); - break; - case CACHE_TYPE_UNIFIED: + case CACHE_TYPE_UNIFIED: return sprintf(buf, "Unified\n"); - break; - default: + default: return sprintf(buf, "Unknown\n"); - break; } } diff --combined arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 748c8f9e7a0,a1de80f368f..a5a5e053037 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@@ -83,34 -83,41 +83,41 @@@ static DEFINE_PER_CPU(unsigned char, ba * CPU Initialization */ + struct thresh_restart { + struct threshold_block *b; + int reset; + u16 old_limit; + }; + /* must be called with correct cpu affinity */ - static void threshold_restart_bank(struct threshold_block *b, - int reset, u16 old_limit) + static long threshold_restart_bank(void *_tr) { + struct thresh_restart *tr = _tr; u32 mci_misc_hi, mci_misc_lo; - rdmsr(b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - reset = 1; /* limit cannot be lower than err count */ + if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ - if (reset) { /* reset err count and overflow bit */ + if (tr->reset) { /* reset err count and overflow bit */ mci_misc_hi = (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - b->threshold_limit); - } else if (old_limit) { /* change limit w/o reset */ + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (old_limit - b->threshold_limit); + (tr->old_limit - tr->b->threshold_limit); mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - b->interrupt_enable ? + tr->b->interrupt_enable ? 
(mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(b->address, mci_misc_lo, mci_misc_hi); + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + return 0; } /* cpu init entry point, called from mce.c with preempt off */ @@@ -120,6 -127,7 +127,7 @@@ void __cpuinit mce_amd_feature_init(str unsigned int cpu = smp_processor_id(); u8 lvt_off; u32 low = 0, high = 0, address = 0; + struct thresh_restart tr; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@@ -162,7 -170,10 +170,10 @@@ wrmsr(address, low, high); threshold_defaults.address = address; - threshold_restart_bank(&threshold_defaults, 0, 0); + tr.b = &threshold_defaults; + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); } } } @@@ -237,7 -248,7 +248,7 @@@ asmlinkage void mce_threshold_interrupt } } out: - add_pda(irq_threshold_count, 1); + inc_irq_stat(irq_threshold_count); irq_exit(); } @@@ -251,20 -262,6 +262,6 @@@ struct threshold_attr ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; - static void affinity_set(unsigned int cpu, cpumask_t *oldmask, - cpumask_t *newmask) - { - *oldmask = current->cpus_allowed; - cpus_clear(*newmask); - cpu_set(cpu, *newmask); - set_cpus_allowed_ptr(current, newmask); - } - - static void affinity_restore(const cpumask_t *oldmask) - { - set_cpus_allowed_ptr(current, oldmask); - } - #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ { \ @@@ -277,15 -274,16 +274,16 @@@ static ssize_t store_interrupt_enable(s const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; b->interrupt_enable = !!new; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, 0); - affinity_restore(&oldmask); + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } @@@ -294,8 -292,7 +292,7 @@@ static ssize_t store_threshold_limit(st const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; - u16 old; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; @@@ -303,34 -300,36 +300,36 @@@ new = THRESHOLD_MAX; if (new < 1) new = 1; - old = b->threshold_limit; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; + tr.b = b; + tr.reset = 0; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, old); - affinity_restore(&oldmask); + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } - static ssize_t show_error_count(struct threshold_block *b, char *buf) + static long local_error_count(void *_b) { - u32 high, low; - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); + struct threshold_block *b = _b; + u32 low, high; + rdmsr(b->address, low, high); - affinity_restore(&oldmask); - return sprintf(buf, "%x\n", - (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); + return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); + } + + static ssize_t show_error_count(struct threshold_block *b, char *buf) + { + return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); } static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 1, 0); - 
affinity_restore(&oldmask); + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return 1; } @@@ -463,12 -462,19 +462,19 @@@ out_free return err; } + static long local_allocate_threshold_blocks(void *_bank) + { + unsigned int *bank = _bank; + + return allocate_threshold_blocks(smp_processor_id(), *bank, 0, + MSR_IA32_MC0_MISC + *bank * 4); + } + /* symlinks sibling shared banks to first core. first core owns dir/files. */ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - cpumask_t oldmask, newmask; char name[32]; sprintf(name, "threshold_bank%i", bank); @@@ -519,11 -525,7 +525,7 @@@ per_cpu(threshold_banks, cpu)[bank] = b; - affinity_set(cpu, &oldmask, &newmask); - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(&oldmask); - + err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); if (err) goto out_free; diff --combined arch/x86/kernel/genx2apic_uv_x.c index dece1728973,0e88be11227..b193e082f6c --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@@ -10,7 -10,6 +10,7 @@@ #include #include +#include #include #include #include @@@ -18,9 -17,6 +18,9 @@@ #include #include #include +#include +#include +#include #include #include #include @@@ -79,16 -75,15 +79,15 @@@ EXPORT_SYMBOL(sn_rtc_cycles_per_second) /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - static cpumask_t uv_target_cpus(void) + static const struct cpumask *uv_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } - static cpumask_t uv_vector_allocation_domain(int cpu) + static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) @@@ -127,28 -122,37 +126,37 @@@ static void uv_send_IPI_one(int cpu, in uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } - static void uv_send_IPI_mask(cpumask_t mask, int vector) + static void uv_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned int cpu; - for_each_possible_cpu(cpu) - if (cpu_isset(cpu, mask)) + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); + } + + static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) + { + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); + + for_each_cpu(cpu, mask) + if (cpu != this_cpu) uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - uv_send_IPI_mask(mask, vector); + for_each_online_cpu(cpu) + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_all(int vector) { - uv_send_IPI_mask(cpu_online_map, vector); + uv_send_IPI_mask(cpu_online_mask, vector); } static int uv_apic_id_registered(void) @@@ -160,7 -164,7 +168,7 @@@ static void uv_init_apic_ldr(void { } - static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) + static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@@ -168,13 -172,30 +176,30 @@@ * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. 
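The mce_amd_64.c changes above are the clearest example of the work_on_cpu() idiom this merge introduces: instead of temporarily rewriting current->cpus_allowed (the old affinity_set()/affinity_restore() helpers), the per-CPU MSR access is packed into a long (*fn)(void *) callback plus an argument struct and executed on the target CPU, with the long return value carried back to the caller. Reduced to a skeleton (the struct and MSR choice are hypothetical, the work_on_cpu() usage mirrors the code above):

struct my_msr_args {
        u32 msr;
        u32 lo, hi;
};

/* runs on the CPU given to work_on_cpu(), so rdmsr() hits that core */
static long read_my_msr(void *_args)
{
        struct my_msr_args *a = _args;

        rdmsr(a->msr, a->lo, a->hi);
        return 0;
}

static long read_msr_on(unsigned int cpu, struct my_msr_args *a)
{
        /* may sleep: the work is queued to and completed by that CPU */
        return work_on_cpu(cpu, read_my_msr, a);
}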
*/ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } + static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) + { + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; + } + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@@ -222,8 -243,10 +247,10 @@@ struct genapic apic_x2apic_uv_x = .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@@ -359,103 -382,6 +386,103 @@@ static __init void uv_rtc_init(void sn_rtc_cycles_per_second = ticks_per_sec; } +/* + * percpu heartbeat timer + */ +static void uv_heartbeat(unsigned long ignored) +{ + struct timer_list *timer = &uv_hub_info->scir.timer; + unsigned char bits = uv_hub_info->scir.state; + + /* flip heartbeat bit */ + bits ^= SCIR_CPU_HEARTBEAT; + + /* is this cpu idle? */ + if (idle_cpu(raw_smp_processor_id())) + bits &= ~SCIR_CPU_ACTIVITY; + else + bits |= SCIR_CPU_ACTIVITY; + + /* update system controller interface reg */ + uv_set_scir_bits(bits); + + /* enable next timer period */ + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); +} + +static void __cpuinit uv_heartbeat_enable(int cpu) +{ + if (!uv_cpu_hub_info(cpu)->scir.enabled) { + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); + setup_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; + } + + /* check boot cpu */ + if (!uv_cpu_hub_info(0)->scir.enabled) + uv_heartbeat_enable(0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __cpuinit uv_heartbeat_disable(int cpu) +{ + if (uv_cpu_hub_info(cpu)->scir.enabled) { + uv_cpu_hub_info(cpu)->scir.enabled = 0; + del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + } + uv_set_cpu_scir_bits(cpu, 0xff); +} + +/* + * cpu hotplug notifier + */ +static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + uv_heartbeat_enable(cpu); + break; + case CPU_DOWN_PREPARE: + uv_heartbeat_disable(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static __init void uv_scir_register_cpu_notifier(void) +{ + hotcpu_notifier(uv_scir_cpu_notify, 0); +} + +#else /* !CONFIG_HOTPLUG_CPU */ + +static __init void uv_scir_register_cpu_notifier(void) +{ +} + +static __init int uv_init_heartbeat(void) +{ + int cpu; + + if (is_uv_system()) + for_each_online_cpu(cpu) + uv_heartbeat_enable(cpu); + return 0; +} + +late_initcall(uv_init_heartbeat); + +#endif /* !CONFIG_HOTPLUG_CPU */ + /* * Called on each cpu to initialize the per_cpu UV data area. 
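uv_heartbeat_enable() above is a textbook per-CPU timer that has to cope with CPU hotplug: setup_timer() plus add_timer_on() pins a timer_list to one CPU, the handler re-arms itself with mod_timer(), and a hotcpu_notifier() tears the timer down in CPU_DOWN_PREPARE and recreates it on CPU_ONLINE. With the UV SCIR details stripped out, the skeleton looks like this (names are hypothetical, the calls are the ones used above):

static DEFINE_PER_CPU(struct timer_list, my_timer);

static void my_beat(unsigned long cpu)
{
        /* ... poke the per-CPU hardware here ... */
        mod_timer(&per_cpu(my_timer, cpu), jiffies + HZ);
}

static void __cpuinit my_beat_enable(int cpu)
{
        struct timer_list *timer = &per_cpu(my_timer, cpu);

        setup_timer(timer, my_beat, cpu);
        timer->expires = jiffies + HZ;
        add_timer_on(timer, cpu);       /* the timer fires on that CPU only */
}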
* ZZZ hotplug not supported yet @@@ -529,7 -455,7 +556,7 @@@ void __init uv_system_init(void uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &uv_coherency_id, &uv_region_size); + &sn_coherency_id, &sn_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@@ -540,7 -466,8 +567,7 @@@ uv_blade_info[blade].nr_possible_cpus++; uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = - lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; @@@ -550,8 -477,7 +577,8 @@@ uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; + uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@@ -568,6 -494,4 +595,6 @@@ map_mmioh_high(max_pnode); uv_cpu_init(); + uv_scir_register_cpu_notifier(); + proc_mkdir("sgi_uv", NULL); } diff --combined arch/x86/kernel/hpet.c index 845ea097383,e76d7e27297..cd759ad9069 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@@ -33,9 -33,7 +33,9 @@@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; -unsigned long hpet_num_timers; +#ifdef CONFIG_PCI_MSI +static unsigned long hpet_num_timers; +#endif static void __iomem *hpet_virt_address; struct hpet_dev { @@@ -248,7 -246,7 +248,7 @@@ static void hpet_legacy_clockevent_regi * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; printk(KERN_DEBUG "hpet clockevent registered\n"); @@@ -303,7 -301,7 +303,7 @@@ static void hpet_set_mode(enum clock_ev struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); hpet_setup_msi_irq(hdev->irq); disable_irq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } break; @@@ -451,7 -449,7 +451,7 @@@ static int hpet_setup_irq(struct hpet_d return -1; disable_irq(dev->irq); - irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); enable_irq(dev->irq); printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", @@@ -502,7 -500,7 +502,7 @@@ static void init_one_hpet_msi_clockeven /* 5 usec minimum reprogramming delta. 
*/ evt->min_delta_ns = 5000; - evt->cpumask = cpumask_of_cpu(hdev->cpu); + evt->cpumask = cpumask_of(hdev->cpu); clockevents_register_device(evt); } @@@ -813,7 -811,7 +813,7 @@@ int __init hpet_enable(void out_nohpet: hpet_clear_mapping(); - boot_hpet_disable = 1; + hpet_address = 0; return 0; } @@@ -836,11 -834,10 +836,11 @@@ static __init int hpet_late_init(void hpet_address = force_hpet_address; hpet_enable(); - if (!hpet_virt_address) - return -ENODEV; } + if (!hpet_virt_address) + return -ENODEV; + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); for_each_online_cpu(cpu) { diff --combined arch/x86/kernel/io_apic.c index 74917658b00,1cbf7c8d46e..62ecfc991e1 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@@ -136,8 -136,8 +136,8 @@@ static struct irq_pin_list *get_one_fre struct irq_cfg { struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; + cpumask_var_t domain; + cpumask_var_t old_domain; unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; @@@ -152,25 -152,25 +152,25 @@@ static struct irq_cfg irq_cfgx[] = #else static struct irq_cfg irq_cfgx[NR_IRQS] = { #endif - [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, + [0] = { .vector = IRQ0_VECTOR, }, + [1] = { .vector = IRQ1_VECTOR, }, + [2] = { .vector = IRQ2_VECTOR, }, + [3] = { .vector = IRQ3_VECTOR, }, + [4] = { .vector = IRQ4_VECTOR, }, + [5] = { .vector = IRQ5_VECTOR, }, + [6] = { .vector = IRQ6_VECTOR, }, + [7] = { .vector = IRQ7_VECTOR, }, + [8] = { .vector = IRQ8_VECTOR, }, + [9] = { .vector = IRQ9_VECTOR, }, + [10] = { .vector = IRQ10_VECTOR, }, + [11] = { .vector = IRQ11_VECTOR, }, + [12] = { .vector = IRQ12_VECTOR, }, + [13] = { .vector = IRQ13_VECTOR, }, + [14] = { .vector = IRQ14_VECTOR, }, + [15] = { .vector = IRQ15_VECTOR, }, }; -void __init arch_early_irq_init(void) +int __init arch_early_irq_init(void) { struct irq_cfg *cfg; struct irq_desc *desc; @@@ -183,9 -183,11 +183,13 @@@ for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; + alloc_bootmem_cpumask_var(&cfg[i].domain); + alloc_bootmem_cpumask_var(&cfg[i].old_domain); + if (i < NR_IRQS_LEGACY) + cpumask_setall(cfg[i].domain); } + + return 0; } #ifdef CONFIG_SPARSE_IRQ @@@ -209,12 -211,26 +213,26 @@@ static struct irq_cfg *get_one_free_irq node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + if (cfg) { + /* FIXME: needs alloc_cpumask_var_node() */ + if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) { + kfree(cfg); + cfg = NULL; + } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) { + free_cpumask_var(cfg->domain); + kfree(cfg); + cfg = NULL; + } 
else { + cpumask_clear(cfg->domain); + cpumask_clear(cfg->old_domain); + } + } printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node); return cfg; } -void arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int cpu) { struct irq_cfg *cfg; @@@ -226,8 -242,6 +244,8 @@@ BUG_ON(1); } } + + return 0; } #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC @@@ -333,13 -347,14 +351,14 @@@ void arch_free_chip_data(struct irq_des } } - static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) + static void + set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg = desc->chip_data; if (!cfg->move_in_progress) { /* it means that domain is not changed */ - if (!cpus_intersects(desc->affinity, mask)) + if (!cpumask_intersects(&desc->affinity, mask)) cfg->move_desc_pending = 1; } } @@@ -354,7 -369,8 +373,8 @@@ static struct irq_cfg *irq_cfg(unsigne #endif #ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC - static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) + static inline void + set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) { } #endif @@@ -485,6 -501,26 +505,26 @@@ static void ioapic_mask_entry(int apic } #ifdef CONFIG_SMP + static void send_cleanup_vector(struct irq_cfg *cfg) + { + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; + } + static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; @@@ -520,41 -556,55 +560,55 @@@ } } - static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask); + static int + assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask) + /* + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid + * of that, or returns BAD_APICID and leaves desc->affinity untouched. + */ + static unsigned int + set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; irq = desc->irq; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; + return BAD_APICID; + cpumask_and(&desc->affinity, cfg->domain, mask); set_extra_move_desc(desc, mask); + return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); + } - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. 
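The io_apic.c rework above and below leans on the new cpumask_var_t type: with MAXSMP/CPUMASK_OFFSTACK=y it is a pointer that must be obtained with alloc_cpumask_var() (or alloc_bootmem_cpumask_var() at early boot, as arch_early_irq_init() above does) and released with free_cpumask_var(); without CPUMASK_OFFSTACK the same calls degenerate to a plain on-stack bitmap and a no-op. The usual life cycle of a temporary mask, as __assign_irq_vector() uses further down, is (sketch; the per-CPU action is hypothetical):

static int visit_online_subset(const struct cpumask *requested)
{
        cpumask_var_t tmp_mask;
        int cpu;

        if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
                return -ENOMEM; /* can only fail with CPUMASK_OFFSTACK=y */

        cpumask_and(tmp_mask, requested, cpu_online_mask);
        for_each_cpu(cpu, tmp_mask)
                do_something_on(cpu);   /* hypothetical per-CPU action */

        free_cpumask_var(tmp_mask);
        return 0;
}

send_cleanup_vector() above shows the defensive variant that still makes progress when the GFP_ATOMIC allocation fails.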
- */ - dest = SET_APIC_LOGICAL_ID(dest); + static void + set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) + { + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg); - desc->affinity = mask; + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } spin_unlock_irqrestore(&ioapic_lock, flags); } - static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) + static void + set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc; @@@ -1222,7 -1272,8 +1276,8 @@@ void unlock_vector_lock(void spin_unlock(&vector_lock); } - static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask) + static int + __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@@ -1237,49 -1288,49 +1292,49 @@@ */ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; unsigned int old_vector; - int cpu; + int cpu, err; + cpumask_var_t tmp_mask; if ((cfg->move_in_progress) || cfg->move_cleanup_count) return -EBUSY; - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; old_vector = cfg->vector; if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); return 0; + } } - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { int new_cpu; int vector, offset; - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); + vector_allocation_domain(cpu, tmp_mask); vector = current_vector; offset = current_offset; next: vector += 8; if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ + /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; - #ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; - #else - if (vector == SYSCALL_VECTOR) + + if (test_bit(vector, used_vectors)) goto next; - #endif - for_each_cpu_mask_nr(new_cpu, new_mask) + + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! 
*/ @@@ -1287,18 -1338,21 +1342,21 @@@ current_offset = offset; if (old_vector) { cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; + cpumask_copy(cfg->old_domain, cfg->domain); } - for_each_cpu_mask_nr(new_cpu, new_mask) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; - cfg->domain = domain; - return 0; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; } - return -ENOSPC; + free_cpumask_var(tmp_mask); + return err; } - static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask) + static int + assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@@ -1311,23 -1365,20 +1369,20 @@@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) { - cpumask_t mask; int cpu, vector; BUG_ON(!cfg->vector); vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cpus_clear(cfg->domain); + cpumask_clear(cfg->domain); if (likely(!cfg->move_in_progress)) return; - cpus_and(mask, cfg->old_domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@@ -1349,8 -1400,10 +1404,8 @@@ void __setup_vector_irq(int cpu /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { - if (!desc) - continue; cfg = desc->chip_data; - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; @@@ -1362,7 -1415,7 +1417,7 @@@ continue; cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@@ -1498,18 -1551,17 +1553,17 @@@ static void setup_IO_APIC_irq(int apic { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - cpumask_t mask; + unsigned int dest; if (!IO_APIC_IRQ(irq)) return; cfg = desc->chip_data; - mask = TARGET_CPUS; - if (assign_irq_vector(irq, cfg, mask)) + if (assign_irq_vector(irq, cfg, TARGET_CPUS)) return; - cpus_and(mask, cfg->domain, mask); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@@ -1519,8 -1571,7 +1573,7 @@@ if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { + dest, trigger, polarity, cfg->vector)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic].mp_apicid, pin); __clear_irq_vector(irq, cfg); @@@ -1732,6 -1783,8 +1785,6 @@@ __apicdebuginit(void) print_IO_APIC(voi for_each_irq_desc(irq, desc) { struct irq_pin_list *entry; - if (!desc) - continue; cfg = desc->chip_data; entry = cfg->irq_2_pin; if (!entry) @@@ -2240,7 -2293,7 +2293,7 @@@ static int ioapic_retrigger_irq(unsigne unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@@ -2289,18 -2342,17 +2342,17 @@@ static DECLARE_DELAYED_WORK(ir_migratio * as simple as edge triggered migration and we can do the irq migration * with a simple atomic update to IO-APIC RTE. 
*/ - static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask) + static void + migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - cpumask_t tmp, cleanup_mask; struct irte irte; int modify_ioapic_rte; unsigned int dest; unsigned long flags; unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpumask_intersects(mask, cpu_online_mask)) return; irq = desc->irq; @@@ -2313,8 -2365,7 +2365,7 @@@ set_extra_move_desc(desc, mask); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, mask); modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { @@@ -2331,14 -2382,10 +2382,10 @@@ */ modify_irte(irq, &irte); - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (cfg->move_in_progress) + send_cleanup_vector(cfg); - desc->affinity = mask; + cpumask_copy(&desc->affinity, mask); } static int migrate_irq_remapped_level_desc(struct irq_desc *desc) @@@ -2360,11 -2407,11 +2407,11 @@@ } /* everthing is clear. we have right of way */ - migrate_ioapic_irq_desc(desc, desc->pending_mask); + migrate_ioapic_irq_desc(desc, &desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); unmask: unmask_IO_APIC_irq_desc(desc); @@@ -2378,6 -2425,9 +2425,6 @@@ static void ir_irq_migration(struct wor struct irq_desc *desc; for_each_irq_desc(irq, desc) { - if (!desc) - continue; - if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@@ -2389,7 -2439,7 +2436,7 @@@ continue; } - desc->chip->set_affinity(irq, desc->pending_mask); + desc->chip->set_affinity(irq, &desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@@ -2398,18 -2448,20 +2445,20 @@@ /* * Migrates the IRQ destination in the process context. 
*/ - static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask) + static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) { if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; + cpumask_copy(&desc->pending_mask, mask); migrate_irq_remapped_level_desc(desc); return; } migrate_ioapic_irq_desc(desc, mask); } - static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) + static void set_ir_ioapic_affinity_irq(unsigned int irq, + const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@@ -2420,9 -2472,10 +2469,9 @@@ asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); @@@ -2444,7 -2497,7 +2493,7 @@@ if (!cfg->move_cleanup_count) goto unlock; - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; __get_cpu_var(vector_irq)[vector] = -1; @@@ -2467,7 -2520,7 +2516,7 @@@ static void irq_complete_move(struct ir if (likely(!cfg->move_desc_pending)) return; - /* domain is not change, but affinity is changed */ + /* domain has not changed, but affinity did */ me = smp_processor_id(); if (cpu_isset(me, desc->affinity)) { *descp = desc = move_irq_desc(desc, me); @@@ -2481,20 -2534,14 +2530,14 @@@ vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC *descp = desc = move_irq_desc(desc, me); /* get the new one */ cfg = desc->chip_data; #endif - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); } #else static inline void irq_complete_move(struct irq_desc **descp) {} @@@ -2667,6 -2714,9 +2710,6 @@@ static inline void init_IO_APIC_traps(v * 0x80, because int 0x80 is hm, kind of importantish. 
;) */ for_each_irq_desc(irq, desc) { - if (!desc) - continue; - cfg = desc->chip_data; if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* @@@ -3216,16 -3266,13 +3259,13 @@@ static int msi_compose_msg(struct pci_d struct irq_cfg *cfg; int err; unsigned dest; - cpumask_t tmp; cfg = irq_cfg(irq); - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, cfg, tmp); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (err) return err; - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { @@@ -3279,26 -3326,18 +3319,18 @@@ } #ifdef CONFIG_SMP - static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) + static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); read_msi_msg_desc(desc, &msg); @@@ -3308,37 -3347,27 +3340,27 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); - desc->affinity = mask; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ - static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) + static void + ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = desc->chip_data; unsigned int dest; - cpumask_t tmp, cleanup_mask; struct irte irte; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - if (get_irte(irq, &irte)) return; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@@ -3352,14 -3381,8 +3374,8 @@@ * at the new destination. So, time to cleanup the previous * vector allocation. 
*/ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc->affinity = mask; + if (cfg->move_in_progress) + send_cleanup_vector(cfg); } #endif @@@ -3550,26 -3573,18 +3566,18 @@@ void arch_teardown_msi_irq(unsigned in #ifdef CONFIG_DMAR #ifdef CONFIG_SMP - static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) + static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); dmar_msi_read(irq, &msg); @@@ -3579,7 -3594,6 +3587,6 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - desc->affinity = mask; } #endif /* CONFIG_SMP */ @@@ -3613,26 -3627,18 +3620,18 @@@ int arch_setup_dmar_msi(unsigned int ir #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP - static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) + static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); hpet_msi_read(irq, &msg); @@@ -3642,7 -3648,6 +3641,6 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); - desc->affinity = mask; } #endif /* CONFIG_SMP */ @@@ -3697,28 -3702,19 +3695,19 @@@ static void target_ht_irq(unsigned int write_ht_irq_msg(irq, &msg); } - static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) + static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - desc->affinity = mask; } #endif @@@ -3738,17 -3734,14 +3727,14 @@@ int arch_setup_ht_irq(unsigned int irq { struct irq_cfg *cfg; int err; - cpumask_t tmp; cfg = irq_cfg(irq); - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, cfg, tmp); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (!err) { struct ht_irq_msg msg; unsigned dest; - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@@ -3784,7 -3777,7 +3770,7 @@@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset) { - const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + const struct cpumask *eligible_cpu 
= cpumask_of(cpu); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@@ -3794,7 -3787,7 +3780,7 @@@ cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, *eligible_cpu); + err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; @@@ -3813,7 -3806,7 +3799,7 @@@ entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(*eligible_cpu); + entry->dest = cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@@ -4024,7 -4017,7 +4010,7 @@@ void __init setup_ioapic_dest(void int pin, ioapic, irq, irq_entry; struct irq_desc *desc; struct irq_cfg *cfg; - cpumask_t mask; + const struct cpumask *mask; if (skip_ioapic_setup == 1) return; @@@ -4055,7 -4048,7 +4041,7 @@@ */ if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = &desc->affinity; else mask = TARGET_CPUS; diff --combined arch/x86/kernel/irq_64.c index a174a217eb1,fca2991443f..6383d50f82e --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@@ -13,12 -13,12 +13,12 @@@ #include #include #include +#include #include #include #include #include -#ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: * @@@ -28,25 -28,26 +28,25 @@@ */ static inline void stack_overflow_check(struct pt_regs *regs) { +#ifdef CONFIG_DEBUG_STACKOVERFLOW u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = -60*HZ; - - if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + 128 && - time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); - show_stack(NULL,NULL); - warned = jiffies; - } -} + + WARN_ONCE(regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128, + + "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); #endif +} /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; @@@ -59,7 -60,9 +59,7 @@@ irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; -#ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); -#endif desc = irq_to_desc(irq); if (likely(desc)) @@@ -80,16 -83,17 +80,17 @@@ } #ifdef CONFIG_HOTPLUG_CPU - void fixup_irqs(cpumask_t map) + /* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
*/ + void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + const struct cpumask *affinity; if (!desc) continue; @@@ -99,23 -103,23 +100,23 @@@ /* interrupt's are disabled at this point */ spin_lock(&desc->lock); + affinity = &desc->affinity; if (!irq_has_action(irq) || - cpus_equal(desc->affinity, map)) { + cpumask_equal(affinity, cpu_online_mask)) { spin_unlock(&desc->lock); continue; } - cpus_and(mask, desc->affinity, map); - if (cpus_empty(mask)) { + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - mask = map; + affinity = cpu_all_mask; } if (desc->chip->mask) desc->chip->mask(irq); if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, affinity); else if (!(warned++)) set_affinity = 0; diff --combined arch/x86/kernel/irqinit_32.c index 203384ed2b5,61aa2a1004b..84723295f88 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@@ -110,6 -110,18 +110,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; + int vector_used_by_percpu_irq(unsigned int vector) + { + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; + } + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@@ -128,7 -140,7 +140,7 @@@ void __init native_init_IRQ(void for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i]); + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } @@@ -146,10 -158,12 +158,12 @@@ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --combined arch/x86/kernel/irqinit_64.c index 6190e6ef546,1020919efe1..31ebfe38e96 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@@ -23,6 -23,41 +23,6 @@@ #include #include -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. 
- */ - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.text\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt\n" \ - ".previous"); - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x30-0x3f) @@@ -38,6 -73,37 +38,6 @@@ * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - /* * IRQ2 is cascade interrupt to second interrupt controller @@@ -69,6 -135,18 +69,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; + int vector_used_by_percpu_irq(unsigned int vector) + { + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; + } + void __init init_ISA_irqs(void) { int i; @@@ -121,6 -199,7 +133,7 @@@ static void __init smp_intr_init(void /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif } diff --combined arch/x86/kernel/reboot.c index 72e0e4e712d,ba7b9a0e606..39643b1df06 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@@ -12,7 -12,6 +12,7 @@@ #include #include #include +#include #ifdef CONFIG_X86_32 # include @@@ -40,12 -39,6 +40,12 @@@ int reboot_force static int reboot_cpu = -1; #endif +/* This is set if we need to go through the 'emergency' path. 
+ * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ bool port_cf9_safe = false; @@@ -375,48 -368,6 +375,48 @@@ static inline void kb_wait(void } } +static void vmxoff_nmi(int cpu, struct die_args *args) +{ + cpu_emergency_vmxoff(); +} + +/* Use NMIs as IPIs to tell all CPUs to disable virtualization + */ +static void emergency_vmx_disable_all(void) +{ + /* Just make sure we won't change CPUs while doing this */ + local_irq_disable(); + + /* We need to disable VMX on all CPUs before rebooting, otherwise + * we risk hanging up the machine, because the CPU ignore INIT + * signals when VMX is enabled. + * + * We can't take any locks and we may be on an inconsistent + * state, so we use NMIs as IPIs to tell the other CPUs to disable + * VMX and halt. + * + * For safety, we will avoid running the nmi_shootdown_cpus() + * stuff unnecessarily, but we don't have a way to check + * if other CPUs have VMX enabled. So we will call it only if the + * CPU we are running on has VMX enabled. + * + * We will miss cases where VMX is not enabled on all CPUs. This + * shouldn't do much harm because KVM always enable VMX on all + * CPUs anyway. But we can miss it on the small window where KVM + * is still enabling VMX. + */ + if (cpu_has_vmx() && cpu_vmx_enabled()) { + /* Disable VMX on this CPU. + */ + cpu_vmxoff(); + + /* Halt and disable VMX on the other CPUs */ + nmi_shootdown_cpus(vmxoff_nmi); + + } +} + + void __attribute__((weak)) mach_reboot_fixups(void) { } @@@ -425,9 -376,6 +425,9 @@@ static void native_machine_emergency_re { int i; + if (reboot_emergency) + emergency_vmx_disable_all(); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@@ -534,19 -482,13 +534,19 @@@ void native_machine_shutdown(void #endif } +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + static void native_machine_restart(char *__unused) { printk("machine restart\n"); if (!reboot_force) machine_shutdown(); - machine_emergency_restart(); + __machine_emergency_restart(0); } static void native_machine_halt(void) @@@ -590,7 -532,7 +590,7 @@@ void machine_shutdown(void void machine_emergency_restart(void) { - machine_ops.emergency_restart(); + __machine_emergency_restart(1); } void machine_restart(char *cmd) @@@ -650,10 -592,7 +650,7 @@@ static int crash_nmi_callback(struct no static void smp_send_nmi_allbutself(void) { - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); + send_IPI_allbutself(NMI_VECTOR); } static struct notifier_block crash_nmi_nb = { diff --combined arch/x86/kernel/smp.c index 7e558db362c,49ed667b06f..beea2649a24 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@@ -118,22 -118,22 +118,22 @@@ static void native_smp_send_reschedule( WARN_ON(1); return; } - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); } - void native_send_call_func_ipi(cpumask_t mask) + void native_send_call_func_ipi(const struct cpumask *mask) { cpumask_t allbutself; allbutself = cpu_online_map; 
cpu_clear(smp_processor_id(), allbutself); - if (cpus_equal(mask, allbutself) && + if (cpus_equal(*mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else @@@ -165,7 -165,11 +165,7 @@@ static void native_smp_send_stop(void void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_resched_count++; -#else - add_pda(irq_resched_count, 1); -#endif + inc_irq_stat(irq_resched_count); } void smp_call_function_interrupt(struct pt_regs *regs) @@@ -173,7 -177,11 +173,7 @@@ ack_APIC_irq(); irq_enter(); generic_smp_call_function_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } @@@ -182,7 -190,11 +182,7 @@@ void smp_call_function_single_interrupt ack_APIC_irq(); irq_enter(); generic_smp_call_function_single_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } diff --combined arch/x86/kernel/smpboot.c index f8500c96944,1a9941b1115..31869bf5fab --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@@ -102,14 -102,8 +102,8 @@@ EXPORT_SYMBOL(smp_num_siblings) /* Last level cache ID of each logical CPU */ DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; - /* bitmap of online cpus */ - cpumask_t cpu_online_map __read_mostly; - EXPORT_SYMBOL(cpu_online_map); - cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; - cpumask_t cpu_possible_map; - EXPORT_SYMBOL(cpu_possible_map); /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); @@@ -288,7 -282,7 +282,7 @@@ static int __cpuinitdata unsafe_smp /* * Activate a secondary processor. */ -static void __cpuinit start_secondary(void *unused) +notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too @@@ -1081,10 -1075,8 +1075,10 @@@ static int __init smp_sanity_check(unsi #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING "weird, boot CPU (#%d) not listed" - "by the BIOS.\n", hard_smp_processor_id()); + printk(KERN_WARNING + "weird, boot CPU (#%d) not listed by the BIOS.\n", + hard_smp_processor_id()); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); } @@@ -1260,6 -1252,15 +1254,15 @@@ void __init native_smp_cpus_done(unsign check_nmi_watchdog(); } + static int __initdata setup_possible_cpus = -1; + static int __init _setup_possible_cpus(char *str) + { + get_option(&str, &setup_possible_cpus); + return 0; + } + early_param("possible_cpus", _setup_possible_cpus); + + /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@@ -1272,7 -1273,7 +1275,7 @@@ * * Three ways to find out the number of additional hotplug CPUs: * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with additional_cpus=NUM + * - The user can overwrite it with possible_cpus=NUM * - Otherwise don't reserve additional CPUs. * We do this because additional CPUs waste a lot of memory. 
* -AK @@@ -1285,9 -1286,17 +1288,17 @@@ __init void prefill_possible_map(void if (!num_processors) num_processors = 1; - possible = num_processors + disabled_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; + if (setup_possible_cpus == -1) + possible = num_processors + disabled_cpus; + else + possible = setup_possible_cpus; + + if (possible > CONFIG_NR_CPUS) { + printk(KERN_WARNING + "%d Processors exceeds NR_CPUS limit of %d\n", + possible, CONFIG_NR_CPUS); + possible = CONFIG_NR_CPUS; + } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); @@@ -1352,7 -1361,7 +1363,7 @@@ void cpu_disable_common(void lock_vector_lock(); remove_cpu_from_maps(cpu); unlock_vector_lock(); - fixup_irqs(cpu_online_map); + fixup_irqs(); } int native_cpu_disable(void) diff --combined arch/x86/kernel/tlb_32.c index 8da059f949b,174ea90d1cb..ce505464224 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@@ -34,8 -34,9 +34,8 @@@ static DEFINE_SPINLOCK(tlbstate_lock) */ void leave_mm(int cpu) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@@ -103,8 -104,8 +103,8 @@@ void smp_invalidate_interrupt(struct pt * BUG(); */ - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@@ -118,7 -119,7 +118,7 @@@ smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@@ -163,7 -164,7 +163,7 @@@ * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ @@@ -237,7 -238,7 +237,7 @@@ static void do_flush_tlb_all(void *info unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } diff --combined arch/x86/kernel/tlb_64.c index 29887d7081a,de6f1bda0c5..f8be6f1d2e4 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@@ -154,7 -154,7 +154,7 @@@ asmlinkage void smp_invalidate_interrup out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@@ -191,7 -191,7 +191,7 @@@ * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); while (!cpus_empty(f->flush_cpumask)) cpu_relax(); diff --combined arch/x86/kernel/traps.c index 141907ab6e2,4a6dff39a47..2d1f4c7e405 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@@ -72,9 -72,6 +72,6 @@@ #include "cpu/mcheck/mce.h" - DECLARE_BITMAP(used_vectors, NR_VECTORS); - EXPORT_SYMBOL_GPL(used_vectors); - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? 
*/ @@@ -89,6 -86,9 +86,9 @@@ gate_desc idt_table[256 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif + DECLARE_BITMAP(used_vectors, NR_VECTORS); + EXPORT_SYMBOL_GPL(used_vectors); + static int ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) @@@ -481,7 -481,11 +481,7 @@@ do_nmi(struct pt_regs *regs, long error { nmi_enter(); -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif + inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); @@@ -660,7 -664,7 +660,7 @@@ void math_error(void __user *ip { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. @@@ -671,6 -675,7 +671,6 @@@ task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@@ -684,31 -689,34 +684,31 @@@ */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ + + err = swd & ~cwd & 0x3f; + #ifdef CONFIG_X86_32 + if (!err) return; #endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */ } force_sig_info(SIGFPE, &info, task); } @@@ -941,9 -949,7 +941,7 @@@ dotraplinkage void do_iret_error(struc void __init trap_init(void) { - #ifdef CONFIG_X86_32 int i; - #endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@@ -1000,11 -1006,15 +998,15 @@@ } set_system_trap_gate(SYSCALL_VECTOR, &system_call); + #endif /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); + #ifdef CONFIG_X86_64 + set_bit(IA32_SYSCALL_VECTOR, used_vectors); + #else set_bit(SYSCALL_VECTOR, used_vectors); #endif /* diff --combined arch/x86/lguest/boot.c index 50a779264bb,104c8220a38..a7ed208f81e --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@@ -590,8 -590,7 +590,8 @@@ static void __init lguest_init_IRQ(void * a straightforward 1 to 1 mapping, so force that here. */ __get_cpu_var(vector_irq)[vector] = i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[vector]); + set_intr_gate(vector, + interrupt[vector-FIRST_EXTERNAL_VECTOR]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); @@@ -738,7 -737,7 +738,7 @@@ static void lguest_time_init(void /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. 
*/ - lguest_clockevent.cpumask = cpumask_of_cpu(0); + lguest_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&lguest_clockevent); /* Finally, we unblock the timer interrupt. */ diff --combined arch/x86/xen/mmu.c index 773d68d3e91,e59e53b11e2..503c240e26c --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@@ -154,13 -154,13 +154,13 @@@ void xen_setup_mfn_list_list(void { unsigned pfn, idx; - for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@@ -179,7 -179,7 +179,7 @@@ void __init xen_build_dynamic_phys_to_m unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top[topidx] = &mfn_list[pfn]; @@@ -207,7 -207,7 +207,7 @@@ static void alloc_p2m(unsigned long **p p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); BUG_ON(p == NULL); - for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; if (cmpxchg(pp, p2m_missing, p) != p2m_missing) @@@ -407,8 -407,7 +407,8 @@@ out preempt_enable(); } -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ return *ptep; @@@ -879,8 -878,7 +879,8 @@@ static void __xen_pgd_pin(struct mm_str if (user_pgd) { xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ @@@ -995,8 -993,7 +995,8 @@@ static void __xen_pgd_unpin(struct mm_s pgd_t *user_pgd = xen_get_user_pgd(pgd); if (user_pgd) { - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } @@@ -1082,7 -1079,7 +1082,7 @@@ static void drop_other_mm_ref(void *inf static void xen_drop_mm_ref(struct mm_struct *mm) { - cpumask_t mask; + cpumask_var_t mask; unsigned cpu; if (current->active_mm == mm) { @@@ -1094,7 -1091,16 +1094,16 @@@ } /* Get the "official" set of cpus referring to our pagetable. */ - mask = mm->cpu_vm_mask; + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, &mm->cpu_vm_mask); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed @@@ -1103,11 -1109,12 +1112,12 @@@ if needed. 
*/ for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, mask); } - if (!cpus_empty(mask)) - smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); } #else static void xen_drop_mm_ref(struct mm_struct *mm) diff --combined drivers/xen/events.c index e26733a9df2,6c8193046e0..eb0dfdeaa94 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@@ -142,6 -142,9 +142,6 @@@ static void init_evtchn_cpu_bindings(vo /* By default all event channels notify CPU#0. */ for_each_irq_desc(i, desc) { - if (!desc) - continue; - desc->affinity = cpumask_of_cpu(0); } #endif @@@ -230,7 -233,6 +230,7 @@@ static void unmask_evtchn(int port static int find_unbound_irq(void) { int irq; + struct irq_desc *desc; /* Only allocate from dynirq range */ for (irq = 0; irq < nr_irqs; irq++) @@@ -240,10 -242,6 +240,10 @@@ if (irq == nr_irqs) panic("No available IRQ to bind to: increase nr_irqs!\n"); + desc = irq_to_desc_alloc_cpu(irq, 0); + if (WARN_ON(desc == NULL)) + return -1; + return irq; } @@@ -585,7 -583,7 +585,7 @@@ void rebind_evtchn_irq(int evtchn, int spin_unlock(&irq_mapping_update_lock); /* new event channels are always bound to cpu 0 */ - irq_set_affinity(irq, cpumask_of_cpu(0)); + irq_set_affinity(irq, cpumask_of(0)); /* Unmask the event channel. */ enable_irq(irq); @@@ -614,9 -612,9 +614,9 @@@ static void rebind_irq_to_cpu(unsigned } - static void set_affinity_irq(unsigned irq, cpumask_t dest) + static void set_affinity_irq(unsigned irq, const struct cpumask *dest) { - unsigned tcpu = first_cpu(dest); + unsigned tcpu = cpumask_first(dest); rebind_irq_to_cpu(irq, tcpu); } diff --combined include/linux/interrupt.h index 8cc8ef47f5b,7e85a6e89e4..990355fbc54 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@@ -111,13 -111,13 +111,13 @@@ extern void enable_irq(unsigned int irq extern cpumask_t irq_default_affinity; - extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask); + extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); #else /* CONFIG_SMP */ - static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask) + static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) { return -EINVAL; } @@@ -253,6 -253,9 +253,6 @@@ enu BLOCK_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, -#ifdef CONFIG_HIGH_RES_TIMERS - HRTIMER_SOFTIRQ, -#endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS @@@ -464,10 -467,4 +464,10 @@@ static inline void init_irq_proc(void int show_interrupts(struct seq_file *p, void *v); +struct irq_desc; + +extern int early_irq_init(void); +extern int arch_early_irq_init(void); +extern int arch_init_chip_data(struct irq_desc *desc, int cpu); + #endif diff --combined include/linux/irq.h index d64a6d49bde,fde5e613201..f899b502f18 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@@ -113,7 -113,8 +113,8 @@@ struct irq_chip void (*eoi)(unsigned int irq); void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, cpumask_t dest); + void (*set_affinity)(unsigned int irq, + const struct cpumask *dest); int (*retrigger)(unsigned int irq); int (*set_type)(unsigned int irq, unsigned int flow_type); int (*set_wake)(unsigned int irq, unsigned int on); @@@ -134,9 -135,6 +135,9 @@@ struct irq_2_iommu /** * struct 
irq_desc - interrupt descriptor * @irq: interrupt number for this descriptor + * @timer_rand_state: pointer to timer rand state struct + * @kstat_irqs: irq stats per cpu + * @irq_2_iommu: iommu with this irq * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @chip: low level interrupt hardware access * @msi_desc: MSI descriptor @@@ -148,8 -146,8 +149,8 @@@ * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple set_irq_wake() callers * @irq_count: stats field to detect stalled irqs - * @irqs_unhandled: stats field for spurious unhandled interrupts * @last_unhandled: aging timer for unhandled count + * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@@ -177,8 -175,8 +178,8 @@@ struct irq_desc unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ - unsigned int irqs_unhandled; unsigned long last_unhandled; /* Aging timer for unhandled count */ + unsigned int irqs_unhandled; spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; @@@ -193,23 -191,42 +194,23 @@@ const char *name; } ____cacheline_internodealigned_in_smp; -extern void early_irq_init(void); -extern void arch_early_irq_init(void); -extern void arch_init_chip_data(struct irq_desc *desc, int cpu); extern void arch_init_copy_chip_data(struct irq_desc *old_desc, struct irq_desc *desc, int cpu); extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); #ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; - -static inline struct irq_desc *irq_to_desc(unsigned int irq) -{ - return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; -} -static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) -{ - return irq_to_desc(irq); -} - -#else - -extern struct irq_desc *irq_to_desc(unsigned int irq); -extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +#else /* CONFIG_SPARSE_IRQ */ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); -# define for_each_irq_desc(irq, desc) \ - for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq)) -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq)) - #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) #define kstat_incr_irqs_this_cpu(irqno, DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]++) -#endif +#endif /* CONFIG_SPARSE_IRQ */ + +extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); static inline struct irq_desc * irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) diff --combined include/linux/sched.h index 8395e715809,e5f928a079e..158d53d0776 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -250,7 -250,7 +250,7 @@@ extern void init_idle_bootup_task(struc extern int runqueue_is_locked(void); extern void task_rq_unlock_wait(struct task_struct *p); - extern cpumask_t nohz_cpu_mask; + extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); #else @@@ -571,6 -571,12 +571,6 @@@ struct signal_struct */ struct rlimit rlim[RLIM_NLIMITS]; - /* keep the process-shared keyrings here so that they do the right - * thing in threads created with CLONE_THREAD */ -#ifdef CONFIG_KEYS - struct key *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ -#endif #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif @@@ -641,7 -647,6 +641,7 @@@ struct user_struct /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + struct user_namespace *user_ns; #ifdef CONFIG_USER_SCHED struct task_group *tg; @@@ -659,7 -664,6 +659,7 @@@ extern struct user_struct *find_user(ui extern struct user_struct root_user; #define INIT_USER (&root_user) + struct backing_dev_info; struct reclaim_state; @@@ -667,7 -671,8 +667,7 @@@ struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long long run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ @@@ -758,20 -763,51 +758,51 @@@ enum cpu_idle_type #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ #define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ - #define BALANCE_FOR_MC_POWER \ - (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) + enum powersavings_balance_level { + POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ + POWERSAVINGS_BALANCE_BASIC, /* Fill one thread/core/package + * first for long running threads + */ + POWERSAVINGS_BALANCE_WAKEUP, /* Also bias task wakeups to semi-idle + * cpu package for power savings + */ + MAX_POWERSAVINGS_BALANCE_LEVELS + }; - #define BALANCE_FOR_PKG_POWER \ - ((sched_mc_power_savings || sched_smt_power_savings) ? 
\ - SD_POWERSAVINGS_BALANCE : 0) + extern int sched_mc_power_savings, sched_smt_power_savings; - #define test_sd_parent(sd, flag) ((sd->parent && \ - (sd->parent->flags & flag)) ? 1 : 0) + static inline int sd_balance_for_mc_power(void) + { + if (sched_smt_power_savings) + return SD_POWERSAVINGS_BALANCE; + return 0; + } + + static inline int sd_balance_for_package_power(void) + { + if (sched_mc_power_savings | sched_smt_power_savings) + return SD_POWERSAVINGS_BALANCE; + + return 0; + } + + /* + * Optimise SD flags for power savings: + * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings. + * Keep default SD flags if sched_{smt,mc}_power_saving=0 + */ + + static inline int sd_power_saving_flags(void) + { + if (sched_mc_power_savings | sched_smt_power_savings) + return SD_BALANCE_NEWIDLE; + + return 0; + } struct sched_group { struct sched_group *next; /* Must be a circular list */ - cpumask_t cpumask; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a @@@ -784,8 -820,15 +815,15 @@@ * (see include/linux/reciprocal_div.h) */ u32 reciprocal_cpu_power; + + unsigned long cpumask[]; }; + static inline struct cpumask *sched_group_cpus(struct sched_group *sg) + { + return to_cpumask(sg->cpumask); + } + enum sched_domain_level { SD_LV_NONE = 0, SD_LV_SIBLING, @@@ -809,7 -852,6 +847,6 @@@ struct sched_domain struct sched_domain *parent; /* top domain must be null terminated */ struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ - cpumask_t span; /* span of all CPUs in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ @@@ -864,25 -906,73 +901,42 @@@ #ifdef CONFIG_SCHED_DEBUG char *name; #endif + + /* span of all CPUs in this domain */ + unsigned long span[]; }; - extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + static inline struct cpumask *sched_domain_span(struct sched_domain *sd) + { + return to_cpumask(sd->span); + } + + extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new); extern int arch_reinit_sched_domains(void); + /* Test a flag in parent sched domain */ + static inline int test_sd_parent(struct sched_domain *sd, int flag) + { + if (sd->parent && (sd->parent->flags & flag)) + return 1; + + return 0; + } + #else /* CONFIG_SMP */ struct sched_domain_attr; static inline void - partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { } #endif /* !CONFIG_SMP */ struct io_context; /* See blkdev.h */ -#define NGROUPS_SMALL 32 -#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) -struct group_info { - int ngroups; - atomic_t usage; - gid_t small_block[NGROUPS_SMALL]; - int nblocks; - gid_t *blocks[0]; -}; -/* - * get_group_info() must be called with the owning task locked (via task_lock()) - * when task != current. The reason being that the vast majority of callers are - * looking at current->group_info, which can not be changed except by the - * current task. Changing current->group_info requires the task lock, too. 
- */ -#define get_group_info(group_info) do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) - -#define put_group_info(group_info) do { \ - if (atomic_dec_and_test(&(group_info)->usage)) \ - groups_free(group_info); \ -} while (0) - -extern struct group_info *groups_alloc(int gidsetsize); -extern void groups_free(struct group_info *group_info); -extern int set_current_groups(struct group_info *group_info); -extern int groups_search(struct group_info *group_info, gid_t grp); -/* access the groups "array" with this macro */ -#define GROUP_AT(gi, i) \ - ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK extern void prefetch_stack(struct task_struct *t); @@@ -926,7 -1016,7 +980,7 @@@ struct sched_class void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, - const cpumask_t *newmask); + const struct cpumask *newmask); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); @@@ -1138,7 -1228,6 +1192,7 @@@ struct task_struct * The buffer to hold the BTS data. */ void *bts_buffer; + size_t bts_size; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ @@@ -1162,12 -1251,17 +1216,12 @@@ struct list_head cpu_timers[3]; /* process credentials */ - uid_t uid,euid,suid,fsuid; - gid_t gid,egid,sgid,fsgid; - struct group_info *group_info; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - struct user_struct *user; - unsigned securebits; -#ifdef CONFIG_KEYS - unsigned char jit_keyring; /* default keyring to attach requested keys to */ - struct key *request_key_auth; /* assumed request_key authority */ - struct key *thread_keyring; /* keyring private to this thread */ -#endif + const struct cred *real_cred; /* objective and real subjective task + * credentials (COW) */ + const struct cred *cred; /* effective (overridable) subjective task + * credentials (COW) */ + struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ + char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) @@@ -1204,6 -1298,9 +1258,6 @@@ int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; -#ifdef CONFIG_SECURITY - void *security; -#endif struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; @@@ -1579,12 -1676,12 +1633,12 @@@ extern cputime_t task_gtime(struct task #ifdef CONFIG_SMP extern int set_cpus_allowed_ptr(struct task_struct *p, - const cpumask_t *new_mask); + const struct cpumask *new_mask); #else static inline int set_cpus_allowed_ptr(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - if (!cpu_isset(0, *new_mask)) + if (!cpumask_test_cpu(0, new_mask)) return -EINVAL; return 0; } @@@ -1760,6 -1857,7 +1814,6 @@@ static inline struct user_struct *get_u return u; } extern void free_uid(struct user_struct *); -extern void switch_uid(struct user_struct *); extern void release_uids(struct user_namespace *ns); #include @@@ -1778,6 -1876,9 +1832,6 @@@ extern void wake_up_new_task(struct tas extern void sched_fork(struct task_struct *p, int clone_flags); extern void sched_dead(struct task_struct *p); -extern int in_group_p(gid_t); -extern int in_egroup_p(gid_t); - extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); @@@ -1909,8 -2010,6 +1963,8 @@@ static inline unsigned long wait_task_i #define for_each_process(p) \ 
for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +extern bool is_single_threaded(struct task_struct *); + /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. @@@ -2195,10 -2294,8 +2249,8 @@@ __trace_special(void *__tr, void *__dat } #endif - extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); - extern long sched_getaffinity(pid_t pid, cpumask_t *mask); - - extern int sched_mc_power_savings, sched_smt_power_savings; + extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); + extern long sched_getaffinity(pid_t pid, struct cpumask *mask); extern void normalize_rt_tasks(void); diff --combined init/Kconfig index 13627191a60,b3782c6d5ed..f6281711166 --- a/init/Kconfig +++ b/init/Kconfig @@@ -588,13 -588,6 +588,13 @@@ config KALLSYMS_AL Say N. +config KALLSYMS_STRIP_GENERATED + bool "Strip machine generated symbols from kallsyms" + depends on KALLSYMS_ALL + default y + help + Say N if you want kallsyms to retain even machine generated symbols. + config KALLSYMS_EXTRA_PASS bool "Do an extra kallsyms pass" depends on KALLSYMS @@@ -924,6 -917,15 +924,15 @@@ config KMO endif # MODULES + config INIT_ALL_POSSIBLE + bool + help + Back when each arch used to define their own cpu_online_map and + cpu_possible_map, some of them chose to initialize cpu_possible_map + with all 1s, and others with all 0s. When they were centralised, + it was better to provide this option than to break all the archs + and have several arch maintainers persuing me down dark alleys. + config STOP_MACHINE bool default y @@@ -936,90 -938,10 +945,90 @@@ source "block/Kconfig config PREEMPT_NOTIFIERS bool +choice + prompt "RCU Implementation" + default CLASSIC_RCU + config CLASSIC_RCU - def_bool !PREEMPT_RCU + bool "Classic RCU" help This option selects the classic RCU implementation that is designed for best read-side performance on non-realtime - systems. Classic RCU is the default. Note that the - PREEMPT_RCU symbol is used to select/deselect this option. + systems. + + Select this option if you are unsure. + +config TREE_RCU + bool "Tree-based hierarchical RCU" + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. + +config PREEMPT_RCU + bool "Preemptible RCU" + depends on PREEMPT + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + +endchoice + +config RCU_TRACE + bool "Enable tracing for RCU" + depends on TREE_RCU || PREEMPT_RCU + help + This option provides tracing in RCU which presents stats + in debugfs for debugging RCU implementation. + + Say Y here if you want to enable RCU tracing + Say N if you are unsure. + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on TREE_RCU + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the cube + root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit + systems and up to 262,144 for 64-bit systems. 
+ + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_EXACT + bool "Disable tree-based hierarchical RCU auto-balancing" + depends on TREE_RCU + default n + help + This option forces use of the exact RCU_FANOUT value specified, + regardless of imbalances in the hierarchy. This is useful for + testing RCU itself, and might one day be useful on systems with + strong NUMA behavior. + + Without RCU_FANOUT_EXACT, the code will balance the hierarchy. + + Say N if unsure. + +config TREE_RCU_TRACE + def_bool RCU_TRACE && TREE_RCU + select DEBUG_FS + help + This option provides tracing for the TREE_RCU implementation, + permitting Makefile to trivially select kernel/rcutree_trace.c. + +config PREEMPT_RCU_TRACE + def_bool RCU_TRACE && PREEMPT_RCU + select DEBUG_FS + help + This option provides tracing for the PREEMPT_RCU implementation, + permitting Makefile to trivially select kernel/rcupreempt_trace.c. diff --combined kernel/irq/chip.c index 6eb3c7952b6,b343deedae9..f63c706d25e --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@@ -46,7 -46,7 +46,7 @@@ void dynamic_irq_init(unsigned int irq desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpus_setall(desc->affinity); + cpumask_setall(&desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } @@@ -125,7 -125,6 +125,7 @@@ int set_irq_type(unsigned int irq, unsi return -ENODEV; } + type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) return 0; diff --combined kernel/irq/manage.c index 540f6c49f3f,10ad2f87ed9..61c4a9b6216 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@@ -79,7 -79,7 +79,7 @@@ int irq_can_set_affinity(unsigned int i * @cpumask: cpumask * */ - int irq_set_affinity(unsigned int irq, cpumask_t cpumask) + int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@@ -91,14 -91,14 +91,14 @@@ #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = cpumask; + cpumask_copy(&desc->pending_mask, cpumask); } #else - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@@ -112,26 -112,24 +112,24 @@@ */ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { - cpumask_t mask; - if (!irq_can_set_affinity(irq)) return 0; - cpus_and(mask, cpu_online_map, irq_default_affinity); - /* * Preserve an userspace affinity setup, but make sure that * one of the targets is online. 
*/ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpus_intersects(desc->affinity, cpu_online_map)) - mask = desc->affinity; + if (cpumask_any_and(&desc->affinity, cpu_online_mask) + < nr_cpu_ids) + goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - desc->affinity = mask; - desc->chip->set_affinity(irq, mask); + cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); + set_affinity: + desc->chip->set_affinity(irq, &desc->affinity); return 0; } @@@ -370,18 -368,16 +368,18 @@@ int __irq_set_trigger(struct irq_desc * return 0; } - ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); + /* caller masked out all except trigger mode flags */ + ret = chip->set_type(irq, flags); if (ret) pr_err("setting trigger mode %d for irq %u failed (%pF)\n", - (int)(flags & IRQF_TRIGGER_MASK), - irq, chip->set_type); + (int)flags, irq, chip->set_type); else { + if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) + flags |= IRQ_LEVEL; /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~IRQ_TYPE_SENSE_MASK; - desc->status |= flags & IRQ_TYPE_SENSE_MASK; + desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); + desc->status |= flags; } return ret; @@@ -461,8 -457,7 +459,8 @@@ __setup_irq(unsigned int irq, struct ir /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc, irq, new->flags); + ret = __irq_set_trigger(desc, irq, + new->flags & IRQF_TRIGGER_MASK); if (ret) { spin_unlock_irqrestore(&desc->lock, flags); @@@ -676,18 -671,6 +674,18 @@@ int request_irq(unsigned int irq, irq_h struct irq_desc *desc; int retval; + /* + * handle_IRQ_event() always ignores IRQF_DISABLED except for + * the _first_ irqaction (sigh). That can cause oopsing, but + * the behavior is classified as "will not fix" so we need to + * start nudging drivers away from using that idiom. + */ + if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) + == (IRQF_SHARED|IRQF_DISABLED)) + pr_warning("IRQ %d/%s: IRQF_DISABLED is not " + "guaranteed on shared IRQs\n", + irq, devname); + #ifdef CONFIG_LOCKDEP /* * Lockdep wants atomic interrupt handlers: diff --combined kernel/sched.c index fff1c4a20b6,756d981d91a..27ba1d642f0 --- a/kernel/sched.c +++ b/kernel/sched.c @@@ -209,6 -209,7 +209,6 @@@ void init_rt_bandwidth(struct rt_bandwi hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static inline int rt_bandwidth_enabled(void) @@@ -360,9 -361,7 +360,9 @@@ static inline struct task_group *task_g struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->user->tg; + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@@ -498,18 -497,26 +498,26 @@@ struct rt_rq */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; #endif + #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. 
Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; + #endif }; /* @@@ -603,8 -610,6 +611,8 @@@ struct rq #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; @@@ -1138,6 -1143,7 +1146,6 @@@ static void init_rq_hrtick(struct rq *r hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@@ -1514,7 -1520,7 +1522,7 @@@ static int tg_shares_up(struct task_gro struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu(i, sched_domain_span(sd)) { /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@@ -1535,7 -1541,7 +1543,7 @@@ if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@@ -1865,8 -1871,6 +1873,8 @@@ void set_task_cpu(struct task_struct *p clock_offset = old_rq->clock - new_rq->clock; + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) p->se.wait_start -= clock_offset; @@@ -2101,15 -2105,17 +2109,17 @@@ find_idlest_group(struct sched_domain * int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@@ -2141,17 -2147,14 +2151,14 @@@ * find_idlest_cpu - find the idlest cpu among the cpus in group. 
*/ static int - find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) + find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@@ -2193,7 -2196,6 +2200,6 @@@ static int sched_balance_self(int cpu, update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@@ -2202,14 -2204,13 +2208,13 @@@ continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@@ -2218,10 -2219,10 +2223,10 @@@ /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@@ -2266,7 -2267,7 +2271,7 @@@ static int try_to_wake_up(struct task_s cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@@ -2276,7 -2277,6 +2281,7 @@@ smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@@ -2315,7 -2315,7 +2320,7 @@@ else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@@ -2334,11 -2334,12 +2339,11 @@@ out_activate schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; out_running: - trace_sched_wakeup(rq, p); + trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; @@@ -2471,7 -2472,7 +2476,7 @@@ void wake_up_new_task(struct task_struc p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_sched_wakeup_new(rq, p); + trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@@ -2846,10 -2847,11 +2851,10 @@@ static void sched_migrate_task(struct t struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; - trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@@ -2911,7 -2913,7 +2916,7 @@@ int can_migrate_task(struct task_struc * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
*/ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@@ -3086,7 -3088,7 +3091,7 @@@ static int move_one_task(struct rq *thi static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@@ -3122,10 -3124,11 +3127,11 @@@ unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@@ -3134,13 -3137,8 +3140,8 @@@ max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@@ -3251,8 -3249,8 +3252,8 @@@ */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@@ -3267,8 -3265,8 +3268,8 @@@ if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@@ -3394,6 -3392,10 +3395,10 @@@ out_balanced if (this == group_leader && group_leader != group_min) { *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + cpumask_first(sched_group_cpus(group_leader)); + } return group_min; } #endif @@@ -3407,16 -3409,16 +3412,16 @@@ ret */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@@ -3446,7 -3448,7 +3451,7 @@@ */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@@ -3454,7 -3456,7 +3459,7 @@@ struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@@ -3514,8 -3516,8 +3519,8 @@@ redo /* All tasks on this runqueue were pinned 
by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@@ -3532,7 -3534,8 +3537,8 @@@ /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@@ -3607,7 -3610,7 +3613,7 @@@ out */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@@ -3616,7 -3619,7 +3622,7 @@@ int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@@ -3660,17 -3663,71 +3666,71 @@@ redo double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } if (!ld_moved) { + int active_balance = 0; + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. 
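 *
 * Editorial aside (not part of this patch): idle_balance() below makes the
 * same move as several other hunks here -- its scratch mask becomes a
 * cpumask_var_t.  With CONFIG_CPUMASK_OFFSTACK that type is a pointer and
 * must be paired with alloc/free; without it, it is a one-element array on
 * the stack and alloc_cpumask_var() is a cheap no-op returning true.
 * Minimal sketch of the pattern, with do_work() as a stand-in:
 *
 *	cpumask_var_t mask;
 *
 *	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
 *		return;				// allocation failed: just skip
 *	cpumask_copy(mask, cpu_online_mask);
 *	do_work(mask);
 *	free_cpumask_var(mask);
 *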
+ */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + if (active_balance) + wake_up_process(busiest->migration_thread); + } else sd->nr_balance_failed = 0; @@@ -3696,7 -3753,10 +3756,10 @@@ static void idle_balance(int this_cpu, struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@@ -3707,7 -3767,7 +3770,7 @@@ if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@@ -3722,6 -3782,7 +3785,7 @@@ */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* @@@ -3759,7 -3820,7 +3823,7 @@@ static void active_load_balance(struct /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@@ -3778,10 -3839,9 +3842,9 @@@ #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@@ -3809,7 -3869,7 +3872,7 @@@ int select_nohz_load_balancer(int stop_ int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@@ -3823,7 -3883,7 +3886,7 @@@ } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@@ -3836,10 -3896,10 +3899,10 @@@ } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@@ -3867,7 -3927,11 +3930,11 @@@ static void rebalance_domains(int cpu, unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. 
*/ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@@ -3892,7 -3956,7 +3959,7 @@@ } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@@ -3926,6 -3990,8 +3993,8 @@@ out */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* @@@ -3950,12 -4016,13 +4019,13 @@@ static void run_rebalance_domains(struc */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@@ -3993,7 -4060,7 +4063,7 @@@ static inline void trigger_load_balance rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@@ -4006,7 -4073,7 +4076,7 @@@ * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@@ -4018,7 -4085,7 +4088,7 @@@ * cpus with ticks stopped, is it time for that to stop? */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@@ -4028,7 -4095,7 +4098,7 @@@ * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@@ -5120,22 -5187,6 +5190,22 @@@ __setscheduler(struct rq *rq, struct ta set_load_weight(p); } +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; +} + static int __sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param, bool user) { @@@ -5195,7 -5246,8 +5265,7 @@@ recheck return -EPERM; /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) + if (!check_same_owner(p)) return -EPERM; } @@@ -5401,10 -5453,9 +5471,9 @@@ out_unlock return retval; } - long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) + long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@@ -5426,45 -5477,58 +5495,57 @@@ get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; - if ((current->euid 
!= p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); + out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + out_put_task: put_task_struct(p); put_online_cpus(); return retval; } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } @@@ -5477,17 -5541,20 +5558,20 @@@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } - long sched_getaffinity(pid_t pid, cpumask_t *mask) + long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@@ -5504,7 -5571,7 +5588,7 @@@ if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@@ -5523,19 -5590,24 +5607,24 @@@ asmlinkage long sys_sched_getaffinity(p unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** @@@ -5877,7 -5949,7 +5966,7 @@@ void __cpuinit init_idle(struct task_st idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@@ -5904,9 -5976,9 +5993,9 @@@ * indicates which cpus entered this state. 
This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ - cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@@ -5961,7 -6033,7 +6050,7 @@@ static inline void sched_init_granulari * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ - int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@@ -5969,13 -6041,13 +6058,13 @@@ int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@@ -5983,15 -6055,15 +6072,15 @@@ if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@@ -6033,7 -6105,7 +6122,7 @@@ static int __migrate_task(struct task_s if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@@ -6130,50 -6202,43 +6219,43 @@@ static int __migrate_task_irq(struct ta */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - cpumask_t mask; - struct rq *rq; int dest_cpu; + /* FIXME: Use cpumask_of_node here. */ + cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); + const struct cpumask *nodemask = &_nodemask; + + again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - do { - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. 
- * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + + move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* @@@ -6185,7 -6250,7 +6267,7 @@@ */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); unsigned long flags; local_irq_save(flags); @@@ -6475,7 -6540,7 +6557,7 @@@ static void set_rq_online(struct rq *rq if (!rq->online) { const struct sched_class *class; - cpu_set(rq->cpu, rq->rd->online); + cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; for_each_class(class) { @@@ -6495,7 -6560,7 +6577,7 @@@ static void set_rq_offline(struct rq *r class->rq_offline(rq); } - cpu_clear(rq->cpu, rq->rd->online); + cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; } } @@@ -6536,7 -6601,7 +6618,7 @@@ migration_call(struct notifier_block *n rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } @@@ -6550,7 -6615,7 +6632,7 @@@ break; /* Unbind it from offline cpu so it can run. Fall thru. 
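 *
 * Editorial aside (not part of this patch): move_task_off_dead_cpu() above
 * now picks a destination with cpumask_any_and() instead of building an
 * intersection and calling any_online_cpu().  The "nothing found" convention
 * is a return value >= nr_cpu_ids.  Condensed sketch of that selection, with
 * the cpuset fallback only hinted at:
 *
 *	dest = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
 *	if (dest >= nr_cpu_ids)
 *		dest = last_resort_cpu(p);	// hypothetical helper: widen
 *						// cpus_allowed via the cpuset,
 *						// then retry the lookup
 *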
*/ kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; @@@ -6600,7 -6665,7 +6682,7 @@@ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); @@@ -6639,13 -6704,13 +6721,13 @@@ early_initcall(migration_init) #ifdef CONFIG_SCHED_DEBUG static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) + struct cpumask *groupmask) { struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); - cpus_clear(*groupmask); + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@@ -6659,11 -6724,11 +6741,11 @@@ printk(KERN_CONT "span %s level %s\n", str, sd->name); - if (!cpu_isset(cpu, sd->span)) { + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpu_isset(cpu, group->cpumask)) { + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@@ -6683,31 -6748,32 +6765,32 @@@ break; } - if (!cpus_weight(group->cpumask)) { + if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } - if (cpus_intersects(*groupmask, group->cpumask)) { + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(*groupmask, *groupmask, group->cpumask); + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, *groupmask)) + if (!cpumask_equal(sched_domain_span(sd), groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@@ -6715,7 -6781,7 +6798,7 @@@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_t *groupmask; + cpumask_var_t groupmask; int level = 0; if (!sd) { @@@ -6725,8 -6791,7 +6808,7 @@@ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); return; } @@@ -6739,7 -6804,7 +6821,7 @@@ if (!sd) break; } - kfree(groupmask); + free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@@ -6747,7 -6812,7 +6829,7 @@@ static int sd_degenerate(struct sched_domain *sd) { - if (cpus_weight(sd->span) == 1) + if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ @@@ -6778,7 -6843,7 +6860,7 @@@ sd_parent_degenerate(struct sched_domai if (sd_degenerate(parent)) return 1; - if (!cpus_equal(sd->span, parent->span)) + if 
(!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Does parent contain flags not in child? */ @@@ -6802,6 -6867,16 +6884,16 @@@ return 1; } + static void free_rootdomain(struct root_domain *rd) + { + cpupri_cleanup(&rd->cpupri); + + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); + } + static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; @@@ -6811,38 -6886,63 +6903,63 @@@ if (rq->rd) { struct root_domain *old_rd = rq->rd; - if (cpu_isset(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); - cpu_clear(rq->cpu, old_rd->span); + cpumask_clear_cpu(rq->cpu, old_rd->span); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + free_rootdomain(old_rd); } atomic_inc(&rd->refcount); rq->rd = rd; - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } - static void init_rootdomain(struct root_domain *rd) + static int init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); - cpus_clear(rd->span); - cpus_clear(rd->online); + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri, true); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto free_rd; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; + return 0; - cpupri_init(&rd->cpupri); + free_rto_mask: + free_cpumask_var(rd->rto_mask); + free_online: + free_cpumask_var(rd->online); + free_span: + free_cpumask_var(rd->span); + free_rd: + kfree(rd); + return -ENOMEM; } static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain); + init_rootdomain(&def_root_domain, true); + atomic_set(&def_root_domain.refcount, 1); } @@@ -6854,7 -6954,10 +6971,10 @@@ static struct root_domain *alloc_rootdo if (!rd) return NULL; - init_rootdomain(rd); + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } return rd; } @@@ -6896,19 -6999,12 +7016,12 @@@ cpu_attach_domain(struct sched_domain * } /* cpus with isolated domains */ - static cpumask_t cpu_isolated_map = CPU_MASK_NONE; + static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); + cpulist_parse(str, cpu_isolated_map); return 1; } @@@ -6917,42 -7013,43 +7030,43 @@@ __setup("isolcpus=", isolated_cpu_setup /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). 
* * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void - init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, + init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) { struct sched_group *first = NULL, *last = NULL; int i; - cpus_clear(*covered); + cpumask_clear(covered); - for_each_cpu_mask_nr(i, *span) { + for_each_cpu(i, span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, *covered)) + if (cpumask_test_cpu(i, covered)) continue; - cpus_clear(sg->cpumask); + cpumask_clear(sched_group_cpus(sg)); sg->__cpu_power = 0; - for_each_cpu_mask_nr(j, *span) { + for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, *covered); - cpu_set(j, sg->cpumask); + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; @@@ -7016,9 -7113,10 +7130,10 @@@ static int find_next_best_node(int node * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ - static void sched_domain_node_span(int node, cpumask_t *span) + static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; + /* FIXME: use cpumask_of_node() */ node_to_cpumask_ptr(nodemask, node); int i; @@@ -7039,19 -7137,34 +7154,34 @@@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; + /* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. 
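 *
 * Editorial aside (not part of this patch): the wrappers defined just below
 * rely on the "mask hangs off the end" convention -- the scheduler structure
 * ends in a flexible bitmap member, the accessor returns a pointer to that
 * tail via to_cpumask(), and a static user reserves CONFIG_NR_CPUS bits right
 * behind the structure.  Rough sketch with hypothetical names (sg_like,
 * sg_cpus, sg_static):
 *
 *	struct sg_like {
 *		// ... other members ...
 *		unsigned long cpus[0];		// storage supplied by the user
 *	};
 *
 *	static inline struct cpumask *sg_cpus(struct sg_like *sg)
 *	{
 *		return to_cpumask(sg->cpus);
 *	}
 *
 *	struct sg_static {
 *		struct sg_like sg;
 *		DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
 *	};
 *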
+ */ + struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); + }; + + struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); + }; + /* * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT - static DEFINE_PER_CPU(struct sched_domain, cpu_domains); - static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); + static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); + static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int - cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) + cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); + *sg = &per_cpu(sched_group_cpus, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@@ -7060,56 -7173,55 +7190,55 @@@ * multi-core sched-domains: */ #ifdef CONFIG_SCHED_MC - static DEFINE_PER_CPU(struct sched_domain, core_domains); - static DEFINE_PER_CPU(struct sched_group, sched_group_core); + static DEFINE_PER_CPU(struct static_sched_domain, core_domains); + static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int - cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) + cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); if (sg) - *sg = &per_cpu(sched_group_core, group); + *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) static int - cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) + cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_core, cpu); + *sg = &per_cpu(sched_group_core, cpu).sg; return cpu; } #endif - static DEFINE_PER_CPU(struct sched_domain, phys_domains); - static DEFINE_PER_CPU(struct sched_group, sched_group_phys); + static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); + static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int - cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) + cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; #ifdef CONFIG_SCHED_MC + /* FIXME: Use cpu_coregroup_mask. 
*/ *mask = cpu_coregroup_map(cpu); cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); #else group = cpu; #endif if (sg) - *sg = &per_cpu(sched_group_phys, group); + *sg = &per_cpu(sched_group_phys, group).sg; return group; } @@@ -7123,19 -7235,21 +7252,21 @@@ static DEFINE_PER_CPU(struct sched_doma static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); - static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); + static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); - static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) + static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { int group; + /* FIXME: use cpumask_of_node */ + node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - *nodemask = node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); - group = first_cpu(*nodemask); + cpumask_and(nodemask, pnodemask, cpu_map); + group = cpumask_first(nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group); + *sg = &per_cpu(sched_group_allnodes, group).sg; return group; } @@@ -7147,11 -7261,11 +7278,11 @@@ static void init_numa_sched_groups_powe if (!sg) return; do { - for_each_cpu_mask_nr(j, sg->cpumask) { + for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { + sd = &per_cpu(phys_domains, j).sd; + if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each * physical package. @@@ -7168,11 -7282,12 +7299,12 @@@ #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ - static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) + static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { int cpu, i; - for_each_cpu_mask_nr(cpu, *cpu_map) { + for_each_cpu(cpu, cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@@ -7181,10 -7296,11 +7313,11 @@@ for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + /* FIXME: Use cpumask_of_node */ + node_to_cpumask_ptr(pnodemask, i); - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + cpus_and(*nodemask, *pnodemask, *cpu_map); + if (cpumask_empty(nodemask)) continue; if (sg == NULL) @@@ -7202,7 -7318,8 +7335,8 @@@ next_sg } } #else /* !CONFIG_NUMA */ - static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) + static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { } #endif /* CONFIG_NUMA */ @@@ -7228,7 -7345,7 +7362,7 @@@ static void init_sched_groups_power(in WARN_ON(!sd || !sd->groups); - if (cpu != first_cpu(sd->groups->cpumask)) + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) return; child = sd->child; @@@ -7293,48 -7410,6 +7427,6 @@@ SD_INIT_FUNC(CPU SD_INIT_FUNC(MC) #endif - /* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. 
- */ - struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - - #ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; - #endif - }; - - #if NR_CPUS > 128 - #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v - static inline void sched_cpumask_alloc(struct allmasks **masks) - { - *masks = kmalloc(sizeof(**masks), GFP_KERNEL); - } - static inline void sched_cpumask_free(struct allmasks *masks) - { - kfree(masks); - } - #else - #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v - static inline void sched_cpumask_alloc(struct allmasks **masks) - { } - static inline void sched_cpumask_free(struct allmasks *masks) - { } - #endif - - #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) @@@ -7374,17 -7449,38 +7466,38 @@@ static void set_domain_attribute(struc * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ - static int __build_sched_domains(const cpumask_t *cpu_map, + static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - int i; + int i, err = -ENOMEM; struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; #ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; + #endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + + #ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ @@@ -7392,54 -7488,37 +7505,37 @@@ GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; + goto free_tmpmask; } #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); - #ifdef CONFIG_NUMA - kfree(sched_group_nodes); - #endif - return -ENOMEM; + goto free_sched_groups; } - /* get space for all scratch cpumask variables */ - sched_cpumask_alloc(&allmasks); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); #ifdef CONFIG_NUMA - kfree(sched_group_nodes); - #endif - return -ENOMEM; - } - - tmpmask = (cpumask_t *)allmasks; - - - #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; #endif /* * Set up domains for cpus specified by the cpu_map. 
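 *
 * Editorial aside (not part of this patch): with struct allmasks gone,
 * __build_sched_domains() above allocates each scratch mask separately and
 * unwinds through a ladder of labels, freeing in reverse order of
 * allocation.  The shape, reduced to two masks:
 *
 *	if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
 *		goto out;
 *	if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
 *		goto free_nodemask;
 *
 *	// ... build the domains, set err = 0 on success ...
 *
 *	free_cpumask_var(send_covered);
 * free_nodemask:
 *	free_cpumask_var(nodemask);
 * out:
 *	return err;
 *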
*/ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); + /* FIXME: use cpumask_of_node */ *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sd->span = *cpu_map; + cpumask_copy(sched_domain_span(sd), cpu_map); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@@ -7449,18 -7528,19 +7545,19 @@@ sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = p; if (p) p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); #endif p = sd; - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - sd->span = *nodemask; + cpumask_copy(sched_domain_span(sd), nodemask); sd->parent = p; if (p) p->child = sd; @@@ -7468,11 -7548,12 +7565,12 @@@ #ifdef CONFIG_SCHED_MC p = sd; - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); - cpus_and(sd->span, sd->span, *cpu_map); + *sched_domain_span(sd) = cpu_coregroup_map(i); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@@ -7480,11 -7561,11 +7578,11 @@@ #ifdef CONFIG_SCHED_SMT p = sd; - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); - sd->span = per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@@ -7493,13 -7574,10 +7591,10 @@@ #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) + for_each_cpu(i, cpu_map) { + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, @@@ -7510,13 -7588,11 +7605,11 @@@ #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - + for_each_cpu(i, cpu_map) { + /* FIXME: Use cpu_coregroup_mask */ *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) + if (i != cpumask_first(this_core_map)) continue; init_sched_build_groups(this_core_map, cpu_map, @@@ -7527,12 -7603,10 +7620,10 @@@ /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - + /* FIXME: Use cpumask_of_node 
*/ *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; init_sched_build_groups(nodemask, cpu_map, @@@ -7543,8 -7617,6 +7634,6 @@@ #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, send_covered, tmpmask); @@@ -7553,58 -7625,58 +7642,58 @@@ for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); int j; + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); - cpus_clear(*covered); + cpumask_clear(covered); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { + if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); + cpumask_and(domainspan, domainspan, cpu_map); - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { + for_each_cpu(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = *nodemask; + cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; - cpus_or(*covered, *covered, *nodemask); + cpumask_or(covered, covered, nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % nr_node_ids; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, n); - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) break; - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) + cpumask_and(tmpmask, tmpmask, pnodemask); + if (cpumask_empty(tmpmask)) continue; - sg = kmalloc_node(sizeof(struct sched_group), + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING @@@ -7612,9 -7684,9 +7701,9 @@@ goto error; } sg->__cpu_power = 0; - sg->cpumask = *tmpmask; + cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); + cpumask_or(covered, covered, tmpmask); prev->next = sg; prev = sg; } @@@ -7623,22 -7695,22 +7712,22 @@@ /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = 
&per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@@ -7650,53 -7722,78 +7739,78 @@@ if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, tmpmask); init_numa_sched_groups_power(sg); } #endif /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; #else - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; #endif cpu_attach_domain(sd, rd, i); } - sched_cpumask_free(allmasks); - return 0; + err = 0; + + free_tmpmask: + free_cpumask_var(tmpmask); + free_send_covered: + free_cpumask_var(send_covered); + free_this_core_map: + free_cpumask_var(this_core_map); + free_this_sibling_map: + free_cpumask_var(this_sibling_map); + free_nodemask: + free_cpumask_var(nodemask); + free_notcovered: + #ifdef CONFIG_NUMA + free_cpumask_var(notcovered); + free_covered: + free_cpumask_var(covered); + free_domainspan: + free_cpumask_var(domainspan); + out: + #endif + return err; + + free_sched_groups: + #ifdef CONFIG_NUMA + kfree(sched_group_nodes); + #endif + goto free_tmpmask; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - sched_cpumask_free(allmasks); - kfree(rd); - return -ENOMEM; + free_rootdomain(rd); + goto free_tmpmask; #endif } - static int build_sched_domains(const cpumask_t *cpu_map) + static int build_sched_domains(const struct cpumask *cpu_map) { return __build_sched_domains(cpu_map, NULL); } - static cpumask_t *doms_cur; /* current sched domains */ + static struct cpumask *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. */ - static cpumask_t fallback_doms; + static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@@ -7713,16 -7810,16 +7827,16 @@@ int __attribute__((weak)) arch_update_c * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. 
*/ - static int arch_init_sched_domains(const cpumask_t *cpu_map) + static int arch_init_sched_domains(const struct cpumask *cpu_map) { int err; arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms_cur) - doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + doms_cur = fallback_doms; + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@@ -7730,8 -7827,8 +7844,8 @@@ return err; } - static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) + static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); } @@@ -7740,15 -7837,16 +7854,16 @@@ * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ - static void detach_destroy_domains(const cpumask_t *cpu_map) + static void detach_destroy_domains(const struct cpumask *cpu_map) { - cpumask_t tmpmask; + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; - for_each_cpu_mask_nr(i, *cpu_map) + for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@@ -7773,7 -7871,7 +7888,7 @@@ static int dattrs_equal(struct sched_do * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the @@@ -7787,13 -7885,14 +7902,14 @@@ * the single partition 'fallback_doms', it also forces the domains * to be rebuilt. * - * If doms_new == NULL it will be replaced with cpu_online_map. + * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. 
* * Call with hotplug lock held */ - void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + /* FIXME: Change to struct cpumask *doms_new[] */ + void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; @@@ -7812,7 -7911,7 +7928,7 @@@ /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) + if (cpumask_equal(&doms_cur[i], &doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } @@@ -7824,15 -7923,15 +7940,15 @@@ match1 if (doms_new == NULL) { ndoms_cur = 0; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + doms_new = fallback_doms; + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) + if (cpumask_equal(&doms_new[i], &doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } @@@ -7844,7 -7943,7 +7960,7 @@@ match2 } /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) + if (doms_cur != fallback_doms) kfree(doms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; @@@ -7873,14 -7972,25 +7989,25 @@@ int arch_reinit_sched_domains(void static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { int ret; + unsigned int level = 0; - if (buf[0] != '0' && buf[0] != '1') + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? 
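 *
 * Editorial aside (not part of this patch): the store hook above now accepts
 * a numeric level rather than only '0'/'1', which is what makes level 2
 * (POWERSAVINGS_BALANCE_WAKEUP) reachable from userspace -- typically via
 * something like "echo 2 > /sys/devices/system/cpu/sched_mc_power_savings".
 * The accepted range, as a sketch:
 *
 *	if (sscanf(buf, "%u", &level) != 1 ||
 *	    level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
 *		return -EINVAL;		// rejects anything above BALANCE_WAKEUP
 *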
+ */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) return -EINVAL; if (smt) - sched_smt_power_savings = (buf[0] == '1'); + sched_smt_power_savings = level; else - sched_mc_power_savings = (buf[0] == '1'); + sched_mc_power_savings = level; ret = arch_reinit_sched_domains(); @@@ -7984,7 -8094,9 +8111,9 @@@ static int update_runtime(struct notifi void __init sched_init_smp(void) { - cpumask_t non_isolated_cpus; + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@@ -7993,10 -8105,10 +8122,10 @@@ #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@@ -8011,9 -8123,13 +8140,13 @@@ init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); } #else void __init sched_init_smp(void) @@@ -8328,6 -8444,15 +8461,15 @@@ void __init sched_init(void */ current->sched_class = &fair_sched_class; + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); + #ifdef CONFIG_SMP + #ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); + #endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); + #endif /* SMP */ + scheduler_running = 1; } @@@ -9298,41 -9423,6 +9440,41 @@@ cpuacct_destroy(struct cgroup_subsys *s kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@@ -9340,8 -9430,17 +9482,8 @@@ u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. 
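 *
 * Editorial aside (not part of this patch): the new cpuacct_cpuusage_read()
 * and _write() helpers above keep the old trick -- a 64-bit per-cpu counter
 * cannot be loaded or stored atomically on 32-bit, so the runqueue lock is
 * taken around the access there, while 64-bit builds access it directly.
 * Reduced sketch of the read side:
 *
 *	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 *	u64 data;
 *
 *	#ifndef CONFIG_64BIT
 *		spin_lock_irq(&cpu_rq(cpu)->lock);	// make the 64-bit read atomic
 *		data = *cpuusage;
 *		spin_unlock_irq(&cpu_rq(cpu)->lock);
 *	#else
 *		data = *cpuusage;
 *	#endif
 *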
- */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@@ -9358,39 -9457,23 +9500,39 @@@ static int cpuusage_write(struct cgrou goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) diff --combined kernel/sched_fair.c index 5ad4440f0fc,36b5e34fa99..56c0efe902a --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@@ -492,8 -492,6 +492,8 @@@ static void update_curr(struct cfs_rq * * overflow on 32 bits): */ delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; @@@ -1019,16 -1017,33 +1019,33 @@@ static void yield_task_fair(struct rq * * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) + * hence we need to mask them out (cpu_active_mask) * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, struct task_struct *p) { - cpumask_t tmp; struct sched_domain *sd; int i; + unsigned int chosen_wakeup_cpu; + int this_cpu; + + /* + * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu + * are idle and this is not a kernel thread and this task's affinity + * allows it to be moved to preferred cpu, then just move! + */ + + this_cpu = smp_processor_id(); + chosen_wakeup_cpu = + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && + idle_cpu(cpu) && idle_cpu(this_cpu) && + p->mm && !(p->flags & PF_KTHREAD) && + cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) + return chosen_wakeup_cpu; /* * If it is idle, then it is the best cpu to run this task. 
@@@ -1046,10 -1061,9 +1063,9 @@@ if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { + for_each_cpu_and(i, sched_domain_span(sd), + &p->cpus_allowed) { + if (cpu_active(i) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@@ -1242,13 -1256,13 +1258,13 @@@ static int select_task_rq_fair(struct t * this_cpu and prev_cpu are present in: */ for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { + if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { this_sd = sd; break; } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) goto out; /* @@@ -1347,11 -1361,12 +1363,11 @@@ static void check_preempt_wakeup(struc { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); - if (unlikely(rt_prio(p->prio))) { - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_curr(cfs_rq); - update_rq_clock(rq); - update_curr(cfs_rq); + if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; } diff --combined kernel/sched_rt.c index 51d2af3e619,1bbd9901401..833b6d44483 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@@ -15,7 -15,7 +15,7 @@@ static inline void rt_set_overload(stru if (!rq->online) return; - cpu_set(rq->cpu, rq->rd->rto_mask); + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@@ -34,7 -34,7 +34,7 @@@ static inline void rt_clear_overload(st /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) @@@ -77,7 -77,7 +77,7 @@@ static inline u64 sched_rt_period(struc } #define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { @@@ -139,14 -139,14 +139,14 @@@ static int rt_se_boosted(struct sched_r } #ifdef CONFIG_SMP - static inline cpumask_t sched_rt_period_mask(void) + static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_rq(smp_processor_id())->rd->span; } #else - static inline cpumask_t sched_rt_period_mask(void) + static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } #endif @@@ -212,9 -212,9 +212,9 @@@ static inline int rt_rq_throttled(struc return rt_rq->rt_throttled; } - static inline cpumask_t sched_rt_period_mask(void) + static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } static inline @@@ -241,11 -241,11 +241,11 @@@ static int do_balance_runtime(struct rt int i, weight, more = 0; u64 rt_period; - weight = cpus_weight(rd->span); + weight = cpumask_weight(rd->span); spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@@ -324,7 -324,7 +324,7 @@@ static void __disable_runtime(struct r /* * Greedy reclaim, take back as much as we can. 
*/ - for_each_cpu_mask(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@@ -429,13 -429,13 +429,13 @@@ static inline int balance_runtime(struc static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; - cpumask_t span; + const struct cpumask *span; if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@@ -805,17 -805,20 +805,20 @@@ static int select_task_rq_rt(struct tas static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_t mask; + cpumask_var_t mask; if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, mask)) + goto free; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) + goto free; /* * There appears to be other cpus that can accept @@@ -824,6 -827,8 +827,8 @@@ */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); + free: + free_cpumask_var(mask); } #endif /* CONFIG_SMP */ @@@ -914,7 -919,7 +919,7 @@@ static void deactivate_task(struct rq * static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@@ -953,7 -958,7 +958,7 @@@ static struct task_struct *pick_next_hi return next; } - static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); + static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { @@@ -973,7 -978,7 +978,7 @@@ static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@@ -988,7 -993,7 +993,7 @@@ * I guess we might want to change cpupri_find() to ignore those * in the first place. */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * At this point we have built a mask of cpus representing the @@@ -998,7 -1003,7 +1003,7 @@@ * We prioritize the last cpu that the task executed on since * it is most likely cache-hot in that location. */ - if (cpu_isset(cpu, *lowest_mask)) + if (cpumask_test_cpu(cpu, lowest_mask)) return cpu; /* @@@ -1013,7 -1018,8 +1018,8 @@@ cpumask_t domain_mask; int best_cpu; - cpus_and(domain_mask, sd->span, *lowest_mask); + cpumask_and(&domain_mask, sched_domain_span(sd), + lowest_mask); best_cpu = pick_optimal_cpu(this_cpu, &domain_mask); @@@ -1054,8 -1060,8 +1060,8 @@@ static struct rq *find_lock_lowest_rq(s * Also make sure that it wasn't scheduled on its rq. 
*/ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { @@@ -1176,7 -1182,7 +1182,7 @@@ static int pull_rt_task(struct rq *this next = pick_next_task_rt(this_rq); - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@@ -1305,9 -1311,9 +1311,9 @@@ move_one_task_rt(struct rq *this_rq, in } static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - int weight = cpus_weight(*new_mask); + int weight = cpumask_weight(new_mask); BUG_ON(!rt_task(p)); @@@ -1328,7 -1334,7 +1334,7 @@@ update_rt_migration(rq); } - p->cpus_allowed = *new_mask; + cpumask_copy(&p->cpus_allowed, new_mask); p->rt.nr_cpus_allowed = weight; } @@@ -1371,6 -1377,14 +1377,14 @@@ static void switched_from_rt(struct rq if (!rq->rt.rt_nr_running) pull_rt_task(rq); } + + static inline void init_sched_rt_class(void) + { + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); + } #endif /* CONFIG_SMP */ /* @@@ -1541,3 -1555,4 +1555,4 @@@ static void print_rt_stats(struct seq_f rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ + diff --combined kernel/sched_stats.h index 3b01098164c,5fcf0e18458..f2773b5d122 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@@ -31,7 -31,7 +31,7 @@@ static int show_schedstat(struct seq_fi rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, - rq->rq_sched_info.cpu_time, + rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@@ -42,7 -42,8 +42,8 @@@ for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { @@@ -123,7 -124,7 +124,7 @@@ static inline voi rq_sched_info_depart(struct rq *rq, unsigned long long delta) { if (rq) - rq->rq_sched_info.cpu_time += delta; + rq->rq_cpu_time += delta; } static inline void @@@ -236,6 -237,7 +237,6 @@@ static inline void sched_info_depart(st unsigned long long delta = task_rq(t)->clock - t->sched_info.last_arrival; - t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); if (t->state == TASK_RUNNING) diff --combined kernel/time/tick-sched.c index 8f3fc2582d3,70f872c71f4..76a574bbef9 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@@ -144,7 -144,7 +144,7 @@@ void tick_nohz_update_jiffies(void if (!ts->tick_stopped) return; - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@@ -247,7 -247,7 +247,7 @@@ void tick_nohz_stop_sched_tick(int inid if (need_resched()) goto end; - if (unlikely(local_softirq_pending())) { + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; if (ratelimit < 10) { @@@ -282,31 -282,8 +282,31 @@@ /* Schedule the tick, if we are at least one jiffie off */ if ((long)delta_jiffies >= 1) { + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be 
taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. + */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + goto out; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@@ -319,7 -296,7 +319,7 @@@ /* * sched tick not stopped! */ - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); goto out; } @@@ -329,6 -306,17 +329,6 @@@ rcu_enter_nohz(); } - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. - */ - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->idle_sleeps++; /* @@@ -344,7 -332,12 +344,7 @@@ goto out; } - /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); + /* Mark expiries */ ts->idle_expires = expires; if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { @@@ -361,7 -354,7 +361,7 @@@ * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@@ -439,7 -432,7 +439,7 @@@ void tick_nohz_restart_sched_tick(void select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); /* * We stopped the tick in idle. 
Update process times would miss the @@@ -688,6 -681,7 +688,6 @@@ void tick_setup_sched_timer(void */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --combined kernel/trace/trace.c index 4185d522163,6adf660fc81..0e91f43b6ba --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@@ -30,6 -30,7 +30,6 @@@ #include #include #include -#include #include #include @@@ -286,7 -287,6 +286,7 @@@ static const char *trace_options[] = "annotate", "userstacktrace", "sym-userobj", + "printk-msg-only", NULL }; @@@ -320,7 -320,7 +320,7 @@@ __update_max_tr(struct trace_array *tr memcpy(data->comm, tsk->comm, TASK_COMM_LEN); data->pid = tsk->pid; - data->uid = tsk->uid; + data->uid = task_uid(tsk); data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; data->policy = tsk->policy; data->rt_priority = tsk->rt_priority; @@@ -678,16 -678,6 +678,16 @@@ void tracing_reset(struct trace_array * ftrace_enable_cpu(); } +void tracing_reset_online_cpus(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + #define SAVED_CMDLINES 128 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; @@@ -1309,7 -1299,7 +1309,7 @@@ enum trace_file_type TRACE_FILE_ANNOTATE = 2, }; -static void trace_iterator_increment(struct trace_iterator *iter, int cpu) +static void trace_iterator_increment(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); @@@ -1388,7 -1378,7 +1388,7 @@@ static void *find_next_entry_inc(struc iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); if (iter->ent) - trace_iterator_increment(iter, iter->cpu); + trace_iterator_increment(iter); return iter->ent ? iter : NULL; } @@@ -1757,13 -1747,6 +1757,13 @@@ lat_print_timestamp(struct trace_seq *s static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /* * The message is supposed to contain an ending newline. * If the printing stops prematurely, try to add a newline of our own. @@@ -1832,6 -1815,7 +1832,6 @@@ print_lat_fmt(struct trace_iterator *it char *comm; int S, T; int i; - unsigned state; if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; @@@ -1877,8 -1861,12 +1877,8 @@@ trace_assign_type(field, entry); - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - - state = field->prev_state ? - __ffs(field->prev_state) + 1 : 0; - S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, @@@ -2019,8 -2007,10 +2019,8 @@@ static enum print_line_t print_trace_fm trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? 
- state_to_char[field->next_state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", field->prev_pid, field->prev_prio, @@@ -2150,9 -2140,12 +2150,9 @@@ static enum print_line_t print_raw_fmt( trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@@ -2239,9 -2232,12 +2239,9 @@@ static enum print_line_t print_hex_fmt( trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); @@@ -2269,25 -2265,6 +2269,25 @@@ return TRACE_TYPE_HANDLED; } +static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@@ -2368,11 -2345,6 +2368,11 @@@ static enum print_line_t print_trace_li return ret; } + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_printk_msg_only(iter); + if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@@ -2453,7 -2425,7 +2453,7 @@@ __tracing_open(struct inode *inode, str /* Notify the tracer early; before we stop tracing. 
*/ if (iter->trace && iter->trace->open) - iter->trace->open(iter); + iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->tr->buffer)) @@@ -2674,7 -2646,7 +2674,7 @@@ tracing_cpumask_read(struct file *filp mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@@ -2695,7 -2667,7 +2695,7 @@@ tracing_cpumask_write(struct file *filp int err, cpu; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); if (err) goto err_unlock; diff --combined lib/Kconfig index fd4118e097f,7823f8342ab..2ba43c4a5b0 --- a/lib/Kconfig +++ b/lib/Kconfig @@@ -64,8 -64,6 +64,8 @@@ config CRC config LIBCRC32C tristate "CRC32c (Castagnoli, et al) Cyclic Redundancy-Check" + select CRYPTO + select CRYPTO_CRC32C help This option is provided for the case where no in-kernel-tree modules require CRC32c functions, but a module built outside the @@@ -159,4 -157,11 +159,11 @@@ config CHECK_SIGNATUR config HAVE_LMB boolean + config CPUMASK_OFFSTACK + bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + help + Use dynamic allocation for cpumask_var_t, instead of putting + them on the stack. This is a bit more expensive, but avoids + stack overflow. + endmenu diff --combined mm/slub.c index 6cb7ad10785,8e516e29f98..0d861c3154b --- a/mm/slub.c +++ b/mm/slub.c @@@ -24,7 -24,6 +24,7 @@@ #include #include #include +#include /* * Lock order: @@@ -154,10 -153,6 +154,10 @@@ #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +#define OO_SHIFT 16 +#define OO_MASK ((1 << OO_SHIFT) - 1) +#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ + /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ @@@ -183,7 -178,7 +183,7 @@@ static LIST_HEAD(slab_caches) * Tracking user of a slab. 
*/ struct track { - void *addr; /* Called from address */ + unsigned long addr; /* Called from address */ int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@@ -295,7 -290,7 +295,7 @@@ static inline struct kmem_cache_order_o unsigned long size) { struct kmem_cache_order_objects x = { - (order << 16) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + (PAGE_SIZE << order) / size }; return x; @@@ -303,12 -298,12 +303,12 @@@ static inline int oo_order(struct kmem_cache_order_objects x) { - return x.x >> 16; + return x.x >> OO_SHIFT; } static inline int oo_objects(struct kmem_cache_order_objects x) { - return x.x & ((1 << 16) - 1); + return x.x & OO_MASK; } #ifdef CONFIG_SLUB_DEBUG @@@ -372,7 -367,7 +372,7 @@@ static struct track *get_track(struct k } static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) + enum track_item alloc, unsigned long addr) { struct track *p; @@@ -396,8 -391,8 +396,8 @@@ static void init_tracking(struct kmem_c if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); } static void print_track(const char *s, struct track *t) @@@ -406,7 -401,7 +406,7 @@@ return; printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, t->addr, jiffies - t->when, t->cpu, t->pid); + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) @@@ -697,7 -692,7 +697,7 @@@ static int check_object(struct kmem_cac if (!check_valid_pointer(s, page, get_freepointer(s, p))) { object_err(s, page, p, "Freepointer corrupt"); /* - * No choice but to zap it and thus loose the remainder + * No choice but to zap it and thus lose the remainder * of the free objects in this slab. May cause * another error because the object count is now wrong. */ @@@ -769,8 -764,8 +769,8 @@@ static int on_freelist(struct kmem_cach } max_objects = (PAGE_SIZE << compound_order(page)) / s->size; - if (max_objects > 65535) - max_objects = 65535; + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { slab_err(s, page, "Wrong number of objects. Found %d but " @@@ -871,7 -866,7 +871,7 @@@ static void setup_object_debug(struct k } static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; @@@ -911,7 -906,7 +911,7 @@@ bad } static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@@ -1034,10 -1029,10 +1034,10 @@@ static inline void setup_object_debug(s struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@@ -1504,8 -1499,8 +1504,8 @@@ static inline int node_match(struct kme * we need to allocate a new slab. 
This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) { void **object; struct page *new; @@@ -1589,18 -1584,13 +1589,18 @@@ debug * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; unsigned long flags; unsigned int objsize; + might_sleep_if(gfpflags & __GFP_WAIT); + + if (should_failslab(s->objsize, gfpflags)) + return NULL; + local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); objsize = c->objsize; @@@ -1623,14 -1613,14 +1623,14 @@@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + return slab_alloc(s, gfpflags, -1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + return slab_alloc(s, gfpflags, node, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif @@@ -1644,7 -1634,7 +1644,7 @@@ * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr, unsigned int offset) + void *x, unsigned long addr, unsigned int offset) { void *prior; void **object = (void *)x; @@@ -1714,7 -1704,7 +1714,7 @@@ debug * with all sorts of special processing. */ static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) + struct page *page, void *x, unsigned long addr) { void **object = (void *)x; struct kmem_cache_cpu *c; @@@ -1741,11 -1731,11 +1741,11 @@@ void kmem_cache_free(struct kmem_cache page = virt_to_head_page(x); - slab_free(s, page, x, __builtin_return_address(0)); + slab_free(s, page, x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab object the object resides */ +/* Figure out on which slab page the object resides */ static struct page *get_object_page(const void *x) { struct page *page = virt_to_head_page(x); @@@ -1817,8 -1807,8 +1817,8 @@@ static inline int slab_order(int size, int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > 65535) - return get_order(size * 65535) - 1; + if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); @@@ -2083,7 -2073,8 +2083,7 @@@ static inline int alloc_kmem_cache_cpus * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. 
*/ -static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) { struct page *page; struct kmem_cache_node *n; @@@ -2121,6 -2112,7 +2121,6 @@@ local_irq_save(flags); add_partial(n, page, 0); local_irq_restore(flags); - return n; } static void free_kmem_cache_nodes(struct kmem_cache *s) @@@ -2152,7 -2144,8 +2152,7 @@@ static int init_kmem_cache_nodes(struc n = &s->local_node; else { if (slab_state == DOWN) { - n = early_kmem_cache_node_alloc(gfpflags, - node); + early_kmem_cache_node_alloc(gfpflags, node); continue; } n = kmem_cache_alloc_node(kmalloc_caches, @@@ -2666,7 -2659,7 +2666,7 @@@ void *__kmalloc(size_t size, gfp_t flag if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, __builtin_return_address(0)); + return slab_alloc(s, flags, -1, _RET_IP_); } EXPORT_SYMBOL(__kmalloc); @@@ -2694,7 -2687,7 +2694,7 @@@ void *__kmalloc_node(size_t size, gfp_ if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, __builtin_return_address(0)); + return slab_alloc(s, flags, node, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_node); #endif @@@ -2751,7 -2744,7 +2751,7 @@@ void kfree(const void *x put_page(page); return; } - slab_free(page->slab, page, object, __builtin_return_address(0)); + slab_free(page->slab, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@@ -3130,12 -3123,8 +3130,12 @@@ struct kmem_cache *kmem_cache_create(co s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); - if (sysfs_slab_alias(s, name)) + if (sysfs_slab_alias(s, name)) { + down_write(&slub_lock); + s->refcount--; + up_write(&slub_lock); goto err; + } return s; } @@@ -3145,13 -3134,8 +3145,13 @@@ size, align, flags, ctor)) { list_add(&s->list, &slab_caches); up_write(&slub_lock); - if (sysfs_slab_add(s)) + if (sysfs_slab_add(s)) { + down_write(&slub_lock); + list_del(&s->list); + up_write(&slub_lock); + kfree(s); goto err; + } return s; } kfree(s); @@@ -3218,7 -3202,7 +3218,7 @@@ static struct notifier_block __cpuinitd #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; @@@ -3234,7 -3218,7 +3234,7 @@@ } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) + int node, unsigned long caller) { struct kmem_cache *s; @@@ -3445,7 -3429,7 +3445,7 @@@ static void resiliency_test(void) {} struct location { unsigned long count; - void *addr; + unsigned long addr; long long sum_time; long min_time; long max_time; @@@ -3493,7 -3477,7 +3493,7 @@@ static int add_location(struct loc_trac { long start, end, pos; struct location *l; - void *caddr; + unsigned long caddr; unsigned long age = jiffies - track->when; start = -1; @@@ -3642,7 -3626,7 +3642,7 @@@ static int list_locations(struct kmem_c len < PAGE_SIZE - 60) { len += sprintf(buf + len, " cpus="); len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->cpus); + &l->cpus); } if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && @@@ -4361,7 -4345,7 +4361,7 @@@ static void sysfs_slab_remove(struct km /* * Need to buffer aliases during bootup until sysfs becomes - * available lest we loose that information. + * available lest we lose that information. */ struct saved_alias { struct kmem_cache *s;