From: Linus Torvalds
Date: Fri, 2 Jan 2009 19:44:09 +0000 (-0800)
Subject: Merge branch 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel...
X-Git-Tag: v2.6.29-rc1~538
X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=b840d79631c882786925303c2b0f4fefc31845ed;hp=-c;p=linux-2.6-omap-h63xx.git

Merge branch 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (66 commits)
  x86: export vector_used_by_percpu_irq
  x86: use logical apicid in x2apic_cluster's x2apic_cpu_mask_to_apicid_and()
  sched: nominate preferred wakeup cpu, fix
  x86: fix lguest used_vectors breakage, -v2
  x86: fix warning in arch/x86/kernel/io_apic.c
  sched: fix warning in kernel/sched.c
  sched: move test_sd_parent() to an SMP section of sched.h
  sched: add SD_BALANCE_NEWIDLE at MC and CPU level for sched_mc>0
  sched: activate active load balancing in new idle cpus
  sched: bias task wakeups to preferred semi-idle packages
  sched: nominate preferred wakeup cpu
  sched: favour lower logical cpu number for sched_mc balance
  sched: framework for sched_mc/smt_power_savings=N
  sched: convert BALANCE_FOR_xx_POWER to inline functions
  x86: use possible_cpus=NUM to extend the possible cpus allowed
  x86: fix cpu_mask_to_apicid_and to include cpu_online_mask
  x86: update io_apic.c to the new cpumask code
  x86: Introduce topology_core_cpumask()/topology_thread_cpumask()
  x86: xen: use smp_call_function_many()
  x86: use work_on_cpu in x86/kernel/cpu/mcheck/mce_amd_64.c
  ...

Fixed up trivial conflict in kernel/time/tick-sched.c manually

--- b840d79631c882786925303c2b0f4fefc31845ed
diff --combined arch/arm/kernel/smp.c
index 019237d2162,bd905c0a736..55fa7ff96a3
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@@ -33,16 -33,6 +33,6 @@@ #include #include - /* - * bitmask of present and online CPUs. - * The present bitmask indicates that the CPU is physically present. - * The online bitmask indicates that the CPU is up and running. - */ - cpumask_t cpu_possible_map; - EXPORT_SYMBOL(cpu_possible_map); - cpumask_t cpu_online_map; - EXPORT_SYMBOL(cpu_online_map); - /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core
@@@ -181,7 -171,7 +171,7 @@@ int __cpuexit __cpu_disable(void /* * Stop the local timer for this CPU. */ - local_timer_stop(cpu); + local_timer_stop(); /* * Flush user cache and TLB mappings, and then remove this CPU
@@@ -284,7 -274,7 +274,7 @@@ asmlinkage void __cpuinit secondary_sta /* * Setup local timer for this CPU. */ - local_timer_setup(cpu); + local_timer_setup(); calibrate_delay();
diff --combined arch/arm/mach-at91/at91rm9200_time.c
index d140eae53de,72f51d39202..1ff1bda0a89
--- a/arch/arm/mach-at91/at91rm9200_time.c
+++ b/arch/arm/mach-at91/at91rm9200_time.c
@@@ -141,15 -141,6 +141,15 @@@ clkevt32k_next_event(unsigned long delt /* Use "raw" primitives so we behave correctly on RT kernels. */ raw_local_irq_save(flags); + /* + * According to Thomas Gleixner irqs are already disabled here. Simply + * removing raw_local_irq_save above (and the matching + * raw_local_irq_restore) was not accepted. See + * http://thread.gmane.org/gmane.linux.ports.arm.kernel/41174 + * So for now (2008-11-20) just warn once if irqs were not disabled ... + */ + WARN_ON_ONCE(!raw_irqs_disabled_flags(flags)); + /* The alarm IRQ uses absolute time (now+delta), not the relative * time (delta) in our calling convention.
Like all clockevents * using such "match" hardware, we have a race to defend against. @@@ -178,7 -169,6 +178,6 @@@ static struct clock_event_device clkev .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 150, - .cpumask = CPU_MASK_CPU0, .set_next_event = clkevt32k_next_event, .set_mode = clkevt32k_mode, }; @@@ -206,7 -196,7 +205,7 @@@ void __init at91rm9200_timer_init(void clkevt.mult = div_sc(AT91_SLOW_CLOCK, NSEC_PER_SEC, clkevt.shift); clkevt.max_delta_ns = clockevent_delta2ns(AT91_ST_ALMV, &clkevt); clkevt.min_delta_ns = clockevent_delta2ns(2, &clkevt) + 1; - clkevt.cpumask = cpumask_of_cpu(0); + clkevt.cpumask = cpumask_of(0); clockevents_register_device(&clkevt); /* register clocksource */ diff --combined arch/arm/mach-pxa/time.c index 00162415851,bf3c9a4aad5..95656a72268 --- a/arch/arm/mach-pxa/time.c +++ b/arch/arm/mach-pxa/time.c @@@ -22,8 -22,8 +22,8 @@@ #include #include #include +#include #include -#include /* * This is PXA's sched_clock implementation. This has a resolution @@@ -122,7 -122,6 +122,6 @@@ static struct clock_event_device ckevt_ .features = CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 200, - .cpumask = CPU_MASK_CPU0, .set_next_event = pxa_osmr0_set_next_event, .set_mode = pxa_osmr0_set_mode, }; @@@ -150,11 -149,18 +149,11 @@@ static struct irqaction pxa_ost0_irq = static void __init pxa_timer_init(void) { - unsigned long clock_tick_rate; + unsigned long clock_tick_rate = get_clock_tick_rate(); OIER = 0; OSSR = OSSR_M0 | OSSR_M1 | OSSR_M2 | OSSR_M3; - if (cpu_is_pxa25x()) - clock_tick_rate = 3686400; - else if (machine_is_mainstone()) - clock_tick_rate = 3249600; - else - clock_tick_rate = 3250000; - set_oscr2ns_scale(clock_tick_rate); ckevt_pxa_osmr0.mult = @@@ -163,6 -169,7 +162,7 @@@ clockevent_delta2ns(0x7fffffff, &ckevt_pxa_osmr0); ckevt_pxa_osmr0.min_delta_ns = clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_pxa_osmr0) + 1; + ckevt_pxa_osmr0.cpumask = cpumask_of(0); cksrc_pxa_oscr0.mult = clocksource_hz2mult(clock_tick_rate, cksrc_pxa_oscr0.shift); diff --combined arch/arm/mach-realview/core.c index 5f1d55963ce,b07cb9b7adb..bd2aa4f1614 --- a/arch/arm/mach-realview/core.c +++ b/arch/arm/mach-realview/core.c @@@ -28,14 -28,11 +28,14 @@@ #include #include #include +#include +#include #include #include #include #include +#include #include #include @@@ -52,7 -49,7 +52,7 @@@ #define REALVIEW_REFCOUNTER (__io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_24MHz_OFFSET) -/* used by entry-macro.S */ +/* used by entry-macro.S and platsmp.c */ void __iomem *gic_cpu_base_addr; /* @@@ -127,29 -124,6 +127,29 @@@ int realview_flash_register(struct reso return platform_device_register(&realview_flash_device); } +static struct smc911x_platdata realview_smc911x_platdata = { + .flags = SMC911X_USE_32BIT, + .irq_flags = IRQF_SHARED, + .irq_polarity = 1, +}; + +static struct platform_device realview_eth_device = { + .name = "smc911x", + .id = 0, + .num_resources = 2, +}; + +int realview_eth_register(const char *name, struct resource *res) +{ + if (name) + realview_eth_device.name = name; + realview_eth_device.resource = res; + if (strcmp(realview_eth_device.name, "smc911x") == 0) + realview_eth_device.dev.platform_data = &realview_smc911x_platdata; + + return platform_device_register(&realview_eth_device); +} + static struct resource realview_i2c_resource = { .start = REALVIEW_I2C_BASE, .end = REALVIEW_I2C_BASE + SZ_4K - 1, @@@ -203,14 -177,9 +203,14 @@@ static const struct icst307_params real static void realview_oscvco_set(struct clk *clk, struct 
icst307_vco vco) { void __iomem *sys_lock = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_LOCK_OFFSET; - void __iomem *sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC4_OFFSET; + void __iomem *sys_osc; u32 val; + if (machine_is_realview_pb1176()) + sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC0_OFFSET; + else + sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC4_OFFSET; + val = readl(sys_osc) & ~0x7ffff; val |= vco.v | (vco.r << 9) | (vco.s << 16); @@@ -219,59 -188,12 +219,59 @@@ writel(0, sys_lock); } -struct clk realview_clcd_clk = { - .name = "CLCDCLK", +static struct clk oscvco_clk = { .params = &realview_oscvco_params, .setvco = realview_oscvco_set, }; +/* + * These are fixed clocks. + */ +static struct clk ref24_clk = { + .rate = 24000000, +}; + +static struct clk_lookup lookups[] = { + { /* UART0 */ + .dev_id = "dev:f1", + .clk = &ref24_clk, + }, { /* UART1 */ + .dev_id = "dev:f2", + .clk = &ref24_clk, + }, { /* UART2 */ + .dev_id = "dev:f3", + .clk = &ref24_clk, + }, { /* UART3 */ + .dev_id = "fpga:09", + .clk = &ref24_clk, + }, { /* KMI0 */ + .dev_id = "fpga:06", + .clk = &ref24_clk, + }, { /* KMI1 */ + .dev_id = "fpga:07", + .clk = &ref24_clk, + }, { /* MMC0 */ + .dev_id = "fpga:05", + .clk = &ref24_clk, + }, { /* EB:CLCD */ + .dev_id = "dev:20", + .clk = &oscvco_clk, + }, { /* PB:CLCD */ + .dev_id = "issp:20", + .clk = &oscvco_clk, + } +}; + +static int __init clk_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lookups); i++) + clkdev_add(&lookups[i]); + return 0; +} +arch_initcall(clk_init); + /* * CLCD support. */ @@@ -304,30 -226,7 +304,30 @@@ static struct clcd_panel vga = .width = -1, .height = -1, .tim2 = TIM2_BCD | TIM2_IPC, - .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1), + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), + .bpp = 16, +}; + +static struct clcd_panel xvga = { + .mode = { + .name = "XVGA", + .refresh = 60, + .xres = 1024, + .yres = 768, + .pixclock = 15748, + .left_margin = 152, + .right_margin = 48, + .upper_margin = 23, + .lower_margin = 3, + .hsync_len = 104, + .vsync_len = 4, + .sync = 0, + .vmode = FB_VMODE_NONINTERLACED, + }, + .width = -1, + .height = -1, + .tim2 = TIM2_BCD | TIM2_IPC, + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), .bpp = 16, }; @@@ -350,7 -249,7 +350,7 @@@ static struct clcd_panel sanyo_3_8_in .width = -1, .height = -1, .tim2 = TIM2_BCD, - .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1), + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), .bpp = 16, }; @@@ -373,7 -272,7 +373,7 @@@ static struct clcd_panel sanyo_2_5_in .width = -1, .height = -1, .tim2 = TIM2_IVS | TIM2_IHS | TIM2_IPC, - .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1), + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), .bpp = 16, }; @@@ -396,7 -295,7 +396,7 @@@ static struct clcd_panel epson_2_2_in .width = -1, .height = -1, .tim2 = TIM2_BCD | TIM2_IPC, - .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1), + .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1), .bpp = 16, }; @@@ -409,15 -308,9 +409,15 @@@ static struct clcd_panel *realview_clcd_panel(void) { void __iomem *sys_clcd = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_CLCD_OFFSET; - struct clcd_panel *panel = &vga; + struct clcd_panel *vga_panel; + struct clcd_panel *panel; u32 val; + if (machine_is_realview_eb()) + vga_panel = &vga; + else + vga_panel = &xvga; + val = readl(sys_clcd) & SYS_CLCD_ID_MASK; if (val == SYS_CLCD_ID_SANYO_3_8) panel = &sanyo_3_8_in; @@@ -426,11 -319,11 +426,11 @@@ else if (val == SYS_CLCD_ID_EPSON_2_2) panel = &epson_2_2_in; else if (val == SYS_CLCD_ID_VGA) - 
panel = &vga; + panel = vga_panel; else { printk(KERN_ERR "CLCD: unknown LCD panel ID 0x%08x, using VGA\n", val); - panel = &vga; + panel = vga_panel; } return panel; @@@ -465,18 -358,12 +465,18 @@@ static void realview_clcd_enable(struc writel(val, sys_clcd); } -static unsigned long framesize = SZ_1M; - static int realview_clcd_setup(struct clcd_fb *fb) { + unsigned long framesize; dma_addr_t dma; + if (machine_is_realview_eb()) + /* VGA, 16bpp */ + framesize = 640 * 480 * 2; + else + /* XVGA, 16bpp */ + framesize = 1024 * 768 * 2; + fb->panel = realview_clcd_panel(); fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, framesize, @@@ -624,7 -511,7 +624,7 @@@ static struct clock_event_device timer0 .set_mode = timer_set_mode, .set_next_event = timer_set_next_event, .rating = 300, - .cpumask = CPU_MASK_ALL, + .cpumask = cpu_all_mask, }; static void __init realview_clockevents_init(unsigned int timer_irq) @@@ -701,7 -588,7 +701,7 @@@ void __init realview_timer_init(unsigne * The dummy clock device has to be registered before the main device * so that the latter will broadcast the clock events */ - local_timer_setup(smp_processor_id()); + local_timer_setup(); #endif /* diff --combined arch/arm/mach-realview/localtimer.c index 9019ef2e561,504961ef343..67d6d9cc68b --- a/arch/arm/mach-realview/localtimer.c +++ b/arch/arm/mach-realview/localtimer.c @@@ -38,14 -38,18 +38,14 @@@ void local_timer_interrupt(void #ifdef CONFIG_LOCAL_TIMERS -#define TWD_BASE(cpu) (twd_base_addr + (cpu) * twd_size) - /* set up by the platform code */ -void __iomem *twd_base_addr; -unsigned int twd_size; +void __iomem *twd_base; static unsigned long mpcore_timer_rate; static void local_timer_set_mode(enum clock_event_mode mode, struct clock_event_device *clk) { - void __iomem *base = TWD_BASE(smp_processor_id()); unsigned long ctrl; switch(mode) { @@@ -64,16 -68,17 +64,16 @@@ ctrl = 0; } - __raw_writel(ctrl, base + TWD_TIMER_CONTROL); + __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); } static int local_timer_set_next_event(unsigned long evt, struct clock_event_device *unused) { - void __iomem *base = TWD_BASE(smp_processor_id()); - unsigned long ctrl = __raw_readl(base + TWD_TIMER_CONTROL); + unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL); - __raw_writel(evt, base + TWD_TIMER_COUNTER); - __raw_writel(ctrl | TWD_TIMER_CONTROL_ENABLE, base + TWD_TIMER_CONTROL); + __raw_writel(evt, twd_base + TWD_TIMER_COUNTER); + __raw_writel(ctrl | TWD_TIMER_CONTROL_ENABLE, twd_base + TWD_TIMER_CONTROL); return 0; } @@@ -86,16 -91,19 +86,16 @@@ */ int local_timer_ack(void) { - void __iomem *base = TWD_BASE(smp_processor_id()); - - if (__raw_readl(base + TWD_TIMER_INTSTAT)) { - __raw_writel(1, base + TWD_TIMER_INTSTAT); + if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) { + __raw_writel(1, twd_base + TWD_TIMER_INTSTAT); return 1; } return 0; } -static void __cpuinit twd_calibrate_rate(unsigned int cpu) +static void __cpuinit twd_calibrate_rate(void) { - void __iomem *base = TWD_BASE(cpu); unsigned long load, count; u64 waitjiffies; @@@ -116,15 -124,15 +116,15 @@@ waitjiffies += 5; /* enable, no interrupt or reload */ - __raw_writel(0x1, base + TWD_TIMER_CONTROL); + __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL); /* maximum value */ - __raw_writel(0xFFFFFFFFU, base + TWD_TIMER_COUNTER); + __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); while (get_jiffies_64() < waitjiffies) udelay(10); - count = __raw_readl(base + TWD_TIMER_COUNTER); + count = __raw_readl(twd_base + TWD_TIMER_COUNTER); mpcore_timer_rate = 
(0xFFFFFFFFU - count) * (HZ / 5); @@@ -134,19 -142,18 +134,19 @@@ load = mpcore_timer_rate / HZ; - __raw_writel(load, base + TWD_TIMER_LOAD); + __raw_writel(load, twd_base + TWD_TIMER_LOAD); } /* * Setup the local clock events for a CPU. */ -void __cpuinit local_timer_setup(unsigned int cpu) +void __cpuinit local_timer_setup(void) { + unsigned int cpu = smp_processor_id(); struct clock_event_device *clk = &per_cpu(local_clockevent, cpu); unsigned long flags; - twd_calibrate_rate(cpu); + twd_calibrate_rate(); clk->name = "local_timer"; clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; @@@ -154,7 -161,7 +154,7 @@@ clk->set_mode = local_timer_set_mode; clk->set_next_event = local_timer_set_next_event; clk->irq = IRQ_LOCALTIMER; - clk->cpumask = cpumask_of_cpu(cpu); + clk->cpumask = cpumask_of(cpu); clk->shift = 20; clk->mult = div_sc(mpcore_timer_rate, NSEC_PER_SEC, clk->shift); clk->max_delta_ns = clockevent_delta2ns(0xffffffff, clk); @@@ -171,9 -178,9 +171,9 @@@ /* * take a local timer down */ -void __cpuexit local_timer_stop(unsigned int cpu) +void __cpuexit local_timer_stop(void) { - __raw_writel(0, TWD_BASE(cpu) + TWD_TIMER_CONTROL); + __raw_writel(0, twd_base + TWD_TIMER_CONTROL); } #else /* CONFIG_LOCAL_TIMERS */ @@@ -183,9 -190,8 +183,9 @@@ static void dummy_timer_set_mode(enum c { } -void __cpuinit local_timer_setup(unsigned int cpu) +void __cpuinit local_timer_setup(void) { + unsigned int cpu = smp_processor_id(); struct clock_event_device *clk = &per_cpu(local_clockevent, cpu); clk->name = "dummy_timer"; @@@ -193,7 -199,7 +193,7 @@@ clk->rating = 200; clk->set_mode = dummy_timer_set_mode; clk->broadcast = smp_timer_broadcast; - clk->cpumask = cpumask_of_cpu(cpu); + clk->cpumask = cpumask_of(cpu); clockevents_register_device(clk); } diff --combined arch/arm/mach-sa1100/time.c index 8c5e727f3b7,1cac4ac0b4b..711c0295c66 --- a/arch/arm/mach-sa1100/time.c +++ b/arch/arm/mach-sa1100/time.c @@@ -2,8 -2,8 +2,8 @@@ * linux/arch/arm/mach-sa1100/time.c * * Copyright (C) 1998 Deborah Wallach. - * Twiddles (C) 1999 Hugo Fiennes - * + * Twiddles (C) 1999 Hugo Fiennes + * * 2000/03/29 (C) Nicolas Pitre * Rewritten: big cleanup, much simpler, better HZ accuracy. 
* @@@ -73,7 -73,6 +73,6 @@@ static struct clock_event_device ckevt_ .features = CLOCK_EVT_FEAT_ONESHOT, .shift = 32, .rating = 200, - .cpumask = CPU_MASK_CPU0, .set_next_event = sa1100_osmr0_set_next_event, .set_mode = sa1100_osmr0_set_mode, }; @@@ -110,6 -109,7 +109,7 @@@ static void __init sa1100_timer_init(vo clockevent_delta2ns(0x7fffffff, &ckevt_sa1100_osmr0); ckevt_sa1100_osmr0.min_delta_ns = clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_sa1100_osmr0) + 1; + ckevt_sa1100_osmr0.cpumask = cpumask_of(0); cksrc_sa1100_oscr.mult = clocksource_hz2mult(CLOCK_TICK_RATE, cksrc_sa1100_oscr.shift); diff --combined arch/arm/mach-versatile/core.c index df25aa13850,a3f1933434e..1c43494f5c4 --- a/arch/arm/mach-versatile/core.c +++ b/arch/arm/mach-versatile/core.c @@@ -31,7 -31,6 +31,7 @@@ #include #include +#include #include #include #include @@@ -374,60 -373,22 +374,60 @@@ static const struct icst307_params vers static void versatile_oscvco_set(struct clk *clk, struct icst307_vco vco) { - void __iomem *sys_lock = __io_address(VERSATILE_SYS_BASE) + VERSATILE_SYS_LOCK_OFFSET; - void __iomem *sys_osc = __io_address(VERSATILE_SYS_BASE) + VERSATILE_SYS_OSCCLCD_OFFSET; + void __iomem *sys = __io_address(VERSATILE_SYS_BASE); + void __iomem *sys_lock = sys + VERSATILE_SYS_LOCK_OFFSET; u32 val; - val = readl(sys_osc) & ~0x7ffff; + val = readl(sys + clk->oscoff) & ~0x7ffff; val |= vco.v | (vco.r << 9) | (vco.s << 16); writel(0xa05f, sys_lock); - writel(val, sys_osc); + writel(val, sys + clk->oscoff); writel(0, sys_lock); } -static struct clk versatile_clcd_clk = { - .name = "CLCDCLK", +static struct clk osc4_clk = { .params = &versatile_oscvco_params, - .setvco = versatile_oscvco_set, + .oscoff = VERSATILE_SYS_OSCCLCD_OFFSET, + .setvco = versatile_oscvco_set, +}; + +/* + * These are fixed clocks. + */ +static struct clk ref24_clk = { + .rate = 24000000, +}; + +static struct clk_lookup lookups[] __initdata = { + { /* UART0 */ + .dev_id = "dev:f1", + .clk = &ref24_clk, + }, { /* UART1 */ + .dev_id = "dev:f2", + .clk = &ref24_clk, + }, { /* UART2 */ + .dev_id = "dev:f3", + .clk = &ref24_clk, + }, { /* UART3 */ + .dev_id = "fpga:09", + .clk = &ref24_clk, + }, { /* KMI0 */ + .dev_id = "fpga:06", + .clk = &ref24_clk, + }, { /* KMI1 */ + .dev_id = "fpga:07", + .clk = &ref24_clk, + }, { /* MMC0 */ + .dev_id = "fpga:05", + .clk = &ref24_clk, + }, { /* MMC1 */ + .dev_id = "fpga:0b", + .clk = &ref24_clk, + }, { /* CLCD */ + .dev_id = "dev:20", + .clk = &osc4_clk, + } }; /* @@@ -825,8 -786,7 +825,8 @@@ void __init versatile_init(void { int i; - clk_register(&versatile_clcd_clk); + for (i = 0; i < ARRAY_SIZE(lookups); i++) + clkdev_add(&lookups[i]); platform_device_register(&versatile_flash_device); platform_device_register(&versatile_i2c_device); @@@ -1005,7 -965,7 +1005,7 @@@ static void __init versatile_timer_init timer0_clockevent.min_delta_ns = clockevent_delta2ns(0xf, &timer0_clockevent); - timer0_clockevent.cpumask = cpumask_of_cpu(0); + timer0_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&timer0_clockevent); } diff --combined arch/powerpc/kernel/smp.c index 8ac3f721d23,d1165566f06..65484b2200b --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@@ -57,15 -57,12 +57,11 @@@ #define DBG(fmt...) 
#endif -int smp_hw_index[NR_CPUS]; struct thread_info *secondary_ti; - cpumask_t cpu_possible_map = CPU_MASK_NONE; - cpumask_t cpu_online_map = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_core_map) = CPU_MASK_NONE; - EXPORT_SYMBOL(cpu_online_map); - EXPORT_SYMBOL(cpu_possible_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); @@@ -122,65 -119,6 +118,65 @@@ void smp_message_recv(int msg } } +static irqreturn_t call_function_action(int irq, void *data) +{ + generic_smp_call_function_interrupt(); + return IRQ_HANDLED; +} + +static irqreturn_t reschedule_action(int irq, void *data) +{ + /* we just need the return path side effect of checking need_resched */ + return IRQ_HANDLED; +} + +static irqreturn_t call_function_single_action(int irq, void *data) +{ + generic_smp_call_function_single_interrupt(); + return IRQ_HANDLED; +} + +static irqreturn_t debug_ipi_action(int irq, void *data) +{ + smp_message_recv(PPC_MSG_DEBUGGER_BREAK); + return IRQ_HANDLED; +} + +static irq_handler_t smp_ipi_action[] = { + [PPC_MSG_CALL_FUNCTION] = call_function_action, + [PPC_MSG_RESCHEDULE] = reschedule_action, + [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action, + [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action, +}; + +const char *smp_ipi_name[] = { + [PPC_MSG_CALL_FUNCTION] = "ipi call function", + [PPC_MSG_RESCHEDULE] = "ipi reschedule", + [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single", + [PPC_MSG_DEBUGGER_BREAK] = "ipi debugger", +}; + +/* optional function to request ipi, for controllers with >= 4 ipis */ +int smp_request_message_ipi(int virq, int msg) +{ + int err; + + if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) { + return -EINVAL; + } +#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC) + if (msg == PPC_MSG_DEBUGGER_BREAK) { + return 1; + } +#endif + err = request_irq(virq, smp_ipi_action[msg], IRQF_DISABLED|IRQF_PERCPU, + smp_ipi_name[msg], 0); + WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n", + virq, smp_ipi_name[msg], err); + + return err; +} + void smp_send_reschedule(int cpu) { if (likely(smp_ops)) @@@ -466,7 -404,8 +462,7 @@@ out static struct device_node *cpu_to_l2cache(int cpu) { struct device_node *np; - const phandle *php; - phandle ph; + struct device_node *cache; if (!cpu_present(cpu)) return NULL; @@@ -475,11 -414,13 +471,11 @@@ if (np == NULL) return NULL; - php = of_get_property(np, "l2-cache", NULL); - if (php == NULL) - return NULL; - ph = *php; + cache = of_find_next_cache_node(np); + of_node_put(np); - return of_find_node_by_phandle(ph); + return cache; } /* Activate a secondary processor. */ diff --combined arch/powerpc/kernel/time.c index e1f3a514042,6f39d35d6f5..99f1ddd6858 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@@ -164,6 -164,8 +164,6 @@@ static u64 tb_to_ns_scale __read_mostly static unsigned tb_to_ns_shift __read_mostly; static unsigned long boot_tb __read_mostly; -static struct gettimeofday_struct do_gtod; - extern struct timezone sys_tz; static long timezone_offset; @@@ -413,9 -415,31 +413,9 @@@ void udelay(unsigned long usecs } EXPORT_SYMBOL(udelay); - -/* - * There are two copies of tb_to_xs and stamp_xsec so that no - * lock is needed to access and use these values in - * do_gettimeofday. We alternate the copies and as long as a - * reasonable time elapses between changes, there will never - * be inconsistent values. ntpd has a minimum of one minute - * between updates. 
- */ static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, u64 new_tb_to_xs) { - unsigned temp_idx; - struct gettimeofday_vars *temp_varp; - - temp_idx = (do_gtod.var_idx == 0); - temp_varp = &do_gtod.vars[temp_idx]; - - temp_varp->tb_to_xs = new_tb_to_xs; - temp_varp->tb_orig_stamp = new_tb_stamp; - temp_varp->stamp_xsec = new_stamp_xsec; - smp_mb(); - do_gtod.varp = temp_varp; - do_gtod.var_idx = temp_idx; - /* * tb_update_count is used to allow the userspace gettimeofday code * to assure itself that it sees a consistent view of the tb_to_xs and @@@ -432,7 -456,6 +432,7 @@@ vdso_data->tb_to_xs = new_tb_to_xs; vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; + vdso_data->stamp_xtime = xtime; smp_wmb(); ++(vdso_data->tb_update_count); } @@@ -491,7 -514,9 +491,7 @@@ static int __init iSeries_tb_recal(void tb_ticks_per_sec = new_tb_ticks_per_sec; calc_cputime_factors(); div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; - do_gtod.varp->tb_to_xs = tb_to_xs; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; vdso_data->tb_to_xs = tb_to_xs; } @@@ -844,7 -869,7 +844,7 @@@ static void register_decrementer_clocke struct clock_event_device *dec = &per_cpu(decrementers, cpu).event; *dec = decrementer_clockevent; - dec->cpumask = cpumask_of_cpu(cpu); + dec->cpumask = cpumask_of(cpu); printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n", dec->name, dec->mult, dec->shift, cpu); @@@ -963,6 -988,15 +963,6 @@@ void __init time_init(void sys_tz.tz_dsttime = 0; } - do_gtod.varp = &do_gtod.vars[0]; - do_gtod.var_idx = 0; - do_gtod.varp->tb_orig_stamp = tb_last_jiffy; - __get_cpu_var(last_jiffy) = tb_last_jiffy; - do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - do_gtod.varp->tb_to_xs = tb_to_xs; - do_gtod.tb_to_us = tb_to_us; - vdso_data->tb_orig_stamp = tb_last_jiffy; vdso_data->tb_update_count = 0; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; diff --combined arch/powerpc/platforms/pseries/xics.c index f7a69021b7b,424b335a71c..84e058f1e1c --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@@ -332,7 -332,7 +332,7 @@@ static void xics_eoi_lpar(unsigned int lpar_xirr_info_set((0xff << 24) | irq); } - static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) + static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) { unsigned int irq; int status; @@@ -579,7 -579,7 +579,7 @@@ static void xics_update_irq_servers(voi int i, j; struct device_node *np; u32 ilen; - const u32 *ireg, *isize; + const u32 *ireg; u32 hcpuid; /* Find the server numbers for the boot cpu. 
*/ @@@ -607,6 -607,11 +607,6 @@@ } } - /* get the bit size of server numbers */ - isize = of_get_property(np, "ibm,interrupt-server#-size", NULL); - if (isize) - interrupt_server_size = *isize; - of_node_put(np); } @@@ -677,7 -682,6 +677,7 @@@ void __init xics_init_IRQ(void struct device_node *np; u32 indx = 0; int found = 0; + const u32 *isize; ppc64_boot_msg(0x20, "XICS Init"); @@@ -697,26 -701,6 +697,26 @@@ if (found == 0) return; + /* get the bit size of server numbers */ + found = 0; + + for_each_compatible_node(np, NULL, "ibm,ppc-xics") { + isize = of_get_property(np, "ibm,interrupt-server#-size", NULL); + + if (!isize) + continue; + + if (!found) { + interrupt_server_size = *isize; + found = 1; + } else if (*isize != interrupt_server_size) { + printk(KERN_WARNING "XICS: " + "mismatched ibm,interrupt-server#-size\n"); + interrupt_server_size = max(*isize, + interrupt_server_size); + } + } + xics_update_irq_servers(); xics_init_host(); @@@ -744,18 -728,9 +744,18 @@@ static void xics_set_cpu_priority(unsig /* Have the calling processor join or leave the specified global queue */ static void xics_set_cpu_giq(unsigned int gserver, unsigned int join) { - int status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE, - (1UL << interrupt_server_size) - 1 - gserver, join); - WARN_ON(status < 0); + int index; + int status; + + if (!rtas_indicator_present(GLOBAL_INTERRUPT_QUEUE, NULL)) + return; + + index = (1UL << interrupt_server_size) - 1 - gserver; + + status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE, index, join); + + WARN(status < 0, "set-indicator(%d, %d, %u) returned %d\n", + GLOBAL_INTERRUPT_QUEUE, index, join, status); } void xics_setup_cpu(void) @@@ -870,7 -845,7 +870,7 @@@ void xics_migrate_irqs_away(void /* Reset affinity to all cpus */ irq_desc[virq].affinity = CPU_MASK_ALL; - desc->chip->set_affinity(virq, CPU_MASK_ALL); + desc->chip->set_affinity(virq, cpu_all_mask); unlock: spin_unlock_irqrestore(&desc->lock, flags); } diff --combined arch/powerpc/sysdev/mpic.c index c82babb7007,5d7f9f0c93c..3e0d89dcdba --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@@ -661,6 -661,17 +661,6 @@@ static inline void mpic_eoi(struct mpi (void)mpic_cpu_read(MPIC_INFO(CPU_WHOAMI)); } -#ifdef CONFIG_SMP -static irqreturn_t mpic_ipi_action(int irq, void *data) -{ - long ipi = (long)data; - - smp_message_recv(ipi); - - return IRQ_HANDLED; -} -#endif /* CONFIG_SMP */ - /* * Linux descriptor level callbacks */ @@@ -806,7 -817,7 +806,7 @@@ static void mpic_end_ipi(unsigned int i #endif /* CONFIG_SMP */ - void mpic_set_affinity(unsigned int irq, cpumask_t cpumask) + void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct mpic *mpic = mpic_from_irq(irq); unsigned int src = mpic_irq_to_hw(irq); @@@ -818,7 -829,7 +818,7 @@@ } else { cpumask_t tmp; - cpus_and(tmp, cpumask, cpu_online_map); + cpumask_and(&tmp, cpumask, cpu_online_mask); mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), mpic_physmask(cpus_addr(tmp)[0])); @@@ -1537,7 -1548,13 +1537,7 @@@ unsigned int mpic_get_mcirq(void void mpic_request_ipis(void) { struct mpic *mpic = mpic_primary; - long i, err; - static char *ipi_names[] = { - "IPI0 (call function)", - "IPI1 (reschedule)", - "IPI2 (call function single)", - "IPI3 (debugger break)", - }; + int i; BUG_ON(mpic == NULL); printk(KERN_INFO "mpic: requesting IPIs ... 
\n"); @@@ -1546,10 -1563,17 +1546,10 @@@ unsigned int vipi = irq_create_mapping(mpic->irqhost, mpic->ipi_vecs[0] + i); if (vipi == NO_IRQ) { - printk(KERN_ERR "Failed to map IPI %ld\n", i); - break; - } - err = request_irq(vipi, mpic_ipi_action, - IRQF_DISABLED|IRQF_PERCPU, - ipi_names[i], (void *)i); - if (err) { - printk(KERN_ERR "Request of irq %d for IPI %ld failed\n", - vipi, i); - break; + printk(KERN_ERR "Failed to map %s\n", smp_ipi_name[i]); + continue; } + smp_request_message_ipi(vipi, i); } } diff --combined arch/s390/Kconfig index 8152fefc97b,b4aa5869c7f..19577aeffd7 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@@ -43,9 -43,6 +43,9 @@@ config GENERIC_HWEIGH config GENERIC_TIME def_bool y +config GENERIC_TIME_VSYSCALL + def_bool y + config GENERIC_CLOCKEVENTS def_bool y @@@ -69,20 -66,16 +69,21 @@@ config PGST bool default y if KVM +config VIRT_CPU_ACCOUNTING + def_bool y + mainmenu "Linux Kernel Configuration" config S390 def_bool y + select USE_GENERIC_SMP_HELPERS if SMP + select HAVE_FUNCTION_TRACER select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK + select INIT_ALL_POSSIBLE source "init/Kconfig" @@@ -233,14 -226,6 +234,14 @@@ config MARCH_Z9_10 Class (z9 BC). The kernel will be slightly faster but will not work on older machines such as the z990, z890, z900, and z800. +config MARCH_Z10 + bool "IBM System z10" + help + Select this to enable optimizations for IBM System z10. The + kernel will be slightly faster but will not work on older + machines such as the z990, z890, z900, z800, z9-109, z9-ec + and z9-bc. + endchoice config PACK_STACK @@@ -359,6 -344,16 +360,6 @@@ config QDI If unsure, say Y. -config QDIO_DEBUG - bool "Extended debugging information" - depends on QDIO - help - Say Y here to get extended debugging output in - /sys/kernel/debug/s390dbf/qdio... - Warning: this option reduces the performance of the QDIO module. - - If unsure, say N. - config CHSC_SCH tristate "Support for CHSC subchannels" help @@@ -472,9 -467,22 +473,9 @@@ config PAGE_STATE hypervisor. The ESSA instruction is used to do the states changes between a page that has content and the unused state. -config VIRT_TIMER - bool "Virtual CPU timer support" - help - This provides a kernel interface for virtual CPU timers. - Default is disabled. - -config VIRT_CPU_ACCOUNTING - bool "Base user process accounting on virtual cpu timer" - depends on VIRT_TIMER - help - Select this option to use CPU timer deltas to do user - process accounting. - config APPLDATA_BASE bool "Linux - VM Monitor Stream, base infrastructure" - depends on PROC_FS && VIRT_TIMER=y + depends on PROC_FS help This provides a kernel interface for creating and updating z/VM APPLDATA monitor records. The monitor records are updated at certain time diff --combined arch/s390/kernel/smp.c index 6fc78541dc5,f03914b8ed2..3ed5c7a83c6 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@@ -20,9 -20,6 +20,9 @@@ * cpu_number_map in other architectures. 
*/ +#define KMSG_COMPONENT "cpu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include #include #include @@@ -55,12 -52,6 +55,6 @@@ struct _lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); - cpumask_t cpu_online_map = CPU_MASK_NONE; - EXPORT_SYMBOL(cpu_online_map); - - cpumask_t cpu_possible_map = CPU_MASK_ALL; - EXPORT_SYMBOL(cpu_possible_map); - static struct task_struct *current_set[NR_CPUS]; static u8 smp_cpu_type; @@@ -80,6 -71,159 +74,6 @@@ static DEFINE_PER_CPU(struct cpu, cpu_d static void smp_ext_bitcall(int, ec_bit_sig); -/* - * Structure and data for __smp_call_function_map(). This is designed to - * minimise static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - cpumask_t started; - cpumask_t finished; - int wait; -}; - -static struct call_data_struct *call_data; - -/* - * 'Call function' interrupt callback - */ -static void do_call_function(void) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - cpu_set(smp_processor_id(), call_data->started); - (*func)(info); - if (wait) - cpu_set(smp_processor_id(), call_data->finished);; -} - -static void __smp_call_function_map(void (*func) (void *info), void *info, - int wait, cpumask_t map) -{ - struct call_data_struct data; - int cpu, local = 0; - - /* - * Can deadlock when interrupts are disabled or if in wrong context. - */ - WARN_ON(irqs_disabled() || in_irq()); - - /* - * Check for local function call. We have to have the same call order - * as in on_each_cpu() because of machine_restart_smp(). - */ - if (cpu_isset(smp_processor_id(), map)) { - local = 1; - cpu_clear(smp_processor_id(), map); - } - - cpus_and(map, map, cpu_online_map); - if (cpus_empty(map)) - goto out; - - data.func = func; - data.info = info; - data.started = CPU_MASK_NONE; - data.wait = wait; - if (wait) - data.finished = CPU_MASK_NONE; - - call_data = &data; - - for_each_cpu_mask(cpu, map) - smp_ext_bitcall(cpu, ec_call_function); - - /* Wait for response */ - while (!cpus_equal(map, data.started)) - cpu_relax(); - if (wait) - while (!cpus_equal(map, data.finished)) - cpu_relax(); -out: - if (local) { - local_irq_disable(); - func(info); - local_irq_enable(); - } -} - -/* - * smp_call_function: - * @func: the function to run; this must be fast and non-blocking - * @info: an arbitrary pointer to pass to the function - * @wait: if true, wait (atomically) until function has completed on other CPUs - * - * Run a function on all other CPUs. - * - * You must not call this function with disabled interrupts, from a - * hardware interrupt handler or from a bottom half. - */ -int smp_call_function(void (*func) (void *info), void *info, int wait) -{ - cpumask_t map; - - spin_lock(&call_lock); - map = cpu_online_map; - cpu_clear(smp_processor_id(), map); - __smp_call_function_map(func, info, wait, map); - spin_unlock(&call_lock); - return 0; -} -EXPORT_SYMBOL(smp_call_function); - -/* - * smp_call_function_single: - * @cpu: the CPU where func should run - * @func: the function to run; this must be fast and non-blocking - * @info: an arbitrary pointer to pass to the function - * @wait: if true, wait (atomically) until function has completed on other CPUs - * - * Run a function on one processor. - * - * You must not call this function with disabled interrupts, from a - * hardware interrupt handler or from a bottom half. 
- */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int wait) -{ - spin_lock(&call_lock); - __smp_call_function_map(func, info, wait, cpumask_of_cpu(cpu)); - spin_unlock(&call_lock); - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); - -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. Must not include the current cpu. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, - int wait) -{ - spin_lock(&call_lock); - cpu_clear(smp_processor_id(), mask); - __smp_call_function_map(func, info, wait, mask); - spin_unlock(&call_lock); - return 0; -} -EXPORT_SYMBOL(smp_call_function_mask); - void smp_send_stop(void) { int cpu, rc; @@@ -121,10 -265,7 +115,10 @@@ static void do_ext_call_interrupt(__u1 bits = xchg(&S390_lowcore.ext_call_fast, 0); if (test_bit(ec_call_function, &bits)) - do_call_function(); + generic_smp_call_function_interrupt(); + + if (test_bit(ec_call_function_single, &bits)) + generic_smp_call_function_single_interrupt(); } /* @@@ -141,19 -282,6 +135,19 @@@ static void smp_ext_bitcall(int cpu, ec udelay(10); } +void arch_send_call_function_ipi(cpumask_t mask) +{ + int cpu; + + for_each_cpu_mask(cpu, mask) + smp_ext_bitcall(cpu, ec_call_function); +} + +void arch_send_call_function_single_ipi(int cpu) +{ + smp_ext_bitcall(cpu, ec_call_function_single); +} + #ifndef CONFIG_64BIT /* * this function sends a 'purge tlb' signal to another CPU. @@@ -254,8 -382,8 +248,8 @@@ static void __init smp_get_save_area(un if (ipl_info.type != IPL_TYPE_FCP_DUMP) return; if (cpu >= NR_CPUS) { - printk(KERN_WARNING "Registers for cpu %i not saved since dump " - "kernel was compiled with NR_CPUS=%i\n", cpu, NR_CPUS); + pr_warning("CPU %i exceeds the maximum %i and is excluded from " + "the dump\n", cpu, NR_CPUS - 1); return; } zfcpdump_save_areas[cpu] = kmalloc(sizeof(union save_area), GFP_KERNEL); @@@ -428,7 -556,7 +422,7 @@@ static void __init smp_detect_cpus(void } out: kfree(info); - printk(KERN_INFO "CPUs: %d configured, %d standby\n", c_cpus, s_cpus); + pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); get_online_cpus(); __smp_rescan_cpus(); put_online_cpus(); @@@ -444,17 -572,19 +438,17 @@@ int __cpuinit start_secondary(void *cpu preempt_disable(); /* Enable TOD clock interrupts on the secondary cpu. */ init_cpu_timer(); -#ifdef CONFIG_VIRT_TIMER /* Enable cpu timer interrupts on the secondary cpu. */ init_cpu_vtimer(); -#endif /* Enable pfault pseudo page faults on this cpu. 
*/ pfault_init(); /* call cpu notifiers */ notify_cpu_starting(smp_processor_id()); /* Mark this cpu as online */ - spin_lock(&call_lock); + ipi_call_lock(); cpu_set(smp_processor_id(), cpu_online_map); - spin_unlock(&call_lock); + ipi_call_unlock(); /* Switch on interrupts */ local_irq_enable(); /* Print info about this processor */ @@@ -503,15 -633,18 +497,15 @@@ static int __cpuinit smp_alloc_lowcore( save_area = get_zeroed_page(GFP_KERNEL); if (!save_area) - goto out_save_area; + goto out; lowcore->extended_save_area_addr = (u32) save_area; } #endif lowcore_ptr[cpu] = lowcore; return 0; -#ifndef CONFIG_64BIT -out_save_area: - free_page(panic_stack); -#endif out: + free_page(panic_stack); free_pages(async_stack, ASYNC_ORDER); free_pages((unsigned long) lowcore, lc_order); return -ENOMEM; @@@ -551,8 -684,12 +545,8 @@@ int __cpuinit __cpu_up(unsigned int cpu ccode = signal_processor_p((__u32)(unsigned long)(lowcore_ptr[cpu]), cpu, sigp_set_prefix); - if (ccode) { - printk("sigp_set_prefix failed for cpu %d " - "with condition code %d\n", - (int) cpu, (int) ccode); + if (ccode) return -EIO; - } idle = current_set[cpu]; cpu_lowcore = lowcore_ptr[cpu]; @@@ -635,7 -772,7 +629,7 @@@ void __cpu_die(unsigned int cpu while (!smp_cpu_not_running(cpu)) cpu_relax(); smp_free_lowcore(cpu); - printk(KERN_INFO "Processor %d spun down\n", cpu); + pr_info("Processor %d stopped\n", cpu); } void cpu_die(void) diff --combined arch/s390/kernel/time.c index 5be981a36c3,f5bd141c844..d649600df5b --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@@ -12,9 -12,6 +12,9 @@@ * Copyright (C) 1991, 1992, 1995 Linus Torvalds */ +#define KMSG_COMPONENT "time" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include #include #include @@@ -23,8 -20,6 +23,8 @@@ #include #include #include +#include +#include #include #include #include @@@ -41,7 -36,6 +41,7 @@@ #include #include #include +#include #include #include #include @@@ -160,7 -154,7 +160,7 @@@ void init_cpu_timer(void cd->min_delta_ns = 1; cd->max_delta_ns = LONG_MAX; cd->rating = 400; - cd->cpumask = cpumask_of_cpu(cpu); + cd->cpumask = cpumask_of(cpu); cd->set_next_event = s390_next_event; cd->set_mode = s390_set_mode; @@@ -229,36 -223,6 +229,36 @@@ static struct clocksource clocksource_t }; +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +{ + if (clock != &clocksource_tod) + return; + + /* Make userspace gettimeofday spin until we're done. */ + ++vdso_data->tb_update_count; + smp_wmb(); + vdso_data->xtime_tod_stamp = clock->cycle_last; + vdso_data->xtime_clock_sec = xtime.tv_sec; + vdso_data->xtime_clock_nsec = xtime.tv_nsec; + vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; + smp_wmb(); + ++vdso_data->tb_update_count; +} + +extern struct timezone sys_tz; + +void update_vsyscall_tz(void) +{ + /* Make userspace gettimeofday spin until we're done. */ + ++vdso_data->tb_update_count; + smp_wmb(); + vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; + vdso_data->tz_dsttime = sys_tz.tz_dsttime; + smp_wmb(); + ++vdso_data->tb_update_count; +} + /* * Initialize the TOD clock and the CPU timer of * the boot cpu. @@@ -289,8 -253,10 +289,8 @@@ void __init time_init(void /* Enable TOD clock interrupts on the boot cpu. */ init_cpu_timer(); - -#ifdef CONFIG_VIRT_TIMER + /* Enable cpu timer interrupts on the boot cpu. 
*/ vtime_init(); -#endif } /* @@@ -322,8 -288,8 +322,8 @@@ static unsigned long long adjust_time(u } sched_clock_base_cc += delta; if (adjust.offset != 0) { - printk(KERN_NOTICE "etr: time adjusted by %li micro-seconds\n", - adjust.offset); + pr_notice("The ETR interface has adjusted the clock " + "by %li microseconds\n", adjust.offset); adjust.modes = ADJ_OFFSET_SINGLESHOT; do_adjtimex(&adjust); } @@@ -394,15 -360,6 +394,15 @@@ static void enable_sync_clock(void atomic_set_mask(0x80000000, sw_ptr); } +/* Single threaded workqueue used for etr and stp sync events */ +static struct workqueue_struct *time_sync_wq; + +static void __init time_init_wq(void) +{ + if (!time_sync_wq) + time_sync_wq = create_singlethread_workqueue("timesync"); +} + /* * External Time Reference (ETR) code. */ @@@ -468,7 -425,6 +468,7 @@@ static struct timer_list etr_timer static void etr_timeout(unsigned long dummy); static void etr_work_fn(struct work_struct *work); +static DEFINE_MUTEX(etr_work_mutex); static DECLARE_WORK(etr_work, etr_work_fn); /* @@@ -484,8 -440,8 +484,8 @@@ static void etr_reset(void etr_tolec = get_clock(); set_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags); } else if (etr_port0_online || etr_port1_online) { - printk(KERN_WARNING "Running on non ETR capable " - "machine, only local mode available.\n"); + pr_warning("The real or virtual hardware system does " + "not provide an ETR interface\n"); etr_port0_online = etr_port1_online = 0; } } @@@ -496,18 -452,17 +496,18 @@@ static int __init etr_init(void if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags)) return 0; + time_init_wq(); /* Check if this machine has the steai instruction. */ if (etr_steai(&aib, ETR_STEAI_STEPPING_PORT) == 0) etr_steai_available = 1; setup_timer(&etr_timer, etr_timeout, 0UL); if (etr_port0_online) { set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } if (etr_port1_online) { set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } return 0; } @@@ -534,7 -489,7 +534,7 @@@ void etr_switch_to_local(void if (test_bit(CLOCK_SYNC_ETR, &clock_sync_flags)) disable_sync_clock(NULL); set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } /* @@@ -550,7 -505,7 +550,7 @@@ void etr_sync_check(void if (test_bit(CLOCK_SYNC_ETR, &clock_sync_flags)) disable_sync_clock(NULL); set_bit(ETR_EVENT_SYNC_CHECK, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } /* @@@ -574,13 -529,13 +574,13 @@@ static void etr_timing_alert(struct etr * Both ports are not up-to-date now. */ set_bit(ETR_EVENT_PORT_ALERT, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } static void etr_timeout(unsigned long dummy) { set_bit(ETR_EVENT_UPDATE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } /* @@@ -687,16 -642,14 +687,16 @@@ static int etr_aib_follows(struct etr_a } struct clock_sync_data { + atomic_t cpus; int in_sync; unsigned long long fixup_cc; + int etr_port; + struct etr_aib *etr_aib; }; -static void clock_sync_cpu_start(void *dummy) +static void clock_sync_cpu(struct clock_sync_data *sync) { - struct clock_sync_data *sync = dummy; - + atomic_dec(&sync->cpus); enable_sync_clock(); /* * This looks like a busy wait loop but it isn't. 
etr_sync_cpus @@@ -722,35 -675,39 +722,35 @@@ fixup_clock_comparator(sync->fixup_cc); } -static void clock_sync_cpu_end(void *dummy) -{ -} - /* * Sync the TOD clock using the port refered to by aibp. This port * has to be enabled and the other port has to be disabled. The * last eacr update has to be more than 1.6 seconds in the past. */ -static int etr_sync_clock(struct etr_aib *aib, int port) +static int etr_sync_clock(void *data) { - struct etr_aib *sync_port; - struct clock_sync_data etr_sync; + static int first; unsigned long long clock, old_clock, delay, delta; - int follows; + struct clock_sync_data *etr_sync; + struct etr_aib *sync_port, *aib; + int port; int rc; - /* Check if the current aib is adjacent to the sync port aib. */ - sync_port = (port == 0) ? &etr_port0 : &etr_port1; - follows = etr_aib_follows(sync_port, aib, port); - memcpy(sync_port, aib, sizeof(*aib)); - if (!follows) - return -EAGAIN; + etr_sync = data; - /* - * Catch all other cpus and make them wait until we have - * successfully synced the clock. smp_call_function will - * return after all other cpus are in etr_sync_cpu_start. - */ - memset(&etr_sync, 0, sizeof(etr_sync)); - preempt_disable(); - smp_call_function(clock_sync_cpu_start, &etr_sync, 0); - local_irq_disable(); + if (xchg(&first, 1) == 1) { + /* Slave */ + clock_sync_cpu(etr_sync); + return 0; + } + + /* Wait until all other cpus entered the sync function. */ + while (atomic_read(&etr_sync->cpus) != 0) + cpu_relax(); + + port = etr_sync->etr_port; + aib = etr_sync->etr_aib; + sync_port = (port == 0) ? &etr_port0 : &etr_port1; enable_sync_clock(); /* Set clock to next OTE. */ @@@ -767,16 -724,16 +767,16 @@@ delay = (unsigned long long) (aib->edf2.etv - sync_port->edf2.etv) << 32; delta = adjust_time(old_clock, clock, delay); - etr_sync.fixup_cc = delta; + etr_sync->fixup_cc = delta; fixup_clock_comparator(delta); /* Verify that the clock is properly set. */ if (!etr_aib_follows(sync_port, aib, port)) { /* Didn't work. */ disable_sync_clock(NULL); - etr_sync.in_sync = -EAGAIN; + etr_sync->in_sync = -EAGAIN; rc = -EAGAIN; } else { - etr_sync.in_sync = 1; + etr_sync->in_sync = 1; rc = 0; } } else { @@@ -784,33 -741,12 +784,33 @@@ __ctl_clear_bit(0, 29); __ctl_clear_bit(14, 21); disable_sync_clock(NULL); - etr_sync.in_sync = -EAGAIN; + etr_sync->in_sync = -EAGAIN; rc = -EAGAIN; } - local_irq_enable(); - smp_call_function(clock_sync_cpu_end, NULL, 0); - preempt_enable(); + xchg(&first, 0); + return rc; +} + +static int etr_sync_clock_stop(struct etr_aib *aib, int port) +{ + struct clock_sync_data etr_sync; + struct etr_aib *sync_port; + int follows; + int rc; + + /* Check if the current aib is adjacent to the sync port aib. */ + sync_port = (port == 0) ? &etr_port0 : &etr_port1; + follows = etr_aib_follows(sync_port, aib, port); + memcpy(sync_port, aib, sizeof(*aib)); + if (!follows) + return -EAGAIN; + memset(&etr_sync, 0, sizeof(etr_sync)); + etr_sync.etr_aib = aib; + etr_sync.etr_port = port; + get_online_cpus(); + atomic_set(&etr_sync.cpus, num_online_cpus() - 1); + rc = stop_machine(etr_sync_clock, &etr_sync, &cpu_online_map); + put_online_cpus(); return rc; } @@@ -967,7 -903,7 +967,7 @@@ static void etr_update_eacr(struct etr_ } /* - * ETR tasklet. In this function you'll find the main logic. In + * ETR work. In this function you'll find the main logic. In * particular this is the only function that calls etr_update_eacr(), * it "controls" the etr control register. 
*/ @@@ -978,9 -914,6 +978,9 @@@ static void etr_work_fn(struct work_str struct etr_aib aib; int sync_port; + /* prevent multiple execution. */ + mutex_lock(&etr_work_mutex); + /* Create working copy of etr_eacr. */ eacr = etr_eacr; @@@ -996,7 -929,7 +996,7 @@@ del_timer_sync(&etr_timer); etr_update_eacr(eacr); clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags); - return; + goto out_unlock; } /* Store aib to get the current ETR status word. */ @@@ -1083,7 -1016,7 +1083,7 @@@ eacr.es || sync_port < 0) { etr_update_eacr(eacr); etr_set_tolec_timeout(now); - return; + goto out_unlock; } /* @@@ -1103,7 -1036,7 +1103,7 @@@ etr_update_eacr(eacr); set_bit(CLOCK_SYNC_ETR, &clock_sync_flags); if (now < etr_tolec + (1600000 << 12) || - etr_sync_clock(&aib, sync_port) != 0) { + etr_sync_clock_stop(&aib, sync_port) != 0) { /* Sync failed. Try again in 1/2 second. */ eacr.es = 0; etr_update_eacr(eacr); @@@ -1111,8 -1044,6 +1111,8 @@@ etr_set_sync_timeout(); } else etr_set_tolec_timeout(now); +out_unlock: + mutex_unlock(&etr_work_mutex); } /* @@@ -1194,13 -1125,13 +1194,13 @@@ static ssize_t etr_online_store(struct return count; /* Nothing to do. */ etr_port0_online = value; set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } else { if (etr_port1_online == value) return count; /* Nothing to do. */ etr_port1_online = value; set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events); - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } return count; } @@@ -1401,7 -1332,6 +1401,7 @@@ static struct stp_sstpi stp_info static void *stp_page; static void stp_work_fn(struct work_struct *work); +static DEFINE_MUTEX(stp_work_mutex); static DECLARE_WORK(stp_work, stp_work_fn); static int __init early_parse_stp(char *p) @@@ -1426,8 -1356,7 +1426,8 @@@ static void __init stp_reset(void if (rc == 0) set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags); else if (stp_online) { - printk(KERN_WARNING "Running on non STP capable machine.\n"); + pr_warning("The real or virtual hardware system does " + "not provide an STP interface\n"); free_bootmem((unsigned long) stp_page, PAGE_SIZE); stp_page = NULL; stp_online = 0; @@@ -1436,12 -1365,8 +1436,12 @@@ static int __init stp_init(void) { - if (test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags) && stp_online) - schedule_work(&stp_work); + if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) + return 0; + time_init_wq(); + if (!stp_online) + return 0; + queue_work(time_sync_wq, &stp_work); return 0; } @@@ -1458,7 -1383,7 +1458,7 @@@ arch_initcall(stp_init) static void stp_timing_alert(struct stp_irq_parm *intparm) { if (intparm->tsc || intparm->lac || intparm->tcpc) - schedule_work(&stp_work); + queue_work(time_sync_wq, &stp_work); } /* @@@ -1472,7 -1397,7 +1472,7 @@@ void stp_sync_check(void if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags)) return; disable_sync_clock(NULL); - schedule_work(&stp_work); + queue_work(time_sync_wq, &stp_work); } /* @@@ -1486,34 -1411,46 +1486,34 @@@ void stp_island_check(void if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags)) return; disable_sync_clock(NULL); - schedule_work(&stp_work); + queue_work(time_sync_wq, &stp_work); } -/* - * STP tasklet. Check for the STP state and take over the clock - * synchronization if the STP clock source is usable. 
- */ -static void stp_work_fn(struct work_struct *work) + +static int stp_sync_clock(void *data) { - struct clock_sync_data stp_sync; + static int first; unsigned long long old_clock, delta; + struct clock_sync_data *stp_sync; int rc; - if (!stp_online) { - chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000); - return; - } + stp_sync = data; - rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0); - if (rc) - return; + if (xchg(&first, 1) == 1) { + /* Slave */ + clock_sync_cpu(stp_sync); + return 0; + } - rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); - if (rc || stp_info.c == 0) - return; + /* Wait until all other cpus entered the sync function. */ + while (atomic_read(&stp_sync->cpus) != 0) + cpu_relax(); - /* - * Catch all other cpus and make them wait until we have - * successfully synced the clock. smp_call_function will - * return after all other cpus are in clock_sync_cpu_start. - */ - memset(&stp_sync, 0, sizeof(stp_sync)); - preempt_disable(); - smp_call_function(clock_sync_cpu_start, &stp_sync, 0); - local_irq_disable(); enable_sync_clock(); set_bit(CLOCK_SYNC_STP, &clock_sync_flags); if (test_and_clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags)) - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); rc = 0; if (stp_info.todoff[0] || stp_info.todoff[1] || @@@ -1532,49 -1469,16 +1532,49 @@@ } if (rc) { disable_sync_clock(NULL); - stp_sync.in_sync = -EAGAIN; + stp_sync->in_sync = -EAGAIN; clear_bit(CLOCK_SYNC_STP, &clock_sync_flags); if (etr_port0_online || etr_port1_online) - schedule_work(&etr_work); + queue_work(time_sync_wq, &etr_work); } else - stp_sync.in_sync = 1; + stp_sync->in_sync = 1; + xchg(&first, 0); + return 0; +} + +/* + * STP work. Check for the STP state and take over the clock + * synchronization if the STP clock source is usable. + */ +static void stp_work_fn(struct work_struct *work) +{ + struct clock_sync_data stp_sync; + int rc; + + /* prevent multiple execution. 
*/ + mutex_lock(&stp_work_mutex); + + if (!stp_online) { + chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000); + goto out_unlock; + } + + rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0); + if (rc) + goto out_unlock; + + rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); + if (rc || stp_info.c == 0) + goto out_unlock; + + memset(&stp_sync, 0, sizeof(stp_sync)); + get_online_cpus(); + atomic_set(&stp_sync.cpus, num_online_cpus() - 1); + stop_machine(stp_sync_clock, &stp_sync, &cpu_online_map); + put_online_cpus(); - local_irq_enable(); - smp_call_function(clock_sync_cpu_end, NULL, 0); - preempt_enable(); +out_unlock: + mutex_unlock(&stp_work_mutex); } /* @@@ -1683,7 -1587,7 +1683,7 @@@ static ssize_t stp_online_store(struct if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags)) return -EOPNOTSUPP; stp_online = value; - schedule_work(&stp_work); + queue_work(time_sync_wq, &stp_work); return count; } diff --combined arch/sparc/kernel/irq_64.c index a3ea2bcb95d,4aaf18e83c8..cab8e028687 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@@ -312,7 -312,8 +312,8 @@@ static void sun4u_irq_enable(unsigned i } } - static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask) + static void sun4u_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { sun4u_irq_enable(virt_irq); } @@@ -362,7 -363,8 +363,8 @@@ static void sun4v_irq_enable(unsigned i ino, err); } - static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask) + static void sun4v_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { unsigned int ino = virt_irq_table[virt_irq].dev_ino; unsigned long cpuid = irq_choose_cpu(virt_irq); @@@ -429,7 -431,8 +431,8 @@@ static void sun4v_virq_enable(unsigned dev_handle, dev_ino, err); } - static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask) + static void sun4v_virt_set_affinity(unsigned int virt_irq, + const struct cpumask *mask) { unsigned long cpuid, dev_handle, dev_ino; int err; @@@ -775,69 -778,6 +778,69 @@@ void do_softirq(void local_irq_restore(flags); } +static void unhandled_perf_irq(struct pt_regs *regs) +{ + unsigned long pcr, pic; + + read_pcr(pcr); + read_pic(pic); + + write_pcr(0); + + printk(KERN_EMERG "CPU %d: Got unexpected perf counter IRQ.\n", + smp_processor_id()); + printk(KERN_EMERG "CPU %d: PCR[%016lx] PIC[%016lx]\n", + smp_processor_id(), pcr, pic); +} + +/* Almost a direct copy of the powerpc PMC code. */ +static DEFINE_SPINLOCK(perf_irq_lock); +static void *perf_irq_owner_caller; /* mostly for debugging */ +static void (*perf_irq)(struct pt_regs *regs) = unhandled_perf_irq; + +/* Invoked from level 15 PIL handler in trap table. 
*/ +void perfctr_irq(int irq, struct pt_regs *regs) +{ + clear_softint(1 << irq); + perf_irq(regs); +} + +int register_perfctr_intr(void (*handler)(struct pt_regs *)) +{ + int ret; + + if (!handler) + return -EINVAL; + + spin_lock(&perf_irq_lock); + if (perf_irq != unhandled_perf_irq) { + printk(KERN_WARNING "register_perfctr_intr: " + "perf IRQ busy (reserved by caller %p)\n", + perf_irq_owner_caller); + ret = -EBUSY; + goto out; + } + + perf_irq_owner_caller = __builtin_return_address(0); + perf_irq = handler; + + ret = 0; +out: + spin_unlock(&perf_irq_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_perfctr_intr); + +void release_perfctr_intr(void (*handler)(struct pt_regs *)) +{ + spin_lock(&perf_irq_lock); + perf_irq_owner_caller = NULL; + perf_irq = unhandled_perf_irq; + spin_unlock(&perf_irq_lock); +} +EXPORT_SYMBOL_GPL(release_perfctr_intr); + #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) { @@@ -851,7 -791,7 +854,7 @@@ !(irq_desc[irq].status & IRQ_PER_CPU)) { if (irq_desc[irq].chip->set_affinity) irq_desc[irq].chip->set_affinity(irq, - irq_desc[irq].affinity); + &irq_desc[irq].affinity); } spin_unlock_irqrestore(&irq_desc[irq].lock, flags); } diff --combined arch/sparc/kernel/of_device_64.c index 46e231f7c5c,df2efb7fc14..322046cdf85 --- a/arch/sparc/kernel/of_device_64.c +++ b/arch/sparc/kernel/of_device_64.c @@@ -780,7 -780,7 +780,7 @@@ out if (nid != -1) { cpumask_t numa_mask = node_to_cpumask(nid); - irq_set_affinity(irq, numa_mask); + irq_set_affinity(irq, &numa_mask); } return irq; @@@ -811,20 -811,20 +811,20 @@@ static struct of_device * __init scan_o irq = of_get_property(dp, "interrupts", &len); if (irq) { - memcpy(op->irqs, irq, len); op->num_irqs = len / 4; + + /* Prevent overrunning the op->irqs[] array. */ + if (op->num_irqs > PROMINTR_MAX) { + printk(KERN_WARNING "%s: Too many irqs (%d), " + "limiting to %d.\n", + dp->full_name, op->num_irqs, PROMINTR_MAX); + op->num_irqs = PROMINTR_MAX; + } + memcpy(op->irqs, irq, op->num_irqs * 4); } else { op->num_irqs = 0; } - /* Prevent overrunning the op->irqs[] array. 
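register_perfctr_intr() above makes the sparc64 level-15 performance-counter vector available to exactly one client at a time: the slot is protected by perf_irq_lock, a second caller gets -EBUSY, and __builtin_return_address(0) is remembered only so the "busy" warning can name the current owner. A client (the profiling driver sketched here is hypothetical) would use the pair roughly like this:

static void my_pmu_overflow(struct pt_regs *regs)
{
        /* read and re-arm the counters, hand a sample to the profiler */
}

static int my_pmu_start(void)
{
        int err = register_perfctr_intr(my_pmu_overflow);

        if (err)                /* -EBUSY: somebody else owns the vector */
                return err;

        /* from here on, perfctr_irq() will call my_pmu_overflow() */
        return 0;
}

static void my_pmu_stop(void)
{
        release_perfctr_intr(my_pmu_overflow);
}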
*/ - if (op->num_irqs > PROMINTR_MAX) { - printk(KERN_WARNING "%s: Too many irqs (%d), " - "limiting to %d.\n", - dp->full_name, op->num_irqs, PROMINTR_MAX); - op->num_irqs = PROMINTR_MAX; - } - build_device_resources(op, parent); for (i = 0; i < op->num_irqs; i++) op->irqs[i] = build_one_device_irq(op, parent, op->irqs[i]); diff --combined arch/sparc/kernel/pci_msi.c index 2e680f34f72,0d0cd815e83..0d0cd815e83 --- a/arch/sparc/kernel/pci_msi.c +++ b/arch/sparc/kernel/pci_msi.c @@@ -288,7 -288,7 +288,7 @@@ static int bringup_one_msi_queue(struc if (nid != -1) { cpumask_t numa_mask = node_to_cpumask(nid); - irq_set_affinity(irq, numa_mask); + irq_set_affinity(irq, &numa_mask); } err = request_irq(irq, sparc64_msiq_interrupt, 0, "MSIQ", diff --combined arch/sparc/kernel/smp_32.c index e396c1f17a9,1e5ac4e282e..1e5ac4e282e --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@@ -39,8 -39,6 +39,6 @@@ volatile unsigned long cpu_callin_map[N unsigned char boot_cpu_id = 0; unsigned char boot_cpu_id4 = 0; /* boot_cpu_id << 2 */ - cpumask_t cpu_online_map = CPU_MASK_NONE; - cpumask_t phys_cpu_present_map = CPU_MASK_NONE; cpumask_t smp_commenced_mask = CPU_MASK_NONE; /* The only guaranteed locking primitive available on all Sparc @@@ -334,7 -332,7 +332,7 @@@ void __init smp_setup_cpu_possible_map( instance = 0; while (!cpu_find_by_instance(instance, NULL, &mid)) { if (mid < NR_CPUS) { - cpu_set(mid, phys_cpu_present_map); + cpu_set(mid, cpu_possible_map); cpu_set(mid, cpu_present_map); } instance++; @@@ -354,7 -352,7 +352,7 @@@ void __init smp_prepare_boot_cpu(void current_thread_info()->cpu = cpuid; cpu_set(cpuid, cpu_online_map); - cpu_set(cpuid, phys_cpu_present_map); + cpu_set(cpuid, cpu_possible_map); } int __cpuinit __cpu_up(unsigned int cpu) diff --combined arch/sparc/kernel/smp_64.c index bfe99d82d45,a97b8822c22..46329799f34 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@@ -49,14 -49,10 +49,10 @@@ int sparc64_multi_core __read_mostly; - cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE; - cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE; DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE; cpumask_t cpu_core_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = CPU_MASK_NONE }; - EXPORT_SYMBOL(cpu_possible_map); - EXPORT_SYMBOL(cpu_online_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_SYMBOL(cpu_core_map); @@@ -163,7 -159,7 +159,7 @@@ static inline long get_delta (long *rt for (i = 0; i < NUM_ITERS; i++) { t0 = tick_ops->get_tick(); go[MASTER] = 1; - membar_storeload(); + membar_safe("#StoreLoad"); while (!(tm = go[SLAVE])) rmb(); go[SLAVE] = 0; @@@ -257,7 -253,7 +253,7 @@@ static void smp_synchronize_one_tick(in /* now let the client proceed into his loop */ go[MASTER] = 0; - membar_storeload(); + membar_safe("#StoreLoad"); spin_lock_irqsave(&itc_sync_lock, flags); { @@@ -267,7 -263,7 +263,7 @@@ go[MASTER] = 0; wmb(); go[SLAVE] = tick_ops->get_tick(); - membar_storeload(); + membar_safe("#StoreLoad"); } } spin_unlock_irqrestore(&itc_sync_lock, flags); @@@ -773,7 -769,7 +769,7 @@@ static void xcall_deliver(u64 data0, u6 /* Setup the initial cpu list. 
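Both of_device_64.c and pci_msi.c above keep steering a freshly built interrupt toward the CPUs of the device's home NUMA node; the only change is that irq_set_affinity() now wants the address of the mask rather than a copy of it. Pulled out into a helper (the wrapper name is made up, node_to_cpumask() and irq_set_affinity() are used exactly as above):

static void bind_irq_to_node(unsigned int irq, int nid)
{
        if (nid != -1) {
                cpumask_t numa_mask = node_to_cpumask(nid);

                /* new calling convention: pass a pointer, not the mask */
                irq_set_affinity(irq, &numa_mask);
        }
}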
*/ cnt = 0; - for_each_cpu_mask_nr(i, *mask) { + for_each_cpu(i, mask) { if (i == this_cpu || !cpu_online(i)) continue; cpu_list[cnt++] = i; @@@ -1122,6 -1118,7 +1118,6 @@@ void smp_capture(void smp_processor_id()); #endif penguins_are_doing_time = 1; - membar_storestore_loadstore(); atomic_inc(&smp_capture_registry); smp_cross_call(&xcall_capture, 0, 0, 0); while (atomic_read(&smp_capture_registry) != ncpus) @@@ -1141,13 -1138,13 +1137,13 @@@ void smp_release(void smp_processor_id()); #endif penguins_are_doing_time = 0; - membar_storeload_storestore(); + membar_safe("#StoreLoad"); atomic_dec(&smp_capture_registry); } } -/* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they - * can service tlb flush xcalls... +/* Imprisoned penguins run with %pil == PIL_NORMAL_MAX, but PSTATE_IE + * set, so they can service tlb flush xcalls... */ extern void prom_world(int); @@@ -1160,7 -1157,7 +1156,7 @@@ void smp_penguin_jailcell(int irq, stru __asm__ __volatile__("flushw"); prom_world(1); atomic_inc(&smp_capture_registry); - membar_storeload_storestore(); + membar_safe("#StoreLoad"); while (penguins_are_doing_time) rmb(); atomic_dec(&smp_capture_registry); diff --combined arch/sparc/kernel/sparc_ksyms_32.c index a4d45fc29b2,32d11a5fe3a..e1e97639231 --- a/arch/sparc/kernel/sparc_ksyms_32.c +++ b/arch/sparc/kernel/sparc_ksyms_32.c @@@ -61,6 -61,7 +61,6 @@@ extern void (*bzero_1page)(void *) extern void *__bzero(void *, size_t); extern void *__memscan_zero(void *, size_t); extern void *__memscan_generic(void *, int, size_t); -extern int __memcmp(const void *, const void *, __kernel_size_t); extern int __strncmp(const char *, const char *, __kernel_size_t); extern int __ashrdi3(int, int); @@@ -112,17 -113,15 +112,13 @@@ EXPORT_PER_CPU_SYMBOL(__cpu_data) #ifdef CONFIG_SMP /* IRQ implementation. */ EXPORT_SYMBOL(synchronize_irq); - - /* CPU online map and active count. 
*/ - EXPORT_SYMBOL(cpu_online_map); - EXPORT_SYMBOL(phys_cpu_present_map); #endif EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(rtc_lock); -#ifdef CONFIG_SUN_AUXIO EXPORT_SYMBOL(set_auxio); EXPORT_SYMBOL(get_auxio); -#endif EXPORT_SYMBOL(io_remap_pfn_range); #ifndef CONFIG_SMP @@@ -210,6 -209,7 +206,6 @@@ EXPORT_SYMBOL(bzero_1page) EXPORT_SYMBOL(__bzero); EXPORT_SYMBOL(__memscan_zero); EXPORT_SYMBOL(__memscan_generic); -EXPORT_SYMBOL(__memcmp); EXPORT_SYMBOL(__strncmp); EXPORT_SYMBOL(__memmove); diff --combined arch/sparc/kernel/time_64.c index 141da375909,9df8f095a8b..9df8f095a8b --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@@ -763,7 -763,7 +763,7 @@@ void __devinit setup_sparc64_timer(void sevt = &__get_cpu_var(sparc64_events); memcpy(sevt, &sparc64_clockevent, sizeof(*sevt)); - sevt->cpumask = cpumask_of_cpu(smp_processor_id()); + sevt->cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(sevt); } diff --combined arch/x86/Kconfig index 0f44add3e0b,0ca2eb7573c..249d1e0824b --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -19,8 -19,6 +19,8 @@@ config X86_6 config X86 def_bool y select HAVE_AOUT if X86_32 + select HAVE_READQ + select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE @@@ -92,10 -90,6 +92,10 @@@ config GENERIC_IOMA config GENERIC_BUG def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + +config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT def_bool y @@@ -250,19 -244,16 +250,19 @@@ config X86_HAS_BOOT_CPU_I config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ - default y help - This enables support for sparse irq, esp for msi/msi-x. You may need - if you have lots of cards supports msi-x installed. + This enables support for sparse irqs. This is useful for distro + kernels that want to define a high CONFIG_NR_CPUS value but still + want to have low kernel memory footprint on smaller machines. - If you don't know what to do here, say Y. + ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread + out the irq_desc[] array in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. config NUMA_MIGRATE_IRQ_DESC bool "Move irq desc when changing irq smp_affinity" - depends on SPARSE_IRQ && SMP + depends on SPARSE_IRQ && NUMA default n help This enables moving irq_desc to cpu/node that irq will use handled. @@@ -273,13 -264,21 +273,13 @@@ config X86_FIND_SMP_CONFI def_bool y depends on X86_MPPARSE || X86_VOYAGER -if ACPI config X86_MPPARSE - def_bool y - bool "Enable MPS table" + bool "Enable MPS table" if ACPI + default y depends on X86_LOCAL_APIC help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it -endif - -if !ACPI -config X86_MPPARSE - def_bool y - depends on X86_LOCAL_APIC -endif choice prompt "Subarchitecture Type" @@@ -501,7 -500,7 +501,7 @@@ config HPET_TIME The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, as it is off-chip. You can find the HPET spec at - . + . You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. @@@ -588,7 -587,7 +588,7 @@@ config AMD_IOMM # need this always selected by IOMMU for the VIA workaround config SWIOTLB - bool + def_bool y if X86_64 help Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. 
the current generation @@@ -601,19 -600,20 +601,20 @@@ config IOMMU_HELPE config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && BROKEN + depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + select CPUMASK_OFFSTACK default n help Configure maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. config NR_CPUS - int "Maximum number of CPUs (2-512)" if !MAXSMP - range 2 512 - depends on SMP + int "Maximum number of CPUs" if SMP && !MAXSMP + range 2 512 if SMP && !MAXSMP + default "1" if !SMP default "4096" if MAXSMP - default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 - default "8" + default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "8" if SMP help This allows you to specify the maximum number of CPUs which this kernel will support. The maximum supported value is 512 and the @@@ -679,30 -679,6 +680,30 @@@ config X86_VISWS_API def_bool y depends on X86_32 && X86_VISWS +config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC + help + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of + superfluous "boot interrupts" cannot be disabled. + + Some chipsets generate a legacy INTx "boot IRQ" when the IRQ + entry in the chipset's IO-APIC is masked (as, e.g. the RT + kernel does during interrupt handling). On chipsets where this + boot IRQ generation cannot be disabled, this workaround keeps + the original IRQ line masked so that only the equivalent "boot + IRQ" is delivered to the CPUs. The workaround also tells the + kernel to set up the IRQ handler on the boot IRQ line. In this + way only one interrupt is delivered to the kernel. Otherwise + the spurious second interrupt may cause the kernel to bring + down (vital) interrupt lines. + + Only affects "broken" chipsets. Interrupt sharing may be + increased on these systems. + config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER @@@ -999,37 -975,24 +1000,37 @@@ config X86_PA config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 + help + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". + # Common NUMA Features config NUMA - bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help Enable NUMA (Non Uniform Memory Access) support. + The kernel will try to allocate memory used by a CPU on the local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 32-bit this is currently highly experimental and should be only - used for kernel development. It might also cause boot failures. - For 64-bit this is recommended on all multiprocessor Opteron systems. - If the system is EM64T, you should say N unless your system is - EM64T NUMA. + For 64-bit this is recommended if the system is Intel Core i7 + (or later), AMD Opteron, or EM64T NUMA. 
+ + For 32-bit this is only needed on (rare) 32-bit-only platforms + that support NUMA topologies, such as NUMAQ / Summit, or if you + boot a 32-bit kernel on a 64-bit NUMA platform. + + Otherwise, you should say N. comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) @@@ -1549,10 -1512,6 +1550,10 @@@ config ARCH_ENABLE_MEMORY_HOTPLU def_bool y depends on X86_64 || (X86_32 && HIGHMEM) +config ARCH_ENABLE_MEMORY_HOTREMOVE + def_bool y + depends on MEMORY_HOTPLUG + config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA diff --combined arch/x86/include/asm/irq.h index 28e409fc73f,4bb732e45a8..592688ed04d --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@@ -31,9 -31,13 +31,9 @@@ static inline int irq_canonicalize(int # endif #endif -#ifdef CONFIG_IRQBALANCE -extern int irqbalance_disable(char *str); -#endif - #ifdef CONFIG_HOTPLUG_CPU #include - extern void fixup_irqs(cpumask_t map); + extern void fixup_irqs(void); #endif extern unsigned int do_IRQ(struct pt_regs *regs); @@@ -42,5 -46,6 +42,6 @@@ extern void native_init_IRQ(void) /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); + extern int vector_used_by_percpu_irq(unsigned int vector); #endif /* _ASM_X86_IRQ_H */ diff --combined arch/x86/kernel/apic.c index b5229affb95,b9019271af6..6b7f824db16 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@@ -30,7 -30,6 +30,7 @@@ #include #include #include +#include #include #include @@@ -119,8 -118,6 +119,6 @@@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_o int first_system_vector = 0xfe; - char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - /* * Debug level, exported for io_apic.c */ @@@ -142,7 -139,7 +140,7 @@@ static int lapic_next_event(unsigned lo struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); - static void lapic_timer_broadcast(cpumask_t mask); + static void lapic_timer_broadcast(const cpumask_t *mask); static void apic_pm_activate(void); /* @@@ -455,7 -452,7 +453,7 @@@ static void lapic_timer_setup(enum cloc /* * Local APIC timer broadcast function */ - static void lapic_timer_broadcast(cpumask_t mask) + static void lapic_timer_broadcast(const cpumask_t *mask) { #ifdef CONFIG_SMP send_IPI_mask(mask, LOCAL_TIMER_VECTOR); @@@ -471,7 -468,7 +469,7 @@@ static void __cpuinit setup_APIC_timer( struct clock_event_device *levt = &__get_cpu_var(lapic_events); memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); + levt->cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(levt); } @@@ -778,7 -775,11 +776,7 @@@ static void local_apic_timer_interrupt( /* * the NMI deadlock-detector uses this. */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif + inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); } @@@ -791,7 -792,7 +789,7 @@@ * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs *regs) +void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@@ -805,7 -806,9 +803,7 @@@ * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. 
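The setup_APIC_timer() change above is one instance of a conversion made by every clockevent driver in this merge (the sparc64 timer, HPET and the local APIC timer here): clock_event_device.cpumask is now a const struct cpumask pointer, and a per-CPU tick device simply points it at the static single-CPU mask returned by cpumask_of() instead of storing a whole cpumask_t. The registration boilerplate for such a per-CPU device reduces to (sketch; the template and per-CPU variable names are hypothetical):

static struct clock_event_device my_clockevent_template;       /* filled in elsewhere */
static DEFINE_PER_CPU(struct clock_event_device, my_events);

static void __cpuinit setup_my_percpu_tick(void)
{
        struct clock_event_device *evt = &__get_cpu_var(my_events);

        memcpy(evt, &my_clockevent_template, sizeof(*evt));
        /* no cpumask copy: just point at the constant per-CPU mask */
        evt->cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(evt);
}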
*/ -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@@ -1663,7 -1666,9 +1661,7 @@@ void smp_spurious_interrupt(struct pt_r { u32 v; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@@ -1674,11 -1679,14 +1672,11 @@@ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else + inc_irq_stat(irq_spurious_count); + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif irq_exit(); } @@@ -1689,7 -1697,9 +1687,7 @@@ void smp_error_interrupt(struct pt_reg { u32 v, v1; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@@ -1807,28 -1817,32 +1805,32 @@@ void disconnect_bsp_APIC(int virt_wire_ void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - cpumask_t tmp_map; /* * Validate version */ if (version == 0x0) { pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); + "fixing up to 0x10. (tell your hw vendor)\n", + version); version = 0x10; } apic_version[apicid] = version; - if (num_processors >= NR_CPUS) { - pr_warning("WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); + if (num_processors >= nr_cpu_ids) { + int max = nr_cpu_ids; + int thiscpu = max + disabled_cpus; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i reached." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; return; } num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); + cpu = cpumask_next_zero(-1, cpu_present_mask); physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { @@@ -1878,8 -1892,8 +1880,8 @@@ } #endif - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); } #ifdef CONFIG_X86_64 @@@ -2081,7 -2095,7 +2083,7 @@@ __cpuinit int apic_is_clustered_box(voi bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { /* are we being called early in kernel startup? 
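generic_processor_info() above also swaps the compile-time NR_CPUS bound for the runtime nr_cpu_ids (which possible_cpus= can lower), and replaces the cpus_complement()/first_cpu() dance with cpumask_next_zero() plus the new set_cpu_possible()/set_cpu_present() accessors. Condensed into a stand-alone helper (the function and counter names are made up, the calls are the ones used above):

static int claimed_cpus;

static int claim_next_cpu(void)
{
        int cpu;

        /* runtime limit instead of the compile-time NR_CPUS constant */
        if (claimed_cpus >= nr_cpu_ids)
                return -ENOSPC;

        /* lowest logical id that is not marked present yet */
        cpu = cpumask_next_zero(-1, cpu_present_mask);

        set_cpu_possible(cpu, true);
        set_cpu_present(cpu, true);
        claimed_cpus++;
        return cpu;
}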
*/ if (bios_cpu_apicid) { id = bios_cpu_apicid[i]; diff --combined arch/x86/kernel/cpu/intel_cacheinfo.c index 68b5d8681cb,fb7f946cb65..c6ecda64f5f --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@@ -534,31 -534,16 +534,16 @@@ static void __cpuinit free_cache_attrib per_cpu(cpuid4_info, cpu) = NULL; } - static int __cpuinit detect_cache_attributes(unsigned int cpu) + static void get_cpu_leaves(void *_retval) { - struct _cpuid4_info *this_leaf; - unsigned long j; - int retval; - cpumask_t oldmask; - - if (num_cache_leaves == 0) - return -ENOENT; - - per_cpu(cpuid4_info, cpu) = kzalloc( - sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) - return -ENOMEM; - - oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (retval) - goto out; + int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { + struct _cpuid4_info *this_leaf; this_leaf = CPUID4_INFO_IDX(cpu, j); - retval = cpuid4_cache_lookup(j, this_leaf); - if (unlikely(retval < 0)) { + *retval = cpuid4_cache_lookup(j, this_leaf); + if (unlikely(*retval < 0)) { int i; for (i = 0; i < j; i++) @@@ -567,9 -552,21 +552,21 @@@ } cache_shared_cpu_map_setup(cpu, j); } - set_cpus_allowed_ptr(current, &oldmask); + } + + static int __cpuinit detect_cache_attributes(unsigned int cpu) + { + int retval; + + if (num_cache_leaves == 0) + return -ENOENT; + + per_cpu(cpuid4_info, cpu) = kzalloc( + sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); + if (per_cpu(cpuid4_info, cpu) == NULL) + return -ENOMEM; - out: + smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { kfree(per_cpu(cpuid4_info, cpu)); per_cpu(cpuid4_info, cpu) = NULL; @@@ -626,8 -623,8 +623,8 @@@ static ssize_t show_shared_cpu_map_func cpumask_t *mask = &this_leaf->shared_cpu_map; n = type? 
- cpulist_scnprintf(buf, len-2, *mask): - cpumask_scnprintf(buf, len-2, *mask); + cpulist_scnprintf(buf, len-2, mask) : + cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; buf[n] = '\0'; } @@@ -644,17 -641,20 +641,17 @@@ static inline ssize_t show_shared_cpu_l return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { - switch(this_leaf->eax.split.type) { - case CACHE_TYPE_DATA: +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +{ + switch (this_leaf->eax.split.type) { + case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); - break; - case CACHE_TYPE_INST: + case CACHE_TYPE_INST: return sprintf(buf, "Instruction\n"); - break; - case CACHE_TYPE_UNIFIED: + case CACHE_TYPE_UNIFIED: return sprintf(buf, "Unified\n"); - break; - default: + default: return sprintf(buf, "Unknown\n"); - break; } } diff --combined arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 748c8f9e7a0,a1de80f368f..a5a5e053037 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@@ -83,34 -83,41 +83,41 @@@ static DEFINE_PER_CPU(unsigned char, ba * CPU Initialization */ + struct thresh_restart { + struct threshold_block *b; + int reset; + u16 old_limit; + }; + /* must be called with correct cpu affinity */ - static void threshold_restart_bank(struct threshold_block *b, - int reset, u16 old_limit) + static long threshold_restart_bank(void *_tr) { + struct thresh_restart *tr = _tr; u32 mci_misc_hi, mci_misc_lo; - rdmsr(b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - reset = 1; /* limit cannot be lower than err count */ + if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ - if (reset) { /* reset err count and overflow bit */ + if (tr->reset) { /* reset err count and overflow bit */ mci_misc_hi = (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - b->threshold_limit); - } else if (old_limit) { /* change limit w/o reset */ + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (old_limit - b->threshold_limit); + (tr->old_limit - tr->b->threshold_limit); mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - b->interrupt_enable ? + tr->b->interrupt_enable ? 
(mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(b->address, mci_misc_lo, mci_misc_hi); + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + return 0; } /* cpu init entry point, called from mce.c with preempt off */ @@@ -120,6 -127,7 +127,7 @@@ void __cpuinit mce_amd_feature_init(str unsigned int cpu = smp_processor_id(); u8 lvt_off; u32 low = 0, high = 0, address = 0; + struct thresh_restart tr; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@@ -162,7 -170,10 +170,10 @@@ wrmsr(address, low, high); threshold_defaults.address = address; - threshold_restart_bank(&threshold_defaults, 0, 0); + tr.b = &threshold_defaults; + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); } } } @@@ -237,7 -248,7 +248,7 @@@ asmlinkage void mce_threshold_interrupt } } out: - add_pda(irq_threshold_count, 1); + inc_irq_stat(irq_threshold_count); irq_exit(); } @@@ -251,20 -262,6 +262,6 @@@ struct threshold_attr ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; - static void affinity_set(unsigned int cpu, cpumask_t *oldmask, - cpumask_t *newmask) - { - *oldmask = current->cpus_allowed; - cpus_clear(*newmask); - cpu_set(cpu, *newmask); - set_cpus_allowed_ptr(current, newmask); - } - - static void affinity_restore(const cpumask_t *oldmask) - { - set_cpus_allowed_ptr(current, oldmask); - } - #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ { \ @@@ -277,15 -274,16 +274,16 @@@ static ssize_t store_interrupt_enable(s const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; b->interrupt_enable = !!new; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, 0); - affinity_restore(&oldmask); + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } @@@ -294,8 -292,7 +292,7 @@@ static ssize_t store_threshold_limit(st const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; - u16 old; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; @@@ -303,34 -300,36 +300,36 @@@ new = THRESHOLD_MAX; if (new < 1) new = 1; - old = b->threshold_limit; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; + tr.b = b; + tr.reset = 0; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, old); - affinity_restore(&oldmask); + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } - static ssize_t show_error_count(struct threshold_block *b, char *buf) + static long local_error_count(void *_b) { - u32 high, low; - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); + struct threshold_block *b = _b; + u32 low, high; + rdmsr(b->address, low, high); - affinity_restore(&oldmask); - return sprintf(buf, "%x\n", - (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); + return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); + } + + static ssize_t show_error_count(struct threshold_block *b, char *buf) + { + return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); } static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 1, 0); - 
affinity_restore(&oldmask); + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return 1; } @@@ -463,12 -462,19 +462,19 @@@ out_free return err; } + static long local_allocate_threshold_blocks(void *_bank) + { + unsigned int *bank = _bank; + + return allocate_threshold_blocks(smp_processor_id(), *bank, 0, + MSR_IA32_MC0_MISC + *bank * 4); + } + /* symlinks sibling shared banks to first core. first core owns dir/files. */ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - cpumask_t oldmask, newmask; char name[32]; sprintf(name, "threshold_bank%i", bank); @@@ -519,11 -525,7 +525,7 @@@ per_cpu(threshold_banks, cpu)[bank] = b; - affinity_set(cpu, &oldmask, &newmask); - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(&oldmask); - + err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); if (err) goto out_free; diff --combined arch/x86/kernel/genx2apic_uv_x.c index dece1728973,0e88be11227..b193e082f6c --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@@ -10,7 -10,6 +10,7 @@@ #include #include +#include #include #include #include @@@ -18,9 -17,6 +18,9 @@@ #include #include #include +#include +#include +#include #include #include #include @@@ -79,16 -75,15 +79,15 @@@ EXPORT_SYMBOL(sn_rtc_cycles_per_second) /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - static cpumask_t uv_target_cpus(void) + static const struct cpumask *uv_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } - static cpumask_t uv_vector_allocation_domain(int cpu) + static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) @@@ -127,28 -122,37 +126,37 @@@ static void uv_send_IPI_one(int cpu, in uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } - static void uv_send_IPI_mask(cpumask_t mask, int vector) + static void uv_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned int cpu; - for_each_possible_cpu(cpu) - if (cpu_isset(cpu, mask)) + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); + } + + static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) + { + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); + + for_each_cpu(cpu, mask) + if (cpu != this_cpu) uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - uv_send_IPI_mask(mask, vector); + for_each_online_cpu(cpu) + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_all(int vector) { - uv_send_IPI_mask(cpu_online_map, vector); + uv_send_IPI_mask(cpu_online_mask, vector); } static int uv_apic_id_registered(void) @@@ -160,7 -164,7 +168,7 @@@ static void uv_init_apic_ldr(void { } - static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) + static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@@ -168,13 -172,30 +176,30 @@@ * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. 
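The mce_amd_64.c changes above are the clearest example of the work_on_cpu() idiom this merge introduces: instead of temporarily rewriting current->cpus_allowed (the old affinity_set()/affinity_restore() helpers), the per-CPU MSR access is packed into a long (*fn)(void *) callback plus an argument struct and executed on the target CPU, with the long return value carried back to the caller. Reduced to a skeleton (the struct and MSR choice are hypothetical, the work_on_cpu() usage mirrors the code above):

struct my_msr_args {
        u32 msr;
        u32 lo, hi;
};

/* runs on the CPU given to work_on_cpu(), so rdmsr() hits that core */
static long read_my_msr(void *_args)
{
        struct my_msr_args *a = _args;

        rdmsr(a->msr, a->lo, a->hi);
        return 0;
}

static long read_msr_on(unsigned int cpu, struct my_msr_args *a)
{
        /* may sleep: the work is queued to and completed by that CPU */
        return work_on_cpu(cpu, read_my_msr, a);
}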
*/ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } + static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) + { + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; + } + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@@ -222,8 -243,10 +247,10 @@@ struct genapic apic_x2apic_uv_x = .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@@ -359,103 -382,6 +386,103 @@@ static __init void uv_rtc_init(void sn_rtc_cycles_per_second = ticks_per_sec; } +/* + * percpu heartbeat timer + */ +static void uv_heartbeat(unsigned long ignored) +{ + struct timer_list *timer = &uv_hub_info->scir.timer; + unsigned char bits = uv_hub_info->scir.state; + + /* flip heartbeat bit */ + bits ^= SCIR_CPU_HEARTBEAT; + + /* is this cpu idle? */ + if (idle_cpu(raw_smp_processor_id())) + bits &= ~SCIR_CPU_ACTIVITY; + else + bits |= SCIR_CPU_ACTIVITY; + + /* update system controller interface reg */ + uv_set_scir_bits(bits); + + /* enable next timer period */ + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); +} + +static void __cpuinit uv_heartbeat_enable(int cpu) +{ + if (!uv_cpu_hub_info(cpu)->scir.enabled) { + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); + setup_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; + } + + /* check boot cpu */ + if (!uv_cpu_hub_info(0)->scir.enabled) + uv_heartbeat_enable(0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __cpuinit uv_heartbeat_disable(int cpu) +{ + if (uv_cpu_hub_info(cpu)->scir.enabled) { + uv_cpu_hub_info(cpu)->scir.enabled = 0; + del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + } + uv_set_cpu_scir_bits(cpu, 0xff); +} + +/* + * cpu hotplug notifier + */ +static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + uv_heartbeat_enable(cpu); + break; + case CPU_DOWN_PREPARE: + uv_heartbeat_disable(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static __init void uv_scir_register_cpu_notifier(void) +{ + hotcpu_notifier(uv_scir_cpu_notify, 0); +} + +#else /* !CONFIG_HOTPLUG_CPU */ + +static __init void uv_scir_register_cpu_notifier(void) +{ +} + +static __init int uv_init_heartbeat(void) +{ + int cpu; + + if (is_uv_system()) + for_each_online_cpu(cpu) + uv_heartbeat_enable(cpu); + return 0; +} + +late_initcall(uv_init_heartbeat); + +#endif /* !CONFIG_HOTPLUG_CPU */ + /* * Called on each cpu to initialize the per_cpu UV data area. 
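uv_heartbeat_enable() above is a textbook per-CPU timer that has to cope with CPU hotplug: setup_timer() plus add_timer_on() pins a timer_list to one CPU, the handler re-arms itself with mod_timer(), and a hotcpu_notifier() tears the timer down in CPU_DOWN_PREPARE and recreates it on CPU_ONLINE. With the UV SCIR details stripped out, the skeleton looks like this (names are hypothetical, the calls are the ones used above):

static DEFINE_PER_CPU(struct timer_list, my_timer);

static void my_beat(unsigned long cpu)
{
        /* ... poke the per-CPU hardware here ... */
        mod_timer(&per_cpu(my_timer, cpu), jiffies + HZ);
}

static void __cpuinit my_beat_enable(int cpu)
{
        struct timer_list *timer = &per_cpu(my_timer, cpu);

        setup_timer(timer, my_beat, cpu);
        timer->expires = jiffies + HZ;
        add_timer_on(timer, cpu);       /* the timer fires on that CPU only */
}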
* ZZZ hotplug not supported yet @@@ -529,7 -455,7 +556,7 @@@ void __init uv_system_init(void uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &uv_coherency_id, &uv_region_size); + &sn_coherency_id, &sn_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@@ -540,7 -466,8 +567,7 @@@ uv_blade_info[blade].nr_possible_cpus++; uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = - lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; @@@ -550,8 -477,7 +577,8 @@@ uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; + uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@@ -568,6 -494,4 +595,6 @@@ map_mmioh_high(max_pnode); uv_cpu_init(); + uv_scir_register_cpu_notifier(); + proc_mkdir("sgi_uv", NULL); } diff --combined arch/x86/kernel/hpet.c index 845ea097383,e76d7e27297..cd759ad9069 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@@ -33,9 -33,7 +33,9 @@@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; -unsigned long hpet_num_timers; +#ifdef CONFIG_PCI_MSI +static unsigned long hpet_num_timers; +#endif static void __iomem *hpet_virt_address; struct hpet_dev { @@@ -248,7 -246,7 +248,7 @@@ static void hpet_legacy_clockevent_regi * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; printk(KERN_DEBUG "hpet clockevent registered\n"); @@@ -303,7 -301,7 +303,7 @@@ static void hpet_set_mode(enum clock_ev struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); hpet_setup_msi_irq(hdev->irq); disable_irq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } break; @@@ -451,7 -449,7 +451,7 @@@ static int hpet_setup_irq(struct hpet_d return -1; disable_irq(dev->irq); - irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); enable_irq(dev->irq); printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", @@@ -502,7 -500,7 +502,7 @@@ static void init_one_hpet_msi_clockeven /* 5 usec minimum reprogramming delta. 
*/ evt->min_delta_ns = 5000; - evt->cpumask = cpumask_of_cpu(hdev->cpu); + evt->cpumask = cpumask_of(hdev->cpu); clockevents_register_device(evt); } @@@ -813,7 -811,7 +813,7 @@@ int __init hpet_enable(void out_nohpet: hpet_clear_mapping(); - boot_hpet_disable = 1; + hpet_address = 0; return 0; } @@@ -836,11 -834,10 +836,11 @@@ static __init int hpet_late_init(void hpet_address = force_hpet_address; hpet_enable(); - if (!hpet_virt_address) - return -ENODEV; } + if (!hpet_virt_address) + return -ENODEV; + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); for_each_online_cpu(cpu) { diff --combined arch/x86/kernel/io_apic.c index 74917658b00,1cbf7c8d46e..62ecfc991e1 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@@ -136,8 -136,8 +136,8 @@@ static struct irq_pin_list *get_one_fre struct irq_cfg { struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; + cpumask_var_t domain; + cpumask_var_t old_domain; unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; @@@ -152,25 -152,25 +152,25 @@@ static struct irq_cfg irq_cfgx[] = #else static struct irq_cfg irq_cfgx[NR_IRQS] = { #endif - [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, + [0] = { .vector = IRQ0_VECTOR, }, + [1] = { .vector = IRQ1_VECTOR, }, + [2] = { .vector = IRQ2_VECTOR, }, + [3] = { .vector = IRQ3_VECTOR, }, + [4] = { .vector = IRQ4_VECTOR, }, + [5] = { .vector = IRQ5_VECTOR, }, + [6] = { .vector = IRQ6_VECTOR, }, + [7] = { .vector = IRQ7_VECTOR, }, + [8] = { .vector = IRQ8_VECTOR, }, + [9] = { .vector = IRQ9_VECTOR, }, + [10] = { .vector = IRQ10_VECTOR, }, + [11] = { .vector = IRQ11_VECTOR, }, + [12] = { .vector = IRQ12_VECTOR, }, + [13] = { .vector = IRQ13_VECTOR, }, + [14] = { .vector = IRQ14_VECTOR, }, + [15] = { .vector = IRQ15_VECTOR, }, }; -void __init arch_early_irq_init(void) +int __init arch_early_irq_init(void) { struct irq_cfg *cfg; struct irq_desc *desc; @@@ -183,9 -183,11 +183,13 @@@ for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; + alloc_bootmem_cpumask_var(&cfg[i].domain); + alloc_bootmem_cpumask_var(&cfg[i].old_domain); + if (i < NR_IRQS_LEGACY) + cpumask_setall(cfg[i].domain); } + + return 0; } #ifdef CONFIG_SPARSE_IRQ @@@ -209,12 -211,26 +213,26 @@@ static struct irq_cfg *get_one_free_irq node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + if (cfg) { + /* FIXME: needs alloc_cpumask_var_node() */ + if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) { + kfree(cfg); + cfg = NULL; + } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) { + free_cpumask_var(cfg->domain); + kfree(cfg); + cfg = NULL; + } 
else { + cpumask_clear(cfg->domain); + cpumask_clear(cfg->old_domain); + } + } printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node); return cfg; } -void arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int cpu) { struct irq_cfg *cfg; @@@ -226,8 -242,6 +244,8 @@@ BUG_ON(1); } } + + return 0; } #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC @@@ -333,13 -347,14 +351,14 @@@ void arch_free_chip_data(struct irq_des } } - static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) + static void + set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg = desc->chip_data; if (!cfg->move_in_progress) { /* it means that domain is not changed */ - if (!cpus_intersects(desc->affinity, mask)) + if (!cpumask_intersects(&desc->affinity, mask)) cfg->move_desc_pending = 1; } } @@@ -354,7 -369,8 +373,8 @@@ static struct irq_cfg *irq_cfg(unsigne #endif #ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC - static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) + static inline void + set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) { } #endif @@@ -485,6 -501,26 +505,26 @@@ static void ioapic_mask_entry(int apic } #ifdef CONFIG_SMP + static void send_cleanup_vector(struct irq_cfg *cfg) + { + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; + } + static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; @@@ -520,41 -556,55 +560,55 @@@ } } - static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask); + static int + assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask) + /* + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid + * of that, or returns BAD_APICID and leaves desc->affinity untouched. + */ + static unsigned int + set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; irq = desc->irq; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; + return BAD_APICID; + cpumask_and(&desc->affinity, cfg->domain, mask); set_extra_move_desc(desc, mask); + return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); + } - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. 
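The io_apic.c rework above and below leans on the new cpumask_var_t type: with MAXSMP/CPUMASK_OFFSTACK=y it is a pointer that must be obtained with alloc_cpumask_var() (or alloc_bootmem_cpumask_var() at early boot, as arch_early_irq_init() above does) and released with free_cpumask_var(); without CPUMASK_OFFSTACK the same calls degenerate to a plain on-stack bitmap and a no-op. The usual life cycle of a temporary mask, as __assign_irq_vector() uses further down, is (sketch; the per-CPU action is hypothetical):

static int visit_online_subset(const struct cpumask *requested)
{
        cpumask_var_t tmp_mask;
        int cpu;

        if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
                return -ENOMEM; /* can only fail with CPUMASK_OFFSTACK=y */

        cpumask_and(tmp_mask, requested, cpu_online_mask);
        for_each_cpu(cpu, tmp_mask)
                do_something_on(cpu);   /* hypothetical per-CPU action */

        free_cpumask_var(tmp_mask);
        return 0;
}

send_cleanup_vector() above shows the defensive variant that still makes progress when the GFP_ATOMIC allocation fails.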
- */ - dest = SET_APIC_LOGICAL_ID(dest); + static void + set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) + { + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg); - desc->affinity = mask; + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } spin_unlock_irqrestore(&ioapic_lock, flags); } - static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) + static void + set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc; @@@ -1222,7 -1272,8 +1276,8 @@@ void unlock_vector_lock(void spin_unlock(&vector_lock); } - static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask) + static int + __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@@ -1237,49 -1288,49 +1292,49 @@@ */ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; unsigned int old_vector; - int cpu; + int cpu, err; + cpumask_var_t tmp_mask; if ((cfg->move_in_progress) || cfg->move_cleanup_count) return -EBUSY; - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; old_vector = cfg->vector; if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); return 0; + } } - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { int new_cpu; int vector, offset; - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); + vector_allocation_domain(cpu, tmp_mask); vector = current_vector; offset = current_offset; next: vector += 8; if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ + /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; - #ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; - #else - if (vector == SYSCALL_VECTOR) + + if (test_bit(vector, used_vectors)) goto next; - #endif - for_each_cpu_mask_nr(new_cpu, new_mask) + + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! 
*/ @@@ -1287,18 -1338,21 +1342,21 @@@ current_offset = offset; if (old_vector) { cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; + cpumask_copy(cfg->old_domain, cfg->domain); } - for_each_cpu_mask_nr(new_cpu, new_mask) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; - cfg->domain = domain; - return 0; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; } - return -ENOSPC; + free_cpumask_var(tmp_mask); + return err; } - static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask) + static int + assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@@ -1311,23 -1365,20 +1369,20 @@@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) { - cpumask_t mask; int cpu, vector; BUG_ON(!cfg->vector); vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cpus_clear(cfg->domain); + cpumask_clear(cfg->domain); if (likely(!cfg->move_in_progress)) return; - cpus_and(mask, cfg->old_domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@@ -1349,8 -1400,10 +1404,8 @@@ void __setup_vector_irq(int cpu /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { - if (!desc) - continue; cfg = desc->chip_data; - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; @@@ -1362,7 -1415,7 +1417,7 @@@ continue; cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@@ -1498,18 -1551,17 +1553,17 @@@ static void setup_IO_APIC_irq(int apic { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - cpumask_t mask; + unsigned int dest; if (!IO_APIC_IRQ(irq)) return; cfg = desc->chip_data; - mask = TARGET_CPUS; - if (assign_irq_vector(irq, cfg, mask)) + if (assign_irq_vector(irq, cfg, TARGET_CPUS)) return; - cpus_and(mask, cfg->domain, mask); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@@ -1519,8 -1571,7 +1573,7 @@@ if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { + dest, trigger, polarity, cfg->vector)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic].mp_apicid, pin); __clear_irq_vector(irq, cfg); @@@ -1732,6 -1783,8 +1785,6 @@@ __apicdebuginit(void) print_IO_APIC(voi for_each_irq_desc(irq, desc) { struct irq_pin_list *entry; - if (!desc) - continue; cfg = desc->chip_data; entry = cfg->irq_2_pin; if (!entry) @@@ -2240,7 -2293,7 +2293,7 @@@ static int ioapic_retrigger_irq(unsigne unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@@ -2289,18 -2342,17 +2342,17 @@@ static DECLARE_DELAYED_WORK(ir_migratio * as simple as edge triggered migration and we can do the irq migration * with a simple atomic update to IO-APIC RTE. 
*/ - static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask) + static void + migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - cpumask_t tmp, cleanup_mask; struct irte irte; int modify_ioapic_rte; unsigned int dest; unsigned long flags; unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpumask_intersects(mask, cpu_online_mask)) return; irq = desc->irq; @@@ -2313,8 -2365,7 +2365,7 @@@ set_extra_move_desc(desc, mask); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, mask); modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { @@@ -2331,14 -2382,10 +2382,10 @@@ */ modify_irte(irq, &irte); - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (cfg->move_in_progress) + send_cleanup_vector(cfg); - desc->affinity = mask; + cpumask_copy(&desc->affinity, mask); } static int migrate_irq_remapped_level_desc(struct irq_desc *desc) @@@ -2360,11 -2407,11 +2407,11 @@@ } /* everthing is clear. we have right of way */ - migrate_ioapic_irq_desc(desc, desc->pending_mask); + migrate_ioapic_irq_desc(desc, &desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); unmask: unmask_IO_APIC_irq_desc(desc); @@@ -2378,6 -2425,9 +2425,6 @@@ static void ir_irq_migration(struct wor struct irq_desc *desc; for_each_irq_desc(irq, desc) { - if (!desc) - continue; - if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@@ -2389,7 -2439,7 +2436,7 @@@ continue; } - desc->chip->set_affinity(irq, desc->pending_mask); + desc->chip->set_affinity(irq, &desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@@ -2398,18 -2448,20 +2445,20 @@@ /* * Migrates the IRQ destination in the process context. 
*/ - static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask) + static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) { if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; + cpumask_copy(&desc->pending_mask, mask); migrate_irq_remapped_level_desc(desc); return; } migrate_ioapic_irq_desc(desc, mask); } - static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) + static void set_ir_ioapic_affinity_irq(unsigned int irq, + const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@@ -2420,9 -2472,10 +2469,9 @@@ asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); @@@ -2444,7 -2497,7 +2493,7 @@@ if (!cfg->move_cleanup_count) goto unlock; - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; __get_cpu_var(vector_irq)[vector] = -1; @@@ -2467,7 -2520,7 +2516,7 @@@ static void irq_complete_move(struct ir if (likely(!cfg->move_desc_pending)) return; - /* domain is not change, but affinity is changed */ + /* domain has not changed, but affinity did */ me = smp_processor_id(); if (cpu_isset(me, desc->affinity)) { *descp = desc = move_irq_desc(desc, me); @@@ -2481,20 -2534,14 +2530,14 @@@ vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC *descp = desc = move_irq_desc(desc, me); /* get the new one */ cfg = desc->chip_data; #endif - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); } #else static inline void irq_complete_move(struct irq_desc **descp) {} @@@ -2667,6 -2714,9 +2710,6 @@@ static inline void init_IO_APIC_traps(v * 0x80, because int 0x80 is hm, kind of importantish. 
;) */ for_each_irq_desc(irq, desc) { - if (!desc) - continue; - cfg = desc->chip_data; if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* @@@ -3216,16 -3266,13 +3259,13 @@@ static int msi_compose_msg(struct pci_d struct irq_cfg *cfg; int err; unsigned dest; - cpumask_t tmp; cfg = irq_cfg(irq); - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, cfg, tmp); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (err) return err; - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { @@@ -3279,26 -3326,18 +3319,18 @@@ } #ifdef CONFIG_SMP - static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) + static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); read_msi_msg_desc(desc, &msg); @@@ -3308,37 -3347,27 +3340,27 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); - desc->affinity = mask; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ - static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) + static void + ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = desc->chip_data; unsigned int dest; - cpumask_t tmp, cleanup_mask; struct irte irte; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - if (get_irte(irq, &irte)) return; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@@ -3352,14 -3381,8 +3374,8 @@@ * at the new destination. So, time to cleanup the previous * vector allocation. 
*/ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc->affinity = mask; + if (cfg->move_in_progress) + send_cleanup_vector(cfg); } #endif @@@ -3550,26 -3573,18 +3566,18 @@@ void arch_teardown_msi_irq(unsigned in #ifdef CONFIG_DMAR #ifdef CONFIG_SMP - static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) + static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); dmar_msi_read(irq, &msg); @@@ -3579,7 -3594,6 +3587,6 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - desc->affinity = mask; } #endif /* CONFIG_SMP */ @@@ -3613,26 -3627,18 +3620,18 @@@ int arch_setup_dmar_msi(unsigned int ir #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP - static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) + static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); hpet_msi_read(irq, &msg); @@@ -3642,7 -3648,6 +3641,6 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); - desc->affinity = mask; } #endif /* CONFIG_SMP */ @@@ -3697,28 -3702,19 +3695,19 @@@ static void target_ht_irq(unsigned int write_ht_irq_msg(irq, &msg); } - static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) + static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - desc->affinity = mask; } #endif @@@ -3738,17 -3734,14 +3727,14 @@@ int arch_setup_ht_irq(unsigned int irq { struct irq_cfg *cfg; int err; - cpumask_t tmp; cfg = irq_cfg(irq); - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, cfg, tmp); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (!err) { struct ht_irq_msg msg; unsigned dest; - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@@ -3784,7 -3777,7 +3770,7 @@@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset) { - const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + const struct cpumask *eligible_cpu 
= cpumask_of(cpu); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@@ -3794,7 -3787,7 +3780,7 @@@ cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, *eligible_cpu); + err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; @@@ -3813,7 -3806,7 +3799,7 @@@ entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(*eligible_cpu); + entry->dest = cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@@ -4024,7 -4017,7 +4010,7 @@@ void __init setup_ioapic_dest(void int pin, ioapic, irq, irq_entry; struct irq_desc *desc; struct irq_cfg *cfg; - cpumask_t mask; + const struct cpumask *mask; if (skip_ioapic_setup == 1) return; @@@ -4055,7 -4048,7 +4041,7 @@@ */ if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = &desc->affinity; else mask = TARGET_CPUS; diff --combined arch/x86/kernel/irq_64.c index a174a217eb1,fca2991443f..6383d50f82e --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@@ -13,12 -13,12 +13,12 @@@ #include #include #include +#include #include #include #include #include -#ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: * @@@ -28,25 -28,26 +28,25 @@@ */ static inline void stack_overflow_check(struct pt_regs *regs) { +#ifdef CONFIG_DEBUG_STACKOVERFLOW u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = -60*HZ; - - if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + 128 && - time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); - show_stack(NULL,NULL); - warned = jiffies; - } -} + + WARN_ONCE(regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128, + + "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); #endif +} /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; @@@ -59,7 -60,9 +59,7 @@@ irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; -#ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); -#endif desc = irq_to_desc(irq); if (likely(desc)) @@@ -80,16 -83,17 +80,17 @@@ } #ifdef CONFIG_HOTPLUG_CPU - void fixup_irqs(cpumask_t map) + /* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
*/ + void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + const struct cpumask *affinity; if (!desc) continue; @@@ -99,23 -103,23 +100,23 @@@ /* interrupt's are disabled at this point */ spin_lock(&desc->lock); + affinity = &desc->affinity; if (!irq_has_action(irq) || - cpus_equal(desc->affinity, map)) { + cpumask_equal(affinity, cpu_online_mask)) { spin_unlock(&desc->lock); continue; } - cpus_and(mask, desc->affinity, map); - if (cpus_empty(mask)) { + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - mask = map; + affinity = cpu_all_mask; } if (desc->chip->mask) desc->chip->mask(irq); if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, affinity); else if (!(warned++)) set_affinity = 0; diff --combined arch/x86/kernel/irqinit_32.c index 203384ed2b5,61aa2a1004b..84723295f88 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@@ -110,6 -110,18 +110,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; + int vector_used_by_percpu_irq(unsigned int vector) + { + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; + } + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@@ -128,7 -140,7 +140,7 @@@ void __init native_init_IRQ(void for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i]); + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } @@@ -146,10 -158,12 +158,12 @@@ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --combined arch/x86/kernel/irqinit_64.c index 6190e6ef546,1020919efe1..31ebfe38e96 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@@ -23,6 -23,41 +23,6 @@@ #include #include -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. 
- */ - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.text\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt\n" \ - ".previous"); - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x30-0x3f) @@@ -38,6 -73,37 +38,6 @@@ * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - /* * IRQ2 is cascade interrupt to second interrupt controller @@@ -69,6 -135,18 +69,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; + int vector_used_by_percpu_irq(unsigned int vector) + { + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; + } + void __init init_ISA_irqs(void) { int i; @@@ -121,6 -199,7 +133,7 @@@ static void __init smp_intr_init(void /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif } diff --combined arch/x86/kernel/reboot.c index 72e0e4e712d,ba7b9a0e606..39643b1df06 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@@ -12,7 -12,6 +12,7 @@@ #include #include #include +#include #ifdef CONFIG_X86_32 # include @@@ -40,12 -39,6 +40,12 @@@ int reboot_force static int reboot_cpu = -1; #endif +/* This is set if we need to go through the 'emergency' path. 
+ * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ bool port_cf9_safe = false; @@@ -375,48 -368,6 +375,48 @@@ static inline void kb_wait(void } } +static void vmxoff_nmi(int cpu, struct die_args *args) +{ + cpu_emergency_vmxoff(); +} + +/* Use NMIs as IPIs to tell all CPUs to disable virtualization + */ +static void emergency_vmx_disable_all(void) +{ + /* Just make sure we won't change CPUs while doing this */ + local_irq_disable(); + + /* We need to disable VMX on all CPUs before rebooting, otherwise + * we risk hanging up the machine, because the CPU ignore INIT + * signals when VMX is enabled. + * + * We can't take any locks and we may be on an inconsistent + * state, so we use NMIs as IPIs to tell the other CPUs to disable + * VMX and halt. + * + * For safety, we will avoid running the nmi_shootdown_cpus() + * stuff unnecessarily, but we don't have a way to check + * if other CPUs have VMX enabled. So we will call it only if the + * CPU we are running on has VMX enabled. + * + * We will miss cases where VMX is not enabled on all CPUs. This + * shouldn't do much harm because KVM always enable VMX on all + * CPUs anyway. But we can miss it on the small window where KVM + * is still enabling VMX. + */ + if (cpu_has_vmx() && cpu_vmx_enabled()) { + /* Disable VMX on this CPU. + */ + cpu_vmxoff(); + + /* Halt and disable VMX on the other CPUs */ + nmi_shootdown_cpus(vmxoff_nmi); + + } +} + + void __attribute__((weak)) mach_reboot_fixups(void) { } @@@ -425,9 -376,6 +425,9 @@@ static void native_machine_emergency_re { int i; + if (reboot_emergency) + emergency_vmx_disable_all(); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@@ -534,19 -482,13 +534,19 @@@ void native_machine_shutdown(void #endif } +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + static void native_machine_restart(char *__unused) { printk("machine restart\n"); if (!reboot_force) machine_shutdown(); - machine_emergency_restart(); + __machine_emergency_restart(0); } static void native_machine_halt(void) @@@ -590,7 -532,7 +590,7 @@@ void machine_shutdown(void void machine_emergency_restart(void) { - machine_ops.emergency_restart(); + __machine_emergency_restart(1); } void machine_restart(char *cmd) @@@ -650,10 -592,7 +650,7 @@@ static int crash_nmi_callback(struct no static void smp_send_nmi_allbutself(void) { - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); + send_IPI_allbutself(NMI_VECTOR); } static struct notifier_block crash_nmi_nb = { diff --combined arch/x86/kernel/smp.c index 7e558db362c,49ed667b06f..beea2649a24 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@@ -118,22 -118,22 +118,22 @@@ static void native_smp_send_reschedule( WARN_ON(1); return; } - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); } - void native_send_call_func_ipi(cpumask_t mask) + void native_send_call_func_ipi(const struct cpumask *mask) { cpumask_t allbutself; allbutself = cpu_online_map; 
cpu_clear(smp_processor_id(), allbutself); - if (cpus_equal(mask, allbutself) && + if (cpus_equal(*mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else @@@ -165,7 -165,11 +165,7 @@@ static void native_smp_send_stop(void void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_resched_count++; -#else - add_pda(irq_resched_count, 1); -#endif + inc_irq_stat(irq_resched_count); } void smp_call_function_interrupt(struct pt_regs *regs) @@@ -173,7 -177,11 +173,7 @@@ ack_APIC_irq(); irq_enter(); generic_smp_call_function_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } @@@ -182,7 -190,11 +182,7 @@@ void smp_call_function_single_interrupt ack_APIC_irq(); irq_enter(); generic_smp_call_function_single_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } diff --combined arch/x86/kernel/smpboot.c index f8500c96944,1a9941b1115..31869bf5fab --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@@ -102,14 -102,8 +102,8 @@@ EXPORT_SYMBOL(smp_num_siblings) /* Last level cache ID of each logical CPU */ DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; - /* bitmap of online cpus */ - cpumask_t cpu_online_map __read_mostly; - EXPORT_SYMBOL(cpu_online_map); - cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; - cpumask_t cpu_possible_map; - EXPORT_SYMBOL(cpu_possible_map); /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); @@@ -288,7 -282,7 +282,7 @@@ static int __cpuinitdata unsafe_smp /* * Activate a secondary processor. */ -static void __cpuinit start_secondary(void *unused) +notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too @@@ -1081,10 -1075,8 +1075,10 @@@ static int __init smp_sanity_check(unsi #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING "weird, boot CPU (#%d) not listed" - "by the BIOS.\n", hard_smp_processor_id()); + printk(KERN_WARNING + "weird, boot CPU (#%d) not listed by the BIOS.\n", + hard_smp_processor_id()); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); } @@@ -1260,6 -1252,15 +1254,15 @@@ void __init native_smp_cpus_done(unsign check_nmi_watchdog(); } + static int __initdata setup_possible_cpus = -1; + static int __init _setup_possible_cpus(char *str) + { + get_option(&str, &setup_possible_cpus); + return 0; + } + early_param("possible_cpus", _setup_possible_cpus); + + /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@@ -1272,7 -1273,7 +1275,7 @@@ * * Three ways to find out the number of additional hotplug CPUs: * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with additional_cpus=NUM + * - The user can overwrite it with possible_cpus=NUM * - Otherwise don't reserve additional CPUs. * We do this because additional CPUs waste a lot of memory. 
* -AK @@@ -1285,9 -1286,17 +1288,17 @@@ __init void prefill_possible_map(void if (!num_processors) num_processors = 1; - possible = num_processors + disabled_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; + if (setup_possible_cpus == -1) + possible = num_processors + disabled_cpus; + else + possible = setup_possible_cpus; + + if (possible > CONFIG_NR_CPUS) { + printk(KERN_WARNING + "%d Processors exceeds NR_CPUS limit of %d\n", + possible, CONFIG_NR_CPUS); + possible = CONFIG_NR_CPUS; + } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); @@@ -1352,7 -1361,7 +1363,7 @@@ void cpu_disable_common(void lock_vector_lock(); remove_cpu_from_maps(cpu); unlock_vector_lock(); - fixup_irqs(cpu_online_map); + fixup_irqs(); } int native_cpu_disable(void) diff --combined arch/x86/kernel/tlb_32.c index 8da059f949b,174ea90d1cb..ce505464224 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@@ -34,8 -34,9 +34,8 @@@ static DEFINE_SPINLOCK(tlbstate_lock) */ void leave_mm(int cpu) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@@ -103,8 -104,8 +103,8 @@@ void smp_invalidate_interrupt(struct pt * BUG(); */ - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@@ -118,7 -119,7 +118,7 @@@ smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@@ -163,7 -164,7 +163,7 @@@ * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ @@@ -237,7 -238,7 +237,7 @@@ static void do_flush_tlb_all(void *info unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } diff --combined arch/x86/kernel/tlb_64.c index 29887d7081a,de6f1bda0c5..f8be6f1d2e4 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@@ -154,7 -154,7 +154,7 @@@ asmlinkage void smp_invalidate_interrup out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@@ -191,7 -191,7 +191,7 @@@ * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); while (!cpus_empty(f->flush_cpumask)) cpu_relax(); diff --combined arch/x86/kernel/traps.c index 141907ab6e2,4a6dff39a47..2d1f4c7e405 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@@ -72,9 -72,6 +72,6 @@@ #include "cpu/mcheck/mce.h" - DECLARE_BITMAP(used_vectors, NR_VECTORS); - EXPORT_SYMBOL_GPL(used_vectors); - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? 
*/ @@@ -89,6 -86,9 +86,9 @@@ gate_desc idt_table[256 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif + DECLARE_BITMAP(used_vectors, NR_VECTORS); + EXPORT_SYMBOL_GPL(used_vectors); + static int ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) @@@ -481,7 -481,11 +481,7 @@@ do_nmi(struct pt_regs *regs, long error { nmi_enter(); -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif + inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); @@@ -660,7 -664,7 +660,7 @@@ void math_error(void __user *ip { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. @@@ -671,6 -675,7 +671,6 @@@ task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@@ -684,31 -689,34 +684,31 @@@ */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ + + err = swd & ~cwd & 0x3f; + #ifdef CONFIG_X86_32 + if (!err) return; #endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */ } force_sig_info(SIGFPE, &info, task); } @@@ -941,9 -949,7 +941,7 @@@ dotraplinkage void do_iret_error(struc void __init trap_init(void) { - #ifdef CONFIG_X86_32 int i; - #endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@@ -1000,11 -1006,15 +998,15 @@@ } set_system_trap_gate(SYSCALL_VECTOR, &system_call); + #endif /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); + #ifdef CONFIG_X86_64 + set_bit(IA32_SYSCALL_VECTOR, used_vectors); + #else set_bit(SYSCALL_VECTOR, used_vectors); #endif /* diff --combined arch/x86/lguest/boot.c index 50a779264bb,104c8220a38..a7ed208f81e --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@@ -590,8 -590,7 +590,8 @@@ static void __init lguest_init_IRQ(void * a straightforward 1 to 1 mapping, so force that here. */ __get_cpu_var(vector_irq)[vector] = i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[vector]); + set_intr_gate(vector, + interrupt[vector-FIRST_EXTERNAL_VECTOR]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); @@@ -738,7 -737,7 +738,7 @@@ static void lguest_time_init(void /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. 
*/ - lguest_clockevent.cpumask = cpumask_of_cpu(0); + lguest_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&lguest_clockevent); /* Finally, we unblock the timer interrupt. */ diff --combined arch/x86/xen/mmu.c index 773d68d3e91,e59e53b11e2..503c240e26c --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@@ -154,13 -154,13 +154,13 @@@ void xen_setup_mfn_list_list(void { unsigned pfn, idx; - for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@@ -179,7 -179,7 +179,7 @@@ void __init xen_build_dynamic_phys_to_m unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top[topidx] = &mfn_list[pfn]; @@@ -207,7 -207,7 +207,7 @@@ static void alloc_p2m(unsigned long **p p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); BUG_ON(p == NULL); - for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; if (cmpxchg(pp, p2m_missing, p) != p2m_missing) @@@ -407,8 -407,7 +407,8 @@@ out preempt_enable(); } -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ return *ptep; @@@ -879,8 -878,7 +879,8 @@@ static void __xen_pgd_pin(struct mm_str if (user_pgd) { xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ @@@ -995,8 -993,7 +995,8 @@@ static void __xen_pgd_unpin(struct mm_s pgd_t *user_pgd = xen_get_user_pgd(pgd); if (user_pgd) { - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } @@@ -1082,7 -1079,7 +1082,7 @@@ static void drop_other_mm_ref(void *inf static void xen_drop_mm_ref(struct mm_struct *mm) { - cpumask_t mask; + cpumask_var_t mask; unsigned cpu; if (current->active_mm == mm) { @@@ -1094,7 -1091,16 +1094,16 @@@ } /* Get the "official" set of cpus referring to our pagetable. */ - mask = mm->cpu_vm_mask; + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, &mm->cpu_vm_mask); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed @@@ -1103,11 -1109,12 +1112,12 @@@ if needed. 
*/ for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, mask); } - if (!cpus_empty(mask)) - smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); } #else static void xen_drop_mm_ref(struct mm_struct *mm) diff --combined drivers/xen/events.c index e26733a9df2,6c8193046e0..eb0dfdeaa94 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@@ -142,6 -142,9 +142,6 @@@ static void init_evtchn_cpu_bindings(vo /* By default all event channels notify CPU#0. */ for_each_irq_desc(i, desc) { - if (!desc) - continue; - desc->affinity = cpumask_of_cpu(0); } #endif @@@ -230,7 -233,6 +230,7 @@@ static void unmask_evtchn(int port static int find_unbound_irq(void) { int irq; + struct irq_desc *desc; /* Only allocate from dynirq range */ for (irq = 0; irq < nr_irqs; irq++) @@@ -240,10 -242,6 +240,10 @@@ if (irq == nr_irqs) panic("No available IRQ to bind to: increase nr_irqs!\n"); + desc = irq_to_desc_alloc_cpu(irq, 0); + if (WARN_ON(desc == NULL)) + return -1; + return irq; } @@@ -585,7 -583,7 +585,7 @@@ void rebind_evtchn_irq(int evtchn, int spin_unlock(&irq_mapping_update_lock); /* new event channels are always bound to cpu 0 */ - irq_set_affinity(irq, cpumask_of_cpu(0)); + irq_set_affinity(irq, cpumask_of(0)); /* Unmask the event channel. */ enable_irq(irq); @@@ -614,9 -612,9 +614,9 @@@ static void rebind_irq_to_cpu(unsigned } - static void set_affinity_irq(unsigned irq, cpumask_t dest) + static void set_affinity_irq(unsigned irq, const struct cpumask *dest) { - unsigned tcpu = first_cpu(dest); + unsigned tcpu = cpumask_first(dest); rebind_irq_to_cpu(irq, tcpu); } diff --combined include/linux/interrupt.h index 8cc8ef47f5b,7e85a6e89e4..990355fbc54 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@@ -111,13 -111,13 +111,13 @@@ extern void enable_irq(unsigned int irq extern cpumask_t irq_default_affinity; - extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask); + extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); #else /* CONFIG_SMP */ - static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask) + static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) { return -EINVAL; } @@@ -253,6 -253,9 +253,6 @@@ enu BLOCK_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, -#ifdef CONFIG_HIGH_RES_TIMERS - HRTIMER_SOFTIRQ, -#endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS @@@ -464,10 -467,4 +464,10 @@@ static inline void init_irq_proc(void int show_interrupts(struct seq_file *p, void *v); +struct irq_desc; + +extern int early_irq_init(void); +extern int arch_early_irq_init(void); +extern int arch_init_chip_data(struct irq_desc *desc, int cpu); + #endif diff --combined include/linux/irq.h index d64a6d49bde,fde5e613201..f899b502f18 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@@ -113,7 -113,8 +113,8 @@@ struct irq_chip void (*eoi)(unsigned int irq); void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, cpumask_t dest); + void (*set_affinity)(unsigned int irq, + const struct cpumask *dest); int (*retrigger)(unsigned int irq); int (*set_type)(unsigned int irq, unsigned int flow_type); int (*set_wake)(unsigned int irq, unsigned int on); @@@ -134,9 -135,6 +135,9 @@@ struct irq_2_iommu /** * struct 
irq_desc - interrupt descriptor * @irq: interrupt number for this descriptor + * @timer_rand_state: pointer to timer rand state struct + * @kstat_irqs: irq stats per cpu + * @irq_2_iommu: iommu with this irq * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @chip: low level interrupt hardware access * @msi_desc: MSI descriptor @@@ -148,8 -146,8 +149,8 @@@ * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple set_irq_wake() callers * @irq_count: stats field to detect stalled irqs - * @irqs_unhandled: stats field for spurious unhandled interrupts * @last_unhandled: aging timer for unhandled count + * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@@ -177,8 -175,8 +178,8 @@@ struct irq_desc unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ - unsigned int irqs_unhandled; unsigned long last_unhandled; /* Aging timer for unhandled count */ + unsigned int irqs_unhandled; spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; @@@ -193,23 -191,42 +194,23 @@@ const char *name; } ____cacheline_internodealigned_in_smp; -extern void early_irq_init(void); -extern void arch_early_irq_init(void); -extern void arch_init_chip_data(struct irq_desc *desc, int cpu); extern void arch_init_copy_chip_data(struct irq_desc *old_desc, struct irq_desc *desc, int cpu); extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); #ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; - -static inline struct irq_desc *irq_to_desc(unsigned int irq) -{ - return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; -} -static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) -{ - return irq_to_desc(irq); -} - -#else - -extern struct irq_desc *irq_to_desc(unsigned int irq); -extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +#else /* CONFIG_SPARSE_IRQ */ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); -# define for_each_irq_desc(irq, desc) \ - for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq)) -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq)) - #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) #define kstat_incr_irqs_this_cpu(irqno, DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]++) -#endif +#endif /* CONFIG_SPARSE_IRQ */ + +extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); static inline struct irq_desc * irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) diff --combined include/linux/sched.h index 8395e715809,e5f928a079e..158d53d0776 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -250,7 -250,7 +250,7 @@@ extern void init_idle_bootup_task(struc extern int runqueue_is_locked(void); extern void task_rq_unlock_wait(struct task_struct *p); - extern cpumask_t nohz_cpu_mask; + extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); #else @@@ -571,6 -571,12 +571,6 @@@ struct signal_struct */ struct rlimit rlim[RLIM_NLIMITS]; - /* keep the process-shared keyrings here so that they do the right - * thing in threads created with CLONE_THREAD */ -#ifdef CONFIG_KEYS - struct key *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ -#endif #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif @@@ -641,7 -647,6 +641,7 @@@ struct user_struct /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + struct user_namespace *user_ns; #ifdef CONFIG_USER_SCHED struct task_group *tg; @@@ -659,7 -664,6 +659,7 @@@ extern struct user_struct *find_user(ui extern struct user_struct root_user; #define INIT_USER (&root_user) + struct backing_dev_info; struct reclaim_state; @@@ -667,7 -671,8 +667,7 @@@ struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long long run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ @@@ -758,20 -763,51 +758,51 @@@ enum cpu_idle_type #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ #define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ - #define BALANCE_FOR_MC_POWER \ - (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) + enum powersavings_balance_level { + POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ + POWERSAVINGS_BALANCE_BASIC, /* Fill one thread/core/package + * first for long running threads + */ + POWERSAVINGS_BALANCE_WAKEUP, /* Also bias task wakeups to semi-idle + * cpu package for power savings + */ + MAX_POWERSAVINGS_BALANCE_LEVELS + }; - #define BALANCE_FOR_PKG_POWER \ - ((sched_mc_power_savings || sched_smt_power_savings) ? 
\ - SD_POWERSAVINGS_BALANCE : 0) + extern int sched_mc_power_savings, sched_smt_power_savings; - #define test_sd_parent(sd, flag) ((sd->parent && \ - (sd->parent->flags & flag)) ? 1 : 0) + static inline int sd_balance_for_mc_power(void) + { + if (sched_smt_power_savings) + return SD_POWERSAVINGS_BALANCE; + return 0; + } + + static inline int sd_balance_for_package_power(void) + { + if (sched_mc_power_savings | sched_smt_power_savings) + return SD_POWERSAVINGS_BALANCE; + + return 0; + } + + /* + * Optimise SD flags for power savings: + * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings. + * Keep default SD flags if sched_{smt,mc}_power_saving=0 + */ + + static inline int sd_power_saving_flags(void) + { + if (sched_mc_power_savings | sched_smt_power_savings) + return SD_BALANCE_NEWIDLE; + + return 0; + } struct sched_group { struct sched_group *next; /* Must be a circular list */ - cpumask_t cpumask; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a @@@ -784,8 -820,15 +815,15 @@@ * (see include/linux/reciprocal_div.h) */ u32 reciprocal_cpu_power; + + unsigned long cpumask[]; }; + static inline struct cpumask *sched_group_cpus(struct sched_group *sg) + { + return to_cpumask(sg->cpumask); + } + enum sched_domain_level { SD_LV_NONE = 0, SD_LV_SIBLING, @@@ -809,7 -852,6 +847,6 @@@ struct sched_domain struct sched_domain *parent; /* top domain must be null terminated */ struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ - cpumask_t span; /* span of all CPUs in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ @@@ -864,25 -906,73 +901,42 @@@ #ifdef CONFIG_SCHED_DEBUG char *name; #endif + + /* span of all CPUs in this domain */ + unsigned long span[]; }; - extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + static inline struct cpumask *sched_domain_span(struct sched_domain *sd) + { + return to_cpumask(sd->span); + } + + extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new); extern int arch_reinit_sched_domains(void); + /* Test a flag in parent sched domain */ + static inline int test_sd_parent(struct sched_domain *sd, int flag) + { + if (sd->parent && (sd->parent->flags & flag)) + return 1; + + return 0; + } + #else /* CONFIG_SMP */ struct sched_domain_attr; static inline void - partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { } #endif /* !CONFIG_SMP */ struct io_context; /* See blkdev.h */ -#define NGROUPS_SMALL 32 -#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) -struct group_info { - int ngroups; - atomic_t usage; - gid_t small_block[NGROUPS_SMALL]; - int nblocks; - gid_t *blocks[0]; -}; -/* - * get_group_info() must be called with the owning task locked (via task_lock()) - * when task != current. The reason being that the vast majority of callers are - * looking at current->group_info, which can not be changed except by the - * current task. Changing current->group_info requires the task lock, too. 
- */ -#define get_group_info(group_info) do { \ - atomic_inc(&(group_info)->usage); \ -} while (0) - -#define put_group_info(group_info) do { \ - if (atomic_dec_and_test(&(group_info)->usage)) \ - groups_free(group_info); \ -} while (0) - -extern struct group_info *groups_alloc(int gidsetsize); -extern void groups_free(struct group_info *group_info); -extern int set_current_groups(struct group_info *group_info); -extern int groups_search(struct group_info *group_info, gid_t grp); -/* access the groups "array" with this macro */ -#define GROUP_AT(gi, i) \ - ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK extern void prefetch_stack(struct task_struct *t); @@@ -926,7 -1016,7 +980,7 @@@ struct sched_class void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, - const cpumask_t *newmask); + const struct cpumask *newmask); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); @@@ -1138,7 -1228,6 +1192,7 @@@ struct task_struct * The buffer to hold the BTS data. */ void *bts_buffer; + size_t bts_size; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ @@@ -1162,12 -1251,17 +1216,12 @@@ struct list_head cpu_timers[3]; /* process credentials */ - uid_t uid,euid,suid,fsuid; - gid_t gid,egid,sgid,fsgid; - struct group_info *group_info; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - struct user_struct *user; - unsigned securebits; -#ifdef CONFIG_KEYS - unsigned char jit_keyring; /* default keyring to attach requested keys to */ - struct key *request_key_auth; /* assumed request_key authority */ - struct key *thread_keyring; /* keyring private to this thread */ -#endif + const struct cred *real_cred; /* objective and real subjective task + * credentials (COW) */ + const struct cred *cred; /* effective (overridable) subjective task + * credentials (COW) */ + struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ + char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) @@@ -1204,6 -1298,9 +1258,6 @@@ int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; -#ifdef CONFIG_SECURITY - void *security; -#endif struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; @@@ -1579,12 -1676,12 +1633,12 @@@ extern cputime_t task_gtime(struct task #ifdef CONFIG_SMP extern int set_cpus_allowed_ptr(struct task_struct *p, - const cpumask_t *new_mask); + const struct cpumask *new_mask); #else static inline int set_cpus_allowed_ptr(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - if (!cpu_isset(0, *new_mask)) + if (!cpumask_test_cpu(0, new_mask)) return -EINVAL; return 0; } @@@ -1760,6 -1857,7 +1814,6 @@@ static inline struct user_struct *get_u return u; } extern void free_uid(struct user_struct *); -extern void switch_uid(struct user_struct *); extern void release_uids(struct user_namespace *ns); #include @@@ -1778,6 -1876,9 +1832,6 @@@ extern void wake_up_new_task(struct tas extern void sched_fork(struct task_struct *p, int clone_flags); extern void sched_dead(struct task_struct *p); -extern int in_group_p(gid_t); -extern int in_egroup_p(gid_t); - extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); @@@ -1909,8 -2010,6 +1963,8 @@@ static inline unsigned long wait_task_i #define for_each_process(p) \ 
for (p = &init_task ; (p = next_task(p)) != &init_task ; ) +extern bool is_single_threaded(struct task_struct *); + /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. @@@ -2195,10 -2294,8 +2249,8 @@@ __trace_special(void *__tr, void *__dat } #endif - extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); - extern long sched_getaffinity(pid_t pid, cpumask_t *mask); - - extern int sched_mc_power_savings, sched_smt_power_savings; + extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); + extern long sched_getaffinity(pid_t pid, struct cpumask *mask); extern void normalize_rt_tasks(void); diff --combined init/Kconfig index 13627191a60,b3782c6d5ed..f6281711166 --- a/init/Kconfig +++ b/init/Kconfig @@@ -588,13 -588,6 +588,13 @@@ config KALLSYMS_AL Say N. +config KALLSYMS_STRIP_GENERATED + bool "Strip machine generated symbols from kallsyms" + depends on KALLSYMS_ALL + default y + help + Say N if you want kallsyms to retain even machine generated symbols. + config KALLSYMS_EXTRA_PASS bool "Do an extra kallsyms pass" depends on KALLSYMS @@@ -924,6 -917,15 +924,15 @@@ config KMO endif # MODULES + config INIT_ALL_POSSIBLE + bool + help + Back when each arch used to define their own cpu_online_map and + cpu_possible_map, some of them chose to initialize cpu_possible_map + with all 1s, and others with all 0s. When they were centralised, + it was better to provide this option than to break all the archs + and have several arch maintainers persuing me down dark alleys. + config STOP_MACHINE bool default y @@@ -936,90 -938,10 +945,90 @@@ source "block/Kconfig config PREEMPT_NOTIFIERS bool +choice + prompt "RCU Implementation" + default CLASSIC_RCU + config CLASSIC_RCU - def_bool !PREEMPT_RCU + bool "Classic RCU" help This option selects the classic RCU implementation that is designed for best read-side performance on non-realtime - systems. Classic RCU is the default. Note that the - PREEMPT_RCU symbol is used to select/deselect this option. + systems. + + Select this option if you are unsure. + +config TREE_RCU + bool "Tree-based hierarchical RCU" + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. + +config PREEMPT_RCU + bool "Preemptible RCU" + depends on PREEMPT + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + +endchoice + +config RCU_TRACE + bool "Enable tracing for RCU" + depends on TREE_RCU || PREEMPT_RCU + help + This option provides tracing in RCU which presents stats + in debugfs for debugging RCU implementation. + + Say Y here if you want to enable RCU tracing + Say N if you are unsure. + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on TREE_RCU + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the cube + root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit + systems and up to 262,144 for 64-bit systems. 
+ + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_EXACT + bool "Disable tree-based hierarchical RCU auto-balancing" + depends on TREE_RCU + default n + help + This option forces use of the exact RCU_FANOUT value specified, + regardless of imbalances in the hierarchy. This is useful for + testing RCU itself, and might one day be useful on systems with + strong NUMA behavior. + + Without RCU_FANOUT_EXACT, the code will balance the hierarchy. + + Say N if unsure. + +config TREE_RCU_TRACE + def_bool RCU_TRACE && TREE_RCU + select DEBUG_FS + help + This option provides tracing for the TREE_RCU implementation, + permitting Makefile to trivially select kernel/rcutree_trace.c. + +config PREEMPT_RCU_TRACE + def_bool RCU_TRACE && PREEMPT_RCU + select DEBUG_FS + help + This option provides tracing for the PREEMPT_RCU implementation, + permitting Makefile to trivially select kernel/rcupreempt_trace.c. diff --combined kernel/irq/chip.c index 6eb3c7952b6,b343deedae9..f63c706d25e --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@@ -46,7 -46,7 +46,7 @@@ void dynamic_irq_init(unsigned int irq desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpus_setall(desc->affinity); + cpumask_setall(&desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } @@@ -125,7 -125,6 +125,7 @@@ int set_irq_type(unsigned int irq, unsi return -ENODEV; } + type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) return 0; diff --combined kernel/irq/manage.c index 540f6c49f3f,10ad2f87ed9..61c4a9b6216 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@@ -79,7 -79,7 +79,7 @@@ int irq_can_set_affinity(unsigned int i * @cpumask: cpumask * */ - int irq_set_affinity(unsigned int irq, cpumask_t cpumask) + int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@@ -91,14 -91,14 +91,14 @@@ #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = cpumask; + cpumask_copy(&desc->pending_mask, cpumask); } #else - desc->affinity = cpumask; + cpumask_copy(&desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@@ -112,26 -112,24 +112,24 @@@ */ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { - cpumask_t mask; - if (!irq_can_set_affinity(irq)) return 0; - cpus_and(mask, cpu_online_map, irq_default_affinity); - /* * Preserve an userspace affinity setup, but make sure that * one of the targets is online. 
*/ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpus_intersects(desc->affinity, cpu_online_map)) - mask = desc->affinity; + if (cpumask_any_and(&desc->affinity, cpu_online_mask) + < nr_cpu_ids) + goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - desc->affinity = mask; - desc->chip->set_affinity(irq, mask); + cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity); + set_affinity: + desc->chip->set_affinity(irq, &desc->affinity); return 0; } @@@ -370,18 -368,16 +368,18 @@@ int __irq_set_trigger(struct irq_desc * return 0; } - ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); + /* caller masked out all except trigger mode flags */ + ret = chip->set_type(irq, flags); if (ret) pr_err("setting trigger mode %d for irq %u failed (%pF)\n", - (int)(flags & IRQF_TRIGGER_MASK), - irq, chip->set_type); + (int)flags, irq, chip->set_type); else { + if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) + flags |= IRQ_LEVEL; /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~IRQ_TYPE_SENSE_MASK; - desc->status |= flags & IRQ_TYPE_SENSE_MASK; + desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); + desc->status |= flags; } return ret; @@@ -461,8 -457,7 +459,8 @@@ __setup_irq(unsigned int irq, struct ir /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc, irq, new->flags); + ret = __irq_set_trigger(desc, irq, + new->flags & IRQF_TRIGGER_MASK); if (ret) { spin_unlock_irqrestore(&desc->lock, flags); @@@ -676,18 -671,6 +674,18 @@@ int request_irq(unsigned int irq, irq_h struct irq_desc *desc; int retval; + /* + * handle_IRQ_event() always ignores IRQF_DISABLED except for + * the _first_ irqaction (sigh). That can cause oopsing, but + * the behavior is classified as "will not fix" so we need to + * start nudging drivers away from using that idiom. + */ + if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) + == (IRQF_SHARED|IRQF_DISABLED)) + pr_warning("IRQ %d/%s: IRQF_DISABLED is not " + "guaranteed on shared IRQs\n", + irq, devname); + #ifdef CONFIG_LOCKDEP /* * Lockdep wants atomic interrupt handlers: diff --combined kernel/sched.c index fff1c4a20b6,756d981d91a..27ba1d642f0 --- a/kernel/sched.c +++ b/kernel/sched.c @@@ -209,6 -209,7 +209,6 @@@ void init_rt_bandwidth(struct rt_bandwi hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static inline int rt_bandwidth_enabled(void) @@@ -360,9 -361,7 +360,9 @@@ static inline struct task_group *task_g struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->user->tg; + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@@ -498,18 -497,26 +498,26 @@@ struct rt_rq */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; #endif + #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. 
Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; + #endif }; /* @@@ -603,8 -610,6 +611,8 @@@ struct rq #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; @@@ -1138,6 -1143,7 +1146,6 @@@ static void init_rq_hrtick(struct rq *r hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@@ -1514,7 -1520,7 +1522,7 @@@ static int tg_shares_up(struct task_gro struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu(i, sched_domain_span(sd)) { /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@@ -1535,7 -1541,7 +1543,7 @@@ if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@@ -1865,8 -1871,6 +1873,8 @@@ void set_task_cpu(struct task_struct *p clock_offset = old_rq->clock - new_rq->clock; + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) p->se.wait_start -= clock_offset; @@@ -2101,15 -2105,17 +2109,17 @@@ find_idlest_group(struct sched_domain * int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@@ -2141,17 -2147,14 +2151,14 @@@ * find_idlest_cpu - find the idlest cpu among the cpus in group. 
*/ static int - find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) + find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@@ -2193,7 -2196,6 +2200,6 @@@ static int sched_balance_self(int cpu, update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@@ -2202,14 -2204,13 +2208,13 @@@ continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@@ -2218,10 -2219,10 +2223,10 @@@ /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@@ -2266,7 -2267,7 +2271,7 @@@ static int try_to_wake_up(struct task_s cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@@ -2276,7 -2277,6 +2281,7 @@@ smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@@ -2315,7 -2315,7 +2320,7 @@@ else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@@ -2334,11 -2334,12 +2339,11 @@@ out_activate schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; out_running: - trace_sched_wakeup(rq, p); + trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; @@@ -2471,7 -2472,7 +2476,7 @@@ void wake_up_new_task(struct task_struc p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_sched_wakeup_new(rq, p); + trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@@ -2846,10 -2847,11 +2851,10 @@@ static void sched_migrate_task(struct t struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; - trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@@ -2911,7 -2913,7 +2916,7 @@@ int can_migrate_task(struct task_struc * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
*/ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@@ -3086,7 -3088,7 +3091,7 @@@ static int move_one_task(struct rq *thi static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@@ -3122,10 -3124,11 +3127,11 @@@ unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@@ -3134,13 -3137,8 +3140,8 @@@ max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@@ -3251,8 -3249,8 +3252,8 @@@ */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@@ -3267,8 -3265,8 +3268,8 @@@ if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@@ -3394,6 -3392,10 +3395,10 @@@ out_balanced if (this == group_leader && group_leader != group_min) { *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + cpumask_first(sched_group_cpus(group_leader)); + } return group_min; } #endif @@@ -3407,16 -3409,16 +3412,16 @@@ ret */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@@ -3446,7 -3448,7 +3451,7 @@@ */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@@ -3454,7 -3456,7 +3459,7 @@@ struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@@ -3514,8 -3516,8 +3519,8 @@@ redo /* All tasks on this runqueue were pinned 
by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@@ -3532,7 -3534,8 +3537,8 @@@ /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@@ -3607,7 -3610,7 +3613,7 @@@ out */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@@ -3616,7 -3619,7 +3622,7 @@@ int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@@ -3660,17 -3663,71 +3666,71 @@@ redo double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } if (!ld_moved) { + int active_balance = 0; + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. 
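 *
 * Editorial aside (not part of this patch): idle_balance() below makes the
 * same move as several other hunks here -- its scratch mask becomes a
 * cpumask_var_t.  With CONFIG_CPUMASK_OFFSTACK that type is a pointer and
 * must be paired with alloc/free; without it, it is a one-element array on
 * the stack and alloc_cpumask_var() is a cheap no-op returning true.
 * Minimal sketch of the pattern, with do_work() as a stand-in:
 *
 *	cpumask_var_t mask;
 *
 *	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
 *		return;				// allocation failed: just skip
 *	cpumask_copy(mask, cpu_online_mask);
 *	do_work(mask);
 *	free_cpumask_var(mask);
 *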
+ */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + if (active_balance) + wake_up_process(busiest->migration_thread); + } else sd->nr_balance_failed = 0; @@@ -3696,7 -3753,10 +3756,10 @@@ static void idle_balance(int this_cpu, struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@@ -3707,7 -3767,7 +3770,7 @@@ if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@@ -3722,6 -3782,7 +3785,7 @@@ */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* @@@ -3759,7 -3820,7 +3823,7 @@@ static void active_load_balance(struct /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@@ -3778,10 -3839,9 +3842,9 @@@ #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@@ -3809,7 -3869,7 +3872,7 @@@ int select_nohz_load_balancer(int stop_ int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@@ -3823,7 -3883,7 +3886,7 @@@ } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@@ -3836,10 -3896,10 +3899,10 @@@ } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@@ -3867,7 -3927,11 +3930,11 @@@ static void rebalance_domains(int cpu, unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. 
*/ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@@ -3892,7 -3956,7 +3959,7 @@@ } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@@ -3926,6 -3990,8 +3993,8 @@@ out */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* @@@ -3950,12 -4016,13 +4019,13 @@@ static void run_rebalance_domains(struc */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@@ -3993,7 -4060,7 +4063,7 @@@ static inline void trigger_load_balance rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@@ -4006,7 -4073,7 +4076,7 @@@ * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@@ -4018,7 -4085,7 +4088,7 @@@ * cpus with ticks stopped, is it time for that to stop? */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@@ -4028,7 -4095,7 +4098,7 @@@ * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@@ -5120,22 -5187,6 +5190,22 @@@ __setscheduler(struct rq *rq, struct ta set_load_weight(p); } +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; +} + static int __sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param, bool user) { @@@ -5195,7 -5246,8 +5265,7 @@@ recheck return -EPERM; /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) + if (!check_same_owner(p)) return -EPERM; } @@@ -5401,10 -5453,9 +5471,9 @@@ out_unlock return retval; } - long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) + long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@@ -5426,45 -5477,58 +5495,57 @@@ get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; - if ((current->euid 
!= p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); + out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + out_put_task: put_task_struct(p); put_online_cpus(); return retval; } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } @@@ -5477,17 -5541,20 +5558,20 @@@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } - long sched_getaffinity(pid_t pid, cpumask_t *mask) + long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@@ -5504,7 -5571,7 +5588,7 @@@ if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@@ -5523,19 -5590,24 +5607,24 @@@ asmlinkage long sys_sched_getaffinity(p unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** @@@ -5877,7 -5949,7 +5966,7 @@@ void __cpuinit init_idle(struct task_st idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@@ -5904,9 -5976,9 +5993,9 @@@ * indicates which cpus entered this state. 
This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ - cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@@ -5961,7 -6033,7 +6050,7 @@@ static inline void sched_init_granulari * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ - int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@@ -5969,13 -6041,13 +6058,13 @@@ int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@@ -5983,15 -6055,15 +6072,15 @@@ if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@@ -6033,7 -6105,7 +6122,7 @@@ static int __migrate_task(struct task_s if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@@ -6130,50 -6202,43 +6219,43 @@@ static int __migrate_task_irq(struct ta */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - cpumask_t mask; - struct rq *rq; int dest_cpu; + /* FIXME: Use cpumask_of_node here. */ + cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); + const struct cpumask *nodemask = &_nodemask; + + again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - do { - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. 
- * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. - */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + + move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* @@@ -6185,7 -6250,7 +6267,7 @@@ */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); unsigned long flags; local_irq_save(flags); @@@ -6475,7 -6540,7 +6557,7 @@@ static void set_rq_online(struct rq *rq if (!rq->online) { const struct sched_class *class; - cpu_set(rq->cpu, rq->rd->online); + cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; for_each_class(class) { @@@ -6495,7 -6560,7 +6577,7 @@@ static void set_rq_offline(struct rq *r class->rq_offline(rq); } - cpu_clear(rq->cpu, rq->rd->online); + cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; } } @@@ -6536,7 -6601,7 +6618,7 @@@ migration_call(struct notifier_block *n rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } @@@ -6550,7 -6615,7 +6632,7 @@@ break; /* Unbind it from offline cpu so it can run. Fall thru. 
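 *
 * Editorial aside (not part of this patch): move_task_off_dead_cpu() above
 * now picks a destination with cpumask_any_and() instead of building an
 * intersection and calling any_online_cpu().  The "nothing found" convention
 * is a return value >= nr_cpu_ids.  Condensed sketch of that selection, with
 * the cpuset fallback only hinted at:
 *
 *	dest = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
 *	if (dest >= nr_cpu_ids)
 *		dest = last_resort_cpu(p);	// hypothetical helper: widen
 *						// cpus_allowed via the cpuset,
 *						// then retry the lookup
 *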
*/ kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; @@@ -6600,7 -6665,7 +6682,7 @@@ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); @@@ -6639,13 -6704,13 +6721,13 @@@ early_initcall(migration_init) #ifdef CONFIG_SCHED_DEBUG static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) + struct cpumask *groupmask) { struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), sd->span); - cpus_clear(*groupmask); + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@@ -6659,11 -6724,11 +6741,11 @@@ printk(KERN_CONT "span %s level %s\n", str, sd->name); - if (!cpu_isset(cpu, sd->span)) { + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpu_isset(cpu, group->cpumask)) { + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@@ -6683,31 -6748,32 +6765,32 @@@ break; } - if (!cpus_weight(group->cpumask)) { + if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } - if (cpus_intersects(*groupmask, group->cpumask)) { + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(*groupmask, *groupmask, group->cpumask); + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), group->cpumask); + cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, *groupmask)) + if (!cpumask_equal(sched_domain_span(sd), groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@@ -6715,7 -6781,7 +6798,7 @@@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_t *groupmask; + cpumask_var_t groupmask; int level = 0; if (!sd) { @@@ -6725,8 -6791,7 +6808,7 @@@ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); return; } @@@ -6739,7 -6804,7 +6821,7 @@@ if (!sd) break; } - kfree(groupmask); + free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@@ -6747,7 -6812,7 +6829,7 @@@ static int sd_degenerate(struct sched_domain *sd) { - if (cpus_weight(sd->span) == 1) + if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ @@@ -6778,7 -6843,7 +6860,7 @@@ sd_parent_degenerate(struct sched_domai if (sd_degenerate(parent)) return 1; - if (!cpus_equal(sd->span, parent->span)) + if 
(!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Does parent contain flags not in child? */ @@@ -6802,6 -6867,16 +6884,16 @@@ return 1; } + static void free_rootdomain(struct root_domain *rd) + { + cpupri_cleanup(&rd->cpupri); + + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); + } + static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; @@@ -6811,38 -6886,63 +6903,63 @@@ if (rq->rd) { struct root_domain *old_rd = rq->rd; - if (cpu_isset(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); - cpu_clear(rq->cpu, old_rd->span); + cpumask_clear_cpu(rq->cpu, old_rd->span); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + free_rootdomain(old_rd); } atomic_inc(&rd->refcount); rq->rd = rd; - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } - static void init_rootdomain(struct root_domain *rd) + static int init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); - cpus_clear(rd->span); - cpus_clear(rd->online); + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri, true); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto free_rd; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; + return 0; - cpupri_init(&rd->cpupri); + free_rto_mask: + free_cpumask_var(rd->rto_mask); + free_online: + free_cpumask_var(rd->online); + free_span: + free_cpumask_var(rd->span); + free_rd: + kfree(rd); + return -ENOMEM; } static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain); + init_rootdomain(&def_root_domain, true); + atomic_set(&def_root_domain.refcount, 1); } @@@ -6854,7 -6954,10 +6971,10 @@@ static struct root_domain *alloc_rootdo if (!rd) return NULL; - init_rootdomain(rd); + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } return rd; } @@@ -6896,19 -6999,12 +7016,12 @@@ cpu_attach_domain(struct sched_domain * } /* cpus with isolated domains */ - static cpumask_t cpu_isolated_map = CPU_MASK_NONE; + static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); + cpulist_parse(str, cpu_isolated_map); return 1; } @@@ -6917,42 -7013,43 +7030,43 @@@ __setup("isolcpus=", isolated_cpu_setup /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). 
* * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ static void - init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, + init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) { struct sched_group *first = NULL, *last = NULL; int i; - cpus_clear(*covered); + cpumask_clear(covered); - for_each_cpu_mask_nr(i, *span) { + for_each_cpu(i, span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, *covered)) + if (cpumask_test_cpu(i, covered)) continue; - cpus_clear(sg->cpumask); + cpumask_clear(sched_group_cpus(sg)); sg->__cpu_power = 0; - for_each_cpu_mask_nr(j, *span) { + for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, *covered); - cpu_set(j, sg->cpumask); + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; @@@ -7016,9 -7113,10 +7130,10 @@@ static int find_next_best_node(int node * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ - static void sched_domain_node_span(int node, cpumask_t *span) + static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; + /* FIXME: use cpumask_of_node() */ node_to_cpumask_ptr(nodemask, node); int i; @@@ -7039,19 -7137,34 +7154,34 @@@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; + /* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. 
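 *
 * Editorial aside (not part of this patch): the wrappers defined just below
 * rely on the "mask hangs off the end" convention -- the scheduler structure
 * ends in a flexible bitmap member, the accessor returns a pointer to that
 * tail via to_cpumask(), and a static user reserves CONFIG_NR_CPUS bits right
 * behind the structure.  Rough sketch with hypothetical names (sg_like,
 * sg_cpus, sg_static):
 *
 *	struct sg_like {
 *		// ... other members ...
 *		unsigned long cpus[0];		// storage supplied by the user
 *	};
 *
 *	static inline struct cpumask *sg_cpus(struct sg_like *sg)
 *	{
 *		return to_cpumask(sg->cpus);
 *	}
 *
 *	struct sg_static {
 *		struct sg_like sg;
 *		DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
 *	};
 *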
+ */ + struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); + }; + + struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); + }; + /* * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT - static DEFINE_PER_CPU(struct sched_domain, cpu_domains); - static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); + static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); + static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int - cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) + cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); + *sg = &per_cpu(sched_group_cpus, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@@ -7060,56 -7173,55 +7190,55 @@@ * multi-core sched-domains: */ #ifdef CONFIG_SCHED_MC - static DEFINE_PER_CPU(struct sched_domain, core_domains); - static DEFINE_PER_CPU(struct sched_group, sched_group_core); + static DEFINE_PER_CPU(struct static_sched_domain, core_domains); + static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int - cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) + cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); if (sg) - *sg = &per_cpu(sched_group_core, group); + *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) static int - cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) + cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_core, cpu); + *sg = &per_cpu(sched_group_core, cpu).sg; return cpu; } #endif - static DEFINE_PER_CPU(struct sched_domain, phys_domains); - static DEFINE_PER_CPU(struct sched_group, sched_group_phys); + static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); + static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int - cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) + cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; #ifdef CONFIG_SCHED_MC + /* FIXME: Use cpu_coregroup_mask. 
*/ *mask = cpu_coregroup_map(cpu); cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); #else group = cpu; #endif if (sg) - *sg = &per_cpu(sched_group_phys, group); + *sg = &per_cpu(sched_group_phys, group).sg; return group; } @@@ -7123,19 -7235,21 +7252,21 @@@ static DEFINE_PER_CPU(struct sched_doma static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); - static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); + static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); - static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) + static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { int group; + /* FIXME: use cpumask_of_node */ + node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - *nodemask = node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); - group = first_cpu(*nodemask); + cpumask_and(nodemask, pnodemask, cpu_map); + group = cpumask_first(nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group); + *sg = &per_cpu(sched_group_allnodes, group).sg; return group; } @@@ -7147,11 -7261,11 +7278,11 @@@ static void init_numa_sched_groups_powe if (!sg) return; do { - for_each_cpu_mask_nr(j, sg->cpumask) { + for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { + sd = &per_cpu(phys_domains, j).sd; + if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each * physical package. @@@ -7168,11 -7282,12 +7299,12 @@@ #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ - static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) + static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { int cpu, i; - for_each_cpu_mask_nr(cpu, *cpu_map) { + for_each_cpu(cpu, cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@@ -7181,10 -7296,11 +7313,11 @@@ for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + /* FIXME: Use cpumask_of_node */ + node_to_cpumask_ptr(pnodemask, i); - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + cpus_and(*nodemask, *pnodemask, *cpu_map); + if (cpumask_empty(nodemask)) continue; if (sg == NULL) @@@ -7202,7 -7318,8 +7335,8 @@@ next_sg } } #else /* !CONFIG_NUMA */ - static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) + static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { } #endif /* CONFIG_NUMA */ @@@ -7228,7 -7345,7 +7362,7 @@@ static void init_sched_groups_power(in WARN_ON(!sd || !sd->groups); - if (cpu != first_cpu(sd->groups->cpumask)) + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) return; child = sd->child; @@@ -7293,48 -7410,6 +7427,6 @@@ SD_INIT_FUNC(CPU SD_INIT_FUNC(MC) #endif - /* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. 
- */ - struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - - #ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; - #endif - }; - - #if NR_CPUS > 128 - #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v - static inline void sched_cpumask_alloc(struct allmasks **masks) - { - *masks = kmalloc(sizeof(**masks), GFP_KERNEL); - } - static inline void sched_cpumask_free(struct allmasks *masks) - { - kfree(masks); - } - #else - #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v - static inline void sched_cpumask_alloc(struct allmasks **masks) - { } - static inline void sched_cpumask_free(struct allmasks *masks) - { } - #endif - - #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) @@@ -7374,17 -7449,38 +7466,38 @@@ static void set_domain_attribute(struc * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ - static int __build_sched_domains(const cpumask_t *cpu_map, + static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - int i; + int i, err = -ENOMEM; struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; #ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + goto free_covered; + #endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + + #ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ @@@ -7392,54 -7488,37 +7505,37 @@@ GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; + goto free_tmpmask; } #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); - #ifdef CONFIG_NUMA - kfree(sched_group_nodes); - #endif - return -ENOMEM; + goto free_sched_groups; } - /* get space for all scratch cpumask variables */ - sched_cpumask_alloc(&allmasks); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); #ifdef CONFIG_NUMA - kfree(sched_group_nodes); - #endif - return -ENOMEM; - } - - tmpmask = (cpumask_t *)allmasks; - - - #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; #endif /* * Set up domains for cpus specified by the cpu_map. 
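 *
 * Editorial aside (not part of this patch): with struct allmasks gone,
 * __build_sched_domains() above allocates each scratch mask separately and
 * unwinds through a ladder of labels, freeing in reverse order of
 * allocation.  The shape, reduced to two masks:
 *
 *	if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
 *		goto out;
 *	if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
 *		goto free_nodemask;
 *
 *	// ... build the domains, set err = 0 on success ...
 *
 *	free_cpumask_var(send_covered);
 * free_nodemask:
 *	free_cpumask_var(nodemask);
 * out:
 *	return err;
 *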
*/ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); + /* FIXME: use cpumask_of_node */ *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sd->span = *cpu_map; + cpumask_copy(sched_domain_span(sd), cpu_map); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@@ -7449,18 -7528,19 +7545,19 @@@ sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = p; if (p) p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); #endif p = sd; - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - sd->span = *nodemask; + cpumask_copy(sched_domain_span(sd), nodemask); sd->parent = p; if (p) p->child = sd; @@@ -7468,11 -7548,12 +7565,12 @@@ #ifdef CONFIG_SCHED_MC p = sd; - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = cpu_coregroup_map(i); - cpus_and(sd->span, sd->span, *cpu_map); + *sched_domain_span(sd) = cpu_coregroup_map(i); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@@ -7480,11 -7561,11 +7578,11 @@@ #ifdef CONFIG_SCHED_SMT p = sd; - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); - sd->span = per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@@ -7493,13 -7574,10 +7591,10 @@@ #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) + for_each_cpu(i, cpu_map) { + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, @@@ -7510,13 -7588,11 +7605,11 @@@ #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - + for_each_cpu(i, cpu_map) { + /* FIXME: Use cpu_coregroup_mask */ *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) + if (i != cpumask_first(this_core_map)) continue; init_sched_build_groups(this_core_map, cpu_map, @@@ -7527,12 -7603,10 +7620,10 @@@ /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - + /* FIXME: Use cpumask_of_node 
*/ *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; init_sched_build_groups(nodemask, cpu_map, @@@ -7543,8 -7617,6 +7634,6 @@@ #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, send_covered, tmpmask); @@@ -7553,58 -7625,58 +7642,58 @@@ for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); int j; + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); - cpus_clear(*covered); + cpumask_clear(covered); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { + if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); + cpumask_and(domainspan, domainspan, cpu_map); - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { + for_each_cpu(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = *nodemask; + cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; - cpus_or(*covered, *covered, *nodemask); + cpumask_or(covered, covered, nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % nr_node_ids; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, n); - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) break; - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) + cpumask_and(tmpmask, tmpmask, pnodemask); + if (cpumask_empty(tmpmask)) continue; - sg = kmalloc_node(sizeof(struct sched_group), + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING @@@ -7612,9 -7684,9 +7701,9 @@@ goto error; } sg->__cpu_power = 0; - sg->cpumask = *tmpmask; + cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); + cpumask_or(covered, covered, tmpmask); prev->next = sg; prev = sg; } @@@ -7623,22 -7695,22 +7712,22 @@@ /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = 
&per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@@ -7650,53 -7722,78 +7739,78 @@@ if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, tmpmask); init_numa_sched_groups_power(sg); } #endif /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; #else - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; #endif cpu_attach_domain(sd, rd, i); } - sched_cpumask_free(allmasks); - return 0; + err = 0; + + free_tmpmask: + free_cpumask_var(tmpmask); + free_send_covered: + free_cpumask_var(send_covered); + free_this_core_map: + free_cpumask_var(this_core_map); + free_this_sibling_map: + free_cpumask_var(this_sibling_map); + free_nodemask: + free_cpumask_var(nodemask); + free_notcovered: + #ifdef CONFIG_NUMA + free_cpumask_var(notcovered); + free_covered: + free_cpumask_var(covered); + free_domainspan: + free_cpumask_var(domainspan); + out: + #endif + return err; + + free_sched_groups: + #ifdef CONFIG_NUMA + kfree(sched_group_nodes); + #endif + goto free_tmpmask; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - sched_cpumask_free(allmasks); - kfree(rd); - return -ENOMEM; + free_rootdomain(rd); + goto free_tmpmask; #endif } - static int build_sched_domains(const cpumask_t *cpu_map) + static int build_sched_domains(const struct cpumask *cpu_map) { return __build_sched_domains(cpu_map, NULL); } - static cpumask_t *doms_cur; /* current sched domains */ + static struct cpumask *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. */ - static cpumask_t fallback_doms; + static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@@ -7713,16 -7810,16 +7827,16 @@@ int __attribute__((weak)) arch_update_c * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. 
*/ - static int arch_init_sched_domains(const cpumask_t *cpu_map) + static int arch_init_sched_domains(const struct cpumask *cpu_map) { int err; arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms_cur) - doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + doms_cur = fallback_doms; + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@@ -7730,8 -7827,8 +7844,8 @@@ return err; } - static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) + static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); } @@@ -7740,15 -7837,16 +7854,16 @@@ * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ - static void detach_destroy_domains(const cpumask_t *cpu_map) + static void detach_destroy_domains(const struct cpumask *cpu_map) { - cpumask_t tmpmask; + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; - for_each_cpu_mask_nr(i, *cpu_map) + for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@@ -7773,7 -7871,7 +7888,7 @@@ static int dattrs_equal(struct sched_do * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the @@@ -7787,13 -7885,14 +7902,14 @@@ * the single partition 'fallback_doms', it also forces the domains * to be rebuilt. * - * If doms_new == NULL it will be replaced with cpu_online_map. + * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. 
* * Call with hotplug lock held */ - void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + /* FIXME: Change to struct cpumask *doms_new[] */ + void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; @@@ -7812,7 -7911,7 +7928,7 @@@ /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) + if (cpumask_equal(&doms_cur[i], &doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } @@@ -7824,15 -7923,15 +7940,15 @@@ match1 if (doms_new == NULL) { ndoms_cur = 0; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + doms_new = fallback_doms; + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) + if (cpumask_equal(&doms_new[i], &doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } @@@ -7844,7 -7943,7 +7960,7 @@@ match2 } /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) + if (doms_cur != fallback_doms) kfree(doms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; @@@ -7873,14 -7972,25 +7989,25 @@@ int arch_reinit_sched_domains(void static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { int ret; + unsigned int level = 0; - if (buf[0] != '0' && buf[0] != '1') + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? 
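 *
 * Editorial aside (not part of this patch): the store hook above now accepts
 * a numeric level rather than only '0'/'1', which is what makes level 2
 * (POWERSAVINGS_BALANCE_WAKEUP) reachable from userspace -- typically via
 * something like "echo 2 > /sys/devices/system/cpu/sched_mc_power_savings".
 * The accepted range, as a sketch:
 *
 *	if (sscanf(buf, "%u", &level) != 1 ||
 *	    level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
 *		return -EINVAL;		// rejects anything above BALANCE_WAKEUP
 *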
+ */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) return -EINVAL; if (smt) - sched_smt_power_savings = (buf[0] == '1'); + sched_smt_power_savings = level; else - sched_mc_power_savings = (buf[0] == '1'); + sched_mc_power_savings = level; ret = arch_reinit_sched_domains(); @@@ -7984,7 -8094,9 +8111,9 @@@ static int update_runtime(struct notifi void __init sched_init_smp(void) { - cpumask_t non_isolated_cpus; + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@@ -7993,10 -8105,10 +8122,10 @@@ #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@@ -8011,9 -8123,13 +8140,13 @@@ init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); } #else void __init sched_init_smp(void) @@@ -8328,6 -8444,15 +8461,15 @@@ void __init sched_init(void */ current->sched_class = &fair_sched_class; + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); + #ifdef CONFIG_SMP + #ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); + #endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); + #endif /* SMP */ + scheduler_running = 1; } @@@ -9298,41 -9423,6 +9440,41 @@@ cpuacct_destroy(struct cgroup_subsys *s kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@@ -9340,8 -9430,17 +9482,8 @@@ u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. 
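 *
 * Editorial aside (not part of this patch): the new cpuacct_cpuusage_read()
 * and _write() helpers above keep the old trick -- a 64-bit per-cpu counter
 * cannot be loaded or stored atomically on 32-bit, so the runqueue lock is
 * taken around the access there, while 64-bit builds access it directly.
 * Reduced sketch of the read side:
 *
 *	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 *	u64 data;
 *
 *	#ifndef CONFIG_64BIT
 *		spin_lock_irq(&cpu_rq(cpu)->lock);	// make the 64-bit read atomic
 *		data = *cpuusage;
 *		spin_unlock_irq(&cpu_rq(cpu)->lock);
 *	#else
 *		data = *cpuusage;
 *	#endif
 *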
- */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@@ -9358,39 -9457,23 +9500,39 @@@ static int cpuusage_write(struct cgrou goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) diff --combined kernel/sched_fair.c index 5ad4440f0fc,36b5e34fa99..56c0efe902a --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@@ -492,8 -492,6 +492,8 @@@ static void update_curr(struct cfs_rq * * overflow on 32 bits): */ delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; @@@ -1019,16 -1017,33 +1019,33 @@@ static void yield_task_fair(struct rq * * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) + * hence we need to mask them out (cpu_active_mask) * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, struct task_struct *p) { - cpumask_t tmp; struct sched_domain *sd; int i; + unsigned int chosen_wakeup_cpu; + int this_cpu; + + /* + * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu + * are idle and this is not a kernel thread and this task's affinity + * allows it to be moved to preferred cpu, then just move! + */ + + this_cpu = smp_processor_id(); + chosen_wakeup_cpu = + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && + idle_cpu(cpu) && idle_cpu(this_cpu) && + p->mm && !(p->flags & PF_KTHREAD) && + cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) + return chosen_wakeup_cpu; /* * If it is idle, then it is the best cpu to run this task. 
@@@ -1046,10 -1061,9 +1063,9 @@@ if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { + for_each_cpu_and(i, sched_domain_span(sd), + &p->cpus_allowed) { + if (cpu_active(i) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@@ -1242,13 -1256,13 +1258,13 @@@ static int select_task_rq_fair(struct t * this_cpu and prev_cpu are present in: */ for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { + if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { this_sd = sd; break; } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) goto out; /* @@@ -1347,11 -1361,12 +1363,11 @@@ static void check_preempt_wakeup(struc { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); - if (unlikely(rt_prio(p->prio))) { - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_curr(cfs_rq); - update_rq_clock(rq); - update_curr(cfs_rq); + if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; } diff --combined kernel/sched_rt.c index 51d2af3e619,1bbd9901401..833b6d44483 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@@ -15,7 -15,7 +15,7 @@@ static inline void rt_set_overload(stru if (!rq->online) return; - cpu_set(rq->cpu, rq->rd->rto_mask); + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@@ -34,7 -34,7 +34,7 @@@ static inline void rt_clear_overload(st /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) @@@ -77,7 -77,7 +77,7 @@@ static inline u64 sched_rt_period(struc } #define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { @@@ -139,14 -139,14 +139,14 @@@ static int rt_se_boosted(struct sched_r } #ifdef CONFIG_SMP - static inline cpumask_t sched_rt_period_mask(void) + static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_rq(smp_processor_id())->rd->span; } #else - static inline cpumask_t sched_rt_period_mask(void) + static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } #endif @@@ -212,9 -212,9 +212,9 @@@ static inline int rt_rq_throttled(struc return rt_rq->rt_throttled; } - static inline cpumask_t sched_rt_period_mask(void) + static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } static inline @@@ -241,11 -241,11 +241,11 @@@ static int do_balance_runtime(struct rt int i, weight, more = 0; u64 rt_period; - weight = cpus_weight(rd->span); + weight = cpumask_weight(rd->span); spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@@ -324,7 -324,7 +324,7 @@@ static void __disable_runtime(struct r /* * Greedy reclaim, take back as much as we can. 
*/ - for_each_cpu_mask(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@@ -429,13 -429,13 +429,13 @@@ static inline int balance_runtime(struc static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; - cpumask_t span; + const struct cpumask *span; if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@@ -805,17 -805,20 +805,20 @@@ static int select_task_rq_rt(struct tas static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_t mask; + cpumask_var_t mask; if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, mask)) + goto free; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) + goto free; /* * There appears to be other cpus that can accept @@@ -824,6 -827,8 +827,8 @@@ */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); + free: + free_cpumask_var(mask); } #endif /* CONFIG_SMP */ @@@ -914,7 -919,7 +919,7 @@@ static void deactivate_task(struct rq * static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@@ -953,7 -958,7 +958,7 @@@ static struct task_struct *pick_next_hi return next; } - static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); + static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { @@@ -973,7 -978,7 +978,7 @@@ static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@@ -988,7 -993,7 +993,7 @@@ * I guess we might want to change cpupri_find() to ignore those * in the first place. */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * At this point we have built a mask of cpus representing the @@@ -998,7 -1003,7 +1003,7 @@@ * We prioritize the last cpu that the task executed on since * it is most likely cache-hot in that location. */ - if (cpu_isset(cpu, *lowest_mask)) + if (cpumask_test_cpu(cpu, lowest_mask)) return cpu; /* @@@ -1013,7 -1018,8 +1018,8 @@@ cpumask_t domain_mask; int best_cpu; - cpus_and(domain_mask, sd->span, *lowest_mask); + cpumask_and(&domain_mask, sched_domain_span(sd), + lowest_mask); best_cpu = pick_optimal_cpu(this_cpu, &domain_mask); @@@ -1054,8 -1060,8 +1060,8 @@@ static struct rq *find_lock_lowest_rq(s * Also make sure that it wasn't scheduled on its rq. 
*/ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { @@@ -1176,7 -1182,7 +1182,7 @@@ static int pull_rt_task(struct rq *this next = pick_next_task_rt(this_rq); - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@@ -1305,9 -1311,9 +1311,9 @@@ move_one_task_rt(struct rq *this_rq, in } static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - int weight = cpus_weight(*new_mask); + int weight = cpumask_weight(new_mask); BUG_ON(!rt_task(p)); @@@ -1328,7 -1334,7 +1334,7 @@@ update_rt_migration(rq); } - p->cpus_allowed = *new_mask; + cpumask_copy(&p->cpus_allowed, new_mask); p->rt.nr_cpus_allowed = weight; } @@@ -1371,6 -1377,14 +1377,14 @@@ static void switched_from_rt(struct rq if (!rq->rt.rt_nr_running) pull_rt_task(rq); } + + static inline void init_sched_rt_class(void) + { + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); + } #endif /* CONFIG_SMP */ /* @@@ -1541,3 -1555,4 +1555,4 @@@ static void print_rt_stats(struct seq_f rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ + diff --combined kernel/sched_stats.h index 3b01098164c,5fcf0e18458..f2773b5d122 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@@ -31,7 -31,7 +31,7 @@@ static int show_schedstat(struct seq_fi rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, - rq->rq_sched_info.cpu_time, + rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@@ -42,7 -42,8 +42,8 @@@ for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, sd->span); + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { @@@ -123,7 -124,7 +124,7 @@@ static inline voi rq_sched_info_depart(struct rq *rq, unsigned long long delta) { if (rq) - rq->rq_sched_info.cpu_time += delta; + rq->rq_cpu_time += delta; } static inline void @@@ -236,6 -237,7 +237,6 @@@ static inline void sched_info_depart(st unsigned long long delta = task_rq(t)->clock - t->sched_info.last_arrival; - t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); if (t->state == TASK_RUNNING) diff --combined kernel/time/tick-sched.c index 8f3fc2582d3,70f872c71f4..76a574bbef9 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@@ -144,7 -144,7 +144,7 @@@ void tick_nohz_update_jiffies(void if (!ts->tick_stopped) return; - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@@ -247,7 -247,7 +247,7 @@@ void tick_nohz_stop_sched_tick(int inid if (need_resched()) goto end; - if (unlikely(local_softirq_pending())) { + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; if (ratelimit < 10) { @@@ -282,31 -282,8 +282,31 @@@ /* Schedule the tick, if we are at least one jiffie off */ if ((long)delta_jiffies >= 1) { + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be 
taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. + */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + goto out; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@@ -319,7 -296,7 +319,7 @@@ /* * sched tick not stopped! */ - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); goto out; } @@@ -329,6 -306,17 +329,6 @@@ rcu_enter_nohz(); } - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. - */ - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->idle_sleeps++; /* @@@ -344,7 -332,12 +344,7 @@@ goto out; } - /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); + /* Mark expiries */ ts->idle_expires = expires; if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { @@@ -361,7 -354,7 +361,7 @@@ * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@@ -439,7 -432,7 +439,7 @@@ void tick_nohz_restart_sched_tick(void select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); /* * We stopped the tick in idle. 
Update process times would miss the @@@ -688,6 -681,7 +688,6 @@@ void tick_setup_sched_timer(void */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --combined kernel/trace/trace.c index 4185d522163,6adf660fc81..0e91f43b6ba --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@@ -30,6 -30,7 +30,6 @@@ #include #include #include -#include #include #include @@@ -286,7 -287,6 +286,7 @@@ static const char *trace_options[] = "annotate", "userstacktrace", "sym-userobj", + "printk-msg-only", NULL }; @@@ -320,7 -320,7 +320,7 @@@ __update_max_tr(struct trace_array *tr memcpy(data->comm, tsk->comm, TASK_COMM_LEN); data->pid = tsk->pid; - data->uid = tsk->uid; + data->uid = task_uid(tsk); data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; data->policy = tsk->policy; data->rt_priority = tsk->rt_priority; @@@ -678,16 -678,6 +678,16 @@@ void tracing_reset(struct trace_array * ftrace_enable_cpu(); } +void tracing_reset_online_cpus(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); +} + #define SAVED_CMDLINES 128 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; @@@ -1309,7 -1299,7 +1309,7 @@@ enum trace_file_type TRACE_FILE_ANNOTATE = 2, }; -static void trace_iterator_increment(struct trace_iterator *iter, int cpu) +static void trace_iterator_increment(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); @@@ -1388,7 -1378,7 +1388,7 @@@ static void *find_next_entry_inc(struc iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); if (iter->ent) - trace_iterator_increment(iter, iter->cpu); + trace_iterator_increment(iter); return iter->ent ? iter : NULL; } @@@ -1757,13 -1747,6 +1757,13 @@@ lat_print_timestamp(struct trace_seq *s static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /* * The message is supposed to contain an ending newline. * If the printing stops prematurely, try to add a newline of our own. @@@ -1832,6 -1815,7 +1832,6 @@@ print_lat_fmt(struct trace_iterator *it char *comm; int S, T; int i; - unsigned state; if (entry->type == TRACE_CONT) return TRACE_TYPE_HANDLED; @@@ -1877,8 -1861,12 +1877,8 @@@ trace_assign_type(field, entry); - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - - state = field->prev_state ? - __ffs(field->prev_state) + 1 : 0; - S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, @@@ -2019,8 -2007,10 +2019,8 @@@ static enum print_line_t print_trace_fm trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? 
- state_to_char[field->next_state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", field->prev_pid, field->prev_prio, @@@ -2150,9 -2140,12 +2150,9 @@@ static enum print_line_t print_raw_fmt( trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@@ -2239,9 -2232,12 +2239,9 @@@ static enum print_line_t print_hex_fmt( trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); @@@ -2269,25 -2265,6 +2269,25 @@@ return TRACE_TYPE_HANDLED; } +static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@@ -2368,11 -2345,6 +2368,11 @@@ static enum print_line_t print_trace_li return ret; } + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_printk_msg_only(iter); + if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@@ -2453,7 -2425,7 +2453,7 @@@ __tracing_open(struct inode *inode, str /* Notify the tracer early; before we stop tracing. 
*/ if (iter->trace && iter->trace->open) - iter->trace->open(iter); + iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ if (ring_buffer_overruns(iter->tr->buffer)) @@@ -2674,7 -2646,7 +2674,7 @@@ tracing_cpumask_read(struct file *filp mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, &tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@@ -2695,7 -2667,7 +2695,7 @@@ tracing_cpumask_write(struct file *filp int err, cpu; mutex_lock(&tracing_cpumask_update_lock); - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new); if (err) goto err_unlock; diff --combined lib/Kconfig index fd4118e097f,7823f8342ab..2ba43c4a5b0 --- a/lib/Kconfig +++ b/lib/Kconfig @@@ -64,8 -64,6 +64,8 @@@ config CRC config LIBCRC32C tristate "CRC32c (Castagnoli, et al) Cyclic Redundancy-Check" + select CRYPTO + select CRYPTO_CRC32C help This option is provided for the case where no in-kernel-tree modules require CRC32c functions, but a module built outside the @@@ -159,4 -157,11 +159,11 @@@ config CHECK_SIGNATUR config HAVE_LMB boolean + config CPUMASK_OFFSTACK + bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS + help + Use dynamic allocation for cpumask_var_t, instead of putting + them on the stack. This is a bit more expensive, but avoids + stack overflow. + endmenu diff --combined mm/slub.c index 6cb7ad10785,8e516e29f98..0d861c3154b --- a/mm/slub.c +++ b/mm/slub.c @@@ -24,7 -24,6 +24,7 @@@ #include #include #include +#include /* * Lock order: @@@ -154,10 -153,6 +154,10 @@@ #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +#define OO_SHIFT 16 +#define OO_MASK ((1 << OO_SHIFT) - 1) +#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ + /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ @@@ -183,7 -178,7 +183,7 @@@ static LIST_HEAD(slab_caches) * Tracking user of a slab. 
*/ struct track { - void *addr; /* Called from address */ + unsigned long addr; /* Called from address */ int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@@ -295,7 -290,7 +295,7 @@@ static inline struct kmem_cache_order_o unsigned long size) { struct kmem_cache_order_objects x = { - (order << 16) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + (PAGE_SIZE << order) / size }; return x; @@@ -303,12 -298,12 +303,12 @@@ static inline int oo_order(struct kmem_cache_order_objects x) { - return x.x >> 16; + return x.x >> OO_SHIFT; } static inline int oo_objects(struct kmem_cache_order_objects x) { - return x.x & ((1 << 16) - 1); + return x.x & OO_MASK; } #ifdef CONFIG_SLUB_DEBUG @@@ -372,7 -367,7 +372,7 @@@ static struct track *get_track(struct k } static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) + enum track_item alloc, unsigned long addr) { struct track *p; @@@ -396,8 -391,8 +396,8 @@@ static void init_tracking(struct kmem_c if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); } static void print_track(const char *s, struct track *t) @@@ -406,7 -401,7 +406,7 @@@ return; printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, t->addr, jiffies - t->when, t->cpu, t->pid); + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) @@@ -697,7 -692,7 +697,7 @@@ static int check_object(struct kmem_cac if (!check_valid_pointer(s, page, get_freepointer(s, p))) { object_err(s, page, p, "Freepointer corrupt"); /* - * No choice but to zap it and thus loose the remainder + * No choice but to zap it and thus lose the remainder * of the free objects in this slab. May cause * another error because the object count is now wrong. */ @@@ -769,8 -764,8 +769,8 @@@ static int on_freelist(struct kmem_cach } max_objects = (PAGE_SIZE << compound_order(page)) / s->size; - if (max_objects > 65535) - max_objects = 65535; + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { slab_err(s, page, "Wrong number of objects. Found %d but " @@@ -871,7 -866,7 +871,7 @@@ static void setup_object_debug(struct k } static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; @@@ -911,7 -906,7 +911,7 @@@ bad } static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@@ -1034,10 -1029,10 +1034,10 @@@ static inline void setup_object_debug(s struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@@ -1504,8 -1499,8 +1504,8 @@@ static inline int node_match(struct kme * we need to allocate a new slab. 
This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) { void **object; struct page *new; @@@ -1589,18 -1584,13 +1589,18 @@@ debug * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; unsigned long flags; unsigned int objsize; + might_sleep_if(gfpflags & __GFP_WAIT); + + if (should_failslab(s->objsize, gfpflags)) + return NULL; + local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); objsize = c->objsize; @@@ -1623,14 -1613,14 +1623,14 @@@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + return slab_alloc(s, gfpflags, -1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + return slab_alloc(s, gfpflags, node, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif @@@ -1644,7 -1634,7 +1644,7 @@@ * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr, unsigned int offset) + void *x, unsigned long addr, unsigned int offset) { void *prior; void **object = (void *)x; @@@ -1714,7 -1704,7 +1714,7 @@@ debug * with all sorts of special processing. */ static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) + struct page *page, void *x, unsigned long addr) { void **object = (void *)x; struct kmem_cache_cpu *c; @@@ -1741,11 -1731,11 +1741,11 @@@ void kmem_cache_free(struct kmem_cache page = virt_to_head_page(x); - slab_free(s, page, x, __builtin_return_address(0)); + slab_free(s, page, x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab object the object resides */ +/* Figure out on which slab page the object resides */ static struct page *get_object_page(const void *x) { struct page *page = virt_to_head_page(x); @@@ -1817,8 -1807,8 +1817,8 @@@ static inline int slab_order(int size, int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > 65535) - return get_order(size * 65535) - 1; + if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); @@@ -2083,7 -2073,8 +2083,7 @@@ static inline int alloc_kmem_cache_cpus * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. 
*/ -static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) { struct page *page; struct kmem_cache_node *n; @@@ -2121,6 -2112,7 +2121,6 @@@ local_irq_save(flags); add_partial(n, page, 0); local_irq_restore(flags); - return n; } static void free_kmem_cache_nodes(struct kmem_cache *s) @@@ -2152,7 -2144,8 +2152,7 @@@ static int init_kmem_cache_nodes(struc n = &s->local_node; else { if (slab_state == DOWN) { - n = early_kmem_cache_node_alloc(gfpflags, - node); + early_kmem_cache_node_alloc(gfpflags, node); continue; } n = kmem_cache_alloc_node(kmalloc_caches, @@@ -2666,7 -2659,7 +2666,7 @@@ void *__kmalloc(size_t size, gfp_t flag if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, __builtin_return_address(0)); + return slab_alloc(s, flags, -1, _RET_IP_); } EXPORT_SYMBOL(__kmalloc); @@@ -2694,7 -2687,7 +2694,7 @@@ void *__kmalloc_node(size_t size, gfp_ if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, __builtin_return_address(0)); + return slab_alloc(s, flags, node, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_node); #endif @@@ -2751,7 -2744,7 +2751,7 @@@ void kfree(const void *x put_page(page); return; } - slab_free(page->slab, page, object, __builtin_return_address(0)); + slab_free(page->slab, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@@ -3130,12 -3123,8 +3130,12 @@@ struct kmem_cache *kmem_cache_create(co s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); - if (sysfs_slab_alias(s, name)) + if (sysfs_slab_alias(s, name)) { + down_write(&slub_lock); + s->refcount--; + up_write(&slub_lock); goto err; + } return s; } @@@ -3145,13 -3134,8 +3145,13 @@@ size, align, flags, ctor)) { list_add(&s->list, &slab_caches); up_write(&slub_lock); - if (sysfs_slab_add(s)) + if (sysfs_slab_add(s)) { + down_write(&slub_lock); + list_del(&s->list); + up_write(&slub_lock); + kfree(s); goto err; + } return s; } kfree(s); @@@ -3218,7 -3202,7 +3218,7 @@@ static struct notifier_block __cpuinitd #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; @@@ -3234,7 -3218,7 +3234,7 @@@ } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) + int node, unsigned long caller) { struct kmem_cache *s; @@@ -3445,7 -3429,7 +3445,7 @@@ static void resiliency_test(void) {} struct location { unsigned long count; - void *addr; + unsigned long addr; long long sum_time; long min_time; long max_time; @@@ -3493,7 -3477,7 +3493,7 @@@ static int add_location(struct loc_trac { long start, end, pos; struct location *l; - void *caddr; + unsigned long caddr; unsigned long age = jiffies - track->when; start = -1; @@@ -3642,7 -3626,7 +3642,7 @@@ static int list_locations(struct kmem_c len < PAGE_SIZE - 60) { len += sprintf(buf + len, " cpus="); len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->cpus); + &l->cpus); } if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && @@@ -4361,7 -4345,7 +4361,7 @@@ static void sysfs_slab_remove(struct km /* * Need to buffer aliases during bootup until sysfs becomes - * available lest we loose that information. + * available lest we lose that information. */ struct saved_alias { struct kmem_cache *s;