From: Mike Travis Date: Thu, 1 Jan 2009 01:34:16 +0000 (-0800) Subject: Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux... X-Git-Tag: v2.6.29-rc1~521^2~11 X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=7eb19553369c46cc1fa64caf120cbcab1b597f7c;hp=-c;p=linux-2.6-omap-h63xx.git Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-cpumask into merge-rr-cpumask Conflicts: arch/x86/kernel/io_apic.c kernel/rcuclassic.c kernel/sched.c kernel/time/tick-sched.c Signed-off-by: Mike Travis [ mingo@elte.hu: backmerged typo fix for io_apic.c ] Signed-off-by: Ingo Molnar --- 7eb19553369c46cc1fa64caf120cbcab1b597f7c diff --combined arch/ia64/include/asm/topology.h index a3cc9f65f95,97ae7f50910..76a33a91ca6 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@@ -34,6 -34,7 +34,7 @@@ * Returns a bitmask of CPUs on Node 'node'. */ #define node_to_cpumask(node) (node_to_cpu_mask[node]) + #define cpumask_of_node(node) (&node_to_cpu_mask[node]) /* * Returns the number of the node containing Node 'nid'. @@@ -45,7 -46,7 +46,7 @@@ /* * Returns the number of the first CPU on Node 'node'. 
*/ - #define node_to_first_cpu(node) (first_cpu(node_to_cpumask(node))) + #define node_to_first_cpu(node) (cpumask_first(cpumask_of_node(node))) /* * Determines the node for a given pci bus @@@ -55,6 -56,7 +56,6 @@@ void build_cpu_to_node_map(void); #define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ @@@ -79,6 -81,7 +80,6 @@@ /* sched_domains SD_NODE_INIT for IA64 NUMA machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ @@@ -109,6 -112,8 +110,8 @@@ #define topology_core_id(cpu) (cpu_data(cpu)->core_id) #define topology_core_siblings(cpu) (cpu_core_map[cpu]) #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) + #define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) + #define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) #define smt_capable() (smp_num_siblings > 1) #endif @@@ -119,6 -124,10 +122,10 @@@ extern void arch_fix_phys_package_id(in node_to_cpumask(pcibus_to_node(bus)) \ ) + #define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? 
\ + cpu_all_mask : \ + cpumask_from_node(pcibus_to_node(bus))) + #include #endif /* _ASM_IA64_TOPOLOGY_H */ diff --combined arch/mips/include/asm/mach-ip27/topology.h index 1fb959f9898,c1c3f5b2f18..55d481569a1 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@@ -25,11 -25,13 +25,13 @@@ extern struct cpuinfo_ip27 sn_cpu_info[ #define cpu_to_node(cpu) (sn_cpu_info[(cpu)].p_nodeid) #define parent_node(node) (node) #define node_to_cpumask(node) (hub_data(node)->h_cpus) - #define node_to_first_cpu(node) (first_cpu(node_to_cpumask(node))) + #define cpumask_of_node(node) (&hub_data(node)->h_cpus) + #define node_to_first_cpu(node) (cpumask_first(cpumask_of_node(node))) struct pci_bus; extern int pcibus_to_node(struct pci_bus *); #define pcibus_to_cpumask(bus) (cpu_online_map) + #define cpumask_of_pcibus(bus) (cpu_online_mask) extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; @@@ -37,6 -39,7 +39,6 @@@ /* sched_domains SD_NODE_INIT for SGI IP27 machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ diff --combined arch/powerpc/include/asm/topology.h index 373fca394a5,236dae1cd29..375258559ae --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@@ -22,11 -22,11 +22,11 @@@ static inline cpumask_t node_to_cpumask return numa_cpumask_lookup_table[node]; } + #define cpumask_of_node(node) (&numa_cpumask_lookup_table[node]) + static inline int node_to_first_cpu(int node) { - cpumask_t tmp; - tmp = node_to_cpumask(node); - return first_cpu(tmp); + return cpumask_first(cpumask_of_node(node)); } int of_node_to_nid(struct device_node *device); @@@ -46,8 -46,13 +46,12 @@@ static inline int pcibus_to_node(struc node_to_cpumask(pcibus_to_node(bus)) \ ) + #define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? 
\ + cpu_all_mask : \ + cpumask_of_node(pcibus_to_node(bus))) + /* sched_domains SD_NODE_INIT for PPC64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ @@@ -108,6 -113,8 +112,8 @@@ static inline void sysfs_remove_device_ #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) #define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) + #define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) + #define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) #define topology_core_id(cpu) (cpu_to_core_id(cpu)) #endif #endif diff --combined arch/sh/include/asm/topology.h index 279d9cc4a00,9aa160d0efe..066f0fba590 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@@ -5,6 -5,7 +5,6 @@@ /* sched_domains SD_NODE_INIT for sh machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ .parent = NULL, \ .child = NULL, \ .groups = NULL, \ @@@ -32,6 -33,7 +32,7 @@@ #define parent_node(node) ((void)(node),0) #define node_to_cpumask(node) ((void)node, cpu_online_map) + #define cpumask_of_node(node) ((void)node, cpu_online_mask) #define node_to_first_cpu(node) ((void)(node),0) #define pcibus_to_node(bus) ((void)(bus), -1) diff --combined arch/x86/Kconfig index 0ca2eb7573c,0f44add3e0b..249d1e0824b --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@@ -19,6 -19,8 +19,8 @@@ config X86_6 config X86 def_bool y select HAVE_AOUT if X86_32 + select HAVE_READQ + select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE @@@ -90,6 -92,10 +92,10 @@@ config GENERIC_IOMA config GENERIC_BUG def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + + config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT def_bool y @@@ -244,16 -250,19 +250,19 @@@ config X86_HAS_BOOT_CPU_I config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ - default y 
help - This enables support for sparse irq, esp for msi/msi-x. You may need - if you have lots of cards supports msi-x installed. + This enables support for sparse irqs. This is useful for distro + kernels that want to define a high CONFIG_NR_CPUS value but still + want to have low kernel memory footprint on smaller machines. - If you don't know what to do here, say Y. + ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread + out the irq_desc[] array in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. config NUMA_MIGRATE_IRQ_DESC bool "Move irq desc when changing irq smp_affinity" - depends on SPARSE_IRQ && SMP + depends on SPARSE_IRQ && NUMA default n help This enables moving irq_desc to cpu/node that irq will use handled. @@@ -264,21 -273,13 +273,13 @@@ config X86_FIND_SMP_CONFI def_bool y depends on X86_MPPARSE || X86_VOYAGER - if ACPI config X86_MPPARSE - def_bool y - bool "Enable MPS table" + bool "Enable MPS table" if ACPI + default y depends on X86_LOCAL_APIC help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it - endif - - if !ACPI - config X86_MPPARSE - def_bool y - depends on X86_LOCAL_APIC - endif choice prompt "Subarchitecture Type" @@@ -500,7 -501,7 +501,7 @@@ config HPET_TIME The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, as it is off-chip. You can find the HPET spec at - . + . You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. @@@ -587,7 -588,7 +588,7 @@@ config AMD_IOMM # need this always selected by IOMMU for the VIA workaround config SWIOTLB - bool + def_bool y if X86_64 help Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. 
the current generation @@@ -600,20 -601,19 +601,20 @@@ config IOMMU_HELPE config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && BROKEN + depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + select CPUMASK_OFFSTACK default n help Configure maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. config NR_CPUS - int "Maximum number of CPUs (2-512)" if !MAXSMP - range 2 512 - depends on SMP + int "Maximum number of CPUs" if SMP && !MAXSMP + range 2 512 if SMP && !MAXSMP + default "1" if !SMP default "4096" if MAXSMP - default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 - default "8" + default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "8" if SMP help This allows you to specify the maximum number of CPUs which this kernel will support. The maximum supported value is 512 and the @@@ -679,6 -679,30 +680,30 @@@ config X86_VISWS_API def_bool y depends on X86_32 && X86_VISWS + config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC + help + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of + superfluous "boot interrupts" cannot be disabled. + + Some chipsets generate a legacy INTx "boot IRQ" when the IRQ + entry in the chipset's IO-APIC is masked (as, e.g. the RT + kernel does during interrupt handling). On chipsets where this + boot IRQ generation cannot be disabled, this workaround keeps + the original IRQ line masked so that only the equivalent "boot + IRQ" is delivered to the CPUs. The workaround also tells the + kernel to set up the IRQ handler on the boot IRQ line. In this + way only one interrupt is delivered to the kernel. Otherwise + the spurious second interrupt may cause the kernel to bring + down (vital) interrupt lines. 
+ + Only affects "broken" chipsets. Interrupt sharing may be + increased on these systems. + config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER @@@ -975,24 -999,37 +1000,37 @@@ config X86_PA config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE + config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 + help + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". + # Common NUMA Features config NUMA - bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help Enable NUMA (Non Uniform Memory Access) support. + The kernel will try to allocate memory used by a CPU on the local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 32-bit this is currently highly experimental and should be only - used for kernel development. It might also cause boot failures. - For 64-bit this is recommended on all multiprocessor Opteron systems. - If the system is EM64T, you should say N unless your system is - EM64T NUMA. + For 64-bit this is recommended if the system is Intel Core i7 + (or later), AMD Opteron, or EM64T NUMA. + + For 32-bit this is only needed on (rare) 32-bit-only platforms + that support NUMA topologies, such as NUMAQ / Summit, or if you + boot a 32-bit kernel on a 64-bit NUMA platform. + + Otherwise, you should say N. 
comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) @@@ -1512,6 -1549,10 +1550,10 @@@ config ARCH_ENABLE_MEMORY_HOTPLU def_bool y depends on X86_64 || (X86_32 && HIGHMEM) + config ARCH_ENABLE_MEMORY_HOTREMOVE + def_bool y + depends on MEMORY_HOTPLUG + config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA diff --combined arch/x86/include/asm/irq.h index 4bb732e45a8,28e409fc73f..592688ed04d --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@@ -31,13 -31,9 +31,9 @@@ static inline int irq_canonicalize(int # endif #endif - #ifdef CONFIG_IRQBALANCE - extern int irqbalance_disable(char *str); - #endif - #ifdef CONFIG_HOTPLUG_CPU #include -extern void fixup_irqs(cpumask_t map); +extern void fixup_irqs(void); #endif extern unsigned int do_IRQ(struct pt_regs *regs); @@@ -46,6 -42,5 +42,6 @@@ extern void native_init_IRQ(void) /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); +extern int vector_used_by_percpu_irq(unsigned int vector); #endif /* _ASM_X86_IRQ_H */ diff --combined arch/x86/include/asm/topology.h index 79e31e9dcdd,168203c0c31..4e2f2e0aab2 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@@ -61,13 -61,19 +61,19 @@@ static inline int cpu_to_node(int cpu * * Side note: this function creates the returned cpumask on the stack * so with a high NR_CPUS count, excessive stack space is used. The - * node_to_cpumask_ptr function should be used whenever possible. + * cpumask_of_node function should be used whenever possible. */ static inline cpumask_t node_to_cpumask(int node) { return node_to_cpumask_map[node]; } + /* Returns a bitmask of CPUs on Node 'node'. */ + static inline const struct cpumask *cpumask_of_node(int node) + { + return &node_to_cpumask_map[node]; + } + #else /* CONFIG_X86_64 */ /* Mappings between node number and cpus on that node. 
*/ @@@ -82,7 -88,7 +88,7 @@@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_n #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern int cpu_to_node(int cpu); extern int early_cpu_to_node(int cpu); - extern const cpumask_t *_node_to_cpumask_ptr(int node); + extern const cpumask_t *cpumask_of_node(int node); extern cpumask_t node_to_cpumask(int node); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ @@@ -103,7 -109,7 +109,7 @@@ static inline int early_cpu_to_node(in } /* Returns a pointer to the cpumask of CPUs on Node 'node'. */ - static inline const cpumask_t *_node_to_cpumask_ptr(int node) + static inline const cpumask_t *cpumask_of_node(int node) { return &node_to_cpumask_map[node]; } @@@ -116,12 -122,15 +122,15 @@@ static inline cpumask_t node_to_cpumask #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ - /* Replace default node_to_cpumask_ptr with optimized version */ + /* + * Replace default node_to_cpumask_ptr with optimized version + * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" + */ #define node_to_cpumask_ptr(v, node) \ - const cpumask_t *v = _node_to_cpumask_ptr(node) + const cpumask_t *v = cpumask_of_node(node) #define node_to_cpumask_ptr_next(v, node) \ - v = _node_to_cpumask_ptr(node) + v = cpumask_of_node(node) #endif /* CONFIG_X86_64 */ @@@ -187,7 -196,7 +196,7 @@@ extern int __node_distance(int, int) #define cpu_to_node(cpu) 0 #define early_cpu_to_node(cpu) 0 - static inline const cpumask_t *_node_to_cpumask_ptr(int node) + static inline const cpumask_t *cpumask_of_node(int node) { return &cpu_online_map; } @@@ -200,12 -209,15 +209,15 @@@ static inline int node_to_first_cpu(in return first_cpu(cpu_online_map); } - /* Replace default node_to_cpumask_ptr with optimized version */ + /* + * Replace default node_to_cpumask_ptr with optimized version + * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" + */ #define node_to_cpumask_ptr(v, node) \ - const cpumask_t *v = _node_to_cpumask_ptr(node) + const cpumask_t *v = cpumask_of_node(node) #define 
node_to_cpumask_ptr_next(v, node) \ - v = _node_to_cpumask_ptr(node) + v = cpumask_of_node(node) #endif #include @@@ -214,20 -226,18 +226,20 @@@ /* Returns the number of the first CPU on Node 'node'. */ static inline int node_to_first_cpu(int node) { - node_to_cpumask_ptr(mask, node); - return first_cpu(*mask); + return cpumask_first(cpumask_of_node(node)); } #endif extern cpumask_t cpu_coregroup_map(int cpu); + extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) +#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) +#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) /* indicates that pointers to the topology cpumask_t maps are valid */ #define arch_provides_topology_pointers yes diff --combined arch/x86/kernel/apic.c index b9019271af6,6107b41da9a..6b7f824db16 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@@ -30,6 -30,7 +30,7 @@@ #include #include #include + #include #include #include @@@ -118,6 -119,8 +119,6 @@@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_o int first_system_vector = 0xfe; -char system_vectors[NR_VECTORS] = { [0 ... 
NR_VECTORS-1] = SYS_VECTOR_FREE}; - /* * Debug level, exported for io_apic.c */ @@@ -139,7 -142,7 +140,7 @@@ static int lapic_next_event(unsigned lo struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); -static void lapic_timer_broadcast(const struct cpumask *mask); +static void lapic_timer_broadcast(const cpumask_t *mask); static void apic_pm_activate(void); /* @@@ -452,10 -455,10 +453,10 @@@ static void lapic_timer_setup(enum cloc /* * Local APIC timer broadcast function */ -static void lapic_timer_broadcast(const struct cpumask *mask) +static void lapic_timer_broadcast(const cpumask_t *mask) { #ifdef CONFIG_SMP - send_IPI_mask(*mask, LOCAL_TIMER_VECTOR); + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif } @@@ -775,11 -778,7 +776,7 @@@ static void local_apic_timer_interrupt( /* * the NMI deadlock-detector uses this. */ - #ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); - #else - per_cpu(irq_stat, cpu).apic_timer_irqs++; - #endif + inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); } @@@ -792,7 -791,7 +789,7 @@@ * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ - void smp_apic_timer_interrupt(struct pt_regs *regs) + void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@@ -806,9 -805,7 +803,7 @@@ * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. 
*/ - #ifdef CONFIG_X86_64 exit_idle(); - #endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@@ -1666,9 -1663,7 +1661,7 @@@ void smp_spurious_interrupt(struct pt_r { u32 v; - #ifdef CONFIG_X86_64 exit_idle(); - #endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@@ -1679,14 -1674,11 +1672,11 @@@ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); - #ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); - #else + inc_irq_stat(irq_spurious_count); + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; - #endif irq_exit(); } @@@ -1697,9 -1689,7 +1687,7 @@@ void smp_error_interrupt(struct pt_reg { u32 v, v1; - #ifdef CONFIG_X86_64 exit_idle(); - #endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@@ -1817,32 -1807,28 +1805,32 @@@ void disconnect_bsp_APIC(int virt_wire_ void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - cpumask_t tmp_map; /* * Validate version */ if (version == 0x0) { pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); + "fixing up to 0x10. (tell your hw vendor)\n", + version); version = 0x10; } apic_version[apicid] = version; - if (num_processors >= NR_CPUS) { - pr_warning("WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); + if (num_processors >= nr_cpu_ids) { + int max = nr_cpu_ids; + int thiscpu = max + disabled_cpus; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i reached." 
+ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; return; } num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); + cpu = cpumask_next_zero(-1, cpu_present_mask); physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { @@@ -1892,8 -1878,8 +1880,8 @@@ } #endif - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); } #ifdef CONFIG_X86_64 @@@ -2095,7 -2081,7 +2083,7 @@@ __cpuinit int apic_is_clustered_box(voi bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { /* are we being called early in kernel startup? */ if (bios_cpu_apicid) { id = bios_cpu_apicid[i]; diff --combined arch/x86/kernel/cpu/intel_cacheinfo.c index 7bd00a56567,15cf14e9bf2..48533d77be7 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@@ -534,16 -534,31 +534,16 @@@ static void __cpuinit free_cache_attrib per_cpu(cpuid4_info, cpu) = NULL; } -static int __cpuinit detect_cache_attributes(unsigned int cpu) +static void __cpuinit get_cpu_leaves(void *_retval) { - struct _cpuid4_info *this_leaf; - unsigned long j; - int retval; - cpumask_t oldmask; - - if (num_cache_leaves == 0) - return -ENOENT; - - per_cpu(cpuid4_info, cpu) = kzalloc( - sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) - return -ENOMEM; - - oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (retval) - goto out; + int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { + struct _cpuid4_info *this_leaf; this_leaf = CPUID4_INFO_IDX(cpu, j); - retval = cpuid4_cache_lookup(j, this_leaf); - if (unlikely(retval < 0)) { + *retval = 
cpuid4_cache_lookup(j, this_leaf); + if (unlikely(*retval < 0)) { int i; for (i = 0; i < j; i++) @@@ -552,21 -567,9 +552,21 @@@ } cache_shared_cpu_map_setup(cpu, j); } - set_cpus_allowed_ptr(current, &oldmask); +} + +static int __cpuinit detect_cache_attributes(unsigned int cpu) +{ + int retval; + + if (num_cache_leaves == 0) + return -ENOENT; + + per_cpu(cpuid4_info, cpu) = kzalloc( + sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); + if (per_cpu(cpuid4_info, cpu) == NULL) + return -ENOMEM; -out: + smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { kfree(per_cpu(cpuid4_info, cpu)); per_cpu(cpuid4_info, cpu) = NULL; @@@ -641,20 -644,17 +641,17 @@@ static inline ssize_t show_shared_cpu_l return show_shared_cpu_map_func(leaf, 1, buf); } - static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { - switch(this_leaf->eax.split.type) { - case CACHE_TYPE_DATA: + static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) + { + switch (this_leaf->eax.split.type) { + case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); - break; - case CACHE_TYPE_INST: + case CACHE_TYPE_INST: return sprintf(buf, "Instruction\n"); - break; - case CACHE_TYPE_UNIFIED: + case CACHE_TYPE_UNIFIED: return sprintf(buf, "Unified\n"); - break; - default: + default: return sprintf(buf, "Unknown\n"); - break; } } diff --combined arch/x86/kernel/cpu/mcheck/mce_amd_64.c index a1de80f368f,748c8f9e7a0..a5a5e053037 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@@ -83,41 -83,34 +83,41 @@@ static DEFINE_PER_CPU(unsigned char, ba * CPU Initialization */ +struct thresh_restart { + struct threshold_block *b; + int reset; + u16 old_limit; +}; + /* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_block *b, - int reset, u16 old_limit) +static long threshold_restart_bank(void *_tr) { + struct thresh_restart *tr = _tr; u32 mci_misc_hi, mci_misc_lo; - rdmsr(b->address, 
mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - reset = 1; /* limit cannot be lower than err count */ + if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ - if (reset) { /* reset err count and overflow bit */ + if (tr->reset) { /* reset err count and overflow bit */ mci_misc_hi = (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - b->threshold_limit); - } else if (old_limit) { /* change limit w/o reset */ + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (old_limit - b->threshold_limit); + (tr->old_limit - tr->b->threshold_limit); mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - b->interrupt_enable ? + tr->b->interrupt_enable ? (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(b->address, mci_misc_lo, mci_misc_hi); + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + return 0; } /* cpu init entry point, called from mce.c with preempt off */ @@@ -127,7 -120,6 +127,7 @@@ void __cpuinit mce_amd_feature_init(str unsigned int cpu = smp_processor_id(); u8 lvt_off; u32 low = 0, high = 0, address = 0; + struct thresh_restart tr; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@@ -170,10 -162,7 +170,10 @@@ wrmsr(address, low, high); threshold_defaults.address = address; - threshold_restart_bank(&threshold_defaults, 0, 0); + tr.b = &threshold_defaults; + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); } } } @@@ -248,7 -237,7 +248,7 @@@ asmlinkage void mce_threshold_interrupt } } out: - add_pda(irq_threshold_count, 1); + inc_irq_stat(irq_threshold_count); irq_exit(); } @@@ -262,6 -251,20 +262,6 @@@ struct 
threshold_attr ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static void affinity_set(unsigned int cpu, cpumask_t *oldmask, - cpumask_t *newmask) -{ - *oldmask = current->cpus_allowed; - cpus_clear(*newmask); - cpu_set(cpu, *newmask); - set_cpus_allowed_ptr(current, newmask); -} - -static void affinity_restore(const cpumask_t *oldmask) -{ - set_cpus_allowed_ptr(current, oldmask); -} - #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ { \ @@@ -274,16 -277,15 +274,16 @@@ static ssize_t store_interrupt_enable(s const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; b->interrupt_enable = !!new; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, 0); - affinity_restore(&oldmask); + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } @@@ -292,7 -294,8 +292,7 @@@ static ssize_t store_threshold_limit(st const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; - u16 old; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; @@@ -300,36 -303,34 +300,36 @@@ new = THRESHOLD_MAX; if (new < 1) new = 1; - old = b->threshold_limit; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; + tr.b = b; + tr.reset = 0; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, old); - affinity_restore(&oldmask); + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } -static ssize_t show_error_count(struct threshold_block *b, char *buf) +static long local_error_count(void *_b) { - u32 high, low; - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); + struct threshold_block *b = _b; + u32 low, high; + rdmsr(b->address, low, high); - affinity_restore(&oldmask); - 
return sprintf(buf, "%x\n", - (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); + return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); } static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 1, 0); - affinity_restore(&oldmask); + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return 1; } @@@ -462,19 -463,12 +462,19 @@@ out_free return err; } +static long local_allocate_threshold_blocks(void *_bank) +{ + unsigned int *bank = _bank; + + return allocate_threshold_blocks(smp_processor_id(), *bank, 0, + MSR_IA32_MC0_MISC + *bank * 4); +} + /* symlinks sibling shared banks to first core. first core owns dir/files. */ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - cpumask_t oldmask, newmask; char name[32]; sprintf(name, "threshold_bank%i", bank); @@@ -525,7 -519,11 +525,7 @@@ per_cpu(threshold_banks, cpu)[bank] = b; - affinity_set(cpu, &oldmask, &newmask); - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(&oldmask); - + err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); if (err) goto out_free; diff --combined arch/x86/kernel/genx2apic_uv_x.c index 0e88be11227,dece1728973..b193e082f6c --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@@ -10,6 -10,7 +10,7 @@@ #include #include + #include #include #include #include @@@ -17,6 -18,9 +18,9 @@@ #include #include #include + #include + #include + #include #include #include #include @@@ -75,15 -79,16 +79,15 @@@ EXPORT_SYMBOL(sn_rtc_cycles_per_second) /* Start with all IRQs 
pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t uv_target_cpus(void) +static const struct cpumask *uv_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } -static cpumask_t uv_vector_allocation_domain(int cpu) +static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) @@@ -122,37 -127,28 +126,37 @@@ static void uv_send_IPI_one(int cpu, in uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } -static void uv_send_IPI_mask(cpumask_t mask, int vector) +static void uv_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned int cpu; - for_each_possible_cpu(cpu) - if (cpu_isset(cpu, mask)) + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); +} + +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); + + for_each_cpu(cpu, mask) + if (cpu != this_cpu) uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - uv_send_IPI_mask(mask, vector); + for_each_online_cpu(cpu) + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_all(int vector) { - uv_send_IPI_mask(cpu_online_map, vector); + uv_send_IPI_mask(cpu_online_mask, vector); } static int uv_apic_id_registered(void) @@@ -164,7 -160,7 +168,7 @@@ static void uv_init_apic_ldr(void { } -static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@@ -172,30 -168,13 +176,30 @@@ * We're using fixed IRQ delivery, can only return one phys APIC ID. 
* May as well be the first. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@@ -243,10 -222,8 +247,10 @@@ struct genapic apic_x2apic_uv_x = .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@@ -382,6 -359,103 +386,103 @@@ static __init void uv_rtc_init(void sn_rtc_cycles_per_second = ticks_per_sec; } + /* + * percpu heartbeat timer + */ + static void uv_heartbeat(unsigned long ignored) + { + struct timer_list *timer = &uv_hub_info->scir.timer; + unsigned char bits = uv_hub_info->scir.state; + + /* flip heartbeat bit */ + bits ^= SCIR_CPU_HEARTBEAT; + + /* is this cpu idle? 
*/ + if (idle_cpu(raw_smp_processor_id())) + bits &= ~SCIR_CPU_ACTIVITY; + else + bits |= SCIR_CPU_ACTIVITY; + + /* update system controller interface reg */ + uv_set_scir_bits(bits); + + /* enable next timer period */ + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); + } + + static void __cpuinit uv_heartbeat_enable(int cpu) + { + if (!uv_cpu_hub_info(cpu)->scir.enabled) { + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); + setup_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; + } + + /* check boot cpu */ + if (!uv_cpu_hub_info(0)->scir.enabled) + uv_heartbeat_enable(0); + } + + #ifdef CONFIG_HOTPLUG_CPU + static void __cpuinit uv_heartbeat_disable(int cpu) + { + if (uv_cpu_hub_info(cpu)->scir.enabled) { + uv_cpu_hub_info(cpu)->scir.enabled = 0; + del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + } + uv_set_cpu_scir_bits(cpu, 0xff); + } + + /* + * cpu hotplug notifier + */ + static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) + { + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + uv_heartbeat_enable(cpu); + break; + case CPU_DOWN_PREPARE: + uv_heartbeat_disable(cpu); + break; + default: + break; + } + return NOTIFY_OK; + } + + static __init void uv_scir_register_cpu_notifier(void) + { + hotcpu_notifier(uv_scir_cpu_notify, 0); + } + + #else /* !CONFIG_HOTPLUG_CPU */ + + static __init void uv_scir_register_cpu_notifier(void) + { + } + + static __init int uv_init_heartbeat(void) + { + int cpu; + + if (is_uv_system()) + for_each_online_cpu(cpu) + uv_heartbeat_enable(cpu); + return 0; + } + + late_initcall(uv_init_heartbeat); + + #endif /* !CONFIG_HOTPLUG_CPU */ + /* * Called on each cpu to initialize the per_cpu UV data area. 
* ZZZ hotplug not supported yet @@@ -455,7 -529,7 +556,7 @@@ void __init uv_system_init(void uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &uv_coherency_id, &uv_region_size); + &sn_coherency_id, &sn_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@@ -466,8 -540,7 +567,7 @@@ uv_blade_info[blade].nr_possible_cpus++; uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = - lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; @@@ -477,7 -550,8 +577,8 @@@ uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; + uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@@ -494,4 -568,6 +595,6 @@@ map_mmioh_high(max_pnode); uv_cpu_init(); + uv_scir_register_cpu_notifier(); + proc_mkdir("sgi_uv", NULL); } diff --combined arch/x86/kernel/io_apic.c index 1cbf7c8d46e,e7745961ed3..3e070bb961d --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@@ -136,8 -136,8 +136,8 @@@ static struct irq_pin_list *get_one_fre struct irq_cfg { struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; + cpumask_var_t domain; + cpumask_var_t old_domain; unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; @@@ -152,22 -152,22 +152,22 @@@ static struct irq_cfg irq_cfgx[] = #else static struct irq_cfg irq_cfgx[NR_IRQS] = { #endif - [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .domain = 
CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, + [0] = { .vector = IRQ0_VECTOR, }, + [1] = { .vector = IRQ1_VECTOR, }, + [2] = { .vector = IRQ2_VECTOR, }, + [3] = { .vector = IRQ3_VECTOR, }, + [4] = { .vector = IRQ4_VECTOR, }, + [5] = { .vector = IRQ5_VECTOR, }, + [6] = { .vector = IRQ6_VECTOR, }, + [7] = { .vector = IRQ7_VECTOR, }, + [8] = { .vector = IRQ8_VECTOR, }, + [9] = { .vector = IRQ9_VECTOR, }, + [10] = { .vector = IRQ10_VECTOR, }, + [11] = { .vector = IRQ11_VECTOR, }, + [12] = { .vector = IRQ12_VECTOR, }, + [13] = { .vector = IRQ13_VECTOR, }, + [14] = { .vector = IRQ14_VECTOR, }, + [15] = { .vector = IRQ15_VECTOR, }, }; void __init arch_early_irq_init(void) @@@ -183,10 -183,6 +183,10 @@@ for (i = 0; i < count; i++) { desc = irq_to_desc(i); desc->chip_data = &cfg[i]; + alloc_bootmem_cpumask_var(&cfg[i].domain); + alloc_bootmem_cpumask_var(&cfg[i].old_domain); + if (i < NR_IRQS_LEGACY) + cpumask_setall(cfg[i].domain); } } @@@ -211,20 -207,6 +211,20 @@@ static struct irq_cfg *get_one_free_irq node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + if (cfg) { + /* FIXME: needs alloc_cpumask_var_node() */ + if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) { + 
kfree(cfg); + cfg = NULL; + } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) { + free_cpumask_var(cfg->domain); + kfree(cfg); + cfg = NULL; + } else { + cpumask_clear(cfg->domain); + cpumask_clear(cfg->old_domain); + } + } printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node); return cfg; @@@ -347,14 -329,13 +347,14 @@@ void arch_free_chip_data(struct irq_des } } -static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) +static void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg = desc->chip_data; if (!cfg->move_in_progress) { /* it means that domain is not changed */ - if (!cpus_intersects(desc->affinity, mask)) + if (!cpumask_intersects(&desc->affinity, mask)) cfg->move_desc_pending = 1; } } @@@ -369,8 -350,7 +369,8 @@@ static struct irq_cfg *irq_cfg(unsigne #endif #ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) +static inline void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) { } #endif @@@ -501,26 -481,6 +501,26 @@@ static void ioapic_mask_entry(int apic } #ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; @@@ -556,61 -516,48 +556,61 @@@ } } -static int 
assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask); +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); -static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask) +/* + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid + * of that, or returns BAD_APICID and leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; irq = desc->irq; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; + return BAD_APICID; + cpumask_and(&desc->affinity, cfg->domain, mask); set_extra_move_desc(desc, mask); + return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); +} - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg); - desc->affinity = mask; + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. 
*/ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } spin_unlock_irqrestore(&ioapic_lock, flags); } -static void set_ioapic_affinity_irq(unsigned int irq, - const struct cpumask *mask) +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc; desc = irq_to_desc(irq); - set_ioapic_affinity_irq_desc(desc, *mask); + set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@@ -1272,8 -1219,7 +1272,8 @@@ void unlock_vector_lock(void spin_unlock(&vector_lock); } -static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask) +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@@ -1288,49 -1234,49 +1288,49 @@@ */ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; unsigned int old_vector; - int cpu; + int cpu, err; + cpumask_var_t tmp_mask; if ((cfg->move_in_progress) || cfg->move_cleanup_count) return -EBUSY; - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; old_vector = cfg->vector; if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); return 0; + } } - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { int new_cpu; int vector, offset; - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); + vector_allocation_domain(cpu, tmp_mask); vector = current_vector; offset = current_offset; next: vector += 8; if (vector >= first_system_vector) { - /* If we run out of vectors on large 
boxen, must share them. */ + /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; -#ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; -#else - if (vector == SYSCALL_VECTOR) + + if (test_bit(vector, used_vectors)) goto next; -#endif - for_each_cpu_mask_nr(new_cpu, new_mask) + + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ @@@ -1338,21 -1284,18 +1338,21 @@@ current_offset = offset; if (old_vector) { cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; + cpumask_copy(cfg->old_domain, cfg->domain); } - for_each_cpu_mask_nr(new_cpu, new_mask) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; - cfg->domain = domain; - return 0; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; } - return -ENOSPC; + free_cpumask_var(tmp_mask); + return err; } -static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask) +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@@ -1365,20 -1308,23 +1365,20 @@@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) { - cpumask_t mask; int cpu, vector; BUG_ON(!cfg->vector); vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cpus_clear(cfg->domain); + cpumask_clear(cfg->domain); if (likely(!cfg->move_in_progress)) return; - cpus_and(mask, cfg->old_domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@@ -1403,7 
-1349,7 +1403,7 @@@ void __setup_vector_irq(int cpu if (!desc) continue; cfg = desc->chip_data; - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; @@@ -1415,7 -1361,7 +1415,7 @@@ continue; cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@@ -1551,17 -1497,18 +1551,17 @@@ static void setup_IO_APIC_irq(int apic { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - cpumask_t mask; + unsigned int dest; if (!IO_APIC_IRQ(irq)) return; cfg = desc->chip_data; - mask = TARGET_CPUS; - if (assign_irq_vector(irq, cfg, mask)) + if (assign_irq_vector(irq, cfg, TARGET_CPUS)) return; - cpus_and(mask, cfg->domain, mask); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@@ -1571,7 -1518,8 +1571,7 @@@ if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { + dest, trigger, polarity, cfg->vector)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic].mp_apicid, pin); __clear_irq_vector(irq, cfg); @@@ -2293,7 -2241,7 +2293,7 @@@ static int ioapic_retrigger_irq(unsigne unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@@ -2342,17 -2290,18 +2342,17 @@@ static DECLARE_DELAYED_WORK(ir_migratio * as simple as edge triggered migration and we can do the irq migration * with a simple atomic update to IO-APIC RTE. 
*/ -static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask) +static void +migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - cpumask_t tmp, cleanup_mask; struct irte irte; int modify_ioapic_rte; unsigned int dest; unsigned long flags; unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpumask_intersects(mask, cpu_online_mask)) return; irq = desc->irq; @@@ -2365,7 -2314,8 +2365,7 @@@ set_extra_move_desc(desc, mask); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, mask); modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { @@@ -2382,10 -2332,14 +2382,10 @@@ */ modify_irte(irq, &irte); - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (cfg->move_in_progress) + send_cleanup_vector(cfg); - desc->affinity = mask; + cpumask_copy(&desc->affinity, mask); } static int migrate_irq_remapped_level_desc(struct irq_desc *desc) @@@ -2407,11 -2361,11 +2407,11 @@@ } /* everthing is clear. we have right of way */ - migrate_ioapic_irq_desc(desc, desc->pending_mask); + migrate_ioapic_irq_desc(desc, &desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); unmask: unmask_IO_APIC_irq_desc(desc); @@@ -2448,12 -2402,11 +2448,12 @@@ static void ir_irq_migration(struct wor /* * Migrates the IRQ destination in the process context. 
*/ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask) +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) { if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; + cpumask_copy(&desc->pending_mask, mask); migrate_irq_remapped_level_desc(desc); return; } @@@ -2465,17 -2418,16 +2465,16 @@@ static void set_ir_ioapic_affinity_irq( { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, *mask); + set_ir_ioapic_affinity_irq_desc(desc, mask); } #endif asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); - #ifdef CONFIG_X86_64 exit_idle(); - #endif irq_enter(); me = smp_processor_id(); @@@ -2497,7 -2449,7 +2496,7 @@@ if (!cfg->move_cleanup_count) goto unlock; - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; __get_cpu_var(vector_irq)[vector] = -1; @@@ -2520,7 -2472,7 +2519,7 @@@ static void irq_complete_move(struct ir if (likely(!cfg->move_desc_pending)) return; - /* domain is not change, but affinity is changed */ + /* domain has not changed, but affinity did */ me = smp_processor_id(); if (cpu_isset(me, desc->affinity)) { *descp = desc = move_irq_desc(desc, me); @@@ -2534,14 -2486,20 +2533,14 @@@ vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC *descp = desc = move_irq_desc(desc, me); /* get the new one */ cfg = desc->chip_data; #endif - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); } #else static inline void 
irq_complete_move(struct irq_desc **descp) {} @@@ -3266,13 -3224,16 +3265,13 @@@ static int msi_compose_msg(struct pci_d struct irq_cfg *cfg; int err; unsigned dest; - cpumask_t tmp; cfg = irq_cfg(irq); - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, cfg, tmp); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (err) return err; - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { @@@ -3332,12 -3293,19 +3331,12 @@@ static void set_msi_irq_affinity(unsign struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - if (!cpumask_intersects(mask, cpu_online_mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, *mask)) - return; - - set_extra_move_desc(desc, *mask); - - cpumask_and(&tmp, &cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); read_msi_msg_desc(desc, &msg); @@@ -3347,27 -3315,37 +3346,27 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); - cpumask_copy(&desc->affinity, mask); } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. 
*/ -static void ir_set_msi_irq_affinity(unsigned int irq, - const struct cpumask *mask) +static void +ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = desc->chip_data; unsigned int dest; - cpumask_t tmp, cleanup_mask; struct irte irte; - if (!cpumask_intersects(mask, cpu_online_mask)) - return; - if (get_irte(irq, &irte)) return; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, *mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - set_extra_move_desc(desc, *mask); - - cpumask_and(&tmp, &cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@@ -3381,8 -3359,14 +3380,8 @@@ * at the new destination. So, time to cleanup the previous * vector allocation. */ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - cpumask_copy(&desc->affinity, mask); + if (cfg->move_in_progress) + send_cleanup_vector(cfg); } #endif @@@ -3579,12 -3563,19 +3578,12 @@@ static void dmar_msi_set_affinity(unsig struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - if (!cpumask_intersects(mask, cpu_online_mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, *mask)) - return; - - set_extra_move_desc(desc, *mask); - - cpumask_and(&tmp, &cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); dmar_msi_read(irq, &msg); @@@ -3594,6 -3585,7 +3593,6 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - cpumask_copy(&desc->affinity, mask); } #endif /* CONFIG_SMP */ @@@ -3633,12 -3625,19 +3632,12 @@@ static void hpet_msi_set_affinity(unsig struct irq_cfg *cfg; struct msi_msg msg; unsigned 
int dest; - cpumask_t tmp; - if (!cpumask_intersects(mask, cpu_online_mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, *mask)) - return; - - set_extra_move_desc(desc, *mask); - - cpumask_and(&tmp, &cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); hpet_msi_read(irq, &msg); @@@ -3648,6 -3647,7 +3647,6 @@@ msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); - cpumask_copy(&desc->affinity, mask); } #endif /* CONFIG_SMP */ @@@ -3707,14 -3707,22 +3706,14 @@@ static void set_ht_irq_affinity(unsigne struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp; - if (!cpumask_intersects(mask, cpu_online_mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, *mask)) - return; - - set_extra_move_desc(desc, *mask); - - cpumask_and(&tmp, &cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - cpumask_copy(&desc->affinity, mask); } #endif @@@ -3734,14 -3742,17 +3733,14 @@@ int arch_setup_ht_irq(unsigned int irq { struct irq_cfg *cfg; int err; - cpumask_t tmp; cfg = irq_cfg(irq); - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, cfg, tmp); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (!err) { struct ht_irq_msg msg; unsigned dest; - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@@ -3777,7 -3788,7 +3776,7 @@@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset) { - const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + const struct cpumask *eligible_cpu = cpumask_of(cpu); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@@ -3787,7 -3798,7 +3786,7 @@@ cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, 
*eligible_cpu); + err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; @@@ -3806,7 -3817,7 +3805,7 @@@ entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(*eligible_cpu); + entry->dest = cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@@ -4017,7 -4028,7 +4016,7 @@@ void __init setup_ioapic_dest(void int pin, ioapic, irq, irq_entry; struct irq_desc *desc; struct irq_cfg *cfg; - cpumask_t mask; + const struct cpumask *mask; if (skip_ioapic_setup == 1) return; @@@ -4048,7 -4059,7 +4047,7 @@@ */ if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = &desc->affinity; else mask = TARGET_CPUS; diff --combined arch/x86/kernel/irq_64.c index fca2991443f,54c69d47a77..6383d50f82e --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@@ -13,12 -13,12 +13,12 @@@ #include #include #include + #include #include #include #include #include - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: * @@@ -28,26 -28,25 +28,25 @@@ */ static inline void stack_overflow_check(struct pt_regs *regs) { + #ifdef CONFIG_DEBUG_STACKOVERFLOW u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = -60*HZ; - - if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + 128 && - time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); - show_stack(NULL,NULL); - warned = jiffies; - } - } + + WARN_ONCE(regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128, + + "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); #endif + } /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have 
their own specific * handlers). */ - asmlinkage unsigned int do_IRQ(struct pt_regs *regs) + asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; @@@ -60,9 -59,7 +59,7 @@@ irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; - #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); - #endif desc = irq_to_desc(irq); if (likely(desc)) @@@ -83,17 -80,16 +80,17 @@@ } #ifdef CONFIG_HOTPLUG_CPU -void fixup_irqs(cpumask_t map) +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + const struct cpumask *affinity; if (!desc) continue; @@@ -103,23 -99,23 +100,23 @@@ /* interrupt's are disabled at this point */ spin_lock(&desc->lock); + affinity = &desc->affinity; if (!irq_has_action(irq) || - cpus_equal(desc->affinity, map)) { + cpumask_equal(affinity, cpu_online_mask)) { spin_unlock(&desc->lock); continue; } - cpus_and(mask, desc->affinity, map); - if (cpus_empty(mask)) { + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - mask = map; + affinity = cpu_all_mask; } if (desc->chip->mask) desc->chip->mask(irq); if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, &mask); + desc->chip->set_affinity(irq, affinity); else if (!(warned++)) set_affinity = 0; diff --combined arch/x86/kernel/irqinit_32.c index 61aa2a1004b,203384ed2b5..84723295f88 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@@ -110,18 -110,6 +110,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq [IRQ15_VECTOR + 1 ... 
NR_VECTORS - 1] = -1 }; +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; +} + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@@ -140,7 -128,7 +140,7 @@@ void __init native_init_IRQ(void for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i]); + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } @@@ -158,12 -146,10 +158,12 @@@ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --combined arch/x86/kernel/irqinit_64.c index 1020919efe1,6190e6ef546..31ebfe38e96 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@@ -23,41 -23,6 +23,6 @@@ #include #include - /* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. 
- */ - - #define IRQ_NAME2(nr) nr##_interrupt(void) - #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - - /* - * SMP has a few special interrupts for IPI messages - */ - - #define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.text\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt\n" \ - ".previous"); - - #define BI(x,y) \ - BUILD_IRQ(x##y) - - #define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x30-0x3f) @@@ -73,37 -38,6 +38,6 @@@ * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) - BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) - BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) - BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - - #undef BUILD_16_IRQS - #undef BI - - - #define IRQ(x,y) \ - IRQ##x##y##_interrupt - - #define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - - /* for the irq vectors */ - static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) - }; - - #undef IRQ - #undef IRQLIST_16 - - - /* * IRQ2 is cascade interrupt to second interrupt controller @@@ -135,18 -69,6 +69,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq [IRQ15_VECTOR + 1 ... 
NR_VECTORS - 1] = -1 }; +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; +} + void __init init_ISA_irqs(void) { int i; @@@ -199,7 -121,6 +133,7 @@@ static void __init smp_intr_init(void /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif } diff --combined arch/x86/kernel/setup_percpu.c index 0b63b08e753,8e8b1193add..49f3f709ee1 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@@ -152,11 -152,6 +152,11 @@@ void __init setup_per_cpu_areas(void old_size = PERCPU_ENOUGH_ROOM; align = max_t(unsigned long, PAGE_SIZE, align); size = roundup(old_size, align); + + printk(KERN_INFO + "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); @@@ -173,24 -168,24 +173,24 @@@ "cpu %d has no node %d or node-local memory\n", cpu, node); if (ptr) - printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", + printk(KERN_DEBUG + "per cpu data for cpu%d at %016lx\n", cpu, __pa(ptr)); } else { ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, __pa(MAX_DMA_ADDRESS)); if (ptr) - printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); + printk(KERN_DEBUG + "per cpu data for cpu%d on node%d " + "at %016lx\n", + cpu, node, __pa(ptr)); } #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", - NR_CPUS, nr_cpu_ids, nr_node_ids); - /* Setup percpu data maps */ setup_per_cpu_maps(); @@@ -339,25 -334,25 +339,25 @@@ static const cpumask_t cpu_mask_none /* * Returns a pointer to the bitmask of CPUs on Node 'node'. 
*/ - const cpumask_t *_node_to_cpumask_ptr(int node) + const cpumask_t *cpumask_of_node(int node) { if (node_to_cpumask_map == NULL) { printk(KERN_WARNING - "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", + "cpumask_of_node(%d): no node_to_cpumask_map!\n", node); dump_stack(); return (const cpumask_t *)&cpu_online_map; } if (node >= nr_node_ids) { printk(KERN_WARNING - "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", node, nr_node_ids); dump_stack(); return &cpu_mask_none; } return &node_to_cpumask_map[node]; } - EXPORT_SYMBOL(_node_to_cpumask_ptr); + EXPORT_SYMBOL(cpumask_of_node); /* * Returns a bitmask of CPUs on Node 'node'. diff --combined arch/x86/kernel/smp.c index 49ed667b06f,7e558db362c..beea2649a24 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@@ -118,22 -118,22 +118,22 @@@ static void native_smp_send_reschedule( WARN_ON(1); return; } - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); } -void native_send_call_func_ipi(cpumask_t mask) +void native_send_call_func_ipi(const struct cpumask *mask) { cpumask_t allbutself; allbutself = cpu_online_map; cpu_clear(smp_processor_id(), allbutself); - if (cpus_equal(mask, allbutself) && + if (cpus_equal(*mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else @@@ -165,11 -165,7 +165,7 @@@ static void native_smp_send_stop(void void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); - #ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_resched_count++; - #else - add_pda(irq_resched_count, 1); - #endif + inc_irq_stat(irq_resched_count); } void smp_call_function_interrupt(struct pt_regs *regs) @@@ -177,11 -173,7 +173,7 @@@ ack_APIC_irq(); 
irq_enter(); generic_smp_call_function_interrupt(); - #ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; - #else - add_pda(irq_call_count, 1); - #endif + inc_irq_stat(irq_call_count); irq_exit(); } @@@ -190,11 -182,7 +182,7 @@@ void smp_call_function_single_interrupt ack_APIC_irq(); irq_enter(); generic_smp_call_function_single_interrupt(); - #ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; - #else - add_pda(irq_call_count, 1); - #endif + inc_irq_stat(irq_call_count); irq_exit(); } diff --combined arch/x86/kernel/smpboot.c index 1a9941b1115,c5392058cd0..9e177a4077e --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@@ -282,7 -282,7 +282,7 @@@ static int __cpuinitdata unsafe_smp /* * Activate a secondary processor. */ - static void __cpuinit start_secondary(void *unused) + notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too @@@ -496,7 -496,7 +496,7 @@@ void __cpuinit set_cpu_sibling_map(int } /* maps the cpu to the sched domain representing multi-core */ - cpumask_t cpu_coregroup_map(int cpu) + const struct cpumask *cpu_coregroup_mask(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); /* @@@ -504,9 -504,14 +504,14 @@@ * And for power savings, we return cpu_core_map */ if (sched_mc_power_savings || sched_smt_power_savings) - return per_cpu(cpu_core_map, cpu); + return &per_cpu(cpu_core_map, cpu); else - return c->llc_shared_map; + return &c->llc_shared_map; + } + + cpumask_t cpu_coregroup_map(int cpu) + { + return *cpu_coregroup_mask(cpu); } static void impress_friends(void) @@@ -1075,8 -1080,10 +1080,10 @@@ static int __init smp_sanity_check(unsi #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING "weird, boot CPU (#%d) not listed" - "by the BIOS.\n", hard_smp_processor_id()); + printk(KERN_WARNING + "weird, boot CPU (#%d) not listed by the BIOS.\n", + hard_smp_processor_id()); + 
physid_set(hard_smp_processor_id(), phys_cpu_present_map); } @@@ -1252,15 -1259,6 +1259,15 @@@ void __init native_smp_cpus_done(unsign check_nmi_watchdog(); } +static int __initdata setup_possible_cpus = -1; +static int __init _setup_possible_cpus(char *str) +{ + get_option(&str, &setup_possible_cpus); + return 0; +} +early_param("possible_cpus", _setup_possible_cpus); + + /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@@ -1273,7 -1271,7 +1280,7 @@@ * * Three ways to find out the number of additional hotplug CPUs: * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with additional_cpus=NUM + * - The user can overwrite it with possible_cpus=NUM * - Otherwise don't reserve additional CPUs. * We do this because additional CPUs waste a lot of memory. * -AK @@@ -1286,17 -1284,9 +1293,17 @@@ __init void prefill_possible_map(void if (!num_processors) num_processors = 1; - possible = num_processors + disabled_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; + if (setup_possible_cpus == -1) + possible = num_processors + disabled_cpus; + else + possible = setup_possible_cpus; + + if (possible > CONFIG_NR_CPUS) { + printk(KERN_WARNING + "%d Processors exceeds NR_CPUS limit of %d\n", + possible, CONFIG_NR_CPUS); + possible = CONFIG_NR_CPUS; + } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); @@@ -1361,7 -1351,7 +1368,7 @@@ void cpu_disable_common(void lock_vector_lock(); remove_cpu_from_maps(cpu); unlock_vector_lock(); - fixup_irqs(cpu_online_map); + fixup_irqs(); } int native_cpu_disable(void) diff --combined arch/x86/kernel/tlb_32.c index 174ea90d1cb,8da059f949b..ce505464224 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@@ -34,9 -34,8 +34,8 @@@ static DEFINE_SPINLOCK(tlbstate_lock) */ void leave_mm(int cpu) { - if (per_cpu(cpu_tlbstate, cpu).state == 
TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@@ -104,8 -103,8 +103,8 @@@ void smp_invalidate_interrupt(struct pt * BUG(); */ - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@@ -119,7 -118,7 +118,7 @@@ smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@@ -164,7 -163,7 +163,7 @@@ * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ @@@ -238,7 -237,7 +237,7 @@@ static void do_flush_tlb_all(void *info unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } diff --combined arch/x86/kernel/tlb_64.c index de6f1bda0c5,29887d7081a..f8be6f1d2e4 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@@ -154,7 -154,7 +154,7 @@@ asmlinkage void smp_invalidate_interrup out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@@ -191,7 -191,7 +191,7 @@@ * We have to send the IPI only to * CPUs affected. 
*/ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); while (!cpus_empty(f->flush_cpumask)) cpu_relax(); diff --combined arch/x86/kernel/traps.c index 4a6dff39a47,141907ab6e2..2d1f4c7e405 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@@ -72,6 -72,9 +72,6 @@@ #include "cpu/mcheck/mce.h" -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ @@@ -86,9 -89,6 +86,9 @@@ gate_desc idt_table[256 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + static int ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) @@@ -481,11 -481,7 +481,7 @@@ do_nmi(struct pt_regs *regs, long error { nmi_enter(); - #ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } - #else - add_pda(__nmi_count, 1); - #endif + inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); @@@ -664,7 -660,7 +660,7 @@@ void math_error(void __user *ip { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. 
@@@ -675,7 -671,6 +671,6 @@@ task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@@ -689,34 -684,31 +684,31 @@@ */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ + + err = swd & ~cwd & 0x3f; + #ifdef CONFIG_X86_32 + if (!err) return; #endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? 
*/ } force_sig_info(SIGFPE, &info, task); } @@@ -949,7 -941,9 +941,7 @@@ dotraplinkage void do_iret_error(struc void __init trap_init(void) { -#ifdef CONFIG_X86_32 int i; -#endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@@ -1006,15 -1000,11 +998,15 @@@ } set_system_trap_gate(SYSCALL_VECTOR, &system_call); +#endif /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); +#ifdef CONFIG_X86_64 + set_bit(IA32_SYSCALL_VECTOR, used_vectors); +#else set_bit(SYSCALL_VECTOR, used_vectors); #endif /* diff --combined arch/x86/xen/mmu.c index e59e53b11e2,773d68d3e91..503c240e26c --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@@ -154,13 -154,13 +154,13 @@@ void xen_setup_mfn_list_list(void { unsigned pfn, idx; - for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@@ -179,7 -179,7 +179,7 @@@ void __init xen_build_dynamic_phys_to_m unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top[topidx] = &mfn_list[pfn]; @@@ -207,7 -207,7 +207,7 @@@ static void alloc_p2m(unsigned long **p p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); BUG_ON(p == NULL); - for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; if (cmpxchg(pp, p2m_missing, p) != p2m_missing) @@@ -407,7 -407,8 +407,8 @@@ out preempt_enable(); } - pte_t 
xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) + pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ return *ptep; @@@ -878,7 -879,8 +879,8 @@@ static void __xen_pgd_pin(struct mm_str if (user_pgd) { xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ @@@ -993,7 -995,8 +995,8 @@@ static void __xen_pgd_unpin(struct mm_s pgd_t *user_pgd = xen_get_user_pgd(pgd); if (user_pgd) { - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } @@@ -1079,7 -1082,7 +1082,7 @@@ static void drop_other_mm_ref(void *inf static void xen_drop_mm_ref(struct mm_struct *mm) { - cpumask_t mask; + cpumask_var_t mask; unsigned cpu; if (current->active_mm == mm) { @@@ -1091,16 -1094,7 +1094,16 @@@ } /* Get the "official" set of cpus referring to our pagetable. */ - mask = mm->cpu_vm_mask; + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, &mm->cpu_vm_mask); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed @@@ -1109,12 -1103,11 +1112,12 @@@ if needed. 
*/ for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, mask); } - if (!cpus_empty(mask)) - smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); } #else static void xen_drop_mm_ref(struct mm_struct *mm) diff --combined include/linux/sched.h index e5f928a079e,8395e715809..158d53d0776 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@ -250,7 -250,7 +250,7 @@@ extern void init_idle_bootup_task(struc extern int runqueue_is_locked(void); extern void task_rq_unlock_wait(struct task_struct *p); -extern cpumask_t nohz_cpu_mask; +extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); #else @@@ -571,12 -571,6 +571,6 @@@ struct signal_struct */ struct rlimit rlim[RLIM_NLIMITS]; - /* keep the process-shared keyrings here so that they do the right - * thing in threads created with CLONE_THREAD */ - #ifdef CONFIG_KEYS - struct key *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ - #endif #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif @@@ -647,6 -641,7 +641,7 @@@ struct user_struct /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + struct user_namespace *user_ns; #ifdef CONFIG_USER_SCHED struct task_group *tg; @@@ -664,6 -659,7 +659,7 @@@ extern struct user_struct *find_user(ui extern struct user_struct root_user; #define INIT_USER (&root_user) + struct backing_dev_info; struct reclaim_state; @@@ -671,8 -667,7 +667,7 @@@ struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ - unsigned long long cpu_time, /* time spent on the cpu */ - run_delay; /* time spent waiting on a runqueue */ + unsigned long long 
run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ @@@ -763,51 -758,20 +758,51 @@@ enum cpu_idle_type #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ #define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ -#define BALANCE_FOR_MC_POWER \ - (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) +enum powersavings_balance_level { + POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ + POWERSAVINGS_BALANCE_BASIC, /* Fill one thread/core/package + * first for long running threads + */ + POWERSAVINGS_BALANCE_WAKEUP, /* Also bias task wakeups to semi-idle + * cpu package for power savings + */ + MAX_POWERSAVINGS_BALANCE_LEVELS +}; -#define BALANCE_FOR_PKG_POWER \ - ((sched_mc_power_savings || sched_smt_power_savings) ? \ - SD_POWERSAVINGS_BALANCE : 0) +extern int sched_mc_power_savings, sched_smt_power_savings; -#define test_sd_parent(sd, flag) ((sd->parent && \ - (sd->parent->flags & flag)) ? 1 : 0) +static inline int sd_balance_for_mc_power(void) +{ + if (sched_smt_power_savings) + return SD_POWERSAVINGS_BALANCE; + return 0; +} + +static inline int sd_balance_for_package_power(void) +{ + if (sched_mc_power_savings | sched_smt_power_savings) + return SD_POWERSAVINGS_BALANCE; + + return 0; +} + +/* + * Optimise SD flags for power savings: + * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings. 
+ * Keep default SD flags if sched_{smt,mc}_power_saving=0 + */ + +static inline int sd_power_saving_flags(void) +{ + if (sched_mc_power_savings | sched_smt_power_savings) + return SD_BALANCE_NEWIDLE; + + return 0; +} struct sched_group { struct sched_group *next; /* Must be a circular list */ - cpumask_t cpumask; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a @@@ -820,15 -784,8 +815,15 @@@ * (see include/linux/reciprocal_div.h) */ u32 reciprocal_cpu_power; + + unsigned long cpumask[]; }; +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + enum sched_domain_level { SD_LV_NONE = 0, SD_LV_SIBLING, @@@ -852,6 -809,7 +847,6 @@@ struct sched_domain struct sched_domain *parent; /* top domain must be null terminated */ struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ - cpumask_t span; /* span of all CPUs in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ @@@ -906,73 -864,25 +901,42 @@@ #ifdef CONFIG_SCHED_DEBUG char *name; #endif + + /* span of all CPUs in this domain */ + unsigned long span[]; }; -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +static inline struct cpumask *sched_domain_span(struct sched_domain *sd) +{ + return to_cpumask(sd->span); +} + +extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new); extern int arch_reinit_sched_domains(void); +/* Test a flag in parent sched domain */ +static inline int test_sd_parent(struct sched_domain *sd, int flag) +{ + if (sd->parent && (sd->parent->flags & flag)) + return 1; + + return 0; +} + #else /* CONFIG_SMP */ struct sched_domain_attr; static inline void -partition_sched_domains(int ndoms_new, cpumask_t 
*doms_new, +partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { } #endif /* !CONFIG_SMP */ struct io_context; /* See blkdev.h */ - #define NGROUPS_SMALL 32 - #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) - struct group_info { - int ngroups; - atomic_t usage; - gid_t small_block[NGROUPS_SMALL]; - int nblocks; - gid_t *blocks[0]; - }; - /* - * get_group_info() must be called with the owning task locked (via task_lock()) - * when task != current. The reason being that the vast majority of callers are - * looking at current->group_info, which can not be changed except by the - * current task. Changing current->group_info requires the task lock, too. - */ - #define get_group_info(group_info) do { \ - atomic_inc(&(group_info)->usage); \ - } while (0) - - #define put_group_info(group_info) do { \ - if (atomic_dec_and_test(&(group_info)->usage)) \ - groups_free(group_info); \ - } while (0) - - extern struct group_info *groups_alloc(int gidsetsize); - extern void groups_free(struct group_info *group_info); - extern int set_current_groups(struct group_info *group_info); - extern int groups_search(struct group_info *group_info, gid_t grp); - /* access the groups "array" with this macro */ - #define GROUP_AT(gi, i) \ - ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK extern void prefetch_stack(struct task_struct *t); @@@ -1016,7 -926,7 +980,7 @@@ struct sched_class void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, - const cpumask_t *newmask); + const struct cpumask *newmask); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); @@@ -1228,6 -1138,7 +1192,7 @@@ struct task_struct * The buffer to hold the BTS data. */ void *bts_buffer; + size_t bts_size; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. 
*/ @@@ -1251,17 -1162,12 +1216,12 @@@ struct list_head cpu_timers[3]; /* process credentials */ - uid_t uid,euid,suid,fsuid; - gid_t gid,egid,sgid,fsgid; - struct group_info *group_info; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - struct user_struct *user; - unsigned securebits; - #ifdef CONFIG_KEYS - unsigned char jit_keyring; /* default keyring to attach requested keys to */ - struct key *request_key_auth; /* assumed request_key authority */ - struct key *thread_keyring; /* keyring private to this thread */ - #endif + const struct cred *real_cred; /* objective and real subjective task + * credentials (COW) */ + const struct cred *cred; /* effective (overridable) subjective task + * credentials (COW) */ + struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ + char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) @@@ -1298,9 -1204,6 +1258,6 @@@ int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; - #ifdef CONFIG_SECURITY - void *security; - #endif struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; @@@ -1676,12 -1579,12 +1633,12 @@@ extern cputime_t task_gtime(struct task #ifdef CONFIG_SMP extern int set_cpus_allowed_ptr(struct task_struct *p, - const cpumask_t *new_mask); + const struct cpumask *new_mask); #else static inline int set_cpus_allowed_ptr(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - if (!cpu_isset(0, *new_mask)) + if (!cpumask_test_cpu(0, new_mask)) return -EINVAL; return 0; } @@@ -1857,7 -1760,6 +1814,6 @@@ static inline struct user_struct *get_u return u; } extern void free_uid(struct user_struct *); - extern void switch_uid(struct user_struct *); extern void release_uids(struct user_namespace *ns); #include @@@ -1876,9 -1778,6 +1832,6 @@@ extern void wake_up_new_task(struct tas extern void sched_fork(struct task_struct *p, int 
clone_flags); extern void sched_dead(struct task_struct *p); - extern int in_group_p(gid_t); - extern int in_egroup_p(gid_t); - extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); @@@ -2010,6 -1909,8 +1963,8 @@@ static inline unsigned long wait_task_i #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + extern bool is_single_threaded(struct task_struct *); + /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. @@@ -2294,8 -2195,10 +2249,8 @@@ __trace_special(void *__tr, void *__dat } #endif -extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); -extern long sched_getaffinity(pid_t pid, cpumask_t *mask); - -extern int sched_mc_power_savings, sched_smt_power_savings; +extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); +extern long sched_getaffinity(pid_t pid, struct cpumask *mask); extern void normalize_rt_tasks(void); diff --combined kernel/rcuclassic.c index c03ca3e6191,0ff9b05706a..6ec495f60ea --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@@ -63,14 -63,14 +63,14 @@@ static struct rcu_ctrlblk rcu_ctrlblk .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, + .cpumask = CPU_BITS_NONE, }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; @@@ -85,7 -85,6 +85,6 @@@ static void force_quiescent_state(struc struct rcu_ctrlblk *rcp) { int cpu; - cpumask_t cpumask; unsigned long flags; set_need_resched(); @@@ -96,10 -95,10 +95,10 @@@ * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. 
* - * cpu_online_map is updated by the _cpu_down() + * cpu_online_mask is updated by the _cpu_down() * using __stop_machine(). Since we're in irqs disabled * section, __stop_machine() is not exectuting, hence - * the cpu_online_map is stable. + * the cpu_online_mask is stable. * * However, a cpu might have been offlined _just_ before * we disabled irqs while entering here. @@@ -107,13 -106,14 +106,14 @@@ * notification, leading to the offlined cpu's bit * being set in the rcp->cpumask. * - * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent + * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent * sending smp_reschedule() to an offlined CPU. */ - cpus_and(cpumask, rcp->cpumask, cpu_online_map); - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask_nr(cpu, cpumask) - smp_send_reschedule(cpu); + for_each_cpu_and(cpu, + to_cpumask(rcp->cpumask), cpu_online_mask) { + if (cpu != rdp->cpu) + smp_send_reschedule(cpu); + } } spin_unlock_irqrestore(&rcp->lock, flags); } @@@ -193,7 -193,7 +193,7 @@@ static void print_other_cpu_stall(struc printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for_each_possible_cpu(cpu) { - if (cpu_isset(cpu, rcp->cpumask)) + if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask))) printk(" %d", cpu); } printk(" (detected by %d, t=%ld jiffies)\n", @@@ -221,7 -221,8 +221,8 @@@ static void check_cpu_stall(struct rcu_ long delta; delta = jiffies - rcp->jiffies_stall; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) && + delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rcp); @@@ -393,7 -394,8 +394,7 @@@ static void rcu_start_batch(struct rcu_ * unnecessarily. 
*/ smp_mb(); - cpumask_andnot(to_cpumask(rcp->cpumask), - cpu_online_mask, &nohz_cpu_mask); + cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask); rcp->signaled = 0; } @@@ -406,8 -408,8 +407,8 @@@ */ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) { - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { + cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask)); + if (cpumask_empty(to_cpumask(rcp->cpumask))) { /* batch completed ! */ rcp->completed = rcp->cur; rcu_start_batch(rcp); diff --combined kernel/sched.c index 756d981d91a,f2095660efe..27ba1d642f0 --- a/kernel/sched.c +++ b/kernel/sched.c @@@ -209,7 -209,6 +209,6 @@@ void init_rt_bandwidth(struct rt_bandwi hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static inline int rt_bandwidth_enabled(void) @@@ -361,7 -360,9 +360,9 @@@ static inline struct task_group *task_g struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->user->tg; + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@@ -497,26 -498,18 +498,26 @@@ struct rt_rq */ struct root_domain { atomic_t refcount; - cpumask_t span; - cpumask_t online; + cpumask_var_t span; + cpumask_var_t online; /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ - cpumask_t rto_mask; + cpumask_var_t rto_mask; atomic_t rto_count; #ifdef CONFIG_SMP struct cpupri cpupri; #endif +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Preferred wake up cpu nominated by sched_mc balance that will be + * used when most cpus are idle in the system indicating overall very + * low system utilisation. 
Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) + */ + unsigned int sched_mc_preferred_wakeup_cpu; +#endif }; /* @@@ -610,6 -603,8 +611,8 @@@ struct rq #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; @@@ -1143,7 -1138,6 +1146,6 @@@ static void init_rq_hrtick(struct rq *r hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@@ -1520,7 -1514,7 +1522,7 @@@ static int tg_shares_up(struct task_gro struct sched_domain *sd = data; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu(i, sched_domain_span(sd)) { /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@@ -1541,7 -1535,7 +1543,7 @@@ if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu_mask(i, sd->span) + for_each_cpu(i, sched_domain_span(sd)) update_group_shares_cpu(tg, i, shares, rq_weight); return 0; @@@ -1871,6 -1865,8 +1873,8 @@@ void set_task_cpu(struct task_struct *p clock_offset = old_rq->clock - new_rq->clock; + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) p->se.wait_start -= clock_offset; @@@ -2105,17 -2101,15 +2109,17 @@@ find_idlest_group(struct sched_domain * int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) continue; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; - for_each_cpu_mask_nr(i, 
group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@@ -2147,14 -2141,17 +2151,14 @@@ * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, - cpumask_t *tmp) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(*tmp, group->cpumask, p->cpus_allowed); - - for_each_cpu_mask_nr(i, *tmp) { + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@@ -2196,6 -2193,7 +2200,6 @@@ static int sched_balance_self(int cpu, update_shares(sd); while (sd) { - cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@@ -2204,13 -2202,14 +2208,13 @@@ continue; } - span = sd->span; group = find_idlest_group(sd, t, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@@ -2219,10 -2218,10 +2223,10 @@@ /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); sd = NULL; - weight = cpus_weight(span); for_each_domain(cpu, tmp) { - if (weight <= cpus_weight(tmp->span)) + if (weight <= cpumask_weight(sched_domain_span(tmp))) break; if (tmp->flags & flag) sd = tmp; @@@ -2267,7 -2266,7 +2271,7 @@@ static int try_to_wake_up(struct task_s cpu = task_cpu(p); for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { update_shares(sd); break; } @@@ -2277,6 -2276,7 +2281,7 @@@ smp_wmb(); rq = 
task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@@ -2315,7 -2315,7 +2320,7 @@@ else { struct sched_domain *sd; for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } @@@ -2334,12 -2334,11 +2339,11 @@@ out_activate schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; out_running: - trace_sched_wakeup(rq, p); + trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; @@@ -2472,7 -2471,7 +2476,7 @@@ void wake_up_new_task(struct task_struc p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_sched_wakeup_new(rq, p); + trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@@ -2847,11 -2846,10 +2851,10 @@@ static void sched_migrate_task(struct t struct rq *rq; rq = task_rq_lock(p, &flags); - if (!cpu_isset(dest_cpu, p->cpus_allowed) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) || unlikely(!cpu_active(dest_cpu))) goto out; - trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@@ -2913,7 -2911,7 +2916,7 @@@ int can_migrate_task(struct task_struc * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. 
*/ - if (!cpu_isset(this_cpu, p->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); return 0; } @@@ -3088,7 -3086,7 +3091,7 @@@ static int move_one_task(struct rq *thi static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const cpumask_t *cpus, int *balance) + int *sd_idle, const struct cpumask *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@@ -3124,11 -3122,10 +3127,11 @@@ unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - local_group = cpu_isset(this_cpu, group->cpumask); + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); if (local_group) - balance_cpu = first_cpu(group->cpumask); + balance_cpu = cpumask_first(sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@@ -3137,8 -3134,13 +3140,8 @@@ max_cpu_load = 0; min_cpu_load = ~0UL; - for_each_cpu_mask_nr(i, group->cpumask) { - struct rq *rq; - - if (!cpu_isset(i, *cpus)) - continue; - - rq = cpu_rq(i); + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); if (*sd_idle && rq->nr_running) *sd_idle = 0; @@@ -3249,8 -3251,8 +3252,8 @@@ */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - first_cpu(group->cpumask) < - first_cpu(group_min->cpumask))) { + cpumask_first(sched_group_cpus(group)) > + cpumask_first(sched_group_cpus(group_min)))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@@ -3265,8 -3267,8 +3268,8 @@@ if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - first_cpu(group->cpumask) > - first_cpu(group_leader->cpumask))) { + 
cpumask_first(sched_group_cpus(group)) < + cpumask_first(sched_group_cpus(group_leader)))) { group_leader = group; leader_nr_running = sum_nr_running; } @@@ -3392,10 -3394,6 +3395,10 @@@ out_balanced if (this == group_leader && group_leader != group_min) { *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + cpumask_first(sched_group_cpus(group_leader)); + } return group_min; } #endif @@@ -3409,16 -3407,16 +3412,16 @@@ ret */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const cpumask_t *cpus) + unsigned long imbalance, const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; - for_each_cpu_mask_nr(i, group->cpumask) { + for_each_cpu(i, sched_group_cpus(group)) { unsigned long wl; - if (!cpu_isset(i, *cpus)) + if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); @@@ -3448,7 -3446,7 +3451,7 @@@ */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, cpumask_t *cpus) + int *balance, struct cpumask *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@@ -3456,7 -3454,7 +3459,7 @@@ struct rq *busiest; unsigned long flags; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@@ -3516,8 -3514,8 +3519,8 @@@ redo /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; goto out_balanced; } @@@ -3534,8 -3532,7 +3537,8 @@@ /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if 
(!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; @@@ -3610,7 -3607,7 +3613,7 @@@ out */ static int load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - cpumask_t *cpus) + struct cpumask *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@@ -3619,7 -3616,7 +3622,7 @@@ int sd_idle = 0; int all_pinned = 0; - cpus_setall(*cpus); + cpumask_setall(cpus); /* * When power savings policy is enabled for the parent domain, idle @@@ -3663,71 -3660,17 +3666,71 @@@ redo double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), *cpus); - if (!cpus_empty(*cpus)) + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) goto redo; } } if (!ld_moved) { + int active_balance = 0; + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; + + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return -1; + + if (sd->nr_balance_failed++ < 2) + return -1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package. The same method used to move task in load_balance() + * have been extended for load_balance_newidle() to speedup + * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) + * + * The package power saving logic comes from + * find_busiest_group(). If there are no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there are no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). 
+ * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. + */ + + /* Lock busiest in correct order while this_rq is held */ + double_lock_balance(this_rq, busiest); + + /* + * don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + double_unlock_balance(this_rq, busiest); + all_pinned = 1; + return ld_moved; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + + double_unlock_balance(this_rq, busiest); + if (active_balance) + wake_up_process(busiest->migration_thread); + } else sd->nr_balance_failed = 0; @@@ -3753,10 -3696,7 +3756,10 @@@ static void idle_balance(int this_cpu, struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_t tmpmask; + cpumask_var_t tmpmask; + + if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) + return; for_each_domain(this_cpu, sd) { unsigned long interval; @@@ -3767,7 -3707,7 +3770,7 @@@ if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, &tmpmask); + sd, tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@@ -3782,7 -3722,6 +3785,7 @@@ */ this_rq->next_balance = next_balance; } + free_cpumask_var(tmpmask); } /* @@@ -3820,7 -3759,7 +3823,7 @@@ static void active_load_balance(struct /* Search for an sd spanning us and the target CPU. 
*/ for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } @@@ -3839,9 -3778,10 +3842,9 @@@ #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_var_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), - .cpu_mask = CPU_MASK_NONE, }; /* @@@ -3869,7 -3809,7 +3872,7 @@@ int select_nohz_load_balancer(int stop_ int cpu = smp_processor_id(); if (stop_tick) { - cpu_set(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; /* @@@ -3883,7 -3823,7 +3886,7 @@@ } /* time for ilb owner also to sleep */ - if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@@ -3896,10 -3836,10 +3899,10 @@@ } else if (atomic_read(&nohz.load_balancer) == cpu) return 1; } else { - if (!cpu_isset(cpu, nohz.cpu_mask)) + if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); if (atomic_read(&nohz.load_balancer) == cpu) if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) @@@ -3927,11 -3867,7 +3930,11 @@@ static void rebalance_domains(int cpu, unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_t tmp; + cpumask_var_t tmp; + + /* Fails alloc? Rebalancing probably not a priority right now. 
*/ + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) + return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@@ -3956,7 -3892,7 +3959,7 @@@ } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@@ -3990,8 -3926,6 +3993,8 @@@ out */ if (likely(update_next_balance)) rq->next_balance = next_balance; + + free_cpumask_var(tmp); } /* @@@ -4016,13 -3950,12 +4019,13 @@@ static void run_rebalance_domains(struc */ if (this_rq->idle_at_tick && atomic_read(&nohz.load_balancer) == this_cpu) { - cpumask_t cpus = nohz.cpu_mask; struct rq *rq; int balance_cpu; - cpu_clear(this_cpu, cpus); - for_each_cpu_mask_nr(balance_cpu, cpus) { + for_each_cpu(balance_cpu, nohz.cpu_mask) { + if (balance_cpu == this_cpu) + continue; + /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load @@@ -4060,7 -3993,7 +4063,7 @@@ static inline void trigger_load_balance rq->in_nohz_recently = 0; if (atomic_read(&nohz.load_balancer) == cpu) { - cpu_clear(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.cpu_mask); atomic_set(&nohz.load_balancer, -1); } @@@ -4073,7 -4006,7 +4076,7 @@@ * TBD: Traverse the sched domains and nominate * the nearest cpu in the nohz.cpu_mask. */ - int ilb = first_cpu(nohz.cpu_mask); + int ilb = cpumask_first(nohz.cpu_mask); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@@ -4085,7 -4018,7 +4088,7 @@@ * cpus with ticks stopped, is it time for that to stop? 
*/ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { resched_cpu(cpu); return; } @@@ -4095,7 -4028,7 +4098,7 @@@ * someone else, then no need raise the SCHED_SOFTIRQ */ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpu_isset(cpu, nohz.cpu_mask)) + cpumask_test_cpu(cpu, nohz.cpu_mask)) return; #endif if (time_after_eq(jiffies, rq->next_balance)) @@@ -5187,6 -5120,22 +5190,22 @@@ __setscheduler(struct rq *rq, struct ta set_load_weight(p); } + /* + * check the target process has a UID that matches the current process's + */ + static bool check_same_owner(struct task_struct *p) + { + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; + } + static int __sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param, bool user) { @@@ -5246,8 -5195,7 +5265,7 @@@ recheck return -EPERM; /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) + if (!check_same_owner(p)) return -EPERM; } @@@ -5453,9 -5401,10 +5471,9 @@@ out_unlock return retval; } -long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { - cpumask_t cpus_allowed; - cpumask_t new_mask = *in_mask; + cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; @@@ -5477,58 -5426,45 +5495,57 @@@ get_task_struct(p); read_unlock(&tasklist_lock); + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) + if 
(!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); if (retval) goto out_unlock; - cpuset_cpus_allowed(p, &cpus_allowed); - cpus_and(new_mask, new_mask, cpus_allowed); + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); again: - retval = set_cpus_allowed_ptr(p, &new_mask); + retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { - cpuset_cpus_allowed(p, &cpus_allowed); - if (!cpus_subset(new_mask, cpus_allowed)) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset * update. Just reset the cpus_allowed to the * cpuset's cpus_allowed */ - new_mask = cpus_allowed; + cpumask_copy(new_mask, cpus_allowed); goto again; } } out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: put_task_struct(p); put_online_cpus(); return retval; } static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - cpumask_t *new_mask) + struct cpumask *new_mask) { - if (len < sizeof(cpumask_t)) { - memset(new_mask, 0, sizeof(cpumask_t)); - } else if (len > sizeof(cpumask_t)) { - len = sizeof(cpumask_t); - } + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; } @@@ -5541,20 -5477,17 +5558,20 @@@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - cpumask_t new_mask; + cpumask_var_t new_mask; int retval; - retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); - if (retval) - return retval; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; - return sched_setaffinity(pid, &new_mask); + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } -long sched_getaffinity(pid_t pid, cpumask_t *mask) +long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; int retval; @@@ -5571,7 -5504,7 +5588,7 @@@ if (retval) goto out_unlock; - cpus_and(*mask, p->cpus_allowed, cpu_online_map); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); out_unlock: read_unlock(&tasklist_lock); @@@ -5590,24 -5523,19 +5607,24 @@@ asmlinkage long sys_sched_getaffinity(p unsigned long __user *user_mask_ptr) { int ret; - cpumask_t mask; + cpumask_var_t mask; - if (len < sizeof(cpumask_t)) + if (len < cpumask_size()) return -EINVAL; - ret = sched_getaffinity(pid, &mask); - if (ret < 0) - return ret; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; - if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) - return -EFAULT; + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + ret = -EFAULT; + else + ret = cpumask_size(); + } + free_cpumask_var(mask); - return sizeof(cpumask_t); + return ret; } /** @@@ -5949,7 -5877,7 +5966,7 @@@ void __cpuinit init_idle(struct task_st idle->se.exec_start = sched_clock(); idle->prio = idle->normal_prio = MAX_PRIO; - idle->cpus_allowed = cpumask_of_cpu(cpu); + cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); rq->curr = rq->idle = idle; @@@ -5976,9 -5904,9 +5993,9 @@@ * indicates which cpus 
entered this state. This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_MASK_NONE. + * always be CPU_BITS_NONE. */ -cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +cpumask_var_t nohz_cpu_mask; /* * Increase the granularity value when there are more CPUs, @@@ -6033,7 -5961,7 +6050,7 @@@ static inline void sched_init_granulari * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { struct migration_req req; unsigned long flags; @@@ -6041,13 -5969,13 +6058,13 @@@ int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(*new_mask, cpu_online_map)) { + if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpus_equal(p->cpus_allowed, *new_mask))) { + !cpumask_equal(&p->cpus_allowed, new_mask))) { ret = -EINVAL; goto out; } @@@ -6055,15 -5983,15 +6072,15 @@@ if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = *new_mask; - p->rt.nr_cpus_allowed = cpus_weight(*new_mask); + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), *new_mask)) + if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, any_online_cpu(*new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. 
*/ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@@ -6105,7 -6033,7 +6122,7 @@@ static int __migrate_task(struct task_s if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpu_isset(dest_cpu, p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; on_rq = p->se.on_rq; @@@ -6202,43 -6130,50 +6219,43 @@@ static int __migrate_task_irq(struct ta */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - unsigned long flags; - cpumask_t mask; - struct rq *rq; int dest_cpu; + /* FIXME: Use cpumask_of_node here. */ + cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); + const struct cpumask *nodemask = &_nodemask; + +again: + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto move; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + if (dest_cpu < nr_cpu_ids) + goto move; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); - do { - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu >= nr_cpu_ids) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed; - - cpuset_cpus_allowed_locked(p, &cpus_allowed); - /* - * Try to stay on the same cpuset, where the - * current cpuset may be a subset of all cpus. - * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be - * called within calls to cpuset_lock/cpuset_unlock. 
- */ - rq = task_rq_lock(p, &flags); - p->cpus_allowed = cpus_allowed; - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } - } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); + } + +move: + /* It can have affinity changed while we were choosing. */ + if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + goto again; } /* @@@ -6250,7 -6185,7 +6267,7 @@@ */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); unsigned long flags; local_irq_save(flags); @@@ -6540,7 -6475,7 +6557,7 @@@ static void set_rq_online(struct rq *rq if (!rq->online) { const struct sched_class *class; - cpu_set(rq->cpu, rq->rd->online); + cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; for_each_class(class) { @@@ -6560,7 -6495,7 +6577,7 @@@ static void set_rq_offline(struct rq *r class->rq_offline(rq); } - cpu_clear(rq->cpu, rq->rd->online); + cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; } } @@@ -6601,7 -6536,7 +6618,7 @@@ migration_call(struct notifier_block *n rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } @@@ -6615,7 -6550,7 +6632,7 @@@ break; /* Unbind it from offline cpu so it can run. Fall thru. 
*/ kthread_bind(cpu_rq(cpu)->migration_thread, - any_online_cpu(cpu_online_map)); + cpumask_any(cpu_online_mask)); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; @@@ -6665,7 -6600,7 +6682,7 @@@ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); @@@ -6704,13 -6639,13 +6721,13 @@@ early_initcall(migration_init) #ifdef CONFIG_SCHED_DEBUG static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, - cpumask_t *groupmask) + struct cpumask *groupmask) { struct sched_group *group = sd->groups; char str[256]; - cpulist_scnprintf(str, sizeof(str), &sd->span); - cpus_clear(*groupmask); + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@@ -6724,11 -6659,11 +6741,11 @@@ printk(KERN_CONT "span %s level %s\n", str, sd->name); - if (!cpu_isset(cpu, sd->span)) { + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpu_isset(cpu, group->cpumask)) { + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@@ -6748,32 -6683,31 +6765,32 @@@ break; } - if (!cpus_weight(group->cpumask)) { + if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } - if (cpus_intersects(*groupmask, group->cpumask)) { + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(*groupmask, *groupmask, group->cpumask); + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), &group->cpumask); + cpulist_scnprintf(str, sizeof(str), 
sched_group_cpus(group)); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, *groupmask)) + if (!cpumask_equal(sched_domain_span(sd), groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@@ -6781,7 -6715,7 +6798,7 @@@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_t *groupmask; + cpumask_var_t groupmask; int level = 0; if (!sd) { @@@ -6791,7 -6725,8 +6808,7 @@@ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); - if (!groupmask) { + if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); return; } @@@ -6804,7 -6739,7 +6821,7 @@@ if (!sd) break; } - kfree(groupmask); + free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@@ -6812,7 -6747,7 +6829,7 @@@ static int sd_degenerate(struct sched_domain *sd) { - if (cpus_weight(sd->span) == 1) + if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ @@@ -6843,7 -6778,7 +6860,7 @@@ sd_parent_degenerate(struct sched_domai if (sd_degenerate(parent)) return 1; - if (!cpus_equal(sd->span, parent->span)) + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; /* Does parent contain flags not in child? 
*/ @@@ -6867,16 -6802,6 +6884,16 @@@ return 1; } +static void free_rootdomain(struct root_domain *rd) +{ + cpupri_cleanup(&rd->cpupri); + + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; @@@ -6886,63 -6811,38 +6903,63 @@@ if (rq->rd) { struct root_domain *old_rd = rq->rd; - if (cpu_isset(rq->cpu, old_rd->online)) + if (cpumask_test_cpu(rq->cpu, old_rd->online)) set_rq_offline(rq); - cpu_clear(rq->cpu, old_rd->span); + cpumask_clear_cpu(rq->cpu, old_rd->span); if (atomic_dec_and_test(&old_rd->refcount)) - kfree(old_rd); + free_rootdomain(old_rd); } atomic_inc(&rd->refcount); rq->rd = rd; - cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } -static void init_rootdomain(struct root_domain *rd) +static int init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); - cpus_clear(rd->span); - cpus_clear(rd->online); + if (bootmem) { + alloc_bootmem_cpumask_var(&def_root_domain.span); + alloc_bootmem_cpumask_var(&def_root_domain.online); + alloc_bootmem_cpumask_var(&def_root_domain.rto_mask); + cpupri_init(&rd->cpupri, true); + return 0; + } + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto free_rd; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri, false) != 0) + goto free_rto_mask; + return 0; - cpupri_init(&rd->cpupri); +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +free_rd: + kfree(rd); + return -ENOMEM; } static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain); + 
init_rootdomain(&def_root_domain, true); + atomic_set(&def_root_domain.refcount, 1); } @@@ -6954,10 -6854,7 +6971,10 @@@ static struct root_domain *alloc_rootdo if (!rd) return NULL; - init_rootdomain(rd); + if (init_rootdomain(rd, false) != 0) { + kfree(rd); + return NULL; + } return rd; } @@@ -6999,12 -6896,19 +7016,12 @@@ cpu_attach_domain(struct sched_domain * } /* cpus with isolated domains */ -static cpumask_t cpu_isolated_map = CPU_MASK_NONE; +static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { - static int __initdata ints[NR_CPUS]; - int i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - cpus_clear(cpu_isolated_map); - for (i = 1; i <= ints[0]; i++) - if (ints[i] < NR_CPUS) - cpu_set(ints[i], cpu_isolated_map); + cpulist_parse(str, cpu_isolated_map); return 1; } @@@ -7013,43 -6917,42 +7030,43 @@@ __setup("isolcpus=", isolated_cpu_setup /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS - * (due to the fact that we keep track of groups covered with a cpumask_t). + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. 
*/ static void -init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, - int (*group_fn)(int cpu, const cpumask_t *cpu_map, +init_sched_build_groups(const struct cpumask *span, + const struct cpumask *cpu_map, + int (*group_fn)(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, - cpumask_t *tmpmask), - cpumask_t *covered, cpumask_t *tmpmask) + struct cpumask *tmpmask), + struct cpumask *covered, struct cpumask *tmpmask) { struct sched_group *first = NULL, *last = NULL; int i; - cpus_clear(*covered); + cpumask_clear(covered); - for_each_cpu_mask_nr(i, *span) { + for_each_cpu(i, span) { struct sched_group *sg; int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, *covered)) + if (cpumask_test_cpu(i, covered)) continue; - cpus_clear(sg->cpumask); + cpumask_clear(sched_group_cpus(sg)); sg->__cpu_power = 0; - for_each_cpu_mask_nr(j, *span) { + for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, *covered); - cpu_set(j, sg->cpumask); + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); } if (!first) first = sg; @@@ -7113,10 -7016,9 +7130,10 @@@ static int find_next_best_node(int node * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static void sched_domain_node_span(int node, cpumask_t *span) +static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; + /* FIXME: use cpumask_of_node() */ node_to_cpumask_ptr(nodemask, node); int i; @@@ -7137,34 -7039,19 +7154,34 @@@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; +/* + * The cpus mask in sched_group and sched_domain hangs off the end. + * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space + * for nr_cpu_ids < CONFIG_NR_CPUS. 
+ */ +struct static_sched_group { + struct sched_group sg; + DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); +}; + +struct static_sched_domain { + struct sched_domain sd; + DECLARE_BITMAP(span, CONFIG_NR_CPUS); +}; + /* * SMT sched-domains: */ #ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) +cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu); + *sg = &per_cpu(sched_group_cpus, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@@ -7173,55 -7060,56 +7190,55 @@@ * multi-core sched-domains: */ #ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct sched_domain, core_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_core); +static DEFINE_PER_CPU(struct static_sched_domain, core_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); #endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); if (sg) - *sg = &per_cpu(sched_group_core, group); + *sg = &per_cpu(sched_group_core, group).sg; return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *unused) 
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_core, cpu); + *sg = &per_cpu(sched_group_core, cpu).sg; return cpu; } #endif -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_phys); +static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, - cpumask_t *mask) +cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { int group; #ifdef CONFIG_SCHED_MC - *mask = *cpu_coregroup_mask(cpu); + /* FIXME: Use cpu_coregroup_mask. */ + *mask = cpu_coregroup_map(cpu); cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - *mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(*mask, *mask, *cpu_map); - group = first_cpu(*mask); + cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + group = cpumask_first(mask); #else group = cpu; #endif if (sg) - *sg = &per_cpu(sched_group_phys, group); + *sg = &per_cpu(sched_group_phys, group).sg; return group; } @@@ -7235,21 -7123,19 +7252,21 @@@ static DEFINE_PER_CPU(struct sched_doma static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg, cpumask_t *nodemask) +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { int group; + /* FIXME: use cpumask_of_node */ + node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - *nodemask = 
node_to_cpumask(cpu_to_node(cpu)); - cpus_and(*nodemask, *nodemask, *cpu_map); - group = first_cpu(*nodemask); + cpumask_and(nodemask, pnodemask, cpu_map); + group = cpumask_first(nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group); + *sg = &per_cpu(sched_group_allnodes, group).sg; return group; } @@@ -7261,11 -7147,11 +7278,11 @@@ static void init_numa_sched_groups_powe if (!sg) return; do { - for_each_cpu_mask_nr(j, sg->cpumask) { + for_each_cpu(j, sched_group_cpus(sg)) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { + sd = &per_cpu(phys_domains, j).sd; + if (j != cpumask_first(sched_group_cpus(sd->groups))) { /* * Only add "power" once for each * physical package. @@@ -7282,12 -7168,11 +7299,12 @@@ #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { int cpu, i; - for_each_cpu_mask_nr(cpu, *cpu_map) { + for_each_cpu(cpu, cpu_map) { struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; @@@ -7296,11 -7181,10 +7313,11 @@@ for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; + /* FIXME: Use cpumask_of_node */ + node_to_cpumask_ptr(pnodemask, i); - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + cpus_and(*nodemask, *pnodemask, *cpu_map); + if (cpumask_empty(nodemask)) continue; if (sg == NULL) @@@ -7318,8 -7202,7 +7335,8 @@@ next_sg } } #else /* !CONFIG_NUMA */ -static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) +static void free_sched_groups(const struct cpumask *cpu_map, + struct cpumask *nodemask) { } #endif /* CONFIG_NUMA */ @@@ -7345,7 -7228,7 +7362,7 @@@ static void init_sched_groups_power(in WARN_ON(!sd || !sd->groups); - if (cpu != 
first_cpu(sd->groups->cpumask)) + if (cpu != cpumask_first(sched_group_cpus(sd->groups))) return; child = sd->child; @@@ -7410,6 -7293,48 +7427,6 @@@ SD_INIT_FUNC(CPU SD_INIT_FUNC(MC) #endif -/* - * To minimize stack usage kmalloc room for cpumasks and share the - * space as the usage in build_sched_domains() dictates. Used only - * if the amount of space is significant. - */ -struct allmasks { - cpumask_t tmpmask; /* make this one first */ - union { - cpumask_t nodemask; - cpumask_t this_sibling_map; - cpumask_t this_core_map; - }; - cpumask_t send_covered; - -#ifdef CONFIG_NUMA - cpumask_t domainspan; - cpumask_t covered; - cpumask_t notcovered; -#endif -}; - -#if NR_CPUS > 128 -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ - *masks = kmalloc(sizeof(**masks), GFP_KERNEL); -} -static inline void sched_cpumask_free(struct allmasks *masks) -{ - kfree(masks); -} -#else -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v -static inline void sched_cpumask_alloc(struct allmasks **masks) -{ } -static inline void sched_cpumask_free(struct allmasks *masks) -{ } -#endif - -#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ - ((unsigned long)(a) + offsetof(struct allmasks, v)) - static int default_relax_domain_level = -1; static int __init setup_relax_domain_level(char *str) @@@ -7449,38 -7374,17 +7466,38 @@@ static void set_domain_attribute(struc * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const cpumask_t *cpu_map, +static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - int i; + int i, err = -ENOMEM; struct root_domain *rd; - SCHED_CPUMASK_DECLARE(allmasks); - cpumask_t *tmpmask; + cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, + tmpmask; #ifdef CONFIG_NUMA + cpumask_var_t domainspan, covered, notcovered; struct 
sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; + if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + goto free_domainspan; + if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) + goto free_covered; +#endif + + if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + goto free_notcovered; + if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + goto free_nodemask; + if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + goto free_this_sibling_map; + if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + goto free_this_core_map; + if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + goto free_send_covered; + +#ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ @@@ -7488,37 -7392,54 +7505,37 @@@ GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return -ENOMEM; + goto free_tmpmask; } #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); -#ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; + goto free_sched_groups; } - /* get space for all scratch cpumask variables */ - sched_cpumask_alloc(&allmasks); - if (!allmasks) { - printk(KERN_WARNING "Cannot alloc cpumask array\n"); - kfree(rd); #ifdef CONFIG_NUMA - kfree(sched_group_nodes); -#endif - return -ENOMEM; - } - - tmpmask = (cpumask_t *)allmasks; - - -#ifdef CONFIG_NUMA - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; #endif /* * Set up domains for cpus specified by the cpu_map. 
*/ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - SCHED_CPUMASK_VAR(nodemask, allmasks); + /* FIXME: use cpumask_of_node */ *nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA - if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); - sd->span = *cpu_map; + cpumask_copy(sched_domain_span(sd), cpu_map); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@@ -7528,19 -7449,18 +7545,19 @@@ sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), &sd->span); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = p; if (p) p->child = sd; - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); #endif p = sd; - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - sd->span = *nodemask; + cpumask_copy(sched_domain_span(sd), nodemask); sd->parent = p; if (p) p->child = sd; @@@ -7548,12 -7468,11 +7565,12 @@@ #ifdef CONFIG_SCHED_MC p = sd; - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); - sd->span = *cpu_coregroup_mask(i); - cpus_and(sd->span, sd->span, *cpu_map); + *sched_domain_span(sd) = cpu_coregroup_map(i); + cpumask_and(sched_domain_span(sd), + sched_domain_span(sd), cpu_map); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@@ -7561,11 -7480,11 +7578,11 @@@ #ifdef CONFIG_SCHED_SMT p = sd; - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); - sd->span = 
per_cpu(cpu_sibling_map, i); - cpus_and(sd->span, sd->span, *cpu_map); + cpumask_and(sched_domain_span(sd), + &per_cpu(cpu_sibling_map, i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@@ -7574,10 -7493,13 +7591,10 @@@ #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_sibling_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); - if (i != first_cpu(*this_sibling_map)) + for_each_cpu(i, cpu_map) { + cpumask_and(this_sibling_map, + &per_cpu(cpu_sibling_map, i), cpu_map); + if (i != cpumask_first(this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, @@@ -7588,11 -7510,13 +7605,11 @@@ #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ - for_each_cpu_mask_nr(i, *cpu_map) { - SCHED_CPUMASK_VAR(this_core_map, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - - *this_core_map = *cpu_coregroup_mask(i); + for_each_cpu(i, cpu_map) { + /* FIXME: Use cpu_coregroup_mask */ + *this_core_map = cpu_coregroup_map(i); cpus_and(*this_core_map, *this_core_map, *cpu_map); - if (i != first_cpu(*this_core_map)) + if (i != cpumask_first(this_core_map)) continue; init_sched_build_groups(this_core_map, cpu_map, @@@ -7603,10 -7527,12 +7620,10 @@@ /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(send_covered, allmasks); - + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) + if (cpumask_empty(nodemask)) continue; init_sched_build_groups(nodemask, cpu_map, @@@ -7617,6 -7543,8 +7634,6 @@@ #ifdef CONFIG_NUMA /* Set up node groups */ if (sd_allnodes) { - SCHED_CPUMASK_VAR(send_covered, allmasks); - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 
send_covered, tmpmask); @@@ -7625,58 -7553,58 +7642,58 @@@ for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - SCHED_CPUMASK_VAR(nodemask, allmasks); - SCHED_CPUMASK_VAR(domainspan, allmasks); - SCHED_CPUMASK_VAR(covered, allmasks); int j; + /* FIXME: Use cpumask_of_node */ *nodemask = node_to_cpumask(i); - cpus_clear(*covered); + cpumask_clear(covered); cpus_and(*nodemask, *nodemask, *cpu_map); - if (cpus_empty(*nodemask)) { + if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; } sched_domain_node_span(i, domainspan); - cpus_and(*domainspan, *domainspan, *cpu_map); + cpumask_and(domainspan, domainspan, cpu_map); - sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for " "node %d\n", i); goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask_nr(j, *nodemask) { + for_each_cpu(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = *nodemask; + cpumask_copy(sched_group_cpus(sg), nodemask); sg->next = sg; - cpus_or(*covered, *covered, *nodemask); + cpumask_or(covered, covered, nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { - SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % nr_node_ids; + /* FIXME: Use cpumask_of_node */ node_to_cpumask_ptr(pnodemask, n); - cpus_complement(*notcovered, *covered); - cpus_and(*tmpmask, *notcovered, *cpu_map); - cpus_and(*tmpmask, *tmpmask, *domainspan); - if (cpus_empty(*tmpmask)) + cpumask_complement(notcovered, covered); + cpumask_and(tmpmask, notcovered, cpu_map); + cpumask_and(tmpmask, tmpmask, domainspan); + if (cpumask_empty(tmpmask)) break; - cpus_and(*tmpmask, *tmpmask, *pnodemask); - if (cpus_empty(*tmpmask)) + cpumask_and(tmpmask, tmpmask, pnodemask); + if (cpumask_empty(tmpmask)) continue; - sg = 
kmalloc_node(sizeof(struct sched_group), + sg = kmalloc_node(sizeof(struct sched_group) + + cpumask_size(), GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING @@@ -7684,9 -7612,9 +7701,9 @@@ goto error; } sg->__cpu_power = 0; - sg->cpumask = *tmpmask; + cpumask_copy(sched_group_cpus(sg), tmpmask); sg->next = prev->next; - cpus_or(*covered, *covered, *tmpmask); + cpumask_or(covered, covered, tmpmask); prev->next = sg; prev = sg; } @@@ -7695,22 -7623,22 +7712,22 @@@ /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif - for_each_cpu_mask_nr(i, *cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i); + for_each_cpu(i, cpu_map) { + struct sched_domain *sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@@ -7722,78 -7650,53 +7739,78 @@@ if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, tmpmask); init_numa_sched_groups_power(sg); } #endif /* Attach the domains */ - for_each_cpu_mask_nr(i, *cpu_map) { + for_each_cpu(i, cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); + sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i); + sd = &per_cpu(core_domains, i).sd; #else - sd = &per_cpu(phys_domains, i); + sd = &per_cpu(phys_domains, i).sd; #endif cpu_attach_domain(sd, rd, i); } - sched_cpumask_free(allmasks); - return 0; + err = 0; + +free_tmpmask: + 
free_cpumask_var(tmpmask); +free_send_covered: + free_cpumask_var(send_covered); +free_this_core_map: + free_cpumask_var(this_core_map); +free_this_sibling_map: + free_cpumask_var(this_sibling_map); +free_nodemask: + free_cpumask_var(nodemask); +free_notcovered: +#ifdef CONFIG_NUMA + free_cpumask_var(notcovered); +free_covered: + free_cpumask_var(covered); +free_domainspan: + free_cpumask_var(domainspan); +out: +#endif + return err; + +free_sched_groups: +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + goto free_tmpmask; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - sched_cpumask_free(allmasks); - kfree(rd); - return -ENOMEM; + free_rootdomain(rd); + goto free_tmpmask; #endif } -static int build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const struct cpumask *cpu_map) { return __build_sched_domains(cpu_map, NULL); } -static cpumask_t *doms_cur; /* current sched domains */ +static struct cpumask *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of - * cpumask_t) fails, then fallback to a single sched domain, - * as determined by the single cpumask_t fallback_doms. + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. */ -static cpumask_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@@ -7810,16 -7713,16 +7827,16 @@@ int __attribute__((weak)) arch_update_c * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. 
*/ -static int arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const struct cpumask *cpu_map) { int err; arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms_cur) - doms_cur = &fallback_doms; - cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + doms_cur = fallback_doms; + cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@@ -7827,8 -7730,8 +7844,8 @@@ return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map, - cpumask_t *tmpmask) +static void arch_destroy_sched_domains(const struct cpumask *cpu_map, + struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); } @@@ -7837,16 -7740,15 +7854,16 @@@ * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ -static void detach_destroy_domains(const cpumask_t *cpu_map) +static void detach_destroy_domains(const struct cpumask *cpu_map) { - cpumask_t tmpmask; + /* Save because hotplug lock held. */ + static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; - for_each_cpu_mask_nr(i, *cpu_map) + for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, &tmpmask); + arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@@ -7871,7 -7773,7 +7888,7 @@@ static int dattrs_equal(struct sched_do * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. 
If the same cpumask appears both in the @@@ -7885,14 -7787,13 +7902,14 @@@ * the single partition 'fallback_doms', it also forces the domains * to be rebuilt. * - * If doms_new == NULL it will be replaced with cpu_online_map. + * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, * and it will not create the default domain. * * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, +/* FIXME: Change to struct cpumask *doms_new[] */ +void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; @@@ -7911,7 -7812,7 +7928,7 @@@ /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpus_equal(doms_cur[i], doms_new[j]) + if (cpumask_equal(&doms_cur[i], &doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } @@@ -7923,15 -7824,15 +7940,15 @@@ match1 if (doms_new == NULL) { ndoms_cur = 0; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + doms_new = fallback_doms; + cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpus_equal(doms_new[i], doms_cur[j]) + if (cpumask_equal(&doms_new[i], &doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } @@@ -7943,7 -7844,7 +7960,7 @@@ match2 } /* Remember the new sched domains */ - if (doms_cur != &fallback_doms) + if (doms_cur != fallback_doms) kfree(doms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; @@@ -7972,25 -7873,14 +7989,25 @@@ int arch_reinit_sched_domains(void static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { int ret; + unsigned int level = 0; - if (buf[0] != '0' && buf[0] != '1') + if (sscanf(buf, 
"%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? + */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) return -EINVAL; if (smt) - sched_smt_power_savings = (buf[0] == '1'); + sched_smt_power_savings = level; else - sched_mc_power_savings = (buf[0] == '1'); + sched_mc_power_savings = level; ret = arch_reinit_sched_domains(); @@@ -8094,9 -7984,7 +8111,9 @@@ static int update_runtime(struct notifi void __init sched_init_smp(void) { - cpumask_t non_isolated_cpus; + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); #if defined(CONFIG_NUMA) sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), @@@ -8105,10 -7993,10 +8122,10 @@@ #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); - if (cpus_empty(non_isolated_cpus)) - cpu_set(smp_processor_id(), non_isolated_cpus); + arch_init_sched_domains(cpu_online_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@@ -8123,13 -8011,9 +8140,13 @@@ init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + init_sched_rt_class(); } #else void __init sched_init_smp(void) @@@ -8444,15 -8328,6 +8461,15 @@@ void __init sched_init(void */ current->sched_class = &fair_sched_class; + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + alloc_bootmem_cpumask_var(&nohz_cpu_mask); 
+#ifdef CONFIG_SMP +#ifdef CONFIG_NO_HZ + alloc_bootmem_cpumask_var(&nohz.cpu_mask); +#endif + alloc_bootmem_cpumask_var(&cpu_isolated_map); +#endif /* SMP */ + scheduler_running = 1; } @@@ -9423,6 -9298,41 +9440,41 @@@ cpuacct_destroy(struct cgroup_subsys *s kfree(ca); } + static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) + { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + + #ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); + #else + data = *cpuusage; + #endif + + return data; + } + + static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) + { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + + #ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); + #else + *cpuusage = val; + #endif + } + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@@ -9430,17 -9340,8 +9482,8 @@@ u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. 
- */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@@ -9457,23 -9358,39 +9500,39 @@@ static int cpuusage_write(struct cgrou goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } + static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) + { + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; + } + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) diff --combined kernel/sched_fair.c index 36b5e34fa99,5ad4440f0fc..56c0efe902a --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@@ -492,6 -492,8 +492,8 @@@ static void update_curr(struct cfs_rq * * overflow on 32 bits): */ delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; @@@ -1017,33 -1019,16 +1019,33 @@@ static void yield_task_fair(struct rq * * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_map) + * hence we need to mask them out (cpu_active_mask) * * Returns the CPU we should wake onto. 
*/ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, struct task_struct *p) { - cpumask_t tmp; struct sched_domain *sd; int i; + unsigned int chosen_wakeup_cpu; + int this_cpu; + + /* + * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu + * are idle and this is not a kernel thread and this task's affinity + * allows it to be moved to preferred cpu, then just move! + */ + + this_cpu = smp_processor_id(); + chosen_wakeup_cpu = + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && + idle_cpu(cpu) && idle_cpu(this_cpu) && + p->mm && !(p->flags & PF_KTHREAD) && + cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) + return chosen_wakeup_cpu; /* * If it is idle, then it is the best cpu to run this task. @@@ -1061,9 -1046,10 +1063,9 @@@ if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { - cpus_and(tmp, sd->span, p->cpus_allowed); - cpus_and(tmp, tmp, cpu_active_map); - for_each_cpu_mask_nr(i, tmp) { - if (idle_cpu(i)) { + for_each_cpu_and(i, sched_domain_span(sd), + &p->cpus_allowed) { + if (cpu_active(i) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@@ -1256,13 -1242,13 +1258,13 @@@ static int select_task_rq_fair(struct t * this_cpu and prev_cpu are present in: */ for_each_domain(this_cpu, sd) { - if (cpu_isset(prev_cpu, sd->span)) { + if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { this_sd = sd; break; } } - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) goto out; /* @@@ -1361,12 -1347,11 +1363,11 @@@ static void check_preempt_wakeup(struc { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); - if (unlikely(rt_prio(p->prio))) { - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + update_curr(cfs_rq); - update_rq_clock(rq); - 
update_curr(cfs_rq); + if (unlikely(rt_prio(p->prio))) { resched_task(curr); return; } diff --combined kernel/sched_rt.c index 1bbd9901401,51d2af3e619..833b6d44483 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@@ -15,7 -15,7 +15,7 @@@ static inline void rt_set_overload(stru if (!rq->online) return; - cpu_set(rq->cpu, rq->rd->rto_mask); + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@@ -34,7 -34,7 +34,7 @@@ static inline void rt_clear_overload(st /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); - cpu_clear(rq->cpu, rq->rd->rto_mask); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) @@@ -77,7 -77,7 +77,7 @@@ static inline u64 sched_rt_period(struc } #define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { @@@ -139,14 -139,14 +139,14 @@@ static int rt_se_boosted(struct sched_r } #ifdef CONFIG_SMP -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_rq(smp_processor_id())->rd->span; } #else -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } #endif @@@ -212,9 -212,9 +212,9 @@@ static inline int rt_rq_throttled(struc return rt_rq->rt_throttled; } -static inline cpumask_t sched_rt_period_mask(void) +static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_online_map; + return cpu_online_mask; } static inline @@@ -241,11 -241,11 +241,11 @@@ static int do_balance_runtime(struct rt int i, weight, more = 0; u64 rt_period; - weight = cpus_weight(rd->span); + weight = cpumask_weight(rd->span); 
spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); - for_each_cpu_mask_nr(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@@ -324,7 -324,7 +324,7 @@@ static void __disable_runtime(struct r /* * Greedy reclaim, take back as much as we can. */ - for_each_cpu_mask(i, rd->span) { + for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; @@@ -429,13 -429,13 +429,13 @@@ static inline int balance_runtime(struc static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; - cpumask_t span; + const struct cpumask *span; if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); @@@ -805,20 -805,17 +805,20 @@@ static int select_task_rq_rt(struct tas static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_t mask; + cpumask_var_t mask; if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, &mask)) + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) - return; + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, mask)) + goto free; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) + goto free; /* * There appears to be other cpus that can accept @@@ -827,8 -824,6 +827,8 @@@ */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); +free: + free_cpumask_var(mask); } #endif /* CONFIG_SMP */ @@@ -919,7 -914,7 +919,7 @@@ static void deactivate_task(struct rq * static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) 
&& (p->rt.nr_cpus_allowed > 1)) return 1; return 0; @@@ -958,7 -953,7 +958,7 @@@ static struct task_struct *pick_next_hi return next; } -static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { @@@ -978,7 -973,7 +978,7 @@@ static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@@ -993,7 -988,7 +993,7 @@@ * I guess we might want to change cpupri_find() to ignore those * in the first place. */ - cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * At this point we have built a mask of cpus representing the @@@ -1003,7 -998,7 +1003,7 @@@ * We prioritize the last cpu that the task executed on since * it is most likely cache-hot in that location. */ - if (cpu_isset(cpu, *lowest_mask)) + if (cpumask_test_cpu(cpu, lowest_mask)) return cpu; /* @@@ -1018,8 -1013,7 +1018,8 @@@ cpumask_t domain_mask; int best_cpu; - cpus_and(domain_mask, sd->span, *lowest_mask); + cpumask_and(&domain_mask, sched_domain_span(sd), + lowest_mask); best_cpu = pick_optimal_cpu(this_cpu, &domain_mask); @@@ -1060,8 -1054,8 +1060,8 @@@ static struct rq *find_lock_lowest_rq(s * Also make sure that it wasn't scheduled on its rq. 
*/ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, - task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { @@@ -1182,7 -1176,7 +1182,7 @@@ static int pull_rt_task(struct rq *this next = pick_next_task_rt(this_rq); - for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@@ -1311,9 -1305,9 +1311,9 @@@ move_one_task_rt(struct rq *this_rq, in } static void set_cpus_allowed_rt(struct task_struct *p, - const cpumask_t *new_mask) + const struct cpumask *new_mask) { - int weight = cpus_weight(*new_mask); + int weight = cpumask_weight(new_mask); BUG_ON(!rt_task(p)); @@@ -1334,7 -1328,7 +1334,7 @@@ update_rt_migration(rq); } - p->cpus_allowed = *new_mask; + cpumask_copy(&p->cpus_allowed, new_mask); p->rt.nr_cpus_allowed = weight; } @@@ -1377,14 -1371,6 +1377,14 @@@ static void switched_from_rt(struct rq if (!rq->rt.rt_nr_running) pull_rt_task(rq); } + +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL); +} #endif /* CONFIG_SMP */ /* @@@ -1555,4 -1541,3 +1555,4 @@@ static void print_rt_stats(struct seq_f rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ + diff --combined kernel/sched_stats.h index 5fcf0e18458,b59fd9cdc1b..f2773b5d122 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@@ -31,7 -31,7 +31,7 @@@ static int show_schedstat(struct seq_fi rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, rq->sched_switch, rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, - rq->rq_sched_info.cpu_time, + rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@@ -42,8 -42,7 +42,8 @@@ for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, &sd->span); + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); 
seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { @@@ -124,7 -123,7 +124,7 @@@ static inline voi rq_sched_info_depart(struct rq *rq, unsigned long long delta) { if (rq) - rq->rq_sched_info.cpu_time += delta; + rq->rq_cpu_time += delta; } static inline void @@@ -237,7 -236,6 +237,6 @@@ static inline void sched_info_depart(st unsigned long long delta = task_rq(t)->clock - t->sched_info.last_arrival; - t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); if (t->state == TASK_RUNNING) diff --combined kernel/time/tick-sched.c index 70f872c71f4,8f3fc2582d3..76a574bbef9 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@@ -144,7 -144,7 +144,7 @@@ void tick_nohz_update_jiffies(void if (!ts->tick_stopped) return; - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); now = ktime_get(); ts->idle_waketime = now; @@@ -247,7 -247,7 +247,7 @@@ void tick_nohz_stop_sched_tick(int inid if (need_resched()) goto end; - if (unlikely(local_softirq_pending())) { + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; if (ratelimit < 10) { @@@ -282,8 -282,31 +282,31 @@@ /* Schedule the tick, if we are at least one jiffie off */ if ((long)delta_jiffies >= 1) { + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. 
+ */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + if (delta_jiffies > 1) - cpu_set(cpu, nohz_cpu_mask); + cpumask_set_cpu(cpu, nohz_cpu_mask); + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + goto out; + /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@@ -296,7 -319,7 +319,7 @@@ /* * sched tick not stopped! */ - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); goto out; } @@@ -306,17 -329,6 +329,6 @@@ rcu_enter_nohz(); } - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. - */ - if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->idle_sleeps++; /* @@@ -332,12 -344,7 +344,7 @@@ goto out; } - /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); + /* Mark expiries */ ts->idle_expires = expires; if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { @@@ -354,7 -361,7 +361,7 @@@ * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@@ -432,7 -439,7 +439,7 @@@ void tick_nohz_restart_sched_tick(void select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - cpu_clear(cpu, nohz_cpu_mask); + cpumask_clear_cpu(cpu, nohz_cpu_mask); /* * We stopped the tick in idle. 
Update process times would miss the @@@ -681,7 -688,6 +688,6 @@@ void tick_setup_sched_timer(void */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());