Merge branch 'linus' into tracing/mmiotrace-mergefixups

author Ingo Molnar <mingo@elte.hu>

Mon, 16 Jun 2008 09:16:46 +0000 (11:16 +0200)

committer Ingo Molnar <mingo@elte.hu>

Mon, 16 Jun 2008 09:16:46 +0000 (11:16 +0200)
author Ingo Molnar <mingo@elte.hu>
Mon, 16 Jun 2008 09:16:46 +0000 (11:16 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 16 Jun 2008 09:16:46 +0000 (11:16 +0200)
diff --combined Makefile

index b4a273f19b52dcbd07c4f148e59392f83c1dac1b,6923d669a4f6c01505d1158f868bd562e907774c..db164f69cffa0a57f641bd8a4518af27d9c1c7bb
--- 1/Makefile
--- 2/Makefile
+++ b/Makefile
@@@ -1,8 -1,8 +1,8 @@@
   VERSION = 2
   PATCHLEVEL = 6
   SUBLEVEL = 26
- EXTRAVERSION = -rc3
- NAME = Funky Weasel is Jiggy wit it
+ EXTRAVERSION = -rc6
+ NAME = Rotary Wombat
   
   # *DOCUMENTATION*
   # To see a list of typical targets execute "make help"
@@@ -528,10 -528,6 +528,10 @@@ KBUILD_CFLAGS    += -
   KBUILD_AFLAGS += -gdwarf-2
   endif
   
+ +ifdef CONFIG_FTRACE
+ +KBUILD_CFLAGS += -pg
+ +endif
+ +
   # We trigger additional mismatches with less inlining
   ifdef CONFIG_DEBUG_SECTION_MISMATCH
   KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
diff --combined arch/x86/Kconfig

index fc86c54e791ec848360b7ef9fa2de802b53ceeae,52e18e6d2ba0aae955bf1624e8370e18229fb861..b0937c03af3c02e1da9eb7d42bc1723a5edc32c4
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -23,22 -23,13 +23,15 @@@ config X8
         select HAVE_OPROFILE
         select HAVE_KPROBES
         select HAVE_KRETPROBES
+ +      select HAVE_DYNAMIC_FTRACE
+ +      select HAVE_FTRACE
         select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
         select HAVE_ARCH_KGDB if !X86_VOYAGER
   
- config DEFCONFIG_LIST
+ config ARCH_DEFCONFIG
         string
-       depends on X86_32
-       option defconfig_list
-       default "arch/x86/configs/i386_defconfig"
- 
- config DEFCONFIG_LIST
-       string
-       depends on X86_64
-       option defconfig_list
-       default "arch/x86/configs/x86_64_defconfig"
+       default "arch/x86/configs/i386_defconfig" if X86_32
+       default "arch/x86/configs/x86_64_defconfig" if X86_64
   
   
   config GENERIC_LOCKBREAK
@@@ -1517,13 -1508,13 +1510,13 @@@ config PCI_GOMMCONFI
   config PCI_GODIRECT
         bool "Direct"
   
- config PCI_GOANY
-       bool "Any"
- 
   config PCI_GOOLPC
         bool "OLPC"
         depends on OLPC
   
+ config PCI_GOANY
+       bool "Any"
+ 
   endchoice
   
   config PCI_BIOS
@@@ -1540,9 -1531,8 +1533,8 @@@ config PCI_MMCONFI
         depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
   
   config PCI_OLPC
-       bool
-       depends on PCI && PCI_GOOLPC
-       default y
+       def_bool y
+       depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
   
   config PCI_DOMAINS
         def_bool y
diff --combined arch/x86/Kconfig.debug

index 1e53df0ba08cdbdbf860e18134903200db2a4aca,18363374d51a9a57b39b6fb8d3f87a054b4b4aa5..f7169edfbeab935eb34f4bc9aebb6a9275fc2940
--- 1/arch/x86/Kconfig.debug
--- 2/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@@ -6,15 -6,19 +6,19 @@@ config TRACE_IRQFLAGS_SUPPOR
   source "lib/Kconfig.debug"
   
   config NONPROMISC_DEVMEM
-       bool "Disable promiscuous /dev/mem"
+       bool "Filter access to /dev/mem"
         help
-         The /dev/mem file by default only allows userspace access to PCI
-         space and the BIOS code and data regions. This is sufficient for
-         dosemu and X and all common users of /dev/mem. With this config
-         option, you allow userspace access to all of memory, including
-         kernel and userspace memory. Accidental access to this is
-         obviously disasterous, but specific access can be used by people
-         debugging the kernel.
+         If this option is left off, you allow userspace access to all
+         of memory, including kernel and userspace memory. Accidental
+         access to this is obviously disastrous, but specific access can
+         be used by people debugging the kernel.
+ 
+         If this option is switched on, the /dev/mem file only allows
+         userspace access to PCI space and the BIOS code and data regions.
+         This is sufficient for dosemu and X and all common users of
+         /dev/mem.
+ 
+         If in doubt, say Y.
   
   config EARLY_PRINTK
         bool "Early printk" if EMBEDDED
@@@ -168,34 -172,6 +172,34 @@@ config IOMMU_LEA
           Add a simple leak tracer to the IOMMU code. This is useful when you
           are debugging a buggy device driver that leaks IOMMU mappings.
   
+ +config MMIOTRACE_HOOKS
+ +      bool
+ +
+ +config MMIOTRACE
+ +      bool "Memory mapped IO tracing"
+ +      depends on DEBUG_KERNEL && PCI
+ +      select TRACING
+ +      select MMIOTRACE_HOOKS
+ +      default y
+ +      help
+ +        Mmiotrace traces Memory Mapped I/O access and is meant for
+ +        debugging and reverse engineering. It is called from the ioremap
+ +        implementation and works via page faults. Tracing is disabled by
+ +        default and can be enabled at run-time.
+ +
+ +        See Documentation/tracers/mmiotrace.txt.
+ +        If you are not helping to develop drivers, say N.
+ +
+ +config MMIOTRACE_TEST
+ +      tristate "Test module for mmiotrace"
+ +      depends on MMIOTRACE && m
+ +      help
+ +        This is a dumb module for testing mmiotrace. It is very dangerous
+ +        as it will write garbage to IO memory starting at a given address.
+ +        However, it should be safe to use on e.g. unused portion of VRAM.
+ +
+ +        Say N, unless you absolutely know what you are doing.
+ +
   #
   # IO delay types:
   #
diff --combined arch/x86/kernel/entry_32.S

index e6517ce0b8249bde9d6028bcd3d9966d2002fb5c,c778e4fa55a2eacc79d34954c2fdcef1367d5732..04ea83ccb979c6fcc6088bbda7113eba2d5b29d1
--- 1/arch/x86/kernel/entry_32.S
--- 2/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@@ -248,6 -248,7 +248,7 @@@ ENTRY(resume_userspace
         DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                                         # setting need_resched or sigpending
                                         # between sampling and the iret
+       TRACE_IRQS_OFF
         movl TI_flags(%ebp), %ecx
         andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
                                         # int/exception return?
@@@ -1109,74 -1110,6 +1110,74 @@@ ENDPROC(xen_failsafe_callback
   
   #endif        /* CONFIG_XEN */
   
+ +#ifdef CONFIG_FTRACE
+ +#ifdef CONFIG_DYNAMIC_FTRACE
+ +
+ +ENTRY(mcount)
+ +      pushl %eax
+ +      pushl %ecx
+ +      pushl %edx
+ +      movl 0xc(%esp), %eax
+ +
+ +.globl mcount_call
+ +mcount_call:
+ +      call ftrace_stub
+ +
+ +      popl %edx
+ +      popl %ecx
+ +      popl %eax
+ +
+ +      ret
+ +END(mcount)
+ +
+ +ENTRY(ftrace_caller)
+ +      pushl %eax
+ +      pushl %ecx
+ +      pushl %edx
+ +      movl 0xc(%esp), %eax
+ +      movl 0x4(%ebp), %edx
+ +
+ +.globl ftrace_call
+ +ftrace_call:
+ +      call ftrace_stub
+ +
+ +      popl %edx
+ +      popl %ecx
+ +      popl %eax
+ +
+ +.globl ftrace_stub
+ +ftrace_stub:
+ +      ret
+ +END(ftrace_caller)
+ +
+ +#else /* ! CONFIG_DYNAMIC_FTRACE */
+ +
+ +ENTRY(mcount)
+ +      cmpl $ftrace_stub, ftrace_trace_function
+ +      jnz trace
+ +.globl ftrace_stub
+ +ftrace_stub:
+ +      ret
+ +
+ +      /* taken from glibc */
+ +trace:
+ +      pushl %eax
+ +      pushl %ecx
+ +      pushl %edx
+ +      movl 0xc(%esp), %eax
+ +      movl 0x4(%ebp), %edx
+ +
+ +      call *ftrace_trace_function
+ +
+ +      popl %edx
+ +      popl %ecx
+ +      popl %eax
+ +
+ +      jmp ftrace_stub
+ +END(mcount)
+ +#endif /* CONFIG_DYNAMIC_FTRACE */
+ +#endif /* CONFIG_FTRACE */
+ +
   .section .rodata,"a"
   #include "syscall_table_32.S"
   
diff --combined arch/x86/kernel/process_32.c

index a30aa1f2607a7d330c0a8b93d06940222aac8902,6d5483356e74f9eb161af21bbadf72c7de221cc2..61f7481c31ddc001d1824760ed0433fb43de6a53
--- 1/arch/x86/kernel/process_32.c
--- 2/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@@ -185,10 -185,7 +185,10 @@@ void cpu_idle(void
   
                         local_irq_disable();
                         __get_cpu_var(irq_stat).idle_timestamp = jiffies;
+ +                      /* Don't trace irqs off for idle */
+ +                      stop_critical_timings();
                         idle();
+ +                      start_critical_timings();
                 }
                 tick_nohz_restart_sched_tick();
                 preempt_enable_no_resched();
@@@ -652,8 -649,11 +652,11 @@@ struct task_struct * __switch_to(struc
         /* If the task has used fpu the last 5 timeslices, just do a full
          * restore of the math state immediately to avoid the trap; the
          * chances of needing FPU soon are obviously high now
+        *
+        * tsk_used_math() checks prevent calling math_state_restore(),
+        * which can sleep in the case of !tsk_used_math()
          */
-       if (next_p->fpu_counter > 5)
+       if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
                 math_state_restore();
   
         /*
diff --combined arch/x86/kernel/process_64.c

index dd349c92f051f73ef9a0491309edce0fc056e56f,ac54ff56df80e407f927af322714c7c7b5ed41e2..dc534f40c8d3c68a810c7addf6ae69deb88b08f9
--- 1/arch/x86/kernel/process_64.c
--- 2/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@@ -165,10 -165,7 +165,10 @@@ void cpu_idle(void
                          */
                         local_irq_disable();
                         enter_idle();
+ +                      /* Don't trace irqs off for idle */
+ +                      stop_critical_timings();
                         idle();
+ +                      start_critical_timings();
                         /* In many cases the interrupt that ended idle
                            has already called exit_idle. But some idle
                            loops can be woken up without interrupt. */
@@@ -661,8 -658,11 +661,11 @@@ __switch_to(struct task_struct *prev_p
         /* If the task has used fpu the last 5 timeslices, just do a full
          * restore of the math state immediately to avoid the trap; the
          * chances of needing FPU soon are obviously high now
+        *
+        * tsk_used_math() checks prevent calling math_state_restore(),
+        * which can sleep in the case of !tsk_used_math()
          */
-       if (next_p->fpu_counter>5)
+       if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
                 math_state_restore();
         return prev_p;
   }
diff --combined arch/x86/mm/fault.c

index 8c828a68d3b6a35588140944a797ff496066796e,8bcb6f40ccb6c61b762fac7def5c26a8eb01ba4b..0a778e3c43ee42df8b71405c9f5e35bd49a411e4
--- 1/arch/x86/mm/fault.c
--- 2/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@@ -10,7 -10,6 +10,7 @@@
   #include <linux/string.h>
   #include <linux/types.h>
   #include <linux/ptrace.h>
+ +#include <linux/mmiotrace.h>
   #include <linux/mman.h>
   #include <linux/mm.h>
   #include <linux/smp.h>
@@@ -50,16 -49,6 +50,16 @@@
   #define PF_RSVD               (1<<3)
   #define PF_INSTR      (1<<4)
   
+ +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+ +{
+ +#ifdef CONFIG_MMIOTRACE_HOOKS
+ +      if (unlikely(is_kmmio_active()))
+ +              if (kmmio_handler(regs, addr) == 1)
+ +                      return -1;
+ +#endif
+ +      return 0;
+ +}
+ +
   static inline int notify_page_fault(struct pt_regs *regs)
   {
   #ifdef CONFIG_KPROBES
@@@ -508,6 -497,11 +508,11 @@@ static int vmalloc_fault(unsigned long 
         unsigned long pgd_paddr;
         pmd_t *pmd_k;
         pte_t *pte_k;
+ 
+       /* Make sure we are in vmalloc area */
+       if (!(address >= VMALLOC_START && address < VMALLOC_END))
+               return -1;
+ 
         /*
          * Synchronize this task's top level page-table
          * with the 'reference' page table.
@@@ -612,8 -606,6 +617,8 @@@ void __kprobes do_page_fault(struct pt_
   
         if (notify_page_fault(regs))
                 return;
+ +      if (unlikely(kmmio_fault(regs, address)))
+ +              return;
   
         /*
          * We fault-in kernel-space virtual memory on-demand. The
diff --combined arch/x86/mm/init_64.c

index 295be1d07b8275d197c25c94e5d8954b61e07402,156e6d7b0e329cd84b06342dffd37e46ec28ac9a..a5fd2e06f5c994b0f6874e1be4d739f5d91bb897
--- 1/arch/x86/mm/init_64.c
--- 2/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@@ -206,7 -206,7 +206,7 @@@ void __init cleanup_highmap(void
         pmd_t *last_pmd = pmd + PTRS_PER_PMD;
   
         for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
-               if (!pmd_present(*pmd))
+               if (pmd_none(*pmd))
                         continue;
                 if (vaddr < (unsigned long) _text || vaddr > end)
                         set_pmd(pmd, __pmd(0));
@@@ -506,7 -506,7 +506,7 @@@ early_param("memtest", parse_memtest)
   
   static void __init early_memtest(unsigned long start, unsigned long end)
   {
-       unsigned long t_start, t_size;
+       u64 t_start, t_size;
         unsigned pattern;
   
         if (!memtest_pattern)
@@@ -525,7 -525,7 +525,7 @@@
                         if (t_start + t_size > end)
                                 t_size = end - t_start;
   
-                       printk(KERN_CONT "\n  %016lx - %016lx pattern %d",
+                       printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
                                 t_start, t_start + t_size, pattern);
   
                         memtest(t_start, t_size, pattern);
@@@ -766,13 -766,6 +766,13 @@@ EXPORT_SYMBOL_GPL(rodata_test_data)
   void mark_rodata_ro(void)
   {
         unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+ +      unsigned long rodata_start =
+ +              ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ +
+ +#ifdef CONFIG_DYNAMIC_FTRACE
+ +      /* Dynamic tracing modifies the kernel text section */
+ +      start = rodata_start;
+ +#endif
   
         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
                (end - start) >> 10);
@@@ -782,7 -775,8 +782,7 @@@
          * The rodata section (but not the kernel text!) should also be
          * not-executable.
          */
- -      start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
- -      set_memory_nx(start, (end - start) >> PAGE_SHIFT);
+ +      set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
   
         rodata_test();
   
diff --combined arch/x86/mm/ioremap.c

index a7c80a6e8622c02053a6d6bba8058869958d91b7,2b2bb3f9b683156b7ef25aac9ccfeca69e90d364..e92aa461f4d6ddd62a77ea1cf5382a2ad2a0e0c7
--- 1/arch/x86/mm/ioremap.c
--- 2/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@@ -12,7 -12,6 +12,7 @@@
   #include <linux/module.h>
   #include <linux/slab.h>
   #include <linux/vmalloc.h>
+ +#include <linux/mmiotrace.h>
   
   #include <asm/cacheflush.h>
   #include <asm/e820.h>
@@@ -123,13 -122,10 +123,13 @@@ static void __iomem *__ioremap_caller(r
   {
         unsigned long pfn, offset, vaddr;
         resource_size_t last_addr;
+ +      const resource_size_t unaligned_phys_addr = phys_addr;
+ +      const unsigned long unaligned_size = size;
         struct vm_struct *area;
         unsigned long new_prot_val;
         pgprot_t prot;
         int retval;
+ +      void __iomem *ret_addr;
   
         /* Don't allow wraparound or zero size */
         last_addr = phys_addr + size - 1;
@@@ -237,10 -233,7 +237,10 @@@
                 return NULL;
         }
   
- -      return (void __iomem *) (vaddr + offset);
+ +      ret_addr = (void __iomem *) (vaddr + offset);
+ +      mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+ +
+ +      return ret_addr;
   }
   
   /**
@@@ -332,8 -325,6 +332,8 @@@ void iounmap(volatile void __iomem *add
         addr = (volatile void __iomem *)
                 (PAGE_MASK & (unsigned long __force)addr);
   
+ +      mmiotrace_iounmap(addr);
+ +
         /* Use the vm area unlocked, assuming the caller
            ensures there isn't another iounmap for the same address
            in parallel. Reuse of the virtual address is prevented by
@@@ -602,10 -593,11 +602,11 @@@ void __init early_iounmap(void *addr, u
         unsigned long offset;
         unsigned int nrpages;
         enum fixed_addresses idx;
-       unsigned int nesting;
+       int nesting;
   
         nesting = --early_ioremap_nested;
-       WARN_ON(nesting < 0);
+       if (WARN_ON(nesting < 0))
+               return;
   
         if (early_ioremap_debug) {
                 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
diff --combined arch/x86/vdso/vclock_gettime.c

index 5cb8f754c52da24569770b3dc538551e648a0aba,efa2ba7c600567ea95c29202eee2d87b892cb60d..1ef0f90813d626ed6be436b93d3d5b6550dbb392
--- 1/arch/x86/vdso/vclock_gettime.c
--- 2/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@@ -23,7 -23,7 +23,7 @@@
   
   #define gtod vdso_vsyscall_gtod_data
   
- -static long vdso_fallback_gettime(long clock, struct timespec *ts)
+ +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
   {
         long ret;
         asm("syscall" : "=a" (ret) :
@@@ -31,7 -31,7 +31,7 @@@
         return ret;
   }
   
- -static inline long vgetns(void)
+ +notrace static inline long vgetns(void)
   {
         long v;
         cycles_t (*vread)(void);
@@@ -40,7 -40,7 +40,7 @@@
         return (v * gtod->clock.mult) >> gtod->clock.shift;
   }
   
- -static noinline int do_realtime(struct timespec *ts)
+ +notrace static noinline int do_realtime(struct timespec *ts)
   {
         unsigned long seq, ns;
         do {
@@@ -54,8 -54,7 +54,8 @@@
   }
   
   /* Copy of the version in kernel/time.c which we cannot directly access */
- -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
+ +notrace static void
+ +vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
   {
         while (nsec >= NSEC_PER_SEC) {
                 nsec -= NSEC_PER_SEC;
@@@ -69,7 -68,7 +69,7 @@@
         ts->tv_nsec = nsec;
   }
   
- -static noinline int do_monotonic(struct timespec *ts)
+ +notrace static noinline int do_monotonic(struct timespec *ts)
   {
         unsigned long seq, ns, secs;
         do {
@@@ -83,7 -82,7 +83,7 @@@
         return 0;
   }
   
- -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+ +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
   {
         if (likely(gtod->sysctl_enabled && gtod->clock.vread))
                 switch (clock) {
@@@ -97,7 -96,7 +97,7 @@@
   int clock_gettime(clockid_t, struct timespec *)
         __attribute__((weak, alias("__vdso_clock_gettime")));
   
- -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+ +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
   {
         long ret;
         if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
@@@ -107,9 -106,9 +107,9 @@@
                 do_realtime((struct timespec *)tv);
                 tv->tv_usec /= 1000;
                 if (unlikely(tz != NULL)) {
-                       /* This relies on gcc inlining the memcpy. We'll notice
-                          if it ever fails to do so. */
-                       memcpy(tz, &gtod->sys_tz, sizeof(struct timezone));
+                       /* Avoid memcpy. Some old compilers fail to inline it */
+                       tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
+                       tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
                 }
                 return 0;
         }
diff --combined include/linux/sched.h

index c0b1c69b55cec57dfdbf8945d9669f741a465748,c5d3f847ca8d05bd52ca575608c2d428a23d1b28..aa609858aef07ee9dbada675f2beb5d98842883b
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -246,8 -246,6 +246,8 @@@ extern asmlinkage void schedule_tail(st
   extern void init_idle(struct task_struct *idle, int cpu);
   extern void init_idle_bootup_task(struct task_struct *idle);
   
+ +extern int runqueue_is_locked(void);
+ +
   extern cpumask_t nohz_cpu_mask;
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
   extern int select_nohz_load_balancer(int cpu);
@@@ -768,7 -766,6 +768,6 @@@ struct sched_domain 
         struct sched_domain *child;     /* bottom domain must be null terminated */
         struct sched_group *groups;     /* the balancing groups of the domain */
         cpumask_t span;                 /* span of all CPUs in this domain */
-       int first_cpu;                  /* cache of the first cpu in this domain */
         unsigned long min_interval;     /* Minimum balance interval ms */
         unsigned long max_interval;     /* Maximum balance interval ms */
         unsigned int busy_factor;       /* less balancing by factor if busy */
@@@ -1850,7 -1847,9 +1849,9 @@@ extern void exit_thread(void)
   extern void exit_files(struct task_struct *);
   extern void __cleanup_signal(struct signal_struct *);
   extern void __cleanup_sighand(struct sighand_struct *);
+ 
   extern void exit_itimers(struct signal_struct *);
+ extern void flush_itimer_signals(void);
   
   extern NORET_TYPE void do_group_exit(int);
   
@@@ -2027,6 -2026,19 +2028,19 @@@ static inline int fatal_signal_pending(
         return signal_pending(p) && __fatal_signal_pending(p);
   }
   
+ static inline int signal_pending_state(long state, struct task_struct *p)
+ {
+       if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
+               return 0;
+       if (!signal_pending(p))
+               return 0;
+ 
+       if (state & (__TASK_STOPPED | __TASK_TRACED))
+               return 0;
+ 
+       return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
+ }
+ 
   static inline int need_resched(void)
   {
         return unlikely(test_thread_flag(TIF_NEED_RESCHED));
@@@ -2119,18 -2131,6 +2133,18 @@@ static inline void arch_pick_mmap_layou
   }
   #endif
   
+ +#ifdef CONFIG_TRACING
+ +extern void
+ +__trace_special(void *__tr, void *__data,
+ +              unsigned long arg1, unsigned long arg2, unsigned long arg3);
+ +#else
+ +static inline void
+ +__trace_special(void *__tr, void *__data,
+ +              unsigned long arg1, unsigned long arg2, unsigned long arg3)
+ +{
+ +}
+ +#endif
+ +
   extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
   extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
   
@@@ -2225,8 -2225,6 +2239,8 @@@ static inline void mm_init_owner(struc
   }
   #endif /* CONFIG_MM_OWNER */
   
+ +#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+ +
   #endif /* __KERNEL__ */
   
   #endif
diff --combined kernel/sched.c

index e2e985eeee786298dac18e899d6e043a206b43a5,eaf6751e7612cbc5167865c8c1e4e929ff32ca5a..c994d12abbf66066f14b44ff2479aec2d9a4e389
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -70,7 -70,6 +70,7 @@@
   #include <linux/bootmem.h>
   #include <linux/debugfs.h>
   #include <linux/ctype.h>
+ +#include <linux/ftrace.h>
   
   #include <asm/tlb.h>
   #include <asm/irq_regs.h>
@@@ -137,7 -136,7 +137,7 @@@ static inline void sg_inc_cpu_power(str
   
   static inline int rt_policy(int policy)
   {
-       if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+       if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
                 return 1;
         return 0;
   }
@@@ -313,12 -312,15 +313,15 @@@ static DEFINE_SPINLOCK(task_group_lock)
   #endif
   
   /*
-  * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
+  * A weight of 0 or 1 can cause arithmetics problems.
+  * A weight of a cfs_rq is the sum of weights of which entities
+  * are queued on this cfs_rq, so a weight of a entity should not be
+  * too large, so as the shares value of a task group.
    * (The default weight is 1024 - so there's no practical
    *  limitation from this.)
    */
   #define MIN_SHARES    2
- #define MAX_SHARES    (ULONG_MAX - 1)
+ #define MAX_SHARES    (1UL << 18)
   
   static int init_task_group_load = INIT_TASK_GROUP_LOAD;
   #endif
@@@ -399,43 -401,6 +402,6 @@@ struct cfs_rq 
          */
         struct list_head leaf_cfs_rq_list;
         struct task_group *tg;  /* group that "owns" this runqueue */
- 
- #ifdef CONFIG_SMP
-       unsigned long task_weight;
-       unsigned long shares;
-       /*
-        * We need space to build a sched_domain wide view of the full task
-        * group tree, in order to avoid depending on dynamic memory allocation
-        * during the load balancing we place this in the per cpu task group
-        * hierarchy. This limits the load balancing to one instance per cpu,
-        * but more should not be needed anyway.
-        */
-       struct aggregate_struct {
-               /*
-                *   load = weight(cpus) * f(tg)
-                *
-                * Where f(tg) is the recursive weight fraction assigned to
-                * this group.
-                */
-               unsigned long load;
- 
-               /*
-                * part of the group weight distributed to this span.
-                */
-               unsigned long shares;
- 
-               /*
-                * The sum of all runqueue weights within this span.
-                */
-               unsigned long rq_weight;
- 
-               /*
-                * Weight contributed by tasks; this is the part we can
-                * influence by moving tasks around.
-                */
-               unsigned long task_weight;
-       } aggregate;
- #endif
   #endif
   };
   
@@@ -642,24 -607,6 +608,24 @@@ static inline void update_rq_clock(stru
   # define const_debug static const
   #endif
   
+ +/**
+ + * runqueue_is_locked
+ + *
+ + * Returns true if the current cpu runqueue is locked.
+ + * This interface allows printk to be called with the runqueue lock
+ + * held and know whether or not it is OK to wake up the klogd.
+ + */
+ +int runqueue_is_locked(void)
+ +{
+ +      int cpu = get_cpu();
+ +      struct rq *rq = cpu_rq(cpu);
+ +      int ret;
+ +
+ +      ret = spin_is_locked(&rq->lock);
+ +      put_cpu();
+ +      return ret;
+ +}
+ +
   /*
    * Debugging: various feature bits
    */
@@@ -1387,17 -1334,19 +1353,19 @@@ static void __resched_task(struct task_
    */
   #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
   
- /*
-  * delta *= weight / lw
-  */
   static unsigned long
   calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                 struct load_weight *lw)
   {
         u64 tmp;
   
-       if (!lw->inv_weight)
-               lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
+       if (!lw->inv_weight) {
+               if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+                       lw->inv_weight = 1;
+               else
+                       lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
+                               / (lw->weight+1);
+       }
   
         tmp = (u64)delta_exec * weight;
         /*
@@@ -1412,6 -1361,12 +1380,12 @@@
         return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
   }
   
+ static inline unsigned long
+ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+ {
+       return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+ }
+ 
   static inline void update_load_add(struct load_weight *lw, unsigned long inc)
   {
         lw->weight += inc;
@@@ -1524,326 -1479,6 +1498,6 @@@ static unsigned long source_load(int cp
   static unsigned long target_load(int cpu, int type);
   static unsigned long cpu_avg_load_per_task(int cpu);
   static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
- 
- #ifdef CONFIG_FAIR_GROUP_SCHED
- 
- /*
-  * Group load balancing.
-  *
-  * We calculate a few balance domain wide aggregate numbers; load and weight.
-  * Given the pictures below, and assuming each item has equal weight:
-  *
-  *         root          1 - thread
-  *         / | \         A - group
-  *        A  1  B
-  *       /|\   / \
-  *      C 2 D 3   4
-  *      |   |
-  *      5   6
-  *
-  * load:
-  *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
-  *    which equals 1/9-th of the total load.
-  *
-  * shares:
-  *    The weight of this group on the selected cpus.
-  *
-  * rq_weight:
-  *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
-  *    B would get 2.
-  *
-  * task_weight:
-  *    Part of the rq_weight contributed by tasks; all groups except B would
-  *    get 1, B gets 2.
-  */
- 
- static inline struct aggregate_struct *
- aggregate(struct task_group *tg, struct sched_domain *sd)
- {
-       return &tg->cfs_rq[sd->first_cpu]->aggregate;
- }
- 
- typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
- 
- /*
-  * Iterate the full tree, calling @down when first entering a node and @up when
-  * leaving it for the final time.
-  */
- static
- void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-                        struct sched_domain *sd)
- {
-       struct task_group *parent, *child;
- 
-       rcu_read_lock();
-       parent = &root_task_group;
- down:
-       (*down)(parent, sd);
-       list_for_each_entry_rcu(child, &parent->children, siblings) {
-               parent = child;
-               goto down;
- 
- up:
-               continue;
-       }
-       (*up)(parent, sd);
- 
-       child = parent;
-       parent = parent->parent;
-       if (parent)
-               goto up;
-       rcu_read_unlock();
- }
- 
- /*
-  * Calculate the aggregate runqueue weight.
-  */
- static
- void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
- {
-       unsigned long rq_weight = 0;
-       unsigned long task_weight = 0;
-       int i;
- 
-       for_each_cpu_mask(i, sd->span) {
-               rq_weight += tg->cfs_rq[i]->load.weight;
-               task_weight += tg->cfs_rq[i]->task_weight;
-       }
- 
-       aggregate(tg, sd)->rq_weight = rq_weight;
-       aggregate(tg, sd)->task_weight = task_weight;
- }
- 
- /*
-  * Compute the weight of this group on the given cpus.
-  */
- static
- void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
- {
-       unsigned long shares = 0;
-       int i;
- 
-       for_each_cpu_mask(i, sd->span)
-               shares += tg->cfs_rq[i]->shares;
- 
-       if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
-               shares = tg->shares;
- 
-       aggregate(tg, sd)->shares = shares;
- }
- 
- /*
-  * Compute the load fraction assigned to this group, relies on the aggregate
-  * weight and this group's parent's load, i.e. top-down.
-  */
- static
- void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
- {
-       unsigned long load;
- 
-       if (!tg->parent) {
-               int i;
- 
-               load = 0;
-               for_each_cpu_mask(i, sd->span)
-                       load += cpu_rq(i)->load.weight;
- 
-       } else {
-               load = aggregate(tg->parent, sd)->load;
- 
-               /*
-                * shares is our weight in the parent's rq so
-                * shares/parent->rq_weight gives our fraction of the load
-                */
-               load *= aggregate(tg, sd)->shares;
-               load /= aggregate(tg->parent, sd)->rq_weight + 1;
-       }
- 
-       aggregate(tg, sd)->load = load;
- }
- 
- static void __set_se_shares(struct sched_entity *se, unsigned long shares);
- 
- /*
-  * Calculate and set the cpu's group shares.
-  */
- static void
- __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-                         int tcpu)
- {
-       int boost = 0;
-       unsigned long shares;
-       unsigned long rq_weight;
- 
-       if (!tg->se[tcpu])
-               return;
- 
-       rq_weight = tg->cfs_rq[tcpu]->load.weight;
- 
-       /*
-        * If there are currently no tasks on the cpu pretend there is one of
-        * average load so that when a new task gets to run here it will not
-        * get delayed by group starvation.
-        */
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
- 
-       /*
-        *           \Sum shares * rq_weight
-        * shares =  -----------------------
-        *               \Sum rq_weight
-        *
-        */
-       shares = aggregate(tg, sd)->shares * rq_weight;
-       shares /= aggregate(tg, sd)->rq_weight + 1;
- 
-       /*
-        * record the actual number of shares, not the boosted amount.
-        */
-       tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
- 
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       else if (shares > MAX_SHARES)
-               shares = MAX_SHARES;
- 
-       __set_se_shares(tg->se[tcpu], shares);
- }
- 
- /*
-  * Re-adjust the weights on the cpu the task came from and on the cpu the
-  * task went to.
-  */
- static void
- __move_group_shares(struct task_group *tg, struct sched_domain *sd,
-                   int scpu, int dcpu)
- {
-       unsigned long shares;
- 
-       shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
- 
-       __update_group_shares_cpu(tg, sd, scpu);
-       __update_group_shares_cpu(tg, sd, dcpu);
- 
-       /*
-        * ensure we never loose shares due to rounding errors in the
-        * above redistribution.
-        */
-       shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-       if (shares)
-               tg->cfs_rq[dcpu]->shares += shares;
- }
- 
- /*
-  * Because changing a group's shares changes the weight of the super-group
-  * we need to walk up the tree and change all shares until we hit the root.
-  */
- static void
- move_group_shares(struct task_group *tg, struct sched_domain *sd,
-                 int scpu, int dcpu)
- {
-       while (tg) {
-               __move_group_shares(tg, sd, scpu, dcpu);
-               tg = tg->parent;
-       }
- }
- 
- static
- void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
- {
-       unsigned long shares = aggregate(tg, sd)->shares;
-       int i;
- 
-       for_each_cpu_mask(i, sd->span) {
-               struct rq *rq = cpu_rq(i);
-               unsigned long flags;
- 
-               spin_lock_irqsave(&rq->lock, flags);
-               __update_group_shares_cpu(tg, sd, i);
-               spin_unlock_irqrestore(&rq->lock, flags);
-       }
- 
-       aggregate_group_shares(tg, sd);
- 
-       /*
-        * ensure we never loose shares due to rounding errors in the
-        * above redistribution.
-        */
-       shares -= aggregate(tg, sd)->shares;
-       if (shares) {
-               tg->cfs_rq[sd->first_cpu]->shares += shares;
-               aggregate(tg, sd)->shares += shares;
-       }
- }
- 
- /*
-  * Calculate the accumulative weight and recursive load of each task group
-  * while walking down the tree.
-  */
- static
- void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
- {
-       aggregate_group_weight(tg, sd);
-       aggregate_group_shares(tg, sd);
-       aggregate_group_load(tg, sd);
- }
- 
- /*
-  * Rebalance the cpu shares while walking back up the tree.
-  */
- static
- void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
- {
-       aggregate_group_set_shares(tg, sd);
- }
- 
- static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
- 
- static void __init init_aggregate(void)
- {
-       int i;
- 
-       for_each_possible_cpu(i)
-               spin_lock_init(&per_cpu(aggregate_lock, i));
- }
- 
- static int get_aggregate(struct sched_domain *sd)
- {
-       if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
-               return 0;
- 
-       aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
-       return 1;
- }
- 
- static void put_aggregate(struct sched_domain *sd)
- {
-       spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
- }
- 
- static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
- {
-       cfs_rq->shares = shares;
- }
- 
- #else
- 
- static inline void init_aggregate(void)
- {
- }
- 
- static inline int get_aggregate(struct sched_domain *sd)
- {
-       return 0;
- }
- 
- static inline void put_aggregate(struct sched_domain *sd)
- {
- }
- #endif
- 
   #else /* CONFIG_SMP */
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -1864,14 -1499,26 +1518,26 @@@ static void cfs_rq_set_shares(struct cf
   
   #define sched_class_highest (&rt_sched_class)
   
- static void inc_nr_running(struct rq *rq)
+ static inline void inc_load(struct rq *rq, const struct task_struct *p)
+ {
+       update_load_add(&rq->load, p->se.load.weight);
+ }
+ 
+ static inline void dec_load(struct rq *rq, const struct task_struct *p)
+ {
+       update_load_sub(&rq->load, p->se.load.weight);
+ }
+ 
+ static void inc_nr_running(struct task_struct *p, struct rq *rq)
   {
         rq->nr_running++;
+       inc_load(rq, p);
   }
   
- static void dec_nr_running(struct rq *rq)
+ static void dec_nr_running(struct task_struct *p, struct rq *rq)
   {
         rq->nr_running--;
+       dec_load(rq, p);
   }
   
   static void set_load_weight(struct task_struct *p)
@@@ -1963,7 -1610,7 +1629,7 @@@ static void activate_task(struct rq *rq
                 rq->nr_uninterruptible--;
   
         enqueue_task(rq, p, wakeup);
-       inc_nr_running(rq);
+       inc_nr_running(p, rq);
   }
   
   /*
@@@ -1975,7 -1622,7 +1641,7 @@@ static void deactivate_task(struct rq *
                 rq->nr_uninterruptible++;
   
         dequeue_task(rq, p, sleep);
-       dec_nr_running(rq);
+       dec_nr_running(p, rq);
   }
   
   /**
@@@ -2500,9 -2147,6 +2166,9 @@@ out_activate
         success = 1;
   
   out_running:
+ +      trace_mark(kernel_sched_wakeup,
+ +              "pid %d state %ld ## rq %p task %p rq->curr %p",
+ +              p->pid, p->state, rq, p, rq->curr);
         check_preempt_curr(rq, p);
   
         p->state = TASK_RUNNING;
@@@ -2631,11 -2275,8 +2297,11 @@@ void wake_up_new_task(struct task_struc
                  * management (if any):
                  */
                 p->sched_class->task_new(rq, p);
-               inc_nr_running(rq);
+               inc_nr_running(p, rq);
         }
+ +      trace_mark(kernel_sched_wakeup_new,
+ +              "pid %d state %ld ## rq %p task %p rq->curr %p",
+ +              p->pid, p->state, rq, p, rq->curr);
         check_preempt_curr(rq, p);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@@ -2808,11 -2449,6 +2474,11 @@@ context_switch(struct rq *rq, struct ta
         struct mm_struct *mm, *oldmm;
   
         prepare_task_switch(rq, prev, next);
+ +      trace_mark(kernel_sched_schedule,
+ +              "prev_pid %d next_pid %d prev_state %ld "
+ +              "## rq %p prev %p next %p",
+ +              prev->pid, next->pid, prev->state,
+ +              rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@@ -3630,12 -3266,9 +3296,9 @@@ static int load_balance(int this_cpu, s
         unsigned long imbalance;
         struct rq *busiest;
         unsigned long flags;
-       int unlock_aggregate;
   
         cpus_setall(*cpus);
   
-       unlock_aggregate = get_aggregate(sd);
- 
         /*
          * When power savings policy is enabled for the parent domain, idle
          * sibling can pick up load irrespective of busy siblings. In this case,
@@@ -3751,9 -3384,8 +3414,8 @@@ redo
   
         if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               ld_moved = -1;
- 
-       goto out;
+               return -1;
+       return ld_moved;
   
   out_balanced:
         schedstat_inc(sd, lb_balanced[idle]);
@@@ -3768,13 -3400,8 +3430,8 @@@ out_one_pinned
   
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-               ld_moved = -1;
-       else
-               ld_moved = 0;
- out:
-       if (unlock_aggregate)
-               put_aggregate(sd);
-       return ld_moved;
+               return -1;
+       return 0;
   }
   
   /*
@@@ -4392,44 -4019,26 +4049,44 @@@ void scheduler_tick(void
   #endif
   }
   
- -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+ +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ +                              defined(CONFIG_PREEMPT_TRACER))
+ +
+ +static inline unsigned long get_parent_ip(unsigned long addr)
+ +{
+ +      if (in_lock_functions(addr)) {
+ +              addr = CALLER_ADDR2;
+ +              if (in_lock_functions(addr))
+ +                      addr = CALLER_ADDR3;
+ +      }
+ +      return addr;
+ +}
   
   void __kprobes add_preempt_count(int val)
   {
+ +#ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Underflow?
          */
         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                 return;
+ +#endif
         preempt_count() += val;
+ +#ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Spinlock count overflowing soon?
          */
         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                 PREEMPT_MASK - 10);
+ +#endif
+ +      if (preempt_count() == val)
+ +              trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
   }
   EXPORT_SYMBOL(add_preempt_count);
   
   void __kprobes sub_preempt_count(int val)
   {
+ +#ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Underflow?
          */
@@@ -4441,10 -4050,7 +4098,10 @@@
         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                         !(preempt_count() & PREEMPT_MASK)))
                 return;
+ +#endif
   
+ +      if (preempt_count() == val)
+ +              trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
         preempt_count() -= val;
   }
   EXPORT_SYMBOL(sub_preempt_count);
@@@ -4481,7 -4087,7 +4138,7 @@@ static inline void schedule_debug(struc
          * schedule() atomically, we ignore that path for now.
          * Otherwise, whine if we are scheduling when we should not be.
          */
-       if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
+       if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
                 __schedule_bug(prev);
   
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@@ -4561,12 -4167,10 +4218,10 @@@ need_resched_nonpreemptible
         clear_tsk_need_resched(prev);
   
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-               if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
-                               signal_pending(prev))) {
+               if (unlikely(signal_pending_state(prev->state, prev)))
                         prev->state = TASK_RUNNING;
-               } else {
+               else
                         deactivate_task(rq, prev, 1);
-               }
                 switch_count = &prev->nvcsw;
         }
   
@@@ -4982,8 -4586,10 +4637,10 @@@ void set_user_nice(struct task_struct *
                 goto out_unlock;
         }
         on_rq = p->se.on_rq;
-       if (on_rq)
+       if (on_rq) {
                 dequeue_task(rq, p, 0);
+               dec_load(rq, p);
+       }
   
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
@@@ -4993,6 -4599,7 +4650,7 @@@
   
         if (on_rq) {
                 enqueue_task(rq, p, 0);
+               inc_load(rq, p);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@@ -5777,7 -5384,7 +5435,7 @@@ out_unlock
         return retval;
   }
   
- -static const char stat_nam[] = "RSDTtZX";
+ +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
   
   void sched_show_task(struct task_struct *p)
   {
@@@ -7367,7 -6974,6 +7025,6 @@@ static int __build_sched_domains(const 
                         SD_INIT(sd, ALLNODES);
                         set_domain_attribute(sd, attr);
                         sd->span = *cpu_map;
-                       sd->first_cpu = first_cpu(sd->span);
                         cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                         p = sd;
                         sd_allnodes = 1;
@@@ -7378,7 -6984,6 +7035,6 @@@
                 SD_INIT(sd, NODE);
                 set_domain_attribute(sd, attr);
                 sched_domain_node_span(cpu_to_node(i), &sd->span);
-               sd->first_cpu = first_cpu(sd->span);
                 sd->parent = p;
                 if (p)
                         p->child = sd;
@@@ -7390,7 -6995,6 +7046,6 @@@
                 SD_INIT(sd, CPU);
                 set_domain_attribute(sd, attr);
                 sd->span = *nodemask;
-               sd->first_cpu = first_cpu(sd->span);
                 sd->parent = p;
                 if (p)
                         p->child = sd;
@@@ -7402,7 -7006,6 +7057,6 @@@
                 SD_INIT(sd, MC);
                 set_domain_attribute(sd, attr);
                 sd->span = cpu_coregroup_map(i);
-               sd->first_cpu = first_cpu(sd->span);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
                 p->child = sd;
@@@ -7415,7 -7018,6 +7069,6 @@@
                 SD_INIT(sd, SIBLING);
                 set_domain_attribute(sd, attr);
                 sd->span = per_cpu(cpu_sibling_map, i);
-               sd->first_cpu = first_cpu(sd->span);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
                 p->child = sd;
@@@ -7619,8 -7221,8 +7272,8 @@@ static int build_sched_domains(const cp
   
   static cpumask_t *doms_cur;   /* current sched domains */
   static int ndoms_cur;         /* number of sched domains in 'doms_cur' */
- static struct sched_domain_attr *dattr_cur;   /* attribues of custom domains
-                                                  in 'doms_cur' */
+ static struct sched_domain_attr *dattr_cur;
+                               /* attributes of custom domains in 'doms_cur' */
   
   /*
    * Special case: If a kmalloc of a doms_cur partition (array of
@@@ -8085,7 -7687,6 +7738,6 @@@ void __init sched_init(void
         }
   
   #ifdef CONFIG_SMP
-       init_aggregate();
         init_defrootdomain();
   #endif
   
@@@ -8650,11 -8251,14 +8302,14 @@@ void sched_move_task(struct task_struc
   #endif
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+ static void set_se_shares(struct sched_entity *se, unsigned long shares)
   {
         struct cfs_rq *cfs_rq = se->cfs_rq;
+       struct rq *rq = cfs_rq->rq;
         int on_rq;
   
+       spin_lock_irq(&rq->lock);
+ 
         on_rq = se->on_rq;
         if (on_rq)
                 dequeue_entity(cfs_rq, se, 0);
@@@ -8664,17 -8268,8 +8319,8 @@@
   
         if (on_rq)
                 enqueue_entity(cfs_rq, se, 0);
- }
   
- static void set_se_shares(struct sched_entity *se, unsigned long shares)
- {
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
-       unsigned long flags;
- 
-       spin_lock_irqsave(&rq->lock, flags);
-       __set_se_shares(se, shares);
-       spin_unlock_irqrestore(&rq->lock, flags);
+       spin_unlock_irq(&rq->lock);
   }
   
   static DEFINE_MUTEX(shares_mutex);
@@@ -8713,13 -8308,8 +8359,8 @@@ int sched_group_set_shares(struct task_
          * w/o tripping rebalance_share or load_balance_fair.
          */
         tg->shares = shares;
-       for_each_possible_cpu(i) {
-               /*
-                * force a rebalance
-                */
-               cfs_rq_set_shares(tg->cfs_rq[i], 0);
+       for_each_possible_cpu(i)
                 set_se_shares(tg->se[i], shares);
-       }
   
         /*
          * Enable load balance activity on this group, by inserting it back on
author	Ingo Molnar <mingo@elte.hu>
	Mon, 16 Jun 2008 09:16:46 +0000 (11:16 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Mon, 16 Jun 2008 09:16:46 +0000 (11:16 +0200)
		1	2
Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig.debug	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/entry_32.S	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/fault.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/init_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/ioremap.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/vdso/vclock_gettime.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history