From: Thomas Gleixner Date: Mon, 20 Oct 2008 11:14:06 +0000 (+0200) Subject: Merge branches 'timers/clocksource', 'timers/hrtimers', 'timers/nohz', 'timers/ntp... X-Git-Tag: v2.6.28-rc1~82^2 X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=c465a76af658b443075d6efee1c3131257643020;hp=-c;p=linux-2.6-omap-h63xx.git Merge branches 'timers/clocksource', 'timers/hrtimers', 'timers/nohz', 'timers/ntp', 'timers/posixtimers' and 'timers/debug' into v28-timers-for-linus --- c465a76af658b443075d6efee1c3131257643020 diff --combined drivers/clocksource/acpi_pm.c index 3df33848100,5ca1d80de18,71d2ac4e3f4,5ca1d80de18,4eee533f3f4,71d2ac4e3f4..c20171078d1 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@@@@@@ -21,6 -21,6 -21,7 -21,6 -21,7 -21,7 +21,7 @@@@@@@ #include #include #include ++ + #include #include /* @@@@@@@ -151,13 -151,13 -152,13 -151,13 -152,13 -152,13 +152,13 @@@@@@@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_S */ static int verify_pmtmr_rate(void) { -- - u32 value1, value2; ++ + cycle_t value1, value2; unsigned long count, delta; mach_prepare_counter(); -- - value1 = read_pmtmr(); ++ + value1 = clocksource_acpi_pm.read(); mach_countup(&count); -- - value2 = read_pmtmr(); ++ + value2 = clocksource_acpi_pm.read(); delta = (value2 - value1) & ACPI_PM_MASK; /* Check that the PMTMR delta is within 5% of what we expect */ @@@@@@@ -175,10 -175,10 -176,15 -175,10 -176,13 -176,15 +176,15 @@@@@@@ #define verify_pmtmr_rate() (0) #endif ++ + /* Number of monotonicity checks to perform during initialization */ ++ + #define ACPI_PM_MONOTONICITY_CHECKS 10 ++ ++ /* Number of reads we try to get two different values */ ++ ++ #define ACPI_PM_READ_CHECKS 10000 ++ + static int __init init_acpi_pm_clocksource(void) { -- - u32 value1, value2; -- - unsigned int i; ++ + cycle_t value1, value2; - unsigned int i, j, good = 0; ++ ++ unsigned int i, j = 0; if (!pmtmr_ioport) return -ENODEV; @@@@@@@ -187,24 -187,24 -193,29 -187,24 -191,32 -193,29 +193,29 @@@@@@@ clocksource_acpi_pm.shift); /* "verify" this timing source: */ -- - value1 = read_pmtmr(); -- - for (i = 0; i < 10000; i++) { -- - value2 = read_pmtmr(); -- - if (value2 == value1) -- - continue; -- - if (value2 > value1) -- - goto pm_good; -- - if ((value2 < value1) && ((value2) < 0xFFF)) -- - goto pm_good; -- - printk(KERN_INFO "PM-Timer had inconsistent results:" -- - " 0x%#x, 0x%#x - aborting.\n", value1, value2); -- - return -EINVAL; ++ + for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) { ++ ++ udelay(100 * j); ++ + value1 = clocksource_acpi_pm.read(); - for (i = 0; i < 10000; i++) { ++ ++ for (i = 0; i < ACPI_PM_READ_CHECKS; i++) { ++ + value2 = clocksource_acpi_pm.read(); ++ + if (value2 == value1) ++ + continue; ++ + if (value2 > value1) - good++; ++ + break; ++ + if ((value2 < value1) && ((value2) < 0xFFF)) - good++; ++ + break; ++ + printk(KERN_INFO "PM-Timer had inconsistent results:" ++ + " 0x%#llx, 0x%#llx - aborting.\n", ++ + value1, value2); ++ + return -EINVAL; ++ + } - udelay(300 * i); - } - - if (good != ACPI_PM_MONOTONICITY_CHECKS) { - printk(KERN_INFO "PM-Timer failed consistency check " - " (0x%#llx) - aborting.\n", value1); - return -ENODEV; ++ ++ if (i == ACPI_PM_READ_CHECKS) { ++ ++ printk(KERN_INFO "PM-Timer failed consistency check " ++ ++ " (0x%#llx) - aborting.\n", value1); ++ ++ return -ENODEV; ++ ++ } } -- - printk(KERN_INFO "PM-Timer had no reasonable result:" -- - " 0x%#x - aborting.\n", value1); -- - return -ENODEV; -- - pm_good: if (verify_pmtmr_rate() != 0) return -ENODEV; @@@@@@@ 
-226,12 -226,9 -237,9 -226,9 -238,9 -237,9 +237,12 @@@@@@@ static int __init parse_pmtmr(char *arg if (strict_strtoul(arg, 16, &base)) return -EINVAL; ----- +++++#ifdef CONFIG_X86_64 +++++ if (base > UINT_MAX) +++++ return -ERANGE; +++++#endif printk(KERN_INFO "PMTMR IOPort override: 0x%04x -> 0x%04lx\n", ----- (unsigned int)pmtmr_ioport, base); +++++ pmtmr_ioport, base); pmtmr_ioport = base; return 1; diff --combined fs/binfmt_elf.c index 655ed8d30a8,655ed8d30a8,c76afa26edf,655ed8d30a8,a8635f63703,c76afa26edf..83d72006e29 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@@@@@@ -683,7 -683,7 -683,7 -683,7 -683,7 -683,7 +683,7 @@@@@@@ static int load_elf_binary(struct linux * switch really is going to happen - do this in * flush_thread(). - akpm */ -- -- SET_PERSONALITY(loc->elf_ex, 0); ++ ++ SET_PERSONALITY(loc->elf_ex); interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); @@@@@@@ -734,7 -734,7 -734,7 -734,7 -734,7 -734,7 +734,7 @@@@@@@ goto out_free_dentry; } else { /* Executables without an interpreter also need a personality */ -- -- SET_PERSONALITY(loc->elf_ex, 0); ++ ++ SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@@@@@@ -748,7 -748,7 -748,7 -748,7 -748,7 -748,7 +748,7 @@@@@@@ /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ -- -- SET_PERSONALITY(loc->elf_ex, 0); ++ ++ SET_PERSONALITY(loc->elf_ex); if (elf_read_implies_exec(loc->elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@@@@@@ -1333,20 -1333,20 -1333,20 -1333,20 -1333,15 -1333,20 +1333,15 @@@@@@@ static void fill_prstatus(struct elf_pr prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { ++++ + struct task_cputime cputime; ++++ + /* ---- - * This is the record for the group leader. Add in the ---- - * cumulative times of previous dead threads. This total ---- - * won't include the time of each live thread whose state ---- - * is included in the core dump. The final total reported ---- - * to our parent process when it calls wait4 will include ---- - * those sums as well as the little bit more time it takes ---- - * this and each other thread to finish dying after the ---- - * core dump synchronization phase. ++++ + * This is the record for the group leader. It shows the ++++ + * group-wide total, not its individual thread total. 
*/ ---- - cputime_to_timeval(cputime_add(p->utime, p->signal->utime), ---- - &prstatus->pr_utime); ---- - cputime_to_timeval(cputime_add(p->stime, p->signal->stime), ---- - &prstatus->pr_stime); ++++ + thread_group_cputime(p, &cputime); ++++ + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); ++++ + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_to_timeval(p->utime, &prstatus->pr_utime); cputime_to_timeval(p->stime, &prstatus->pr_stime); diff --combined fs/proc/array.c index 0d6eb33597c,0d6eb33597c,f4bc0e78953,71c9be59c9c,933953c4e40,f4bc0e78953..bb9f4b05703 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@@@@@@ -86,11 -86,11 -86,6 -86,11 -86,11 -86,6 +86,6 @@@@@@@ #include #include "internal.h" -- -- /* Gcc optimizes away "strlen(x)" for constant x */ -- -- #define ADDBUF(buffer, string) \ -- -- do { memcpy(buffer, string, strlen(string)); \ -- -- buffer += strlen(string); } while (0) -- -- static inline void task_name(struct seq_file *m, struct task_struct *p) { int i; @@@@@@@ -261,7 -261,7 -256,6 -261,7 -261,7 -256,6 +256,6 @@@@@@@ static inline void task_sig(struct seq_ sigemptyset(&ignored); sigemptyset(&caught); -- -- rcu_read_lock(); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; @@@@@@@ -272,7 -272,7 -266,6 -272,7 -272,7 -266,6 +266,6 @@@@@@@ qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } -- -- rcu_read_unlock(); seq_printf(m, "Threads:\t%d\n", num_threads); seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); @@@@@@@ -337,65 -337,65 -330,6 -337,6 -337,6 -330,6 +330,6 @@@@@@@ int proc_pid_status(struct seq_file *m return 0; } -- /* -- * Use precise platform statistics if available: -- */ -- #ifdef CONFIG_VIRT_CPU_ACCOUNTING -- static cputime_t task_utime(struct task_struct *p) -- { -- return p->utime; -- } -- -- static cputime_t task_stime(struct task_struct *p) -- { -- return p->stime; -- } -- #else -- static cputime_t task_utime(struct task_struct *p) -- { -- clock_t utime = cputime_to_clock_t(p->utime), -- total = utime + cputime_to_clock_t(p->stime); -- u64 temp; -- -- /* -- * Use CFS's precise accounting: -- */ -- temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); -- -- if (total) { -- temp *= utime; -- do_div(temp, total); -- } -- utime = (clock_t)temp; -- -- p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); -- return p->prev_utime; -- } -- -- static cputime_t task_stime(struct task_struct *p) -- { -- clock_t stime; -- -- /* -- * Use CFS's precise accounting. 
(we subtract utime from -- * the total, to make sure the total observed by userspace -- * grows monotonically - apps rely on that): -- */ -- stime = nsec_to_clock_t(p->se.sum_exec_runtime) - -- cputime_to_clock_t(task_utime(p)); -- -- if (stime >= 0) -- p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); -- -- return p->prev_stime; -- } -- #endif -- -- static cputime_t task_gtime(struct task_struct *p) -- { -- return p->gtime; -- } -- static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { @@@@@@@ -454,20 -454,20 -388,20 -395,20 -395,20 -388,20 +388,20 @@@@@@@ /* add up live thread stats at the group level */ if (whole) { ++++ + struct task_cputime cputime; struct task_struct *t = task; do { min_flt += t->min_flt; maj_flt += t->maj_flt; ---- - utime = cputime_add(utime, task_utime(t)); ---- - stime = cputime_add(stime, task_stime(t)); gtime = cputime_add(gtime, task_gtime(t)); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; ---- - utime = cputime_add(utime, sig->utime); ---- - stime = cputime_add(stime, sig->stime); ++++ + thread_group_cputime(task, &cputime); ++++ + utime = cputime.utime; ++++ + stime = cputime.stime; gtime = cputime_add(gtime, sig->gtime); } diff --combined include/linux/hrtimer.h index 6d93dce61cb,8730b60c943,2f245fe63bd,6d93dce61cb,6d93dce61cb,2f245fe63bd..9a4e35cd5f7 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@@@@@@ -47,14 -47,14 -47,22 -47,14 -47,14 -47,22 +47,22 @@@@@@@ enum hrtimer_restart * HRTIMER_CB_IRQSAFE: Callback may run in hardirq context * HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and * does not restart the timer -- -- * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: Callback must run in hardirq context -- -- * Special mode for tick emultation ++ ++ * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context ++ ++ * Special mode for tick emulation and ++ ++ * scheduler timer. Such timers are per ++ ++ * cpu and not allowed to be migrated on ++ ++ * cpu unplug. ++ ++ * HRTIMER_CB_IRQSAFE_UNLOCKED: Callback should run in hardirq context ++ ++ * with timer->base lock unlocked ++ ++ * used for timers which call wakeup to ++ ++ * avoid lock order problems with rq->lock */ enum hrtimer_cb_mode { HRTIMER_CB_SOFTIRQ, HRTIMER_CB_IRQSAFE, HRTIMER_CB_IRQSAFE_NO_RESTART, -- -- HRTIMER_CB_IRQSAFE_NO_SOFTIRQ, ++ ++ HRTIMER_CB_IRQSAFE_PERCPU, ++ ++ HRTIMER_CB_IRQSAFE_UNLOCKED, }; /* @@@@@@@ -67,9 -67,9 -75,10 -67,9 -67,9 -75,10 +75,10 @@@@@@@ * 0x02 callback function running * 0x04 callback pending (high resolution mode) * -- -- * Special case: ++ ++ * Special cases: * 0x03 callback function running and enqueued * (was requeued on another CPU) ++ ++ * 0x09 timer was migrated on CPU hotunplug * The "callback function running and enqueued" status is only possible on * SMP. It happens for example when a posix timer expired and the callback * queued a signal. 
Between dropping the lock which protects the posix timer @@@@@@@ -87,6 -87,6 -96,7 -87,6 -87,6 -96,7 +96,7 @@@@@@@ #define HRTIMER_STATE_ENQUEUED 0x01 #define HRTIMER_STATE_CALLBACK 0x02 #define HRTIMER_STATE_PENDING 0x04 ++ ++ #define HRTIMER_STATE_MIGRATE 0x08 /** * struct hrtimer - the basic hrtimer structure @@@@@@@ -115,12 -115,12 -125,12 -115,12 -115,12 -125,12 +125,12 @@@@@@@ struct hrtimer enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; - ---- enum hrtimer_cb_mode cb_mode; struct list_head cb_entry; + ++++ enum hrtimer_cb_mode cb_mode; #ifdef CONFIG_TIMER_STATS + ++++ int start_pid; void *start_site; char start_comm[16]; - ---- int start_pid; #endif }; @@@@@@@ -145,10 -145,8 -155,10 -145,10 -145,10 -155,10 +155,8 @@@@@@@ struct hrtimer_sleeper * @first: pointer to the timer node which expires first * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock - ---- * @get_softirq_time: function to retrieve the current time from the softirq * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base - ---- * @reprogram: function to reprogram the timer event */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; @@@@@@@ -157,13 -155,9 -167,13 -157,13 -157,13 -167,13 +165,9 @@@@@@@ struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); - ---- ktime_t (*get_softirq_time)(void); ktime_t softirq_time; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t offset; - ---- int (*reprogram)(struct hrtimer *t, - ---- struct hrtimer_clock_base *b, - ---- ktime_t n); #endif }; diff --combined include/linux/sched.h index cfb0d87b99f,cfb0d87b99f,c226c7b8294,3d9120c5ad1,23d9d546454,c226c7b8294..81c68fef443 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@@@@@ -352,7 -352,7 -352,7 -352,7 -352,7 -352,7 +352,7 @@@@@@@ arch_get_unmapped_area_topdown(struct f extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); -- -- #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS ++ ++ #if USE_SPLIT_PTLOCKS /* * The mm counters are not protected by its page_table_lock, * so must be incremented atomically. @@@@@@@ -363,7 -363,7 -363,7 -363,7 -363,7 -363,7 +363,7 @@@@@@@ #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) -- -- #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++ ++ #else /* !USE_SPLIT_PTLOCKS */ /* * The mm counters are protected by its page_table_lock, * so can be incremented directly. @@@@@@@ -374,7 -374,7 -374,7 -374,7 -374,7 -374,7 +374,7 @@@@@@@ #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- -- -- #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++ ++ #endif /* !USE_SPLIT_PTLOCKS */ #define get_mm_rss(mm) \ (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) @@@@@@@ -425,6 -425,6 -425,6 -425,6 -425,39 -425,6 +425,39 @@@@@@@ struct pacct_struct unsigned long ac_minflt, ac_majflt; }; ++++ +/** ++++ + * struct task_cputime - collected CPU time counts ++++ + * @utime: time spent in user mode, in &cputime_t units ++++ + * @stime: time spent in kernel mode, in &cputime_t units ++++ + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds ++++ + * ++++ + * This structure groups together three kinds of CPU time that are ++++ + * tracked for threads and thread groups. 
Most things considering ++++ + * CPU time want to group these counts together and treat all three ++++ + * of them in parallel. ++++ + */ ++++ +struct task_cputime { ++++ + cputime_t utime; ++++ + cputime_t stime; ++++ + unsigned long long sum_exec_runtime; ++++ +}; ++++ +/* Alternate field names when used to cache expirations. */ ++++ +#define prof_exp stime ++++ +#define virt_exp utime ++++ +#define sched_exp sum_exec_runtime ++++ + ++++ +/** ++++ + * struct thread_group_cputime - thread group interval timer counts ++++ + * @totals: thread group interval timers; substructure for ++++ + * uniprocessor kernel, per-cpu for SMP kernel. ++++ + * ++++ + * This structure contains the version of task_cputime, above, that is ++++ + * used for thread group CPU clock calculations. ++++ + */ ++++ +struct thread_group_cputime { ++++ + struct task_cputime *totals; ++++ +}; ++++ + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@@@@@@ -451,8 -451,8 -451,8 -451,8 -484,8 -451,8 +484,8 @@@@@@@ struct signal_struct * - everyone except group_exit_task is stopped during signal delivery * of fatal signals, group_exit_task processes the signal. */ -- -- struct task_struct *group_exit_task; int notify_count; ++ ++ struct task_struct *group_exit_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; @@@@@@@ -470,6 -470,6 -470,6 -470,6 -503,17 -470,6 +503,17 @@@@@@@ cputime_t it_prof_expires, it_virt_expires; cputime_t it_prof_incr, it_virt_incr; ++++ + /* ++++ + * Thread group totals for process CPU clocks. ++++ + * See thread_group_cputime(), et al, for details. ++++ + */ ++++ + struct thread_group_cputime cputime; ++++ + ++++ + /* Earliest-expiration cache. */ ++++ + struct task_cputime cputime_expires; ++++ + ++++ + struct list_head cpu_timers[3]; ++++ + /* job control IDs */ /* @@@@@@@ -500,7 -500,7 -500,7 -500,7 -544,7 -500,7 +544,7 @@@@@@@ * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ ---- - cputime_t utime, stime, cutime, cstime; ++++ + cputime_t cutime, cstime; cputime_t gtime; cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@@@@@@ -508,14 -508,14 -508,14 -508,14 -552,6 -508,14 +552,6 @@@@@@@ unsigned long inblock, oublock, cinblock, coublock; struct task_io_accounting ioac; ---- - /* ---- - * Cumulative ns of scheduled CPU time for dead threads in the ---- - * group, not including a zombie group leader. (This only differs ---- - * from jiffies_to_ns(utime + stime) if sched_clock uses something ---- - * other than jiffies.) 
---- - */ ---- - unsigned long long sum_sched_runtime; ---- - /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs @@@@@@@ -527,8 -527,8 -527,8 -527,8 -563,6 -527,8 +563,6 @@@@@@@ */ struct rlimit rlim[RLIM_NLIMITS]; ---- - struct list_head cpu_timers[3]; ---- - /* keep the process-shared keyrings here so that they do the right * thing in threads created with CLONE_THREAD */ #ifdef CONFIG_KEYS @@@@@@@ -824,6 -824,6 -824,9 -824,6 -858,6 -824,9 +858,9 @@@@@@@ struct sched_domain unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif ++ ++ #ifdef CONFIG_SCHED_DEBUG ++ ++ char *name; ++ ++ #endif }; extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, @@@@@@@ -897,7 -897,7 -900,7 -897,7 -931,7 -900,7 +934,7 @@@@@@@ struct sched_class void (*yield_task) (struct rq *rq); int (*select_task_rq)(struct task_struct *p, int sync); -- -- void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); ++ ++ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync); struct task_struct * (*pick_next_task) (struct rq *rq); void (*put_prev_task) (struct rq *rq, struct task_struct *p); @@@@@@@ -1010,8 -1010,8 -1013,8 -1010,8 -1044,8 -1013,8 +1047,8 @@@@@@@ struct sched_entity struct sched_rt_entity { struct list_head run_list; -- -- unsigned int time_slice; unsigned long timeout; ++ ++ unsigned int time_slice; int nr_cpus_allowed; struct sched_rt_entity *back; @@@@@@@ -1134,8 -1134,8 -1137,8 -1134,8 -1168,7 -1137,8 +1171,7 @@@@@@@ struct task_struct /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; ---- - cputime_t it_prof_expires, it_virt_expires; ---- - unsigned long long it_sched_expires; ++++ + struct task_cputime cputime_expires; struct list_head cpu_timers[3]; /* process credentials */ @@@@@@@ -1475,6 -1475,6 -1478,10 -1475,10 -1508,10 -1478,10 +1511,10 @@@@@@@ static inline void put_task_struct(stru __put_task_struct(t); } ++ extern cputime_t task_utime(struct task_struct *p); ++ extern cputime_t task_stime(struct task_struct *p); ++ extern cputime_t task_gtime(struct task_struct *p); ++ /* * Per process flags */ @@@@@@@ -1581,6 -1581,6 -1588,6 -1585,6 -1618,7 -1588,6 +1621,7 @@@@@@@ extern unsigned long long cpu_clock(in extern unsigned long long task_sched_runtime(struct task_struct *task); ++++ +extern unsigned long long thread_group_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@@@@@@ -2077,6 -2077,6 -2084,6 -2081,6 -2115,30 -2084,6 +2118,30 @@@@@@@ static inline int spin_needbreak(spinlo #endif } ++++ +/* ++++ + * Thread group CPU time accounting. ++++ + */ ++++ + ++++ +extern int thread_group_cputime_alloc(struct task_struct *); ++++ +extern void thread_group_cputime(struct task_struct *, struct task_cputime *); ++++ + ++++ +static inline void thread_group_cputime_init(struct signal_struct *sig) ++++ +{ ++++ + sig->cputime.totals = NULL; ++++ +} ++++ + ++++ +static inline int thread_group_cputime_clone_thread(struct task_struct *curr) ++++ +{ ++++ + if (curr->signal->cputime.totals) ++++ + return 0; ++++ + return thread_group_cputime_alloc(curr); ++++ +} ++++ + ++++ +static inline void thread_group_cputime_free(struct signal_struct *sig) ++++ +{ ++++ + free_percpu(sig->cputime.totals); ++++ +} ++++ + /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. 
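The include/linux/sched.h hunks above only introduce the new group-accounting interface (struct task_cputime, thread_group_cputime() and the init/clone/free helpers); the fs/binfmt_elf.c and fs/proc/array.c hunks earlier in this diff show how it is meant to be consumed. As a reading aid only, not part of the patch, here is a minimal sketch of that calling pattern. The helper name report_group_times() is hypothetical; struct task_cputime, thread_group_cputime() and cputime_to_timeval() are the interfaces used in the diff itself.

/*
 * Sketch only -- not part of the patch. report_group_times() is a
 * hypothetical caller that mirrors the fill_prstatus() hunk above.
 * cputime_to_timeval() comes from the arch cputime headers pulled in
 * via <linux/sched.h>.
 */
#include <linux/sched.h>
#include <linux/time.h>

static void report_group_times(struct task_struct *p,
			       struct timeval *ut, struct timeval *st)
{
	struct task_cputime cputime;

	/* One call yields group-wide utime, stime and sum_exec_runtime. */
	thread_group_cputime(p, &cputime);
	cputime_to_timeval(cputime.utime, ut);
	cputime_to_timeval(cputime.stime, st);
}

Callers that previously summed signal->utime/stime plus per-thread times by hand (see the removed loops in kernel/exit.c and kernel/compat.c later in this diff) collapse to this single call.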
diff --combined include/linux/time.h index 205f974b9eb,e15206a7e82,51e883df0fa,e15206a7e82,1b70b3c293e,51e883df0fa..4f1c9db5770 --- a/include/linux/time.h +++ b/include/linux/time.h @@@@@@@ -29,6 -29,6 -29,8 -29,6 -29,6 -29,8 +29,8 @@@@@@@ struct timezone #ifdef __KERNEL__ ++ ++ extern struct timezone sys_tz; ++ ++ /* Parameters used to convert the timespec values: */ #define MSEC_PER_SEC 1000L #define USEC_PER_MSEC 1000L @@@@@@@ -117,7 -117,6 -119,6 -117,6 -117,6 -119,6 +119,7 @@@@@@@ extern int do_setitimer(int which, stru extern unsigned int alarm_setitimer(unsigned int seconds); extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); +++++extern void getrawmonotonic(struct timespec *ts); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); @@@@@@@ -126,6 -125,6 -127,6 -125,6 -125,9 -127,6 +128,9 @@@@@@@ extern int timekeeping_valid_for_hres(v extern void update_wall_time(void); extern void update_xtime_cache(u64 nsec); ++++ +struct tms; ++++ +extern void do_sys_times(struct tms *); ++++ + /** * timespec_to_ns - Convert timespec to nanoseconds * @ts: pointer to the timespec variable to be converted @@@@@@@ -215,7 -214,6 -216,6 -214,6 -217,6 -216,6 +220,7 @@@@@@@ struct itimerval #define CLOCK_MONOTONIC 1 #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 +++++#define CLOCK_MONOTONIC_RAW 4 /* * The IDs of various hardware clocks: diff --combined kernel/compat.c index 32c254a8ab9,32c254a8ab9,143990e48cb,32c254a8ab9,72650e39b3e,143990e48cb..8eafe3eb50d --- a/kernel/compat.c +++ b/kernel/compat.c @@@@@@@ -23,9 -23,9 -23,67 -23,9 -23,10 -23,67 +23,68 @@@@@@@ #include #include #include ++++ +#include #include ++ ++ /* ++ ++ * Note that the native side is already converted to a timespec, because ++ ++ * that's what we want anyway. ++ ++ */ ++ ++ static int compat_get_timeval(struct timespec *o, ++ ++ struct compat_timeval __user *i) ++ ++ { ++ ++ long usec; ++ ++ ++ ++ if (get_user(o->tv_sec, &i->tv_sec) || ++ ++ get_user(usec, &i->tv_usec)) ++ ++ return -EFAULT; ++ ++ o->tv_nsec = usec * 1000; ++ ++ return 0; ++ ++ } ++ ++ ++ ++ static int compat_put_timeval(struct compat_timeval __user *o, ++ ++ struct timeval *i) ++ ++ { ++ ++ return (put_user(i->tv_sec, &o->tv_sec) || ++ ++ put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; ++ ++ } ++ ++ ++ ++ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, ++ ++ struct timezone __user *tz) ++ ++ { ++ ++ if (tv) { ++ ++ struct timeval ktv; ++ ++ do_gettimeofday(&ktv); ++ ++ if (compat_put_timeval(tv, &ktv)) ++ ++ return -EFAULT; ++ ++ } ++ ++ if (tz) { ++ ++ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) ++ ++ return -EFAULT; ++ ++ } ++ ++ ++ ++ return 0; ++ ++ } ++ ++ ++ ++ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, ++ ++ struct timezone __user *tz) ++ ++ { ++ ++ struct timespec kts; ++ ++ struct timezone ktz; ++ ++ ++ ++ if (tv) { ++ ++ if (compat_get_timeval(&kts, tv)) ++ ++ return -EFAULT; ++ ++ } ++ ++ if (tz) { ++ ++ if (copy_from_user(&ktz, tz, sizeof(ktz))) ++ ++ return -EFAULT; ++ ++ } ++ ++ ++ ++ return do_sys_settimeofday(tv ? &kts : NULL, tz ? 
&ktz : NULL); ++ ++ } ++ ++ int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || @@@@@@@ -150,49 -150,49 -208,49 -150,49 -151,23 -208,49 +209,23 @@@@@@@ asmlinkage long compat_sys_setitimer(in return 0; } ++++ +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) ++++ +{ ++++ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); ++++ +} ++++ + asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) { ---- - /* ---- - * In the SMP world we might just be unlucky and have one of ---- - * the times increment as we use it. Since the value is an ---- - * atomically safe type this is just fine. Conceptually its ---- - * as if the syscall took an instant longer to occur. ---- - */ if (tbuf) { ++++ + struct tms tms; struct compat_tms tmp; ---- - struct task_struct *tsk = current; ---- - struct task_struct *t; ---- - cputime_t utime, stime, cutime, cstime; ---- - ---- - read_lock(&tasklist_lock); ---- - utime = tsk->signal->utime; ---- - stime = tsk->signal->stime; ---- - t = tsk; ---- - do { ---- - utime = cputime_add(utime, t->utime); ---- - stime = cputime_add(stime, t->stime); ---- - t = next_thread(t); ---- - } while (t != tsk); ---- - ---- - /* ---- - * While we have tasklist_lock read-locked, no dying thread ---- - * can be updating current->signal->[us]time. Instead, ---- - * we got their counts included in the live thread loop. ---- - * However, another thread can come in right now and ---- - * do a wait call that updates current->signal->c[us]time. ---- - * To make sure we always see that pair updated atomically, ---- - * we take the siglock around fetching them. ---- - */ ---- - spin_lock_irq(&tsk->sighand->siglock); ---- - cutime = tsk->signal->cutime; ---- - cstime = tsk->signal->cstime; ---- - spin_unlock_irq(&tsk->sighand->siglock); ---- - read_unlock(&tasklist_lock); ---- - ---- - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); ---- - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); ---- - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); ---- - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); ++++ + ++++ + do_sys_times(&tms); ++++ + /* Convert our struct tms to the compat version. */ ++++ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); ++++ + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); ++++ + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); ++++ + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } diff --combined kernel/exit.c index 38ec4063014,38ec4063014,0ef4673e351,16395644a98,40036ac0427,0ef4673e351..059b38cae38 --- a/kernel/exit.c +++ b/kernel/exit.c @@@@@@@ -112,9 -112,9 -112,9 -112,9 -112,7 -112,9 +112,7 @@@@@@@ static void __exit_signal(struct task_s * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. 
*/ -- sig->utime = cputime_add(sig->utime, tsk->utime); -- sig->stime = cputime_add(sig->stime, tsk->stime); -- sig->gtime = cputime_add(sig->gtime, tsk->gtime); -- - sig->utime = cputime_add(sig->utime, task_utime(tsk)); -- - sig->stime = cputime_add(sig->stime, task_stime(tsk)); ++ sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@@@@@@ -122,7 -122,7 -122,7 -122,7 -120,6 -122,7 +120,6 @@@@@@@ sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); ---- - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@@@@@@ -583,8 -583,8 -583,6 -583,8 -580,8 -583,6 +580,6 @@@@@@@ mm_need_new_owner(struct mm_struct *mm * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ -- -- if (!mm) -- -- return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@@@@@@ -627,29 -627,29 -625,38 -627,29 -624,29 -625,38 +622,38 @@@@@@@ retry } while_each_thread(g, c); read_unlock(&tasklist_lock); ++ ++ /* ++ ++ * We found no owner yet mm_users > 1: this implies that we are ++ ++ * most likely racing with swapoff (try_to_unuse()) or /proc or ++ ++ * ptrace or page migration (get_task_mm()). Mark owner as NULL, ++ ++ * so that subsystems can understand the callback and take action. ++ ++ */ ++ ++ down_write(&mm->mmap_sem); ++ ++ cgroup_mm_owner_callbacks(mm->owner, NULL); ++ ++ mm->owner = NULL; ++ ++ up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); ++ ++ read_unlock(&tasklist_lock); ++ ++ down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); -- -- /* -- -- * Delay read_unlock() till we have the task_lock() -- -- * to ensure that c does not slip away underneath us -- -- */ -- -- read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); ++ ++ up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); ++ ++ up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ @@@@@@@ -831,26 -831,26 -838,50 -831,50 -828,50 -838,50 +835,50 @@@@@@@ static void reparent_thread(struct task * the child reaper process (ie "init") in our pid * space. */ ++ static struct task_struct *find_new_reaper(struct task_struct *father) ++ { ++ struct pid_namespace *pid_ns = task_active_pid_ns(father); ++ struct task_struct *thread; ++ ++ thread = father; ++ while_each_thread(father, thread) { ++ if (thread->flags & PF_EXITING) ++ continue; ++ if (unlikely(pid_ns->child_reaper == father)) ++ pid_ns->child_reaper = thread; ++ return thread; ++ } ++ ++ if (unlikely(pid_ns->child_reaper == father)) { ++ write_unlock_irq(&tasklist_lock); ++ if (unlikely(pid_ns == &init_pid_ns)) ++ panic("Attempted to kill init!"); ++ ++ zap_pid_ns_processes(pid_ns); ++ write_lock_irq(&tasklist_lock); ++ /* ++ * We can not clear ->child_reaper or leave it alone. ++ * There may by stealth EXIT_DEAD tasks on ->children, ++ * forget_original_parent() must move them somewhere. 
++ */ ++ pid_ns->child_reaper = init_pid_ns.child_reaper; ++ } ++ ++ return pid_ns->child_reaper; ++ } ++ static void forget_original_parent(struct task_struct *father) { -- struct task_struct *p, *n, *reaper = father; ++ struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); -- ++ reaper = find_new_reaper(father); /* * First clean up ptrace if we were using it. */ ptrace_exit(father, &ptrace_dead); -- do { -- reaper = next_thread(reaper); -- if (reaper == father) { -- reaper = task_child_reaper(father); -- break; -- } -- } while (reaper->flags & PF_EXITING); -- list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; if (p->parent == father) { @@@@@@@ -918,8 -918,8 -949,8 -942,8 -939,8 -949,8 +946,8 @@@@@@@ static void exit_notify(struct task_str /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && -- tsk->signal->notify_count < 0 && -- tsk->signal->group_exit_task) ++ tsk->signal->group_exit_task && ++ tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); @@@@@@@ -959,39 -959,39 -990,6 -983,6 -980,6 -990,6 +987,6 @@@@@@@ static void check_stack_usage(void static inline void check_stack_usage(void) {} #endif -- static inline void exit_child_reaper(struct task_struct *tsk) -- { -- if (likely(tsk->group_leader != task_child_reaper(tsk))) -- return; -- -- if (tsk->nsproxy->pid_ns == &init_pid_ns) -- panic("Attempted to kill init!"); -- -- /* -- * @tsk is the last thread in the 'cgroup-init' and is exiting. -- * Terminate all remaining processes in the namespace and reap them -- * before exiting @tsk. -- * -- * Note that @tsk (last thread of cgroup-init) may not necessarily -- * be the child-reaper (i.e main thread of cgroup-init) of the -- * namespace i.e the child_reaper may have already exited. -- * -- * Even after a child_reaper exits, we let it inherit orphaned children, -- * because, pid_ns->child_reaper remains valid as long as there is -- * at least one living sub-thread in the cgroup init. -- -- * This living sub-thread of the cgroup-init will be notified when -- * a child inherited by the 'child-reaper' exits (do_notify_parent() -- * uses __group_send_sig_info()). Further, when reaping child processes, -- * do_wait() iterates over children of all living sub threads. -- -- * i.e even though 'child_reaper' thread is listed as the parent of the -- * orphaned children, any living sub-thread in the cgroup-init can -- * perform the role of the child_reaper. -- */ -- zap_pid_ns_processes(tsk->nsproxy->pid_ns); -- } -- NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@@@@@@ -1051,7 -1051,7 -1049,6 -1042,6 -1039,6 -1049,6 +1046,6 @@@@@@@ } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { -- exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } @@@@@@@ -1304,6 -1304,6 -1301,6 -1294,6 -1291,7 -1301,6 +1298,7 @@@@@@@ static int wait_task_zombie(struct task if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; ++++ + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@@@@@@ -1319,20 -1319,20 -1316,20 -1309,20 -1307,23 -1316,20 +1314,23 @@@@@@@ * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. 
++++ + * ++++ + * We use thread_group_cputime() to get times for the thread ++++ + * group, which consolidates times for all threads in the ++++ + * group including the group leader. */ spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; ++++ + thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, ---- - cputime_add(p->utime, ---- - cputime_add(sig->utime, ---- - sig->cutime))); ++++ + cputime_add(cputime.utime, ++++ + sig->cutime)); psig->cstime = cputime_add(psig->cstime, ---- - cputime_add(p->stime, ---- - cputime_add(sig->stime, ---- - sig->cstime))); ++++ + cputime_add(cputime.stime, ++++ + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, diff --combined kernel/fork.c index 7ce2ebe8479,7ce2ebe8479,30de644a40c,7ce2ebe8479,021ae012cc7,30de644a40c..44e64d7ba29 --- a/kernel/fork.c +++ b/kernel/fork.c @@@@@@@ -759,15 -759,15 -759,15 -759,15 -759,44 -759,15 +759,44 @@@@@@@ void __cleanup_sighand(struct sighand_s kmem_cache_free(sighand_cachep, sighand); } ++++ + ++++ +/* ++++ + * Initialize POSIX timer handling for a thread group. ++++ + */ ++++ +static void posix_cpu_timers_init_group(struct signal_struct *sig) ++++ +{ ++++ + /* Thread group counters. */ ++++ + thread_group_cputime_init(sig); ++++ + ++++ + /* Expiration times and increments. */ ++++ + sig->it_virt_expires = cputime_zero; ++++ + sig->it_virt_incr = cputime_zero; ++++ + sig->it_prof_expires = cputime_zero; ++++ + sig->it_prof_incr = cputime_zero; ++++ + ++++ + /* Cached expiration times. */ ++++ + sig->cputime_expires.prof_exp = cputime_zero; ++++ + sig->cputime_expires.virt_exp = cputime_zero; ++++ + sig->cputime_expires.sched_exp = 0; ++++ + ++++ + /* The timer lists. */ ++++ + INIT_LIST_HEAD(&sig->cpu_timers[0]); ++++ + INIT_LIST_HEAD(&sig->cpu_timers[1]); ++++ + INIT_LIST_HEAD(&sig->cpu_timers[2]); ++++ +} ++++ + static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; if (clone_flags & CLONE_THREAD) { ---- - atomic_inc(¤t->signal->count); ---- - atomic_inc(¤t->signal->live); ---- - return 0; ++++ + ret = thread_group_cputime_clone_thread(current); ++++ + if (likely(!ret)) { ++++ + atomic_inc(¤t->signal->count); ++++ + atomic_inc(¤t->signal->live); ++++ + } ++++ + return ret; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@@@@@@ -795,39 -795,39 -795,40 -795,39 -824,24 -795,40 +824,25 @@@@@@@ sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; ---- - sig->it_virt_expires = cputime_zero; ---- - sig->it_virt_incr = cputime_zero; ---- - sig->it_prof_expires = cputime_zero; ---- - sig->it_prof_incr = cputime_zero; ---- - sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; ++ ++ sig->tty = NULL; ---- - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; ++++ + sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); ---- - sig->sum_sched_runtime = 0; ---- - INIT_LIST_HEAD(&sig->cpu_timers[0]); ---- - INIT_LIST_HEAD(&sig->cpu_timers[1]); ---- - INIT_LIST_HEAD(&sig->cpu_timers[2]); taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); ---- 
- if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { ---- - /* ---- - * New sole thread in the process gets an expiry time ---- - * of the whole CPU time limit. ---- - */ ---- - tsk->it_prof_expires = ---- - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); ---- - } ++++ + posix_cpu_timers_init_group(sig); ++++ + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@@@@@@ -837,7 -837,7 -838,8 -837,7 -851,8 -838,8 +852,9 @@@@@@@ void __cleanup_signal(struct signal_struct *sig) { ++++ + thread_group_cputime_free(sig); exit_thread_group_keys(sig); ++ ++ tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } @@@@@@@ -885,6 -885,6 -887,6 -885,6 -900,19 -887,6 +902,19 @@@@@@@ void mm_init_owner(struct mm_struct *mm } #endif /* CONFIG_MM_OWNER */ ++++ +/* ++++ + * Initialize POSIX timer handling for a single task. ++++ + */ ++++ +static void posix_cpu_timers_init(struct task_struct *tsk) ++++ +{ ++++ + tsk->cputime_expires.prof_exp = cputime_zero; ++++ + tsk->cputime_expires.virt_exp = cputime_zero; ++++ + tsk->cputime_expires.sched_exp = 0; ++++ + INIT_LIST_HEAD(&tsk->cpu_timers[0]); ++++ + INIT_LIST_HEAD(&tsk->cpu_timers[1]); ++++ + INIT_LIST_HEAD(&tsk->cpu_timers[2]); ++++ +} ++++ + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@@@@@@ -995,12 -995,12 -997,12 -995,12 -1023,7 -997,12 +1025,7 @@@@@@@ static struct task_struct *copy_process task_io_accounting_init(&p->ioac); acct_clear_integrals(p); ---- - p->it_virt_expires = cputime_zero; ---- - p->it_prof_expires = cputime_zero; ---- - p->it_sched_expires = 0; ---- - INIT_LIST_HEAD(&p->cpu_timers[0]); ---- - INIT_LIST_HEAD(&p->cpu_timers[1]); ---- - INIT_LIST_HEAD(&p->cpu_timers[2]); ++++ + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@@@@@@ -1201,21 -1201,21 -1203,21 -1201,21 -1224,6 -1203,21 +1226,6 @@@@@@@ if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); ---- - ---- - if (!cputime_eq(current->signal->it_virt_expires, ---- - cputime_zero) || ---- - !cputime_eq(current->signal->it_prof_expires, ---- - cputime_zero) || ---- - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || ---- - !list_empty(¤t->signal->cpu_timers[0]) || ---- - !list_empty(¤t->signal->cpu_timers[1]) || ---- - !list_empty(¤t->signal->cpu_timers[2])) { ---- - /* ---- - * Have child wake up on its first tick to check ---- - * for process CPU timers. 
---- - */ ---- - p->it_prof_expires = jiffies_to_cputime(1); ---- - } } if (likely(p->pid)) { @@@@@@@ -1227,7 -1227,7 -1229,8 -1227,7 -1235,7 -1229,8 +1237,8 @@@@@@@ p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; -- -- p->signal->tty = current->signal->tty; ++ ++ tty_kref_put(p->signal->tty); ++ ++ p->signal->tty = tty_kref_get(current->signal->tty); set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); diff --combined kernel/hrtimer.c index b8e4dce80a7,4d761d50c52,cdec83e722f,b8e4dce80a7,b8e4dce80a7,cdec83e722f..95978f48e03 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@@@@@@ -672,13 -672,13 -672,14 -672,13 -672,13 -672,14 +672,14 @@@@@@@ static inline int hrtimer_enqueue_repro */ BUG_ON(timer->function(timer) != HRTIMER_NORESTART); return 1; -- -- case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: ++ ++ case HRTIMER_CB_IRQSAFE_PERCPU: ++ ++ case HRTIMER_CB_IRQSAFE_UNLOCKED: /* * This is solely for the sched tick emulation with * dynamic tick support to ensure that we do not * restart the tick right on the edge and end up with * the tick timer in the softirq ! The calling site -- -- * takes care of this. ++ ++ * takes care of this. Also used for hrtimer sleeper ! */ debug_hrtimer_deactivate(timer); return 1; @@@@@@@ -1245,7 -1245,7 -1246,8 -1245,7 -1245,7 -1246,8 +1246,8 @@@@@@@ static void __run_hrtimer(struct hrtime timer_stats_account_hrtimer(timer); fn = timer->function; -- -- if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { ++ ++ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || ++ ++ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { /* * Used for scheduler timers, avoid lock inversion with * rq->lock and tasklist_lock. @@@@@@@ -1401,9 -1401,7 -1403,9 -1401,9 -1401,9 -1403,9 +1403,7 @@@@@@@ void hrtimer_run_queues(void if (!base->first) continue; - ---- if (base->get_softirq_time) - ---- base->softirq_time = base->get_softirq_time(); - ---- else if (gettime) { + ++++ if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } @@@@@@@ -1452,7 -1450,7 -1454,7 -1452,7 -1452,7 -1454,7 +1452,7 @@@@@@@ void hrtimer_init_sleeper(struct hrtime sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS -- -- sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; ++ ++ sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; #endif } @@@@@@@ -1591,49 -1589,50 -1593,122 -1591,49 -1591,49 -1593,122 +1591,123 @@@@@@@ static void __cpuinit init_hrtimers_cpu #ifdef CONFIG_HOTPLUG_CPU -- -- static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, -- -- struct hrtimer_clock_base *new_base) ++ ++ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, ++ ++ struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; ++ ++ int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); -- -- __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); ++ ++ ++ ++ /* ++ ++ * Should not happen. 
Per CPU timers should be ++ ++ * canceled _before_ the migration code is called ++ ++ */ ++ ++ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { ++ ++ __remove_hrtimer(timer, old_base, ++ ++ HRTIMER_STATE_INACTIVE, 0); ++ ++ WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", ++ ++ timer, timer->function, dcpu); ++ ++ continue; ++ ++ } ++ ++ ++ ++ /* ++ ++ * Mark it as STATE_MIGRATE not INACTIVE otherwise the ++ ++ * timer could be seen as !active and just vanish away ++ ++ * under us on another CPU ++ ++ */ ++ ++ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* * Enqueue the timer. Allow reprogramming of the event device */ enqueue_hrtimer(timer, new_base, 1); ++ ++ ++ ++ #ifdef CONFIG_HIGH_RES_TIMERS ++ ++ /* ++ ++ * Happens with high res enabled when the timer was ++ ++ * already expired and the callback mode is ++ ++ * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The ++ ++ * enqueue code does not move them to the soft irq ++ ++ * pending list for performance/latency reasons, but ++ ++ * in the migration state, we need to do that ++ ++ * otherwise we end up with a stale timer. ++ ++ */ ++ ++ if (timer->state == HRTIMER_STATE_MIGRATE) { ++ ++ timer->state = HRTIMER_STATE_PENDING; ++ ++ list_add_tail(&timer->cb_entry, ++ ++ &new_base->cpu_base->cb_pending); ++ ++ raise = 1; ++ ++ } ++ ++ #endif ++ ++ /* Clear the migration state bit */ ++ ++ timer->state &= ~HRTIMER_STATE_MIGRATE; + } ++ ++ return raise; + } + ++ ++ #ifdef CONFIG_HIGH_RES_TIMERS ++ ++ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, ++ ++ struct hrtimer_cpu_base *new_base) ++ ++ { ++ ++ struct hrtimer *timer; ++ ++ int raise = 0; ++ ++ ++ ++ while (!list_empty(&old_base->cb_pending)) { ++ ++ timer = list_entry(old_base->cb_pending.next, ++ ++ struct hrtimer, cb_entry); ++ ++ ++ ++ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); ++ ++ timer->base = &new_base->clock_base[timer->base->index]; ++ ++ list_add_tail(&timer->cb_entry, &new_base->cb_pending); ++ ++ raise = 1; + ++ } ++ ++ return raise; ++ ++ } ++ ++ #else ++ ++ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, ++ ++ struct hrtimer_cpu_base *new_base) ++ ++ { ++ ++ return 0; + ++ } ++ ++ #endif + ++ static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; -- -- int i; ++ ++ int i, raise = 0; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); new_base = &get_cpu_var(hrtimer_bases); tick_cancel_sched_timer(cpu); - ---- - ---- local_irq_disable(); - ---- spin_lock(&new_base->lock); + ++++ /* + ++++ * The caller is globally serialized and nobody else + ++++ * takes two locks at once, deadlock is not possible. 
+ ++++ */ + ++++ spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { -- -- migrate_hrtimer_list(&old_base->clock_base[i], -- -- &new_base->clock_base[i]); ++ ++ if (migrate_hrtimer_list(&old_base->clock_base[i], ++ ++ &new_base->clock_base[i], cpu)) ++ ++ raise = 1; } ++ ++ if (migrate_hrtimer_pending(old_base, new_base)) ++ ++ raise = 1; ++ ++ spin_unlock(&old_base->lock); - ---- spin_unlock(&new_base->lock); - ---- local_irq_enable(); + ++++ spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); ++ ++ ++ ++ if (raise) ++ ++ hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ diff --combined kernel/posix-timers.c index d3c66b53dff,e36d5798cbf,5131e547116,e36d5798cbf,95451bf7d2e,5131e547116..b931d7cedbf --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@@@@@@ -222,15 -222,6 -222,6 -222,6 -222,6 -222,6 +222,15 @@@@@@@ static int posix_ktime_get_ts(clockid_ return 0; } +++++/* +++++ * Get monotonic time for posix timers +++++ */ +++++static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +++++{ +++++ getrawmonotonic(tp); +++++ return 0; +++++} +++++ /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@@@@@@ -244,15 -235,9 -235,9 -235,9 -235,9 -235,9 +244,15 @@@@@@@ static __init int init_posix_timers(voi .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, }; +++++ struct k_clock clock_monotonic_raw = { +++++ .clock_getres = hrtimer_get_res, +++++ .clock_get = posix_get_monotonic_raw, +++++ .clock_set = do_posix_clock_nosettime, +++++ }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); +++++ register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@@@@@@ -313,6 -298,6 -298,6 -298,6 -298,7 -298,6 +313,7 @@@@@@@ void do_schedule_next_timer(struct sigi int posix_timer_event(struct k_itimer *timr, int si_private) { ++++ + int shared, ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@@@@@@ -326,25 -311,25 -311,25 -311,25 -312,10 -311,25 +327,10 @@@@@@@ */ timr->sigq->info.si_sys_private = si_private; ---- - timr->sigq->info.si_signo = timr->it_sigev_signo; ---- - timr->sigq->info.si_code = SI_TIMER; ---- - timr->sigq->info.si_tid = timr->it_id; ---- - timr->sigq->info.si_value = timr->it_sigev_value; ---- - ---- - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { ---- - struct task_struct *leader; ---- - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); ---- - ---- - if (likely(ret >= 0)) ---- - return ret; ---- - ---- - timr->it_sigev_notify = SIGEV_SIGNAL; ---- - leader = timr->it_process->group_leader; ---- - put_task_struct(timr->it_process); ---- - timr->it_process = leader; ---- - } ---- - ---- - return send_sigqueue(timr->sigq, timr->it_process, 1); ++++ + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); ++++ + ret = send_sigqueue(timr->sigq, timr->it_process, shared); ++++ + /* If we failed to send the signal the timer stops. 
*/ ++++ + return ret > 0; } EXPORT_SYMBOL_GPL(posix_timer_event); @@@@@@@ -456,7 -441,7 -441,7 -441,7 -427,7 -441,7 +442,7 @@@@@@@ static struct k_itimer * alloc_posix_ti return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); -- -- tmr = NULL; ++ ++ return NULL; } memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; @@@@@@@ -483,11 -468,11 -468,11 -468,11 -454,9 -468,11 +469,9 @@@@@@@ sys_timer_create(const clockid_t which_ struct sigevent __user *timer_event_spec, timer_t __user * created_timer_id) { ---- - int error = 0; ---- - struct k_itimer *new_timer = NULL; ---- - int new_timer_id; ---- - struct task_struct *process = NULL; ---- - unsigned long flags; ++++ + struct k_itimer *new_timer; ++++ + int error, new_timer_id; ++++ + struct task_struct *process; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@@@@@@ -505,12 -490,12 -490,12 -490,12 -474,11 -490,12 +489,11 @@@@@@@ goto out; } spin_lock_irq(&idr_lock); ---- - error = idr_get_new(&posix_timers_id, (void *) new_timer, ---- - &new_timer_id); ++++ + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); ---- - if (error == -EAGAIN) ---- - goto retry; ---- - else if (error) { ++++ + if (error) { ++++ + if (error == -EAGAIN) ++++ + goto retry; /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) @@@@@@@ -541,67 -526,67 -526,67 -526,67 -509,43 -526,67 +524,43 @@@@@@@ error = -EFAULT; goto out; } ---- - new_timer->it_sigev_notify = event.sigev_notify; ---- - new_timer->it_sigev_signo = event.sigev_signo; ---- - new_timer->it_sigev_value = event.sigev_value; ---- - ---- - read_lock(&tasklist_lock); ---- - if ((process = good_sigevent(&event))) { ---- - /* ---- - * We may be setting up this process for another ---- - * thread. It may be exiting. To catch this ---- - * case the we check the PF_EXITING flag. If ---- - * the flag is not set, the siglock will catch ---- - * him before it is too late (in exit_itimers). ---- - * ---- - * The exec case is a bit more invloved but easy ---- - * to code. If the process is in our thread ---- - * group (and it must be or we would not allow ---- - * it here) and is doing an exec, it will cause ---- - * us to be killed. In this case it will wait ---- - * for us to die which means we can finish this ---- - * linkage with our last gasp. I.e. 
no code :) ---- - */ ---- - spin_lock_irqsave(&process->sighand->siglock, flags); ---- - if (!(process->flags & PF_EXITING)) { ---- - new_timer->it_process = process; ---- - list_add(&new_timer->list, ---- - &process->signal->posix_timers); ---- - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) ---- - get_task_struct(process); ---- - spin_unlock_irqrestore(&process->sighand->siglock, flags); ---- - } else { ---- - spin_unlock_irqrestore(&process->sighand->siglock, flags); ---- - process = NULL; ---- - } ---- - } ---- - read_unlock(&tasklist_lock); ++++ + rcu_read_lock(); ++++ + process = good_sigevent(&event); ++++ + if (process) ++++ + get_task_struct(process); ++++ + rcu_read_unlock(); if (!process) { error = -EINVAL; goto out; } } else { ---- - new_timer->it_sigev_notify = SIGEV_SIGNAL; ---- - new_timer->it_sigev_signo = SIGALRM; ---- - new_timer->it_sigev_value.sival_int = new_timer->it_id; ++++ + event.sigev_notify = SIGEV_SIGNAL; ++++ + event.sigev_signo = SIGALRM; ++++ + event.sigev_value.sival_int = new_timer->it_id; process = current->group_leader; ---- - spin_lock_irqsave(&process->sighand->siglock, flags); ---- - new_timer->it_process = process; ---- - list_add(&new_timer->list, &process->signal->posix_timers); ---- - spin_unlock_irqrestore(&process->sighand->siglock, flags); ++++ + get_task_struct(process); } ++++ + new_timer->it_sigev_notify = event.sigev_notify; ++++ + new_timer->sigq->info.si_signo = event.sigev_signo; ++++ + new_timer->sigq->info.si_value = event.sigev_value; ++++ + new_timer->sigq->info.si_tid = new_timer->it_id; ++++ + new_timer->sigq->info.si_code = SI_TIMER; ++++ + ++++ + spin_lock_irq(¤t->sighand->siglock); ++++ + new_timer->it_process = process; ++++ + list_add(&new_timer->list, ¤t->signal->posix_timers); ++++ + spin_unlock_irq(¤t->sighand->siglock); ++++ + ++++ + return 0; /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify * new_timer after the unlock call. */ ---- - out: ---- - if (error) ---- - release_posix_timer(new_timer, it_id_set); ---- - ++++ + release_posix_timer(new_timer, it_id_set); return error; } @@@@@@@ -612,7 -597,7 -597,7 -597,7 -556,7 -597,7 +571,7 @@@@@@@ * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ ---- -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) ++++ +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* @@@@@@@ -620,23 -605,23 -605,23 -605,23 -564,20 -605,23 +579,20 @@@@@@@ * flags part over to the timer lock. Must not let interrupts in * while we are moving the lock. 
*/ ---- - spin_lock_irqsave(&idr_lock, *flags); ---- - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); ++++ + timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); ---- - ---- - if ((timr->it_id != timer_id) || !(timr->it_process) || ---- - !same_thread_group(timr->it_process, current)) { ---- - spin_unlock(&timr->it_lock); ---- - spin_unlock_irqrestore(&idr_lock, *flags); ---- - timr = NULL; ---- - } else ++++ + if (timr->it_process && ++++ + same_thread_group(timr->it_process, current)) { spin_unlock(&idr_lock); ---- - } else ---- - spin_unlock_irqrestore(&idr_lock, *flags); ++++ + return timr; ++++ + } ++++ + spin_unlock(&timr->it_lock); ++++ + } ++++ + spin_unlock_irqrestore(&idr_lock, *flags); ---- - return timr; ++++ + return NULL; } /* @@@@@@@ -877,8 -862,8 -862,8 -862,8 -818,7 -862,8 +833,7 @@@@@@@ retry_delete * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ ---- - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) ---- - put_task_struct(timer->it_process); ++++ + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); @@@@@@@ -905,8 -890,8 -890,8 -890,8 -845,7 -890,8 +860,7 @@@@@@@ retry_delete * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ ---- - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) ---- - put_task_struct(timer->it_process); ++++ + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); diff --combined kernel/sched.c index 9a1ddb84e26,9a1ddb84e26,6f230596bd0,1a5f73c1fcd,ebb03def564,6f230596bd0..09a8c15748f --- a/kernel/sched.c +++ b/kernel/sched.c @@@@@@@ -201,14 -201,14 -201,19 -201,14 -201,14 -201,19 +201,19 @@@@@@@ void init_rt_bandwidth(struct rt_bandwi hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; -- -- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; ++ ++ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; ++ ++ } ++ ++ ++ ++ static inline int rt_bandwidth_enabled(void) ++ ++ { ++ ++ return sysctl_sched_rt_runtime >= 0; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; -- -- if (rt_b->rt_runtime == RUNTIME_INF) ++ ++ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@@@@@@ -298,9 -298,9 -303,9 -298,9 -298,9 -303,9 +303,9 @@@@@@@ static DEFINE_PER_CPU(struct cfs_rq, in static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -- -- #else /* !CONFIG_FAIR_GROUP_SCHED */ ++ ++ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -- -- #endif /* CONFIG_FAIR_GROUP_SCHED */ ++ ++ #endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. 
@@@@@@@ -604,9 -604,9 -609,9 -604,9 -604,9 -609,9 +609,9 @@@@@@@ struct rq static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -- -- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++ ++ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { -- -- rq->curr->sched_class->check_preempt_curr(rq, p); ++ ++ rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@@@@@@ -1087,7 -1087,7 -1092,7 -1087,7 -1087,7 -1092,7 +1092,7 @@@@@@@ hotplug_hrtick(struct notifier_block *n return NOTIFY_DONE; } -- -- static void init_hrtick(void) ++ ++ static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } @@@@@@@ -1102,7 -1102,7 -1107,7 -1102,7 -1102,7 -1107,7 +1107,7 @@@@@@@ static void hrtick_start(struct rq *rq hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -- -- static void init_hrtick(void) ++ ++ static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@@@@@@ -1119,9 -1119,9 -1124,9 -1119,9 -1119,9 -1124,9 +1124,9 @@@@@@@ static void init_rq_hrtick(struct rq *r hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; -- -- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; ++ ++ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -- -- #else ++ ++ #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@@@@@@ -1133,7 -1133,7 -1138,7 -1133,7 -1133,7 -1138,7 +1138,7 @@@@@@@ static inline void init_rq_hrtick(struc static inline void init_hrtick(void) { } -- -- #endif ++ ++ #endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@@@@@@ -1380,38 -1380,38 -1385,24 -1380,38 -1380,38 -1385,24 +1385,24 @@@@@@@ static inline void dec_cpu_load(struct update_load_sub(&rq->load, load); } -- -- #ifdef CONFIG_SMP -- -- static unsigned long source_load(int cpu, int type); -- -- static unsigned long target_load(int cpu, int type); -- -- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); -- -- -- -- static unsigned long cpu_avg_load_per_task(int cpu) -- -- { -- -- struct rq *rq = cpu_rq(cpu); -- -- -- -- if (rq->nr_running) -- -- rq->avg_load_per_task = rq->load.weight / rq->nr_running; -- -- -- -- return rq->avg_load_per_task; -- -- } -- -- -- -- #ifdef CONFIG_FAIR_GROUP_SCHED -- -- -- -- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); ++ ++ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) ++ ++ typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. 
*/ -- -- static void -- -- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) ++ ++ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; ++ ++ int ret; rcu_read_lock(); parent = &root_task_group; down: -- -- (*down)(parent, cpu, sd); ++ ++ ret = (*down)(parent, data); ++ ++ if (ret) ++ ++ goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@@@@@@ -1419,15 -1419,15 -1410,43 -1419,15 -1419,15 -1410,43 +1410,43 @@@@@@@ up: continue; } -- -- (*up)(parent, cpu, sd); ++ ++ ret = (*up)(parent, data); ++ ++ if (ret) ++ ++ goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; ++ ++ out_unlock: rcu_read_unlock(); ++ ++ ++ ++ return ret; ++ + } ++ + ++ ++ static int tg_nop(struct task_group *tg, void *data) ++ ++ { ++ ++ return 0; + } ++ ++ #endif ++ ++ ++ ++ #ifdef CONFIG_SMP ++ ++ static unsigned long source_load(int cpu, int type); ++ ++ static unsigned long target_load(int cpu, int type); ++ ++ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); ++ ++ ++ ++ static unsigned long cpu_avg_load_per_task(int cpu) ++ ++ { ++ ++ struct rq *rq = cpu_rq(cpu); ++ ++ ++ ++ if (rq->nr_running) ++ ++ rq->avg_load_per_task = rq->load.weight / rq->nr_running; ++ ++ ++ ++ return rq->avg_load_per_task; ++ ++ } ++ ++ ++ ++ #ifdef CONFIG_FAIR_GROUP_SCHED + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@@@@@@ -1486,11 -1486,11 -1505,11 -1486,11 -1486,11 -1505,11 +1505,11 @@@@@@@ __update_group_shares_cpu(struct task_g * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -- -- static void -- -- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) ++ ++ static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; ++ ++ struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@@@@@@ -1515,6 -1515,6 -1534,8 -1515,6 -1515,6 -1534,8 +1534,8 @@@@@@@ __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } ++ ++ ++ ++ return 0; } /* @@@@@@@ -1522,10 -1522,10 -1543,10 -1522,10 -1522,10 -1543,10 +1543,10 @@@@@@@ * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. 
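The reworked walk_tg_tree() above calls the @down visitor when it first enters a group, the @up visitor when it leaves the group for the last time, and aborts the whole walk as soon as either visitor returns nonzero. A small userspace analogue of that calling convention, using a hypothetical fixed-size tree instead of the real task_group hierarchy and plain recursion instead of the parent-pointer iteration:

#include <stdio.h>

struct group {
        const char *name;
        struct group *children[4];      /* NULL-terminated, enough for the sketch */
};

typedef int (*visitor_t)(struct group *, void *);

/* Simplified down/up walk with the same early-abort rule as walk_tg_tree(). */
static int walk(struct group *g, visitor_t down, visitor_t up, void *data)
{
        int ret = down(g, data);

        if (ret)
                return ret;
        for (int i = 0; g->children[i]; i++) {
                ret = walk(g->children[i], down, up, data);
                if (ret)
                        return ret;
        }
        return up(g, data);
}

static int print_down(struct group *g, void *data)
{
        printf("down %s\n", g->name);
        return 0;               /* nonzero here would stop the entire walk */
}

static int nop(struct group *g, void *data)
{
        return 0;
}

int main(void)
{
        struct group rt    = { "rt-tasks", { NULL } };
        struct group batch = { "batch",    { NULL } };
        struct group root  = { "root",     { &rt, &batch, NULL } };

        return walk(&root, print_down, nop, NULL);
}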
*/ -- -- static void -- -- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) ++ ++ static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; ++ ++ long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@@@@@@ -1536,11 -1536,11 -1557,8 -1536,11 -1536,11 -1557,8 +1557,8 @@@@@@@ } tg->cfs_rq[cpu]->h_load = load; -- -- } -- -- static void -- -- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -- -- { ++ ++ return 0; } static void update_shares(struct sched_domain *sd) @@@@@@@ -1550,7 -1550,7 -1568,7 -1550,7 -1550,7 -1568,7 +1568,7 @@@@@@@ if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; -- -- walk_tg_tree(tg_nop, tg_shares_up, 0, sd); ++ ++ walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@@@@@@ -1561,9 -1561,9 -1579,9 -1561,9 -1561,9 -1579,9 +1579,9 @@@@@@@ static void update_shares_locked(struc spin_lock(&rq->lock); } -- -- static void update_h_load(int cpu) ++ ++ static void update_h_load(long cpu) { -- -- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); ++ ++ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@@@@@@ -1921,11 -1921,11 -1939,8 -1921,11 -1921,11 -1939,8 +1939,8 @@@@@@@ unsigned long wait_task_inactive(struc running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; -- -- if (!match_state || p->state == match_state) { -- -- ncsw = p->nivcsw + p->nvcsw; -- -- if (unlikely(!ncsw)) -- -- ncsw = 1; -- -- } ++ ++ if (!match_state || p->state == match_state) ++ ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* @@@@@@@ -2285,7 -2285,7 -2300,7 -2285,7 -2285,7 -2300,7 +2300,7 @@@@@@@ out_running trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); -- -- check_preempt_curr(rq, p); ++ ++ check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@@@@@@ -2420,7 -2420,7 -2435,7 -2420,7 -2420,7 -2435,7 +2435,7 @@@@@@@ void wake_up_new_task(struct task_struc trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); -- -- check_preempt_curr(rq, p); ++ ++ check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@@@@@@ -2880,7 -2880,7 -2895,7 -2880,7 -2880,7 -2895,7 +2895,7 @@@@@@@ static void pull_task(struct rq *src_rq * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ -- -- check_preempt_curr(this_rq, p); ++ ++ check_preempt_curr(this_rq, p, 0); } /* @@@@@@@ -4037,23 -4037,23 -4052,23 -4037,23 -4037,26 -4052,23 +4052,26 @@@@@@@ DEFINE_PER_CPU(struct kernel_stat, ksta EXPORT_PER_CPU_SYMBOL(kstat); /* ---- - * Return p->sum_exec_runtime plus any more ns on the sched_clock ---- - * that have not yet been banked in case the task is currently running. ++++ + * Return any ns on the sched_clock that have not yet been banked in ++++ + * @p in case that task is currently running. 
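One detail of the wait_task_inactive() change above deserves a note: or-ing LONG_MIN into the sampled nvcsw count lets a plain zero still mean "the task never matched the requested state", while a genuinely sampled count of zero comes back with the top bit set and is therefore non-zero. A tiny userspace illustration of that encoding (not kernel code; the helper name is made up):

#include <stdio.h>

#define SAMPLED_BIT     (~(~0UL >> 1))  /* the MSB, what LONG_MIN supplies in the patch */

static unsigned long sample_ncsw(unsigned long nvcsw, int state_matched)
{
        return state_matched ? (nvcsw | SAMPLED_BIT) : 0;
}

int main(void)
{
        unsigned long a = sample_ncsw(0, 1);    /* matched, count happens to be 0 */
        unsigned long b = sample_ncsw(0, 0);    /* state never matched */

        printf("a sampled=%d count=%lu, b sampled=%d\n",
               a != 0, a & ~SAMPLED_BIT, b != 0);
        return 0;
}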
*/ ---- -unsigned long long task_sched_runtime(struct task_struct *p) ++++ +unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; ---- - u64 ns, delta_exec; struct rq *rq; ++++ + u64 ns = 0; rq = task_rq_lock(p, &flags); ---- - ns = p->se.sum_exec_runtime; ++++ + if (task_current(rq, p)) { ++++ + u64 delta_exec; ++++ + update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) ---- - ns += delta_exec; ++++ + ns = delta_exec; } ++++ + task_rq_unlock(rq, &flags); return ns; @@@@@@@ -4070,6 -4070,6 -4085,6 -4070,6 -4073,7 -4085,6 +4088,7 @@@@@@@ void account_user_time(struct task_stru cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); ++++ + account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@@@@@@ -4094,6 -4094,6 -4109,6 -4094,6 -4098,7 -4109,6 +4113,7 @@@@@@@ static void account_guest_time(struct t tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); ++++ + account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@@@@@@ -4129,6 -4129,6 -4144,6 -4129,6 -4134,7 -4144,6 +4149,7 @@@@@@@ void account_system_time(struct task_st } p->stime = cputime_add(p->stime, cputime); ++++ + account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@@@@@@ -4170,6 -4170,6 -4185,6 -4170,6 -4176,7 -4185,6 +4191,7 @@@@@@@ void account_steal_time(struct task_str if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); ++++ + account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else @@@@@@@ -4178,6 -4178,6 -4193,65 -4178,65 -4185,65 -4193,65 +4200,65 @@@@@@@ cpustat->steal = cputime64_add(cpustat->steal, tmp); } ++ /* ++ * Use precise platform statistics if available: ++ */ ++ #ifdef CONFIG_VIRT_CPU_ACCOUNTING ++ cputime_t task_utime(struct task_struct *p) ++ { ++ return p->utime; ++ } ++ ++ cputime_t task_stime(struct task_struct *p) ++ { ++ return p->stime; ++ } ++ #else ++ cputime_t task_utime(struct task_struct *p) ++ { ++ clock_t utime = cputime_to_clock_t(p->utime), ++ total = utime + cputime_to_clock_t(p->stime); ++ u64 temp; ++ ++ /* ++ * Use CFS's precise accounting: ++ */ ++ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); ++ ++ if (total) { ++ temp *= utime; ++ do_div(temp, total); ++ } ++ utime = (clock_t)temp; ++ ++ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); ++ return p->prev_utime; ++ } ++ ++ cputime_t task_stime(struct task_struct *p) ++ { ++ clock_t stime; ++ ++ /* ++ * Use CFS's precise accounting. (we subtract utime from ++ * the total, to make sure the total observed by userspace ++ * grows monotonically - apps rely on that): ++ */ ++ stime = nsec_to_clock_t(p->se.sum_exec_runtime) - ++ cputime_to_clock_t(task_utime(p)); ++ ++ if (stime >= 0) ++ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); ++ ++ return p->prev_stime; ++ } ++ #endif ++ ++ inline cputime_t task_gtime(struct task_struct *p) ++ { ++ return p->gtime; ++ } ++ /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
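The task_utime()/task_stime() pair above splits the precisely accounted sum_exec_runtime in proportion to the sampled utime/stime ratio, then clamps the result so the values handed to userspace never move backwards. A standalone sketch of that arithmetic, with plain integers standing in for cputime_t and do_div():

#include <stdio.h>

static unsigned long long prev_utime;   /* mirrors p->prev_utime */

static unsigned long long split_utime(unsigned long long utime,
                                      unsigned long long stime,
                                      unsigned long long sum_exec)
{
        unsigned long long total = utime + stime;
        unsigned long long scaled = sum_exec;

        if (total) {
                scaled *= utime;
                scaled /= total;        /* do_div() in the kernel version */
        }
        if (scaled > prev_utime)        /* keep the reported value monotonic */
                prev_utime = scaled;
        return prev_utime;
}

int main(void)
{
        printf("%llu\n", split_utime(30, 10, 100));     /* 75 */
        printf("%llu\n", split_utime(30, 30, 90));      /* still 75, not 45 */
        return 0;
}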
@@@@@@@ -4568,6 -4568,6 -4642,15 -4627,6 -4634,6 -4642,15 +4649,15 @@@@@@@ __wake_up_sync(wait_queue_head_t *q, un } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ ++ ++ /** ++ ++ * complete: - signals a single thread waiting on this completion ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This will wake up a single thread waiting on this completion. Threads will be ++ ++ * awakened in the same order in which they were queued. ++ ++ * ++ ++ * See also complete_all(), wait_for_completion() and related routines. ++ ++ */ void complete(struct completion *x) { unsigned long flags; @@@@@@@ -4579,6 -4579,6 -4662,12 -4638,6 -4645,6 -4662,12 +4669,12 @@@@@@@ } EXPORT_SYMBOL(complete); ++ ++ /** ++ ++ * complete_all: - signals all threads waiting on this completion ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This will wake up all threads waiting on this particular completion event. ++ ++ */ void complete_all(struct completion *x) { unsigned long flags; @@@@@@@ -4599,10 -4599,10 -4688,7 -4658,10 -4665,10 -4688,7 +4695,7 @@@@@@@ do_wait_for_common(struct completion *x wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { -- -- if ((state == TASK_INTERRUPTIBLE && -- -- signal_pending(current)) || -- -- (state == TASK_KILLABLE && -- -- fatal_signal_pending(current))) { ++ ++ if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@@@@@@ -4630,12 -4630,12 -4716,31 -4689,12 -4696,12 -4716,31 +4723,31 @@@@@@@ wait_for_common(struct completion *x, l return timeout; } ++ ++ /** ++ ++ * wait_for_completion: - waits for completion of a task ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This waits to be signaled for completion of a specific task. It is NOT ++ ++ * interruptible and there is no timeout. ++ ++ * ++ ++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout ++ ++ * and interrupt capability. Also see complete(). ++ ++ */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); ++ ++ /** ++ ++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout) ++ ++ * @x: holds the state of this particular completion ++ ++ * @timeout: timeout value in jiffies ++ ++ * ++ ++ * This waits for either a completion of a specific task to be signaled or for a ++ ++ * specified timeout to expire. The timeout is in jiffies. It is not ++ ++ * interruptible. ++ ++ */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@@@@@@ -4643,6 -4643,6 -4748,13 -4702,6 -4709,6 -4748,13 +4755,13 @@@@@@@ } EXPORT_SYMBOL(wait_for_completion_timeout); ++ ++ /** ++ ++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr) ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This waits for completion of a specific task to be signaled. It is ++ ++ * interruptible. 
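As a companion to the new kerneldoc, this is roughly how the documented pair is used from a module. A minimal kernel-style sketch, assuming a module context; the worker thread and all names are purely illustrative:

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
        /* ... perform the one-off setup ... */
        complete(&setup_done);          /* wakes exactly one waiter, FIFO order */
        return 0;
}

static int __init demo_init(void)
{
        struct task_struct *tsk = kthread_run(worker_fn, NULL, "demo-worker");

        if (IS_ERR(tsk))
                return PTR_ERR(tsk);

        /* uninterruptible, no timeout, as described in the kerneldoc above */
        wait_for_completion(&setup_done);
        return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");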
++ ++ */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@@@@@@ -4652,6 -4652,6 -4764,14 -4711,6 -4718,6 -4764,14 +4771,14 @@@@@@@ } EXPORT_SYMBOL(wait_for_completion_interruptible); ++ ++ /** ++ ++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) ++ ++ * @x: holds the state of this particular completion ++ ++ * @timeout: timeout value in jiffies ++ ++ * ++ ++ * This waits for either a completion of a specific task to be signaled or for a ++ ++ * specified timeout to expire. It is interruptible. The timeout is in jiffies. ++ ++ */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@@@@@@ -4660,6 -4660,6 -4780,13 -4719,6 -4726,6 -4780,13 +4787,13 @@@@@@@ } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); ++ ++ /** ++ ++ * wait_for_completion_killable: - waits for completion of a task (killable) ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This waits to be signaled for completion of a specific task. It can be ++ ++ * interrupted by a kill signal. ++ ++ */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@@@@@@ -5062,7 -5062,7 -5189,8 -5121,7 -5128,7 -5189,8 +5196,8 @@@@@@@ recheck * Do not allow realtime tasks into groups that have no runtime * assigned. */ -- -- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) ++ ++ if (rt_bandwidth_enabled() && rt_policy(policy) && ++ ++ task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@@@@@@ -5898,7 -5898,7 -6026,7 -5957,7 -5964,7 -6026,7 +6033,7 @@@@@@@ static int __migrate_task(struct task_s set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); -- -- check_preempt_curr(rq_dest, p); ++ ++ check_preempt_curr(rq_dest, p, 0); } done: ret = 1; @@@@@@@ -6223,7 -6223,7 -6351,7 -6282,7 -6289,7 -6351,7 +6358,7 @@@@@@@ set_table_entry(struct ctl_table *entry static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { -- -- struct ctl_table *table = sd_alloc_ctl_entry(12); ++ ++ struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@@@@@@ -6251,7 -6251,7 -6379,9 -6310,7 -6317,7 -6379,9 +6386,9 @@@@@@@ sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); -- -- /* &table[11] is terminator */ ++ ++ set_table_entry(&table[11], "name", sd->name, ++ ++ CORENAME_MAX_SIZE, 0444, proc_dostring); ++ ++ /* &table[12] is terminator */ return table; } @@@@@@@ -7135,13 -7135,13 -7265,21 -7194,13 -7201,13 -7265,21 +7272,21 @@@@@@@ static void init_sched_groups_power(in * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ ++ ++ #ifdef CONFIG_SCHED_DEBUG ++ ++ # define SD_INIT_NAME(sd, type) sd->name = #type ++ ++ #else ++ ++ # define SD_INIT_NAME(sd, type) do { } while (0) ++ ++ #endif ++ ++ #define SD_INIT(sd, type) sd_init_##type(sd) ++ ++ #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ ++ ++ SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@@@@@@ -7637,24 -7637,24 -7775,27 -7696,24 -7703,27 -7775,27 +7782,27 @@@@@@@ static int dattrs_equal(struct sched_do * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', 
it also forces the domains to be rebuilt. * ++ + * If doms_new==NULL it will be replaced with cpu_online_map. ++ + * ndoms_new==0 is a special case for destroying existing domains. ++ + * It will not create the default domain. ++ + * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { -- - int i, j; ++ + int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); -- - if (doms_new == NULL) -- - ndoms_new = 0; ++ + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { -- - for (j = 0; j < ndoms_new; j++) { ++ + for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@@@@@@ -7667,7 -7667,7 -7808,6 -7726,7 -7736,6 -7808,6 +7815,6 @@@@@@@ match1 if (doms_new == NULL) { ndoms_cur = 0; -- - ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; @@@@@@@ -7704,8 -7704,8 -7844,13 -7763,8 -7772,13 -7844,13 +7851,13 @@@@@@@ match2 int arch_reinit_sched_domains(void) { get_online_cpus(); ++ + ++ + /* Destroy domains first to force the rebuild */ ++ + partition_sched_domains(0, NULL, NULL); ++ + rebuild_sched_domains(); put_online_cpus(); ++ + return 0; } @@@@@@@ -7789,7 -7789,7 -7934,7 -7848,7 -7862,7 -7934,7 +7941,7 @@@@@@@ static int update_sched_domains(struct case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: -- - partition_sched_domains(0, NULL, NULL); ++ + partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: @@@@@@@ -8176,20 -8176,20 -8321,25 -8235,20 -8249,20 -8321,25 +8328,25 @@@@@@@ void __might_sleep(char *file, int line #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ -- -- if ((in_atomic() || irqs_disabled()) && -- -- system_state == SYSTEM_RUNNING && !oops_in_progress) { -- -- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -- -- return; -- -- prev_jiffy = jiffies; -- -- printk(KERN_ERR "BUG: sleeping function called from invalid" -- -- " context at %s:%d\n", file, line); -- -- printk("in_atomic():%d, irqs_disabled():%d\n", -- -- in_atomic(), irqs_disabled()); -- -- debug_show_held_locks(current); -- -- if (irqs_disabled()) -- -- print_irqtrace_events(current); -- -- dump_stack(); -- -- } ++ ++ if ((!in_atomic() && !irqs_disabled()) || ++ ++ system_state != SYSTEM_RUNNING || oops_in_progress) ++ ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ ++ return; ++ ++ prev_jiffy = jiffies; ++ ++ ++ ++ printk(KERN_ERR ++ ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ ++ file, line); ++ ++ printk(KERN_ERR ++ ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ ++ in_atomic(), irqs_disabled(), ++ ++ current->pid, current->comm); ++ ++ ++ ++ debug_show_held_locks(current); ++ ++ if (irqs_disabled()) ++ ++ print_irqtrace_events(current); ++ ++ dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); @@@@@@@ -8687,73 -8687,73 -8837,95 -8746,73 -8760,73 -8837,95 +8844,95 @@@@@@@ static DEFINE_MUTEX(rt_constraints_mute static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) -- -- return 1ULL << 16; ++ ++ return 1ULL << 20; -- -- return div64_u64(runtime << 16, period); ++ ++ return div64_u64(runtime << 20, period); } -- -- #ifdef CONFIG_CGROUP_SCHED -- -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 
runtime) ++ ++ /* Must be called with tasklist_lock held */ ++ ++ static inline int tg_has_rt_tasks(struct task_group *tg) { -- -- struct task_group *tgi, *parent = tg->parent; -- -- unsigned long total = 0; ++ ++ struct task_struct *g, *p; -- -- if (!parent) { -- -- if (global_rt_period() < period) -- -- return 0; ++ ++ do_each_thread(g, p) { ++ ++ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) ++ ++ return 1; ++ ++ } while_each_thread(g, p); -- -- return to_ratio(period, runtime) < -- -- to_ratio(global_rt_period(), global_rt_runtime()); -- -- } ++ ++ return 0; ++ ++ } -- -- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) -- -- return 0; ++ ++ struct rt_schedulable_data { ++ ++ struct task_group *tg; ++ ++ u64 rt_period; ++ ++ u64 rt_runtime; ++ ++ }; -- -- rcu_read_lock(); -- -- list_for_each_entry_rcu(tgi, &parent->children, siblings) { -- -- if (tgi == tg) -- -- continue; ++ ++ static int tg_schedulable(struct task_group *tg, void *data) ++ ++ { ++ ++ struct rt_schedulable_data *d = data; ++ ++ struct task_group *child; ++ ++ unsigned long total, sum = 0; ++ ++ u64 period, runtime; ++ -- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), -- tgi->rt_bandwidth.rt_runtime); ++ ++ period = ktime_to_ns(tg->rt_bandwidth.rt_period); ++ ++ runtime = tg->rt_bandwidth.rt_runtime; ++ -- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), -- tgi->rt_bandwidth.rt_runtime); ++ ++ if (tg == d->tg) { ++ ++ period = d->rt_period; ++ ++ runtime = d->rt_runtime; } -- -- rcu_read_unlock(); -- -- return total + to_ratio(period, runtime) <= -- -- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), -- -- parent->rt_bandwidth.rt_runtime); -- -- } -- -- #elif defined CONFIG_USER_SCHED -- -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -- -- { -- -- struct task_group *tgi; -- -- unsigned long total = 0; -- -- unsigned long global_ratio = -- -- to_ratio(global_rt_period(), global_rt_runtime()); ++ ++ /* ++ ++ * Cannot have more runtime than the period. ++ ++ */ ++ ++ if (runtime > period && runtime != RUNTIME_INF) ++ ++ return -EINVAL; -- -- rcu_read_lock(); -- -- list_for_each_entry_rcu(tgi, &task_groups, list) { -- -- if (tgi == tg) -- -- continue; ++ ++ /* ++ ++ * Ensure we don't starve existing RT tasks. ++ ++ */ ++ ++ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) ++ ++ return -EBUSY; ++ + - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); ++ ++ total = to_ratio(period, runtime); ++ ++ ++ ++ /* ++ ++ * Nobody can have more than the global setting allows. ++ ++ */ ++ ++ if (total > to_ratio(global_rt_period(), global_rt_runtime())) ++ ++ return -EINVAL; ++ -- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), -- tgi->rt_bandwidth.rt_runtime); ++ ++ /* ++ ++ * The sum of our children's runtime should not exceed our own. 
++ ++ */ ++ ++ list_for_each_entry_rcu(child, &tg->children, siblings) { ++ ++ period = ktime_to_ns(child->rt_bandwidth.rt_period); ++ ++ runtime = child->rt_bandwidth.rt_runtime; ++ ++ ++ ++ if (child == d->tg) { ++ ++ period = d->rt_period; ++ ++ runtime = d->rt_runtime; ++ ++ } ++ + - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); ++ ++ sum += to_ratio(period, runtime); } -- -- rcu_read_unlock(); -- -- return total + to_ratio(period, runtime) < global_ratio; ++ ++ if (sum > total) ++ ++ return -EINVAL; ++ ++ ++ ++ return 0; } -- -- #endif -- -- /* Must be called with tasklist_lock held */ -- -- static inline int tg_has_rt_tasks(struct task_group *tg) ++ ++ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { -- -- struct task_struct *g, *p; -- -- do_each_thread(g, p) { -- -- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) -- -- return 1; -- -- } while_each_thread(g, p); -- -- return 0; ++ ++ struct rt_schedulable_data data = { ++ ++ .tg = tg, ++ ++ .rt_period = period, ++ ++ .rt_runtime = runtime, ++ ++ }; ++ ++ ++ ++ return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@@@@@@ -8763,14 -8763,14 -8935,9 -8822,14 -8836,14 -8935,9 +8942,9 @@@@@@@ mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); -- -- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { -- -- err = -EBUSY; -- -- goto unlock; -- -- } -- -- if (!__rt_schedulable(tg, rt_period, rt_runtime)) { -- -- err = -EINVAL; ++ ++ err = __rt_schedulable(tg, rt_period, rt_runtime); ++ ++ if (err) goto unlock; -- -- } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@@@@@@ -8839,16 -8839,16 -9006,25 -8898,16 -8912,16 -9006,25 +9013,25 @@@@@@@ long sched_group_rt_period(struct task_ static int sched_rt_global_constraints(void) { -- -- struct task_group *tg = &root_task_group; -- -- u64 rt_runtime, rt_period; ++ ++ u64 runtime, period; int ret = 0; -- -- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); -- -- rt_runtime = tg->rt_bandwidth.rt_runtime; ++ ++ if (sysctl_sched_rt_period <= 0) ++ ++ return -EINVAL; ++ ++ ++ ++ runtime = global_rt_runtime(); ++ ++ period = global_rt_period(); ++ ++ ++ ++ /* ++ ++ * Sanity check on the sysctl variables. 
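The schedulability check above works in 20-bit fixed point: to_ratio() turns each (period, runtime) pair into runtime/period scaled by 2^20, and tg_schedulable() rejects a setting once the sum over the children exceeds the parent's own ratio. A standalone arithmetic sketch of that comparison (the microsecond figures are invented for the example):

#include <stdio.h>

#define RATIO_SHIFT     20      /* the scale factor this merge switches to */

static unsigned long long to_ratio(unsigned long long period,
                                   unsigned long long runtime)
{
        return (runtime << RATIO_SHIFT) / period;
}

int main(void)
{
        /* parent allows 950us of every 1000us; children ask for 400us + 600us */
        unsigned long long parent = to_ratio(1000, 950);
        unsigned long long sum = to_ratio(1000, 400) + to_ratio(1000, 600);

        printf("parent=%llu children=%llu -> %s\n", parent, sum,
               sum > parent ? "-EINVAL" : "ok");
        return 0;
}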
++ ++ */ ++ ++ if (runtime > period && runtime != RUNTIME_INF) ++ ++ return -EINVAL; mutex_lock(&rt_constraints_mutex); -- -- if (!__rt_schedulable(tg, rt_period, rt_runtime)) -- -- ret = -EINVAL; ++ ++ read_lock(&tasklist_lock); ++ ++ ret = __rt_schedulable(NULL, 0, 0); ++ ++ read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; @@@@@@@ -8859,6 -8859,6 -9035,9 -8918,6 -8932,6 -9035,9 +9042,9 @@@@@@@ static int sched_rt_global_constraints( unsigned long flags; int i; ++ ++ if (sysctl_sched_rt_period <= 0) ++ ++ return -EINVAL; ++ ++ spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@@@@@@ -8919,7 -8919,7 -9098,6 -8978,7 -8992,7 -9098,6 +9105,6 @@@@@@@ cpu_cgroup_create(struct cgroup_subsys if (!cgrp->parent) { /* This is early initialization for the top cgroup */ -- -- init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@@@@@@ -8928,9 -8928,9 -9106,6 -8987,9 -9001,9 -9106,6 +9113,6 @@@@@@@ if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); -- -- /* Bind the cgroup to task_group object we just created */ -- -- tg->css.cgroup = cgrp; -- -- return &tg->css; } diff --combined kernel/sched_fair.c index fb8994c6d4b,fb8994c6d4b,18fd17172eb,fb8994c6d4b,99aa31acc54,18fd17172eb..f604dae7131 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@@@@@@ -408,64 -408,64 -408,6 -408,64 -408,64 -408,6 +408,6 @@@@@@@ static u64 sched_vslice_add(struct cfs_ return __sched_period(nr_running); } -- -- /* -- -- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in -- -- * that it favours >=0 over <0. -- -- * -- -- * -20 | -- -- * | -- -- * 0 --------+------- -- -- * .' -- -- * 19 .' -- -- * -- -- */ -- -- static unsigned long -- -- calc_delta_asym(unsigned long delta, struct sched_entity *se) -- -- { -- -- struct load_weight lw = { -- -- .weight = NICE_0_LOAD, -- -- .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) -- -- }; -- -- -- -- for_each_sched_entity(se) { -- -- struct load_weight *se_lw = &se->load; -- -- unsigned long rw = cfs_rq_of(se)->load.weight; -- -- -- -- #ifdef CONFIG_FAIR_SCHED_GROUP -- -- struct cfs_rq *cfs_rq = se->my_q; -- -- struct task_group *tg = NULL -- -- -- -- if (cfs_rq) -- -- tg = cfs_rq->tg; -- -- -- -- if (tg && tg->shares < NICE_0_LOAD) { -- -- /* -- -- * scale shares to what it would have been had -- -- * tg->weight been NICE_0_LOAD: -- -- * -- -- * weight = 1024 * shares / tg->weight -- -- */ -- -- lw.weight *= se->load.weight; -- -- lw.weight /= tg->shares; -- -- -- -- lw.inv_weight = 0; -- -- -- -- se_lw = &lw; -- -- rw += lw.weight - se->load.weight; -- -- } else -- -- #endif -- -- -- -- if (se->load.weight < NICE_0_LOAD) { -- -- se_lw = &lw; -- -- rw += NICE_0_LOAD - se->load.weight; -- -- } -- -- -- -- delta = calc_delta_mine(delta, rw, se_lw); -- -- } -- -- -- -- return delta; -- -- } -- -- /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
@@@@@@@ -507,6 -507,6 -449,6 -507,6 -507,7 -449,6 +449,7 @@@@@@@ static void update_curr(struct cfs_rq * struct task_struct *curtask = task_of(curr); cpuacct_charge(curtask, delta_exec); ++++ + account_group_exec_runtime(curtask, delta_exec); } } @@@@@@@ -586,11 -586,11 -528,12 -586,11 -587,11 -528,12 +529,12 @@@@@@@ account_entity_enqueue(struct cfs_rq *c update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); -- -- if (entity_is_task(se)) ++ ++ if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); ++ ++ list_add(&se->group_node, &cfs_rq->tasks); ++ ++ } cfs_rq->nr_running++; se->on_rq = 1; -- -- list_add(&se->group_node, &cfs_rq->tasks); } static void @@@@@@@ -599,11 -599,11 -542,12 -599,11 -600,11 -542,12 +543,12 @@@@@@@ account_entity_dequeue(struct cfs_rq *c update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); -- -- if (entity_is_task(se)) ++ ++ if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); ++ ++ list_del_init(&se->group_node); ++ ++ } cfs_rq->nr_running--; se->on_rq = 0; -- -- list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@@@@@@ -1085,7 -1085,7 -1029,6 -1085,7 -1086,7 -1029,6 +1030,6 @@@@@@@ static long effective_load(struct task_ long wl, long wg) { struct sched_entity *se = tg->se[cpu]; -- -- long more_w; if (!tg->parent) return wl; @@@@@@@ -1097,18 -1097,18 -1040,17 -1097,18 -1098,18 -1040,17 +1041,17 @@@@@@@ if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; -- -- /* -- -- * Instead of using this increment, also add the difference -- -- * between when the shares were last updated and now. -- -- */ -- -- more_w = se->my_q->load.weight - se->my_q->rq_weight; -- -- wl += more_w; -- -- wg += more_w; -- -- for_each_sched_entity(se) { -- -- #define D(n) (likely(n) ? (n) : 1) -- -- long S, rw, s, a, b; ++ ++ long more_w; ++ ++ ++ ++ /* ++ ++ * Instead of using this increment, also add the difference ++ ++ * between when the shares were last updated and now. ++ ++ */ ++ ++ more_w = se->my_q->load.weight - se->my_q->rq_weight; ++ ++ wl += more_w; ++ ++ wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@@@@@@ -1117,7 -1117,7 -1059,11 -1117,7 -1118,7 -1059,11 +1060,11 @@@@@@@ a = S*(rw + wl); b = S*rw + s*wg; -- -- wl = s*(a-b)/D(b); ++ ++ wl = s*(a-b); ++ ++ ++ ++ if (likely(b)) ++ ++ wl /= b; ++ ++ /* * Assume the group is already running and will * thus already be accounted for in the weight. @@@@@@@ -1126,7 -1126,7 -1072,6 -1126,7 -1127,7 -1072,6 +1073,6 @@@@@@@ * alter the group weight. 
*/ wg = 0; -- -- #undef D } return wl; @@@@@@@ -1143,7 -1143,7 -1088,7 -1143,7 -1144,7 -1088,7 +1089,7 @@@@@@@ static inline unsigned long effective_l #endif static int -- -- wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, ++ ++ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@@@@@@ -1158,6 -1158,6 -1103,11 -1158,6 -1159,6 -1103,11 +1104,11 @@@@@@@ if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; ++ ++ if (!sync && sched_feat(SYNC_WAKEUPS) && ++ ++ curr->se.avg_overlap < sysctl_sched_migration_cost && ++ ++ p->se.avg_overlap < sysctl_sched_migration_cost) ++ ++ sync = 1; ++ ++ /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@@@@@@ -1182,17 -1182,17 -1132,14 -1182,17 -1183,17 -1132,14 +1133,14 @@@@@@@ * a reasonable amount of time then attract this newly * woken task: */ -- -- if (sync && balanced) { -- -- if (curr->se.avg_overlap < sysctl_sched_migration_cost && -- -- p->se.avg_overlap < sysctl_sched_migration_cost) -- -- return 1; -- -- } ++ ++ if (sync && balanced) ++ ++ return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); -- -- if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || -- -- balanced) { ++ ++ if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= ++ ++ tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@@@@@@ -1211,16 -1211,16 -1158,17 -1211,16 -1212,16 -1158,17 +1159,17 @@@@@@@ static int select_task_rq_fair(struct t struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; -- -- struct rq *rq, *this_rq; ++ ++ struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); -- -- rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; ++ ++ if (prev_cpu == this_cpu) ++ ++ goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@@@@@@ -1248,13 -1248,13 -1196,10 -1248,13 -1249,13 -1196,10 +1197,10 @@@@@@@ load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); -- -- if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, ++ ++ if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; -- -- if (prev_cpu == this_cpu) -- -- goto out; -- -- /* * Start passive balancing when half the imbalance_pct * limit is reached. @@@@@@@ -1281,62 -1281,62 -1226,20 -1281,62 -1282,62 -1226,20 +1227,20 @@@@@@@ static unsigned long wakeup_gran(struc * + nice tasks. */ if (sched_feat(ASYM_GRAN)) -- -- gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); -- -- else -- -- gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); ++ ++ gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } -- -- /* -- -- * Should 'se' preempt 'curr'. 
-- -- * -- -- * |s1 -- -- * |s2 -- -- * |s3 -- -- * g -- -- * |<--->|c -- -- * -- -- * w(c, s1) = -1 -- -- * w(c, s2) = 0 -- -- * w(c, s3) = 1 -- -- * -- -- */ -- -- static int -- -- wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -- -- { -- -- s64 gran, vdiff = curr->vruntime - se->vruntime; -- -- -- -- if (vdiff < 0) -- -- return -1; -- -- -- -- gran = wakeup_gran(curr); -- -- if (vdiff > gran) -- -- return 1; -- -- -- -- return 0; -- -- } -- -- -- -- /* return depth at which a sched entity is present in the hierarchy */ -- -- static inline int depth_se(struct sched_entity *se) -- -- { -- -- int depth = 0; -- -- -- -- for_each_sched_entity(se) -- -- depth++; -- -- -- -- return depth; -- -- } -- -- /* * Preempt the current task with a newly woken task if needed: */ -- -- static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) ++ ++ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; -- -- int se_depth, pse_depth; ++ ++ s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@@@@@@ -1350,6 -1350,6 -1253,13 -1350,6 -1351,6 -1253,13 +1254,13 @@@@@@@ cfs_rq_of(pse)->next = pse; ++ ++ /* ++ ++ * We can come here with TIF_NEED_RESCHED already set from new task ++ ++ * wake up path. ++ ++ */ ++ ++ if (test_tsk_need_resched(curr)) ++ ++ return; ++ ++ /* * Batch tasks do not preempt (their preemption is driven by * the tick): @@@@@@@ -1360,33 -1360,33 -1270,15 -1360,33 -1361,33 -1270,15 +1271,15 @@@@@@@ if (!sched_feat(WAKEUP_PREEMPT)) return; -- -- /* -- -- * preemption test can be made between sibling entities who are in the -- -- * same cfs_rq i.e who have a common parent. Walk up the hierarchy of -- -- * both tasks until we find their ancestors who are siblings of common -- -- * parent. 
-- -- */ -- -- -- -- /* First walk up until both entities are at same depth */ -- -- se_depth = depth_se(se); -- -- pse_depth = depth_se(pse); -- -- -- -- while (se_depth > pse_depth) { -- -- se_depth--; -- -- se = parent_entity(se); -- -- } -- -- -- -- while (pse_depth > se_depth) { -- -- pse_depth--; -- -- pse = parent_entity(pse); -- -- } -- -- -- -- while (!is_same_group(se, pse)) { -- -- se = parent_entity(se); -- -- pse = parent_entity(pse); ++ ++ if (sched_feat(WAKEUP_OVERLAP) && (sync || ++ ++ (se->avg_overlap < sysctl_sched_migration_cost && ++ ++ pse->avg_overlap < sysctl_sched_migration_cost))) { ++ ++ resched_task(curr); ++ ++ return; } -- -- if (wakeup_preempt_entity(se, pse) == 1) ++ ++ delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ ++ if (delta_exec > wakeup_gran(pse)) resched_task(curr); } @@@@@@@ -1445,19 -1445,19 -1337,9 -1445,19 -1446,19 -1337,9 +1338,9 @@@@@@@ __load_balance_iterator(struct cfs_rq * if (next == &cfs_rq->tasks) return NULL; -- -- /* Skip over entities that are not tasks */ -- -- do { -- -- se = list_entry(next, struct sched_entity, group_node); -- -- next = next->next; -- -- } while (next != &cfs_rq->tasks && !entity_is_task(se)); -- -- -- -- if (next == &cfs_rq->tasks) -- -- return NULL; -- -- -- -- cfs_rq->balance_iterator = next; -- -- -- -- if (entity_is_task(se)) -- -- p = task_of(se); ++ ++ se = list_entry(next, struct sched_entity, group_node); ++ ++ p = task_of(se); ++ ++ cfs_rq->balance_iterator = next->next; return p; } @@@@@@@ -1507,7 -1507,7 -1389,7 -1507,7 -1508,7 -1389,7 +1390,7 @@@@@@@ load_balance_fair(struct rq *this_rq, i rcu_read_lock(); update_h_load(busiest_cpu); -- -- list_for_each_entry(tg, &task_groups, list) { ++ ++ list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; @@@@@@@ -1620,10 -1620,10 -1502,10 -1620,10 -1621,10 -1502,10 +1503,10 @@@@@@@ static void task_new_fair(struct rq *rq * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); ++ ++ resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); -- -- resched_task(rq->curr); } /* @@@@@@@ -1642,7 -1642,7 -1524,7 -1642,7 -1643,7 -1524,7 +1525,7 @@@@@@@ static void prio_changed_fair(struct r if (p->prio > oldprio) resched_task(rq->curr); } else -- -- check_preempt_curr(rq, p); ++ ++ check_preempt_curr(rq, p, 0); } /* @@@@@@@ -1659,7 -1659,7 -1541,7 -1659,7 -1660,7 -1541,7 +1542,7 @@@@@@@ static void switched_to_fair(struct rq if (running) resched_task(rq->curr); else -- -- check_preempt_curr(rq, p); ++ ++ check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. 
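The new check_preempt_wakeup() above boils down to two tests (leaving the sched_feat() gates aside): preempt right away when the wakeup looks synchronous, i.e. both tasks' avg_overlap is below the migration cost, and otherwise preempt only once the running task has consumed more than one wakeup granularity since it last went on the CPU. A sketch of just that decision logic, with illustrative constants and no scheduler state:

#include <stdio.h>

#define MIGRATION_COST  500000ULL       /* 0.5ms, the usual default, in ns */

static int should_preempt(unsigned long long curr_delta_exec,
                          unsigned long long wakeup_gran,
                          unsigned long long curr_overlap,
                          unsigned long long waker_overlap,
                          int sync)
{
        if (sync || (curr_overlap < MIGRATION_COST &&
                     waker_overlap < MIGRATION_COST))
                return 1;       /* synchronous-looking wakeup: preempt now */

        return curr_delta_exec > wakeup_gran;
}

int main(void)
{
        printf("%d\n", should_preempt(2000000, 1000000, 900000, 900000, 0)); /* 1 */
        printf("%d\n", should_preempt(200000, 1000000, 900000, 900000, 0));  /* 0 */
        return 0;
}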
diff --combined kernel/sched_rt.c index 998ba54b454,998ba54b454,cdf5740ab03,552310798da,8375e69af36,cdf5740ab03..b446dc87494 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@@@@@@ -102,12 -102,12 -102,12 -102,12 -102,12 -102,12 +102,12 @@@@@@@ static void dequeue_rt_entity(struct sc static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { ++ ++ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; -- -- if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { -- -- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; -- -- -- -- enqueue_rt_entity(rt_se); ++ ++ if (rt_rq->rt_nr_running) { ++ ++ if (rt_se && !on_rt_rq(rt_se)) ++ ++ enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } @@@@@@@ -199,6 -199,6 -199,8 -199,8 -199,8 -199,8 +199,8 @@@@@@@ static inline struct rt_rq *group_rt_rq static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { ++ if (rt_rq->rt_nr_running) ++ resched_task(rq_of_rt_rq(rt_rq)->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@@@@@@ -229,6 -229,6 -231,9 -231,6 -231,6 -231,9 +231,9 @@@@@@@ static inline struct rt_bandwidth *sche #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP ++ ++ /* ++ ++ * We ran out of runtime, see if we can borrow some from our neighbours. ++ ++ */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@@@@@@ -248,9 -248,9 -253,18 -250,9 -250,9 -253,18 +253,18 @@@@@@@ continue; spin_lock(&iter->rt_runtime_lock); ++ ++ /* ++ ++ * Either all rqs have inf runtime and there's nothing to steal ++ ++ * or __disable_runtime() below sets a specific rq to inf to ++ ++ * indicate its been disabled and disalow stealing. ++ ++ */ if (iter->rt_runtime == RUNTIME_INF) goto next; ++ ++ /* ++ ++ * From runqueues with spare time, take 1/n part of their ++ ++ * spare time, but no more than our period. ++ ++ */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@@@@@@ -272,6 -272,6 -286,9 -274,6 -274,6 -286,9 +286,9 @@@@@@@ next return more; } ++ ++ /* ++ ++ * Ensure this RQ takes back all the runtime it lend to its neighbours. ++ ++ */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@@@@@@ -287,17 -287,17 -304,33 -289,17 -289,17 -304,33 +304,33 @@@@@@@ spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); ++ ++ /* ++ ++ * Either we're all inf and nobody needs to borrow, or we're ++ ++ * already disabled and thus have nothing to do, or we have ++ ++ * exactly the right amount of runtime to take out. ++ ++ */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); ++ ++ /* ++ ++ * Calculate the difference between what we started out with ++ ++ * and what we current have, that's the amount of runtime ++ ++ * we lend and now have to reclaim. ++ ++ */ want = rt_b->rt_runtime - rt_rq->rt_runtime; ++ ++ /* ++ ++ * Greedy reclaim, take back as much as we can. ++ ++ */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; ++ ++ /* ++ ++ * Can't reclaim from ourselves or disabled runqueues. ++ ++ */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@@@@@@ -317,8 -317,8 -350,16 -319,8 -319,8 -350,16 +350,16 @@@@@@@ } spin_lock(&rt_rq->rt_runtime_lock); ++ ++ /* ++ ++ * We cannot be left wanting - that would mean some runtime ++ ++ * leaked out of the system. 
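The borrowing loop in do_balance_runtime() above takes 1/n of each neighbour's spare runtime (n being the number of CPUs in the root domain) and never lets a runqueue grow past its full period. The shape of that arithmetic in a standalone sketch, with made-up numbers and arbitrary units:

#include <stdio.h>

int main(void)
{
        unsigned long long period = 1000, runtime = 600, rt_time = 580;
        unsigned long long spare[] = { 300, 0, 150 };   /* neighbours' runtime - rt_time */
        int weight = 3;                                 /* CPUs in the root domain */

        for (int i = 0; i < 3; i++) {
                unsigned long long diff = spare[i] / weight;

                if (runtime + diff > period)            /* never exceed the period */
                        diff = period - runtime;
                runtime += diff;
                printf("after cpu%d: runtime=%llu\n", i, runtime);
        }
        printf("headroom now %llu\n", runtime - rt_time);
        return 0;
}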
++ ++ */ BUG_ON(want); balanced: ++ ++ /* ++ ++ * Disable all the borrow logic by pretending we have inf ++ ++ * runtime - in which case borrowing doesn't make sense. ++ ++ */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@@@@@@ -341,6 -341,6 -382,9 -343,6 -343,6 -382,9 +382,9 @@@@@@@ static void __enable_runtime(struct rq if (unlikely(!scheduler_running)) return; ++ ++ /* ++ ++ * Reset each runqueue's bandwidth settings ++ ++ */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@@@@@@ -348,6 -348,6 -392,7 -350,6 -350,6 -392,7 +392,7 @@@@@@@ spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; ++ ++ rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } @@@@@@@ -386,7 -386,7 -431,7 -388,7 -388,7 -431,7 +431,7 @@@@@@@ static int do_sched_rt_period_timer(str int i, idle = 1; cpumask_t span; -- -- if (rt_b->rt_runtime == RUNTIME_INF) ++ ++ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@@@@@@ -438,9 -438,9 -483,6 -440,6 -440,6 -483,6 +483,6 @@@@@@@ static int sched_rt_runtime_exceeded(st { u64 runtime = sched_rt_runtime(rt_rq); -- if (runtime == RUNTIME_INF) -- return 0; -- if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@@@@@@ -484,16 -484,16 -526,21 -483,18 -483,20 -526,21 +526,23 @@@@@@@ static void update_curr_rt(struct rq *r schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; ++++ + account_group_exec_runtime(curr, delta_exec); ++++ + curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); ++ ++ if (!rt_bandwidth_enabled()) ++ ++ return; ++ ++ for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); -- rt_rq->rt_time += delta_exec; -- if (sched_rt_runtime_exceeded(rt_rq)) -- resched_task(curr); ++ if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { ++ rt_rq->rt_time += delta_exec; ++ if (sched_rt_runtime_exceeded(rt_rq)) ++ resched_task(curr); ++ } spin_unlock(&rt_rq->rt_runtime_lock); } } @@@@@@@ -782,7 -782,7 -829,7 -783,7 -785,7 -829,7 +831,7 @@@@@@@ static void check_preempt_equal_prio(st /* * Preempt the current task with a newly woken task if needed: */ -- -- static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) ++ ++ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); @@@@@@@ -1411,7 -1411,7 -1458,7 -1412,7 -1414,7 -1458,7 +1460,7 @@@@@@@ static void watchdog(struct rq *rq, str p->rt.timeout++; next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) ---- - p->it_sched_expires = p->se.sum_exec_runtime; ++++ + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } diff --combined kernel/softirq.c index c506f266a6b,c506f266a6b,d410014279e,c506f266a6b,c506f266a6b,83ba21a13bd..7110daeb9a9 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@@@@@@ -6,6 -6,6 -6,6 -6,6 -6,6 -6,8 +6,8 @@@@@@@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) +++++ * +++++ * Remote softirq infrastructure is by Jens Axboe. 
*/ #include @@@@@@@ -46,7 -46,7 -46,7 -46,7 -46,7 -48,7 +48,7 @@@@@@@ irq_cpustat_t irq_stat[NR_CPUS] ____cac EXPORT_SYMBOL(irq_stat); #endif -- -- static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; ++ ++ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); @@@@@@@ -205,7 -205,7 -205,18 -205,7 -205,7 -207,18 +207,18 @@@@@@@ restart do { if (pending & 1) { ++ ++ int prev_count = preempt_count(); ++ ++ h->action(h); ++ ++ ++ ++ if (unlikely(prev_count != preempt_count())) { ++ ++ printk(KERN_ERR "huh, entered softirq %td %p" ++ ++ "with preempt_count %08x," ++ ++ " exited with %08x?\n", h - softirq_vec, ++ ++ h->action, prev_count, preempt_count()); ++ ++ preempt_count() = prev_count; ++ ++ } ++ ++ rcu_bh_qsctr_inc(cpu); } h++; @@@@@@@ -254,16 -254,16 -265,12 -254,16 -254,16 -267,16 +267,12 @@@@@@@ asmlinkage void do_softirq(void */ void irq_enter(void) { -- ---#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); ++ +++ if (idle_cpu(cpu) && !in_interrupt()) -- --- tick_nohz_stop_idle(cpu); -- ---#endif ++ +++ tick_check_idle(cpu); ++ +++ __irq_enter(); -- ---#ifdef CONFIG_NO_HZ -- --- if (idle_cpu(cpu)) -- --- tick_nohz_update_jiffies(); -- ---#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@@@@@@ -463,17 -463,17 -470,17 -463,17 -463,17 -476,144 +472,144 @@@@@@@ void tasklet_kill(struct tasklet_struc EXPORT_SYMBOL(tasklet_kill); +++++ DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +++++ EXPORT_PER_CPU_SYMBOL(softirq_work_list); +++++ +++++ static void __local_trigger(struct call_single_data *cp, int softirq) +++++ { +++++ struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); +++++ +++++ list_add_tail(&cp->list, head); +++++ +++++ /* Trigger the softirq only if the list was previously empty. */ +++++ if (head->next == &cp->list) +++++ raise_softirq_irqoff(softirq); +++++ } +++++ +++++ #ifdef CONFIG_USE_GENERIC_SMP_HELPERS +++++ static void remote_softirq_receive(void *data) +++++ { +++++ struct call_single_data *cp = data; +++++ unsigned long flags; +++++ int softirq; +++++ +++++ softirq = cp->priv; +++++ +++++ local_irq_save(flags); +++++ __local_trigger(cp, softirq); +++++ local_irq_restore(flags); +++++ } +++++ +++++ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +++++ { +++++ if (cpu_online(cpu)) { +++++ cp->func = remote_softirq_receive; +++++ cp->info = cp; +++++ cp->flags = 0; +++++ cp->priv = softirq; +++++ +++++ __smp_call_function_single(cpu, cp); +++++ return 0; +++++ } +++++ return 1; +++++ } +++++ #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +++++ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +++++ { +++++ return 1; +++++ } +++++ #endif +++++ +++++ /** +++++ * __send_remote_softirq - try to schedule softirq work on a remote cpu +++++ * @cp: private SMP call function data area +++++ * @cpu: the remote cpu +++++ * @this_cpu: the currently executing cpu +++++ * @softirq: the softirq for the work +++++ * +++++ * Attempt to schedule softirq work on a remote cpu. If this cannot be +++++ * done, the work is instead queued up on the local cpu. +++++ * +++++ * Interrupts must be disabled. 
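A possible caller of this new interface, sketched only to show the intended shape: a completion path hands a finished request back to the CPU that submitted it through the send_remote_softirq() wrapper defined just below. The request structure, the handler-side draining of softirq_work_list[] and the choice of BLOCK_SOFTIRQ are assumptions for the example, not part of this merge:

#include <linux/interrupt.h>
#include <linux/smp.h>

struct demo_req {
        int submit_cpu;                 /* CPU that issued the request */
        struct call_single_data csd;    /* ends up on softirq_work_list[] */
        /* ... payload ... */
};

/* Called from the completion path; may run on any CPU. */
static void demo_complete(struct demo_req *req)
{
        /* falls back to the local CPU when submit_cpu is this CPU or offline */
        send_remote_softirq(&req->csd, req->submit_cpu, BLOCK_SOFTIRQ);
}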
+++++ */ +++++ void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +++++ { +++++ if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) +++++ __local_trigger(cp, softirq); +++++ } +++++ EXPORT_SYMBOL(__send_remote_softirq); +++++ +++++ /** +++++ * send_remote_softirq - try to schedule softirq work on a remote cpu +++++ * @cp: private SMP call function data area +++++ * @cpu: the remote cpu +++++ * @softirq: the softirq for the work +++++ * +++++ * Like __send_remote_softirq except that disabling interrupts and +++++ * computing the current cpu is done for the caller. +++++ */ +++++ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +++++ { +++++ unsigned long flags; +++++ int this_cpu; +++++ +++++ local_irq_save(flags); +++++ this_cpu = smp_processor_id(); +++++ __send_remote_softirq(cp, cpu, this_cpu, softirq); +++++ local_irq_restore(flags); +++++ } +++++ EXPORT_SYMBOL(send_remote_softirq); +++++ +++++ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, +++++ unsigned long action, void *hcpu) +++++ { +++++ /* +++++ * If a CPU goes away, splice its entries to the current CPU +++++ * and trigger a run of the softirq +++++ */ +++++ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { +++++ int cpu = (unsigned long) hcpu; +++++ int i; +++++ +++++ local_irq_disable(); +++++ for (i = 0; i < NR_SOFTIRQS; i++) { +++++ struct list_head *head = &per_cpu(softirq_work_list[i], cpu); +++++ struct list_head *local_head; +++++ +++++ if (list_empty(head)) +++++ continue; +++++ +++++ local_head = &__get_cpu_var(softirq_work_list[i]); +++++ list_splice_init(head, local_head); +++++ raise_softirq_irqoff(i); +++++ } +++++ local_irq_enable(); +++++ } +++++ +++++ return NOTIFY_OK; +++++ } +++++ +++++ static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { +++++ .notifier_call = remote_softirq_cpu_notify, +++++ }; +++++ void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { +++++ int i; +++++ per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; +++++ for (i = 0; i < NR_SOFTIRQS; i++) +++++ INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } +++++ register_hotcpu_notifier(&remote_softirq_cpu_notifier); +++++ open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --combined kernel/sys.c index 038a7bc0901,038a7bc0901,0bc8fa3c228,038a7bc0901,d046a7a055c,0bc8fa3c228..53879cdae48 --- a/kernel/sys.c +++ b/kernel/sys.c @@@@@@@ -853,38 -853,38 -853,38 -853,38 -853,28 -853,38 +853,28 @@@@@@@ asmlinkage long sys_setfsgid(gid_t gid return old_fsgid; } ++++ +void do_sys_times(struct tms *tms) ++++ +{ ++++ + struct task_cputime cputime; ++++ + cputime_t cutime, cstime; ++++ + ++++ + spin_lock_irq(¤t->sighand->siglock); ++++ + thread_group_cputime(current, &cputime); ++++ + cutime = current->signal->cutime; ++++ + cstime = current->signal->cstime; ++++ + spin_unlock_irq(¤t->sighand->siglock); ++++ + tms->tms_utime = cputime_to_clock_t(cputime.utime); ++++ + tms->tms_stime = cputime_to_clock_t(cputime.stime); ++++ + tms->tms_cutime = cputime_to_clock_t(cutime); ++++ + tms->tms_cstime = cputime_to_clock_t(cstime); ++++ +} ++++ + asmlinkage long sys_times(struct tms __user * tbuf) { ---- - /* ---- - * In the SMP world we might just be unlucky and have one of ---- - * the times increment as we use it. 
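From userspace, the effect of the reworked do_sys_times() is simply what times(2) reports. A small standalone check of the group-wide utime/stime totals (the busy loop is only there to make the numbers non-zero):

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
        struct tms t;
        long hz = sysconf(_SC_CLK_TCK);

        for (volatile unsigned long i = 0; i < 50000000UL; i++)
                ;                       /* burn a little user time */

        if (times(&t) == (clock_t)-1)
                return 1;

        printf("utime %.2fs stime %.2fs (totals for all threads)\n",
               (double)t.tms_utime / hz, (double)t.tms_stime / hz);
        return 0;
}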
Since the value is an ---- - * atomically safe type this is just fine. Conceptually its ---- - * as if the syscall took an instant longer to occur. ---- - */ if (tbuf) { struct tms tmp; ---- - struct task_struct *tsk = current; ---- - struct task_struct *t; ---- - cputime_t utime, stime, cutime, cstime; ---- - ---- - spin_lock_irq(&tsk->sighand->siglock); ---- - utime = tsk->signal->utime; ---- - stime = tsk->signal->stime; ---- - t = tsk; ---- - do { ---- - utime = cputime_add(utime, t->utime); ---- - stime = cputime_add(stime, t->stime); ---- - t = next_thread(t); ---- - } while (t != tsk); ---- - ---- - cutime = tsk->signal->cutime; ---- - cstime = tsk->signal->cstime; ---- - spin_unlock_irq(&tsk->sighand->siglock); ---- - ---- - tmp.tms_utime = cputime_to_clock_t(utime); ---- - tmp.tms_stime = cputime_to_clock_t(stime); ---- - tmp.tms_cutime = cputime_to_clock_t(cutime); ---- - tmp.tms_cstime = cputime_to_clock_t(cstime); ++++ + ++++ + do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@@@@@@ -1060,9 -1060,9 -1060,7 -1060,9 -1050,9 -1060,7 +1050,7 @@@@@@@ asmlinkage long sys_setsid(void group_leader->signal->leader = 1; __set_special_pids(sid); -- -- spin_lock(&group_leader->sighand->siglock); -- -- group_leader->signal->tty = NULL; -- -- spin_unlock(&group_leader->sighand->siglock); ++ ++ proc_clear_tty(group_leader); err = session; out: @@@@@@@ -1351,8 -1351,8 -1349,10 -1351,8 -1341,8 -1349,10 +1339,10 @@@@@@@ asmlinkage long sys_sethostname(char __ down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { -- -- memcpy(utsname()->nodename, tmp, len); -- -- utsname()->nodename[len] = 0; ++ ++ struct new_utsname *u = utsname(); ++ ++ ++ ++ memcpy(u->nodename, tmp, len); ++ ++ memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } up_write(&uts_sem); @@@@@@@ -1364,15 -1364,15 -1364,17 -1364,15 -1354,15 -1364,17 +1354,17 @@@@@@@ asmlinkage long sys_gethostname(char __user *name, int len) { int i, errno; ++ ++ struct new_utsname *u; if (len < 0) return -EINVAL; down_read(&uts_sem); -- -- i = 1 + strlen(utsname()->nodename); ++ ++ u = utsname(); ++ ++ i = 1 + strlen(u->nodename); if (i > len) i = len; errno = 0; -- -- if (copy_to_user(name, utsname()->nodename, i)) ++ ++ if (copy_to_user(name, u->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@@@@@@ -1397,8 -1397,8 -1399,10 -1397,8 -1387,8 -1399,10 +1389,10 @@@@@@@ asmlinkage long sys_setdomainname(char down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { -- -- memcpy(utsname()->domainname, tmp, len); -- -- utsname()->domainname[len] = 0; ++ ++ struct new_utsname *u = utsname(); ++ ++ ++ ++ memcpy(u->domainname, tmp, len); ++ ++ memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } up_write(&uts_sem); @@@@@@@ -1445,21 -1445,21 -1449,29 -1445,21 -1435,20 -1449,29 +1439,28 @@@@@@@ asmlinkage long sys_old_getrlimit(unsig asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; ---- - unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) return -EINVAL; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; -- -- if (new_rlim.rlim_cur > new_rlim.rlim_max) -- -- return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; -- -- if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) -- -- return -EPERM; ++ ++ ++ ++ if 
(resource == RLIMIT_NOFILE) { ++ ++ if (new_rlim.rlim_max == RLIM_INFINITY) ++ ++ new_rlim.rlim_max = sysctl_nr_open; ++ ++ if (new_rlim.rlim_cur == RLIM_INFINITY) ++ ++ new_rlim.rlim_cur = sysctl_nr_open; ++ ++ if (new_rlim.rlim_max > sysctl_nr_open) ++ ++ return -EPERM; ++ ++ } ++ ++ ++ ++ if (new_rlim.rlim_cur > new_rlim.rlim_max) ++ ++ return -EINVAL; retval = security_task_setrlimit(resource, &new_rlim); if (retval) @@@@@@@ -1491,18 -1491,18 -1503,18 -1491,18 -1480,7 -1503,18 +1492,7 @@@@@@@ if (new_rlim.rlim_cur == RLIM_INFINITY) goto out; ---- - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); ---- - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { ---- - unsigned long rlim_cur = new_rlim.rlim_cur; ---- - cputime_t cputime; ---- - ---- - cputime = secs_to_cputime(rlim_cur); ---- - read_lock(&tasklist_lock); ---- - spin_lock_irq(¤t->sighand->siglock); ---- - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); ---- - spin_unlock_irq(¤t->sighand->siglock); ---- - read_unlock(&tasklist_lock); ---- - } ++++ + update_rlimit_cpu(new_rlim.rlim_cur); out: return 0; } @@@@@@@ -1540,11 -1540,11 -1552,11 -1540,11 -1518,8 -1552,11 +1530,8 @@@@@@@ * */ ---- -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, ---- - cputime_t *utimep, cputime_t *stimep) ++++ +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { ---- - *utimep = cputime_add(*utimep, t->utime); ---- - *stimep = cputime_add(*stimep, t->stime); r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; @@@@@@@ -1558,12 -1558,12 -1570,12 -1558,12 -1533,13 -1570,12 +1545,13 @@@@@@@ static void k_getrusage(struct task_str struct task_struct *t; unsigned long flags; cputime_t utime, stime; ++++ + struct task_cputime cputime; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { ---- - accumulate_thread_rusage(p, r, &utime, &stime); ++++ + accumulate_thread_rusage(p, r); goto out; } @@@@@@@ -1586,8 -1586,8 -1598,8 -1586,8 -1562,9 -1598,8 +1574,9 @@@@@@@ break; case RUSAGE_SELF: ---- - utime = cputime_add(utime, p->signal->utime); ---- - stime = cputime_add(stime, p->signal->stime); ++++ + thread_group_cputime(p, &cputime); ++++ + utime = cputime_add(utime, cputime.utime); ++++ + stime = cputime_add(stime, cputime.stime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@@@@@@ -1596,7 -1596,7 -1608,7 -1596,7 -1573,7 -1608,7 +1585,7 @@@@@@@ r->ru_oublock += p->signal->oublock; t = p; do { ---- - accumulate_thread_rusage(t, r, &utime, &stime); ++++ + accumulate_thread_rusage(t, r); t = next_thread(t); } while (t != p); break; diff --combined kernel/time/ntp.c index 5125ddd8196,5125ddd8196,1ad46f3df6e,ddb0465a6ba,1ad46f3df6e,1ad46f3df6e..1a20715bfd6 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@@@@@@ -10,13 -10,13 -10,13 -10,13 -10,13 -10,13 +10,13 @@@@@@@ #include #include --- --#include #include #include #include #include #include #include +++ ++#include #include /* @@@@@@@ -218,11 -218,11 -218,11 -218,11 -218,11 -218,11 +218,11 @@@@@@@ void second_overflow(void /* Disable the cmos update - used by virtualization and embedded */ int no_sync_cmos_clock __read_mostly; --- --static void sync_cmos_clock(unsigned long dummy); +++ ++static void sync_cmos_clock(struct work_struct *work); --- --static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +++ ++static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); --- --static void 
sync_cmos_clock(unsigned long dummy) +++ ++static void sync_cmos_clock(struct work_struct *work) { struct timespec now, next; int fail = 1; @@@@@@@ -245,7 -245,7 -245,7 -245,7 -245,7 -245,7 +245,7 @@@@@@@ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) fail = update_persistent_clock(now); -- - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; ++ + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; @@@@@@@ -258,13 -258,13 -258,13 -258,13 -258,13 -258,13 +258,13 @@@@@@@ next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } --- -- mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); +++ ++ schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } static void notify_cmos_timer(void) { if (!no_sync_cmos_clock) --- -- mod_timer(&sync_cmos_timer, jiffies + 1); +++ ++ schedule_delayed_work(&sync_cmos_work, 0); } #else @@@@@@@ -277,38 -277,38 -277,38 -277,50 -277,38 -277,38 +277,50 @@@@@@@ static inline void notify_cmos_timer(vo int do_adjtimex(struct timex *txc) { struct timespec ts; --- -- long save_adjust, sec; int result; --- -- /* In order to modify anything, you gotta be super-user! */ --- -- if (txc->modes && !capable(CAP_SYS_TIME)) --- -- return -EPERM; --- -- --- -- /* Now we validate the data before disabling interrupts */ --- -- --- -- if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { +++ ++ /* Validate the data before disabling interrupts */ +++ ++ if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ --- -- if (txc->modes & ~ADJ_OFFSET_SS_READ) +++ ++ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; +++ ++ if (!(txc->modes & ADJ_OFFSET_READONLY) && +++ ++ !capable(CAP_SYS_TIME)) +++ ++ return -EPERM; +++ ++ } else { +++ ++ /* In order to modify anything, you gotta be super-user! */ +++ ++ if (txc->modes && !capable(CAP_SYS_TIME)) +++ ++ return -EPERM; +++ ++ +++ ++ /* if the quartz is off by more than 10% something is VERY wrong! */ +++ ++ if (txc->modes & ADJ_TICK && +++ ++ (txc->tick < 900000/USER_HZ || +++ ++ txc->tick > 1100000/USER_HZ)) +++ ++ return -EINVAL; +++ ++ +++ ++ if (txc->modes & ADJ_STATUS && time_state != TIME_OK) +++ ++ hrtimer_cancel(&leap_timer); } --- -- /* if the quartz is off by more than 10% something is VERY wrong ! 
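For context (illustrative only, not part of the patch): the ADJ_ADJTIME / ADJ_OFFSET_READONLY split being introduced in do_adjtimex() keeps adjtime()-style one-shot corrections apart from the ntp_adjtime() mode bits, and a pure status read still needs no privilege. A userspace sketch of such a read-only query, assuming glibc's adjtimex() wrapper:

/* Illustrative only -- read-only adjtimex() query; modes == 0 sets no
 * ADJ_* bits, changes nothing and needs no CAP_SYS_TIME. */
#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = { .modes = 0 };       /* pure read */
        int state = adjtimex(&tx);

        if (state == -1) {
                perror("adjtimex");
                return 1;
        }
        printf("state %d, offset %ld, freq %ld, status 0x%x\n",
               state, tx.offset, tx.freq, (unsigned int)tx.status);
        return 0;
}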
*/ --- -- if (txc->modes & ADJ_TICK) --- -- if (txc->tick < 900000/USER_HZ || --- -- txc->tick > 1100000/USER_HZ) --- -- return -EINVAL; --- -- --- -- if (time_state != TIME_OK && txc->modes & ADJ_STATUS) --- -- hrtimer_cancel(&leap_timer); getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); --- -- /* Save for later - semantics of adjtime is to return old value */ --- -- save_adjust = time_adjust; --- -- /* If there are input parameters, then process them */ +++ ++ if (txc->modes & ADJ_ADJTIME) { +++ ++ long save_adjust = time_adjust; +++ ++ +++ ++ if (!(txc->modes & ADJ_OFFSET_READONLY)) { +++ ++ /* adjtime() is independent from ntp_adjtime() */ +++ ++ time_adjust = txc->offset; +++ ++ ntp_update_frequency(); +++ ++ } +++ ++ txc->offset = save_adjust; +++ ++ goto adj_done; +++ ++ } if (txc->modes) { +++ ++ long sec; +++ ++ if (txc->modes & ADJ_STATUS) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { @@@@@@@ -375,13 -375,13 -375,13 -387,8 -375,13 -375,13 +387,8 @@@@@@@ if (txc->modes & ADJ_TAI && txc->constant > 0) time_tai = txc->constant; --- -- if (txc->modes & ADJ_OFFSET) { --- -- if (txc->modes == ADJ_OFFSET_SINGLESHOT) --- -- /* adjtime() is independent from ntp_adjtime() */ --- -- time_adjust = txc->offset; --- -- else --- -- ntp_update_offset(txc->offset); --- -- } +++ ++ if (txc->modes & ADJ_OFFSET) +++ ++ ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@@@@@@ -389,22 -389,22 -389,22 -396,18 -389,22 -389,22 +396,18 @@@@@@@ ntp_update_frequency(); } +++ ++ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, +++ ++ NTP_SCALE_SHIFT); +++ ++ if (!(time_status & STA_NANO)) +++ ++ txc->offset /= NSEC_PER_USEC; +++ ++ +++ ++adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; --- -- if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || --- -- (txc->modes == ADJ_OFFSET_SS_READ)) --- -- txc->offset = save_adjust; --- -- else { --- -- txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, --- -- NTP_SCALE_SHIFT); --- -- if (!(time_status & STA_NANO)) --- -- txc->offset /= NSEC_PER_USEC; --- -- } --- -- txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * --- -- (s64)PPM_SCALE_INV, --- -- NTP_SCALE_SHIFT); +++ ++ txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * +++ ++ (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --combined kernel/time/timekeeping.c index 5099c95b8aa,e91c29f961c,e91c29f961c,5ecbfc39a26,e91c29f961c,e91c29f961c..e7acfb482a6 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@@@@@@ -58,26 -58,27 -58,27 -58,27 -58,27 -58,27 +58,26 @@@@@@@ struct clocksource *clock #ifdef CONFIG_GENERIC_TIME /** ----- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook +++++ * clocksource_forward_now - update clock to the current time * ----- * private function, must hold xtime_lock lock when being ----- * called. Returns the number of nanoseconds since the ----- * last call to update_wall_time() (adjusted by NTP scaling) +++++ * Forward the current clock to update its state since the last call to +++++ * update_wall_time(). This is useful before significant clock changes, +++++ * as it avoids having to deal with this time offset explicitly. 
*/ -----static inline s64 __get_nsec_offset(void) +++++static void clocksource_forward_now(void) { cycle_t cycle_now, cycle_delta; ----- s64 ns_offset; +++++ s64 nsec; ----- /* read clocksource: */ cycle_now = clocksource_read(clock); ----- ----- /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +++++ clock->cycle_last = cycle_now; ----- /* convert to nanoseconds: */ ----- ns_offset = cyc2ns(clock, cycle_delta); +++++ nsec = cyc2ns(clock, cycle_delta); +++++ timespec_add_ns(&xtime, nsec); ----- return ns_offset; +++++ nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; +++++ clock->raw_time.tv_nsec += nsec; } /** @@@@@@@ -88,7 -89,6 -89,6 -89,6 -89,6 -89,6 +88,7 @@@@@@@ */ void getnstimeofday(struct timespec *ts) { +++++ cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@@@@@@ -96,15 -96,7 -96,7 -96,7 -96,7 -96,7 +96,15 @@@@@@@ seq = read_seqbegin(&xtime_lock); *ts = xtime; ----- nsecs = __get_nsec_offset(); +++++ +++++ /* read clocksource: */ +++++ cycle_now = clocksource_read(clock); +++++ +++++ /* calculate the delta since the last update_wall_time: */ +++++ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +++++ +++++ /* convert to nanoseconds: */ +++++ nsecs = cyc2ns(clock, cycle_delta); } while (read_seqretry(&xtime_lock, seq)); @@@@@@@ -137,22 -129,22 -129,22 -129,22 -129,22 -129,22 +137,22 @@@@@@@ EXPORT_SYMBOL(do_gettimeofday) */ int do_settimeofday(struct timespec *tv) { +++++ struct timespec ts_delta; unsigned long flags; ----- time_t wtm_sec, sec = tv->tv_sec; ----- long wtm_nsec, nsec = tv->tv_nsec; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); ----- nsec -= __get_nsec_offset(); +++++ clocksource_forward_now(); +++++ +++++ ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; +++++ ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; +++++ wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); ----- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); ----- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); +++++ xtime = *tv; ----- set_normalized_timespec(&xtime, sec, nsec); ----- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); update_xtime_cache(0); clock->error = 0; @@@@@@@ -178,19 -170,22 -170,22 -170,22 -170,22 -170,22 +178,19 @@@@@@@ EXPORT_SYMBOL(do_settimeofday) static void change_clocksource(void) { struct clocksource *new; ----- cycle_t now; ----- u64 nsec; new = clocksource_get_next(); if (clock == new) return; ----- new->cycle_last = 0; ----- now = clocksource_read(new); ----- nsec = __get_nsec_offset(); ----- timespec_add_ns(&xtime, nsec); +++++ clocksource_forward_now(); ----- clock = new; ----- clock->cycle_last = now; +++++ new->raw_time = clock->raw_time; +++++ clock = new; +++++ clock->cycle_last = 0; +++++ clock->cycle_last = clocksource_read(new); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@@@@@@ -205,43 -200,10 -200,10 -200,10 -200,10 -200,10 +205,43 @@@@@@@ */ } #else +++++static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -----static inline s64 __get_nsec_offset(void) { return 0; } #endif +++++/** +++++ * getrawmonotonic - Returns the raw monotonic time in a timespec +++++ * @ts: pointer to the timespec to be set +++++ * +++++ * Returns the raw monotonic time (completely un-modified by ntp) +++++ */ +++++void getrawmonotonic(struct timespec *ts) +++++{ +++++ 
unsigned long seq; +++++ s64 nsecs; +++++ cycle_t cycle_now, cycle_delta; +++++ +++++ do { +++++ seq = read_seqbegin(&xtime_lock); +++++ +++++ /* read clocksource: */ +++++ cycle_now = clocksource_read(clock); +++++ +++++ /* calculate the delta since the last update_wall_time: */ +++++ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +++++ +++++ /* convert to nanoseconds: */ +++++ nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; +++++ +++++ *ts = clock->raw_time; +++++ +++++ } while (read_seqretry(&xtime_lock, seq)); +++++ +++++ timespec_add_ns(ts, nsecs); +++++} +++++EXPORT_SYMBOL(getrawmonotonic); +++++ +++++ /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ @@@@@@@ -303,6 -265,8 -265,8 -265,8 -265,8 -265,8 +303,6 @@@@@@@ void __init timekeeping_init(void static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -----/* xtime offset when we went into suspend */ -----static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@@@@@@ -328,6 -292,8 -292,8 -292,8 -292,8 -292,8 +328,6 @@@@@@@ static int timekeeping_resume(struct sy wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } ----- /* Make sure that we have the correct xtime reference */ ----- timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = 0; @@@@@@@ -353,7 -319,8 -319,8 -319,8 -319,8 -319,8 +353,7 @@@@@@@ static int timekeeping_suspend(struct s timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); ----- /* Get the current xtime offset */ ----- timekeeping_suspend_nsecs = __get_nsec_offset(); +++++ clocksource_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@@@@@@ -487,29 -454,23 -454,23 -454,23 -454,23 -454,23 +487,29 @@@@@@@ void update_wall_time(void #else offset = clock->cycle_interval; #endif --- -- clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; +++ ++ clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ while (offset >= clock->cycle_interval) { /* accumulate one interval */ ----- clock->xtime_nsec += clock->xtime_interval; ----- clock->cycle_last += clock->cycle_interval; offset -= clock->cycle_interval; +++++ clock->cycle_last += clock->cycle_interval; +++++ clock->xtime_nsec += clock->xtime_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } +++++ clock->raw_time.tv_nsec += clock->raw_interval; +++++ if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { +++++ clock->raw_time.tv_nsec -= NSEC_PER_SEC; +++++ clock->raw_time.tv_sec++; +++++ } +++++ /* accumulate error between NTP and clock interval */ clock->error += tick_length; clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); @@@@@@@ -518,9 -479,9 -479,9 -479,12 -479,9 -479,9 +518,12 @@@@@@@ /* correct the clock when NTP error is too big */ clocksource_adjust(offset); --- -- /* store full nanoseconds into xtime */ --- -- xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; +++ ++ /* store full nanoseconds into xtime after rounding it up and +++ ++ * add the remainder to the error difference. 
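For context (illustrative only, not part of the patch): the raw_time/raw_interval accumulation added to update_wall_time() above is the bookkeeping behind the new getrawmonotonic(), which this release exposes to userspace as CLOCK_MONOTONIC_RAW. A sketch of reading it, assuming a kernel and libc that already know that clock id:

/* Illustrative only -- compare the NTP-adjusted monotonic clock with the
 * raw, unadjusted one backed by getrawmonotonic(). */
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec mono, raw;

        if (clock_gettime(CLOCK_MONOTONIC, &mono) ||
            clock_gettime(CLOCK_MONOTONIC_RAW, &raw)) {
                perror("clock_gettime");
                return 1;
        }
        printf("CLOCK_MONOTONIC     %ld.%09ld\n", (long)mono.tv_sec, mono.tv_nsec);
        printf("CLOCK_MONOTONIC_RAW %ld.%09ld\n", (long)raw.tv_sec, raw.tv_nsec);
        return 0;
}

Unlike CLOCK_MONOTONIC, the raw clock is never slewed by NTP, so over time the two readings drift apart at the oscillator's error rate.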
+++ ++ */ +++ ++ xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; +++ ++ clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); update_xtime_cache(cyc2ns(clock, offset)); diff --combined kernel/timer.c index 03bc7f1f159,e8019cc3418,510fe69351c,03bc7f1f159,03bc7f1f159,510fe69351c..56becf373c5 --- a/kernel/timer.c +++ b/kernel/timer.c @@@@@@@ -978,6 -978,6 -978,7 -978,6 -978,6 -978,7 +978,7 @@@@@@@ void update_process_times(int user_tick run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); ++ ++ printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); } @@@@@@@ -1435,9 -1435,11 -1436,9 -1435,9 -1435,9 -1436,9 +1436,11 @@@@@@@ static void __cpuinit migrate_timers(in BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - ---- - ---- local_irq_disable(); - ---- spin_lock(&new_base->lock); + ++++ /* + ++++ * The caller is globally serialized and nobody else + ++++ * takes two locks at once, deadlock is not possible. + ++++ */ + ++++ spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@@@@@@ -1452,8 -1454,7 -1453,8 -1452,8 -1452,8 -1453,8 +1455,7 @@@@@@@ } spin_unlock(&old_base->lock); - ---- spin_unlock(&new_base->lock); - ---- local_irq_enable(); + ++++ spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --combined security/selinux/hooks.c index 03fc6a81ae3,03fc6a81ae3,576e5119907,03fc6a81ae3,69649783c26,576e5119907..3e3fde7c1d2 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@@@@@@ -75,6 -75,6 -75,6 -75,6 -75,7 -75,6 +75,7 @@@@@@@ #include #include #include ++++ +#include #include "avc.h" #include "objsec.h" @@@@@@@ -291,6 -291,6 -291,7 -291,6 -292,6 -291,7 +292,7 @@@@@@@ static void sk_free_security(struct soc struct sk_security_struct *ssec = sk->sk_security; sk->sk_security = NULL; ++ ++ selinux_netlbl_sk_security_free(ssec); kfree(ssec); } @@@@@@@ -324,7 -324,7 -325,7 -324,7 -325,7 -325,7 +326,7 @@@@@@@ enum Opt_rootcontext = 4, }; -- -- static match_table_t tokens = { ++ ++ static const match_table_t tokens = { {Opt_context, CONTEXT_STR "%s"}, {Opt_fscontext, FSCONTEXT_STR "%s"}, {Opt_defcontext, DEFCONTEXT_STR "%s"}, @@@@@@@ -957,7 -957,7 -958,8 -957,7 -958,7 -958,8 +959,8 @@@@@@@ out_err return rc; } -- -- void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts) ++ ++ static void selinux_write_opts(struct seq_file *m, ++ ++ struct security_mnt_opts *opts) { int i; char *prefix; @@@@@@@ -1290,7 -1290,7 -1292,7 -1290,7 -1291,7 -1292,7 +1293,7 @@@@@@@ static int inode_doinit_with_dentry(str /* Default to the fs superblock SID. */ isec->sid = sbsec->sid; -- -- if (sbsec->proc) { ++ ++ if (sbsec->proc && !S_ISLNK(inode->i_mode)) { struct proc_inode *proci = PROC_I(inode); if (proci->pde) { isec->sclass = inode_mode_to_security_class(inode->i_mode); @@@@@@@ -2120,7 -2120,7 -2122,6 -2120,7 -2121,7 -2122,6 +2123,6 @@@@@@@ static inline void flush_unauthorized_f long j = -1; int drop_tty = 0; -- -- mutex_lock(&tty_mutex); tty = get_current_tty(); if (tty) { file_list_lock(); @@@@@@@ -2138,8 -2138,8 -2139,8 -2138,8 -2139,8 -2139,8 +2140,8 @@@@@@@ } } file_list_unlock(); ++ ++ tty_kref_put(tty); } -- -- mutex_unlock(&tty_mutex); /* Reset controlling tty. 
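Aside on the migrate_timers() hunk above (illustrative userspace analogue only; all names are hypothetical): replacing local_irq_disable() plus spin_lock() with spin_lock_irq() and spin_lock_nested() leans on the stated guarantee that hotplug is globally serialized, so taking the two base locks in a fixed order while splicing pending work across cannot deadlock. The same shape in plain pthreads:

/* Illustrative only -- splice one locked queue into another while holding
 * both locks in a fixed order, as migrate_timers() does for per-CPU bases. */
#include <pthread.h>
#include <stdio.h>

struct base {
        pthread_mutex_t lock;
        int pending;                    /* stand-in for the timer lists */
};

static struct base dead_base = { PTHREAD_MUTEX_INITIALIZER, 3 };
static struct base live_base = { PTHREAD_MUTEX_INITIALIZER, 1 };

static void migrate(struct base *dead, struct base *live)
{
        pthread_mutex_lock(&live->lock);        /* fixed order: live, then dead */
        pthread_mutex_lock(&dead->lock);
        live->pending += dead->pending;         /* "splice" the work across */
        dead->pending = 0;
        pthread_mutex_unlock(&dead->lock);
        pthread_mutex_unlock(&live->lock);
}

int main(void)
{
        migrate(&dead_base, &live_base);
        printf("live base now has %d pending items\n", live_base.pending);
        return 0;
}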
*/ if (drop_tty) no_tty(); @@@@@@@ -2321,13 -2321,13 -2322,13 -2321,13 -2322,7 -2322,13 +2323,7 @@@@@@@ static void selinux_bprm_post_apply_cre initrlim = init_task.signal->rlim+i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } ---- - if (current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { ---- - /* ---- - * This will cause RLIMIT_CPU calculations ---- - * to be refigured. ---- - */ ---- - current->it_prof_expires = jiffies_to_cputime(1); ---- - } ++++ + update_rlimit_cpu(rlim->rlim_cur); } /* Wake up the parent if it is waiting so that it can @@@@@@@ -3548,38 -3548,38 -3549,44 -3548,38 -3543,38 -3549,44 +3544,44 @@@@@@@ out #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, -- -- char **addrp, int src, u8 *proto) ++ ++ char **_addrp, int src, u8 *proto) { -- -- int ret = 0; ++ ++ char *addrp; ++ ++ int ret; switch (ad->u.net.family) { case PF_INET: ret = selinux_parse_skb_ipv4(skb, ad, proto); -- -- if (ret || !addrp) -- -- break; -- -- *addrp = (char *)(src ? &ad->u.net.v4info.saddr : -- -- &ad->u.net.v4info.daddr); -- -- break; ++ ++ if (ret) ++ ++ goto parse_error; ++ ++ addrp = (char *)(src ? &ad->u.net.v4info.saddr : ++ ++ &ad->u.net.v4info.daddr); ++ ++ goto okay; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: ret = selinux_parse_skb_ipv6(skb, ad, proto); -- -- if (ret || !addrp) -- -- break; -- -- *addrp = (char *)(src ? &ad->u.net.v6info.saddr : -- -- &ad->u.net.v6info.daddr); -- -- break; ++ ++ if (ret) ++ ++ goto parse_error; ++ ++ addrp = (char *)(src ? &ad->u.net.v6info.saddr : ++ ++ &ad->u.net.v6info.daddr); ++ ++ goto okay; #endif /* IPV6 */ default: -- -- break; ++ ++ addrp = NULL; ++ ++ goto okay; } -- -- if (unlikely(ret)) -- -- printk(KERN_WARNING -- -- "SELinux: failure in selinux_parse_skb()," -- -- " unable to parse packet\n"); -- -- ++ ++ parse_error: ++ ++ printk(KERN_WARNING ++ ++ "SELinux: failure in selinux_parse_skb()," ++ ++ " unable to parse packet\n"); return ret; ++ ++ ++ ++ okay: ++ ++ if (_addrp) ++ ++ *_addrp = addrp; ++ ++ return 0; } /** @@@@@@@ -3794,6 -3794,6 -3801,7 -3794,6 -3789,6 -3801,7 +3796,7 @@@@@@@ out static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { ++ ++ struct sock *sk = sock->sk; struct inode_security_struct *isec; int err; @@@@@@@ -3807,7 -3807,7 -3815,6 -3807,7 -3802,7 -3815,6 +3810,6 @@@@@@@ isec = SOCK_INODE(sock)->i_security; if (isec->sclass == SECCLASS_TCP_SOCKET || isec->sclass == SECCLASS_DCCP_SOCKET) { -- -- struct sock *sk = sock->sk; struct avc_audit_data ad; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; @@@@@@@ -3841,6 -3841,6 -3848,8 -3841,6 -3836,6 -3848,8 +3843,8 @@@@@@@ goto out; } ++ ++ err = selinux_netlbl_socket_connect(sk, address); ++ ++ out: return err; } @@@@@@@ -4070,20 -4070,20 -4079,28 -4070,20 -4065,20 -4079,28 +4074,28 @@@@@@@ static int selinux_sock_rcv_skb_iptable } static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, -- -- struct avc_audit_data *ad, -- -- u16 family, char *addrp) ++ ++ u16 family) { int err; struct sk_security_struct *sksec = sk->sk_security; u32 peer_sid; u32 sk_sid = sksec->sid; ++ ++ struct avc_audit_data ad; ++ ++ char *addrp; ++ ++ ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ++ ++ ad.u.net.netif = skb->iif; ++ ++ ad.u.net.family = family; ++ ++ err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); ++ ++ if (err) ++ ++ return err; if (selinux_compat_net) -- -- err = selinux_sock_rcv_skb_iptables_compat(sk, skb, 
ad, ++ ++ err = selinux_sock_rcv_skb_iptables_compat(sk, skb, &ad, family, addrp); else err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, -- -- PACKET__RECV, ad); ++ ++ PACKET__RECV, &ad); if (err) return err; @@@@@@@ -4092,12 -4092,12 -4109,14 -4092,12 -4087,12 -4109,14 +4104,14 @@@@@@@ if (err) return err; err = avc_has_perm(sk_sid, peer_sid, -- -- SECCLASS_PEER, PEER__RECV, ad); ++ ++ SECCLASS_PEER, PEER__RECV, &ad); ++ ++ if (err) ++ ++ selinux_netlbl_err(skb, err, 0); } else { -- -- err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad); ++ ++ err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad); if (err) return err; -- -- err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad); ++ ++ err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); } return err; @@@@@@@ -4111,6 -4111,6 -4130,8 -4111,6 -4106,6 -4130,8 +4125,8 @@@@@@@ static int selinux_socket_sock_rcv_skb( u32 sk_sid = sksec->sid; struct avc_audit_data ad; char *addrp; ++ ++ u8 secmark_active; ++ ++ u8 peerlbl_active; if (family != PF_INET && family != PF_INET6) return 0; @@@@@@@ -4119,6 -4119,6 -4140,18 -4119,6 -4114,6 -4140,18 +4135,18 @@@@@@@ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; ++ ++ /* If any sort of compatibility mode is enabled then handoff processing ++ ++ * to the selinux_sock_rcv_skb_compat() function to deal with the ++ ++ * special handling. We do this in an attempt to keep this function ++ ++ * as fast and as clean as possible. */ ++ ++ if (selinux_compat_net || !selinux_policycap_netpeer) ++ ++ return selinux_sock_rcv_skb_compat(sk, skb, family); ++ ++ ++ ++ secmark_active = selinux_secmark_enabled(); ++ ++ peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); ++ ++ if (!secmark_active && !peerlbl_active) ++ ++ return 0; ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = skb->iif; ad.u.net.family = family; @@@@@@@ -4126,15 -4126,15 -4159,7 -4126,15 -4121,15 -4159,7 +4154,7 @@@@@@@ if (err) return err; -- -- /* If any sort of compatibility mode is enabled then handoff processing -- -- * to the selinux_sock_rcv_skb_compat() function to deal with the -- -- * special handling. We do this in an attempt to keep this function -- -- * as fast and as clean as possible. 
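Aside (illustrative only; the parser below is hypothetical): the selinux_parse_skb() rework above replaces the shared fall-through return with explicit parse_error/okay labels, so the warning fires only on a genuine parse failure and the address pointer is written exactly once on success. The same goto-label shape as a standalone C program:

/* Illustrative only -- explicit parse_error/okay labels instead of a shared
 * fall-through return code, mirroring the selinux_parse_skb() restructuring. */
#include <stdio.h>
#include <string.h>

static int parse_addr(const char *pkt, const char **addrp)
{
        const char *addr;

        if (strncmp(pkt, "v4:", 3) == 0 || strncmp(pkt, "v6:", 3) == 0) {
                if (pkt[3] == '\0')
                        goto parse_error;       /* family tag but no address */
                addr = pkt + 3;
                goto okay;
        }
        addr = NULL;            /* unknown family: nothing to report, not an error */
        goto okay;

parse_error:
        fprintf(stderr, "unable to parse packet\n");
        return -1;

okay:
        if (addrp)
                *addrp = addr;
        return 0;
}

int main(void)
{
        const char *addr;

        if (parse_addr("v4:192.0.2.1", &addr) == 0 && addr)
                printf("parsed address %s\n", addr);
        parse_addr("v6:", NULL);        /* exercises the parse_error path */
        return 0;
}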
*/ -- -- if (selinux_compat_net || !selinux_policycap_netpeer) -- -- return selinux_sock_rcv_skb_compat(sk, skb, &ad, -- -- family, addrp); -- -- -- -- if (netlbl_enabled() || selinux_xfrm_enabled()) { ++ ++ if (peerlbl_active) { u32 peer_sid; err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); @@@@@@@ -4142,13 -4142,13 -4167,17 -4142,13 -4137,13 -4167,17 +4162,17 @@@@@@@ return err; err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family, peer_sid, &ad); -- -- if (err) ++ ++ if (err) { ++ ++ selinux_netlbl_err(skb, err, 0); return err; ++ ++ } err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, PEER__RECV, &ad); ++ ++ if (err) ++ ++ selinux_netlbl_err(skb, err, 0); } -- -- if (selinux_secmark_enabled()) { ++ ++ if (secmark_active) { err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) @@@@@@@ -4207,10 -4207,10 -4236,12 -4207,10 -4202,10 -4236,12 +4231,12 @@@@@@@ static int selinux_socket_getpeersec_dg u32 peer_secid = SECSID_NULL; u16 family; -- -- if (sock) ++ ++ if (skb && skb->protocol == htons(ETH_P_IP)) ++ ++ family = PF_INET; ++ ++ else if (skb && skb->protocol == htons(ETH_P_IPV6)) ++ ++ family = PF_INET6; ++ ++ else if (sock) family = sock->sk->sk_family; -- -- else if (skb && skb->sk) -- -- family = skb->sk->sk_family; else goto out; @@@@@@@ -4268,8 -4268,8 -4299,6 -4268,8 -4263,8 -4299,6 +4294,6 @@@@@@@ static void selinux_sock_graft(struct s sk->sk_family == PF_UNIX) isec->sid = sksec->sid; sksec->sclass = isec->sclass; -- -- -- -- selinux_netlbl_sock_graft(sk, parent); } static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, @@@@@@@ -4277,10 -4277,10 -4306,15 -4277,10 -4272,10 -4306,15 +4301,15 @@@@@@@ { struct sk_security_struct *sksec = sk->sk_security; int err; ++ ++ u16 family = sk->sk_family; u32 newsid; u32 peersid; -- -- err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid); ++ ++ /* handle mapped IPv4 packets arriving via IPv6 sockets */ ++ ++ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) ++ ++ family = PF_INET; ++ ++ ++ ++ err = selinux_skb_peerlbl_sid(skb, family, &peersid); if (err) return err; if (peersid == SECSID_NULL) { @@@@@@@ -4315,12 -4315,12 -4349,18 -4315,12 -4310,12 -4349,18 +4344,18 @@@@@@@ static void selinux_inet_csk_clone(stru selinux_netlbl_sk_security_reset(newsksec, req->rsk_ops->family); } -- -- static void selinux_inet_conn_established(struct sock *sk, -- -- struct sk_buff *skb) ++ ++ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) { ++ ++ u16 family = sk->sk_family; struct sk_security_struct *sksec = sk->sk_security; -- -- selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid); ++ ++ /* handle mapped IPv4 packets arriving via IPv6 sockets */ ++ ++ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) ++ ++ family = PF_INET; ++ ++ ++ ++ selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); ++ ++ ++ ++ selinux_netlbl_inet_conn_established(sk, family); } static void selinux_req_classify_flow(const struct request_sock *req, @@@@@@@ -4370,39 -4370,39 -4410,54 -4370,39 -4365,39 -4410,54 +4405,54 @@@@@@@ out static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex, u16 family) { ++ ++ int err; char *addrp; u32 peer_sid; struct avc_audit_data ad; u8 secmark_active; ++ ++ u8 netlbl_active; u8 peerlbl_active; if (!selinux_policycap_netpeer) return NF_ACCEPT; secmark_active = selinux_secmark_enabled(); -- -- peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); ++ ++ netlbl_active = netlbl_enabled(); ++ ++ 
peerlbl_active = netlbl_active || selinux_xfrm_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; ++ ++ if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) ++ ++ return NF_DROP; ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = ifindex; ad.u.net.family = family; if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) return NF_DROP; -- -- if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) -- -- return NF_DROP; -- -- -- -- if (peerlbl_active) -- -- if (selinux_inet_sys_rcv_skb(ifindex, addrp, family, -- -- peer_sid, &ad) != 0) ++ ++ if (peerlbl_active) { ++ ++ err = selinux_inet_sys_rcv_skb(ifindex, addrp, family, ++ ++ peer_sid, &ad); ++ ++ if (err) { ++ ++ selinux_netlbl_err(skb, err, 1); return NF_DROP; ++ ++ } ++ ++ } if (secmark_active) if (avc_has_perm(peer_sid, skb->secmark, SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) return NF_DROP; ++ ++ if (netlbl_active) ++ ++ /* we do this in the FORWARD path and not the POST_ROUTING ++ ++ * path because we want to make sure we apply the necessary ++ ++ * labeling before IPsec is applied so we can leverage AH ++ ++ * protection */ ++ ++ if (selinux_netlbl_skbuff_setsid(skb, family, peer_sid) != 0) ++ ++ return NF_DROP; ++ ++ return NF_ACCEPT; } @@@@@@@ -4426,6 -4426,6 -4481,37 -4426,6 -4421,6 -4481,37 +4476,37 @@@@@@@ static unsigned int selinux_ipv6_forwar } #endif /* IPV6 */ ++ ++ static unsigned int selinux_ip_output(struct sk_buff *skb, ++ ++ u16 family) ++ ++ { ++ ++ u32 sid; ++ ++ ++ ++ if (!netlbl_enabled()) ++ ++ return NF_ACCEPT; ++ ++ ++ ++ /* we do this in the LOCAL_OUT path and not the POST_ROUTING path ++ ++ * because we want to make sure we apply the necessary labeling ++ ++ * before IPsec is applied so we can leverage AH protection */ ++ ++ if (skb->sk) { ++ ++ struct sk_security_struct *sksec = skb->sk->sk_security; ++ ++ sid = sksec->sid; ++ ++ } else ++ ++ sid = SECINITSID_KERNEL; ++ ++ if (selinux_netlbl_skbuff_setsid(skb, family, sid) != 0) ++ ++ return NF_DROP; ++ ++ ++ ++ return NF_ACCEPT; ++ ++ } ++ ++ ++ ++ static unsigned int selinux_ipv4_output(unsigned int hooknum, ++ ++ struct sk_buff *skb, ++ ++ const struct net_device *in, ++ ++ const struct net_device *out, ++ ++ int (*okfn)(struct sk_buff *)) ++ ++ { ++ ++ return selinux_ip_output(skb, PF_INET); ++ ++ } ++ ++ static int selinux_ip_postroute_iptables_compat(struct sock *sk, int ifindex, struct avc_audit_data *ad, @@@@@@@ -4493,30 -4493,30 -4579,36 -4493,30 -4488,30 -4579,36 +4574,36 @@@@@@@ static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, int ifindex, -- -- struct avc_audit_data *ad, -- -- u16 family, -- -- char *addrp, -- -- u8 proto) ++ ++ u16 family) { struct sock *sk = skb->sk; struct sk_security_struct *sksec; ++ ++ struct avc_audit_data ad; ++ ++ char *addrp; ++ ++ u8 proto; if (sk == NULL) return NF_ACCEPT; sksec = sk->sk_security; ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ++ ++ ad.u.net.netif = ifindex; ++ ++ ad.u.net.family = family; ++ ++ if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) ++ ++ return NF_DROP; ++ ++ if (selinux_compat_net) { if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex, -- -- ad, family, addrp)) ++ ++ &ad, family, addrp)) return NF_DROP; } else { if (avc_has_perm(sksec->sid, skb->secmark, -- -- SECCLASS_PACKET, PACKET__SEND, ad)) ++ ++ SECCLASS_PACKET, PACKET__SEND, &ad)) return NF_DROP; } if (selinux_policycap_netpeer) -- -- if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto)) ++ ++ if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto)) return NF_DROP; return 
NF_ACCEPT; @@@@@@@ -4530,23 -4530,23 -4622,15 -4530,23 -4525,23 -4622,15 +4617,15 @@@@@@@ static unsigned int selinux_ip_postrout struct sock *sk; struct avc_audit_data ad; char *addrp; -- -- u8 proto; u8 secmark_active; u8 peerlbl_active; -- -- AVC_AUDIT_DATA_INIT(&ad, NET); -- -- ad.u.net.netif = ifindex; -- -- ad.u.net.family = family; -- -- if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) -- -- return NF_DROP; -- -- /* If any sort of compatibility mode is enabled then handoff processing * to the selinux_ip_postroute_compat() function to deal with the * special handling. We do this in an attempt to keep this function * as fast and as clean as possible. */ if (selinux_compat_net || !selinux_policycap_netpeer) -- -- return selinux_ip_postroute_compat(skb, ifindex, &ad, -- -- family, addrp, proto); ++ ++ return selinux_ip_postroute_compat(skb, ifindex, family); /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec * packet transformation so allow the packet to pass without any checks @@@@@@@ -4562,21 -4562,21 -4646,45 -4562,21 -4557,21 -4646,45 +4641,45 @@@@@@@ if (!secmark_active && !peerlbl_active) return NF_ACCEPT; -- -- /* if the packet is locally generated (skb->sk != NULL) then use the -- -- * socket's label as the peer label, otherwise the packet is being -- -- * forwarded through this system and we need to fetch the peer label -- -- * directly from the packet */ ++ ++ /* if the packet is being forwarded then get the peer label from the ++ ++ * packet itself; otherwise check to see if it is from a local ++ ++ * application or the kernel, if from an application get the peer label ++ ++ * from the sending socket, otherwise use the kernel's sid */ sk = skb->sk; -- -- if (sk) { ++ ++ if (sk == NULL) { ++ ++ switch (family) { ++ ++ case PF_INET: ++ ++ if (IPCB(skb)->flags & IPSKB_FORWARDED) ++ ++ secmark_perm = PACKET__FORWARD_OUT; ++ ++ else ++ ++ secmark_perm = PACKET__SEND; ++ ++ break; ++ ++ case PF_INET6: ++ ++ if (IP6CB(skb)->flags & IP6SKB_FORWARDED) ++ ++ secmark_perm = PACKET__FORWARD_OUT; ++ ++ else ++ ++ secmark_perm = PACKET__SEND; ++ ++ break; ++ ++ default: ++ ++ return NF_DROP; ++ ++ } ++ ++ if (secmark_perm == PACKET__FORWARD_OUT) { ++ ++ if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) ++ ++ return NF_DROP; ++ ++ } else ++ ++ peer_sid = SECINITSID_KERNEL; ++ ++ } else { struct sk_security_struct *sksec = sk->sk_security; peer_sid = sksec->sid; secmark_perm = PACKET__SEND; -- -- } else { -- -- if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) -- -- return NF_DROP; -- -- secmark_perm = PACKET__FORWARD_OUT; } ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ++ ++ ad.u.net.netif = ifindex; ++ ++ ad.u.net.family = family; ++ ++ if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL)) ++ ++ return NF_DROP; ++ ++ if (secmark_active) if (avc_has_perm(peer_sid, skb->secmark, SECCLASS_PACKET, secmark_perm, &ad)) @@@@@@@ -5219,8 -5219,8 -5327,12 -5219,8 -5214,8 -5327,12 +5322,12 @@@@@@@ static int selinux_setprocattr(struct t if (sid == 0) return -EINVAL; -- -- -- -- /* Only allow single threaded processes to change context */ ++ ++ /* ++ ++ * SELinux allows to change context in the following case only. ++ ++ * - Single threaded processes. ++ ++ * - Multi threaded processes intend to change its context into ++ ++ * more restricted domain (defined by TYPEBOUNDS statement). 
++ ++ */ if (atomic_read(&p->mm->mm_users) != 1) { struct task_struct *g, *t; struct mm_struct *mm = p->mm; @@@@@@@ -5228,11 -5228,11 -5340,16 -5228,11 -5223,11 -5340,16 +5335,16 @@@@@@@ do_each_thread(g, t) { if (t->mm == mm && t != p) { read_unlock(&tasklist_lock); -- -- return -EPERM; ++ ++ error = security_bounded_transition(tsec->sid, sid); ++ ++ if (!error) ++ ++ goto boundary_ok; ++ ++ ++ ++ return error; } } while_each_thread(g, t); read_unlock(&tasklist_lock); } ++ ++ boundary_ok: /* Check permissions for the transition. */ error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, @@@@@@@ -5641,6 -5641,6 -5758,13 -5641,6 -5636,6 -5758,13 +5753,13 @@@@@@@ static struct nf_hook_ops selinux_ipv4_ .pf = PF_INET, .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_SELINUX_FIRST, ++ ++ }, ++ ++ { ++ ++ .hook = selinux_ipv4_output, ++ ++ .owner = THIS_MODULE, ++ ++ .pf = PF_INET, ++ ++ .hooknum = NF_INET_LOCAL_OUT, ++ ++ .priority = NF_IP_PRI_SELINUX_FIRST, } };
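For context (illustrative sketch only, assuming the 2.6.28-era nf_register_hooks() API rather than SELinux's exact init path; the init function name is hypothetical): an nf_hook_ops array like selinux_ipv4_ops, just extended above with the NF_INET_LOCAL_OUT entry, is registered once at boot, and a mandatory access control layer typically fails hard if its hooks cannot be installed:

/* Illustrative only -- registering the hook array from the hunk above. */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>

static int __init example_nf_ip_init(void)
{
        int err;

        err = nf_register_hooks(selinux_ipv4_ops, ARRAY_SIZE(selinux_ipv4_ops));
        if (err)
                panic("example: nf_register_hooks failed (%d)\n", err);
        return 0;
}
__initcall(example_nf_ip_init);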