From: Linus Torvalds
Date: Thu, 23 Oct 2008 16:37:16 +0000 (-0700)
Subject: Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
X-Git-Tag: v2.6.28-rc1~43
X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=133e887f90208d339088dd60cb1d08a72ba27288;hp=-c;p=linux-2.6-omap-h63xx.git

Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: disable the hrtick for now
  sched: revert back to per-rq vruntime
  sched: fair scheduler should not resched rt tasks
  sched: optimize group load balancer
  sched: minor fast-path overhead reduction
  sched: fix the wrong mask_len, cleanup
  sched: kill unused scheduler decl.
  sched: fix the wrong mask_len
  sched: only update rq->clock while holding rq->lock
---

133e887f90208d339088dd60cb1d08a72ba27288
diff --combined include/linux/sched.h
index 5c38db536e0,4f59c8e8597..10bff55b082
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -287,7 -287,6 +287,6 @@@ extern void trap_init(void)
extern void account_process_tick(struct task_struct *task, int user);
extern void update_process_times(int user);
extern void scheduler_tick(void);
- extern void hrtick_resched(void);
extern void sched_show_task(struct task_struct *p);
@@@ -403,21 -402,12 +402,21 @@@ extern int get_dumpable(struct mm_struc
#define MMF_DUMP_MAPPED_PRIVATE 4
#define MMF_DUMP_MAPPED_SHARED 5
#define MMF_DUMP_ELF_HEADERS 6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED 8
#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS 5
+#define MMF_DUMP_FILTER_BITS 7
#define MMF_DUMP_FILTER_MASK \
	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
#define MMF_DUMP_FILTER_DEFAULT \
-	((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
+	((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF 0
+#endif
struct sighand_struct {
	atomic_t count;
@@@ -434,39 -424,6 +433,39 @@@ struct pacct_struct
	unsigned long ac_minflt, ac_majflt;
};
+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime: time spent in user mode, in &cputime_t units
+ * @stime: time spent in kernel mode, in &cputime_t units
+ * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
+ *
+ * This structure groups together three kinds of CPU time that are
+ * tracked for threads and thread groups.  Most things considering
+ * CPU time want to group these counts together and treat all three
+ * of them in parallel.
+ */
+struct task_cputime {
+	cputime_t utime;
+	cputime_t stime;
+	unsigned long long sum_exec_runtime;
+};
+/* Alternate field names when used to cache expirations. */
+#define prof_exp stime
+#define virt_exp utime
+#define sched_exp sum_exec_runtime
+
+/**
+ * struct thread_group_cputime - thread group interval timer counts
+ * @totals: thread group interval timers; substructure for
+ *          uniprocessor kernel, per-cpu for SMP kernel.
+ *
+ * This structure contains the version of task_cputime, above, that is
+ * used for thread group CPU clock calculations.
+ */
+struct thread_group_cputime {
+	struct task_cputime *totals;
+};
+
/*
 * NOTE! "signal_struct" does not have it's own
 * locking, because a shared signal_struct always
@@@ -512,17 -469,6 +511,17 @@@ struct signal_struct
	cputime_t it_prof_expires, it_virt_expires;
	cputime_t it_prof_incr, it_virt_incr;
+	/*
+	 * Thread group totals for process CPU clocks.
+	 * See thread_group_cputime(), et al, for details.
+	 */
+	struct thread_group_cputime cputime;
+
+	/* Earliest-expiration cache. */
+	struct task_cputime cputime_expires;
+
+	struct list_head cpu_timers[3];
+
	/* job control IDs */
	/*
@@@ -553,7 -499,7 +552,7 @@@
	 * Live threads maintain their own counters and add to these
	 * in __exit_signal, except for the group leader.
	 */
-	cputime_t utime, stime, cutime, cstime;
+	cputime_t cutime, cstime;
	cputime_t gtime;
	cputime_t cgtime;
	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@@ -561,6 -507,14 +560,6 @@@
	unsigned long inblock, oublock, cinblock, coublock;
	struct task_io_accounting ioac;
-	/*
-	 * Cumulative ns of scheduled CPU time for dead threads in the
-	 * group, not including a zombie group leader.  (This only differs
-	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
-	 * other than jiffies.)
-	 */
-	unsigned long long sum_sched_runtime;
-
	/*
	 * We don't bother to synchronize most readers of this at all,
	 * because there is no reader checking a limit that actually needs
@@@ -572,6 -526,8 +571,6 @@@
	 */
	struct rlimit rlim[RLIM_NLIMITS];
-	struct list_head cpu_timers[3];
-
	/* keep the process-shared keyrings here so that they do the right
	 * thing in threads created with CLONE_THREAD */
#ifdef CONFIG_KEYS
@@@ -1180,7 -1136,8 +1179,7 @@@ struct task_struct
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt;
-	cputime_t it_prof_expires, it_virt_expires;
-	unsigned long long it_sched_expires;
+	struct task_cputime cputime_expires;
	struct list_head cpu_timers[3];
/* process credentials */
@@@ -1630,7 -1587,6 +1629,7 @@@ extern unsigned long long cpu_clock(in
extern unsigned long long task_sched_runtime(struct task_struct *task);
+extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
@@@ -1665,6 -1621,7 +1664,7 @@@ extern unsigned int sysctl_sched_featur
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_shares_ratelimit;
+ extern unsigned int sysctl_sched_shares_thresh;
int sched_nr_latency_handler(struct ctl_table *table, int write,
		struct file *file, void __user *buffer, size_t *length,
@@@ -2127,30 -2084,6 +2127,30 @@@ static inline int spin_needbreak(spinlo
#endif
}
+/*
+ * Thread group CPU time accounting.
+ */
+
+extern int thread_group_cputime_alloc(struct task_struct *);
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	sig->cputime.totals = NULL;
+}
+
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
+{
+	if (curr->signal->cputime.totals)
+		return 0;
+	return thread_group_cputime_alloc(curr);
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig)
+{
+	free_percpu(sig->cputime.totals);
+}
+
/*
 * Reevaluate whether the task has signals pending delivery.
 * Wake the task if so.
diff --combined kernel/sched.c
index d906f72b42d,11ca3901783..945a97b9600
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@@ -71,7 -71,6 +71,7 @@@
#include
#include
#include
+#include
#include
#include
@@@ -818,6 -817,13 +818,13 @@@ const_debug unsigned int sysctl_sched_n
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;
+ /*
+  * Inject some fuzzyness into changing the per-cpu group shares
+  * this avoids remote rq-locks at the expense of fairness.
+  * default: 4
+  */
+ unsigned int sysctl_sched_shares_thresh = 4;
+
/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
@@@ -1454,8 -1460,8 +1461,8 @@@ static void __set_se_shares(struct sche
 * Calculate and set the cpu's group shares.
 */
static void
- __update_group_shares_cpu(struct task_group *tg, int cpu,
- 		unsigned long sd_shares, unsigned long sd_rq_weight)
+ update_group_shares_cpu(struct task_group *tg, int cpu,
+ 		unsigned long sd_shares, unsigned long sd_rq_weight)
{
	int boost = 0;
	unsigned long shares;
@@@ -1486,19 -1492,23 +1493,23 @@@
	 *
	 */
	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
- 	/*
- 	 * record the actual number of shares, not the boosted amount.
- 	 */
- 	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- 	tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ 	if (abs(shares - tg->se[cpu]->load.weight) >
+ 			sysctl_sched_shares_thresh) {
+ 		struct rq *rq = cpu_rq(cpu);
+ 		unsigned long flags;
- 	if (shares < MIN_SHARES)
- 		shares = MIN_SHARES;
- 	else if (shares > MAX_SHARES)
- 		shares = MAX_SHARES;
+ 		spin_lock_irqsave(&rq->lock, flags);
+ 		/*
+ 		 * record the actual number of shares, not the boosted amount.
+ 		 */
+ 		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ 		tg->cfs_rq[cpu]->rq_weight = rq_weight;
- 	__set_se_shares(tg->se[cpu], shares);
+ 		__set_se_shares(tg->se[cpu], shares);
+ 		spin_unlock_irqrestore(&rq->lock, flags);
+ 	}
}
/*
@@@ -1527,14 -1537,8 +1538,8 @@@ static int tg_shares_up(struct task_gro
	if (!rq_weight)
		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
- 	for_each_cpu_mask(i, sd->span) {
- 		struct rq *rq = cpu_rq(i);
- 		unsigned long flags;
-
- 		spin_lock_irqsave(&rq->lock, flags);
- 		__update_group_shares_cpu(tg, i, shares, rq_weight);
- 		spin_unlock_irqrestore(&rq->lock, flags);
- 	}
+ 	for_each_cpu_mask(i, sd->span)
+ 		update_group_shares_cpu(tg, i, shares, rq_weight);
	return 0;
}
@@@ -1937,7 -1941,6 +1942,7 @@@ unsigned long wait_task_inactive(struc
	 * just go back and repeat.
	 */
	rq = task_rq_lock(p, &flags);
+	trace_sched_wait_task(rq, p);
	running = task_running(rq, p);
	on_rq = p->se.on_rq;
	ncsw = 0;
@@@ -2299,7 -2302,9 +2304,7 @@@ out_activate
	success = 1;
out_running:
-	trace_mark(kernel_sched_wakeup,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
+	trace_sched_wakeup(rq, p);
	check_preempt_curr(rq, p, sync);
	p->state = TASK_RUNNING;
@@@ -2432,7 -2437,9 +2437,7 @@@ void wake_up_new_task(struct task_struc
		p->sched_class->task_new(rq, p);
		inc_nr_running(rq);
	}
-	trace_mark(kernel_sched_wakeup_new,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
+	trace_sched_wakeup_new(rq, p);
	check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
@@@ -2605,7 -2612,11 +2610,7 @@@ context_switch(struct rq *rq, struct ta
	struct mm_struct *mm, *oldmm;
	prepare_task_switch(rq, prev, next);
-	trace_mark(kernel_sched_schedule,
-		"prev_pid %d next_pid %d prev_state %ld "
-		"## rq %p prev %p next %p",
-		prev->pid, next->pid, prev->state,
-		rq, prev, next);
+	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
@@@ -2845,7 -2856,6 +2850,7 @@@ static void sched_migrate_task(struct t
	    || unlikely(!cpu_active(dest_cpu)))
		goto out;
+	trace_sched_migrate_task(rq, p, dest_cpu);
	/* force the process onto the specified CPU */
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need to wait for migration thread (might exit: take ref). */
@@@ -4047,26 -4057,23 +4052,26 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
 */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
{
	unsigned long flags;
-	u64 ns, delta_exec;
	struct rq *rq;
+	u64 ns = 0;
	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime;
+	if (task_current(rq, p)) {
+		u64 delta_exec;
+
		update_rq_clock(rq);
		delta_exec = rq->clock - p->se.exec_start;
		if ((s64)delta_exec > 0)
-			ns += delta_exec;
+			ns = delta_exec;
	}
+
	task_rq_unlock(rq, &flags);
	return ns;
@@@ -4083,7 -4090,6 +4088,7 @@@ void account_user_time(struct task_stru
	cputime64_t tmp;
	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
	/* Add user time to cpustat. */
	tmp = cputime_to_cputime64(cputime);
@@@ -4108,7 -4114,6 +4113,7 @@@ static void account_guest_time(struct t
	tmp = cputime_to_cputime64(cputime);
	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
	p->gtime = cputime_add(p->gtime, cputime);
	cpustat->user = cputime64_add(cpustat->user, tmp);
@@@ -4144,7 -4149,6 +4149,7 @@@ void account_system_time(struct task_st
	}
	p->stime = cputime_add(p->stime, cputime);
+	account_group_system_time(p, cputime);
	/* Add system time to cpustat.
	 */
	tmp = cputime_to_cputime64(cputime);
@@@ -4186,7 -4190,6 +4191,7 @@@ void account_steal_time(struct task_str
	if (p == rq->idle) {
		p->stime = cputime_add(p->stime, steal);
+		account_group_system_time(p, steal);
		if (atomic_read(&rq->nr_iowait) > 0)
			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
		else
@@@ -4443,12 -4446,8 +4448,8 @@@ need_resched_nonpreemptible
	if (sched_feat(HRTICK))
		hrtick_clear(rq);
-	/*
-	 * Do the rq-clock update outside the rq lock:
-	 */
-	local_irq_disable();
+	spin_lock_irq(&rq->lock);
	update_rq_clock(rq);
-	spin_lock(&rq->lock);
	clear_tsk_need_resched(prev);
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
diff --combined kernel/sched_fair.c
index f604dae7131,a0aa38b10fd..9573c33688b
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@ -73,6 -73,8 +73,8 @@@ unsigned int sysctl_sched_wakeup_granul
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+ static const struct sched_class fair_sched_class;
+
/**************************************************************
 * CFS operations on generic schedulable entities:
 */
@@@ -334,7 -336,7 +336,7 @@@ int sched_nr_latency_handler(struct ctl
#endif
/*
- * delta *= w / rw
+ * delta *= P[w / rw]
 */
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@@ -348,15 -350,13 +350,13 @@@
}
/*
- * delta *= rw / w
+ * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
	return delta;
}
@@@ -386,26 -386,26 +386,26 @@@ static u64 __sched_period(unsigned lon
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
- * s = p*w/rw
+ * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+	unsigned long nr_running = cfs_rq->nr_running;
+
+	if (unlikely(!se->on_rq))
+		nr_running++;
+
+	return calc_delta_weight(__sched_period(nr_running), se);
}
/*
 * We calculate the vruntime slice of a to be inserted task
 *
- * vs = s*rw/w = p
+ * vs = s/w
 */
- static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
-	unsigned long nr_running = cfs_rq->nr_running;
-
-	if (!se->on_rq)
-		nr_running++;
-
-	return __sched_period(nr_running);
+	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
/*
@@@ -449,7 -449,6 +449,7 @@@ static void update_curr(struct cfs_rq *
		struct task_struct *curtask = task_of(curr);
		cpuacct_charge(curtask, delta_exec);
+		account_group_exec_runtime(curtask, delta_exec);
	}
}
@@@ -628,7 -627,7 +628,7 @@@ place_entity(struct cfs_rq *cfs_rq, str
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice_add(cfs_rq, se);
+		vruntime += sched_vslice(cfs_rq, se);
	if (!initial) {
		/* sleeps upto a single latency don't count.
		 */
@@@ -748,7 -747,7 +748,7 @@@ pick_next(struct cfs_rq *cfs_rq, struc
	struct rq *rq = rq_of(cfs_rq);
	u64 pair_slice = rq->clock - cfs_rq->pair_start;
-	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+	if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) {
		cfs_rq->pair_start = rq->clock;
		return se;
	}
@@@ -849,11 -848,31 +849,31 @@@ static void hrtick_start_fair(struct r
		hrtick_start(rq, delta);
	}
}
+
+ /*
+  * called from enqueue/dequeue and updates the hrtick when the
+  * current task is from our class and nr_running is low enough
+  * to matter.
+  */
+ static void hrtick_update(struct rq *rq)
+ {
+ 	struct task_struct *curr = rq->curr;
+
+ 	if (curr->sched_class != &fair_sched_class)
+ 		return;
+
+ 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+ 		hrtick_start_fair(rq, curr);
+ }
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
+
+ static inline void hrtick_update(struct rq *rq)
+ {
+ }
#endif
/*
@@@ -874,7 -893,7 +894,7 @@@ static void enqueue_task_fair(struct r
		wakeup = 1;
	}
-	hrtick_start_fair(rq, rq->curr);
+	hrtick_update(rq);
}
/*
@@@ -896,7 -915,7 +916,7 @@@ static void dequeue_task_fair(struct r
		sleep = 1;
	}
-	hrtick_start_fair(rq, rq->curr);
+	hrtick_update(rq);
}
/*
@@@ -1002,8 -1021,6 +1022,6 @@@ static inline int wake_idle(int cpu, st
#ifdef CONFIG_SMP
- static const struct sched_class fair_sched_class;
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the root_task_group
diff --combined kernel/sched_stats.h
index b8c156979cf,67579253b53..2df9d297d29
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@@ -9,7 -9,7 +9,7 @@@
static int show_schedstat(struct seq_file *seq, void *v)
{
	int cpu;
-	int mask_len = NR_CPUS/32 * 9;
+	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
	char *mask_str = kmalloc(mask_len, GFP_KERNEL);
	if (mask_str == NULL)
@@@ -270,89 -270,3 +270,89 @@@ sched_info_switch(struct task_struct *p
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick. None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the utime field of the
+ *           thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+					cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+		times->utime = cputime_add(times->utime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the stime field of the
+ *           thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+					cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+		times->stime = cputime_add(times->stime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @ns: Time value by which to increment the sum_exec_runtime field
+ *      of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					unsigned long long ns)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+		times->sum_exec_runtime += ns;
+		put_cpu_no_resched();
+	}
+}
diff --combined kernel/sysctl.c
index b3cc73931d1,3d804f41e64..a13bd4dfaeb
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -274,6 -274,16 +274,16 @@@ static struct ctl_table kern_table[] =
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
+ 	{
+ 		.ctl_name = CTL_UNNUMBERED,
+ 		.procname = "sched_shares_thresh",
+ 		.data = &sysctl_sched_shares_thresh,
+ 		.maxlen = sizeof(unsigned int),
+ 		.mode = 0644,
+ 		.proc_handler = &proc_dointvec_minmax,
+ 		.strategy = &sysctl_intvec,
+ 		.extra1 = &zero,
+ 	},
	{
		.ctl_name = CTL_UNNUMBERED,
		.procname = "sched_child_runs_first",
@@@ -833,16 -843,6 +843,16 @@@
		.proc_handler = &proc_dointvec,
	},
#endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "scan_unevictable_pages",
+		.data = &scan_unevictable_pages,
+		.maxlen = sizeof(scan_unevictable_pages),
+		.mode = 0644,
+		.proc_handler = &scan_unevictable_handler,
+	},
+#endif
	/*
	 * NOTE: do not add new entries to this table unless you have read
	 * Documentation/sysctl/ctl_unnumbered.txt
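
For reference, the per-cpu totals that the account_group_*() helpers in kernel/sched_stats.h update are meant to be folded into a single struct task_cputime whenever a whole-group clock value is needed (see the thread_group_cputime() declaration in the include/linux/sched.h hunk above). The sketch below is illustrative only and is not part of this merge; the helper name is made up, and it assumes sig->cputime.totals was allocated by thread_group_cputime_alloc() as declared above:

/*
 * Illustrative sketch only -- not part of the patch.  Folds the per-cpu
 * thread-group totals into one struct task_cputime.
 */
static void sum_thread_group_cputime(struct signal_struct *sig,
				     struct task_cputime *sum)
{
	int cpu;

	sum->utime = cputime_zero;
	sum->stime = cputime_zero;
	sum->sum_exec_runtime = 0;

	if (!sig->cputime.totals)
		return;

	for_each_possible_cpu(cpu) {
		struct task_cputime *tot;

		tot = per_cpu_ptr(sig->cputime.totals, cpu);
		sum->utime = cputime_add(sum->utime, tot->utime);
		sum->stime = cputime_add(sum->stime, tot->stime);
		sum->sum_exec_runtime += tot->sum_exec_runtime;
	}
}

A reader summing like this accepts a small amount of skew from concurrent updates, which matches the trade-off the accounting helpers above make by touching only the local CPU's slot under get_cpu()/put_cpu_no_resched().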
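
As a worked illustration of the mask_len change in kernel/sched_stats.h (again not from the patch, just a standalone userspace demonstration): the old expression NR_CPUS/32 * 9 rounds down, so any NR_CPUS below 32 asks kmalloc() for zero bytes, while DIV_ROUND_UP() reserves nine characters for every started 32-bit word of the cpumask:

/* Standalone demonstration of the rounding difference. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus;

	for (nr_cpus = 8; nr_cpus <= 128; nr_cpus *= 2)
		printf("NR_CPUS=%3d  old=%2d bytes  new=%2d bytes\n",
		       nr_cpus, nr_cpus / 32 * 9,
		       DIV_ROUND_UP(nr_cpus, 32) * 9);
	return 0;
}

For NR_CPUS = 8 or 16 the old formula yields 0 bytes and the new one 9, which is the underallocation the "fix the wrong mask_len" commits in this merge address.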