From: Linus Torvalds
Date: Thu, 23 Oct 2008 17:53:02 +0000 (-0700)
Subject: Merge branch 'v28-range-hrtimers-for-linus-v2' of git://git.kernel.org/pub/scm/linux...
X-Git-Tag: v2.6.28-rc1~18
X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=1f6d6e8ebe73ba9d9d4c693f7f6f50f661dbd6e4;hp=-c;p=linux-2.6-omap-h63xx.git

Merge branch 'v28-range-hrtimers-for-linus-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'v28-range-hrtimers-for-linus-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (37 commits)
  hrtimers: add missing docbook comments to struct hrtimer
  hrtimers: simplify hrtimer_peek_ahead_timers()
  hrtimers: fix docbook comments
  DECLARE_PER_CPU needs linux/percpu.h
  hrtimers: fix typo
  rangetimers: fix the bug reported by Ingo for real
  rangetimer: fix BUG_ON reported by Ingo
  rangetimer: fix x86 build failure for the !HRTIMERS case
  select: fix alpha OSF wrapper
  select: fix alpha OSF wrapper
  hrtimer: peek at the timer queue just before going idle
  hrtimer: make the futex() system call use the per process slack value
  hrtimer: make the nanosleep() syscall use the per process slack
  hrtimer: fix signed/unsigned bug in slack estimator
  hrtimer: show the timer ranges in /proc/timer_list
  hrtimer: incorporate feedback from Peter Zijlstra
  hrtimer: add a hrtimer_start_range() function
  hrtimer: another build fix
  hrtimer: fix build bug found by Ingo
  hrtimer: make select() and poll() use the hrtimer range feature
  ...
---
1f6d6e8ebe73ba9d9d4c693f7f6f50f661dbd6e4
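The bulk of the select()/poll() churn in this series follows a single pattern: the relative jiffies timeout (an s64) is replaced by an absolute struct timespec end_time, validated up front with poll_select_set_timeout() and copied back to userspace afterwards with poll_select_copy_remaining(). A minimal sketch of that flow, written as if it sat inside fs/select.c next to those helpers; the wrapper function itself is hypothetical and error handling is trimmed:

#include <linux/poll.h>    /* poll_select_set_timeout(), core_sys_select() */
#include <linux/time.h>    /* struct timespec, struct timeval, NSEC_PER_USEC */
#include <linux/uaccess.h> /* copy_from_user() */

/* Hypothetical select()-style wrapper showing the new end_time flow. */
static long example_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timeval __user *tvp)
{
    struct timespec end_time, *to = NULL;
    struct timeval tv;
    int ret;

    if (tvp) {
        if (copy_from_user(&tv, tvp, sizeof(tv)))
            return -EFAULT;

        to = &end_time;
        /* Rejects negative values and converts to an absolute deadline. */
        if (poll_select_set_timeout(to, tv.tv_sec,
                                    tv.tv_usec * NSEC_PER_USEC))
            return -EINVAL;
    }

    ret = core_sys_select(n, inp, outp, exp, to);

    /* Copy the unslept remainder back to userspace (timeval flavour). */
    return poll_select_copy_remaining(&end_time, tvp, 1, ret);
}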
diff --combined arch/alpha/kernel/osf_sys.c
index f25f6c49095,8e19acbf288..18a3ea1aac5
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@@ -165,11 -165,14 +165,11 @@@ osf_getdirentries(unsigned int fd, stru
  	buf.error = 0;
 
  	error = vfs_readdir(file, osf_filldir, &buf);
- 	if (error < 0)
- 		goto out_putf;
- 
- 	error = buf.error;
+ 	if (error >= 0)
+ 		error = buf.error;
  	if (count != buf.count)
  		error = count - buf.count;
 
- out_putf:
  	fput(file);
  out:
  	return error;
@@@ -983,10 -986,12 +983,12 @@@ asmlinkage in
  osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp,
  	   struct timeval32 __user *tvp)
  {
- 	s64 timeout = MAX_SCHEDULE_TIMEOUT;
+ 	struct timespec end_time, *to = NULL;
  	if (tvp) {
  		time_t sec, usec;
 
+ 		to = &end_time;
+ 
  		if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp)) ||
  		    __get_user(sec, &tvp->tv_sec) ||
  		    __get_user(usec, &tvp->tv_usec)) {
@@@ -996,14 -1001,13 +998,13 @@@
  		if (sec < 0 || usec < 0)
  			return -EINVAL;
 
- 		if ((unsigned long) sec < MAX_SELECT_SECONDS) {
- 			timeout = (usec + 1000000/HZ - 1) / (1000000/HZ);
- 			timeout += sec * (unsigned long) HZ;
- 		}
+ 		if (poll_select_set_timeout(to, sec, usec * NSEC_PER_USEC))
+ 			return -EINVAL;
+ 	}
 
  	/* OSF does not copy back the remaining time. */
- 	return core_sys_select(n, inp, outp, exp, &timeout);
+ 	return core_sys_select(n, inp, outp, exp, to);
  }
 
  struct rusage32 {
diff --combined arch/powerpc/oprofile/cell/spu_profiler.c
index 6edaebd5099,02ffe060db5..dd499c3e9da
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@@ -23,11 -23,12 +23,11 @@@
  static u32 *samples;
 
- static int spu_prof_running;
+ int spu_prof_running;
  static unsigned int profiling_interval;
 
  #define NUM_SPU_BITS_TRBUF 16
  #define SPUS_PER_TB_ENTRY   4
- #define SPUS_PER_NODE       8
 
  #define SPU_PC_MASK          0xFFFF
 
@@@ -195,7 -196,7 +195,7 @@@ int start_spu_profiling(unsigned int cy
  	pr_debug("timer resolution: %lu\n", TICK_NSEC);
  	kt = ktime_set(0, profiling_interval);
  	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- 	timer.expires = kt;
+ 	hrtimer_set_expires(&timer, kt);
  	timer.function = profile_spus;
 
  	/* Allocate arrays for collecting SPU PC samples */
@@@ -207,7 -208,6 +207,7 @@@
  	spu_prof_running = 1;
  	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+ 	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
 
  	return 0;
  }
diff --combined drivers/cpuidle/cpuidle.c
index bb6e3b33804,2e314849936..5bed73329ef
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@@ -16,6 -16,7 +16,7 @@@
  #include
  #include
  #include
+ #include
 
  #include "cpuidle.h"
 
@@@ -56,14 -57,16 +57,20 @@@ static void cpuidle_idle_call(void
  		if (pm_idle_old)
  			pm_idle_old();
  		else
+ #if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
+ 			default_idle();
+ #else
  			local_irq_enable();
+ #endif
  		return;
  	}
 
+ 	/*
+ 	 * run any timers that can be run now, at this point
+ 	 * before calculating the idle duration etc.
+ 	 */
+ 	hrtimer_peek_ahead_timers();
+ 
  	/* ask the governor for the next state */
  	next_state = cpuidle_curr_governor->select(dev);
  	if (need_resched())
@@@ -71,11 -74,8 +78,11 @@@
  	target_state = &dev->states[next_state];
 
  	/* enter the state and update stats */
- 	dev->last_residency = target_state->enter(dev, target_state);
  	dev->last_state = target_state;
+ 	dev->last_residency = target_state->enter(dev, target_state);
+ 	if (dev->last_state)
+ 		target_state = dev->last_state;
+ 
  	target_state->time += (unsigned long long)dev->last_residency;
  	target_state->usage++;
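Two changes recur in the hunks above: writes straight to timer->expires are replaced by the new hrtimer_set_expires()/hrtimer_start_expires() accessors (once expiry becomes a soft/hard range, the field can no longer be poked directly), and the cpuidle loop now calls hrtimer_peek_ahead_timers() so any timer that is already due runs before the governor estimates the idle period. A minimal sketch of the accessor pattern; the timer, callback and 10 ms period below are made up for illustration:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/time.h>    /* NSEC_PER_MSEC */

static struct hrtimer example_timer;    /* hypothetical periodic timer */

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
    /* Re-arm relative to the stored (soft) expiry time. */
    hrtimer_forward_now(t, ktime_set(0, 10 * NSEC_PER_MSEC));
    return HRTIMER_RESTART;
}

static void example_timer_start(void)
{
    ktime_t kt = ktime_set(0, 10 * NSEC_PER_MSEC);

    hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    example_timer.function = example_timer_fn;

    /* Previously this would have been: example_timer.expires = kt; */
    hrtimer_set_expires(&example_timer, kt);
    hrtimer_start_expires(&example_timer, HRTIMER_MODE_REL);
}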
diff --combined fs/compat.c
index cb36245f9fe,3b58c32be52..fe3c9bf8760
--- a/fs/compat.c
+++ b/fs/compat.c
@@@ -869,7 -869,7 +869,7 @@@ asmlinkage long compat_sys_old_readdir(
  	buf.dirent = dirent;
 
  	error = vfs_readdir(file, compat_fillonedir, &buf);
- 	if (error >= 0)
+ 	if (buf.result)
  		error = buf.result;
 
  	fput(file);
@@@ -956,8 -956,9 +956,8 @@@ asmlinkage long compat_sys_getdents(uns
  	buf.error = 0;
 
  	error = vfs_readdir(file, compat_filldir, &buf);
- 	if (error < 0)
- 		goto out_putf;
- 	error = buf.error;
+ 	if (error >= 0)
+ 		error = buf.error;
  	lastdirent = buf.previous;
  	if (lastdirent) {
  		if (put_user(file->f_pos, &lastdirent->d_off))
@@@ -965,6 -966,8 +965,6 @@@
  		else
  			error = count - buf.count;
  	}
- 
- out_putf:
  	fput(file);
  out:
  	return error;
@@@ -1044,16 -1047,19 +1044,16 @@@ asmlinkage long compat_sys_getdents64(u
  	buf.error = 0;
 
  	error = vfs_readdir(file, compat_filldir64, &buf);
- 	if (error < 0)
- 		goto out_putf;
- 	error = buf.error;
+ 	if (error >= 0)
+ 		error = buf.error;
  	lastdirent = buf.previous;
  	if (lastdirent) {
  		typeof(lastdirent->d_off) d_off = file->f_pos;
- 		error = -EFAULT;
  		if (__put_user_unaligned(d_off, &lastdirent->d_off))
- 			goto out_putf;
- 		error = count - buf.count;
+ 			error = -EFAULT;
+ 		else
+ 			error = count - buf.count;
  	}
- 
- out_putf:
  	fput(file);
  out:
  	return error;
@@@ -1469,6 -1475,57 +1469,57 @@@ out_ret
 
  #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
 
+ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+ 				      int timeval, int ret)
+ {
+ 	struct timespec ts;
+ 
+ 	if (!p)
+ 		return ret;
+ 
+ 	if (current->personality & STICKY_TIMEOUTS)
+ 		goto sticky;
+ 
+ 	/* No update for zero timeout */
+ 	if (!end_time->tv_sec && !end_time->tv_nsec)
+ 		return ret;
+ 
+ 	ktime_get_ts(&ts);
+ 	ts = timespec_sub(*end_time, ts);
+ 	if (ts.tv_sec < 0)
+ 		ts.tv_sec = ts.tv_nsec = 0;
+ 
+ 	if (timeval) {
+ 		struct compat_timeval rtv;
+ 
+ 		rtv.tv_sec = ts.tv_sec;
+ 		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+ 
+ 		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+ 			return ret;
+ 	} else {
+ 		struct compat_timespec rts;
+ 
+ 		rts.tv_sec = ts.tv_sec;
+ 		rts.tv_nsec = ts.tv_nsec;
+ 
+ 		if (!copy_to_user(p, &rts, sizeof(rts)))
+ 			return ret;
+ 	}
+ 	/*
+ 	 * If an application puts its timeval in read-only memory, we
+ 	 * don't want the Linux-specific update to the timeval to
+ 	 * cause a fault after the select has completed
+ 	 * successfully. However, because we're not updating the
+ 	 * timeval, we can't restart the system call.
+ 	 */
+ 
+ sticky:
+ 	if (ret == -ERESTARTNOHAND)
+ 		ret = -EINTR;
+ 	return ret;
+ }
+ 
  /*
   * Ooo, nasty. We need here to frob 32-bit unsigned longs to
   * 64-bit unsigned longs.
@@@ -1550,7 -1607,8 +1601,8 @@@ int compat_set_fd_set(unsigned long nr
  	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
  int compat_core_sys_select(int n, compat_ulong_t __user *inp,
- 	compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
+ 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+ 	struct timespec *end_time)
  {
  	fd_set_bits fds;
  	void *bits;
@@@ -1597,7 -1655,7 +1649,7 @@@
  	zero_fd_set(n, fds.res_out);
  	zero_fd_set(n, fds.res_ex);
 
- 	ret = do_select(n, &fds, timeout);
+ 	ret = do_select(n, &fds, end_time);
 
  	if (ret < 0)
  		goto out;
@@@ -1623,7 -1681,7 +1675,7 @@@ asmlinkage long compat_sys_select(int n
  	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
  	struct compat_timeval __user *tvp)
  {
- 	s64 timeout = -1;
+ 	struct timespec end_time, *to = NULL;
  	struct compat_timeval tv;
  	int ret;
 
@@@ -1631,43 -1689,14 +1683,14 @@@
  		if (copy_from_user(&tv, tvp, sizeof(tv)))
  			return -EFAULT;
 
- 		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+ 		to = &end_time;
+ 		if (poll_select_set_timeout(to, tv.tv_sec,
+ 				tv.tv_usec * NSEC_PER_USEC))
  			return -EINVAL;
- 
- 		/* Cast to u64 to make GCC stop complaining */
- 		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
- 			timeout = -1;	/* infinite */
- 		else {
- 			timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ);
- 			timeout += tv.tv_sec * HZ;
- 		}
  	}
 
- 	ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
- 
- 	if (tvp) {
- 		struct compat_timeval rtv;
- 
- 		if (current->personality & STICKY_TIMEOUTS)
- 			goto sticky;
- 		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
- 		rtv.tv_sec = timeout;
- 		if (compat_timeval_compare(&rtv, &tv) >= 0)
- 			rtv = tv;
- 		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
- sticky:
- 			/*
- 			 * If an application puts its timeval in read-only
- 			 * memory, we don't want the Linux-specific update to
- 			 * the timeval to cause a fault after the select has
- 			 * completed successfully. However, because we're not
- 			 * updating the timeval, we can't restart the system
- 			 * call.
- 			 */
- 			if (ret == -ERESTARTNOHAND)
- 				ret = -EINTR;
- 		}
- 	}
+ 	ret = compat_core_sys_select(n, inp, outp, exp, to);
+ 	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
  	return ret;
  }
 
@@@ -1680,15 -1709,16 +1703,16 @@@ asmlinkage long compat_sys_pselect7(in
  {
  	compat_sigset_t ss32;
  	sigset_t ksigmask, sigsaved;
- 	s64 timeout = MAX_SCHEDULE_TIMEOUT;
  	struct compat_timespec ts;
+ 	struct timespec end_time, *to = NULL;
  	int ret;
 
  	if (tsp) {
  		if (copy_from_user(&ts, tsp, sizeof(ts)))
  			return -EFAULT;
 
- 		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+ 		to = &end_time;
+ 		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
  			return -EINVAL;
  	}
 
@@@ -1703,51 -1733,8 +1727,8 @@@
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}
 
- 	do {
- 		if (tsp) {
- 			if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
- 				timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
- 				timeout += ts.tv_sec * (unsigned long)HZ;
- 				ts.tv_sec = 0;
- 				ts.tv_nsec = 0;
- 			} else {
- 				ts.tv_sec -= MAX_SELECT_SECONDS;
- 				timeout = MAX_SELECT_SECONDS * HZ;
- 			}
- 		}
- 
- 		ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
- 
- 	} while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
- 
- 	if (tsp) {
- 		struct compat_timespec rts;
- 
- 		if (current->personality & STICKY_TIMEOUTS)
- 			goto sticky;
- 
- 		rts.tv_sec = timeout / HZ;
- 		rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
- 		if (rts.tv_nsec >= NSEC_PER_SEC) {
- 			rts.tv_sec++;
- 			rts.tv_nsec -= NSEC_PER_SEC;
- 		}
- 		if (compat_timespec_compare(&rts, &ts) >= 0)
- 			rts = ts;
- 		if (copy_to_user(tsp, &rts, sizeof(rts))) {
- sticky:
- 			/*
- 			 * If an application puts its timeval in read-only
- 			 * memory, we don't want the Linux-specific update to
- 			 * the timeval to cause a fault after the select has
- 			 * completed successfully. However, because we're not
- 			 * updating the timeval, we can't restart the system
- 			 * call.
- 			 */
- 			if (ret == -ERESTARTNOHAND)
- 				ret = -EINTR;
- 		}
- 	}
+ 	ret = compat_core_sys_select(n, inp, outp, exp, to);
+ 	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
  	if (ret == -ERESTARTNOHAND) {
  		/*
@@@ -1792,18 -1779,16 +1773,16 @@@ asmlinkage long compat_sys_ppoll(struc
  	compat_sigset_t ss32;
  	sigset_t ksigmask, sigsaved;
  	struct compat_timespec ts;
- 	s64 timeout = -1;
+ 	struct timespec end_time, *to = NULL;
  	int ret;
 
  	if (tsp) {
  		if (copy_from_user(&ts, tsp, sizeof(ts)))
  			return -EFAULT;
 
- 		/* We assume that ts.tv_sec is always lower than
- 		   the number of seconds that can be expressed in
- 		   an s64. Otherwise the compiler bitches at us */
- 		timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
- 		timeout += ts.tv_sec * HZ;
+ 		to = &end_time;
+ 		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ 			return -EINVAL;
  	}
 
  	if (sigmask) {
@@@ -1817,7 -1802,7 +1796,7 @@@
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}
 
- 	ret = do_sys_poll(ufds, nfds, &timeout);
+ 	ret = do_sys_poll(ufds, nfds, to);
 
  	/* We can restart this syscall, usually */
  	if (ret == -EINTR) {
@@@ -1835,31 -1820,7 +1814,7 @@@
  	} else if (sigmask)
  		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
- 	if (tsp && timeout >= 0) {
- 		struct compat_timespec rts;
- 
- 		if (current->personality & STICKY_TIMEOUTS)
- 			goto sticky;
- 		/* Yes, we know it's actually an s64, but it's also positive. */
- 		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
- 						1000;
- 		rts.tv_sec = timeout;
- 		if (compat_timespec_compare(&rts, &ts) >= 0)
- 			rts = ts;
- 		if (copy_to_user(tsp, &rts, sizeof(rts))) {
- sticky:
- 			/*
- 			 * If an application puts its timeval in read-only
- 			 * memory, we don't want the Linux-specific update to
- 			 * the timeval to cause a fault after the select has
- 			 * completed successfully. However, because we're not
- 			 * updating the timeval, we can't restart the system
- 			 * call.
- 			 */
- 			if (ret == -ERESTARTNOHAND && timeout >= 0)
- 				ret = -EINTR;
- 		}
- 	}
+ 	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
  	return ret;
  }
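The comment block preserved in poll_select_copy_remaining() above describes a Linux-specific contract worth spelling out: select() rewrites the caller's timeval with the time that was not slept, unless the task's personality sets STICKY_TIMEOUTS or the write-back would fault. A small, self-contained userspace demonstration of that behaviour (the file name and the 2 s value are arbitrary):

/* select_remaining.c - build with: cc -Wall -o select_remaining select_remaining.c */
#include <stdio.h>
#include <sys/select.h>

int main(void)
{
    struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
    fd_set rfds;
    int n;

    FD_ZERO(&rfds);
    FD_SET(0, &rfds);    /* wait for input on stdin */

    n = select(1, &rfds, NULL, NULL, &tv);

    /* On Linux, tv now holds the unslept remainder: close to 2 s if
     * input arrived immediately, near zero if the call timed out. */
    printf("select() = %d, remaining = %ld.%06ld s\n",
           n, (long)tv.tv_sec, (long)tv.tv_usec);
    return 0;
}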
diff --combined include/linux/sched.h
index 10bff55b082,9ee3bed0ff0..5ca620573d4
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -287,6 -287,7 +287,6 @@@ extern void trap_init(void
  extern void account_process_tick(struct task_struct *task, int user);
  extern void update_process_times(int user);
  extern void scheduler_tick(void);
- extern void hrtick_resched(void);
 
  extern void sched_show_task(struct task_struct *p);
 
@@@ -1345,6 -1346,12 +1345,12 @@@ struct task_struct
  	int latency_record_count;
  	struct latency_record latency_record[LT_SAVECOUNT];
  #endif
+ 	/*
+ 	 * time slack values; these are used to round up poll() and
+ 	 * select() etc timeout values. These are in nanoseconds.
+ 	 */
+ 	unsigned long timer_slack_ns;
+ 	unsigned long default_timer_slack_ns;
  };
 
  /*
@@@ -1664,7 -1671,6 +1670,7 @@@ extern unsigned int sysctl_sched_featur
  extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
  extern unsigned int sysctl_sched_shares_ratelimit;
+ extern unsigned int sysctl_sched_shares_thresh;
 
  int sched_nr_latency_handler(struct ctl_table *table, int write,
  		struct file *file, void __user *buffer, size_t *length,
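The two new task_struct fields are the per-process timer slack that the nanosleep(), futex(), poll() and select() paths elsewhere in this series feed into the hrtimer range: a timer may fire anywhere between the requested expiry and expiry plus the slack, which lets nearby timers be coalesced into a single wakeup. A userspace sketch, assuming the PR_SET_TIMERSLACK/PR_GET_TIMERSLACK prctl interface (options 29 and 30) that accompanies these fields; the 1 ms figure is arbitrary:

/* timerslack.c - build with: cc -Wall -o timerslack timerslack.c */
#include <stdio.h>
#include <poll.h>
#include <sys/prctl.h>

#ifndef PR_SET_TIMERSLACK    /* older userspace headers may lack these */
#define PR_SET_TIMERSLACK 29
#define PR_GET_TIMERSLACK 30
#endif

int main(void)
{
    /* Allow the kernel to delay this thread's timers by up to 1 ms. */
    if (prctl(PR_SET_TIMERSLACK, 1000000UL, 0, 0, 0) != 0)
        perror("PR_SET_TIMERSLACK");

    printf("timer slack is now %d ns\n", prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0));

    /* This 100 ms poll() sleep may now be stretched by up to the slack. */
    poll(NULL, 0, 100);
    return 0;
}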
diff --combined kernel/sched.c
index 945a97b9600,bfa87918380..1645c721194
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@@ -227,9 -227,8 +227,8 @@@ static void start_rt_bandwidth(struct r
  		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
  		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- 		hrtimer_start(&rt_b->rt_period_timer,
- 			      rt_b->rt_period_timer.expires,
- 			      HRTIMER_MODE_ABS);
+ 		hrtimer_start_expires(&rt_b->rt_period_timer,
+ 				HRTIMER_MODE_ABS);
  	}
  	spin_unlock(&rt_b->rt_runtime_lock);
  }
 
@@@ -818,13 -817,6 +817,13 @@@ const_debug unsigned int sysctl_sched_n
   */
  unsigned int sysctl_sched_shares_ratelimit = 250000;
 
+ /*
+  * Inject some fuzzyness into changing the per-cpu group shares
+  * this avoids remote rq-locks at the expense of fairness.
+  * default: 4
+  */
+ unsigned int sysctl_sched_shares_thresh = 4;
+ 
  /*
   * period over which we measure -rt task cpu usage in us.
   * default: 1s
@@@ -1071,7 -1063,7 +1070,7 @@@ static void hrtick_start(struct rq *rq
  	struct hrtimer *timer = &rq->hrtick_timer;
  	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
- 	timer->expires = time;
+ 	hrtimer_set_expires(timer, time);
 
  	if (rq == this_rq()) {
  		hrtimer_restart(timer);
@@@ -1461,8 -1453,8 +1460,8 @@@ static void __set_se_shares(struct sche
   * Calculate and set the cpu's group shares.
   */
  static void
- __update_group_shares_cpu(struct task_group *tg, int cpu,
- 			  unsigned long sd_shares, unsigned long sd_rq_weight)
+ update_group_shares_cpu(struct task_group *tg, int cpu,
+ 			unsigned long sd_shares, unsigned long sd_rq_weight)
  {
  	int boost = 0;
  	unsigned long shares;
@@@ -1493,23 -1485,19 +1492,23 @@@
  	 *
  	 */
  	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
- 	/*
- 	 * record the actual number of shares, not the boosted amount.
- 	 */
- 	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- 	tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ 	if (abs(shares - tg->se[cpu]->load.weight) >
+ 			sysctl_sched_shares_thresh) {
+ 		struct rq *rq = cpu_rq(cpu);
+ 		unsigned long flags;
 
- 	if (shares < MIN_SHARES)
- 		shares = MIN_SHARES;
- 	else if (shares > MAX_SHARES)
- 		shares = MAX_SHARES;
+ 		spin_lock_irqsave(&rq->lock, flags);
+ 		/*
+ 		 * record the actual number of shares, not the boosted amount.
+ 		 */
+ 		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ 		tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
- 	__set_se_shares(tg->se[cpu], shares);
+ 		__set_se_shares(tg->se[cpu], shares);
+ 		spin_unlock_irqrestore(&rq->lock, flags);
+ 	}
  }
 
  /*
@@@ -1538,8 -1526,14 +1537,8 @@@ static int tg_shares_up(struct task_gro
  	if (!rq_weight)
  		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
 
- 	for_each_cpu_mask(i, sd->span) {
- 		struct rq *rq = cpu_rq(i);
- 		unsigned long flags;
- 
- 		spin_lock_irqsave(&rq->lock, flags);
- 		__update_group_shares_cpu(tg, i, shares, rq_weight);
- 		spin_unlock_irqrestore(&rq->lock, flags);
- 	}
+ 	for_each_cpu_mask(i, sd->span)
+ 		update_group_shares_cpu(tg, i, shares, rq_weight);
 
  	return 0;
  }
 
@@@ -4448,8 -4442,12 +4447,8 @@@ need_resched_nonpreemptible
  	if (sched_feat(HRTICK))
  		hrtick_clear(rq);
 
- 	/*
- 	 * Do the rq-clock update outside the rq lock:
- 	 */
- 	local_irq_disable();
+ 	spin_lock_irq(&rq->lock);
  	update_rq_clock(rq);
- 	spin_lock(&rq->lock);
  	clear_tsk_need_resched(prev);
 
  	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {