Merge branch 'timers/range-hrtimers' into v28-range-hrtimers-for-linus-v2
author Thomas Gleixner <tglx@linutronix.de>
Wed, 22 Oct 2008 07:48:06 +0000 (09:48 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Wed, 22 Oct 2008 07:48:06 +0000 (09:48 +0200)
Conflicts:

kernel/time/tick-sched.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
include/linux/hrtimer.h
include/linux/sched.h
include/linux/time.h
kernel/fork.c
kernel/hrtimer.c
kernel/posix-timers.c
kernel/sched.c
kernel/sys.c
kernel/time/ntp.c
kernel/time/tick-sched.c
kernel/time/timer_list.c

diff --combined include/linux/hrtimer.h
index 9a4e35cd5f79d80e52bf3a1509e0438bb6a77ec4,58bca8e9bae11e02926bf835442421c6429a8fd4..2b3645b1acf4609e55fa36f22368cca34d439a2a
@@@ -20,6 -20,8 +20,8 @@@
  #include <linux/init.h>
  #include <linux/list.h>
  #include <linux/wait.h>
+ #include <linux/percpu.h>
  
  struct hrtimer_clock_base;
  struct hrtimer_cpu_base;
@@@ -101,9 -103,14 +103,14 @@@ enum hrtimer_cb_mode 
  /**
   * struct hrtimer - the basic hrtimer structure
   * @node:     red black tree node for time ordered insertion
-  * @expires:  the absolute expiry time in the hrtimers internal
+  * @_expires: the absolute expiry time in the hrtimers internal
   *            representation. The time is related to the clock on
-  *            which the timer is based.
+  *            which the timer is based. It is set up by adding
+  *            slack to the _softexpires value. For non-range timers
+  *            it is identical to _softexpires.
+  * @_softexpires: the absolute earliest expiry time of the hrtimer.
+  *            The time which was given as expiry time when the timer
+  *            was armed.
   * @function: timer expiry callback function
   * @base:     pointer to the timer base (per cpu and per clock)
   * @state:    state information (See bit values above)
   */
  struct hrtimer {
        struct rb_node                  node;
-       ktime_t                         expires;
+       ktime_t                         _expires;
+       ktime_t                         _softexpires;
        enum hrtimer_restart            (*function)(struct hrtimer *);
        struct hrtimer_clock_base       *base;
        unsigned long                   state;
 -      enum hrtimer_cb_mode            cb_mode;
        struct list_head                cb_entry;
 +      enum hrtimer_cb_mode            cb_mode;
  #ifdef CONFIG_TIMER_STATS
 +      int                             start_pid;
        void                            *start_site;
        char                            start_comm[16];
 -      int                             start_pid;
  #endif
  };
  
@@@ -155,8 -163,10 +163,8 @@@ struct hrtimer_sleeper 
   * @first:            pointer to the timer node which expires first
   * @resolution:               the resolution of the clock, in nanoseconds
   * @get_time:         function to retrieve the current time of the clock
 - * @get_softirq_time: function to retrieve the current time from the softirq
   * @softirq_time:     the time when running the hrtimer queue in the softirq
   * @offset:           offset of this clock to the monotonic base
 - * @reprogram:                function to reprogram the timer event
   */
  struct hrtimer_clock_base {
        struct hrtimer_cpu_base *cpu_base;
        struct rb_node          *first;
        ktime_t                 resolution;
        ktime_t                 (*get_time)(void);
 -      ktime_t                 (*get_softirq_time)(void);
        ktime_t                 softirq_time;
  #ifdef CONFIG_HIGH_RES_TIMERS
        ktime_t                 offset;
 -      int                     (*reprogram)(struct hrtimer *t,
 -                                           struct hrtimer_clock_base *b,
 -                                           ktime_t n);
  #endif
  };
  
@@@ -201,6 -215,71 +209,71 @@@ struct hrtimer_cpu_base 
  #endif
  };
  
+ static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
+ {
+       timer->_expires = time;
+       timer->_softexpires = time;
+ }
+ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
+ {
+       timer->_softexpires = time;
+       timer->_expires = ktime_add_safe(time, delta);
+ }
+ static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
+ {
+       timer->_softexpires = time;
+       timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
+ }
+ static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
+ {
+       timer->_expires.tv64 = tv64;
+       timer->_softexpires.tv64 = tv64;
+ }
+ static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
+ {
+       timer->_expires = ktime_add_safe(timer->_expires, time);
+       timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
+ }
+ static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
+ {
+       timer->_expires = ktime_add_ns(timer->_expires, ns);
+       timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
+ }
+ static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
+ {
+       return timer->_expires;
+ }
+ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
+ {
+       return timer->_softexpires;
+ }
+ static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
+ {
+       return timer->_expires.tv64;
+ }
+ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
+ {
+       return timer->_softexpires.tv64;
+ }
+ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
+ {
+       return ktime_to_ns(timer->_expires);
+ }
+ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
+ {
+     return ktime_sub(timer->_expires, timer->base->get_time());
+ }
  #ifdef CONFIG_HIGH_RES_TIMERS
  struct clock_event_device;
  
@@@ -221,6 -300,8 +294,8 @@@ static inline int hrtimer_is_hres_activ
        return timer->base->cpu_base->hres_active;
  }
  
+ extern void hrtimer_peek_ahead_timers(void);
  /*
   * The resolution of the clocks. The resolution value is returned in
   * the clock_getres() system call to give application programmers an
   * is expired in the next softirq when the clock was advanced.
   */
  static inline void clock_was_set(void) { }
+ static inline void hrtimer_peek_ahead_timers(void) { }
  
  static inline void hres_timers_resume(void) { }
  
@@@ -264,6 -346,10 +340,10 @@@ static inline int hrtimer_is_hres_activ
  extern ktime_t ktime_get(void);
  extern ktime_t ktime_get_real(void);
  
+ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
  /* Exported timer functions: */
  
  /* Initialize timers: */
@@@ -288,12 -374,25 +368,25 @@@ static inline void destroy_hrtimer_on_s
  /* Basic timer operations: */
  extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
                         const enum hrtimer_mode mode);
+ extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+                       unsigned long range_ns, const enum hrtimer_mode mode);
  extern int hrtimer_cancel(struct hrtimer *timer);
  extern int hrtimer_try_to_cancel(struct hrtimer *timer);
  
+ static inline int hrtimer_start_expires(struct hrtimer *timer,
+                                               enum hrtimer_mode mode)
+ {
+       unsigned long delta;
+       ktime_t soft, hard;
+       soft = hrtimer_get_softexpires(timer);
+       hard = hrtimer_get_expires(timer);
+       delta = ktime_to_ns(ktime_sub(hard, soft));
+       return hrtimer_start_range_ns(timer, soft, delta, mode);
+ }
  static inline int hrtimer_restart(struct hrtimer *timer)
  {
-       return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+       return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  }
  
  /* Query timers: */
@@@ -350,6 -449,10 +443,10 @@@ extern long hrtimer_nanosleep_restart(s
  extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                 struct task_struct *tsk);
  
+ extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+                                               const enum hrtimer_mode mode);
+ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
  /* Soft interrupt function to run the hrtimer queues: */
  extern void hrtimer_run_queues(void);
  extern void hrtimer_run_pending(void);
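
The inline helpers added above replace direct manipulation of timer->expires: _softexpires records the time the caller asked for, _expires records it plus the permitted slack, and all readers go through the accessors. A rough sketch of arming a range timer through the new interface (the example_* names and the 1 ms / 100 us values are illustrative assumptions, not part of the patch):

        #include <linux/hrtimer.h>
        #include <linux/ktime.h>

        static struct hrtimer example_timer;

        static enum hrtimer_restart example_fn(struct hrtimer *t)
        {
                /* Fires somewhere in the window [_softexpires, _expires]. */
                return HRTIMER_NORESTART;
        }

        static void example_arm(void)
        {
                /* Soft deadline 1 ms from now, hard deadline 100 us later. */
                ktime_t soft = ktime_add_ns(ktime_get(), 1000 * 1000);

                hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
                example_timer.function = example_fn;
                hrtimer_start_range_ns(&example_timer, soft, 100 * 1000,
                                       HRTIMER_MODE_ABS);
        }

hrtimer_start() keeps its old semantics: it now simply forwards to hrtimer_start_range_ns() with a zero-length range (see kernel/hrtimer.c below).
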
diff --combined include/linux/sched.h
index 5c38db536e07f8b7b0d6e5b0dc4f31a9e3cb07c1,de53c109fd04f1de9eb9fd973af5ae3e7f790922..9ee3bed0ff065108a2f52404c2705b88277730f3
@@@ -403,21 -403,12 +403,21 @@@ extern int get_dumpable(struct mm_struc
  #define MMF_DUMP_MAPPED_PRIVATE       4
  #define MMF_DUMP_MAPPED_SHARED        5
  #define MMF_DUMP_ELF_HEADERS  6
 +#define MMF_DUMP_HUGETLB_PRIVATE 7
 +#define MMF_DUMP_HUGETLB_SHARED  8
  #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
 -#define MMF_DUMP_FILTER_BITS  5
 +#define MMF_DUMP_FILTER_BITS  7
  #define MMF_DUMP_FILTER_MASK \
        (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
  #define MMF_DUMP_FILTER_DEFAULT \
 -      ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
 +      ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
 +       (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
 +
 +#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
 +# define MMF_DUMP_MASK_DEFAULT_ELF    (1 << MMF_DUMP_ELF_HEADERS)
 +#else
 +# define MMF_DUMP_MASK_DEFAULT_ELF    0
 +#endif
  
  struct sighand_struct {
        atomic_t                count;
@@@ -434,39 -425,6 +434,39 @@@ struct pacct_struct 
        unsigned long           ac_minflt, ac_majflt;
  };
  
 +/**
 + * struct task_cputime - collected CPU time counts
 + * @utime:            time spent in user mode, in &cputime_t units
 + * @stime:            time spent in kernel mode, in &cputime_t units
 + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
 + *
 + * This structure groups together three kinds of CPU time that are
 + * tracked for threads and thread groups.  Most things considering
 + * CPU time want to group these counts together and treat all three
 + * of them in parallel.
 + */
 +struct task_cputime {
 +      cputime_t utime;
 +      cputime_t stime;
 +      unsigned long long sum_exec_runtime;
 +};
 +/* Alternate field names when used to cache expirations. */
 +#define prof_exp      stime
 +#define virt_exp      utime
 +#define sched_exp     sum_exec_runtime
 +
 +/**
 + * struct thread_group_cputime - thread group interval timer counts
 + * @totals:           thread group interval timers; substructure for
 + *                    uniprocessor kernel, per-cpu for SMP kernel.
 + *
 + * This structure contains the version of task_cputime, above, that is
 + * used for thread group CPU clock calculations.
 + */
 +struct thread_group_cputime {
 +      struct task_cputime *totals;
 +};
 +
  /*
   * NOTE! "signal_struct" does not have its own
   * locking, because a shared signal_struct always
@@@ -512,17 -470,6 +512,17 @@@ struct signal_struct 
        cputime_t it_prof_expires, it_virt_expires;
        cputime_t it_prof_incr, it_virt_incr;
  
 +      /*
 +       * Thread group totals for process CPU clocks.
 +       * See thread_group_cputime(), et al, for details.
 +       */
 +      struct thread_group_cputime cputime;
 +
 +      /* Earliest-expiration cache. */
 +      struct task_cputime cputime_expires;
 +
 +      struct list_head cpu_timers[3];
 +
        /* job control IDs */
  
        /*
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
 -      cputime_t utime, stime, cutime, cstime;
 +      cputime_t cutime, cstime;
        cputime_t gtime;
        cputime_t cgtime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long inblock, oublock, cinblock, coublock;
        struct task_io_accounting ioac;
  
 -      /*
 -       * Cumulative ns of scheduled CPU time for dead threads in the
 -       * group, not including a zombie group leader.  (This only differs
 -       * from jiffies_to_ns(utime + stime) if sched_clock uses something
 -       * other than jiffies.)
 -       */
 -      unsigned long long sum_sched_runtime;
 -
        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
         */
        struct rlimit rlim[RLIM_NLIMITS];
  
 -      struct list_head cpu_timers[3];
 -
        /* keep the process-shared keyrings here so that they do the right
         * thing in threads created with CLONE_THREAD */
  #ifdef CONFIG_KEYS
@@@ -1180,7 -1137,8 +1180,7 @@@ struct task_struct 
  /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
        unsigned long min_flt, maj_flt;
  
 -      cputime_t it_prof_expires, it_virt_expires;
 -      unsigned long long it_sched_expires;
 +      struct task_cputime cputime_expires;
        struct list_head cpu_timers[3];
  
  /* process credentials */
        int latency_record_count;
        struct latency_record latency_record[LT_SAVECOUNT];
  #endif
+       /*
+        * time slack values; these are used to round up poll() and
+        * select() etc. timeout values. These are in nanoseconds.
+        */
+       unsigned long timer_slack_ns;
+       unsigned long default_timer_slack_ns;
  };
  
  /*
@@@ -1630,7 -1594,6 +1636,7 @@@ extern unsigned long long cpu_clock(in
  
  extern unsigned long long
  task_sched_runtime(struct task_struct *task);
 +extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
  
  /* sched_exec is called by processes performing an exec */
  #ifdef CONFIG_SMP
@@@ -2127,30 -2090,6 +2133,30 @@@ static inline int spin_needbreak(spinlo
  #endif
  }
  
 +/*
 + * Thread group CPU time accounting.
 + */
 +
 +extern int thread_group_cputime_alloc(struct task_struct *);
 +extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
 +
 +static inline void thread_group_cputime_init(struct signal_struct *sig)
 +{
 +      sig->cputime.totals = NULL;
 +}
 +
 +static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
 +{
 +      if (curr->signal->cputime.totals)
 +              return 0;
 +      return thread_group_cputime_alloc(curr);
 +}
 +
 +static inline void thread_group_cputime_free(struct signal_struct *sig)
 +{
 +      free_percpu(sig->cputime.totals);
 +}
 +
  /*
   * Reevaluate whether the task has signals pending delivery.
   * Wake the task if so.
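
struct task_cputime and struct thread_group_cputime above gather the per-thread and per-group CPU time that was previously spread over separate utime/stime/sum_sched_runtime fields, and thread_group_cputime() becomes the one way to read the group totals. A minimal sketch of a reader (the function name is an illustrative assumption, not an interface added by this patch):

        #include <linux/sched.h>

        /* Total CPU time, in nanoseconds, consumed so far by tsk's thread group. */
        static unsigned long long example_group_exec_runtime(struct task_struct *tsk)
        {
                struct task_cputime times;

                thread_group_cputime(tsk, &times);
                /* utime/stime are cputime_t; sum_exec_runtime is nanoseconds. */
                return times.sum_exec_runtime;
        }

The prof_exp/virt_exp/sched_exp aliases let the same structure double as the earliest-expiration cache (cputime_expires) used by the POSIX CPU timer code.
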
diff --combined include/linux/time.h
index 4f1c9db577079ed3cc688def99e476219b5aa9b0,c911ef69ea87e2dbb2c7cd8988dce850d13b79be..ce321ac5c8f8ceba2f0bdb7640dbc71a9eab6687
@@@ -40,6 -40,8 +40,8 @@@ extern struct timezone sys_tz
  #define NSEC_PER_SEC  1000000000L
  #define FSEC_PER_SEC  1000000000000000L
  
+ #define TIME_T_MAX    (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
  static inline int timespec_equal(const struct timespec *a,
                                   const struct timespec *b)
  {
@@@ -74,6 -76,8 +76,8 @@@ extern unsigned long mktime(const unsig
                            const unsigned int min, const unsigned int sec);
  
  extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
+ extern struct timespec timespec_add_safe(const struct timespec lhs,
+                                        const struct timespec rhs);
  
  /*
   * sub = lhs - rhs, in normalized form
@@@ -119,7 -123,6 +123,7 @@@ extern int do_setitimer(int which, stru
  extern unsigned int alarm_setitimer(unsigned int seconds);
  extern int do_getitimer(int which, struct itimerval *value);
  extern void getnstimeofday(struct timespec *tv);
 +extern void getrawmonotonic(struct timespec *ts);
  extern void getboottime(struct timespec *ts);
  extern void monotonic_to_bootbased(struct timespec *ts);
  
@@@ -128,9 -131,6 +132,9 @@@ extern int timekeeping_valid_for_hres(v
  extern void update_wall_time(void);
  extern void update_xtime_cache(u64 nsec);
  
 +struct tms;
 +extern void do_sys_times(struct tms *);
 +
  /**
   * timespec_to_ns - Convert timespec to nanoseconds
   * @ts:               pointer to the timespec variable to be converted
@@@ -220,7 -220,6 +224,7 @@@ struct itimerval 
  #define CLOCK_MONOTONIC                       1
  #define CLOCK_PROCESS_CPUTIME_ID      2
  #define CLOCK_THREAD_CPUTIME_ID               3
 +#define CLOCK_MONOTONIC_RAW           4
  
  /*
   * The IDs of various hardware clocks:
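
timespec_add_safe() pairs with the new TIME_T_MAX definition: when code such as select() adds a user-supplied relative timeout to the current time, the sum is meant to saturate rather than wrap time_t. A minimal sketch of such a caller, assuming that saturating behaviour (the function name is an illustrative assumption):

        #include <linux/time.h>

        /* Turn "now + timeout" into an absolute deadline that cannot overflow time_t. */
        static struct timespec example_deadline(struct timespec timeout)
        {
                struct timespec now;

                getnstimeofday(&now);
                return timespec_add_safe(now, timeout);
        }
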
diff --combined kernel/fork.c
index 4d093552dd6e79aea8553b32e1ff465cd0439116,37b3e150ae3956759b684c6806bbb6dc5ad12376..f6083561dfe0a9f8d2a13138f7332bc358a51653
@@@ -58,7 -58,6 +58,7 @@@
  #include <linux/tty.h>
  #include <linux/proc_fs.h>
  #include <linux/blkdev.h>
 +#include <trace/sched.h>
  
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
@@@ -760,44 -759,15 +760,44 @@@ void __cleanup_sighand(struct sighand_s
                kmem_cache_free(sighand_cachep, sighand);
  }
  
 +
 +/*
 + * Initialize POSIX timer handling for a thread group.
 + */
 +static void posix_cpu_timers_init_group(struct signal_struct *sig)
 +{
 +      /* Thread group counters. */
 +      thread_group_cputime_init(sig);
 +
 +      /* Expiration times and increments. */
 +      sig->it_virt_expires = cputime_zero;
 +      sig->it_virt_incr = cputime_zero;
 +      sig->it_prof_expires = cputime_zero;
 +      sig->it_prof_incr = cputime_zero;
 +
 +      /* Cached expiration times. */
 +      sig->cputime_expires.prof_exp = cputime_zero;
 +      sig->cputime_expires.virt_exp = cputime_zero;
 +      sig->cputime_expires.sched_exp = 0;
 +
 +      /* The timer lists. */
 +      INIT_LIST_HEAD(&sig->cpu_timers[0]);
 +      INIT_LIST_HEAD(&sig->cpu_timers[1]);
 +      INIT_LIST_HEAD(&sig->cpu_timers[2]);
 +}
 +
  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
  {
        struct signal_struct *sig;
        int ret;
  
        if (clone_flags & CLONE_THREAD) {
 -              atomic_inc(&current->signal->count);
 -              atomic_inc(&current->signal->live);
 -              return 0;
 +              ret = thread_group_cputime_clone_thread(current);
 +              if (likely(!ret)) {
 +                      atomic_inc(&current->signal->count);
 +                      atomic_inc(&current->signal->live);
 +              }
 +              return ret;
        }
        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
        tsk->signal = sig;
        sig->it_real_incr.tv64 = 0;
        sig->real_timer.function = it_real_fn;
  
 -      sig->it_virt_expires = cputime_zero;
 -      sig->it_virt_incr = cputime_zero;
 -      sig->it_prof_expires = cputime_zero;
 -      sig->it_prof_incr = cputime_zero;
 -
        sig->leader = 0;        /* session leadership doesn't inherit */
        sig->tty_old_pgrp = NULL;
        sig->tty = NULL;
  
 -      sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 +      sig->cutime = sig->cstime = cputime_zero;
        sig->gtime = cputime_zero;
        sig->cgtime = cputime_zero;
        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
        task_io_accounting_init(&sig->ioac);
 -      sig->sum_sched_runtime = 0;
 -      INIT_LIST_HEAD(&sig->cpu_timers[0]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[1]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[2]);
        taskstats_tgid_init(sig);
  
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);
  
 -      if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
 -              /*
 -               * New sole thread in the process gets an expiry time
 -               * of the whole CPU time limit.
 -               */
 -              tsk->it_prof_expires =
 -                      secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
 -      }
 +      posix_cpu_timers_init_group(sig);
 +
        acct_init_pacct(&sig->pacct);
  
        tty_audit_fork(sig);
  
  void __cleanup_signal(struct signal_struct *sig)
  {
 +      thread_group_cputime_free(sig);
        exit_thread_group_keys(sig);
        tty_kref_put(sig->tty);
        kmem_cache_free(signal_cachep, sig);
@@@ -903,19 -887,6 +903,19 @@@ void mm_init_owner(struct mm_struct *mm
  }
  #endif /* CONFIG_MM_OWNER */
  
 +/*
 + * Initialize POSIX timer handling for a single task.
 + */
 +static void posix_cpu_timers_init(struct task_struct *tsk)
 +{
 +      tsk->cputime_expires.prof_exp = cputime_zero;
 +      tsk->cputime_expires.virt_exp = cputime_zero;
 +      tsk->cputime_expires.sched_exp = 0;
 +      INIT_LIST_HEAD(&tsk->cpu_timers[0]);
 +      INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 +      INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 +}
 +
  /*
   * This creates a new process as a copy of the old one,
   * but does not actually start it yet.
@@@ -1018,6 -989,8 +1018,8 @@@ static struct task_struct *copy_process
        p->prev_utime = cputime_zero;
        p->prev_stime = cputime_zero;
  
+       p->default_timer_slack_ns = current->timer_slack_ns;
  #ifdef CONFIG_DETECT_SOFTLOCKUP
        p->last_switch_count = 0;
        p->last_switch_timestamp = 0;
        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);
  
 -      p->it_virt_expires = cputime_zero;
 -      p->it_prof_expires = cputime_zero;
 -      p->it_sched_expires = 0;
 -      INIT_LIST_HEAD(&p->cpu_timers[0]);
 -      INIT_LIST_HEAD(&p->cpu_timers[1]);
 -      INIT_LIST_HEAD(&p->cpu_timers[2]);
 +      posix_cpu_timers_init(p);
  
        p->lock_depth = -1;             /* -1 = no lock */
        do_posix_clock_monotonic_gettime(&p->start_time);
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 -
 -              if (!cputime_eq(current->signal->it_virt_expires,
 -                              cputime_zero) ||
 -                  !cputime_eq(current->signal->it_prof_expires,
 -                              cputime_zero) ||
 -                  current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
 -                  !list_empty(&current->signal->cpu_timers[0]) ||
 -                  !list_empty(&current->signal->cpu_timers[1]) ||
 -                  !list_empty(&current->signal->cpu_timers[2])) {
 -                      /*
 -                       * Have child wake up on its first tick to check
 -                       * for process CPU timers.
 -                       */
 -                      p->it_prof_expires = jiffies_to_cputime(1);
 -              }
        }
  
        if (likely(p->pid)) {
@@@ -1373,8 -1366,6 +1375,8 @@@ long do_fork(unsigned long clone_flags
        if (!IS_ERR(p)) {
                struct completion vfork;
  
 +              trace_sched_process_fork(current, p);
 +
                nr = task_pid_vnr(p);
  
                if (clone_flags & CLONE_PARENT_SETTID)
diff --combined kernel/hrtimer.c
index 95978f48e039fcbd7e7e233224a7e3f4a0b2f884,4fc41414fc068395b7202ec80e39ce404812efa7..2b465dfde4269b6d4609957999c9eecb0a148a36
@@@ -517,7 -517,7 +517,7 @@@ static void hrtimer_force_reprogram(str
                if (!base->first)
                        continue;
                timer = rb_entry(base->first, struct hrtimer, node);
-               expires = ktime_sub(timer->expires, base->offset);
+               expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                if (expires.tv64 < cpu_base->expires_next.tv64)
                        cpu_base->expires_next = expires;
        }
@@@ -539,10 -539,10 +539,10 @@@ static int hrtimer_reprogram(struct hrt
                             struct hrtimer_clock_base *base)
  {
        ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
-       ktime_t expires = ktime_sub(timer->expires, base->offset);
+       ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
        int res;
  
-       WARN_ON_ONCE(timer->expires.tv64 < 0);
+       WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
  
        /*
         * When the callback is running, we do not reprogram the clock event
@@@ -795,7 -795,7 +795,7 @@@ u64 hrtimer_forward(struct hrtimer *tim
        u64 orun = 1;
        ktime_t delta;
  
-       delta = ktime_sub(now, timer->expires);
+       delta = ktime_sub(now, hrtimer_get_expires(timer));
  
        if (delta.tv64 < 0)
                return 0;
                s64 incr = ktime_to_ns(interval);
  
                orun = ktime_divns(delta, incr);
-               timer->expires = ktime_add_ns(timer->expires, incr * orun);
-               if (timer->expires.tv64 > now.tv64)
+               hrtimer_add_expires_ns(timer, incr * orun);
+               if (hrtimer_get_expires_tv64(timer) > now.tv64)
                        return orun;
                /*
                 * This (and the ktime_add() below) is the
                 */
                orun++;
        }
-       timer->expires = ktime_add_safe(timer->expires, interval);
+       hrtimer_add_expires(timer, interval);
  
        return orun;
  }
@@@ -848,7 -848,8 +848,8 @@@ static void enqueue_hrtimer(struct hrti
                 * We dont care about collisions. Nodes with
                 * the same expiry time stay together.
                 */
-               if (timer->expires.tv64 < entry->expires.tv64) {
+               if (hrtimer_get_expires_tv64(timer) <
+                               hrtimer_get_expires_tv64(entry)) {
                        link = &(*link)->rb_left;
                } else {
                        link = &(*link)->rb_right;
@@@ -945,9 -946,10 +946,10 @@@ remove_hrtimer(struct hrtimer *timer, s
  }
  
  /**
-  * hrtimer_start - (re)start an relative timer on the current CPU
+  * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
   * @timer:    the timer to be added
   * @tim:      expiry time
+  * @delta_ns: "slack" range for the timer
   * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
   *
   * Returns:
   *  1 when the timer was active
   */
  int
- hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+                       const enum hrtimer_mode mode)
  {
        struct hrtimer_clock_base *base, *new_base;
        unsigned long flags;
  #endif
        }
  
-       timer->expires = tim;
+       hrtimer_set_expires_range_ns(timer, tim, delta_ns);
  
        timer_stats_hrtimer_set_start_info(timer);
  
  
        return ret;
  }
+ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+ /**
+  * hrtimer_start - (re)start an hrtimer on the current CPU
+  * @timer:    the timer to be added
+  * @tim:      expiry time
+  * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+  *
+  * Returns:
+  *  0 on success
+  *  1 when the timer was active
+  */
+ int
+ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+ {
+       return hrtimer_start_range_ns(timer, tim, 0, mode);
+ }
  EXPORT_SYMBOL_GPL(hrtimer_start);
  
  /**
   * hrtimer_try_to_cancel - try to deactivate a timer
   * @timer:    hrtimer to stop
@@@ -1077,7 -1098,7 +1098,7 @@@ ktime_t hrtimer_get_remaining(const str
        ktime_t rem;
  
        base = lock_hrtimer_base(timer, &flags);
-       rem = ktime_sub(timer->expires, base->get_time());
+       rem = hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &flags);
  
        return rem;
@@@ -1109,7 -1130,7 +1130,7 @@@ ktime_t hrtimer_get_next_event(void
                                continue;
  
                        timer = rb_entry(base->first, struct hrtimer, node);
-                       delta.tv64 = timer->expires.tv64;
+                       delta.tv64 = hrtimer_get_expires_tv64(timer);
                        delta = ktime_sub(delta, base->get_time());
                        if (delta.tv64 < mindelta.tv64)
                                mindelta.tv64 = delta.tv64;
@@@ -1310,10 -1331,23 +1331,23 @@@ void hrtimer_interrupt(struct clock_eve
  
                        timer = rb_entry(node, struct hrtimer, node);
  
-                       if (basenow.tv64 < timer->expires.tv64) {
+                       /*
+                        * The immediate goal for using the softexpires is
+                        * minimizing wakeups, not running timers at the
+                        * earliest interrupt after their soft expiration.
+                        * This allows us to avoid using a Priority Search
+                        * Tree, which can answer a stabbing query for
+                        * overlapping intervals and instead use the simple
+                        * BST we already have.
+                        * We don't add extra wakeups by delaying timers that
+                        * are to the right of a not yet expired timer, because
+                        * that timer will have to trigger a wakeup anyway.
+                        */
+                       if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
                                ktime_t expires;
  
-                               expires = ktime_sub(timer->expires,
+                               expires = ktime_sub(hrtimer_get_expires(timer),
                                                    base->offset);
                                if (expires.tv64 < expires_next.tv64)
                                        expires_next = expires;
                raise_softirq(HRTIMER_SOFTIRQ);
  }
  
+ /**
+  * hrtimer_peek_ahead_timers -- run soft-expired timers now
+  *
+  * hrtimer_peek_ahead_timers will peek at the timer queue of
+  * the current cpu and check if there are any timers for which
+  * the soft expires time has passed. If any such timers exist,
+  * they are run immediately and then removed from the timer queue.
+  *
+  */
+ void hrtimer_peek_ahead_timers(void)
+ {
+       struct tick_device *td;
+       unsigned long flags;
+       if (!hrtimer_hres_active())
+               return;
+       local_irq_save(flags);
+       td = &__get_cpu_var(tick_cpu_device);
+       if (td && td->evtdev)
+               hrtimer_interrupt(td->evtdev);
+       local_irq_restore(flags);
+ }
  static void run_hrtimer_softirq(struct softirq_action *h)
  {
        run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
@@@ -1403,7 -1461,9 +1461,7 @@@ void hrtimer_run_queues(void
                if (!base->first)
                        continue;
  
 -              if (base->get_softirq_time)
 -                      base->softirq_time = base->get_softirq_time();
 -              else if (gettime) {
 +              if (gettime) {
                        hrtimer_get_softirq_time(cpu_base);
                        gettime = 0;
                }
                        struct hrtimer *timer;
  
                        timer = rb_entry(node, struct hrtimer, node);
-                       if (base->softirq_time.tv64 <= timer->expires.tv64)
+                       if (base->softirq_time.tv64 <=
+                                       hrtimer_get_expires_tv64(timer))
                                break;
  
                        if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
@@@ -1462,7 -1523,7 +1521,7 @@@ static int __sched do_nanosleep(struct 
  
        do {
                set_current_state(TASK_INTERRUPTIBLE);
-               hrtimer_start(&t->timer, t->timer.expires, mode);
+               hrtimer_start_expires(&t->timer, mode);
                if (!hrtimer_active(&t->timer))
                        t->task = NULL;
  
@@@ -1484,7 -1545,7 +1543,7 @@@ static int update_rmtp(struct hrtimer *
        struct timespec rmt;
        ktime_t rem;
  
-       rem = ktime_sub(timer->expires, timer->base->get_time());
+       rem = hrtimer_expires_remaining(timer);
        if (rem.tv64 <= 0)
                return 0;
        rmt = ktime_to_timespec(rem);
@@@ -1503,7 -1564,7 +1562,7 @@@ long __sched hrtimer_nanosleep_restart(
  
        hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
                                HRTIMER_MODE_ABS);
-       t.timer.expires.tv64 = restart->nanosleep.expires;
+       hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
  
        if (do_nanosleep(&t, HRTIMER_MODE_ABS))
                goto out;
@@@ -1528,9 -1589,14 +1587,14 @@@ long hrtimer_nanosleep(struct timespec 
        struct restart_block *restart;
        struct hrtimer_sleeper t;
        int ret = 0;
+       unsigned long slack;
+       slack = current->timer_slack_ns;
+       if (rt_task(current))
+               slack = 0;
  
        hrtimer_init_on_stack(&t.timer, clockid, mode);
-       t.timer.expires = timespec_to_ktime(*rqtp);
+       hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
        if (do_nanosleep(&t, mode))
                goto out;
  
        restart->fn = hrtimer_nanosleep_restart;
        restart->nanosleep.index = t.timer.base->index;
        restart->nanosleep.rmtp = rmtp;
-       restart->nanosleep.expires = t.timer.expires.tv64;
+       restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
  
        ret = -ERESTART_RESTARTBLOCK;
  out:
@@@ -1686,11 -1752,9 +1750,11 @@@ static void migrate_hrtimers(int cpu
        new_base = &get_cpu_var(hrtimer_bases);
  
        tick_cancel_sched_timer(cpu);
 -
 -      local_irq_disable();
 -      spin_lock(&new_base->lock);
 +      /*
 +       * The caller is globally serialized and nobody else
 +       * takes two locks at once, deadlock is not possible.
 +       */
 +      spin_lock_irq(&new_base->lock);
        spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
  
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                raise = 1;
  
        spin_unlock(&old_base->lock);
 -      spin_unlock(&new_base->lock);
 -      local_irq_enable();
 +      spin_unlock_irq(&new_base->lock);
        put_cpu_var(hrtimer_bases);
  
        if (raise)
@@@ -1752,3 -1817,103 +1816,103 @@@ void __init hrtimers_init(void
  #endif
  }
  
+ /**
+  * schedule_hrtimeout_range - sleep until timeout
+  * @expires:  timeout value (ktime_t)
+  * @delta:    slack in expires timeout, in nanoseconds
+  * @mode:     timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+  *
+  * Make the current task sleep until the given expiry time has
+  * elapsed. The routine will return immediately unless
+  * the current task state has been set (see set_current_state()).
+  *
+  * The @delta argument gives the kernel the freedom to schedule the
+  * actual wakeup to a time that is both power and performance friendly.
+  * The kernel gives the normal best effort behavior for "@expires+@delta",
+  * but may decide to fire the timer earlier than that, though no earlier
+  * than @expires.
+  *
+  * You can set the task state as follows -
+  *
+  * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+  * pass before the routine returns.
+  *
+  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+  * delivered to the current task.
+  *
+  * The current task state is guaranteed to be TASK_RUNNING when this
+  * routine returns.
+  *
+  * Returns 0 when the timer has expired otherwise -EINTR
+  */
+ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+                              const enum hrtimer_mode mode)
+ {
+       struct hrtimer_sleeper t;
+       /*
+        * Optimize when a zero timeout value is given. It does not
+        * matter whether this is an absolute or a relative time.
+        */
+       if (expires && !expires->tv64) {
+               __set_current_state(TASK_RUNNING);
+               return 0;
+       }
+       /*
+        * A NULL parameter means "infinite"
+        */
+       if (!expires) {
+               schedule();
+               __set_current_state(TASK_RUNNING);
+               return -EINTR;
+       }
+       hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
+       hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
+       hrtimer_init_sleeper(&t, current);
+       hrtimer_start_expires(&t.timer, mode);
+       if (!hrtimer_active(&t.timer))
+               t.task = NULL;
+       if (likely(t.task))
+               schedule();
+       hrtimer_cancel(&t.timer);
+       destroy_hrtimer_on_stack(&t.timer);
+       __set_current_state(TASK_RUNNING);
+       return !t.task ? 0 : -EINTR;
+ }
+ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+ /**
+  * schedule_hrtimeout - sleep until timeout
+  * @expires:  timeout value (ktime_t)
+  * @mode:     timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+  *
+  * Make the current task sleep until the given expiry time has
+  * elapsed. The routine will return immediately unless
+  * the current task state has been set (see set_current_state()).
+  *
+  * You can set the task state as follows -
+  *
+  * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+  * pass before the routine returns.
+  *
+  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+  * delivered to the current task.
+  *
+  * The current task state is guaranteed to be TASK_RUNNING when this
+  * routine returns.
+  *
+  * Returns 0 when the timer has expired otherwise -EINTR
+  */
+ int __sched schedule_hrtimeout(ktime_t *expires,
+                              const enum hrtimer_mode mode)
+ {
+       return schedule_hrtimeout_range(expires, 0, mode);
+ }
+ EXPORT_SYMBOL_GPL(schedule_hrtimeout);
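
schedule_hrtimeout_range() above is the primitive that lets poll()/select() and driver code sleep with an explicit slack instead of a single hard deadline. A minimal sketch of a caller sleeping for roughly 10 ms while allowing the wakeup to be deferred by up to 1 ms (the function name and the constants are illustrative assumptions):

        #include <linux/hrtimer.h>
        #include <linux/sched.h>
        #include <linux/time.h>

        /* Sleep ~10 ms; the wakeup may land anywhere in [10 ms, 11 ms). */
        static int example_relax(void)
        {
                ktime_t timeout = ktime_set(0, 10 * NSEC_PER_MSEC);

                set_current_state(TASK_INTERRUPTIBLE);
                return schedule_hrtimeout_range(&timeout, NSEC_PER_MSEC,
                                                HRTIMER_MODE_REL);
        }

As documented above, the return value is 0 when the timeout expired and -EINTR when a signal ended the sleep early.
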
diff --combined kernel/posix-timers.c
index b931d7cedbfa9fd70a47d07535d3b791661ec39a,ee204586149a0c8ab6144808c0610320545f7b8f..5e79c662294bf542750af232be67a1e92bed858d
@@@ -222,15 -222,6 +222,15 @@@ static int posix_ktime_get_ts(clockid_
        return 0;
  }
  
 +/*
 + * Get monotonic time for posix timers
 + */
 +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
 +{
 +      getrawmonotonic(tp);
 +      return 0;
 +}
 +
  /*
   * Initialize everything, well, just everything in Posix clocks/timers ;)
   */
@@@ -244,15 -235,9 +244,15 @@@ static __init int init_posix_timers(voi
                .clock_get = posix_ktime_get_ts,
                .clock_set = do_posix_clock_nosettime,
        };
 +      struct k_clock clock_monotonic_raw = {
 +              .clock_getres = hrtimer_get_res,
 +              .clock_get = posix_get_monotonic_raw,
 +              .clock_set = do_posix_clock_nosettime,
 +      };
  
        register_posix_clock(CLOCK_REALTIME, &clock_realtime);
        register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 +      register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
  
        posix_timers_cache = kmem_cache_create("posix_timers_cache",
                                        sizeof (struct k_itimer), 0, SLAB_PANIC,
@@@ -313,7 -298,6 +313,7 @@@ void do_schedule_next_timer(struct sigi
  
  int posix_timer_event(struct k_itimer *timr, int si_private)
  {
 +      int shared, ret;
        /*
         * FIXME: if ->sigq is queued we can race with
         * dequeue_signal()->do_schedule_next_timer().
         */
        timr->sigq->info.si_sys_private = si_private;
  
 -      timr->sigq->info.si_signo = timr->it_sigev_signo;
 -      timr->sigq->info.si_code = SI_TIMER;
 -      timr->sigq->info.si_tid = timr->it_id;
 -      timr->sigq->info.si_value = timr->it_sigev_value;
 -
 -      if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
 -              struct task_struct *leader;
 -              int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
 -
 -              if (likely(ret >= 0))
 -                      return ret;
 -
 -              timr->it_sigev_notify = SIGEV_SIGNAL;
 -              leader = timr->it_process->group_leader;
 -              put_task_struct(timr->it_process);
 -              timr->it_process = leader;
 -      }
 -
 -      return send_sigqueue(timr->sigq, timr->it_process, 1);
 +      shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
 +      ret = send_sigqueue(timr->sigq, timr->it_process, shared);
 +      /* If we failed to send the signal the timer stops. */
 +      return ret > 0;
  }
  EXPORT_SYMBOL_GPL(posix_timer_event);
  
@@@ -469,9 -468,11 +469,9 @@@ sys_timer_create(const clockid_t which_
                 struct sigevent __user *timer_event_spec,
                 timer_t __user * created_timer_id)
  {
 -      int error = 0;
 -      struct k_itimer *new_timer = NULL;
 -      int new_timer_id;
 -      struct task_struct *process = NULL;
 -      unsigned long flags;
 +      struct k_itimer *new_timer;
 +      int error, new_timer_id;
 +      struct task_struct *process;
        sigevent_t event;
        int it_id_set = IT_ID_NOT_SET;
  
                goto out;
        }
        spin_lock_irq(&idr_lock);
 -      error = idr_get_new(&posix_timers_id, (void *) new_timer,
 -                          &new_timer_id);
 +      error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
        spin_unlock_irq(&idr_lock);
 -      if (error == -EAGAIN)
 -              goto retry;
 -      else if (error) {
 +      if (error) {
 +              if (error == -EAGAIN)
 +                      goto retry;
                /*
                 * Weird looking, but we return EAGAIN if the IDR is
                 * full (proper POSIX return value for this)
                        error = -EFAULT;
                        goto out;
                }
 -              new_timer->it_sigev_notify = event.sigev_notify;
 -              new_timer->it_sigev_signo = event.sigev_signo;
 -              new_timer->it_sigev_value = event.sigev_value;
 -
 -              read_lock(&tasklist_lock);
 -              if ((process = good_sigevent(&event))) {
 -                      /*
 -                       * We may be setting up this process for another
 -                       * thread.  It may be exiting.  To catch this
 -                       * case the we check the PF_EXITING flag.  If
 -                       * the flag is not set, the siglock will catch
 -                       * him before it is too late (in exit_itimers).
 -                       *
 -                       * The exec case is a bit more invloved but easy
 -                       * to code.  If the process is in our thread
 -                       * group (and it must be or we would not allow
 -                       * it here) and is doing an exec, it will cause
 -                       * us to be killed.  In this case it will wait
 -                       * for us to die which means we can finish this
 -                       * linkage with our last gasp. I.e. no code :)
 -                       */
 -                      spin_lock_irqsave(&process->sighand->siglock, flags);
 -                      if (!(process->flags & PF_EXITING)) {
 -                              new_timer->it_process = process;
 -                              list_add(&new_timer->list,
 -                                       &process->signal->posix_timers);
 -                              if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 -                                      get_task_struct(process);
 -                              spin_unlock_irqrestore(&process->sighand->siglock, flags);
 -                      } else {
 -                              spin_unlock_irqrestore(&process->sighand->siglock, flags);
 -                              process = NULL;
 -                      }
 -              }
 -              read_unlock(&tasklist_lock);
 +              rcu_read_lock();
 +              process = good_sigevent(&event);
 +              if (process)
 +                      get_task_struct(process);
 +              rcu_read_unlock();
                if (!process) {
                        error = -EINVAL;
                        goto out;
                }
        } else {
 -              new_timer->it_sigev_notify = SIGEV_SIGNAL;
 -              new_timer->it_sigev_signo = SIGALRM;
 -              new_timer->it_sigev_value.sival_int = new_timer->it_id;
 +              event.sigev_notify = SIGEV_SIGNAL;
 +              event.sigev_signo = SIGALRM;
 +              event.sigev_value.sival_int = new_timer->it_id;
                process = current->group_leader;
 -              spin_lock_irqsave(&process->sighand->siglock, flags);
 -              new_timer->it_process = process;
 -              list_add(&new_timer->list, &process->signal->posix_timers);
 -              spin_unlock_irqrestore(&process->sighand->siglock, flags);
 +              get_task_struct(process);
        }
  
 +      new_timer->it_sigev_notify     = event.sigev_notify;
 +      new_timer->sigq->info.si_signo = event.sigev_signo;
 +      new_timer->sigq->info.si_value = event.sigev_value;
 +      new_timer->sigq->info.si_tid   = new_timer->it_id;
 +      new_timer->sigq->info.si_code  = SI_TIMER;
 +
 +      spin_lock_irq(&current->sighand->siglock);
 +      new_timer->it_process = process;
 +      list_add(&new_timer->list, &current->signal->posix_timers);
 +      spin_unlock_irq(&current->sighand->siglock);
 +
 +      return 0;
        /*
         * In the case of the timer belonging to another task, after
         * the task is unlocked, the timer is owned by the other task
         * and may cease to exist at any time.  Don't use or modify
         * new_timer after the unlock call.
         */
 -
  out:
 -      if (error)
 -              release_posix_timer(new_timer, it_id_set);
 -
 +      release_posix_timer(new_timer, it_id_set);
        return error;
  }
  
   * the find to the timer lock.  To avoid a deadlock, the timer id MUST
   * be released without holding the timer lock.
   */
 -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
 +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
  {
        struct k_itimer *timr;
        /*
         * flags part over to the timer lock.  Must not let interrupts in
         * while we are moving the lock.
         */
 -
        spin_lock_irqsave(&idr_lock, *flags);
 -      timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
 +      timr = idr_find(&posix_timers_id, (int)timer_id);
        if (timr) {
                spin_lock(&timr->it_lock);
 -
 -              if ((timr->it_id != timer_id) || !(timr->it_process) ||
 -                              !same_thread_group(timr->it_process, current)) {
 -                      spin_unlock(&timr->it_lock);
 -                      spin_unlock_irqrestore(&idr_lock, *flags);
 -                      timr = NULL;
 -              } else
 +              if (timr->it_process &&
 +                  same_thread_group(timr->it_process, current)) {
                        spin_unlock(&idr_lock);
 -      } else
 -              spin_unlock_irqrestore(&idr_lock, *flags);
 +                      return timr;
 +              }
 +              spin_unlock(&timr->it_lock);
 +      }
 +      spin_unlock_irqrestore(&idr_lock, *flags);
  
 -      return timr;
 +      return NULL;
  }
  
  /*
@@@ -639,7 -668,7 +639,7 @@@ common_timer_get(struct k_itimer *timr
            (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
                timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
  
-       remaining = ktime_sub(timer->expires, now);
+       remaining = ktime_sub(hrtimer_get_expires(timer), now);
        /* Return 0 only, when the timer is expired and not pending */
        if (remaining.tv64 <= 0) {
                /*
@@@ -733,7 -762,7 +733,7 @@@ common_timer_set(struct k_itimer *timr
        hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
        timr->it.real.timer.function = posix_timer_fn;
  
-       timer->expires = timespec_to_ktime(new_setting->it_value);
+       hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
  
        /* Convert interval */
        timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
        if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
                /* Setup correct expiry time for relative timers */
                if (mode == HRTIMER_MODE_REL) {
-                       timer->expires =
-                               ktime_add_safe(timer->expires,
-                                              timer->base->get_time());
+                       hrtimer_add_expires(timer, timer->base->get_time());
                }
                return 0;
        }
  
-       hrtimer_start(timer, timer->expires, mode);
+       hrtimer_start_expires(timer, mode);
        return 0;
  }
  
@@@ -833,7 -860,8 +831,7 @@@ retry_delete
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
 -      if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 -              put_task_struct(timer->it_process);
 +      put_task_struct(timer->it_process);
        timer->it_process = NULL;
  
        unlock_timer(timer, flags);
@@@ -860,7 -888,8 +858,7 @@@ retry_delete
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
 -      if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 -              put_task_struct(timer->it_process);
 +      put_task_struct(timer->it_process);
        timer->it_process = NULL;
  
        unlock_timer(timer, flags);
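
With the clock_monotonic_raw k_clock registered above, CLOCK_MONOTONIC_RAW becomes reachable from userspace through the ordinary POSIX clock syscalls. A minimal userspace sketch, assuming a C library that already exposes the constant (an older libc would have to pass the raw value 4 from include/linux/time.h) and, on glibc of this era, linking with -lrt:

        #include <stdio.h>
        #include <time.h>

        int main(void)
        {
                struct timespec ts;

                /* Monotonic time that is not slewed by NTP adjustments. */
                if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0) {
                        perror("clock_gettime");
                        return 1;
                }
                printf("raw monotonic: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
                return 0;
        }
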
diff --combined kernel/sched.c
index d906f72b42d23ae1d8c2355d9b605e5fd0761eaa,eb3c72953615c06d19b08be2480a3144b259c55d..bfa87918380ffa3005847c832a3fd86fdf1c54ce
@@@ -71,7 -71,6 +71,7 @@@
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
 +#include <trace/sched.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
@@@ -227,9 -226,8 +227,8 @@@ static void start_rt_bandwidth(struct r
  
                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start(&rt_b->rt_period_timer,
-                             rt_b->rt_period_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&rt_b->rt_period_timer,
+                               HRTIMER_MODE_ABS);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
  }
@@@ -1064,7 -1062,7 +1063,7 @@@ static void hrtick_start(struct rq *rq
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
  
-       timer->expires = time;
+       hrtimer_set_expires(timer, time);
  
        if (rq == this_rq()) {
                hrtimer_restart(timer);
@@@ -1937,7 -1935,6 +1936,7 @@@ unsigned long wait_task_inactive(struc
                 * just go back and repeat.
                 */
                rq = task_rq_lock(p, &flags);
 +              trace_sched_wait_task(rq, p);
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
                ncsw = 0;
@@@ -2299,7 -2296,9 +2298,7 @@@ out_activate
        success = 1;
  
  out_running:
 -      trace_mark(kernel_sched_wakeup,
 -              "pid %d state %ld ## rq %p task %p rq->curr %p",
 -              p->pid, p->state, rq, p, rq->curr);
 +      trace_sched_wakeup(rq, p);
        check_preempt_curr(rq, p, sync);
  
        p->state = TASK_RUNNING;
@@@ -2432,7 -2431,9 +2431,7 @@@ void wake_up_new_task(struct task_struc
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
 -      trace_mark(kernel_sched_wakeup_new,
 -              "pid %d state %ld ## rq %p task %p rq->curr %p",
 -              p->pid, p->state, rq, p, rq->curr);
 +      trace_sched_wakeup_new(rq, p);
        check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@@ -2605,7 -2606,11 +2604,7 @@@ context_switch(struct rq *rq, struct ta
        struct mm_struct *mm, *oldmm;
  
        prepare_task_switch(rq, prev, next);
 -      trace_mark(kernel_sched_schedule,
 -              "prev_pid %d next_pid %d prev_state %ld "
 -              "## rq %p prev %p next %p",
 -              prev->pid, next->pid, prev->state,
 -              rq, prev, next);
 +      trace_sched_switch(rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@@ -2845,7 -2850,6 +2844,7 @@@ static void sched_migrate_task(struct t
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
 +      trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@@ -4047,26 -4051,23 +4046,26 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
  EXPORT_PER_CPU_SYMBOL(kstat);
  
  /*
 - * Return p->sum_exec_runtime plus any more ns on the sched_clock
 - * that have not yet been banked in case the task is currently running.
 + * Return any ns on the sched_clock that have not yet been banked in
 + * @p in case that task is currently running.
   */
 -unsigned long long task_sched_runtime(struct task_struct *p)
 +unsigned long long task_delta_exec(struct task_struct *p)
  {
        unsigned long flags;
 -      u64 ns, delta_exec;
        struct rq *rq;
 +      u64 ns = 0;
  
        rq = task_rq_lock(p, &flags);
 -      ns = p->se.sum_exec_runtime;
 +
        if (task_current(rq, p)) {
 +              u64 delta_exec;
 +
                update_rq_clock(rq);
                delta_exec = rq->clock - p->se.exec_start;
                if ((s64)delta_exec > 0)
 -                      ns += delta_exec;
 +                      ns = delta_exec;
        }
 +
        task_rq_unlock(rq, &flags);
  
        return ns;
@@@ -4083,7 -4084,6 +4082,7 @@@ void account_user_time(struct task_stru
        cputime64_t tmp;
  
        p->utime = cputime_add(p->utime, cputime);
 +      account_group_user_time(p, cputime);
  
        /* Add user time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@@ -4108,7 -4108,6 +4107,7 @@@ static void account_guest_time(struct t
        tmp = cputime_to_cputime64(cputime);
  
        p->utime = cputime_add(p->utime, cputime);
 +      account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
  
        cpustat->user = cputime64_add(cpustat->user, tmp);
@@@ -4144,7 -4143,6 +4143,7 @@@ void account_system_time(struct task_st
        }
  
        p->stime = cputime_add(p->stime, cputime);
 +      account_group_system_time(p, cputime);
  
        /* Add system time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@@ -4186,7 -4184,6 +4185,7 @@@ void account_steal_time(struct task_str
  
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
 +              account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
diff --combined kernel/sys.c
index 53879cdae483b6371543bdeb94818292e91f07bb,fc71f99fb469a22da88b08263e2609409566173b..31deba8f7d160c19bf262b1be13272e2ede1050d
@@@ -853,28 -853,38 +853,28 @@@ asmlinkage long sys_setfsgid(gid_t gid
        return old_fsgid;
  }
  
 +void do_sys_times(struct tms *tms)
 +{
 +      struct task_cputime cputime;
 +      cputime_t cutime, cstime;
 +
 +      spin_lock_irq(&current->sighand->siglock);
 +      thread_group_cputime(current, &cputime);
 +      cutime = current->signal->cutime;
 +      cstime = current->signal->cstime;
 +      spin_unlock_irq(&current->sighand->siglock);
 +      tms->tms_utime = cputime_to_clock_t(cputime.utime);
 +      tms->tms_stime = cputime_to_clock_t(cputime.stime);
 +      tms->tms_cutime = cputime_to_clock_t(cutime);
 +      tms->tms_cstime = cputime_to_clock_t(cstime);
 +}
 +
  asmlinkage long sys_times(struct tms __user * tbuf)
  {
 -      /*
 -       *      In the SMP world we might just be unlucky and have one of
 -       *      the times increment as we use it. Since the value is an
 -       *      atomically safe type this is just fine. Conceptually its
 -       *      as if the syscall took an instant longer to occur.
 -       */
        if (tbuf) {
                struct tms tmp;
 -              struct task_struct *tsk = current;
 -              struct task_struct *t;
 -              cputime_t utime, stime, cutime, cstime;
 -
 -              spin_lock_irq(&tsk->sighand->siglock);
 -              utime = tsk->signal->utime;
 -              stime = tsk->signal->stime;
 -              t = tsk;
 -              do {
 -                      utime = cputime_add(utime, t->utime);
 -                      stime = cputime_add(stime, t->stime);
 -                      t = next_thread(t);
 -              } while (t != tsk);
 -
 -              cutime = tsk->signal->cutime;
 -              cstime = tsk->signal->cstime;
 -              spin_unlock_irq(&tsk->sighand->siglock);
 -
 -              tmp.tms_utime = cputime_to_clock_t(utime);
 -              tmp.tms_stime = cputime_to_clock_t(stime);
 -              tmp.tms_cutime = cputime_to_clock_t(cutime);
 -              tmp.tms_cstime = cputime_to_clock_t(cstime);
 +
 +              do_sys_times(&tmp);
                if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
                        return -EFAULT;
        }
@@@ -1439,6 -1449,7 +1439,6 @@@ asmlinkage long sys_old_getrlimit(unsig
  asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
  {
        struct rlimit new_rlim, *old_rlim;
 -      unsigned long it_prof_secs;
        int retval;
  
        if (resource >= RLIM_NLIMITS)
        if (new_rlim.rlim_cur == RLIM_INFINITY)
                goto out;
  
 -      it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
 -      if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
 -              unsigned long rlim_cur = new_rlim.rlim_cur;
 -              cputime_t cputime;
 -
 -              cputime = secs_to_cputime(rlim_cur);
 -              read_lock(&tasklist_lock);
 -              spin_lock_irq(&current->sighand->siglock);
 -              set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 -              spin_unlock_irq(&current->sighand->siglock);
 -              read_unlock(&tasklist_lock);
 -      }
 +      update_rlimit_cpu(new_rlim.rlim_cur);
  out:
        return 0;
  }
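The sys_setrlimit() hunk above folds the open-coded RLIMIT_CPU handling into a single update_rlimit_cpu() call. A minimal sketch of what that helper is assumed to do, reconstructed from the lines it replaces (the body, field accesses and locking are assumptions, not taken from this diff):

	void update_rlimit_cpu(unsigned long rlim_new)
	{
		cputime_t cputime = secs_to_cputime(rlim_new);
		unsigned long prof_secs =
			cputime_to_secs(current->signal->it_prof_expires);

		/* Re-arm only if no profiling timer is pending or the new limit
		 * is not later than the one already armed. */
		if (prof_secs == 0 || rlim_new <= prof_secs) {
			spin_lock_irq(&current->sighand->siglock);
			set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
			spin_unlock_irq(&current->sighand->siglock);
		}
	}
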
   *
   */
  
 -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
 -                                   cputime_t *utimep, cputime_t *stimep)
 +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
  {
 -      *utimep = cputime_add(*utimep, t->utime);
 -      *stimep = cputime_add(*stimep, t->stime);
        r->ru_nvcsw += t->nvcsw;
        r->ru_nivcsw += t->nivcsw;
        r->ru_minflt += t->min_flt;
@@@ -1545,13 -1570,12 +1545,13 @@@ static void k_getrusage(struct task_str
        struct task_struct *t;
        unsigned long flags;
        cputime_t utime, stime;
 +      struct task_cputime cputime;
  
        memset((char *) r, 0, sizeof *r);
        utime = stime = cputime_zero;
  
        if (who == RUSAGE_THREAD) {
 -              accumulate_thread_rusage(p, r, &utime, &stime);
 +              accumulate_thread_rusage(p, r);
                goto out;
        }
  
                                break;
  
                case RUSAGE_SELF:
 -                      utime = cputime_add(utime, p->signal->utime);
 -                      stime = cputime_add(stime, p->signal->stime);
 +                      thread_group_cputime(p, &cputime);
 +                      utime = cputime_add(utime, cputime.utime);
 +                      stime = cputime_add(stime, cputime.stime);
                        r->ru_nvcsw += p->signal->nvcsw;
                        r->ru_nivcsw += p->signal->nivcsw;
                        r->ru_minflt += p->signal->min_flt;
                        r->ru_oublock += p->signal->oublock;
                        t = p;
                        do {
 -                              accumulate_thread_rusage(t, r, &utime, &stime);
 +                              accumulate_thread_rusage(t, r);
                                t = next_thread(t);
                        } while (t != p);
                        break;
@@@ -1716,6 -1739,16 +1716,16 @@@ asmlinkage long sys_prctl(int option, u
                case PR_SET_TSC:
                        error = SET_TSC_CTL(arg2);
                        break;
+               case PR_GET_TIMERSLACK:
+                       error = current->timer_slack_ns;
+                       break;
+               case PR_SET_TIMERSLACK:
+                       if (arg2 <= 0)
+                               current->timer_slack_ns =
+                                       current->default_timer_slack_ns;
+                       else
+                               current->timer_slack_ns = arg2;
+                       break;
                default:
                        error = -EINVAL;
                        break;
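The sys_prctl() hunk above adds PR_GET_TIMERSLACK and PR_SET_TIMERSLACK. A hedged userspace sketch of how a process might use them once the headers from this series export the constants (the 50 us value is only an illustration):

	#include <stdio.h>
	#include <sys/prctl.h>

	int main(void)
	{
		/* PR_GET_TIMERSLACK returns the current slack, in nanoseconds. */
		long slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);

		printf("timer slack: %ld ns\n", slack);

		/* Widen the slack to ~50 us; passing 0 restores the default. */
		prctl(PR_SET_TIMERSLACK, 50000, 0, 0, 0);
		prctl(PR_SET_TIMERSLACK, 0, 0, 0, 0);
		return 0;
	}
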
diff --combined kernel/time/ntp.c
index 1a20715bfd6e4854e96e96eb1541f792767aeaa5,9c114b726ab3353e75fa222e8597e52db9d34a82..8ff15e5d486b137e65f96c64b5a206315bab7ee4
  
  #include <linux/mm.h>
  #include <linux/time.h>
 -#include <linux/timer.h>
  #include <linux/timex.h>
  #include <linux/jiffies.h>
  #include <linux/hrtimer.h>
  #include <linux/capability.h>
  #include <linux/math64.h>
  #include <linux/clocksource.h>
 +#include <linux/workqueue.h>
  #include <asm/timex.h>
  
  /*
@@@ -142,8 -142,7 +142,7 @@@ static enum hrtimer_restart ntp_leap_se
                time_state = TIME_OOP;
                printk(KERN_NOTICE "Clock: "
                       "inserting leap second 23:59:60 UTC\n");
-               leap_timer.expires = ktime_add_ns(leap_timer.expires,
-                                                 NSEC_PER_SEC);
+               hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
                res = HRTIMER_RESTART;
                break;
        case TIME_DEL:
@@@ -218,11 -217,11 +217,11 @@@ void second_overflow(void
  /* Disable the cmos update - used by virtualization and embedded */
  int no_sync_cmos_clock  __read_mostly;
  
 -static void sync_cmos_clock(unsigned long dummy);
 +static void sync_cmos_clock(struct work_struct *work);
  
 -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
 +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
  
 -static void sync_cmos_clock(unsigned long dummy)
 +static void sync_cmos_clock(struct work_struct *work)
  {
        struct timespec now, next;
        int fail = 1;
                next.tv_sec++;
                next.tv_nsec -= NSEC_PER_SEC;
        }
 -      mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
 +      schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
  }
  
  static void notify_cmos_timer(void)
  {
        if (!no_sync_cmos_clock)
 -              mod_timer(&sync_cmos_timer, jiffies + 1);
 +              schedule_delayed_work(&sync_cmos_work, 0);
  }
  
  #else
@@@ -277,50 -276,38 +276,50 @@@ static inline void notify_cmos_timer(vo
  int do_adjtimex(struct timex *txc)
  {
        struct timespec ts;
 -      long save_adjust, sec;
        int result;
  
 -      /* In order to modify anything, you gotta be super-user! */
 -      if (txc->modes && !capable(CAP_SYS_TIME))
 -              return -EPERM;
 -
 -      /* Now we validate the data before disabling interrupts */
 -
 -      if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
 +      /* Validate the data before disabling interrupts */
 +      if (txc->modes & ADJ_ADJTIME) {
                /* singleshot must not be used with any other mode bits */
 -              if (txc->modes & ~ADJ_OFFSET_SS_READ)
 +              if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
                        return -EINVAL;
 +              if (!(txc->modes & ADJ_OFFSET_READONLY) &&
 +                  !capable(CAP_SYS_TIME))
 +                      return -EPERM;
 +      } else {
 +              /* In order to modify anything, you gotta be super-user! */
 +               if (txc->modes && !capable(CAP_SYS_TIME))
 +                      return -EPERM;
 +
 +              /* if the quartz is off by more than 10% something is VERY wrong! */
 +              if (txc->modes & ADJ_TICK &&
 +                  (txc->tick <  900000/USER_HZ ||
 +                   txc->tick > 1100000/USER_HZ))
 +                              return -EINVAL;
 +
 +              if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
 +                      hrtimer_cancel(&leap_timer);
        }
  
 -      /* if the quartz is off by more than 10% something is VERY wrong ! */
 -      if (txc->modes & ADJ_TICK)
 -              if (txc->tick <  900000/USER_HZ ||
 -                  txc->tick > 1100000/USER_HZ)
 -                      return -EINVAL;
 -
 -      if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
 -              hrtimer_cancel(&leap_timer);
        getnstimeofday(&ts);
  
        write_seqlock_irq(&xtime_lock);
  
 -      /* Save for later - semantics of adjtime is to return old value */
 -      save_adjust = time_adjust;
 -
        /* If there are input parameters, then process them */
 +      if (txc->modes & ADJ_ADJTIME) {
 +              long save_adjust = time_adjust;
 +
 +              if (!(txc->modes & ADJ_OFFSET_READONLY)) {
 +                      /* adjtime() is independent from ntp_adjtime() */
 +                      time_adjust = txc->offset;
 +                      ntp_update_frequency();
 +              }
 +              txc->offset = save_adjust;
 +              goto adj_done;
 +      }
        if (txc->modes) {
 +              long sec;
 +
                if (txc->modes & ADJ_STATUS) {
                        if ((time_status & STA_PLL) &&
                            !(txc->status & STA_PLL)) {
                if (txc->modes & ADJ_TAI && txc->constant > 0)
                        time_tai = txc->constant;
  
 -              if (txc->modes & ADJ_OFFSET) {
 -                      if (txc->modes == ADJ_OFFSET_SINGLESHOT)
 -                              /* adjtime() is independent from ntp_adjtime() */
 -                              time_adjust = txc->offset;
 -                      else
 -                              ntp_update_offset(txc->offset);
 -              }
 +              if (txc->modes & ADJ_OFFSET)
 +                      ntp_update_offset(txc->offset);
                if (txc->modes & ADJ_TICK)
                        tick_usec = txc->tick;
  
                        ntp_update_frequency();
        }
  
 +      txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
 +                                NTP_SCALE_SHIFT);
 +      if (!(time_status & STA_NANO))
 +              txc->offset /= NSEC_PER_USEC;
 +
 +adj_done:
        result = time_state;    /* mostly `TIME_OK' */
        if (time_status & (STA_UNSYNC|STA_CLOCKERR))
                result = TIME_ERROR;
  
 -      if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
 -          (txc->modes == ADJ_OFFSET_SS_READ))
 -              txc->offset = save_adjust;
 -      else {
 -              txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
 -                                        NTP_SCALE_SHIFT);
 -              if (!(time_status & STA_NANO))
 -                      txc->offset /= NSEC_PER_USEC;
 -      }
 -      txc->freq          = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
 -                                       (s64)PPM_SCALE_INV,
 -                                       NTP_SCALE_SHIFT);
 +      txc->freq          = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
 +                                       (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
        txc->maxerror      = time_maxerror;
        txc->esterror      = time_esterror;
        txc->status        = time_status;
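The reworked do_adjtimex() above keeps adjtime()-style single-shot slewing (the ADJ_ADJTIME path) separate from the ntp_adjtime() parameters. A hedged userspace sketch of the classic single-shot call, whose returned offset is the previously pending adjustment saved in save_adjust:

	#include <stdio.h>
	#include <sys/timex.h>

	int main(void)
	{
		struct timex tx = { 0 };

		tx.modes  = ADJ_OFFSET_SINGLESHOT;   /* adjtime()-style slew */
		tx.offset = 100000;                  /* slew by 100 ms, in microseconds */

		if (adjtimex(&tx) < 0)               /* modifying needs CAP_SYS_TIME */
			perror("adjtimex");
		else
			printf("previously pending adjustment: %ld us\n", tx.offset);
		return 0;
	}
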
diff --combined kernel/time/tick-sched.c
index 727c1ae0517ae68f02b6ba6fa0830b45a6213aab,a547be11cf976c9abba879b4fcd1c025c78ca78d..5bbb1044f8473ff98bd3fdfe2c93a1c0798ed8e7
@@@ -155,7 -155,7 +155,7 @@@ void tick_nohz_update_jiffies(void
        touch_softlockup_watchdog();
  }
  
 -void tick_nohz_stop_idle(int cpu)
 +static void tick_nohz_stop_idle(int cpu)
  {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
  
@@@ -300,7 -300,7 +300,7 @@@ void tick_nohz_stop_sched_tick(int inid
                                goto out;
                        }
  
-                       ts->idle_tick = ts->sched_timer.expires;
+                       ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
                        ts->tick_stopped = 1;
                        ts->idle_jiffies = last_jiffies;
                        rcu_enter_nohz();
@@@ -377,32 -377,6 +377,32 @@@ ktime_t tick_nohz_get_sleep_length(void
        return ts->sleep_length;
  }
  
-       ts->sched_timer.expires = ts->idle_tick;
 +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 +{
 +      hrtimer_cancel(&ts->sched_timer);
-                       hrtimer_start(&ts->sched_timer,
-                                     ts->sched_timer.expires,
++      hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
 +
 +      while (1) {
 +              /* Forward the time to expire in the future */
 +              hrtimer_forward(&ts->sched_timer, now, tick_period);
 +
 +              if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-                       if (!tick_program_event(ts->sched_timer.expires, 0))
++                      hrtimer_start_expires(&ts->sched_timer,
 +                                    HRTIMER_MODE_ABS);
 +                      /* Check, if the timer was already in the past */
 +                      if (hrtimer_active(&ts->sched_timer))
 +                              break;
 +              } else {
++                      if (!tick_program_event(
++                              hrtimer_get_expires(&ts->sched_timer), 0))
 +                              break;
 +              }
 +              /* Update jiffies and reread time */
 +              tick_do_update_jiffies64(now);
 +              now = ktime_get();
 +      }
 +}
 +
  /**
   * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
   *
@@@ -456,14 -430,35 +456,16 @@@ void tick_nohz_restart_sched_tick(void
         */
        ts->tick_stopped  = 0;
        ts->idle_exittime = now;
 -      hrtimer_cancel(&ts->sched_timer);
 -      hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
 -      while (1) {
 -              /* Forward the time to expire in the future */
 -              hrtimer_forward(&ts->sched_timer, now, tick_period);
 +      tick_nohz_restart(ts, now);
 -              if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 -                      hrtimer_start_expires(&ts->sched_timer,
 -                                    HRTIMER_MODE_ABS);
 -                      /* Check, if the timer was already in the past */
 -                      if (hrtimer_active(&ts->sched_timer))
 -                              break;
 -              } else {
 -                      if (!tick_program_event(
 -                              hrtimer_get_expires(&ts->sched_timer), 0))
 -                              break;
 -              }
 -              /* Update jiffies and reread time */
 -              tick_do_update_jiffies64(now);
 -              now = ktime_get();
 -      }
        local_irq_enable();
  }
  
  static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
  {
        hrtimer_forward(&ts->sched_timer, now, tick_period);
-       return tick_program_event(ts->sched_timer.expires, 0);
+       return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
  }
  
  /*
@@@ -508,6 -503,10 +510,6 @@@ static void tick_nohz_handler(struct cl
        update_process_times(user_mode(regs));
        profile_tick(CPU_PROFILING);
  
 -      /* Do not restart, when we are in the idle loop */
 -      if (ts->tick_stopped)
 -              return;
 -
        while (tick_nohz_reprogram(ts, now)) {
                now = ktime_get();
                tick_do_update_jiffies64(now);
@@@ -542,7 -541,7 +544,7 @@@ static void tick_nohz_switch_to_nohz(vo
        next = tick_init_jiffy_update();
  
        for (;;) {
-               ts->sched_timer.expires = next;
+               hrtimer_set_expires(&ts->sched_timer, next);
                if (!tick_program_event(next, 0))
                        break;
                next = ktime_add(next, tick_period);
               smp_processor_id());
  }
  
-       delta = ktime_sub(ts->sched_timer.expires, now);
 +/*
 + * When NOHZ is enabled and the tick is stopped, we need to kick the
 + * tick timer from irq_enter() so that the jiffies update is kept
 + * alive during long running softirqs. That's ugly as hell, but
 + * correctness is key even if we need to fix the offending softirq in
 + * the first place.
 + *
 + * Note, this is different to tick_nohz_restart. We just kick the
 + * timer and do not touch the other magic bits which need to be done
 + * when idle is left.
 + */
 +static void tick_nohz_kick_tick(int cpu)
 +{
 +      struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 +      ktime_t delta, now;
 +
 +      if (!ts->tick_stopped)
 +              return;
 +
 +      /*
 +       * Do not touch the tick device, when the next expiry is either
 +       * already reached or less/equal than the tick period.
 +       */
 +      now = ktime_get();
++      delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
 +      if (delta.tv64 <= tick_period.tv64)
 +              return;
 +
 +      tick_nohz_restart(ts, now);
 +}
 +
  #else
  
  static inline void tick_nohz_switch_to_nohz(void) { }
  
  #endif /* NO_HZ */
  
 +/*
 + * Called from irq_enter to notify about the possible interruption of idle()
 + */
 +void tick_check_idle(int cpu)
 +{
 +      tick_check_oneshot_broadcast(cpu);
 +#ifdef CONFIG_NO_HZ
 +      tick_nohz_stop_idle(cpu);
 +      tick_nohz_update_jiffies();
 +      tick_nohz_kick_tick(cpu);
 +#endif
 +}
 +
  /*
   * High resolution timer specific code
   */
@@@ -656,6 -611,10 +658,6 @@@ static enum hrtimer_restart tick_sched_
                profile_tick(CPU_PROFILING);
        }
  
 -      /* Do not restart, when we are in the idle loop */
 -      if (ts->tick_stopped)
 -              return HRTIMER_NORESTART;
 -
        hrtimer_forward(timer, now, tick_period);
  
        return HRTIMER_RESTART;
@@@ -678,16 -637,15 +680,15 @@@ void tick_setup_sched_timer(void
        ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  
        /* Get the next period (per cpu) */
-       ts->sched_timer.expires = tick_init_jiffy_update();
+       hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
        offset = ktime_to_ns(tick_period) >> 1;
        do_div(offset, num_possible_cpus());
        offset *= smp_processor_id();
-       ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
+       hrtimer_add_expires_ns(&ts->sched_timer, offset);
  
        for (;;) {
                hrtimer_forward(&ts->sched_timer, now, tick_period);
-               hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
                /* Check, if the timer was already in the past */
                if (hrtimer_active(&ts->sched_timer))
                        break;
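The tick-sched.c conversions above stop touching sched_timer.expires directly and go through accessors instead. A rough sketch of what those helpers are assumed to look like with the split hard/soft expiry fields (the real inlines live in include/linux/hrtimer.h and may differ in detail):

	/* Keep the hard and soft expiry of a (range) hrtimer in sync. */
	static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
	{
		timer->_softexpires = time;
		timer->_expires = time;
	}

	static inline ktime_t hrtimer_get_expires(struct hrtimer *timer)
	{
		return timer->_expires;
	}

	static inline ktime_t hrtimer_get_softexpires(struct hrtimer *timer)
	{
		return timer->_softexpires;
	}

	static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
	{
		timer->_expires     = ktime_add_ns(timer->_expires, ns);
		timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
	}
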
diff --combined kernel/time/timer_list.c
index f6426911e35a6da3a711e8b8e2cb602a5057a2dc,122ee751d2d1e16e6429489f70bc7ef13ba48a94..a999b92a12773750daded0c912a822b6f0eebe84
@@@ -47,14 -47,13 +47,14 @@@ static void print_name_offset(struct se
  }
  
  static void
 -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
 +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
 +          int idx, u64 now)
  {
  #ifdef CONFIG_TIMER_STATS
        char tmp[TASK_COMM_LEN + 1];
  #endif
        SEQ_printf(m, " #%d: ", idx);
 -      print_name_offset(m, timer);
 +      print_name_offset(m, taddr);
        SEQ_printf(m, ", ");
        print_name_offset(m, timer->function);
        SEQ_printf(m, ", S:%02lx", timer->state);
        SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
  #endif
        SEQ_printf(m, "\n");
-       SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n",
-               (unsigned long long)ktime_to_ns(timer->expires),
-               (long long)(ktime_to_ns(timer->expires) - now));
+       SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
+               (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
+               (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
+               (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
+               (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
  }
  
  static void
@@@ -100,7 -101,7 +102,7 @@@ next_one
                tmp = *timer;
                spin_unlock_irqrestore(&base->cpu_base->lock, flags);
  
 -              print_timer(m, &tmp, i, now);
 +              print_timer(m, timer, &tmp, i, now);
                next++;
                goto next_one;
        }
  static void
  print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
  {
 +      SEQ_printf(m, "  .base:       %p\n", base);
        SEQ_printf(m, "  .index:      %d\n",
                        base->index);
        SEQ_printf(m, "  .resolution: %Lu nsecs\n",
@@@ -185,16 -185,12 +187,16 @@@ static void print_cpu(struct seq_file *
  
  #ifdef CONFIG_GENERIC_CLOCKEVENTS
  static void
 -print_tickdevice(struct seq_file *m, struct tick_device *td)
 +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
  {
        struct clock_event_device *dev = td->evtdev;
  
        SEQ_printf(m, "\n");
        SEQ_printf(m, "Tick Device: mode:     %d\n", td->mode);
 +      if (cpu < 0)
 +              SEQ_printf(m, "Broadcast device\n");
 +      else
 +              SEQ_printf(m, "Per CPU device: %d\n", cpu);
  
        SEQ_printf(m, "Clock Event Device: ");
        if (!dev) {
@@@ -228,7 -224,7 +230,7 @@@ static void timer_list_show_tickdevices
        int cpu;
  
  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 -      print_tickdevice(m, tick_get_broadcast_device());
 +      print_tickdevice(m, tick_get_broadcast_device(), -1);
        SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
                   tick_get_broadcast_mask()->bits[0]);
  #ifdef CONFIG_TICK_ONESHOT
        SEQ_printf(m, "\n");
  #endif
        for_each_online_cpu(cpu)
 -                 print_tickdevice(m, tick_get_device(cpu));
 +              print_tickdevice(m, tick_get_device(cpu), cpu);
        SEQ_printf(m, "\n");
  }
  #else
@@@ -250,7 -246,7 +252,7 @@@ static int timer_list_show(struct seq_f
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
  
 -      SEQ_printf(m, "Timer List Version: v0.3\n");
 +      SEQ_printf(m, "Timer List Version: v0.4\n");
        SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
        SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);