Merge branch 'timers/range-hrtimers' into v28-range-hrtimers-for-linus-v2
author Thomas Gleixner <tglx@linutronix.de>
Wed, 22 Oct 2008 07:48:06 +0000 (09:48 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Wed, 22 Oct 2008 07:48:06 +0000 (09:48 +0200)
Conflicts:

kernel/time/tick-sched.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
include/linux/hrtimer.h
include/linux/sched.h
include/linux/time.h
kernel/fork.c
kernel/hrtimer.c
kernel/posix-timers.c
kernel/sched.c
kernel/sys.c
kernel/time/ntp.c
kernel/time/tick-sched.c
kernel/time/timer_list.c

diff --combined include/linux/hrtimer.h
index 9a4e35cd5f79d80e52bf3a1509e0438bb6a77ec4,58bca8e9bae11e02926bf835442421c6429a8fd4..2b3645b1acf4609e55fa36f22368cca34d439a2a
@@@ -20,6 -20,8 +20,8 @@@
  #include <linux/init.h>
  #include <linux/list.h>
  #include <linux/wait.h>
+ #include <linux/percpu.h>
  
  struct hrtimer_clock_base;
  struct hrtimer_cpu_base;
@@@ -101,9 -103,14 +103,14 @@@ enum hrtimer_cb_mode 
  /**
   * struct hrtimer - the basic hrtimer structure
   * @node:     red black tree node for time ordered insertion
-  * @expires:  the absolute expiry time in the hrtimers internal
+  * @_expires: the absolute expiry time in the hrtimers internal
   *            representation. The time is related to the clock on
-  *            which the timer is based.
+  *            which the timer is based. It is set up by adding
+  *            slack to the _softexpires value. For non-range timers
+  *            it is identical to _softexpires.
+  * @_softexpires: the absolute earliest expiry time of the hrtimer.
+  *            The time which was given as expiry time when the timer
+  *            was armed.
   * @function: timer expiry callback function
   * @base:     pointer to the timer base (per cpu and per clock)
   * @state:    state information (See bit values above)
   */
  struct hrtimer {
        struct rb_node                  node;
-       ktime_t                         expires;
+       ktime_t                         _expires;
+       ktime_t                         _softexpires;
        enum hrtimer_restart            (*function)(struct hrtimer *);
        struct hrtimer_clock_base       *base;
        unsigned long                   state;
 -      enum hrtimer_cb_mode            cb_mode;
        struct list_head                cb_entry;
 +      enum hrtimer_cb_mode            cb_mode;
  #ifdef CONFIG_TIMER_STATS
 +      int                             start_pid;
        void                            *start_site;
        char                            start_comm[16];
 -      int                             start_pid;
  #endif
  };
  
@@@ -155,8 -163,10 +163,8 @@@ struct hrtimer_sleeper 
   * @first:            pointer to the timer node which expires first
   * @resolution:               the resolution of the clock, in nanoseconds
   * @get_time:         function to retrieve the current time of the clock
 - * @get_softirq_time: function to retrieve the current time from the softirq
   * @softirq_time:     the time when running the hrtimer queue in the softirq
   * @offset:           offset of this clock to the monotonic base
 - * @reprogram:                function to reprogram the timer event
   */
  struct hrtimer_clock_base {
        struct hrtimer_cpu_base *cpu_base;
        struct rb_node          *first;
        ktime_t                 resolution;
        ktime_t                 (*get_time)(void);
 -      ktime_t                 (*get_softirq_time)(void);
        ktime_t                 softirq_time;
  #ifdef CONFIG_HIGH_RES_TIMERS
        ktime_t                 offset;
 -      int                     (*reprogram)(struct hrtimer *t,
 -                                           struct hrtimer_clock_base *b,
 -                                           ktime_t n);
  #endif
  };
  
@@@ -201,6 -215,71 +209,71 @@@ struct hrtimer_cpu_base 
  #endif
  };
  
+ static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
+ {
+       timer->_expires = time;
+       timer->_softexpires = time;
+ }
+ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
+ {
+       timer->_softexpires = time;
+       timer->_expires = ktime_add_safe(time, delta);
+ }
+ static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
+ {
+       timer->_softexpires = time;
+       timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
+ }
+ static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
+ {
+       timer->_expires.tv64 = tv64;
+       timer->_softexpires.tv64 = tv64;
+ }
+ static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
+ {
+       timer->_expires = ktime_add_safe(timer->_expires, time);
+       timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
+ }
+ static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
+ {
+       timer->_expires = ktime_add_ns(timer->_expires, ns);
+       timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
+ }
+ static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
+ {
+       return timer->_expires;
+ }
+ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
+ {
+       return timer->_softexpires;
+ }
+ static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
+ {
+       return timer->_expires.tv64;
+ }
+ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
+ {
+       return timer->_softexpires.tv64;
+ }
+ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
+ {
+       return ktime_to_ns(timer->_expires);
+ }
+ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
+ {
+     return ktime_sub(timer->_expires, timer->base->get_time());
+ }
  #ifdef CONFIG_HIGH_RES_TIMERS
  struct clock_event_device;
  
@@@ -221,6 -300,8 +294,8 @@@ static inline int hrtimer_is_hres_activ
        return timer->base->cpu_base->hres_active;
  }
  
+ extern void hrtimer_peek_ahead_timers(void);
  /*
   * The resolution of the clocks. The resolution value is returned in
   * the clock_getres() system call to give application programmers an
   * is expired in the next softirq when the clock was advanced.
   */
  static inline void clock_was_set(void) { }
+ static inline void hrtimer_peek_ahead_timers(void) { }
  
  static inline void hres_timers_resume(void) { }
  
@@@ -264,6 -346,10 +340,10 @@@ static inline int hrtimer_is_hres_activ
  extern ktime_t ktime_get(void);
  extern ktime_t ktime_get_real(void);
  
+ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
  /* Exported timer functions: */
  
  /* Initialize timers: */
@@@ -288,12 -374,25 +368,25 @@@ static inline void destroy_hrtimer_on_s
  /* Basic timer operations: */
  extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
                         const enum hrtimer_mode mode);
+ extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+                       unsigned long range_ns, const enum hrtimer_mode mode);
  extern int hrtimer_cancel(struct hrtimer *timer);
  extern int hrtimer_try_to_cancel(struct hrtimer *timer);
  
+ static inline int hrtimer_start_expires(struct hrtimer *timer,
+                                               enum hrtimer_mode mode)
+ {
+       unsigned long delta;
+       ktime_t soft, hard;
+       soft = hrtimer_get_softexpires(timer);
+       hard = hrtimer_get_expires(timer);
+       delta = ktime_to_ns(ktime_sub(hard, soft));
+       return hrtimer_start_range_ns(timer, soft, delta, mode);
+ }
  static inline int hrtimer_restart(struct hrtimer *timer)
  {
-       return hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+       return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  }
  
  /* Query timers: */
@@@ -350,6 -449,10 +443,10 @@@ extern long hrtimer_nanosleep_restart(s
  extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                 struct task_struct *tsk);
  
+ extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+                                               const enum hrtimer_mode mode);
+ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
  /* Soft interrupt function to run the hrtimer queues: */
  extern void hrtimer_run_queues(void);
  extern void hrtimer_run_pending(void);
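
The inline helpers added above replace direct manipulation of timer->expires: _softexpires records the time the caller asked for, _expires records it plus the permitted slack, and all readers go through the accessors. A rough sketch of arming a range timer through the new interface (the example_* names and the 1 ms / 100 us values are illustrative assumptions, not part of the patch):

        #include <linux/hrtimer.h>
        #include <linux/ktime.h>

        static struct hrtimer example_timer;

        static enum hrtimer_restart example_fn(struct hrtimer *t)
        {
                /* Fires somewhere in the window [_softexpires, _expires]. */
                return HRTIMER_NORESTART;
        }

        static void example_arm(void)
        {
                /* Soft deadline 1 ms from now, hard deadline 100 us later. */
                ktime_t soft = ktime_add_ns(ktime_get(), 1000 * 1000);

                hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
                example_timer.function = example_fn;
                hrtimer_start_range_ns(&example_timer, soft, 100 * 1000,
                                       HRTIMER_MODE_ABS);
        }

hrtimer_start() keeps its old semantics: it now simply forwards to hrtimer_start_range_ns() with a zero-length range (see kernel/hrtimer.c below).
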
diff --combined include/linux/sched.h
index 5c38db536e07f8b7b0d6e5b0dc4f31a9e3cb07c1,de53c109fd04f1de9eb9fd973af5ae3e7f790922..9ee3bed0ff065108a2f52404c2705b88277730f3
@@@ -403,21 -403,12 +403,21 @@@ extern int get_dumpable(struct mm_struc
  #define MMF_DUMP_MAPPED_PRIVATE       4
  #define MMF_DUMP_MAPPED_SHARED        5
  #define MMF_DUMP_ELF_HEADERS  6
 +#define MMF_DUMP_HUGETLB_PRIVATE 7
 +#define MMF_DUMP_HUGETLB_SHARED  8
  #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
 -#define MMF_DUMP_FILTER_BITS  5
 +#define MMF_DUMP_FILTER_BITS  7
  #define MMF_DUMP_FILTER_MASK \
        (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
  #define MMF_DUMP_FILTER_DEFAULT \
 -      ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
 +      ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
 +       (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
 +
 +#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
 +# define MMF_DUMP_MASK_DEFAULT_ELF    (1 << MMF_DUMP_ELF_HEADERS)
 +#else
 +# define MMF_DUMP_MASK_DEFAULT_ELF    0
 +#endif
  
  struct sighand_struct {
        atomic_t                count;
@@@ -434,39 -425,6 +434,39 @@@ struct pacct_struct 
        unsigned long           ac_minflt, ac_majflt;
  };
  
 +/**
 + * struct task_cputime - collected CPU time counts
 + * @utime:            time spent in user mode, in &cputime_t units
 + * @stime:            time spent in kernel mode, in &cputime_t units
 + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
 + *
 + * This structure groups together three kinds of CPU time that are
 + * tracked for threads and thread groups.  Most things considering
 + * CPU time want to group these counts together and treat all three
 + * of them in parallel.
 + */
 +struct task_cputime {
 +      cputime_t utime;
 +      cputime_t stime;
 +      unsigned long long sum_exec_runtime;
 +};
 +/* Alternate field names when used to cache expirations. */
 +#define prof_exp      stime
 +#define virt_exp      utime
 +#define sched_exp     sum_exec_runtime
 +
 +/**
 + * struct thread_group_cputime - thread group interval timer counts
 + * @totals:           thread group interval timers; substructure for
 + *                    uniprocessor kernel, per-cpu for SMP kernel.
 + *
 + * This structure contains the version of task_cputime, above, that is
 + * used for thread group CPU clock calculations.
 + */
 +struct thread_group_cputime {
 +      struct task_cputime *totals;
 +};
 +
  /*
   * NOTE! "signal_struct" does not have its own
   * locking, because a shared signal_struct always
@@@ -512,17 -470,6 +512,17 @@@ struct signal_struct 
        cputime_t it_prof_expires, it_virt_expires;
        cputime_t it_prof_incr, it_virt_incr;
  
 +      /*
 +       * Thread group totals for process CPU clocks.
 +       * See thread_group_cputime(), et al, for details.
 +       */
 +      struct thread_group_cputime cputime;
 +
 +      /* Earliest-expiration cache. */
 +      struct task_cputime cputime_expires;
 +
 +      struct list_head cpu_timers[3];
 +
        /* job control IDs */
  
        /*
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
 -      cputime_t utime, stime, cutime, cstime;
 +      cputime_t cutime, cstime;
        cputime_t gtime;
        cputime_t cgtime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long inblock, oublock, cinblock, coublock;
        struct task_io_accounting ioac;
  
 -      /*
 -       * Cumulative ns of scheduled CPU time for dead threads in the
 -       * group, not including a zombie group leader.  (This only differs
 -       * from jiffies_to_ns(utime + stime) if sched_clock uses something
 -       * other than jiffies.)
 -       */
 -      unsigned long long sum_sched_runtime;
 -
        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
         */
        struct rlimit rlim[RLIM_NLIMITS];
  
 -      struct list_head cpu_timers[3];
 -
        /* keep the process-shared keyrings here so that they do the right
         * thing in threads created with CLONE_THREAD */
  #ifdef CONFIG_KEYS
@@@ -1180,7 -1137,8 +1180,7 @@@ struct task_struct 
  /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
        unsigned long min_flt, maj_flt;
  
 -      cputime_t it_prof_expires, it_virt_expires;
 -      unsigned long long it_sched_expires;
 +      struct task_cputime cputime_expires;
        struct list_head cpu_timers[3];
  
  /* process credentials */
        int latency_record_count;
        struct latency_record latency_record[LT_SAVECOUNT];
  #endif
+       /*
+        * time slack values; these are used to round up poll() and
+        * select() etc. timeout values. These are in nanoseconds.
+        */
+       unsigned long timer_slack_ns;
+       unsigned long default_timer_slack_ns;
  };
  
  /*
@@@ -1630,7 -1594,6 +1636,7 @@@ extern unsigned long long cpu_clock(in
  
  extern unsigned long long
  task_sched_runtime(struct task_struct *task);
 +extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
  
  /* sched_exec is called by processes performing an exec */
  #ifdef CONFIG_SMP
@@@ -2127,30 -2090,6 +2133,30 @@@ static inline int spin_needbreak(spinlo
  #endif
  }
  
 +/*
 + * Thread group CPU time accounting.
 + */
 +
 +extern int thread_group_cputime_alloc(struct task_struct *);
 +extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
 +
 +static inline void thread_group_cputime_init(struct signal_struct *sig)
 +{
 +      sig->cputime.totals = NULL;
 +}
 +
 +static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
 +{
 +      if (curr->signal->cputime.totals)
 +              return 0;
 +      return thread_group_cputime_alloc(curr);
 +}
 +
 +static inline void thread_group_cputime_free(struct signal_struct *sig)
 +{
 +      free_percpu(sig->cputime.totals);
 +}
 +
  /*
   * Reevaluate whether the task has signals pending delivery.
   * Wake the task if so.
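
struct task_cputime and struct thread_group_cputime above gather the per-thread and per-group CPU time that was previously spread over separate utime/stime/sum_sched_runtime fields, and thread_group_cputime() becomes the one way to read the group totals. A minimal sketch of a reader (the function name is an illustrative assumption, not an interface added by this patch):

        #include <linux/sched.h>

        /* Total CPU time, in nanoseconds, consumed so far by tsk's thread group. */
        static unsigned long long example_group_exec_runtime(struct task_struct *tsk)
        {
                struct task_cputime times;

                thread_group_cputime(tsk, &times);
                /* utime/stime are cputime_t; sum_exec_runtime is nanoseconds. */
                return times.sum_exec_runtime;
        }

The prof_exp/virt_exp/sched_exp aliases let the same structure double as the earliest-expiration cache (cputime_expires) used by the POSIX CPU timer code.
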
diff --combined include/linux/time.h
index 4f1c9db577079ed3cc688def99e476219b5aa9b0,c911ef69ea87e2dbb2c7cd8988dce850d13b79be..ce321ac5c8f8ceba2f0bdb7640dbc71a9eab6687
@@@ -40,6 -40,8 +40,8 @@@ extern struct timezone sys_tz
  #define NSEC_PER_SEC  1000000000L
  #define FSEC_PER_SEC  1000000000000000L
  
+ #define TIME_T_MAX    (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
  static inline int timespec_equal(const struct timespec *a,
                                   const struct timespec *b)
  {
@@@ -74,6 -76,8 +76,8 @@@ extern unsigned long mktime(const unsig
                            const unsigned int min, const unsigned int sec);
  
  extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
+ extern struct timespec timespec_add_safe(const struct timespec lhs,
+                                        const struct timespec rhs);
  
  /*
   * sub = lhs - rhs, in normalized form
@@@ -119,7 -123,6 +123,7 @@@ extern int do_setitimer(int which, stru
  extern unsigned int alarm_setitimer(unsigned int seconds);
  extern int do_getitimer(int which, struct itimerval *value);
  extern void getnstimeofday(struct timespec *tv);
 +extern void getrawmonotonic(struct timespec *ts);
  extern void getboottime(struct timespec *ts);
  extern void monotonic_to_bootbased(struct timespec *ts);
  
@@@ -128,9 -131,6 +132,9 @@@ extern int timekeeping_valid_for_hres(v
  extern void update_wall_time(void);
  extern void update_xtime_cache(u64 nsec);
  
 +struct tms;
 +extern void do_sys_times(struct tms *);
 +
  /**
   * timespec_to_ns - Convert timespec to nanoseconds
   * @ts:               pointer to the timespec variable to be converted
@@@ -220,7 -220,6 +224,7 @@@ struct itimerval 
  #define CLOCK_MONOTONIC                       1
  #define CLOCK_PROCESS_CPUTIME_ID      2
  #define CLOCK_THREAD_CPUTIME_ID               3
 +#define CLOCK_MONOTONIC_RAW           4
  
  /*
   * The IDs of various hardware clocks:
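
timespec_add_safe() pairs with the new TIME_T_MAX definition: when code such as select() adds a user-supplied relative timeout to the current time, the sum is meant to saturate rather than wrap time_t. A minimal sketch of such a caller, assuming that saturating behaviour (the function name is an illustrative assumption):

        #include <linux/time.h>

        /* Turn "now + timeout" into an absolute deadline that cannot overflow time_t. */
        static struct timespec example_deadline(struct timespec timeout)
        {
                struct timespec now;

                getnstimeofday(&now);
                return timespec_add_safe(now, timeout);
        }
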
diff --combined kernel/fork.c
index 4d093552dd6e79aea8553b32e1ff465cd0439116,37b3e150ae3956759b684c6806bbb6dc5ad12376..f6083561dfe0a9f8d2a13138f7332bc358a51653
@@@ -58,7 -58,6 +58,7 @@@
  #include <linux/tty.h>
  #include <linux/proc_fs.h>
  #include <linux/blkdev.h>
 +#include <trace/sched.h>
  
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
@@@ -760,44 -759,15 +760,44 @@@ void __cleanup_sighand(struct sighand_s
                kmem_cache_free(sighand_cachep, sighand);
  }
  
 +
 +/*
 + * Initialize POSIX timer handling for a thread group.
 + */
 +static void posix_cpu_timers_init_group(struct signal_struct *sig)
 +{
 +      /* Thread group counters. */
 +      thread_group_cputime_init(sig);
 +
 +      /* Expiration times and increments. */
 +      sig->it_virt_expires = cputime_zero;
 +      sig->it_virt_incr = cputime_zero;
 +      sig->it_prof_expires = cputime_zero;
 +      sig->it_prof_incr = cputime_zero;
 +
 +      /* Cached expiration times. */
 +      sig->cputime_expires.prof_exp = cputime_zero;
 +      sig->cputime_expires.virt_exp = cputime_zero;
 +      sig->cputime_expires.sched_exp = 0;
 +
 +      /* The timer lists. */
 +      INIT_LIST_HEAD(&sig->cpu_timers[0]);
 +      INIT_LIST_HEAD(&sig->cpu_timers[1]);
 +      INIT_LIST_HEAD(&sig->cpu_timers[2]);
 +}
 +
  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
  {
        struct signal_struct *sig;
        int ret;
  
        if (clone_flags & CLONE_THREAD) {
 -              atomic_inc(&current->signal->count);
 -              atomic_inc(&current->signal->live);
 -              return 0;
 +              ret = thread_group_cputime_clone_thread(current);
 +              if (likely(!ret)) {
 +                      atomic_inc(&current->signal->count);
 +                      atomic_inc(&current->signal->live);
 +              }
 +              return ret;
        }
        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
        tsk->signal = sig;
        sig->it_real_incr.tv64 = 0;
        sig->real_timer.function = it_real_fn;
  
 -      sig->it_virt_expires = cputime_zero;
 -      sig->it_virt_incr = cputime_zero;
 -      sig->it_prof_expires = cputime_zero;
 -      sig->it_prof_incr = cputime_zero;
 -
        sig->leader = 0;        /* session leadership doesn't inherit */
        sig->tty_old_pgrp = NULL;
        sig->tty = NULL;
  
 -      sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 +      sig->cutime = sig->cstime = cputime_zero;
        sig->gtime = cputime_zero;
        sig->cgtime = cputime_zero;
        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
        task_io_accounting_init(&sig->ioac);
 -      sig->sum_sched_runtime = 0;
 -      INIT_LIST_HEAD(&sig->cpu_timers[0]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[1]);
 -      INIT_LIST_HEAD(&sig->cpu_timers[2]);
        taskstats_tgid_init(sig);
  
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);
  
 -      if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
 -              /*
 -               * New sole thread in the process gets an expiry time
 -               * of the whole CPU time limit.
 -               */
 -              tsk->it_prof_expires =
 -                      secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
 -      }
 +      posix_cpu_timers_init_group(sig);
 +
        acct_init_pacct(&sig->pacct);
  
        tty_audit_fork(sig);
  
  void __cleanup_signal(struct signal_struct *sig)
  {
 +      thread_group_cputime_free(sig);
        exit_thread_group_keys(sig);
        tty_kref_put(sig->tty);
        kmem_cache_free(signal_cachep, sig);
@@@ -903,19 -887,6 +903,19 @@@ void mm_init_owner(struct mm_struct *mm
  }
  #endif /* CONFIG_MM_OWNER */
  
 +/*
 + * Initialize POSIX timer handling for a single task.
 + */
 +static void posix_cpu_timers_init(struct task_struct *tsk)
 +{
 +      tsk->cputime_expires.prof_exp = cputime_zero;
 +      tsk->cputime_expires.virt_exp = cputime_zero;
 +      tsk->cputime_expires.sched_exp = 0;
 +      INIT_LIST_HEAD(&tsk->cpu_timers[0]);
 +      INIT_LIST_HEAD(&tsk->cpu_timers[1]);
 +      INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 +}
 +
  /*
   * This creates a new process as a copy of the old one,
   * but does not actually start it yet.
@@@ -1018,6 -989,8 +1018,8 @@@ static struct task_struct *copy_process
        p->prev_utime = cputime_zero;
        p->prev_stime = cputime_zero;
  
+       p->default_timer_slack_ns = current->timer_slack_ns;
  #ifdef CONFIG_DETECT_SOFTLOCKUP
        p->last_switch_count = 0;
        p->last_switch_timestamp = 0;
        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);
  
 -      p->it_virt_expires = cputime_zero;
 -      p->it_prof_expires = cputime_zero;
 -      p->it_sched_expires = 0;
 -      INIT_LIST_HEAD(&p->cpu_timers[0]);
 -      INIT_LIST_HEAD(&p->cpu_timers[1]);
 -      INIT_LIST_HEAD(&p->cpu_timers[2]);
 +      posix_cpu_timers_init(p);
  
        p->lock_depth = -1;             /* -1 = no lock */
        do_posix_clock_monotonic_gettime(&p->start_time);
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 -
 -              if (!cputime_eq(current->signal->it_virt_expires,
 -                              cputime_zero) ||
 -                  !cputime_eq(current->signal->it_prof_expires,
 -                              cputime_zero) ||
 -                  current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
 -                  !list_empty(&current->signal->cpu_timers[0]) ||
 -                  !list_empty(&current->signal->cpu_timers[1]) ||
 -                  !list_empty(&current->signal->cpu_timers[2])) {
 -                      /*
 -                       * Have child wake up on its first tick to check
 -                       * for process CPU timers.
 -                       */
 -                      p->it_prof_expires = jiffies_to_cputime(1);
 -              }
        }
  
        if (likely(p->pid)) {
@@@ -1373,8 -1366,6 +1375,8 @@@ long do_fork(unsigned long clone_flags
        if (!IS_ERR(p)) {
                struct completion vfork;
  
 +              trace_sched_process_fork(current, p);
 +
                nr = task_pid_vnr(p);
  
                if (clone_flags & CLONE_PARENT_SETTID)
diff --combined kernel/hrtimer.c
index 95978f48e039fcbd7e7e233224a7e3f4a0b2f884,4fc41414fc068395b7202ec80e39ce404812efa7..2b465dfde4269b6d4609957999c9eecb0a148a36
@@@ -517,7 -517,7 +517,7 @@@ static void hrtimer_force_reprogram(str
                if (!base->first)
                        continue;
                timer = rb_entry(base->first, struct hrtimer, node);
-               expires = ktime_sub(timer->expires, base->offset);
+               expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                if (expires.tv64 < cpu_base->expires_next.tv64)
                        cpu_base->expires_next = expires;
        }
@@@ -539,10 -539,10 +539,10 @@@ static int hrtimer_reprogram(struct hrt
                             struct hrtimer_clock_base *base)
  {
        ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
-       ktime_t expires = ktime_sub(timer->expires, base->offset);
+       ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
        int res;
  
-       WARN_ON_ONCE(timer->expires.tv64 < 0);
+       WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
  
        /*
         * When the callback is running, we do not reprogram the clock event
@@@ -795,7 -795,7 +795,7 @@@ u64 hrtimer_forward(struct hrtimer *tim
        u64 orun = 1;
        ktime_t delta;
  
-       delta = ktime_sub(now, timer->expires);
+       delta = ktime_sub(now, hrtimer_get_expires(timer));
  
        if (delta.tv64 < 0)
                return 0;
                s64 incr = ktime_to_ns(interval);
  
                orun = ktime_divns(delta, incr);
-               timer->expires = ktime_add_ns(timer->expires, incr * orun);
-               if (timer->expires.tv64 > now.tv64)
+               hrtimer_add_expires_ns(timer, incr * orun);
+               if (hrtimer_get_expires_tv64(timer) > now.tv64)
                        return orun;
                /*
                 * This (and the ktime_add() below) is the
                 */
                orun++;
        }
-       timer->expires = ktime_add_safe(timer->expires, interval);
+       hrtimer_add_expires(timer, interval);
  
        return orun;
  }
@@@ -848,7 -848,8 +848,8 @@@ static void enqueue_hrtimer(struct hrti
                 * We dont care about collisions. Nodes with
                 * the same expiry time stay together.
                 */
-               if (timer->expires.tv64 < entry->expires.tv64) {
+               if (hrtimer_get_expires_tv64(timer) <
+                               hrtimer_get_expires_tv64(entry)) {
                        link = &(*link)->rb_left;
                } else {
                        link = &(*link)->rb_right;
@@@ -945,9 -946,10 +946,10 @@@ remove_hrtimer(struct hrtimer *timer, s
  }
  
  /**
-  * hrtimer_start - (re)start an relative timer on the current CPU
+  * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
   * @timer:    the timer to be added
   * @tim:      expiry time
+  * @delta_ns: "slack" range for the timer
   * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
   *
   * Returns:
   *  1 when the timer was active
   */
  int
- hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
+                       const enum hrtimer_mode mode)
  {
        struct hrtimer_clock_base *base, *new_base;
        unsigned long flags;
  #endif
        }
  
-       timer->expires = tim;
+       hrtimer_set_expires_range_ns(timer, tim, delta_ns);
  
        timer_stats_hrtimer_set_start_info(timer);
  
  
        return ret;
  }
+ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+ /**
+  * hrtimer_start - (re)start an hrtimer on the current CPU
+  * @timer:    the timer to be added
+  * @tim:      expiry time
+  * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+  *
+  * Returns:
+  *  0 on success
+  *  1 when the timer was active
+  */
+ int
+ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+ {
+       return hrtimer_start_range_ns(timer, tim, 0, mode);
+ }
  EXPORT_SYMBOL_GPL(hrtimer_start);
  
  /**
   * hrtimer_try_to_cancel - try to deactivate a timer
   * @timer:    hrtimer to stop
@@@ -1077,7 -1098,7 +1098,7 @@@ ktime_t hrtimer_get_remaining(const str
        ktime_t rem;
  
        base = lock_hrtimer_base(timer, &flags);
-       rem = ktime_sub(timer->expires, base->get_time());
+       rem = hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &flags);
  
        return rem;
@@@ -1109,7 -1130,7 +1130,7 @@@ ktime_t hrtimer_get_next_event(void
                                continue;
  
                        timer = rb_entry(base->first, struct hrtimer, node);
-                       delta.tv64 = timer->expires.tv64;
+                       delta.tv64 = hrtimer_get_expires_tv64(timer);
                        delta = ktime_sub(delta, base->get_time());
                        if (delta.tv64 < mindelta.tv64)
                                mindelta.tv64 = delta.tv64;
@@@ -1310,10 -1331,23 +1331,23 @@@ void hrtimer_interrupt(struct clock_eve
  
                        timer = rb_entry(node, struct hrtimer, node);
  
-                       if (basenow.tv64 < timer->expires.tv64) {
+                       /*
+                        * The immediate goal for using the softexpires is
+                        * minimizing wakeups, not running timers at the
+                        * earliest interrupt after their soft expiration.
+                        * This allows us to avoid using a Priority Search
+                        * Tree, which can answer a stabbing query for
+                        * overlapping intervals and instead use the simple
+                        * BST we already have.
+                        * We don't add extra wakeups by delaying timers that
+                        * are to the right of a not yet expired timer, because
+                        * that timer will have to trigger a wakeup anyway.
+                        */
+                       if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
                                ktime_t expires;
  
-                               expires = ktime_sub(timer->expires,
+                               expires = ktime_sub(hrtimer_get_expires(timer),
                                                    base->offset);
                                if (expires.tv64 < expires_next.tv64)
                                        expires_next = expires;
                raise_softirq(HRTIMER_SOFTIRQ);
  }
  
+ /**
+  * hrtimer_peek_ahead_timers -- run soft-expired timers now
+  *
+  * hrtimer_peek_ahead_timers will peek at the timer queue of
+  * the current cpu and check if there are any timers for which
+  * the soft expires time has passed. If any such timers exist,
+  * they are run immediately and then removed from the timer queue.
+  *
+  */
+ void hrtimer_peek_ahead_timers(void)
+ {
+       struct tick_device *td;
+       unsigned long flags;
+       if (!hrtimer_hres_active())
+               return;
+       local_irq_save(flags);
+       td = &__get_cpu_var(tick_cpu_device);
+       if (td && td->evtdev)
+               hrtimer_interrupt(td->evtdev);
+       local_irq_restore(flags);
+ }
  static void run_hrtimer_softirq(struct softirq_action *h)
  {
        run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
@@@ -1403,7 -1461,9 +1461,7 @@@ void hrtimer_run_queues(void
                if (!base->first)
                        continue;
  
 -              if (base->get_softirq_time)
 -                      base->softirq_time = base->get_softirq_time();
 -              else if (gettime) {
 +              if (gettime) {
                        hrtimer_get_softirq_time(cpu_base);
                        gettime = 0;
                }
                        struct hrtimer *timer;
  
                        timer = rb_entry(node, struct hrtimer, node);
-                       if (base->softirq_time.tv64 <= timer->expires.tv64)
+                       if (base->softirq_time.tv64 <=
+                                       hrtimer_get_expires_tv64(timer))
                                break;
  
                        if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
@@@ -1462,7 -1523,7 +1521,7 @@@ static int __sched do_nanosleep(struct 
  
        do {
                set_current_state(TASK_INTERRUPTIBLE);
-               hrtimer_start(&t->timer, t->timer.expires, mode);
+               hrtimer_start_expires(&t->timer, mode);
                if (!hrtimer_active(&t->timer))
                        t->task = NULL;
  
@@@ -1484,7 -1545,7 +1543,7 @@@ static int update_rmtp(struct hrtimer *
        struct timespec rmt;
        ktime_t rem;
  
-       rem = ktime_sub(timer->expires, timer->base->get_time());
+       rem = hrtimer_expires_remaining(timer);
        if (rem.tv64 <= 0)
                return 0;
        rmt = ktime_to_timespec(rem);
@@@ -1503,7 -1564,7 +1562,7 @@@ long __sched hrtimer_nanosleep_restart(
  
        hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
                                HRTIMER_MODE_ABS);
-       t.timer.expires.tv64 = restart->nanosleep.expires;
+       hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
  
        if (do_nanosleep(&t, HRTIMER_MODE_ABS))
                goto out;
@@@ -1528,9 -1589,14 +1587,14 @@@ long hrtimer_nanosleep(struct timespec 
        struct restart_block *restart;
        struct hrtimer_sleeper t;
        int ret = 0;
+       unsigned long slack;
+       slack = current->timer_slack_ns;
+       if (rt_task(current))
+               slack = 0;
  
        hrtimer_init_on_stack(&t.timer, clockid, mode);
-       t.timer.expires = timespec_to_ktime(*rqtp);
+       hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
        if (do_nanosleep(&t, mode))
                goto out;
  
        restart->fn = hrtimer_nanosleep_restart;
        restart->nanosleep.index = t.timer.base->index;
        restart->nanosleep.rmtp = rmtp;
-       restart->nanosleep.expires = t.timer.expires.tv64;
+       restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
  
        ret = -ERESTART_RESTARTBLOCK;
  out:
@@@ -1686,11 -1752,9 +1750,11 @@@ static void migrate_hrtimers(int cpu
        new_base = &get_cpu_var(hrtimer_bases);
  
        tick_cancel_sched_timer(cpu);
 -
 -      local_irq_disable();
 -      spin_lock(&new_base->lock);
 +      /*
 +       * The caller is globally serialized and nobody else
 +       * takes two locks at once, deadlock is not possible.
 +       */
 +      spin_lock_irq(&new_base->lock);
        spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
  
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                raise = 1;
  
        spin_unlock(&old_base->lock);
 -      spin_unlock(&new_base->lock);
 -      local_irq_enable();
 +      spin_unlock_irq(&new_base->lock);
        put_cpu_var(hrtimer_bases);
  
        if (raise)
@@@ -1752,3 -1817,103 +1816,103 @@@ void __init hrtimers_init(void
  #endif
  }
  
+ /**
+  * schedule_hrtimeout_range - sleep until timeout
+  * @expires:  timeout value (ktime_t)
+  * @delta:    slack in expires timeout, in nanoseconds
+  * @mode:     timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+  *
+  * Make the current task sleep until the given expiry time has
+  * elapsed. The routine will return immediately unless
+  * the current task state has been set (see set_current_state()).
+  *
+  * The @delta argument gives the kernel the freedom to schedule the
+  * actual wakeup to a time that is both power and performance friendly.
+  * The kernel gives the normal best effort behavior for "@expires+@delta",
+  * but may decide to fire the timer earlier than that, though no earlier
+  * than @expires.
+  *
+  * You can set the task state as follows -
+  *
+  * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+  * pass before the routine returns.
+  *
+  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+  * delivered to the current task.
+  *
+  * The current task state is guaranteed to be TASK_RUNNING when this
+  * routine returns.
+  *
+  * Returns 0 when the timer has expired otherwise -EINTR
+  */
+ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+                              const enum hrtimer_mode mode)
+ {
+       struct hrtimer_sleeper t;
+       /*
+        * Optimize when a zero timeout value is given. It does not
+        * matter whether this is an absolute or a relative time.
+        */
+       if (expires && !expires->tv64) {
+               __set_current_state(TASK_RUNNING);
+               return 0;
+       }
+       /*
+        * A NULL parameter means "infinite"
+        */
+       if (!expires) {
+               schedule();
+               __set_current_state(TASK_RUNNING);
+               return -EINTR;
+       }
+       hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
+       hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
+       hrtimer_init_sleeper(&t, current);
+       hrtimer_start_expires(&t.timer, mode);
+       if (!hrtimer_active(&t.timer))
+               t.task = NULL;
+       if (likely(t.task))
+               schedule();
+       hrtimer_cancel(&t.timer);
+       destroy_hrtimer_on_stack(&t.timer);
+       __set_current_state(TASK_RUNNING);
+       return !t.task ? 0 : -EINTR;
+ }
+ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+ /**
+  * schedule_hrtimeout - sleep until timeout
+  * @expires:  timeout value (ktime_t)
+  * @mode:     timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+  *
+  * Make the current task sleep until the given expiry time has
+  * elapsed. The routine will return immediately unless
+  * the current task state has been set (see set_current_state()).
+  *
+  * You can set the task state as follows -
+  *
+  * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+  * pass before the routine returns.
+  *
+  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+  * delivered to the current task.
+  *
+  * The current task state is guaranteed to be TASK_RUNNING when this
+  * routine returns.
+  *
+  * Returns 0 when the timer has expired otherwise -EINTR
+  */
+ int __sched schedule_hrtimeout(ktime_t *expires,
+                              const enum hrtimer_mode mode)
+ {
+       return schedule_hrtimeout_range(expires, 0, mode);
+ }
+ EXPORT_SYMBOL_GPL(schedule_hrtimeout);
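
schedule_hrtimeout_range() above is the primitive that lets poll()/select() and driver code sleep with an explicit slack instead of a single hard deadline. A minimal sketch of a caller sleeping for roughly 10 ms while allowing the wakeup to be deferred by up to 1 ms (the function name and the constants are illustrative assumptions):

        #include <linux/hrtimer.h>
        #include <linux/sched.h>
        #include <linux/time.h>

        /* Sleep ~10 ms; the wakeup may land anywhere in [10 ms, 11 ms). */
        static int example_relax(void)
        {
                ktime_t timeout = ktime_set(0, 10 * NSEC_PER_MSEC);

                set_current_state(TASK_INTERRUPTIBLE);
                return schedule_hrtimeout_range(&timeout, NSEC_PER_MSEC,
                                                HRTIMER_MODE_REL);
        }

As documented above, the return value is 0 when the timeout expired and -EINTR when a signal ended the sleep early.
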
diff --combined kernel/posix-timers.c
index b931d7cedbfa9fd70a47d07535d3b791661ec39a,ee204586149a0c8ab6144808c0610320545f7b8f..5e79c662294bf542750af232be67a1e92bed858d
@@@ -222,15 -222,6 +222,15 @@@ static int posix_ktime_get_ts(clockid_
        return 0;
  }
  
 +/*
 + * Get monotonic time for posix timers
 + */
 +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
 +{
 +      getrawmonotonic(tp);
 +      return 0;
 +}
 +
  /*
   * Initialize everything, well, just everything in Posix clocks/timers ;)
   */
@@@ -244,15 -235,9 +244,15 @@@ static __init int init_posix_timers(voi
                .clock_get = posix_ktime_get_ts,
                .clock_set = do_posix_clock_nosettime,
        };
 +      struct k_clock clock_monotonic_raw = {
 +              .clock_getres = hrtimer_get_res,
 +              .clock_get = posix_get_monotonic_raw,
 +              .clock_set = do_posix_clock_nosettime,
 +      };
  
        register_posix_clock(CLOCK_REALTIME, &clock_realtime);
        register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 +      register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
  
        posix_timers_cache = kmem_cache_create("posix_timers_cache",
                                        sizeof (struct k_itimer), 0, SLAB_PANIC,
@@@ -313,7 -298,6 +313,7 @@@ void do_schedule_next_timer(struct sigi
  
  int posix_timer_event(struct k_itimer *timr, int si_private)
  {
 +      int shared, ret;
        /*
         * FIXME: if ->sigq is queued we can race with
         * dequeue_signal()->do_schedule_next_timer().
         */
        timr->sigq->info.si_sys_private = si_private;
  
 -      timr->sigq->info.si_signo = timr->it_sigev_signo;
 -      timr->sigq->info.si_code = SI_TIMER;
 -      timr->sigq->info.si_tid = timr->it_id;
 -      timr->sigq->info.si_value = timr->it_sigev_value;
 -
 -      if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
 -              struct task_struct *leader;
 -              int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
 -
 -              if (likely(ret >= 0))
 -                      return ret;
 -
 -              timr->it_sigev_notify = SIGEV_SIGNAL;
 -              leader = timr->it_process->group_leader;
 -              put_task_struct(timr->it_process);
 -              timr->it_process = leader;
 -      }
 -
 -      return send_sigqueue(timr->sigq, timr->it_process, 1);
 +      shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
 +      ret = send_sigqueue(timr->sigq, timr->it_process, shared);
 +      /* If we failed to send the signal the timer stops. */
 +      return ret > 0;
  }
  EXPORT_SYMBOL_GPL(posix_timer_event);
  
@@@ -469,9 -468,11 +469,9 @@@ sys_timer_create(const clockid_t which_
                 struct sigevent __user *timer_event_spec,
                 timer_t __user * created_timer_id)
  {
 -      int error = 0;
 -      struct k_itimer *new_timer = NULL;
 -      int new_timer_id;
 -      struct task_struct *process = NULL;
 -      unsigned long flags;
 +      struct k_itimer *new_timer;
 +      int error, new_timer_id;
 +      struct task_struct *process;
        sigevent_t event;
        int it_id_set = IT_ID_NOT_SET;
  
                goto out;
        }
        spin_lock_irq(&idr_lock);
 -      error = idr_get_new(&posix_timers_id, (void *) new_timer,
 -                          &new_timer_id);
 +      error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
        spin_unlock_irq(&idr_lock);
 -      if (error == -EAGAIN)
 -              goto retry;
 -      else if (error) {
 +      if (error) {
 +              if (error == -EAGAIN)
 +                      goto retry;
                /*
                 * Weird looking, but we return EAGAIN if the IDR is
                 * full (proper POSIX return value for this)
                        error = -EFAULT;
                        goto out;
                }
 -              new_timer->it_sigev_notify = event.sigev_notify;
 -              new_timer->it_sigev_signo = event.sigev_signo;
 -              new_timer->it_sigev_value = event.sigev_value;
 -
 -              read_lock(&tasklist_lock);
 -              if ((process = good_sigevent(&event))) {
 -                      /*
 -                       * We may be setting up this process for another
 -                       * thread.  It may be exiting.  To catch this
 -                       * case the we check the PF_EXITING flag.  If
 -                       * the flag is not set, the siglock will catch
 -                       * him before it is too late (in exit_itimers).
 -                       *
 -                       * The exec case is a bit more invloved but easy
 -                       * to code.  If the process is in our thread
 -                       * group (and it must be or we would not allow
 -                       * it here) and is doing an exec, it will cause
 -                       * us to be killed.  In this case it will wait
 -                       * for us to die which means we can finish this
 -                       * linkage with our last gasp. I.e. no code :)
 -                       */
 -                      spin_lock_irqsave(&process->sighand->siglock, flags);
 -                      if (!(process->flags & PF_EXITING)) {
 -                              new_timer->it_process = process;
 -                              list_add(&new_timer->list,
 -                                       &process->signal->posix_timers);
 -                              if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 -                                      get_task_struct(process);
 -                              spin_unlock_irqrestore(&process->sighand->siglock, flags);
 -                      } else {
 -                              spin_unlock_irqrestore(&process->sighand->siglock, flags);
 -                              process = NULL;
 -                      }
 -              }
 -              read_unlock(&tasklist_lock);
 +              rcu_read_lock();
 +              process = good_sigevent(&event);
 +              if (process)
 +                      get_task_struct(process);
 +              rcu_read_unlock();
                if (!process) {
                        error = -EINVAL;
                        goto out;
                }
        } else {
 -              new_timer->it_sigev_notify = SIGEV_SIGNAL;
 -              new_timer->it_sigev_signo = SIGALRM;
 -              new_timer->it_sigev_value.sival_int = new_timer->it_id;
 +              event.sigev_notify = SIGEV_SIGNAL;
 +              event.sigev_signo = SIGALRM;
 +              event.sigev_value.sival_int = new_timer->it_id;
                process = current->group_leader;
 -              spin_lock_irqsave(&process->sighand->siglock, flags);
 -              new_timer->it_process = process;
 -              list_add(&new_timer->list, &process->signal->posix_timers);
 -              spin_unlock_irqrestore(&process->sighand->siglock, flags);
 +              get_task_struct(process);
        }
  
 +      new_timer->it_sigev_notify     = event.sigev_notify;
 +      new_timer->sigq->info.si_signo = event.sigev_signo;
 +      new_timer->sigq->info.si_value = event.sigev_value;
 +      new_timer->sigq->info.si_tid   = new_timer->it_id;
 +      new_timer->sigq->info.si_code  = SI_TIMER;
 +
 +      spin_lock_irq(&current->sighand->siglock);
 +      new_timer->it_process = process;
 +      list_add(&new_timer->list, &current->signal->posix_timers);
 +      spin_unlock_irq(&current->sighand->siglock);
 +
 +      return 0;
        /*
         * In the case of the timer belonging to another task, after
         * the task is unlocked, the timer is owned by the other task
         * and may cease to exist at any time.  Don't use or modify
         * new_timer after the unlock call.
         */
 -
  out:
 -      if (error)
 -              release_posix_timer(new_timer, it_id_set);
 -
 +      release_posix_timer(new_timer, it_id_set);
        return error;
  }
  
   * the find to the timer lock.  To avoid a deadlock, the timer id MUST
   * be released without holding the timer lock.
   */
 -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
 +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
  {
        struct k_itimer *timr;
        /*
         * flags part over to the timer lock.  Must not let interrupts in
         * while we are moving the lock.
         */
 -
        spin_lock_irqsave(&idr_lock, *flags);
 -      timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
 +      timr = idr_find(&posix_timers_id, (int)timer_id);
        if (timr) {
                spin_lock(&timr->it_lock);
 -
 -              if ((timr->it_id != timer_id) || !(timr->it_process) ||
 -                              !same_thread_group(timr->it_process, current)) {
 -                      spin_unlock(&timr->it_lock);
 -                      spin_unlock_irqrestore(&idr_lock, *flags);
 -                      timr = NULL;
 -              } else
 +              if (timr->it_process &&
 +                  same_thread_group(timr->it_process, current)) {
                        spin_unlock(&idr_lock);
 -      } else
 -              spin_unlock_irqrestore(&idr_lock, *flags);
 +                      return timr;
 +              }
 +              spin_unlock(&timr->it_lock);
 +      }
 +      spin_unlock_irqrestore(&idr_lock, *flags);
  
 -      return timr;
 +      return NULL;
  }
  
  /*
@@@ -639,7 -668,7 +639,7 @@@ common_timer_get(struct k_itimer *timr
            (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
                timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
  
-       remaining = ktime_sub(timer->expires, now);
+       remaining = ktime_sub(hrtimer_get_expires(timer), now);
        /* Return 0 only, when the timer is expired and not pending */
        if (remaining.tv64 <= 0) {
                /*
@@@ -733,7 -762,7 +733,7 @@@ common_timer_set(struct k_itimer *timr
        hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
        timr->it.real.timer.function = posix_timer_fn;
  
-       timer->expires = timespec_to_ktime(new_setting->it_value);
+       hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
  
        /* Convert interval */
        timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
        if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
                /* Setup correct expiry time for relative timers */
                if (mode == HRTIMER_MODE_REL) {
-                       timer->expires =
-                               ktime_add_safe(timer->expires,
-                                              timer->base->get_time());
+                       hrtimer_add_expires(timer, timer->base->get_time());
                }
                return 0;
        }
  
-       hrtimer_start(timer, timer->expires, mode);
+       hrtimer_start_expires(timer, mode);
        return 0;
  }
  
@@@ -833,7 -860,8 +831,7 @@@ retry_delete
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
 -      if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 -              put_task_struct(timer->it_process);
 +      put_task_struct(timer->it_process);
        timer->it_process = NULL;
  
        unlock_timer(timer, flags);
@@@ -860,7 -888,8 +858,7 @@@ retry_delete
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
 -      if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 -              put_task_struct(timer->it_process);
 +      put_task_struct(timer->it_process);
        timer->it_process = NULL;
  
        unlock_timer(timer, flags);
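
With the clock_monotonic_raw k_clock registered above, CLOCK_MONOTONIC_RAW becomes reachable from userspace through the ordinary POSIX clock syscalls. A minimal userspace sketch, assuming a C library that already exposes the constant (an older libc would have to pass the raw value 4 from include/linux/time.h) and, on glibc of this era, linking with -lrt:

        #include <stdio.h>
        #include <time.h>

        int main(void)
        {
                struct timespec ts;

                /* Monotonic time that is not slewed by NTP adjustments. */
                if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0) {
                        perror("clock_gettime");
                        return 1;
                }
                printf("raw monotonic: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
                return 0;
        }
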
diff --combined kernel/sched.c
index d906f72b42d23ae1d8c2355d9b605e5fd0761eaa,eb3c72953615c06d19b08be2480a3144b259c55d..bfa87918380ffa3005847c832a3fd86fdf1c54ce
@@@ -71,7 -71,6 +71,7 @@@
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
 +#include <trace/sched.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
@@@ -227,9 -226,8 +227,8 @@@ static void start_rt_bandwidth(struct r
  
                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start(&rt_b->rt_period_timer,
-                             rt_b->rt_period_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&rt_b->rt_period_timer,
+                               HRTIMER_MODE_ABS);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
  }
@@@ -1064,7 -1062,7 +1063,7 @@@ static void hrtick_start(struct rq *rq
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
  
-       timer->expires = time;
+       hrtimer_set_expires(timer, time);
  
        if (rq == this_rq()) {
                hrtimer_restart(timer);
@@@ -1937,7 -1935,6 +1936,7 @@@ unsigned long wait_task_inactive(struc
                 * just go back and repeat.
                 */
                rq = task_rq_lock(p, &flags);
 +              trace_sched_wait_task(rq, p);
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
                ncsw = 0;
@@@ -2299,7 -2296,9 +2298,7 @@@ out_activate
        success = 1;
  
  out_running:
 -      trace_mark(kernel_sched_wakeup,
 -              "pid %d state %ld ## rq %p task %p rq->curr %p",
 -              p->pid, p->state, rq, p, rq->curr);
 +      trace_sched_wakeup(rq, p);
        check_preempt_curr(rq, p, sync);
  
        p->state = TASK_RUNNING;
@@@ -2432,7 -2431,9 +2431,7 @@@ void wake_up_new_task(struct task_struc
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
 -      trace_mark(kernel_sched_wakeup_new,
 -              "pid %d state %ld ## rq %p task %p rq->curr %p",
 -              p->pid, p->state, rq, p, rq->curr);
 +      trace_sched_wakeup_new(rq, p);
        check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@@ -2605,7 -2606,11 +2604,7 @@@ context_switch(struct rq *rq, struct ta
        struct mm_struct *mm, *oldmm;
  
        prepare_task_switch(rq, prev, next);
 -      trace_mark(kernel_sched_schedule,
 -              "prev_pid %d next_pid %d prev_state %ld "
 -              "## rq %p prev %p next %p",
 -              prev->pid, next->pid, prev->state,
 -              rq, prev, next);
 +      trace_sched_switch(rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@@ -2845,7 -2850,6 +2844,7 @@@ static void sched_migrate_task(struct t
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
 +      trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@@ -4047,26 -4051,23 +4046,26 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
  EXPORT_PER_CPU_SYMBOL(kstat);
  
  /*
 - * Return p->sum_exec_runtime plus any more ns on the sched_clock
 - * that have not yet been banked in case the task is currently running.
 + * Return any ns on the sched_clock that have not yet been banked in
 + * @p in case that task is currently running.
   */
 -unsigned long long task_sched_runtime(struct task_struct *p)
 +unsigned long long task_delta_exec(struct task_struct *p)
  {
        unsigned long flags;
 -      u64 ns, delta_exec;
        struct rq *rq;
 +      u64 ns = 0;
  
        rq = task_rq_lock(p, &flags);
 -      ns = p->se.sum_exec_runtime;
 +
        if (task_current(rq, p)) {
 +              u64 delta_exec;
 +
                update_rq_clock(rq);
                delta_exec = rq->clock - p->se.exec_start;
                if ((s64)delta_exec > 0)
 -                      ns += delta_exec;
 +                      ns = delta_exec;
        }
 +
        task_rq_unlock(rq, &flags);
  
        return ns;
@@@ -4083,7 -4084,6 +4082,7 @@@ void account_user_time(struct task_stru
        cputime64_t tmp;
  
        p->utime = cputime_add(p->utime, cputime);
 +      account_group_user_time(p, cputime);
  
        /* Add user time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@@ -4108,7 -4108,6 +4107,7 @@@ static void account_guest_time(struct t
        tmp = cputime_to_cputime64(cputime);
  
        p->utime = cputime_add(p->utime, cputime);
 +      account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
  
        cpustat->user = cputime64_add(cpustat->user, tmp);
@@@ -4144,7 -4143,6 +4143,7 @@@ void account_system_time(struct task_st
        }
  
        p->stime = cputime_add(p->stime, cputime);
 +      account_group_system_time(p, cputime);
  
        /* Add system time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@@ -4186,7 -4184,6 +4185,7 @@@ void account_steal_time(struct task_str
  
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
 +              account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
diff --combined kernel/sys.c
index 53879cdae483b6371543bdeb94818292e91f07bb,fc71f99fb469a22da88b08263e2609409566173b..31deba8f7d160c19bf262b1be13272e2ede1050d
@@@ -853,28 -853,38 +853,28 @@@ asmlinkage long sys_setfsgid(gid_t gid
        return old_fsgid;
  }
  
 +void do_sys_times(struct tms *tms)
 +{
 +      struct task_cputime cputime;
 +      cputime_t cutime, cstime;
 +
 +      spin_lock_irq(&current->sighand->siglock);
 +      thread_group_cputime(current, &cputime);
 +      cutime = current->signal->cutime;
 +      cstime = current->signal->cstime;
 +      spin_unlock_irq(&current->sighand->siglock);
 +      tms->tms_utime = cputime_to_clock_t(cputime.utime);
 +      tms->tms_stime = cputime_to_clock_t(cputime.stime);
 +      tms->tms_cutime = cputime_to_clock_t(cutime);
 +      tms->tms_cstime = cputime_to_clock_t(cstime);
 +}
 +
  asmlinkage long sys_times(struct tms __user * tbuf)
  {
 -      /*
 -       *      In the SMP world we might just be unlucky and have one of
 -       *      the times increment as we use it. Since the value is an
 -       *      atomically safe type this is just fine. Conceptually its
 -       *      as if the syscall took an instant longer to occur.
 -       */
        if (tbuf) {
                struct tms tmp;
 -              struct task_struct *tsk = current;
 -              struct task_struct *t;
 -              cputime_t utime, stime, cutime, cstime;
 -
 -              spin_lock_irq(&tsk->sighand->siglock);
 -              utime = tsk->signal->utime;
 -              stime = tsk->signal->stime;
 -              t = tsk;
 -              do {
 -                      utime = cputime_add(utime, t->utime);
 -                      stime = cputime_add(stime, t->stime);
 -                      t = next_thread(t);
 -              } while (t != tsk);
 -
 -              cutime = tsk->signal->cutime;
 -              cstime = tsk->signal->cstime;
 -              spin_unlock_irq(&tsk->sighand->siglock);
 -
 -              tmp.tms_utime = cputime_to_clock_t(utime);
 -              tmp.tms_stime = cputime_to_clock_t(stime);
 -              tmp.tms_cutime = cputime_to_clock_t(cutime);
 -              tmp.tms_cstime = cputime_to_clock_t(cstime);
 +
 +              do_sys_times(&tmp);
                if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
                        return -EFAULT;
        }
@@@ -1439,6 -1449,7 +1439,6 @@@ asmlinkage long sys_old_getrlimit(unsig
  asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
  {
        struct rlimit new_rlim, *old_rlim;
 -      unsigned long it_prof_secs;
        int retval;
  
        if (resource >= RLIM_NLIMITS)
        if (new_rlim.rlim_cur == RLIM_INFINITY)
                goto out;
  
 -      it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
 -      if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
 -              unsigned long rlim_cur = new_rlim.rlim_cur;
 -              cputime_t cputime;
 -
 -              cputime = secs_to_cputime(rlim_cur);
 -              read_lock(&tasklist_lock);
 -              spin_lock_irq(&current->sighand->siglock);
 -              set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 -              spin_unlock_irq(&current->sighand->siglock);
 -              read_unlock(&tasklist_lock);
 -      }
 +      update_rlimit_cpu(new_rlim.rlim_cur);
  out:
        return 0;
  }
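The sys_setrlimit() hunk above folds the open-coded RLIMIT_CPU handling into a single update_rlimit_cpu() call. A minimal sketch of what that helper is assumed to do, reconstructed from the lines it replaces (the body, field accesses and locking are assumptions, not taken from this diff):

	void update_rlimit_cpu(unsigned long rlim_new)
	{
		cputime_t cputime = secs_to_cputime(rlim_new);
		unsigned long prof_secs =
			cputime_to_secs(current->signal->it_prof_expires);

		/* Re-arm only if no profiling timer is pending or the new limit
		 * is not later than the one already armed. */
		if (prof_secs == 0 || rlim_new <= prof_secs) {
			spin_lock_irq(&current->sighand->siglock);
			set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
			spin_unlock_irq(&current->sighand->siglock);
		}
	}
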
   *
   */
  
 -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
 -                                   cputime_t *utimep, cputime_t *stimep)
 +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
  {
 -      *utimep = cputime_add(*utimep, t->utime);
 -      *stimep = cputime_add(*stimep, t->stime);
        r->ru_nvcsw += t->nvcsw;
        r->ru_nivcsw += t->nivcsw;
        r->ru_minflt += t->min_flt;
@@@ -1545,13 -1570,12 +1545,13 @@@ static void k_getrusage(struct task_str
        struct task_struct *t;
        unsigned long flags;
        cputime_t utime, stime;
 +      struct task_cputime cputime;
  
        memset((char *) r, 0, sizeof *r);
        utime = stime = cputime_zero;
  
        if (who == RUSAGE_THREAD) {
 -              accumulate_thread_rusage(p, r, &utime, &stime);
 +              accumulate_thread_rusage(p, r);
                goto out;
        }
  
                                break;
  
                case RUSAGE_SELF:
 -                      utime = cputime_add(utime, p->signal->utime);
 -                      stime = cputime_add(stime, p->signal->stime);
 +                      thread_group_cputime(p, &cputime);
 +                      utime = cputime_add(utime, cputime.utime);
 +                      stime = cputime_add(stime, cputime.stime);
                        r->ru_nvcsw += p->signal->nvcsw;
                        r->ru_nivcsw += p->signal->nivcsw;
                        r->ru_minflt += p->signal->min_flt;
                        r->ru_oublock += p->signal->oublock;
                        t = p;
                        do {
 -                              accumulate_thread_rusage(t, r, &utime, &stime);
 +                              accumulate_thread_rusage(t, r);
                                t = next_thread(t);
                        } while (t != p);
                        break;
@@@ -1716,6 -1739,16 +1716,16 @@@ asmlinkage long sys_prctl(int option, u
                case PR_SET_TSC:
                        error = SET_TSC_CTL(arg2);
                        break;
+               case PR_GET_TIMERSLACK:
+                       error = current->timer_slack_ns;
+                       break;
+               case PR_SET_TIMERSLACK:
+                       if (arg2 <= 0)
+                               current->timer_slack_ns =
+                                       current->default_timer_slack_ns;
+                       else
+                               current->timer_slack_ns = arg2;
+                       break;
                default:
                        error = -EINVAL;
                        break;
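The sys_prctl() hunk above adds PR_GET_TIMERSLACK and PR_SET_TIMERSLACK. A hedged userspace sketch of how a process might use them once the headers from this series export the constants (the 50 us value is only an illustration):

	#include <stdio.h>
	#include <sys/prctl.h>

	int main(void)
	{
		/* PR_GET_TIMERSLACK returns the current slack, in nanoseconds. */
		long slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);

		printf("timer slack: %ld ns\n", slack);

		/* Widen the slack to ~50 us; passing 0 restores the default. */
		prctl(PR_SET_TIMERSLACK, 50000, 0, 0, 0);
		prctl(PR_SET_TIMERSLACK, 0, 0, 0, 0);
		return 0;
	}
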
diff --combined kernel/time/ntp.c
index 1a20715bfd6e4854e96e96eb1541f792767aeaa5,9c114b726ab3353e75fa222e8597e52db9d34a82..8ff15e5d486b137e65f96c64b5a206315bab7ee4
  
  #include <linux/mm.h>
  #include <linux/time.h>
 -#include <linux/timer.h>
  #include <linux/timex.h>
  #include <linux/jiffies.h>
  #include <linux/hrtimer.h>
  #include <linux/capability.h>
  #include <linux/math64.h>
  #include <linux/clocksource.h>
 +#include <linux/workqueue.h>
  #include <asm/timex.h>
  
  /*
@@@ -142,8 -142,7 +142,7 @@@ static enum hrtimer_restart ntp_leap_se
                time_state = TIME_OOP;
                printk(KERN_NOTICE "Clock: "
                       "inserting leap second 23:59:60 UTC\n");
-               leap_timer.expires = ktime_add_ns(leap_timer.expires,
-                                                 NSEC_PER_SEC);
+               hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
                res = HRTIMER_RESTART;
                break;
        case TIME_DEL:
@@@ -218,11 -217,11 +217,11 @@@ void second_overflow(void
  /* Disable the cmos update - used by virtualization and embedded */
  int no_sync_cmos_clock  __read_mostly;
  
 -static void sync_cmos_clock(unsigned long dummy);
 +static void sync_cmos_clock(struct work_struct *work);
  
 -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
 +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
  
 -static void sync_cmos_clock(unsigned long dummy)
 +static void sync_cmos_clock(struct work_struct *work)
  {
        struct timespec now, next;
        int fail = 1;
                next.tv_sec++;
                next.tv_nsec -= NSEC_PER_SEC;
        }
 -      mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
 +      schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
  }
  
  static void notify_cmos_timer(void)
  {
        if (!no_sync_cmos_clock)
 -              mod_timer(&sync_cmos_timer, jiffies + 1);
 +              schedule_delayed_work(&sync_cmos_work, 0);
  }
  
  #else
@@@ -277,50 -276,38 +276,50 @@@ static inline void notify_cmos_timer(vo
  int do_adjtimex(struct timex *txc)
  {
        struct timespec ts;
 -      long save_adjust, sec;
        int result;
  
 -      /* In order to modify anything, you gotta be super-user! */
 -      if (txc->modes && !capable(CAP_SYS_TIME))
 -              return -EPERM;
 -
 -      /* Now we validate the data before disabling interrupts */
 -
 -      if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
 +      /* Validate the data before disabling interrupts */
 +      if (txc->modes & ADJ_ADJTIME) {
                /* singleshot must not be used with any other mode bits */
 -              if (txc->modes & ~ADJ_OFFSET_SS_READ)
 +              if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
                        return -EINVAL;
 +              if (!(txc->modes & ADJ_OFFSET_READONLY) &&
 +                  !capable(CAP_SYS_TIME))
 +                      return -EPERM;
 +      } else {
 +              /* In order to modify anything, you gotta be super-user! */
 +               if (txc->modes && !capable(CAP_SYS_TIME))
 +                      return -EPERM;
 +
 +              /* if the quartz is off by more than 10% something is VERY wrong! */
 +              if (txc->modes & ADJ_TICK &&
 +                  (txc->tick <  900000/USER_HZ ||
 +                   txc->tick > 1100000/USER_HZ))
 +                              return -EINVAL;
 +
 +              if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
 +                      hrtimer_cancel(&leap_timer);
        }
  
 -      /* if the quartz is off by more than 10% something is VERY wrong ! */
 -      if (txc->modes & ADJ_TICK)
 -              if (txc->tick <  900000/USER_HZ ||
 -                  txc->tick > 1100000/USER_HZ)
 -                      return -EINVAL;
 -
 -      if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
 -              hrtimer_cancel(&leap_timer);
        getnstimeofday(&ts);
  
        write_seqlock_irq(&xtime_lock);
  
 -      /* Save for later - semantics of adjtime is to return old value */
 -      save_adjust = time_adjust;
 -
        /* If there are input parameters, then process them */
 +      if (txc->modes & ADJ_ADJTIME) {
 +              long save_adjust = time_adjust;
 +
 +              if (!(txc->modes & ADJ_OFFSET_READONLY)) {
 +                      /* adjtime() is independent from ntp_adjtime() */
 +                      time_adjust = txc->offset;
 +                      ntp_update_frequency();
 +              }
 +              txc->offset = save_adjust;
 +              goto adj_done;
 +      }
        if (txc->modes) {
 +              long sec;
 +
                if (txc->modes & ADJ_STATUS) {
                        if ((time_status & STA_PLL) &&
                            !(txc->status & STA_PLL)) {
                if (txc->modes & ADJ_TAI && txc->constant > 0)
                        time_tai = txc->constant;
  
 -              if (txc->modes & ADJ_OFFSET) {
 -                      if (txc->modes == ADJ_OFFSET_SINGLESHOT)
 -                              /* adjtime() is independent from ntp_adjtime() */
 -                              time_adjust = txc->offset;
 -                      else
 -                              ntp_update_offset(txc->offset);
 -              }
 +              if (txc->modes & ADJ_OFFSET)
 +                      ntp_update_offset(txc->offset);
                if (txc->modes & ADJ_TICK)
                        tick_usec = txc->tick;
  
                        ntp_update_frequency();
        }
  
 +      txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
 +                                NTP_SCALE_SHIFT);
 +      if (!(time_status & STA_NANO))
 +              txc->offset /= NSEC_PER_USEC;
 +
 +adj_done:
        result = time_state;    /* mostly `TIME_OK' */
        if (time_status & (STA_UNSYNC|STA_CLOCKERR))
                result = TIME_ERROR;
  
 -      if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
 -          (txc->modes == ADJ_OFFSET_SS_READ))
 -              txc->offset = save_adjust;
 -      else {
 -              txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
 -                                        NTP_SCALE_SHIFT);
 -              if (!(time_status & STA_NANO))
 -                      txc->offset /= NSEC_PER_USEC;
 -      }
 -      txc->freq          = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
 -                                       (s64)PPM_SCALE_INV,
 -                                       NTP_SCALE_SHIFT);
 +      txc->freq          = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
 +                                       (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
        txc->maxerror      = time_maxerror;
        txc->esterror      = time_esterror;
        txc->status        = time_status;
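The reworked do_adjtimex() above keeps adjtime()-style single-shot slewing (the ADJ_ADJTIME path) separate from the ntp_adjtime() parameters. A hedged userspace sketch of the classic single-shot call, whose returned offset is the previously pending adjustment saved in save_adjust:

	#include <stdio.h>
	#include <sys/timex.h>

	int main(void)
	{
		struct timex tx = { 0 };

		tx.modes  = ADJ_OFFSET_SINGLESHOT;   /* adjtime()-style slew */
		tx.offset = 100000;                  /* slew by 100 ms, in microseconds */

		if (adjtimex(&tx) < 0)               /* modifying needs CAP_SYS_TIME */
			perror("adjtimex");
		else
			printf("previously pending adjustment: %ld us\n", tx.offset);
		return 0;
	}
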
diff --combined kernel/time/tick-sched.c
index 727c1ae0517ae68f02b6ba6fa0830b45a6213aab,a547be11cf976c9abba879b4fcd1c025c78ca78d..5bbb1044f8473ff98bd3fdfe2c93a1c0798ed8e7
@@@ -155,7 -155,7 +155,7 @@@ void tick_nohz_update_jiffies(void
        touch_softlockup_watchdog();
  }
  
 -void tick_nohz_stop_idle(int cpu)
 +static void tick_nohz_stop_idle(int cpu)
  {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
  
@@@ -300,7 -300,7 +300,7 @@@ void tick_nohz_stop_sched_tick(int inid
                                goto out;
                        }
  
-                       ts->idle_tick = ts->sched_timer.expires;
+                       ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
                        ts->tick_stopped = 1;
                        ts->idle_jiffies = last_jiffies;
                        rcu_enter_nohz();
@@@ -377,32 -377,6 +377,32 @@@ ktime_t tick_nohz_get_sleep_length(void
        return ts->sleep_length;
  }
  
-       ts->sched_timer.expires = ts->idle_tick;
 +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 +{
 +      hrtimer_cancel(&ts->sched_timer);
-                       hrtimer_start(&ts->sched_timer,
-                                     ts->sched_timer.expires,
++      hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
 +
 +      while (1) {
 +              /* Forward the time to expire in the future */
 +              hrtimer_forward(&ts->sched_timer, now, tick_period);
 +
 +              if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-                       if (!tick_program_event(ts->sched_timer.expires, 0))
++                      hrtimer_start_expires(&ts->sched_timer,
 +                                    HRTIMER_MODE_ABS);
 +                      /* Check, if the timer was already in the past */
 +                      if (hrtimer_active(&ts->sched_timer))
 +                              break;
 +              } else {
++                      if (!tick_program_event(
++                              hrtimer_get_expires(&ts->sched_timer), 0))
 +                              break;
 +              }
 +              /* Update jiffies and reread time */
 +              tick_do_update_jiffies64(now);
 +              now = ktime_get();
 +      }
 +}
 +
  /**
   * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
   *
@@@ -456,14 -430,35 +456,16 @@@ void tick_nohz_restart_sched_tick(void
         */
        ts->tick_stopped  = 0;
        ts->idle_exittime = now;
 -      hrtimer_cancel(&ts->sched_timer);
 -      hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
 -      while (1) {
 -              /* Forward the time to expire in the future */
 -              hrtimer_forward(&ts->sched_timer, now, tick_period);
 +      tick_nohz_restart(ts, now);
 -              if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 -                      hrtimer_start_expires(&ts->sched_timer,
 -                                    HRTIMER_MODE_ABS);
 -                      /* Check, if the timer was already in the past */
 -                      if (hrtimer_active(&ts->sched_timer))
 -                              break;
 -              } else {
 -                      if (!tick_program_event(
 -                              hrtimer_get_expires(&ts->sched_timer), 0))
 -                              break;
 -              }
 -              /* Update jiffies and reread time */
 -              tick_do_update_jiffies64(now);
 -              now = ktime_get();
 -      }
        local_irq_enable();
  }
  
  static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
  {
        hrtimer_forward(&ts->sched_timer, now, tick_period);
-       return tick_program_event(ts->sched_timer.expires, 0);
+       return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
  }
  
  /*
@@@ -508,6 -503,10 +510,6 @@@ static void tick_nohz_handler(struct cl
        update_process_times(user_mode(regs));
        profile_tick(CPU_PROFILING);
  
 -      /* Do not restart, when we are in the idle loop */
 -      if (ts->tick_stopped)
 -              return;
 -
        while (tick_nohz_reprogram(ts, now)) {
                now = ktime_get();
                tick_do_update_jiffies64(now);
@@@ -542,7 -541,7 +544,7 @@@ static void tick_nohz_switch_to_nohz(vo
        next = tick_init_jiffy_update();
  
        for (;;) {
-               ts->sched_timer.expires = next;
+               hrtimer_set_expires(&ts->sched_timer, next);
                if (!tick_program_event(next, 0))
                        break;
                next = ktime_add(next, tick_period);
               smp_processor_id());
  }
  
-       delta = ktime_sub(ts->sched_timer.expires, now);
 +/*
 + * When NOHZ is enabled and the tick is stopped, we need to kick the
 + * tick timer from irq_enter() so that the jiffies update is kept
 + * alive during long running softirqs. That's ugly as hell, but
 + * correctness is key even if we need to fix the offending softirq in
 + * the first place.
 + *
 + * Note, this is different to tick_nohz_restart. We just kick the
 + * timer and do not touch the other magic bits which need to be done
 + * when idle is left.
 + */
 +static void tick_nohz_kick_tick(int cpu)
 +{
 +      struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 +      ktime_t delta, now;
 +
 +      if (!ts->tick_stopped)
 +              return;
 +
 +      /*
 +       * Do not touch the tick device, when the next expiry is either
 +       * already reached or less/equal than the tick period.
 +       */
 +      now = ktime_get();
++      delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
 +      if (delta.tv64 <= tick_period.tv64)
 +              return;
 +
 +      tick_nohz_restart(ts, now);
 +}
 +
  #else
  
  static inline void tick_nohz_switch_to_nohz(void) { }
  
  #endif /* NO_HZ */
  
 +/*
 + * Called from irq_enter to notify about the possible interruption of idle()
 + */
 +void tick_check_idle(int cpu)
 +{
 +      tick_check_oneshot_broadcast(cpu);
 +#ifdef CONFIG_NO_HZ
 +      tick_nohz_stop_idle(cpu);
 +      tick_nohz_update_jiffies();
 +      tick_nohz_kick_tick(cpu);
 +#endif
 +}
 +
  /*
   * High resolution timer specific code
   */
@@@ -656,6 -611,10 +658,6 @@@ static enum hrtimer_restart tick_sched_
                profile_tick(CPU_PROFILING);
        }
  
 -      /* Do not restart, when we are in the idle loop */
 -      if (ts->tick_stopped)
 -              return HRTIMER_NORESTART;
 -
        hrtimer_forward(timer, now, tick_period);
  
        return HRTIMER_RESTART;
@@@ -678,16 -637,15 +680,15 @@@ void tick_setup_sched_timer(void
        ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  
        /* Get the next period (per cpu) */
-       ts->sched_timer.expires = tick_init_jiffy_update();
+       hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
        offset = ktime_to_ns(tick_period) >> 1;
        do_div(offset, num_possible_cpus());
        offset *= smp_processor_id();
-       ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
+       hrtimer_add_expires_ns(&ts->sched_timer, offset);
  
        for (;;) {
                hrtimer_forward(&ts->sched_timer, now, tick_period);
-               hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
                /* Check, if the timer was already in the past */
                if (hrtimer_active(&ts->sched_timer))
                        break;
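The tick-sched.c conversions above stop touching sched_timer.expires directly and go through accessors instead. A rough sketch of what those helpers are assumed to look like with the split hard/soft expiry fields (the real inlines live in include/linux/hrtimer.h and may differ in detail):

	/* Keep the hard and soft expiry of a (range) hrtimer in sync. */
	static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
	{
		timer->_softexpires = time;
		timer->_expires = time;
	}

	static inline ktime_t hrtimer_get_expires(struct hrtimer *timer)
	{
		return timer->_expires;
	}

	static inline ktime_t hrtimer_get_softexpires(struct hrtimer *timer)
	{
		return timer->_softexpires;
	}

	static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
	{
		timer->_expires     = ktime_add_ns(timer->_expires, ns);
		timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
	}
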
diff --combined kernel/time/timer_list.c
index f6426911e35a6da3a711e8b8e2cb602a5057a2dc,122ee751d2d1e16e6429489f70bc7ef13ba48a94..a999b92a12773750daded0c912a822b6f0eebe84
@@@ -47,14 -47,13 +47,14 @@@ static void print_name_offset(struct se
  }
  
  static void
 -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
 +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
 +          int idx, u64 now)
  {
  #ifdef CONFIG_TIMER_STATS
        char tmp[TASK_COMM_LEN + 1];
  #endif
        SEQ_printf(m, " #%d: ", idx);
 -      print_name_offset(m, timer);
 +      print_name_offset(m, taddr);
        SEQ_printf(m, ", ");
        print_name_offset(m, timer->function);
        SEQ_printf(m, ", S:%02lx", timer->state);
        SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
  #endif
        SEQ_printf(m, "\n");
-       SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n",
-               (unsigned long long)ktime_to_ns(timer->expires),
-               (long long)(ktime_to_ns(timer->expires) - now));
+       SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
+               (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
+               (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
+               (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
+               (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
  }
  
  static void
@@@ -100,7 -101,7 +102,7 @@@ next_one
                tmp = *timer;
                spin_unlock_irqrestore(&base->cpu_base->lock, flags);
  
 -              print_timer(m, &tmp, i, now);
 +              print_timer(m, timer, &tmp, i, now);
                next++;
                goto next_one;
        }
  static void
  print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
  {
 +      SEQ_printf(m, "  .base:       %p\n", base);
        SEQ_printf(m, "  .index:      %d\n",
                        base->index);
        SEQ_printf(m, "  .resolution: %Lu nsecs\n",
@@@ -185,16 -185,12 +187,16 @@@ static void print_cpu(struct seq_file *
  
  #ifdef CONFIG_GENERIC_CLOCKEVENTS
  static void
 -print_tickdevice(struct seq_file *m, struct tick_device *td)
 +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
  {
        struct clock_event_device *dev = td->evtdev;
  
        SEQ_printf(m, "\n");
        SEQ_printf(m, "Tick Device: mode:     %d\n", td->mode);
 +      if (cpu < 0)
 +              SEQ_printf(m, "Broadcast device\n");
 +      else
 +              SEQ_printf(m, "Per CPU device: %d\n", cpu);
  
        SEQ_printf(m, "Clock Event Device: ");
        if (!dev) {
@@@ -228,7 -224,7 +230,7 @@@ static void timer_list_show_tickdevices
        int cpu;
  
  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 -      print_tickdevice(m, tick_get_broadcast_device());
 +      print_tickdevice(m, tick_get_broadcast_device(), -1);
        SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
                   tick_get_broadcast_mask()->bits[0]);
  #ifdef CONFIG_TICK_ONESHOT
        SEQ_printf(m, "\n");
  #endif
        for_each_online_cpu(cpu)
 -                 print_tickdevice(m, tick_get_device(cpu));
 +              print_tickdevice(m, tick_get_device(cpu), cpu);
        SEQ_printf(m, "\n");
  }
  #else
@@@ -250,7 -246,7 +252,7 @@@ static int timer_list_show(struct seq_f
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
  
 -      SEQ_printf(m, "Timer List Version: v0.3\n");
 +      SEQ_printf(m, "Timer List Version: v0.4\n");
        SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
        SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);