#include <linux/errno.h>
#include <linux/init.h>
#include <linux/pci.h>
++ + #include <linux/delay.h>
#include <asm/io.h>
/*
*/
static int verify_pmtmr_rate(void)
{
-- - u32 value1, value2;
++ + cycle_t value1, value2;
unsigned long count, delta;
mach_prepare_counter();
-- - value1 = read_pmtmr();
++ + value1 = clocksource_acpi_pm.read();
mach_countup(&count);
-- - value2 = read_pmtmr();
++ + value2 = clocksource_acpi_pm.read();
delta = (value2 - value1) & ACPI_PM_MASK;
/* Check that the PMTMR delta is within 5% of what we expect */
#define verify_pmtmr_rate() (0)
#endif
++ + /* Number of monotonicity checks to perform during initialization */
++ + #define ACPI_PM_MONOTONICITY_CHECKS 10
++ ++ /* Number of reads we try to get two different values */
++ ++ #define ACPI_PM_READ_CHECKS 10000
++ +
static int __init init_acpi_pm_clocksource(void)
{
-- - u32 value1, value2;
-- - unsigned int i;
++ + cycle_t value1, value2;
- unsigned int i, j, good = 0;
++ ++ unsigned int i, j = 0;
if (!pmtmr_ioport)
return -ENODEV;
clocksource_acpi_pm.shift);
/* "verify" this timing source: */
-- - value1 = read_pmtmr();
-- - for (i = 0; i < 10000; i++) {
-- - value2 = read_pmtmr();
-- - if (value2 == value1)
-- - continue;
-- - if (value2 > value1)
-- - goto pm_good;
-- - if ((value2 < value1) && ((value2) < 0xFFF))
-- - goto pm_good;
-- - printk(KERN_INFO "PM-Timer had inconsistent results:"
-- - " 0x%#x, 0x%#x - aborting.\n", value1, value2);
-- - return -EINVAL;
++ + for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) {
++ ++ udelay(100 * j);
++ + value1 = clocksource_acpi_pm.read();
- for (i = 0; i < 10000; i++) {
++ ++ for (i = 0; i < ACPI_PM_READ_CHECKS; i++) {
++ + value2 = clocksource_acpi_pm.read();
++ + if (value2 == value1)
++ + continue;
++ + if (value2 > value1)
- good++;
++ + break;
++ + if ((value2 < value1) && ((value2) < 0xFFF))
- good++;
++ + break;
++ + printk(KERN_INFO "PM-Timer had inconsistent results:"
++ + " 0x%#llx, 0x%#llx - aborting.\n",
++ + value1, value2);
++ + return -EINVAL;
++ + }
- udelay(300 * i);
- }
-
- if (good != ACPI_PM_MONOTONICITY_CHECKS) {
- printk(KERN_INFO "PM-Timer failed consistency check "
- " (0x%#llx) - aborting.\n", value1);
- return -ENODEV;
++ ++ if (i == ACPI_PM_READ_CHECKS) {
++ ++ printk(KERN_INFO "PM-Timer failed consistency check "
++ ++ " (0x%#llx) - aborting.\n", value1);
++ ++ return -ENODEV;
++ ++ }
}
-- - printk(KERN_INFO "PM-Timer had no reasonable result:"
-- - " 0x%#x - aborting.\n", value1);
-- - return -ENODEV;
-- - pm_good:
if (verify_pmtmr_rate() != 0)
return -ENODEV;
if (strict_strtoul(arg, 16, &base))
return -EINVAL;
-----
+++++#ifdef CONFIG_X86_64
+++++ if (base > UINT_MAX)
+++++ return -ERANGE;
+++++#endif
printk(KERN_INFO "PMTMR IOPort override: 0x%04x -> 0x%04lx\n",
----- (unsigned int)pmtmr_ioport, base);
+++++ pmtmr_ioport, base);
pmtmr_ioport = base;
return 1;
* switch really is going to happen - do this in
* flush_thread(). - akpm
*/
-- -- SET_PERSONALITY(loc->elf_ex, 0);
++ ++ SET_PERSONALITY(loc->elf_ex);
interpreter = open_exec(elf_interpreter);
retval = PTR_ERR(interpreter);
goto out_free_dentry;
} else {
/* Executables without an interpreter also need a personality */
-- -- SET_PERSONALITY(loc->elf_ex, 0);
++ ++ SET_PERSONALITY(loc->elf_ex);
}
/* Flush all traces of the currently running executable */
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
-- -- SET_PERSONALITY(loc->elf_ex, 0);
++ ++ SET_PERSONALITY(loc->elf_ex);
if (elf_read_implies_exec(loc->elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
prstatus->pr_pgrp = task_pgrp_vnr(p);
prstatus->pr_sid = task_session_vnr(p);
if (thread_group_leader(p)) {
++++ + struct task_cputime cputime;
++++ +
/*
---- - * This is the record for the group leader. Add in the
---- - * cumulative times of previous dead threads. This total
---- - * won't include the time of each live thread whose state
---- - * is included in the core dump. The final total reported
---- - * to our parent process when it calls wait4 will include
---- - * those sums as well as the little bit more time it takes
---- - * this and each other thread to finish dying after the
---- - * core dump synchronization phase.
++++ + * This is the record for the group leader. It shows the
++++ + * group-wide total, not its individual thread total.
*/
---- - cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
---- - &prstatus->pr_utime);
---- - cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
---- - &prstatus->pr_stime);
++++ + thread_group_cputime(p, &cputime);
++++ + cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
++++ + cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
cputime_to_timeval(p->utime, &prstatus->pr_utime);
cputime_to_timeval(p->stime, &prstatus->pr_stime);
#include <asm/processor.h>
#include "internal.h"
-- -- /* Gcc optimizes away "strlen(x)" for constant x */
-- -- #define ADDBUF(buffer, string) \
-- -- do { memcpy(buffer, string, strlen(string)); \
-- -- buffer += strlen(string); } while (0)
-- --
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
int i;
sigemptyset(&ignored);
sigemptyset(&caught);
-- -- rcu_read_lock();
if (lock_task_sighand(p, &flags)) {
pending = p->pending.signal;
shpending = p->signal->shared_pending.signal;
qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
unlock_task_sighand(p, &flags);
}
-- -- rcu_read_unlock();
seq_printf(m, "Threads:\t%d\n", num_threads);
seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
return 0;
}
-- /*
-- * Use precise platform statistics if available:
-- */
-- #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-- static cputime_t task_utime(struct task_struct *p)
-- {
-- return p->utime;
-- }
--
-- static cputime_t task_stime(struct task_struct *p)
-- {
-- return p->stime;
-- }
-- #else
-- static cputime_t task_utime(struct task_struct *p)
-- {
-- clock_t utime = cputime_to_clock_t(p->utime),
-- total = utime + cputime_to_clock_t(p->stime);
-- u64 temp;
--
-- /*
-- * Use CFS's precise accounting:
-- */
-- temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
--
-- if (total) {
-- temp *= utime;
-- do_div(temp, total);
-- }
-- utime = (clock_t)temp;
--
-- p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
-- return p->prev_utime;
-- }
--
-- static cputime_t task_stime(struct task_struct *p)
-- {
-- clock_t stime;
--
-- /*
-- * Use CFS's precise accounting. (we subtract utime from
-- * the total, to make sure the total observed by userspace
-- * grows monotonically - apps rely on that):
-- */
-- stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-- cputime_to_clock_t(task_utime(p));
--
-- if (stime >= 0)
-- p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
--
-- return p->prev_stime;
-- }
-- #endif
--
-- static cputime_t task_gtime(struct task_struct *p)
-- {
-- return p->gtime;
-- }
--
static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
/* add up live thread stats at the group level */
if (whole) {
++++ + struct task_cputime cputime;
struct task_struct *t = task;
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
---- - utime = cputime_add(utime, task_utime(t));
---- - stime = cputime_add(stime, task_stime(t));
gtime = cputime_add(gtime, task_gtime(t));
t = next_thread(t);
} while (t != task);
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
---- - utime = cputime_add(utime, sig->utime);
---- - stime = cputime_add(stime, sig->stime);
++++ + thread_group_cputime(task, &cputime);
++++ + utime = cputime.utime;
++++ + stime = cputime.stime;
gtime = cputime_add(gtime, sig->gtime);
}
* HRTIMER_CB_IRQSAFE: Callback may run in hardirq context
* HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and
* does not restart the timer
-- -- * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: Callback must run in hardirq context
-- -- * Special mode for tick emultation
++ ++ * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context
++ ++ * Special mode for tick emulation and
++ ++ * scheduler timer. Such timers are per
++ ++ * cpu and not allowed to be migrated on
++ ++ * cpu unplug.
++ ++ * HRTIMER_CB_IRQSAFE_UNLOCKED: Callback should run in hardirq context
++ ++ * with timer->base lock unlocked
++ ++ * used for timers which call wakeup to
++ ++ * avoid lock order problems with rq->lock
*/
enum hrtimer_cb_mode {
HRTIMER_CB_SOFTIRQ,
HRTIMER_CB_IRQSAFE,
HRTIMER_CB_IRQSAFE_NO_RESTART,
-- -- HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
++ ++ HRTIMER_CB_IRQSAFE_PERCPU,
++ ++ HRTIMER_CB_IRQSAFE_UNLOCKED,
};
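The modes above are chosen by the timer's owner before the timer is started. A minimal sketch of that pattern (illustrative only, not part of this diff; my_tick and my_tick_fn are made-up names):

/* Illustrative sketch -- not part of this diff. */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_tick;			/* hypothetical per-cpu timer */

static enum hrtimer_restart my_tick_fn(struct hrtimer *timer)
{
	return HRTIMER_NORESTART;		/* one-shot for the example */
}

static void my_tick_setup(void)
{
	hrtimer_init(&my_tick, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_tick.function = my_tick_fn;
	/* hardirq callback that must never be migrated on cpu unplug;
	 * a wakeup-style timer would pick HRTIMER_CB_IRQSAFE_UNLOCKED */
	my_tick.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
	hrtimer_start(&my_tick, ns_to_ktime(NSEC_PER_MSEC), HRTIMER_MODE_REL);
}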
/*
* 0x02 callback function running
* 0x04 callback pending (high resolution mode)
*
-- -- * Special case:
++ ++ * Special cases:
* 0x03 callback function running and enqueued
* (was requeued on another CPU)
++ ++ * 0x09 timer was migrated on CPU hotunplug
* The "callback function running and enqueued" status is only possible on
* SMP. It happens for example when a posix timer expired and the callback
* queued a signal. Between dropping the lock which protects the posix timer
#define HRTIMER_STATE_ENQUEUED 0x01
#define HRTIMER_STATE_CALLBACK 0x02
#define HRTIMER_STATE_PENDING 0x04
++ ++ #define HRTIMER_STATE_MIGRATE 0x08
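For reference (illustrative, not in the patch), the combined values called out in the comment above are plain ORs of these bits:

/* Illustrative only -- not part of this diff. */
#define EXAMPLE_RUNNING_AND_ENQUEUED \
	(HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_CALLBACK)	/* 0x03 */
#define EXAMPLE_ENQUEUED_AND_MIGRATED \
	(HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_MIGRATE)	/* 0x09 */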
/**
* struct hrtimer - the basic hrtimer structure
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
unsigned long state;
- ---- enum hrtimer_cb_mode cb_mode;
struct list_head cb_entry;
+ ++++ enum hrtimer_cb_mode cb_mode;
#ifdef CONFIG_TIMER_STATS
+ ++++ int start_pid;
void *start_site;
char start_comm[16];
- ---- int start_pid;
#endif
};
* @first: pointer to the timer node which expires first
* @resolution: the resolution of the clock, in nanoseconds
* @get_time: function to retrieve the current time of the clock
- ---- * @get_softirq_time: function to retrieve the current time from the softirq
* @softirq_time: the time when running the hrtimer queue in the softirq
* @offset: offset of this clock to the monotonic base
- ---- * @reprogram: function to reprogram the timer event
*/
struct hrtimer_clock_base {
struct hrtimer_cpu_base *cpu_base;
struct rb_node *first;
ktime_t resolution;
ktime_t (*get_time)(void);
- ---- ktime_t (*get_softirq_time)(void);
ktime_t softirq_time;
#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t offset;
- ---- int (*reprogram)(struct hrtimer *t,
- ---- struct hrtimer_clock_base *b,
- ---- ktime_t n);
#endif
};
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
-- -- #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
++ ++ #if USE_SPLIT_PTLOCKS
/*
* The mm counters are not protected by its page_table_lock,
* so must be incremented atomically.
#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-- -- #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
++ ++ #else /* !USE_SPLIT_PTLOCKS */
/*
* The mm counters are protected by its page_table_lock,
* so can be incremented directly.
#define inc_mm_counter(mm, member) (mm)->_##member++
#define dec_mm_counter(mm, member) (mm)->_##member--
-- -- #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
++ ++ #endif /* !USE_SPLIT_PTLOCKS */
#define get_mm_rss(mm) \
(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
unsigned long ac_minflt, ac_majflt;
};
++++ +/**
++++ + * struct task_cputime - collected CPU time counts
++++ + * @utime: time spent in user mode, in &cputime_t units
++++ + * @stime: time spent in kernel mode, in &cputime_t units
++++ + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
++++ + *
++++ + * This structure groups together three kinds of CPU time that are
++++ + * tracked for threads and thread groups. Most things considering
++++ + * CPU time want to group these counts together and treat all three
++++ + * of them in parallel.
++++ + */
++++ +struct task_cputime {
++++ + cputime_t utime;
++++ + cputime_t stime;
++++ + unsigned long long sum_exec_runtime;
++++ +};
++++ +/* Alternate field names when used to cache expirations. */
++++ +#define prof_exp stime
++++ +#define virt_exp utime
++++ +#define sched_exp sum_exec_runtime
++++ +
++++ +/**
++++ + * struct thread_group_cputime - thread group interval timer counts
++++ + * @totals: thread group interval timers; substructure for
++++ + * uniprocessor kernel, per-cpu for SMP kernel.
++++ + *
++++ + * This structure contains the version of task_cputime, above, that is
++++ + * used for thread group CPU clock calculations.
++++ + */
++++ +struct thread_group_cputime {
++++ + struct task_cputime *totals;
++++ +};
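A minimal sketch of how a consumer reads these group-wide totals (illustrative only, not part of this diff; show_group_times is a made-up name, the pattern mirrors the fs/binfmt_elf.c and fs/proc changes elsewhere in this series):

/* Illustrative sketch -- not part of this diff. */
#include <linux/sched.h>
#include <linux/time.h>

static void show_group_times(struct task_struct *tsk)
{
	struct task_cputime times;
	struct timeval utv, stv;

	/* sums the group leader plus all live and dead threads */
	thread_group_cputime(tsk, &times);
	cputime_to_timeval(times.utime, &utv);
	cputime_to_timeval(times.stime, &stv);
	/* times.sum_exec_runtime holds the group's scheduled ns total */
}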
++++ +
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
* - everyone except group_exit_task is stopped during signal delivery
* of fatal signals, group_exit_task processes the signal.
*/
-- -- struct task_struct *group_exit_task;
int notify_count;
++ ++ struct task_struct *group_exit_task;
/* thread group stop support, overloads group_exit_code too */
int group_stop_count;
cputime_t it_prof_expires, it_virt_expires;
cputime_t it_prof_incr, it_virt_incr;
++++ + /*
++++ + * Thread group totals for process CPU clocks.
++++ + * See thread_group_cputime(), et al, for details.
++++ + */
++++ + struct thread_group_cputime cputime;
++++ +
++++ + /* Earliest-expiration cache. */
++++ + struct task_cputime cputime_expires;
++++ +
++++ + struct list_head cpu_timers[3];
++++ +
/* job control IDs */
/*
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
*/
---- - cputime_t utime, stime, cutime, cstime;
++++ + cputime_t cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long inblock, oublock, cinblock, coublock;
struct task_io_accounting ioac;
---- - /*
---- - * Cumulative ns of scheduled CPU time for dead threads in the
---- - * group, not including a zombie group leader. (This only differs
---- - * from jiffies_to_ns(utime + stime) if sched_clock uses something
---- - * other than jiffies.)
---- - */
---- - unsigned long long sum_sched_runtime;
---- -
/*
* We don't bother to synchronize most readers of this at all,
* because there is no reader checking a limit that actually needs
*/
struct rlimit rlim[RLIM_NLIMITS];
---- - struct list_head cpu_timers[3];
---- -
/* keep the process-shared keyrings here so that they do the right
* thing in threads created with CLONE_THREAD */
#ifdef CONFIG_KEYS
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
++ ++ #ifdef CONFIG_SCHED_DEBUG
++ ++ char *name;
++ ++ #endif
};
extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
void (*yield_task) (struct rq *rq);
int (*select_task_rq)(struct task_struct *p, int sync);
-- -- void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
++ ++ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
struct task_struct * (*pick_next_task) (struct rq *rq);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
struct sched_rt_entity {
struct list_head run_list;
-- -- unsigned int time_slice;
unsigned long timeout;
++ ++ unsigned int time_slice;
int nr_cpus_allowed;
struct sched_rt_entity *back;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;
---- - cputime_t it_prof_expires, it_virt_expires;
---- - unsigned long long it_sched_expires;
++++ + struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
/* process credentials */
__put_task_struct(t);
}
++ extern cputime_t task_utime(struct task_struct *p);
++ extern cputime_t task_stime(struct task_struct *p);
++ extern cputime_t task_gtime(struct task_struct *p);
++
/*
* Per process flags
*/
extern unsigned long long
task_sched_runtime(struct task_struct *task);
++++ +extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
#endif
}
++++ +/*
++++ + * Thread group CPU time accounting.
++++ + */
++++ +
++++ +extern int thread_group_cputime_alloc(struct task_struct *);
++++ +extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
++++ +
++++ +static inline void thread_group_cputime_init(struct signal_struct *sig)
++++ +{
++++ + sig->cputime.totals = NULL;
++++ +}
++++ +
++++ +static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
++++ +{
++++ + if (curr->signal->cputime.totals)
++++ + return 0;
++++ + return thread_group_cputime_alloc(curr);
++++ +}
++++ +
++++ +static inline void thread_group_cputime_free(struct signal_struct *sig)
++++ +{
++++ + free_percpu(sig->cputime.totals);
++++ +}
++++ +
/*
* Reevaluate whether the task has signals pending delivery.
* Wake the task if so.
#ifdef __KERNEL__
++ ++ extern struct timezone sys_tz;
++ ++
/* Parameters used to convert the timespec values: */
#define MSEC_PER_SEC 1000L
#define USEC_PER_MSEC 1000L
extern unsigned int alarm_setitimer(unsigned int seconds);
extern int do_getitimer(int which, struct itimerval *value);
extern void getnstimeofday(struct timespec *tv);
+++++extern void getrawmonotonic(struct timespec *ts);
extern void getboottime(struct timespec *ts);
extern void monotonic_to_bootbased(struct timespec *ts);
extern void update_wall_time(void);
extern void update_xtime_cache(u64 nsec);
++++ +struct tms;
++++ +extern void do_sys_times(struct tms *);
++++ +
/**
* timespec_to_ns - Convert timespec to nanoseconds
* @ts: pointer to the timespec variable to be converted
#define CLOCK_MONOTONIC 1
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
+++++#define CLOCK_MONOTONIC_RAW 4
/*
* The IDs of various hardware clocks:
#include <linux/timex.h>
#include <linux/migrate.h>
#include <linux/posix-timers.h>
++++ +#include <linux/times.h>
#include <asm/uaccess.h>
++ ++ /*
++ ++ * Note that the native side is already converted to a timespec, because
++ ++ * that's what we want anyway.
++ ++ */
++ ++ static int compat_get_timeval(struct timespec *o,
++ ++ struct compat_timeval __user *i)
++ ++ {
++ ++ long usec;
++ ++
++ ++ if (get_user(o->tv_sec, &i->tv_sec) ||
++ ++ get_user(usec, &i->tv_usec))
++ ++ return -EFAULT;
++ ++ o->tv_nsec = usec * 1000;
++ ++ return 0;
++ ++ }
++ ++
++ ++ static int compat_put_timeval(struct compat_timeval __user *o,
++ ++ struct timeval *i)
++ ++ {
++ ++ return (put_user(i->tv_sec, &o->tv_sec) ||
++ ++ put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
++ ++ }
++ ++
++ ++ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
++ ++ struct timezone __user *tz)
++ ++ {
++ ++ if (tv) {
++ ++ struct timeval ktv;
++ ++ do_gettimeofday(&ktv);
++ ++ if (compat_put_timeval(tv, &ktv))
++ ++ return -EFAULT;
++ ++ }
++ ++ if (tz) {
++ ++ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
++ ++ return -EFAULT;
++ ++ }
++ ++
++ ++ return 0;
++ ++ }
++ ++
++ ++ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
++ ++ struct timezone __user *tz)
++ ++ {
++ ++ struct timespec kts;
++ ++ struct timezone ktz;
++ ++
++ ++ if (tv) {
++ ++ if (compat_get_timeval(&kts, tv))
++ ++ return -EFAULT;
++ ++ }
++ ++ if (tz) {
++ ++ if (copy_from_user(&ktz, tz, sizeof(ktz)))
++ ++ return -EFAULT;
++ ++ }
++ ++
++ ++ return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
++ ++ }
++ ++
int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
return 0;
}
++++ +static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
++++ +{
++++ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
++++ +}
++++ +
asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
{
---- - /*
---- - * In the SMP world we might just be unlucky and have one of
---- - * the times increment as we use it. Since the value is an
---- - * atomically safe type this is just fine. Conceptually its
---- - * as if the syscall took an instant longer to occur.
---- - */
if (tbuf) {
++++ + struct tms tms;
struct compat_tms tmp;
---- - struct task_struct *tsk = current;
---- - struct task_struct *t;
---- - cputime_t utime, stime, cutime, cstime;
---- -
---- - read_lock(&tasklist_lock);
---- - utime = tsk->signal->utime;
---- - stime = tsk->signal->stime;
---- - t = tsk;
---- - do {
---- - utime = cputime_add(utime, t->utime);
---- - stime = cputime_add(stime, t->stime);
---- - t = next_thread(t);
---- - } while (t != tsk);
---- -
---- - /*
---- - * While we have tasklist_lock read-locked, no dying thread
---- - * can be updating current->signal->[us]time. Instead,
---- - * we got their counts included in the live thread loop.
---- - * However, another thread can come in right now and
---- - * do a wait call that updates current->signal->c[us]time.
---- - * To make sure we always see that pair updated atomically,
---- - * we take the siglock around fetching them.
---- - */
---- - spin_lock_irq(&tsk->sighand->siglock);
---- - cutime = tsk->signal->cutime;
---- - cstime = tsk->signal->cstime;
---- - spin_unlock_irq(&tsk->sighand->siglock);
---- - read_unlock(&tasklist_lock);
---- -
---- - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
---- - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
---- - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
---- - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
++++ +
++++ + do_sys_times(&tms);
++++ + /* Convert our struct tms to the compat version. */
++++ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
++++ + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
++++ + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
++++ + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
return -EFAULT;
}
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
-- sig->utime = cputime_add(sig->utime, tsk->utime);
-- sig->stime = cputime_add(sig->stime, tsk->stime);
-- sig->gtime = cputime_add(sig->gtime, tsk->gtime);
-- - sig->utime = cputime_add(sig->utime, task_utime(tsk));
-- - sig->stime = cputime_add(sig->stime, task_stime(tsk));
++ sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
---- - sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
* If there are other users of the mm and the owner (us) is exiting
* we need to find a new owner to take on the responsibility.
*/
-- -- if (!mm)
-- -- return 0;
if (atomic_read(&mm->mm_users) <= 1)
return 0;
if (mm->owner != p)
} while_each_thread(g, c);
read_unlock(&tasklist_lock);
++ ++ /*
++ ++ * We found no owner yet mm_users > 1: this implies that we are
++ ++ * most likely racing with swapoff (try_to_unuse()) or /proc or
++ ++ * ptrace or page migration (get_task_mm()). Mark owner as NULL,
++ ++ * so that subsystems can understand the callback and take action.
++ ++ */
++ ++ down_write(&mm->mmap_sem);
++ ++ cgroup_mm_owner_callbacks(mm->owner, NULL);
++ ++ mm->owner = NULL;
++ ++ up_write(&mm->mmap_sem);
return;
assign_new_owner:
BUG_ON(c == p);
get_task_struct(c);
++ ++ read_unlock(&tasklist_lock);
++ ++ down_write(&mm->mmap_sem);
/*
* The task_lock protects c->mm from changing.
* We always want mm->owner->mm == mm
*/
task_lock(c);
-- -- /*
-- -- * Delay read_unlock() till we have the task_lock()
-- -- * to ensure that c does not slip away underneath us
-- -- */
-- -- read_unlock(&tasklist_lock);
if (c->mm != mm) {
task_unlock(c);
++ ++ up_write(&mm->mmap_sem);
put_task_struct(c);
goto retry;
}
cgroup_mm_owner_callbacks(mm->owner, c);
mm->owner = c;
task_unlock(c);
++ ++ up_write(&mm->mmap_sem);
put_task_struct(c);
}
#endif /* CONFIG_MM_OWNER */
* the child reaper process (ie "init") in our pid
* space.
*/
++ static struct task_struct *find_new_reaper(struct task_struct *father)
++ {
++ struct pid_namespace *pid_ns = task_active_pid_ns(father);
++ struct task_struct *thread;
++
++ thread = father;
++ while_each_thread(father, thread) {
++ if (thread->flags & PF_EXITING)
++ continue;
++ if (unlikely(pid_ns->child_reaper == father))
++ pid_ns->child_reaper = thread;
++ return thread;
++ }
++
++ if (unlikely(pid_ns->child_reaper == father)) {
++ write_unlock_irq(&tasklist_lock);
++ if (unlikely(pid_ns == &init_pid_ns))
++ panic("Attempted to kill init!");
++
++ zap_pid_ns_processes(pid_ns);
++ write_lock_irq(&tasklist_lock);
++ /*
++ * We can not clear ->child_reaper or leave it alone.
++ * There may be stealth EXIT_DEAD tasks on ->children,
++ * forget_original_parent() must move them somewhere.
++ */
++ pid_ns->child_reaper = init_pid_ns.child_reaper;
++ }
++
++ return pid_ns->child_reaper;
++ }
++
static void forget_original_parent(struct task_struct *father)
{
-- struct task_struct *p, *n, *reaper = father;
++ struct task_struct *p, *n, *reaper;
LIST_HEAD(ptrace_dead);
write_lock_irq(&tasklist_lock);
--
++ reaper = find_new_reaper(father);
/*
* First clean up ptrace if we were using it.
*/
ptrace_exit(father, &ptrace_dead);
-- do {
-- reaper = next_thread(reaper);
-- if (reaper == father) {
-- reaper = task_child_reaper(father);
-- break;
-- }
-- } while (reaper->flags & PF_EXITING);
--
list_for_each_entry_safe(p, n, &father->children, sibling) {
p->real_parent = reaper;
if (p->parent == father) {
/* mt-exec, de_thread() is waiting for us */
if (thread_group_leader(tsk) &&
-- tsk->signal->notify_count < 0 &&
-- tsk->signal->group_exit_task)
++ tsk->signal->group_exit_task &&
++ tsk->signal->notify_count < 0)
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
static inline void check_stack_usage(void) {}
#endif
-- static inline void exit_child_reaper(struct task_struct *tsk)
-- {
-- if (likely(tsk->group_leader != task_child_reaper(tsk)))
-- return;
--
-- if (tsk->nsproxy->pid_ns == &init_pid_ns)
-- panic("Attempted to kill init!");
--
-- /*
-- * @tsk is the last thread in the 'cgroup-init' and is exiting.
-- * Terminate all remaining processes in the namespace and reap them
-- * before exiting @tsk.
-- *
-- * Note that @tsk (last thread of cgroup-init) may not necessarily
-- * be the child-reaper (i.e main thread of cgroup-init) of the
-- * namespace i.e the child_reaper may have already exited.
-- *
-- * Even after a child_reaper exits, we let it inherit orphaned children,
-- * because, pid_ns->child_reaper remains valid as long as there is
-- * at least one living sub-thread in the cgroup init.
--
-- * This living sub-thread of the cgroup-init will be notified when
-- * a child inherited by the 'child-reaper' exits (do_notify_parent()
-- * uses __group_send_sig_info()). Further, when reaping child processes,
-- * do_wait() iterates over children of all living sub threads.
--
-- * i.e even though 'child_reaper' thread is listed as the parent of the
-- * orphaned children, any living sub-thread in the cgroup-init can
-- * perform the role of the child_reaper.
-- */
-- zap_pid_ns_processes(tsk->nsproxy->pid_ns);
-- }
--
NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
}
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
-- exit_child_reaper(tsk);
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
}
if (likely(!traced)) {
struct signal_struct *psig;
struct signal_struct *sig;
++++ + struct task_cputime cputime;
/*
* The resource counters for the group leader are in its
* need to protect the access to p->parent->signal fields,
* as other threads in the parent group can be right
* here reaping other children at the same time.
++++ + *
++++ + * We use thread_group_cputime() to get times for the thread
++++ + * group, which consolidates times for all threads in the
++++ + * group including the group leader.
*/
spin_lock_irq(&p->parent->sighand->siglock);
psig = p->parent->signal;
sig = p->signal;
++++ + thread_group_cputime(p, &cputime);
psig->cutime =
cputime_add(psig->cutime,
---- - cputime_add(p->utime,
---- - cputime_add(sig->utime,
---- - sig->cutime)));
++++ + cputime_add(cputime.utime,
++++ + sig->cutime));
psig->cstime =
cputime_add(psig->cstime,
---- - cputime_add(p->stime,
---- - cputime_add(sig->stime,
---- - sig->cstime)));
++++ + cputime_add(cputime.stime,
++++ + sig->cstime));
psig->cgtime =
cputime_add(psig->cgtime,
cputime_add(p->gtime,
kmem_cache_free(sighand_cachep, sighand);
}
++++ +
++++ +/*
++++ + * Initialize POSIX timer handling for a thread group.
++++ + */
++++ +static void posix_cpu_timers_init_group(struct signal_struct *sig)
++++ +{
++++ + /* Thread group counters. */
++++ + thread_group_cputime_init(sig);
++++ +
++++ + /* Expiration times and increments. */
++++ + sig->it_virt_expires = cputime_zero;
++++ + sig->it_virt_incr = cputime_zero;
++++ + sig->it_prof_expires = cputime_zero;
++++ + sig->it_prof_incr = cputime_zero;
++++ +
++++ + /* Cached expiration times. */
++++ + sig->cputime_expires.prof_exp = cputime_zero;
++++ + sig->cputime_expires.virt_exp = cputime_zero;
++++ + sig->cputime_expires.sched_exp = 0;
++++ +
++++ + /* The timer lists. */
++++ + INIT_LIST_HEAD(&sig->cpu_timers[0]);
++++ + INIT_LIST_HEAD(&sig->cpu_timers[1]);
++++ + INIT_LIST_HEAD(&sig->cpu_timers[2]);
++++ +}
++++ +
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
int ret;
if (clone_flags & CLONE_THREAD) {
---- - atomic_inc(&current->signal->count);
---- - atomic_inc(&current->signal->live);
---- - return 0;
++++ + ret = thread_group_cputime_clone_thread(current);
++++ + if (likely(!ret)) {
++++ + atomic_inc(&current->signal->count);
++++ + atomic_inc(&current->signal->live);
++++ + }
++++ + return ret;
}
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
sig->it_real_incr.tv64 = 0;
sig->real_timer.function = it_real_fn;
---- - sig->it_virt_expires = cputime_zero;
---- - sig->it_virt_incr = cputime_zero;
---- - sig->it_prof_expires = cputime_zero;
---- - sig->it_prof_incr = cputime_zero;
---- -
sig->leader = 0; /* session leadership doesn't inherit */
sig->tty_old_pgrp = NULL;
++ ++ sig->tty = NULL;
---- - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
++++ + sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
sig->cgtime = cputime_zero;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
task_io_accounting_init(&sig->ioac);
---- - sig->sum_sched_runtime = 0;
---- - INIT_LIST_HEAD(&sig->cpu_timers[0]);
---- - INIT_LIST_HEAD(&sig->cpu_timers[1]);
---- - INIT_LIST_HEAD(&sig->cpu_timers[2]);
taskstats_tgid_init(sig);
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
task_unlock(current->group_leader);
---- - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
---- - /*
---- - * New sole thread in the process gets an expiry time
---- - * of the whole CPU time limit.
---- - */
---- - tsk->it_prof_expires =
---- - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
---- - }
++++ + posix_cpu_timers_init_group(sig);
++++ +
acct_init_pacct(&sig->pacct);
tty_audit_fork(sig);
void __cleanup_signal(struct signal_struct *sig)
{
++++ + thread_group_cputime_free(sig);
exit_thread_group_keys(sig);
++ ++ tty_kref_put(sig->tty);
kmem_cache_free(signal_cachep, sig);
}
}
#endif /* CONFIG_MM_OWNER */
++++ +/*
++++ + * Initialize POSIX timer handling for a single task.
++++ + */
++++ +static void posix_cpu_timers_init(struct task_struct *tsk)
++++ +{
++++ + tsk->cputime_expires.prof_exp = cputime_zero;
++++ + tsk->cputime_expires.virt_exp = cputime_zero;
++++ + tsk->cputime_expires.sched_exp = 0;
++++ + INIT_LIST_HEAD(&tsk->cpu_timers[0]);
++++ + INIT_LIST_HEAD(&tsk->cpu_timers[1]);
++++ + INIT_LIST_HEAD(&tsk->cpu_timers[2]);
++++ +}
++++ +
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
---- - p->it_virt_expires = cputime_zero;
---- - p->it_prof_expires = cputime_zero;
---- - p->it_sched_expires = 0;
---- - INIT_LIST_HEAD(&p->cpu_timers[0]);
---- - INIT_LIST_HEAD(&p->cpu_timers[1]);
---- - INIT_LIST_HEAD(&p->cpu_timers[2]);
++++ + posix_cpu_timers_init(p);
p->lock_depth = -1; /* -1 = no lock */
do_posix_clock_monotonic_gettime(&p->start_time);
if (clone_flags & CLONE_THREAD) {
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
---- -
---- - if (!cputime_eq(current->signal->it_virt_expires,
---- - cputime_zero) ||
---- - !cputime_eq(current->signal->it_prof_expires,
---- - cputime_zero) ||
---- - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
---- - !list_empty(&current->signal->cpu_timers[0]) ||
---- - !list_empty(&current->signal->cpu_timers[1]) ||
---- - !list_empty(&current->signal->cpu_timers[2])) {
---- - /*
---- - * Have child wake up on its first tick to check
---- - * for process CPU timers.
---- - */
---- - p->it_prof_expires = jiffies_to_cputime(1);
---- - }
}
if (likely(p->pid)) {
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
-- -- p->signal->tty = current->signal->tty;
++ ++ tty_kref_put(p->signal->tty);
++ ++ p->signal->tty = tty_kref_get(current->signal->tty);
set_task_pgrp(p, task_pgrp_nr(current));
set_task_session(p, task_session_nr(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
*/
BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
return 1;
-- -- case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
++ ++ case HRTIMER_CB_IRQSAFE_PERCPU:
++ ++ case HRTIMER_CB_IRQSAFE_UNLOCKED:
/*
* This is solely for the sched tick emulation with
* dynamic tick support to ensure that we do not
* restart the tick right on the edge and end up with
* the tick timer in the softirq ! The calling site
-- -- * takes care of this.
++ ++ * takes care of this. Also used for hrtimer sleeper !
*/
debug_hrtimer_deactivate(timer);
return 1;
timer_stats_account_hrtimer(timer);
fn = timer->function;
-- -- if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
++ ++ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
++ ++ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
/*
* Used for scheduler timers, avoid lock inversion with
* rq->lock and tasklist_lock.
if (!base->first)
continue;
- ---- if (base->get_softirq_time)
- ---- base->softirq_time = base->get_softirq_time();
- ---- else if (gettime) {
+ ++++ if (gettime) {
hrtimer_get_softirq_time(cpu_base);
gettime = 0;
}
sl->timer.function = hrtimer_wakeup;
sl->task = task;
#ifdef CONFIG_HIGH_RES_TIMERS
-- -- sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
++ ++ sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
#endif
}
#ifdef CONFIG_HOTPLUG_CPU
-- -- static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-- -- struct hrtimer_clock_base *new_base)
++ ++ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
++ ++ struct hrtimer_clock_base *new_base, int dcpu)
{
struct hrtimer *timer;
struct rb_node *node;
++ ++ int raise = 0;
while ((node = rb_first(&old_base->active))) {
timer = rb_entry(node, struct hrtimer, node);
BUG_ON(hrtimer_callback_running(timer));
debug_hrtimer_deactivate(timer);
-- -- __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
++ ++
++ ++ /*
++ ++ * Should not happen. Per CPU timers should be
++ ++ * canceled _before_ the migration code is called
++ ++ */
++ ++ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
++ ++ __remove_hrtimer(timer, old_base,
++ ++ HRTIMER_STATE_INACTIVE, 0);
++ ++ WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
++ ++ timer, timer->function, dcpu);
++ ++ continue;
++ ++ }
++ ++
++ ++ /*
++ ++ * Mark it as STATE_MIGRATE not INACTIVE otherwise the
++ ++ * timer could be seen as !active and just vanish away
++ ++ * under us on another CPU
++ ++ */
++ ++ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
timer->base = new_base;
/*
* Enqueue the timer. Allow reprogramming of the event device
*/
enqueue_hrtimer(timer, new_base, 1);
++ ++
++ ++ #ifdef CONFIG_HIGH_RES_TIMERS
++ ++ /*
++ ++ * Happens with high res enabled when the timer was
++ ++ * already expired and the callback mode is
++ ++ * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
++ ++ * enqueue code does not move them to the soft irq
++ ++ * pending list for performance/latency reasons, but
++ ++ * in the migration state, we need to do that
++ ++ * otherwise we end up with a stale timer.
++ ++ */
++ ++ if (timer->state == HRTIMER_STATE_MIGRATE) {
++ ++ timer->state = HRTIMER_STATE_PENDING;
++ ++ list_add_tail(&timer->cb_entry,
++ ++ &new_base->cpu_base->cb_pending);
++ ++ raise = 1;
++ ++ }
++ ++ #endif
++ ++ /* Clear the migration state bit */
++ ++ timer->state &= ~HRTIMER_STATE_MIGRATE;
+ }
++ ++ return raise;
+ }
+
++ ++ #ifdef CONFIG_HIGH_RES_TIMERS
++ ++ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
++ ++ struct hrtimer_cpu_base *new_base)
++ ++ {
++ ++ struct hrtimer *timer;
++ ++ int raise = 0;
++ ++
++ ++ while (!list_empty(&old_base->cb_pending)) {
++ ++ timer = list_entry(old_base->cb_pending.next,
++ ++ struct hrtimer, cb_entry);
++ ++
++ ++ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
++ ++ timer->base = &new_base->clock_base[timer->base->index];
++ ++ list_add_tail(&timer->cb_entry, &new_base->cb_pending);
++ ++ raise = 1;
+ ++ }
++ ++ return raise;
++ ++ }
++ ++ #else
++ ++ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
++ ++ struct hrtimer_cpu_base *new_base)
++ ++ {
++ ++ return 0;
+ ++ }
++ ++ #endif
+ ++
static void migrate_hrtimers(int cpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
-- -- int i;
++ ++ int i, raise = 0;
BUG_ON(cpu_online(cpu));
old_base = &per_cpu(hrtimer_bases, cpu);
new_base = &get_cpu_var(hrtimer_bases);
tick_cancel_sched_timer(cpu);
- ----
- ---- local_irq_disable();
- ---- spin_lock(&new_base->lock);
+ ++++ /*
+ ++++ * The caller is globally serialized and nobody else
+ ++++ * takes two locks at once, deadlock is not possible.
+ ++++ */
+ ++++ spin_lock_irq(&new_base->lock);
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-- -- migrate_hrtimer_list(&old_base->clock_base[i],
-- -- &new_base->clock_base[i]);
++ ++ if (migrate_hrtimer_list(&old_base->clock_base[i],
++ ++ &new_base->clock_base[i], cpu))
++ ++ raise = 1;
}
++ ++ if (migrate_hrtimer_pending(old_base, new_base))
++ ++ raise = 1;
++ ++
spin_unlock(&old_base->lock);
- ---- spin_unlock(&new_base->lock);
- ---- local_irq_enable();
+ ++++ spin_unlock_irq(&new_base->lock);
put_cpu_var(hrtimer_bases);
++ ++
++ ++ if (raise)
++ ++ hrtimer_raise_softirq();
}
#endif /* CONFIG_HOTPLUG_CPU */
return 0;
}
+++++/*
+++++ * Get monotonic time for posix timers
+++++ */
+++++static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+++++{
+++++ getrawmonotonic(tp);
+++++ return 0;
+++++}
+++++
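For context, a userspace-side sketch of the new clock id (illustrative only, not part of this diff; it assumes libc headers that already expose CLOCK_MONOTONIC_RAW, otherwise the value 4 defined above applies, and older glibc needs -lrt for clock_gettime()):

/* Illustrative userspace example -- not part of this diff. */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* raw, hardware-based monotonic time: not slewed by NTP/adjtime */
	if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts))
		return 1;
	printf("%ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}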
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
.clock_get = posix_ktime_get_ts,
.clock_set = do_posix_clock_nosettime,
};
+++++ struct k_clock clock_monotonic_raw = {
+++++ .clock_getres = hrtimer_get_res,
+++++ .clock_get = posix_get_monotonic_raw,
+++++ .clock_set = do_posix_clock_nosettime,
+++++ };
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+++++ register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
int posix_timer_event(struct k_itimer *timr, int si_private)
{
++++ + int shared, ret;
/*
* FIXME: if ->sigq is queued we can race with
* dequeue_signal()->do_schedule_next_timer().
*/
timr->sigq->info.si_sys_private = si_private;
---- - timr->sigq->info.si_signo = timr->it_sigev_signo;
---- - timr->sigq->info.si_code = SI_TIMER;
---- - timr->sigq->info.si_tid = timr->it_id;
---- - timr->sigq->info.si_value = timr->it_sigev_value;
---- -
---- - if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
---- - struct task_struct *leader;
---- - int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
---- -
---- - if (likely(ret >= 0))
---- - return ret;
---- -
---- - timr->it_sigev_notify = SIGEV_SIGNAL;
---- - leader = timr->it_process->group_leader;
---- - put_task_struct(timr->it_process);
---- - timr->it_process = leader;
---- - }
---- -
---- - return send_sigqueue(timr->sigq, timr->it_process, 1);
++++ + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
++++ + ret = send_sigqueue(timr->sigq, timr->it_process, shared);
++++ + /* If we failed to send the signal the timer stops. */
++++ + return ret > 0;
}
EXPORT_SYMBOL_GPL(posix_timer_event);
return tmr;
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
kmem_cache_free(posix_timers_cache, tmr);
-- -- tmr = NULL;
++ ++ return NULL;
}
memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
return tmr;
struct sigevent __user *timer_event_spec,
timer_t __user * created_timer_id)
{
---- - int error = 0;
---- - struct k_itimer *new_timer = NULL;
---- - int new_timer_id;
---- - struct task_struct *process = NULL;
---- - unsigned long flags;
++++ + struct k_itimer *new_timer;
++++ + int error, new_timer_id;
++++ + struct task_struct *process;
sigevent_t event;
int it_id_set = IT_ID_NOT_SET;
goto out;
}
spin_lock_irq(&idr_lock);
---- - error = idr_get_new(&posix_timers_id, (void *) new_timer,
---- - &new_timer_id);
++++ + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
spin_unlock_irq(&idr_lock);
---- - if (error == -EAGAIN)
---- - goto retry;
---- - else if (error) {
++++ + if (error) {
++++ + if (error == -EAGAIN)
++++ + goto retry;
/*
* Weird looking, but we return EAGAIN if the IDR is
* full (proper POSIX return value for this)
error = -EFAULT;
goto out;
}
---- - new_timer->it_sigev_notify = event.sigev_notify;
---- - new_timer->it_sigev_signo = event.sigev_signo;
---- - new_timer->it_sigev_value = event.sigev_value;
---- -
---- - read_lock(&tasklist_lock);
---- - if ((process = good_sigevent(&event))) {
---- - /*
---- - * We may be setting up this process for another
---- - * thread. It may be exiting. To catch this
---- - * case the we check the PF_EXITING flag. If
---- - * the flag is not set, the siglock will catch
---- - * him before it is too late (in exit_itimers).
---- - *
---- - * The exec case is a bit more invloved but easy
---- - * to code. If the process is in our thread
---- - * group (and it must be or we would not allow
---- - * it here) and is doing an exec, it will cause
---- - * us to be killed. In this case it will wait
---- - * for us to die which means we can finish this
---- - * linkage with our last gasp. I.e. no code :)
---- - */
---- - spin_lock_irqsave(&process->sighand->siglock, flags);
---- - if (!(process->flags & PF_EXITING)) {
---- - new_timer->it_process = process;
---- - list_add(&new_timer->list,
---- - &process->signal->posix_timers);
---- - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
---- - get_task_struct(process);
---- - spin_unlock_irqrestore(&process->sighand->siglock, flags);
---- - } else {
---- - spin_unlock_irqrestore(&process->sighand->siglock, flags);
---- - process = NULL;
---- - }
---- - }
---- - read_unlock(&tasklist_lock);
++++ + rcu_read_lock();
++++ + process = good_sigevent(&event);
++++ + if (process)
++++ + get_task_struct(process);
++++ + rcu_read_unlock();
if (!process) {
error = -EINVAL;
goto out;
}
} else {
---- - new_timer->it_sigev_notify = SIGEV_SIGNAL;
---- - new_timer->it_sigev_signo = SIGALRM;
---- - new_timer->it_sigev_value.sival_int = new_timer->it_id;
++++ + event.sigev_notify = SIGEV_SIGNAL;
++++ + event.sigev_signo = SIGALRM;
++++ + event.sigev_value.sival_int = new_timer->it_id;
process = current->group_leader;
---- - spin_lock_irqsave(&process->sighand->siglock, flags);
---- - new_timer->it_process = process;
---- - list_add(&new_timer->list, &process->signal->posix_timers);
---- - spin_unlock_irqrestore(&process->sighand->siglock, flags);
++++ + get_task_struct(process);
}
++++ + new_timer->it_sigev_notify = event.sigev_notify;
++++ + new_timer->sigq->info.si_signo = event.sigev_signo;
++++ + new_timer->sigq->info.si_value = event.sigev_value;
++++ + new_timer->sigq->info.si_tid = new_timer->it_id;
++++ + new_timer->sigq->info.si_code = SI_TIMER;
++++ +
++++ + spin_lock_irq(&current->sighand->siglock);
++++ + new_timer->it_process = process;
++++ + list_add(&new_timer->list, &current->signal->posix_timers);
++++ + spin_unlock_irq(&current->sighand->siglock);
++++ +
++++ + return 0;
/*
* In the case of the timer belonging to another task, after
* the task is unlocked, the timer is owned by the other task
* and may cease to exist at any time. Don't use or modify
* new_timer after the unlock call.
*/
---- -
out:
---- - if (error)
---- - release_posix_timer(new_timer, it_id_set);
---- -
++++ + release_posix_timer(new_timer, it_id_set);
return error;
}
* the find to the timer lock. To avoid a deadlock, the timer id MUST
* be released without holding the timer lock.
*/
---- -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
++++ +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
/*
* flags part over to the timer lock. Must not let interrupts in
* while we are moving the lock.
*/
---- -
spin_lock_irqsave(&idr_lock, *flags);
---- - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
++++ + timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
spin_lock(&timr->it_lock);
---- -
---- - if ((timr->it_id != timer_id) || !(timr->it_process) ||
---- - !same_thread_group(timr->it_process, current)) {
---- - spin_unlock(&timr->it_lock);
---- - spin_unlock_irqrestore(&idr_lock, *flags);
---- - timr = NULL;
---- - } else
++++ + if (timr->it_process &&
++++ + same_thread_group(timr->it_process, current)) {
spin_unlock(&idr_lock);
---- - } else
---- - spin_unlock_irqrestore(&idr_lock, *flags);
++++ + return timr;
++++ + }
++++ + spin_unlock(&timr->it_lock);
++++ + }
++++ + spin_unlock_irqrestore(&idr_lock, *flags);
---- - return timr;
++++ + return NULL;
}
/*
* This keeps any tasks waiting on the spin lock from thinking
* they got something (see the lock code above).
*/
---- - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
---- - put_task_struct(timer->it_process);
++++ + put_task_struct(timer->it_process);
timer->it_process = NULL;
unlock_timer(timer, flags);
* This keeps any tasks waiting on the spin lock from thinking
* they got something (see the lock code above).
*/
---- - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
---- - put_task_struct(timer->it_process);
++++ + put_task_struct(timer->it_process);
timer->it_process = NULL;
unlock_timer(timer, flags);
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
-- -- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
++ ++ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
++ ++ }
++ ++
++ ++ static inline int rt_bandwidth_enabled(void)
++ ++ {
++ ++ return sysctl_sched_rt_runtime >= 0;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
-- -- if (rt_b->rt_runtime == RUNTIME_INF)
++ ++ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
-- -- #else /* !CONFIG_FAIR_GROUP_SCHED */
++ ++ #else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
-- -- #endif /* CONFIG_FAIR_GROUP_SCHED */
++ ++ #endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-- -- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
++ ++ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
-- -- rq->curr->sched_class->check_preempt_curr(rq, p);
++ ++ rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}
static inline int cpu_of(struct rq *rq)
return NOTIFY_DONE;
}
-- -- static void init_hrtick(void)
++ ++ static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
}
-- -- static void init_hrtick(void)
++ ++ static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
-- -- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
++ ++ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
-- -- #else
++ ++ #else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_hrtick(void)
{
}
-- -- #endif
++ ++ #endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
update_load_sub(&rq->load, load);
}
-- -- #ifdef CONFIG_SMP
-- -- static unsigned long source_load(int cpu, int type);
-- -- static unsigned long target_load(int cpu, int type);
-- -- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-- --
-- -- static unsigned long cpu_avg_load_per_task(int cpu)
-- -- {
-- -- struct rq *rq = cpu_rq(cpu);
-- --
-- -- if (rq->nr_running)
-- -- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-- --
-- -- return rq->avg_load_per_task;
-- -- }
-- --
-- -- #ifdef CONFIG_FAIR_GROUP_SCHED
-- --
-- -- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
++ ++ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
++ ++ typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
-- -- static void
-- -- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
++ ++ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
++ ++ int ret;
rcu_read_lock();
parent = &root_task_group;
down:
-- -- (*down)(parent, cpu, sd);
++ ++ ret = (*down)(parent, data);
++ ++ if (ret)
++ ++ goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
-- -- (*up)(parent, cpu, sd);
++ ++ ret = (*up)(parent, data);
++ ++ if (ret)
++ ++ goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
++ ++ out_unlock:
rcu_read_unlock();
++ ++
++ ++ return ret;
++ + }
++ +
++ ++ static int tg_nop(struct task_group *tg, void *data)
++ ++ {
++ ++ return 0;
+ }
++ ++ #endif
++ ++
++ ++ #ifdef CONFIG_SMP
++ ++ static unsigned long source_load(int cpu, int type);
++ ++ static unsigned long target_load(int cpu, int type);
++ ++ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
++ ++
++ ++ static unsigned long cpu_avg_load_per_task(int cpu)
++ ++ {
++ ++ struct rq *rq = cpu_rq(cpu);
++ ++
++ ++ if (rq->nr_running)
++ ++ rq->avg_load_per_task = rq->load.weight / rq->nr_running;
++ ++
++ ++ return rq->avg_load_per_task;
++ ++ }
++ ++
++ ++ #ifdef CONFIG_FAIR_GROUP_SCHED
+
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
-- -- static void
-- -- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
++ ++ static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long shares = 0;
++ ++ struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
__update_group_shares_cpu(tg, i, shares, rq_weight);
spin_unlock_irqrestore(&rq->lock, flags);
}
++ ++
++ ++ return 0;
}
/*
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
-- -- static void
-- -- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
++ ++ static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
++ ++ long cpu = (long)data;
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
}
tg->cfs_rq[cpu]->h_load = load;
-- -- }
-- -- static void
-- -- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-- -- {
++ ++ return 0;
}
static void update_shares(struct sched_domain *sd)
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
-- -- walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
++ ++ walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
spin_lock(&rq->lock);
}
-- -- static void update_h_load(int cpu)
++ ++ static void update_h_load(long cpu)
{
-- -- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
++ ++ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
-- -- if (!match_state || p->state == match_state) {
-- -- ncsw = p->nivcsw + p->nvcsw;
-- -- if (unlikely(!ncsw))
-- -- ncsw = 1;
-- -- }
++ ++ if (!match_state || p->state == match_state)
++ ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, &flags);
/*
trace_mark(kernel_sched_wakeup,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
-- -- check_preempt_curr(rq, p);
++ ++ check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
trace_mark(kernel_sched_wakeup_new,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
-- -- check_preempt_curr(rq, p);
++ ++ check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
-- -- check_preempt_curr(this_rq, p);
++ ++ check_preempt_curr(this_rq, p, 0);
}
/*
EXPORT_PER_CPU_SYMBOL(kstat);
/*
---- - * Return p->sum_exec_runtime plus any more ns on the sched_clock
---- - * that have not yet been banked in case the task is currently running.
++++ + * Return any ns on the sched_clock that have not yet been banked in
++++ + * @p in case that task is currently running.
*/
---- -unsigned long long task_sched_runtime(struct task_struct *p)
++++ +unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
---- - u64 ns, delta_exec;
struct rq *rq;
++++ + u64 ns = 0;
rq = task_rq_lock(p, &flags);
---- - ns = p->se.sum_exec_runtime;
++++ +
if (task_current(rq, p)) {
++++ + u64 delta_exec;
++++ +
update_rq_clock(rq);
delta_exec = rq->clock - p->se.exec_start;
if ((s64)delta_exec > 0)
---- - ns += delta_exec;
++++ + ns = delta_exec;
}
++++ +
task_rq_unlock(rq, &flags);
return ns;
cputime64_t tmp;
p->utime = cputime_add(p->utime, cputime);
++++ + account_group_user_time(p, cputime);
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
tmp = cputime_to_cputime64(cputime);
p->utime = cputime_add(p->utime, cputime);
++++ + account_group_user_time(p, cputime);
p->gtime = cputime_add(p->gtime, cputime);
cpustat->user = cputime64_add(cpustat->user, tmp);
}
p->stime = cputime_add(p->stime, cputime);
++++ + account_group_system_time(p, cputime);
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
if (p == rq->idle) {
p->stime = cputime_add(p->stime, steal);
++++ + account_group_system_time(p, steal);
if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
else
cpustat->steal = cputime64_add(cpustat->steal, tmp);
}
++ /*
++ * Use precise platform statistics if available:
++ */
++ #ifdef CONFIG_VIRT_CPU_ACCOUNTING
++ cputime_t task_utime(struct task_struct *p)
++ {
++ return p->utime;
++ }
++
++ cputime_t task_stime(struct task_struct *p)
++ {
++ return p->stime;
++ }
++ #else
++ cputime_t task_utime(struct task_struct *p)
++ {
++ clock_t utime = cputime_to_clock_t(p->utime),
++ total = utime + cputime_to_clock_t(p->stime);
++ u64 temp;
++
++ /*
++ * Use CFS's precise accounting:
++ */
++ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
++
++ if (total) {
++ temp *= utime;
++ do_div(temp, total);
++ }
++ utime = (clock_t)temp;
++
++ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
++ return p->prev_utime;
++ }
++
++ cputime_t task_stime(struct task_struct *p)
++ {
++ clock_t stime;
++
++ /*
++ * Use CFS's precise accounting. (we subtract utime from
++ * the total, to make sure the total observed by userspace
++ * grows monotonically - apps rely on that):
++ */
++ stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
++ cputime_to_clock_t(task_utime(p));
++
++ if (stime >= 0)
++ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++
++ return p->prev_stime;
++ }
++ #endif
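/*
 * Illustrative arithmetic, not part of this patch: if sum_exec_runtime
 * converts to 100 clock ticks and the sampled utime:stime split is 3:1,
 * task_utime() reports 75 ticks and task_stime() the remaining 25, with
 * prev_utime/prev_stime keeping both values monotonic across successive
 * readings.
 */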
++
++ inline cputime_t task_gtime(struct task_struct *p)
++ {
++ return p->gtime;
++ }
++
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
++ ++ /**
++ ++ * complete: - signals a single thread waiting on this completion
++ ++ * @x: holds the state of this particular completion
++ ++ *
++ ++ * This will wake up a single thread waiting on this completion. Threads will be
++ ++ * awakened in the same order in which they were queued.
++ ++ *
++ ++ * See also complete_all(), wait_for_completion() and related routines.
++ ++ */
void complete(struct completion *x)
{
unsigned long flags;
}
EXPORT_SYMBOL(complete);
++ ++ /**
++ ++ * complete_all: - signals all threads waiting on this completion
++ ++ * @x: holds the state of this particular completion
++ ++ *
++ ++ * This will wake up all threads waiting on this particular completion event.
++ ++ */
void complete_all(struct completion *x)
{
unsigned long flags;
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
-- -- if ((state == TASK_INTERRUPTIBLE &&
-- -- signal_pending(current)) ||
-- -- (state == TASK_KILLABLE &&
-- -- fatal_signal_pending(current))) {
++ ++ if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
return timeout;
}
++ ++ /**
++ ++ * wait_for_completion: - waits for completion of a task
++ ++ * @x: holds the state of this particular completion
++ ++ *
++ ++ * This waits to be signaled for completion of a specific task. It is NOT
++ ++ * interruptible and there is no timeout.
++ ++ *
++ ++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
++ ++ * and interrupt capability. Also see complete().
++ ++ */
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
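/*
 * Illustrative sketch, not part of this patch: the usual pairing of the
 * two primitives documented above.  demo_done, demo_producer and
 * demo_consumer are hypothetical names.
 */
#include <linux/completion.h>

static DECLARE_COMPLETION(demo_done);

static void demo_producer(void)
{
	/* ... produce the result ... */
	complete(&demo_done);		/* wake exactly one waiter, FIFO order */
}

static void demo_consumer(void)
{
	/* uninterruptible, no timeout - see wait_for_completion() above */
	wait_for_completion(&demo_done);
	/* ... consume the result ... */
}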
++ ++ /**
++ ++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
++ ++ * @x: holds the state of this particular completion
++ ++ * @timeout: timeout value in jiffies
++ ++ *
++ ++ * This waits for either a completion of a specific task to be signaled or for a
++ ++ * specified timeout to expire. The timeout is in jiffies. It is not
++ ++ * interruptible.
++ ++ */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
}
EXPORT_SYMBOL(wait_for_completion_timeout);
++ ++ /**
++ ++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
++ ++ * @x: holds the state of this particular completion
++ ++ *
++ ++ * This waits for completion of a specific task to be signaled. It is
++ ++ * interruptible.
++ ++ */
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
++ ++ /**
++ ++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
++ ++ * @x: holds the state of this particular completion
++ ++ * @timeout: timeout value in jiffies
++ ++ *
++ ++ * This waits for either a completion of a specific task to be signaled or for a
++ ++ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
++ ++ */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
++ ++ /**
++ ++ * wait_for_completion_killable: - waits for completion of a task (killable)
++ ++ * @x: holds the state of this particular completion
++ ++ *
++ ++ * This waits to be signaled for completion of a specific task. It can be
++ ++ * interrupted by a kill signal.
++ ++ */
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
-- -- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
++ ++ if (rt_bandwidth_enabled() && rt_policy(policy) &&
++ ++ task_group(p)->rt_bandwidth.rt_runtime == 0)
return -EPERM;
#endif
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
-- -- check_preempt_curr(rq_dest, p);
++ ++ check_preempt_curr(rq_dest, p, 0);
}
done:
ret = 1;
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
-- -- struct ctl_table *table = sd_alloc_ctl_entry(12);
++ ++ struct ctl_table *table = sd_alloc_ctl_entry(13);
if (table == NULL)
return NULL;
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
-- -- /* &table[11] is terminator */
++ ++ set_table_entry(&table[11], "name", sd->name,
++ ++ CORENAME_MAX_SIZE, 0444, proc_dostring);
++ ++ /* &table[12] is terminator */
return table;
}
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
++ ++ #ifdef CONFIG_SCHED_DEBUG
++ ++ # define SD_INIT_NAME(sd, type) sd->name = #type
++ ++ #else
++ ++ # define SD_INIT_NAME(sd, type) do { } while (0)
++ ++ #endif
++ ++
#define SD_INIT(sd, type) sd_init_##type(sd)
++ ++
#define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \
{ \
memset(sd, 0, sizeof(*sd)); \
*sd = SD_##type##_INIT; \
sd->level = SD_LV_##type; \
++ ++ SD_INIT_NAME(sd, type); \
}
SD_INIT_FUNC(CPU)
* and partition_sched_domains() will fallback to the single partition
* 'fallback_doms', it also forces the domains to be rebuilt.
*
++ + * If doms_new==NULL it will be replaced with cpu_online_map.
++ + * ndoms_new==0 is a special case for destroying existing domains.
++ + * It will not create the default domain.
++ + *
* Call with hotplug lock held
*/
void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new)
{
-- - int i, j;
++ + int i, j, n;
mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
-- - if (doms_new == NULL)
-- - ndoms_new = 0;
++ + n = doms_new ? ndoms_new : 0;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
-- - for (j = 0; j < ndoms_new; j++) {
++ + for (j = 0; j < n; j++) {
if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
if (doms_new == NULL) {
ndoms_cur = 0;
-- - ndoms_new = 1;
doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
dattr_new = NULL;
int arch_reinit_sched_domains(void)
{
get_online_cpus();
++ +
++ + /* Destroy domains first to force the rebuild */
++ + partition_sched_domains(0, NULL, NULL);
++ +
rebuild_sched_domains();
put_online_cpus();
++ +
return 0;
}
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
-- - partition_sched_domains(0, NULL, NULL);
++ + partition_sched_domains(1, NULL, NULL);
return NOTIFY_OK;
default:
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
-- -- if ((in_atomic() || irqs_disabled()) &&
-- -- system_state == SYSTEM_RUNNING && !oops_in_progress) {
-- -- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-- -- return;
-- -- prev_jiffy = jiffies;
-- -- printk(KERN_ERR "BUG: sleeping function called from invalid"
-- -- " context at %s:%d\n", file, line);
-- -- printk("in_atomic():%d, irqs_disabled():%d\n",
-- -- in_atomic(), irqs_disabled());
-- -- debug_show_held_locks(current);
-- -- if (irqs_disabled())
-- -- print_irqtrace_events(current);
-- -- dump_stack();
-- -- }
++ ++ if ((!in_atomic() && !irqs_disabled()) ||
++ ++ system_state != SYSTEM_RUNNING || oops_in_progress)
++ ++ return;
++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
++ ++ return;
++ ++ prev_jiffy = jiffies;
++ ++
++ ++ printk(KERN_ERR
++ ++ "BUG: sleeping function called from invalid context at %s:%d\n",
++ ++ file, line);
++ ++ printk(KERN_ERR
++ ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
++ ++ in_atomic(), irqs_disabled(),
++ ++ current->pid, current->comm);
++ ++
++ ++ debug_show_held_locks(current);
++ ++ if (irqs_disabled())
++ ++ print_irqtrace_events(current);
++ ++ dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
-- -- return 1ULL << 16;
++ ++ return 1ULL << 20;
-- -- return div64_u64(runtime << 16, period);
++ ++ return div64_u64(runtime << 20, period);
}
-- -- #ifdef CONFIG_CGROUP_SCHED
-- -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
++ ++ /* Must be called with tasklist_lock held */
++ ++ static inline int tg_has_rt_tasks(struct task_group *tg)
{
-- -- struct task_group *tgi, *parent = tg->parent;
-- -- unsigned long total = 0;
++ ++ struct task_struct *g, *p;
-- -- if (!parent) {
-- -- if (global_rt_period() < period)
-- -- return 0;
++ ++ do_each_thread(g, p) {
++ ++ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
++ ++ return 1;
++ ++ } while_each_thread(g, p);
-- -- return to_ratio(period, runtime) <
-- -- to_ratio(global_rt_period(), global_rt_runtime());
-- -- }
++ ++ return 0;
++ ++ }
-- -- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-- -- return 0;
++ ++ struct rt_schedulable_data {
++ ++ struct task_group *tg;
++ ++ u64 rt_period;
++ ++ u64 rt_runtime;
++ ++ };
-- -- rcu_read_lock();
-- -- list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-- -- if (tgi == tg)
-- -- continue;
++ ++ static int tg_schedulable(struct task_group *tg, void *data)
++ ++ {
++ ++ struct rt_schedulable_data *d = data;
++ ++ struct task_group *child;
++ ++ unsigned long total, sum = 0;
++ ++ u64 period, runtime;
++
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ ++ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
++ ++ runtime = tg->rt_bandwidth.rt_runtime;
++
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ ++ if (tg == d->tg) {
++ ++ period = d->rt_period;
++ ++ runtime = d->rt_runtime;
}
-- -- rcu_read_unlock();
-- -- return total + to_ratio(period, runtime) <=
-- -- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-- -- parent->rt_bandwidth.rt_runtime);
-- -- }
-- -- #elif defined CONFIG_USER_SCHED
-- -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-- -- {
-- -- struct task_group *tgi;
-- -- unsigned long total = 0;
-- -- unsigned long global_ratio =
-- -- to_ratio(global_rt_period(), global_rt_runtime());
++ ++ /*
++ ++ * Cannot have more runtime than the period.
++ ++ */
++ ++ if (runtime > period && runtime != RUNTIME_INF)
++ ++ return -EINVAL;
-- -- rcu_read_lock();
-- -- list_for_each_entry_rcu(tgi, &task_groups, list) {
-- -- if (tgi == tg)
-- -- continue;
++ ++ /*
++ ++ * Ensure we don't starve existing RT tasks.
++ ++ */
++ ++ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
++ ++ return -EBUSY;
++ +
- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
- tgi->rt_bandwidth.rt_runtime);
++ ++ total = to_ratio(period, runtime);
++ ++
++ ++ /*
++ ++ * Nobody can have more than the global setting allows.
++ ++ */
++ ++ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
++ ++ return -EINVAL;
++
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ ++ /*
++ ++ * The sum of our children's runtime should not exceed our own.
++ ++ */
++ ++ list_for_each_entry_rcu(child, &tg->children, siblings) {
++ ++ period = ktime_to_ns(child->rt_bandwidth.rt_period);
++ ++ runtime = child->rt_bandwidth.rt_runtime;
++ ++
++ ++ if (child == d->tg) {
++ ++ period = d->rt_period;
++ ++ runtime = d->rt_runtime;
++ ++ }
++ +
- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
- tgi->rt_bandwidth.rt_runtime);
++ ++ sum += to_ratio(period, runtime);
}
-- -- rcu_read_unlock();
-- -- return total + to_ratio(period, runtime) < global_ratio;
++ ++ if (sum > total)
++ ++ return -EINVAL;
++ ++
++ ++ return 0;
}
-- -- #endif
-- -- /* Must be called with tasklist_lock held */
-- -- static inline int tg_has_rt_tasks(struct task_group *tg)
++ ++ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
-- -- struct task_struct *g, *p;
-- -- do_each_thread(g, p) {
-- -- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-- -- return 1;
-- -- } while_each_thread(g, p);
-- -- return 0;
++ ++ struct rt_schedulable_data data = {
++ ++ .tg = tg,
++ ++ .rt_period = period,
++ ++ .rt_runtime = runtime,
++ ++ };
++ ++
++ ++ return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
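/*
 * Worked example, illustrative only: a group with period 1s and runtime
 * 0.4s has total == to_ratio(1s, 0.4s) ~= 0.4 * (1 << 20).  Two children
 * requesting 0.25s and 0.2s out of 1s periods sum to ~0.45 * (1 << 20),
 * which exceeds total, so tg_schedulable() fails the walk with -EINVAL;
 * trimming the second child to 0.15s would let it pass.
 */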
static int tg_set_bandwidth(struct task_group *tg,
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
-- -- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-- -- err = -EBUSY;
-- -- goto unlock;
-- -- }
-- -- if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-- -- err = -EINVAL;
++ ++ err = __rt_schedulable(tg, rt_period, rt_runtime);
++ ++ if (err)
goto unlock;
-- -- }
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
static int sched_rt_global_constraints(void)
{
-- -- struct task_group *tg = &root_task_group;
-- -- u64 rt_runtime, rt_period;
++ ++ u64 runtime, period;
int ret = 0;
-- -- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-- -- rt_runtime = tg->rt_bandwidth.rt_runtime;
++ ++ if (sysctl_sched_rt_period <= 0)
++ ++ return -EINVAL;
++ ++
++ ++ runtime = global_rt_runtime();
++ ++ period = global_rt_period();
++ ++
++ ++ /*
++ ++ * Sanity check on the sysctl variables.
++ ++ */
++ ++ if (runtime > period && runtime != RUNTIME_INF)
++ ++ return -EINVAL;
mutex_lock(&rt_constraints_mutex);
-- -- if (!__rt_schedulable(tg, rt_period, rt_runtime))
-- -- ret = -EINVAL;
++ ++ read_lock(&tasklist_lock);
++ ++ ret = __rt_schedulable(NULL, 0, 0);
++ ++ read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
unsigned long flags;
int i;
++ ++ if (sysctl_sched_rt_period <= 0)
++ ++ return -EINVAL;
++ ++
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
-- -- init_task_group.css.cgroup = cgrp;
return &init_task_group.css;
}
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
-- -- /* Bind the cgroup to task_group object we just created */
-- -- tg->css.cgroup = cgrp;
-- --
return &tg->css;
}
return __sched_period(nr_running);
}
-- -- /*
-- -- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
-- -- * that it favours >=0 over <0.
-- -- *
-- -- * -20 |
-- -- * |
-- -- * 0 --------+-------
-- -- * .'
-- -- * 19 .'
-- -- *
-- -- */
-- -- static unsigned long
-- -- calc_delta_asym(unsigned long delta, struct sched_entity *se)
-- -- {
-- -- struct load_weight lw = {
-- -- .weight = NICE_0_LOAD,
-- -- .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-- -- };
-- --
-- -- for_each_sched_entity(se) {
-- -- struct load_weight *se_lw = &se->load;
-- -- unsigned long rw = cfs_rq_of(se)->load.weight;
-- --
-- -- #ifdef CONFIG_FAIR_SCHED_GROUP
-- -- struct cfs_rq *cfs_rq = se->my_q;
-- -- struct task_group *tg = NULL
-- --
-- -- if (cfs_rq)
-- -- tg = cfs_rq->tg;
-- --
-- -- if (tg && tg->shares < NICE_0_LOAD) {
-- -- /*
-- -- * scale shares to what it would have been had
-- -- * tg->weight been NICE_0_LOAD:
-- -- *
-- -- * weight = 1024 * shares / tg->weight
-- -- */
-- -- lw.weight *= se->load.weight;
-- -- lw.weight /= tg->shares;
-- --
-- -- lw.inv_weight = 0;
-- --
-- -- se_lw = &lw;
-- -- rw += lw.weight - se->load.weight;
-- -- } else
-- -- #endif
-- --
-- -- if (se->load.weight < NICE_0_LOAD) {
-- -- se_lw = &lw;
-- -- rw += NICE_0_LOAD - se->load.weight;
-- -- }
-- --
-- -- delta = calc_delta_mine(delta, rw, se_lw);
-- -- }
-- --
-- -- return delta;
-- -- }
-- --
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
struct task_struct *curtask = task_of(curr);
cpuacct_charge(curtask, delta_exec);
++++ + account_group_exec_runtime(curtask, delta_exec);
}
}
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-- -- if (entity_is_task(se))
++ ++ if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, se->load.weight);
++ ++ list_add(&se->group_node, &cfs_rq->tasks);
++ ++ }
cfs_rq->nr_running++;
se->on_rq = 1;
-- -- list_add(&se->group_node, &cfs_rq->tasks);
}
static void
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-- -- if (entity_is_task(se))
++ ++ if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, -se->load.weight);
++ ++ list_del_init(&se->group_node);
++ ++ }
cfs_rq->nr_running--;
se->on_rq = 0;
-- -- list_del_init(&se->group_node);
}
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
-- -- long more_w;
if (!tg->parent)
return wl;
if (!wl && sched_feat(ASYM_EFF_LOAD))
return wl;
-- -- /*
-- -- * Instead of using this increment, also add the difference
-- -- * between when the shares were last updated and now.
-- -- */
-- -- more_w = se->my_q->load.weight - se->my_q->rq_weight;
-- -- wl += more_w;
-- -- wg += more_w;
-- --
for_each_sched_entity(se) {
-- -- #define D(n) (likely(n) ? (n) : 1)
-- --
long S, rw, s, a, b;
++ ++ long more_w;
++ ++
++ ++ /*
++ ++ * Instead of using this increment, also add the difference
++ ++ * between when the shares were last updated and now.
++ ++ */
++ ++ more_w = se->my_q->load.weight - se->my_q->rq_weight;
++ ++ wl += more_w;
++ ++ wg += more_w;
S = se->my_q->tg->shares;
s = se->my_q->shares;
a = S*(rw + wl);
b = S*rw + s*wg;
-- -- wl = s*(a-b)/D(b);
++ ++ wl = s*(a-b);
++ ++
++ ++ if (likely(b))
++ ++ wl /= b;
++ ++
/*
* Assume the group is already running and will
* thus already be accounted for in the weight.
* alter the group weight.
*/
wg = 0;
-- -- #undef D
}
return wl;
#endif
static int
-- -- wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
++ ++ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
struct task_struct *p, int prev_cpu, int this_cpu, int sync,
int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
++ ++ if (!sync && sched_feat(SYNC_WAKEUPS) &&
++ ++ curr->se.avg_overlap < sysctl_sched_migration_cost &&
++ ++ p->se.avg_overlap < sysctl_sched_migration_cost)
++ ++ sync = 1;
++ ++
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* a reasonable amount of time then attract this newly
* woken task:
*/
-- -- if (sync && balanced) {
-- -- if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-- -- p->se.avg_overlap < sysctl_sched_migration_cost)
-- -- return 1;
-- -- }
++ ++ if (sync && balanced)
++ ++ return 1;
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
-- -- if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
-- -- balanced) {
++ ++ if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
++ ++ tl_per_task)) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
struct sched_domain *sd, *this_sd = NULL;
int prev_cpu, this_cpu, new_cpu;
unsigned long load, this_load;
-- -- struct rq *rq, *this_rq;
++ ++ struct rq *this_rq;
unsigned int imbalance;
int idx;
prev_cpu = task_cpu(p);
-- -- rq = task_rq(p);
this_cpu = smp_processor_id();
this_rq = cpu_rq(this_cpu);
new_cpu = prev_cpu;
++ ++ if (prev_cpu == this_cpu)
++ ++ goto out;
/*
* 'this_sd' is the first domain that both
* this_cpu and prev_cpu are present in:
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
-- -- if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
++ ++ if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
load, this_load, imbalance))
return this_cpu;
-- -- if (prev_cpu == this_cpu)
-- -- goto out;
-- --
/*
* Start passive balancing when half the imbalance_pct
* limit is reached.
* + nice tasks.
*/
if (sched_feat(ASYM_GRAN))
-- -- gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
-- -- else
-- -- gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
++ ++ gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
return gran;
}
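/*
 * Illustrative arithmetic, not part of this patch:
 * calc_delta_mine(g, NICE_0_LOAD, &se->load) evaluates to roughly
 * g * NICE_0_LOAD / se->load.weight, so a nice-0 entity keeps the full
 * granularity while heavier (negative nice) entities see a proportionally
 * smaller one and lighter entities a larger one.
 */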
-- -- /*
-- -- * Should 'se' preempt 'curr'.
-- -- *
-- -- * |s1
-- -- * |s2
-- -- * |s3
-- -- * g
-- -- * |<--->|c
-- -- *
-- -- * w(c, s1) = -1
-- -- * w(c, s2) = 0
-- -- * w(c, s3) = 1
-- -- *
-- -- */
-- -- static int
-- -- wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-- -- {
-- -- s64 gran, vdiff = curr->vruntime - se->vruntime;
-- --
-- -- if (vdiff < 0)
-- -- return -1;
-- --
-- -- gran = wakeup_gran(curr);
-- -- if (vdiff > gran)
-- -- return 1;
-- --
-- -- return 0;
-- -- }
-- --
-- -- /* return depth at which a sched entity is present in the hierarchy */
-- -- static inline int depth_se(struct sched_entity *se)
-- -- {
-- -- int depth = 0;
-- --
-- -- for_each_sched_entity(se)
-- -- depth++;
-- --
-- -- return depth;
-- -- }
-- --
/*
* Preempt the current task with a newly woken task if needed:
*/
-- -- static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
++ ++ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
-- -- int se_depth, pse_depth;
++ ++ s64 delta_exec;
if (unlikely(rt_prio(p->prio))) {
update_rq_clock(rq);
cfs_rq_of(pse)->next = pse;
++ ++ /*
++ ++ * We can come here with TIF_NEED_RESCHED already set from the new-task
++ ++ * wakeup path.
++ ++ */
++ ++ if (test_tsk_need_resched(curr))
++ ++ return;
++ ++
/*
* Batch tasks do not preempt (their preemption is driven by
* the tick):
if (!sched_feat(WAKEUP_PREEMPT))
return;
-- -- /*
-- -- * preemption test can be made between sibling entities who are in the
-- -- * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
-- -- * both tasks until we find their ancestors who are siblings of common
-- -- * parent.
-- -- */
-- --
-- -- /* First walk up until both entities are at same depth */
-- -- se_depth = depth_se(se);
-- -- pse_depth = depth_se(pse);
-- --
-- -- while (se_depth > pse_depth) {
-- -- se_depth--;
-- -- se = parent_entity(se);
-- -- }
-- --
-- -- while (pse_depth > se_depth) {
-- -- pse_depth--;
-- -- pse = parent_entity(pse);
-- -- }
-- --
-- -- while (!is_same_group(se, pse)) {
-- -- se = parent_entity(se);
-- -- pse = parent_entity(pse);
++ ++ if (sched_feat(WAKEUP_OVERLAP) && (sync ||
++ ++ (se->avg_overlap < sysctl_sched_migration_cost &&
++ ++ pse->avg_overlap < sysctl_sched_migration_cost))) {
++ ++ resched_task(curr);
++ ++ return;
}
-- -- if (wakeup_preempt_entity(se, pse) == 1)
++ ++ delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
++ ++ if (delta_exec > wakeup_gran(pse))
resched_task(curr);
}
if (next == &cfs_rq->tasks)
return NULL;
-- -- /* Skip over entities that are not tasks */
-- -- do {
-- -- se = list_entry(next, struct sched_entity, group_node);
-- -- next = next->next;
-- -- } while (next != &cfs_rq->tasks && !entity_is_task(se));
-- --
-- -- if (next == &cfs_rq->tasks)
-- -- return NULL;
-- --
-- -- cfs_rq->balance_iterator = next;
-- --
-- -- if (entity_is_task(se))
-- -- p = task_of(se);
++ ++ se = list_entry(next, struct sched_entity, group_node);
++ ++ p = task_of(se);
++ ++ cfs_rq->balance_iterator = next->next;
return p;
}
rcu_read_lock();
update_h_load(busiest_cpu);
-- -- list_for_each_entry(tg, &task_groups, list) {
++ ++ list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
++ ++ resched_task(rq->curr);
}
enqueue_task_fair(rq, p, 0);
-- -- resched_task(rq->curr);
}
/*
if (p->prio > oldprio)
resched_task(rq->curr);
} else
-- -- check_preempt_curr(rq, p);
++ ++ check_preempt_curr(rq, p, 0);
}
/*
if (running)
resched_task(rq->curr);
else
-- -- check_preempt_curr(rq, p);
++ ++ check_preempt_curr(rq, p, 0);
}
/* Account for a task changing its policy or group.
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
++ ++ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
struct sched_rt_entity *rt_se = rt_rq->rt_se;
-- -- if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
-- -- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
-- --
-- -- enqueue_rt_entity(rt_se);
++ ++ if (rt_rq->rt_nr_running) {
++ ++ if (rt_se && !on_rt_rq(rt_se))
++ ++ enqueue_rt_entity(rt_se);
if (rt_rq->highest_prio < curr->prio)
resched_task(curr);
}
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
++ if (rt_rq->rt_nr_running)
++ resched_task(rq_of_rt_rq(rt_rq)->curr);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
++ ++ /*
++ ++ * We ran out of runtime, see if we can borrow some from our neighbours.
++ ++ */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
continue;
spin_lock(&iter->rt_runtime_lock);
++ ++ /*
++ ++ * Either all rqs have inf runtime and there's nothing to steal
++ ++ * or __disable_runtime() below sets a specific rq to inf to
++ ++ * indicate it's been disabled and disallow stealing.
++ ++ */
if (iter->rt_runtime == RUNTIME_INF)
goto next;
++ ++ /*
++ ++ * From runqueues with spare time, take 1/n part of their
++ ++ * spare time, but no more than our period.
++ ++ */
diff = iter->rt_runtime - iter->rt_time;
if (diff > 0) {
diff = div_u64((u64)diff, weight);
return more;
}
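/*
 * Illustrative only: on a root domain spanning 4 CPUs (weight == 4), a
 * neighbour with 2ms of unused runtime (diff == 2ms) donates
 * div_u64(2ms, 4) == 0.5ms per balance attempt, and a borrower is never
 * topped up beyond its own rt_period.
 */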
++ ++ /*
++ ++ * Ensure this RQ takes back all the runtime it lent to its neighbours.
++ ++ */
static void __disable_runtime(struct rq *rq)
{
struct root_domain *rd = rq->rd;
spin_lock(&rt_b->rt_runtime_lock);
spin_lock(&rt_rq->rt_runtime_lock);
++ ++ /*
++ ++ * Either we're all inf and nobody needs to borrow, or we're
++ ++ * already disabled and thus have nothing to do, or we have
++ ++ * exactly the right amount of runtime to take out.
++ ++ */
if (rt_rq->rt_runtime == RUNTIME_INF ||
rt_rq->rt_runtime == rt_b->rt_runtime)
goto balanced;
spin_unlock(&rt_rq->rt_runtime_lock);
++ ++ /*
++ ++ * Calculate the difference between what we started out with
++ ++ * and what we currently have; that's the amount of runtime
++ ++ * we lent out and now have to reclaim.
++ ++ */
want = rt_b->rt_runtime - rt_rq->rt_runtime;
++ ++ /*
++ ++ * Greedy reclaim, take back as much as we can.
++ ++ */
for_each_cpu_mask(i, rd->span) {
struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
s64 diff;
++ ++ /*
++ ++ * Can't reclaim from ourselves or disabled runqueues.
++ ++ */
if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
continue;
}
spin_lock(&rt_rq->rt_runtime_lock);
++ ++ /*
++ ++ * We cannot be left wanting - that would mean some runtime
++ ++ * leaked out of the system.
++ ++ */
BUG_ON(want);
balanced:
++ ++ /*
++ ++ * Disable all the borrow logic by pretending we have inf
++ ++ * runtime - in which case borrowing doesn't make sense.
++ ++ */
rt_rq->rt_runtime = RUNTIME_INF;
spin_unlock(&rt_rq->rt_runtime_lock);
spin_unlock(&rt_b->rt_runtime_lock);
if (unlikely(!scheduler_running))
return;
++ ++ /*
++ ++ * Reset each runqueue's bandwidth settings
++ ++ */
for_each_leaf_rt_rq(rt_rq, rq) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_runtime = rt_b->rt_runtime;
rt_rq->rt_time = 0;
++ ++ rt_rq->rt_throttled = 0;
spin_unlock(&rt_rq->rt_runtime_lock);
spin_unlock(&rt_b->rt_runtime_lock);
}
int i, idle = 1;
cpumask_t span;
-- -- if (rt_b->rt_runtime == RUNTIME_INF)
++ ++ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return 1;
span = sched_rt_period_mask();
{
u64 runtime = sched_rt_runtime(rt_rq);
-- if (runtime == RUNTIME_INF)
-- return 0;
--
if (rt_rq->rt_throttled)
return rt_rq_throttled(rt_rq);
schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
++++ + account_group_exec_runtime(curr, delta_exec);
++++ +
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
++ ++ if (!rt_bandwidth_enabled())
++ ++ return;
++ ++
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
spin_lock(&rt_rq->rt_runtime_lock);
-- rt_rq->rt_time += delta_exec;
-- if (sched_rt_runtime_exceeded(rt_rq))
-- resched_task(curr);
++ if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
++ rt_rq->rt_time += delta_exec;
++ if (sched_rt_runtime_exceeded(rt_rq))
++ resched_task(curr);
++ }
spin_unlock(&rt_rq->rt_runtime_lock);
}
}
/*
* Preempt the current task with a newly woken task if needed:
*/
-- -- static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
++ ++ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
{
if (p->prio < rq->curr->prio) {
resched_task(rq->curr);
p->rt.timeout++;
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
if (p->rt.timeout > next)
---- - p->it_sched_expires = p->se.sum_exec_runtime;
++++ + p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
}
}
* Distribute under GPLv2.
*
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+++++ *
+++++ * Remote softirq infrastructure is by Jens Axboe.
*/
#include <linux/module.h>
EXPORT_SYMBOL(irq_stat);
#endif
-- -- static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
++ ++ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
do {
if (pending & 1) {
++ ++ int prev_count = preempt_count();
++ ++
h->action(h);
++ ++
++ ++ if (unlikely(prev_count != preempt_count())) {
++ ++ printk(KERN_ERR "huh, entered softirq %td %p"
++ ++ "with preempt_count %08x,"
++ ++ " exited with %08x?\n", h - softirq_vec,
++ ++ h->action, prev_count, preempt_count());
++ ++ preempt_count() = prev_count;
++ ++ }
++ ++
rcu_bh_qsctr_inc(cpu);
}
h++;
*/
void irq_enter(void)
{
-- ---#ifdef CONFIG_NO_HZ
int cpu = smp_processor_id();
++ +++
if (idle_cpu(cpu) && !in_interrupt())
-- --- tick_nohz_stop_idle(cpu);
-- ---#endif
++ +++ tick_check_idle(cpu);
++ +++
__irq_enter();
-- ---#ifdef CONFIG_NO_HZ
-- --- if (idle_cpu(cpu))
-- --- tick_nohz_update_jiffies();
-- ---#endif
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
EXPORT_SYMBOL(tasklet_kill);
+++++ DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+++++ EXPORT_PER_CPU_SYMBOL(softirq_work_list);
+++++
+++++ static void __local_trigger(struct call_single_data *cp, int softirq)
+++++ {
+++++ struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+++++
+++++ list_add_tail(&cp->list, head);
+++++
+++++ /* Trigger the softirq only if the list was previously empty. */
+++++ if (head->next == &cp->list)
+++++ raise_softirq_irqoff(softirq);
+++++ }
+++++
+++++ #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
+++++ static void remote_softirq_receive(void *data)
+++++ {
+++++ struct call_single_data *cp = data;
+++++ unsigned long flags;
+++++ int softirq;
+++++
+++++ softirq = cp->priv;
+++++
+++++ local_irq_save(flags);
+++++ __local_trigger(cp, softirq);
+++++ local_irq_restore(flags);
+++++ }
+++++
+++++ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+++++ {
+++++ if (cpu_online(cpu)) {
+++++ cp->func = remote_softirq_receive;
+++++ cp->info = cp;
+++++ cp->flags = 0;
+++++ cp->priv = softirq;
+++++
+++++ __smp_call_function_single(cpu, cp);
+++++ return 0;
+++++ }
+++++ return 1;
+++++ }
+++++ #else /* CONFIG_USE_GENERIC_SMP_HELPERS */
+++++ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+++++ {
+++++ return 1;
+++++ }
+++++ #endif
+++++
+++++ /**
+++++ * __send_remote_softirq - try to schedule softirq work on a remote cpu
+++++ * @cp: private SMP call function data area
+++++ * @cpu: the remote cpu
+++++ * @this_cpu: the currently executing cpu
+++++ * @softirq: the softirq for the work
+++++ *
+++++ * Attempt to schedule softirq work on a remote cpu. If this cannot be
+++++ * done, the work is instead queued up on the local cpu.
+++++ *
+++++ * Interrupts must be disabled.
+++++ */
+++++ void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
+++++ {
+++++ if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
+++++ __local_trigger(cp, softirq);
+++++ }
+++++ EXPORT_SYMBOL(__send_remote_softirq);
+++++
+++++ /**
+++++ * send_remote_softirq - try to schedule softirq work on a remote cpu
+++++ * @cp: private SMP call function data area
+++++ * @cpu: the remote cpu
+++++ * @softirq: the softirq for the work
+++++ *
+++++ * Like __send_remote_softirq except that disabling interrupts and
+++++ * computing the current cpu are done for the caller.
+++++ */
+++++ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+++++ {
+++++ unsigned long flags;
+++++ int this_cpu;
+++++
+++++ local_irq_save(flags);
+++++ this_cpu = smp_processor_id();
+++++ __send_remote_softirq(cp, cpu, this_cpu, softirq);
+++++ local_irq_restore(flags);
+++++ }
+++++ EXPORT_SYMBOL(send_remote_softirq);
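/*
 * Illustrative sketch, not part of this patch: queueing work for the
 * softirq context of another CPU with the API documented above.  The
 * struct demo_req and demo_queue_on() names are hypothetical; a real
 * caller embeds the call_single_data in its own request and must keep it
 * alive until the softirq handler has consumed it.  BLOCK_SOFTIRQ is used
 * purely as an example of an existing softirq number.
 */
#include <linux/interrupt.h>
#include <linux/smp.h>

struct demo_req {
	struct call_single_data csd;	/* owned by the softirq machinery */
	int payload;
};

static void demo_queue_on(struct demo_req *req, int cpu)
{
	/* takes care of disabling interrupts and finding this_cpu */
	send_remote_softirq(&req->csd, cpu, BLOCK_SOFTIRQ);
}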
+++++
+++++ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
+++++ unsigned long action, void *hcpu)
+++++ {
+++++ /*
+++++ * If a CPU goes away, splice its entries to the current CPU
+++++ * and trigger a run of the softirq.
+++++ */
+++++ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+++++ int cpu = (unsigned long) hcpu;
+++++ int i;
+++++
+++++ local_irq_disable();
+++++ for (i = 0; i < NR_SOFTIRQS; i++) {
+++++ struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+++++ struct list_head *local_head;
+++++
+++++ if (list_empty(head))
+++++ continue;
+++++
+++++ local_head = &__get_cpu_var(softirq_work_list[i]);
+++++ list_splice_init(head, local_head);
+++++ raise_softirq_irqoff(i);
+++++ }
+++++ local_irq_enable();
+++++ }
+++++
+++++ return NOTIFY_OK;
+++++ }
+++++
+++++ static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
+++++ .notifier_call = remote_softirq_cpu_notify,
+++++ };
+++++
void __init softirq_init(void)
{
int cpu;
for_each_possible_cpu(cpu) {
+++++ int i;
+++++
per_cpu(tasklet_vec, cpu).tail =
&per_cpu(tasklet_vec, cpu).head;
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
+++++ for (i = 0; i < NR_SOFTIRQS; i++)
+++++ INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
}
+++++ register_hotcpu_notifier(&remote_softirq_cpu_notifier);
+++++
open_softirq(TASKLET_SOFTIRQ, tasklet_action);
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
return old_fsgid;
}
++++ +void do_sys_times(struct tms *tms)
++++ +{
++++ + struct task_cputime cputime;
++++ + cputime_t cutime, cstime;
++++ +
++++ + spin_lock_irq(¤t->sighand->siglock);
++++ + thread_group_cputime(current, &cputime);
++++ + cutime = current->signal->cutime;
++++ + cstime = current->signal->cstime;
++++ + spin_unlock_irq(¤t->sighand->siglock);
++++ + tms->tms_utime = cputime_to_clock_t(cputime.utime);
++++ + tms->tms_stime = cputime_to_clock_t(cputime.stime);
++++ + tms->tms_cutime = cputime_to_clock_t(cutime);
++++ + tms->tms_cstime = cputime_to_clock_t(cstime);
++++ +}
++++ +
asmlinkage long sys_times(struct tms __user * tbuf)
{
---- - /*
---- - * In the SMP world we might just be unlucky and have one of
---- - * the times increment as we use it. Since the value is an
---- - * atomically safe type this is just fine. Conceptually its
---- - * as if the syscall took an instant longer to occur.
---- - */
if (tbuf) {
struct tms tmp;
---- - struct task_struct *tsk = current;
---- - struct task_struct *t;
---- - cputime_t utime, stime, cutime, cstime;
---- -
---- - spin_lock_irq(&tsk->sighand->siglock);
---- - utime = tsk->signal->utime;
---- - stime = tsk->signal->stime;
---- - t = tsk;
---- - do {
---- - utime = cputime_add(utime, t->utime);
---- - stime = cputime_add(stime, t->stime);
---- - t = next_thread(t);
---- - } while (t != tsk);
---- -
---- - cutime = tsk->signal->cutime;
---- - cstime = tsk->signal->cstime;
---- - spin_unlock_irq(&tsk->sighand->siglock);
---- -
---- - tmp.tms_utime = cputime_to_clock_t(utime);
---- - tmp.tms_stime = cputime_to_clock_t(stime);
---- - tmp.tms_cutime = cputime_to_clock_t(cutime);
---- - tmp.tms_cstime = cputime_to_clock_t(cstime);
++++ +
++++ + do_sys_times(&tmp);
if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
return -EFAULT;
}
group_leader->signal->leader = 1;
__set_special_pids(sid);
-- -- spin_lock(&group_leader->sighand->siglock);
-- -- group_leader->signal->tty = NULL;
-- -- spin_unlock(&group_leader->sighand->siglock);
++ ++ proc_clear_tty(group_leader);
err = session;
out:
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
-- -- memcpy(utsname()->nodename, tmp, len);
-- -- utsname()->nodename[len] = 0;
++ ++ struct new_utsname *u = utsname();
++ ++
++ ++ memcpy(u->nodename, tmp, len);
++ ++ memset(u->nodename + len, 0, sizeof(u->nodename) - len);
errno = 0;
}
up_write(&uts_sem);
asmlinkage long sys_gethostname(char __user *name, int len)
{
int i, errno;
++ ++ struct new_utsname *u;
if (len < 0)
return -EINVAL;
down_read(&uts_sem);
-- -- i = 1 + strlen(utsname()->nodename);
++ ++ u = utsname();
++ ++ i = 1 + strlen(u->nodename);
if (i > len)
i = len;
errno = 0;
-- -- if (copy_to_user(name, utsname()->nodename, i))
++ ++ if (copy_to_user(name, u->nodename, i))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
-- -- memcpy(utsname()->domainname, tmp, len);
-- -- utsname()->domainname[len] = 0;
++ ++ struct new_utsname *u = utsname();
++ ++
++ ++ memcpy(u->domainname, tmp, len);
++ ++ memset(u->domainname + len, 0, sizeof(u->domainname) - len);
errno = 0;
}
up_write(&uts_sem);
asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
{
struct rlimit new_rlim, *old_rlim;
---- - unsigned long it_prof_secs;
int retval;
if (resource >= RLIM_NLIMITS)
return -EINVAL;
if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
return -EFAULT;
-- -- if (new_rlim.rlim_cur > new_rlim.rlim_max)
-- -- return -EINVAL;
old_rlim = current->signal->rlim + resource;
if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
-- -- if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
-- -- return -EPERM;
++ ++
++ ++ if (resource == RLIMIT_NOFILE) {
++ ++ if (new_rlim.rlim_max == RLIM_INFINITY)
++ ++ new_rlim.rlim_max = sysctl_nr_open;
++ ++ if (new_rlim.rlim_cur == RLIM_INFINITY)
++ ++ new_rlim.rlim_cur = sysctl_nr_open;
++ ++ if (new_rlim.rlim_max > sysctl_nr_open)
++ ++ return -EPERM;
++ ++ }
++ ++
++ ++ if (new_rlim.rlim_cur > new_rlim.rlim_max)
++ ++ return -EINVAL;
retval = security_task_setrlimit(resource, &new_rlim);
if (retval)
if (new_rlim.rlim_cur == RLIM_INFINITY)
goto out;
---- - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
---- - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
---- - unsigned long rlim_cur = new_rlim.rlim_cur;
---- - cputime_t cputime;
---- -
---- - cputime = secs_to_cputime(rlim_cur);
---- - read_lock(&tasklist_lock);
---- - spin_lock_irq(¤t->sighand->siglock);
---- - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
---- - spin_unlock_irq(¤t->sighand->siglock);
---- - read_unlock(&tasklist_lock);
---- - }
++++ + update_rlimit_cpu(new_rlim.rlim_cur);
out:
return 0;
}
*
*/
---- -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
---- - cputime_t *utimep, cputime_t *stimep)
++++ +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
{
---- - *utimep = cputime_add(*utimep, t->utime);
---- - *stimep = cputime_add(*stimep, t->stime);
r->ru_nvcsw += t->nvcsw;
r->ru_nivcsw += t->nivcsw;
r->ru_minflt += t->min_flt;
struct task_struct *t;
unsigned long flags;
cputime_t utime, stime;
++++ + struct task_cputime cputime;
memset((char *) r, 0, sizeof *r);
utime = stime = cputime_zero;
if (who == RUSAGE_THREAD) {
---- - accumulate_thread_rusage(p, r, &utime, &stime);
++++ + accumulate_thread_rusage(p, r);
goto out;
}
break;
case RUSAGE_SELF:
---- - utime = cputime_add(utime, p->signal->utime);
---- - stime = cputime_add(stime, p->signal->stime);
++++ + thread_group_cputime(p, &cputime);
++++ + utime = cputime_add(utime, cputime.utime);
++++ + stime = cputime_add(stime, cputime.stime);
r->ru_nvcsw += p->signal->nvcsw;
r->ru_nivcsw += p->signal->nivcsw;
r->ru_minflt += p->signal->min_flt;
r->ru_oublock += p->signal->oublock;
t = p;
do {
---- - accumulate_thread_rusage(t, r, &utime, &stime);
++++ + accumulate_thread_rusage(t, r);
t = next_thread(t);
} while (t != p);
break;
#include <linux/mm.h>
#include <linux/time.h>
--- --#include <linux/timer.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/hrtimer.h>
#include <linux/capability.h>
#include <linux/math64.h>
#include <linux/clocksource.h>
+++ ++#include <linux/workqueue.h>
#include <asm/timex.h>
/*
/* Disable the cmos update - used by virtualization and embedded */
int no_sync_cmos_clock __read_mostly;
--- --static void sync_cmos_clock(unsigned long dummy);
+++ ++static void sync_cmos_clock(struct work_struct *work);
--- --static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+++ ++static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
--- --static void sync_cmos_clock(unsigned long dummy)
+++ ++static void sync_cmos_clock(struct work_struct *work)
{
struct timespec now, next;
int fail = 1;
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
fail = update_persistent_clock(now);
-- - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
++ + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
if (next.tv_nsec <= 0)
next.tv_nsec += NSEC_PER_SEC;
next.tv_sec++;
next.tv_nsec -= NSEC_PER_SEC;
}
--- -- mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
+++ ++ schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
}
static void notify_cmos_timer(void)
{
if (!no_sync_cmos_clock)
--- -- mod_timer(&sync_cmos_timer, jiffies + 1);
+++ ++ schedule_delayed_work(&sync_cmos_work, 0);
}
#else
int do_adjtimex(struct timex *txc)
{
struct timespec ts;
--- -- long save_adjust, sec;
int result;
--- -- /* In order to modify anything, you gotta be super-user! */
--- -- if (txc->modes && !capable(CAP_SYS_TIME))
--- -- return -EPERM;
--- --
--- -- /* Now we validate the data before disabling interrupts */
--- --
--- -- if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
+++ ++ /* Validate the data before disabling interrupts */
+++ ++ if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
--- -- if (txc->modes & ~ADJ_OFFSET_SS_READ)
+++ ++ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
return -EINVAL;
+++ ++ if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+++ ++ !capable(CAP_SYS_TIME))
+++ ++ return -EPERM;
+++ ++ } else {
+++ ++ /* In order to modify anything, you gotta be super-user! */
+++ ++ if (txc->modes && !capable(CAP_SYS_TIME))
+++ ++ return -EPERM;
+++ ++
+++ ++ /* if the quartz is off by more than 10% something is VERY wrong! */
+++ ++ if (txc->modes & ADJ_TICK &&
+++ ++ (txc->tick < 900000/USER_HZ ||
+++ ++ txc->tick > 1100000/USER_HZ))
+++ ++ return -EINVAL;
+++ ++
+++ ++ if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
+++ ++ hrtimer_cancel(&leap_timer);
}
--- -- /* if the quartz is off by more than 10% something is VERY wrong ! */
--- -- if (txc->modes & ADJ_TICK)
--- -- if (txc->tick < 900000/USER_HZ ||
--- -- txc->tick > 1100000/USER_HZ)
--- -- return -EINVAL;
--- --
--- -- if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
--- -- hrtimer_cancel(&leap_timer);
getnstimeofday(&ts);
write_seqlock_irq(&xtime_lock);
--- -- /* Save for later - semantics of adjtime is to return old value */
--- -- save_adjust = time_adjust;
--- --
/* If there are input parameters, then process them */
+++ ++ if (txc->modes & ADJ_ADJTIME) {
+++ ++ long save_adjust = time_adjust;
+++ ++
+++ ++ if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+++ ++ /* adjtime() is independent from ntp_adjtime() */
+++ ++ time_adjust = txc->offset;
+++ ++ ntp_update_frequency();
+++ ++ }
+++ ++ txc->offset = save_adjust;
+++ ++ goto adj_done;
+++ ++ }
if (txc->modes) {
+++ ++ long sec;
+++ ++
if (txc->modes & ADJ_STATUS) {
if ((time_status & STA_PLL) &&
!(txc->status & STA_PLL)) {
if (txc->modes & ADJ_TAI && txc->constant > 0)
time_tai = txc->constant;
--- -- if (txc->modes & ADJ_OFFSET) {
--- -- if (txc->modes == ADJ_OFFSET_SINGLESHOT)
--- -- /* adjtime() is independent from ntp_adjtime() */
--- -- time_adjust = txc->offset;
--- -- else
--- -- ntp_update_offset(txc->offset);
--- -- }
+++ ++ if (txc->modes & ADJ_OFFSET)
+++ ++ ntp_update_offset(txc->offset);
if (txc->modes & ADJ_TICK)
tick_usec = txc->tick;
ntp_update_frequency();
}
+++ ++ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+++ ++ NTP_SCALE_SHIFT);
+++ ++ if (!(time_status & STA_NANO))
+++ ++ txc->offset /= NSEC_PER_USEC;
+++ ++
+++ ++adj_done:
result = time_state; /* mostly `TIME_OK' */
if (time_status & (STA_UNSYNC|STA_CLOCKERR))
result = TIME_ERROR;
--- -- if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
--- -- (txc->modes == ADJ_OFFSET_SS_READ))
--- -- txc->offset = save_adjust;
--- -- else {
--- -- txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
--- -- NTP_SCALE_SHIFT);
--- -- if (!(time_status & STA_NANO))
--- -- txc->offset /= NSEC_PER_USEC;
--- -- }
--- -- txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
--- -- (s64)PPM_SCALE_INV,
--- -- NTP_SCALE_SHIFT);
+++ ++ txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+++ ++ (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
#ifdef CONFIG_GENERIC_TIME
/**
----- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+++++ * clocksource_forward_now - update clock to the current time
*
----- * private function, must hold xtime_lock lock when being
----- * called. Returns the number of nanoseconds since the
----- * last call to update_wall_time() (adjusted by NTP scaling)
+++++ * Forward the current clock to update its state since the last call to
+++++ * update_wall_time(). This is useful before significant clock changes,
+++++ * as it avoids having to deal with this time offset explicitly.
*/
-----static inline s64 __get_nsec_offset(void)
+++++static void clocksource_forward_now(void)
{
cycle_t cycle_now, cycle_delta;
----- s64 ns_offset;
+++++ s64 nsec;
----- /* read clocksource: */
cycle_now = clocksource_read(clock);
-----
----- /* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+++++ clock->cycle_last = cycle_now;
----- /* convert to nanoseconds: */
----- ns_offset = cyc2ns(clock, cycle_delta);
+++++ nsec = cyc2ns(clock, cycle_delta);
+++++ timespec_add_ns(&xtime, nsec);
----- return ns_offset;
+++++ nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+++++ clock->raw_time.tv_nsec += nsec;
}
/**
*/
void getnstimeofday(struct timespec *ts)
{
+++++ cycle_t cycle_now, cycle_delta;
unsigned long seq;
s64 nsecs;
seq = read_seqbegin(&xtime_lock);
*ts = xtime;
----- nsecs = __get_nsec_offset();
+++++
+++++ /* read clocksource: */
+++++ cycle_now = clocksource_read(clock);
+++++
+++++ /* calculate the delta since the last update_wall_time: */
+++++ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+++++
+++++ /* convert to nanoseconds: */
+++++ nsecs = cyc2ns(clock, cycle_delta);
} while (read_seqretry(&xtime_lock, seq));
*/
int do_settimeofday(struct timespec *tv)
{
+++++ struct timespec ts_delta;
unsigned long flags;
----- time_t wtm_sec, sec = tv->tv_sec;
----- long wtm_nsec, nsec = tv->tv_nsec;
if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
write_seqlock_irqsave(&xtime_lock, flags);
----- nsec -= __get_nsec_offset();
+++++ clocksource_forward_now();
+++++
+++++ ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
+++++ ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
+++++ wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
----- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
----- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+++++ xtime = *tv;
----- set_normalized_timespec(&xtime, sec, nsec);
----- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
update_xtime_cache(0);
clock->error = 0;
static void change_clocksource(void)
{
struct clocksource *new;
----- cycle_t now;
----- u64 nsec;
new = clocksource_get_next();
if (clock == new)
return;
----- new->cycle_last = 0;
----- now = clocksource_read(new);
----- nsec = __get_nsec_offset();
----- timespec_add_ns(&xtime, nsec);
+++++ clocksource_forward_now();
----- clock = new;
----- clock->cycle_last = now;
+++++ new->raw_time = clock->raw_time;
+++++ clock = new;
+++++ clock->cycle_last = 0;
+++++ clock->cycle_last = clocksource_read(new);
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
*/
}
#else
+++++static inline void clocksource_forward_now(void) { }
static inline void change_clocksource(void) { }
-----static inline s64 __get_nsec_offset(void) { return 0; }
#endif
+++++/**
+++++ * getrawmonotonic - Returns the raw monotonic time in a timespec
+++++ * @ts: pointer to the timespec to be set
+++++ *
+++++ * Returns the raw monotonic time (completely un-modified by ntp)
+++++ */
+++++void getrawmonotonic(struct timespec *ts)
+++++{
+++++ unsigned long seq;
+++++ s64 nsecs;
+++++ cycle_t cycle_now, cycle_delta;
+++++
+++++ do {
+++++ seq = read_seqbegin(&xtime_lock);
+++++
+++++ /* read clocksource: */
+++++ cycle_now = clocksource_read(clock);
+++++
+++++ /* calculate the delta since the last update_wall_time: */
+++++ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+++++
+++++ /* convert to nanoseconds: */
+++++ nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+++++
+++++ *ts = clock->raw_time;
+++++
+++++ } while (read_seqretry(&xtime_lock, seq));
+++++
+++++ timespec_add_ns(ts, nsecs);
+++++}
+++++EXPORT_SYMBOL(getrawmonotonic);
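/*
 * Illustrative sketch, not part of this patch: measuring an interval on
 * the new NTP-unaffected raw clock.  demo_time_call() is a hypothetical
 * helper.
 */
#include <linux/time.h>

static s64 demo_time_call(void (*fn)(void))
{
	struct timespec start, end;

	getrawmonotonic(&start);
	fn();
	getrawmonotonic(&end);

	/* raw monotonic time never jumps, so the delta is well defined */
	return timespec_to_ns(&end) - timespec_to_ns(&start);
}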
+++++
+++++
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
*/
static int timekeeping_suspended;
/* time in seconds when suspend began */
static unsigned long timekeeping_suspend_time;
-----/* xtime offset when we went into suspend */
-----static s64 timekeeping_suspend_nsecs;
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
wall_to_monotonic.tv_sec -= sleep_length;
total_sleep_time += sleep_length;
}
----- /* Make sure that we have the correct xtime reference */
----- timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
update_xtime_cache(0);
/* re-base the last cycle value */
clock->cycle_last = 0;
timekeeping_suspend_time = read_persistent_clock();
write_seqlock_irqsave(&xtime_lock, flags);
----- /* Get the current xtime offset */
----- timekeeping_suspend_nsecs = __get_nsec_offset();
+++++ clocksource_forward_now();
timekeeping_suspended = 1;
write_sequnlock_irqrestore(&xtime_lock, flags);
#else
offset = clock->cycle_interval;
#endif
--- -- clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+++ ++ clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
while (offset >= clock->cycle_interval) {
/* accumulate one interval */
----- clock->xtime_nsec += clock->xtime_interval;
----- clock->cycle_last += clock->cycle_interval;
offset -= clock->cycle_interval;
+++++ clock->cycle_last += clock->cycle_interval;
+++++ clock->xtime_nsec += clock->xtime_interval;
if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
xtime.tv_sec++;
second_overflow();
}
+++++ clock->raw_time.tv_nsec += clock->raw_interval;
+++++ if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+++++ clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+++++ clock->raw_time.tv_sec++;
+++++ }
+++++
/* accumulate error between NTP and clock interval */
clock->error += tick_length;
clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
/* correct the clock when NTP error is too big */
clocksource_adjust(offset);
--- -- /* store full nanoseconds into xtime */
--- -- xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
+++ ++ /* store full nanoseconds into xtime after rounding it up and
+++ ++ * adding the remainder to the error difference.
+++ ++ */
+++ ++ xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+++ ++ clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
update_xtime_cache(cyc2ns(clock, offset));
run_local_timers();
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_tick);
++ ++ printk_tick();
scheduler_tick();
run_posix_cpu_timers(p);
}
BUG_ON(cpu_online(cpu));
old_base = per_cpu(tvec_bases, cpu);
new_base = get_cpu_var(tvec_bases);
- ----
- ---- local_irq_disable();
- ---- spin_lock(&new_base->lock);
+ ++++ /*
+ ++++ * The caller is globally serialized and nobody else
+ ++++ * takes two locks at once, so deadlock is not possible.
+ ++++ */
+ ++++ spin_lock_irq(&new_base->lock);
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
BUG_ON(old_base->running_timer);
}
spin_unlock(&old_base->lock);
- ---- spin_unlock(&new_base->lock);
- ---- local_irq_enable();
+ ++++ spin_unlock_irq(&new_base->lock);
put_cpu_var(tvec_bases);
}
#endif /* CONFIG_HOTPLUG_CPU */
#include <linux/string.h>
#include <linux/selinux.h>
#include <linux/mutex.h>
++++ +#include <linux/posix-timers.h>
#include "avc.h"
#include "objsec.h"
struct sk_security_struct *ssec = sk->sk_security;
sk->sk_security = NULL;
++ ++ selinux_netlbl_sk_security_free(ssec);
kfree(ssec);
}
Opt_rootcontext = 4,
};
-- -- static match_table_t tokens = {
++ ++ static const match_table_t tokens = {
{Opt_context, CONTEXT_STR "%s"},
{Opt_fscontext, FSCONTEXT_STR "%s"},
{Opt_defcontext, DEFCONTEXT_STR "%s"},
return rc;
}
-- -- void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts)
++ ++ static void selinux_write_opts(struct seq_file *m,
++ ++ struct security_mnt_opts *opts)
{
int i;
char *prefix;
/* Default to the fs superblock SID. */
isec->sid = sbsec->sid;
-- -- if (sbsec->proc) {
++ ++ if (sbsec->proc && !S_ISLNK(inode->i_mode)) {
struct proc_inode *proci = PROC_I(inode);
if (proci->pde) {
isec->sclass = inode_mode_to_security_class(inode->i_mode);
long j = -1;
int drop_tty = 0;
-- -- mutex_lock(&tty_mutex);
tty = get_current_tty();
if (tty) {
file_list_lock();
}
}
file_list_unlock();
++ ++ tty_kref_put(tty);
}
-- -- mutex_unlock(&tty_mutex);
/* Reset controlling tty. */
if (drop_tty)
no_tty();
initrlim = init_task.signal->rlim+i;
rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
}
---- - if (current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
---- - /*
---- - * This will cause RLIMIT_CPU calculations
---- - * to be refigured.
---- - */
---- - current->it_prof_expires = jiffies_to_cputime(1);
---- - }
++++ + update_rlimit_cpu(rlim->rlim_cur);
}
/* Wake up the parent if it is waiting so that it can
#endif /* IPV6 */
static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
-- -- char **addrp, int src, u8 *proto)
++ ++ char **_addrp, int src, u8 *proto)
{
-- -- int ret = 0;
++ ++ char *addrp;
++ ++ int ret;
switch (ad->u.net.family) {
case PF_INET:
ret = selinux_parse_skb_ipv4(skb, ad, proto);
-- -- if (ret || !addrp)
-- -- break;
-- -- *addrp = (char *)(src ? &ad->u.net.v4info.saddr :
-- -- &ad->u.net.v4info.daddr);
-- -- break;
++ ++ if (ret)
++ ++ goto parse_error;
++ ++ addrp = (char *)(src ? &ad->u.net.v4info.saddr :
++ ++ &ad->u.net.v4info.daddr);
++ ++ goto okay;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
case PF_INET6:
ret = selinux_parse_skb_ipv6(skb, ad, proto);
-- -- if (ret || !addrp)
-- -- break;
-- -- *addrp = (char *)(src ? &ad->u.net.v6info.saddr :
-- -- &ad->u.net.v6info.daddr);
-- -- break;
++ ++ if (ret)
++ ++ goto parse_error;
++ ++ addrp = (char *)(src ? &ad->u.net.v6info.saddr :
++ ++ &ad->u.net.v6info.daddr);
++ ++ goto okay;
#endif /* IPV6 */
default:
-- -- break;
++ ++ addrp = NULL;
++ ++ goto okay;
}
-- -- if (unlikely(ret))
-- -- printk(KERN_WARNING
-- -- "SELinux: failure in selinux_parse_skb(),"
-- -- " unable to parse packet\n");
-- --
++ ++ parse_error:
++ ++ printk(KERN_WARNING
++ ++ "SELinux: failure in selinux_parse_skb(),"
++ ++ " unable to parse packet\n");
return ret;
++ ++
++ ++ okay:
++ ++ if (_addrp)
++ ++ *_addrp = addrp;
++ ++ return 0;
}
/**
static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen)
{
++ ++ struct sock *sk = sock->sk;
struct inode_security_struct *isec;
int err;
isec = SOCK_INODE(sock)->i_security;
if (isec->sclass == SECCLASS_TCP_SOCKET ||
isec->sclass == SECCLASS_DCCP_SOCKET) {
-- -- struct sock *sk = sock->sk;
struct avc_audit_data ad;
struct sockaddr_in *addr4 = NULL;
struct sockaddr_in6 *addr6 = NULL;
goto out;
}
++ ++ err = selinux_netlbl_socket_connect(sk, address);
++ ++
out:
return err;
}
}
static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
-- -- struct avc_audit_data *ad,
-- -- u16 family, char *addrp)
++ ++ u16 family)
{
int err;
struct sk_security_struct *sksec = sk->sk_security;
u32 peer_sid;
u32 sk_sid = sksec->sid;
++ ++ struct avc_audit_data ad;
++ ++ char *addrp;
++ ++
++ ++ AVC_AUDIT_DATA_INIT(&ad, NET);
++ ++ ad.u.net.netif = skb->iif;
++ ++ ad.u.net.family = family;
++ ++ err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
++ ++ if (err)
++ ++ return err;
if (selinux_compat_net)
-- -- err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad,
++ ++ err = selinux_sock_rcv_skb_iptables_compat(sk, skb, &ad,
family, addrp);
else
err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
-- -- PACKET__RECV, ad);
++ ++ PACKET__RECV, &ad);
if (err)
return err;
if (err)
return err;
err = avc_has_perm(sk_sid, peer_sid,
-- -- SECCLASS_PEER, PEER__RECV, ad);
++ ++ SECCLASS_PEER, PEER__RECV, &ad);
++ ++ if (err)
++ ++ selinux_netlbl_err(skb, err, 0);
} else {
-- -- err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad);
++ ++ err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad);
if (err)
return err;
-- -- err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad);
++ ++ err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
}
return err;
u32 sk_sid = sksec->sid;
struct avc_audit_data ad;
char *addrp;
++ ++ u8 secmark_active;
++ ++ u8 peerlbl_active;
if (family != PF_INET && family != PF_INET6)
return 0;
if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
family = PF_INET;
++ ++ /* If any sort of compatibility mode is enabled then hand off processing
++ ++ * to the selinux_sock_rcv_skb_compat() function to deal with the
++ ++ * special handling. We do this in an attempt to keep this function
++ ++ * as fast and as clean as possible. */
++ ++ if (selinux_compat_net || !selinux_policycap_netpeer)
++ ++ return selinux_sock_rcv_skb_compat(sk, skb, family);
++ ++
++ ++ secmark_active = selinux_secmark_enabled();
++ ++ peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
++ ++ if (!secmark_active && !peerlbl_active)
++ ++ return 0;
++ ++
AVC_AUDIT_DATA_INIT(&ad, NET);
ad.u.net.netif = skb->iif;
ad.u.net.family = family;
if (err)
return err;
-- -- /* If any sort of compatibility mode is enabled then handoff processing
-- -- * to the selinux_sock_rcv_skb_compat() function to deal with the
-- -- * special handling. We do this in an attempt to keep this function
-- -- * as fast and as clean as possible. */
-- -- if (selinux_compat_net || !selinux_policycap_netpeer)
-- -- return selinux_sock_rcv_skb_compat(sk, skb, &ad,
-- -- family, addrp);
-- --
-- -- if (netlbl_enabled() || selinux_xfrm_enabled()) {
++ ++ if (peerlbl_active) {
u32 peer_sid;
err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
return err;
err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family,
peer_sid, &ad);
-- -- if (err)
++ ++ if (err) {
++ ++ selinux_netlbl_err(skb, err, 0);
return err;
++ ++ }
err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
PEER__RECV, &ad);
++ ++ if (err)
++ ++ selinux_netlbl_err(skb, err, 0);
}
-- -- if (selinux_secmark_enabled()) {
++ ++ if (secmark_active) {
err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
PACKET__RECV, &ad);
if (err)
u32 peer_secid = SECSID_NULL;
u16 family;
-- -- if (sock)
++ ++ if (skb && skb->protocol == htons(ETH_P_IP))
++ ++ family = PF_INET;
++ ++ else if (skb && skb->protocol == htons(ETH_P_IPV6))
++ ++ family = PF_INET6;
++ ++ else if (sock)
family = sock->sk->sk_family;
-- -- else if (skb && skb->sk)
-- -- family = skb->sk->sk_family;
else
goto out;
sk->sk_family == PF_UNIX)
isec->sid = sksec->sid;
sksec->sclass = isec->sclass;
-- --
-- -- selinux_netlbl_sock_graft(sk, parent);
}
static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
{
struct sk_security_struct *sksec = sk->sk_security;
int err;
++ ++ u16 family = sk->sk_family;
u32 newsid;
u32 peersid;
-- -- err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid);
++ ++ /* handle mapped IPv4 packets arriving via IPv6 sockets */
++ ++ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
++ ++ family = PF_INET;
++ ++
++ ++ err = selinux_skb_peerlbl_sid(skb, family, &peersid);
if (err)
return err;
if (peersid == SECSID_NULL) {
selinux_netlbl_sk_security_reset(newsksec, req->rsk_ops->family);
}
-- -- static void selinux_inet_conn_established(struct sock *sk,
-- -- struct sk_buff *skb)
++ ++ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
{
++ ++ u16 family = sk->sk_family;
struct sk_security_struct *sksec = sk->sk_security;
-- -- selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid);
++ ++ /* handle mapped IPv4 packets arriving via IPv6 sockets */
++ ++ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
++ ++ family = PF_INET;
++ ++
++ ++ selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
++ ++
++ ++ selinux_netlbl_inet_conn_established(sk, family);
}
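Both hooks above repeat the same IPv4-mapped-over-IPv6 check before calling
selinux_skb_peerlbl_sid(). As an illustration only (this helper is not part
of the patch), the test could be factored into a small inline:

#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <net/sock.h>

/* hypothetical helper, not in the original patch */
static inline u16 selinux_skb_family(const struct sock *sk,
				     const struct sk_buff *skb)
{
	u16 family = sk->sk_family;

	/* handle mapped IPv4 packets arriving via IPv6 sockets */
	if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
		family = PF_INET;

	return family;
}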
static void selinux_req_classify_flow(const struct request_sock *req,
static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
u16 family)
{
++ ++ int err;
char *addrp;
u32 peer_sid;
struct avc_audit_data ad;
u8 secmark_active;
++ ++ u8 netlbl_active;
u8 peerlbl_active;
if (!selinux_policycap_netpeer)
return NF_ACCEPT;
secmark_active = selinux_secmark_enabled();
-- -- peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
++ ++ netlbl_active = netlbl_enabled();
++ ++ peerlbl_active = netlbl_active || selinux_xfrm_enabled();
if (!secmark_active && !peerlbl_active)
return NF_ACCEPT;
++ ++ if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
++ ++ return NF_DROP;
++ ++
AVC_AUDIT_DATA_INIT(&ad, NET);
ad.u.net.netif = ifindex;
ad.u.net.family = family;
if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
return NF_DROP;
-- -- if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
-- -- return NF_DROP;
-- --
-- -- if (peerlbl_active)
-- -- if (selinux_inet_sys_rcv_skb(ifindex, addrp, family,
-- -- peer_sid, &ad) != 0)
++ ++ if (peerlbl_active) {
++ ++ err = selinux_inet_sys_rcv_skb(ifindex, addrp, family,
++ ++ peer_sid, &ad);
++ ++ if (err) {
++ ++ selinux_netlbl_err(skb, err, 1);
return NF_DROP;
++ ++ }
++ ++ }
if (secmark_active)
if (avc_has_perm(peer_sid, skb->secmark,
SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
return NF_DROP;
++ ++ if (netlbl_active)
++ ++ /* we do this in the FORWARD path and not the POST_ROUTING
++ ++ * path because we want to make sure we apply the necessary
++ ++ * labeling before IPsec is applied so we can leverage AH
++ ++ * protection */
++ ++ if (selinux_netlbl_skbuff_setsid(skb, family, peer_sid) != 0)
++ ++ return NF_DROP;
++ ++
return NF_ACCEPT;
}
}
#endif /* IPV6 */
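For reference, the IPv4 netfilter entry point that feeds selinux_ip_forward()
presumably mirrors the selinux_ipv4_output() wrapper added below; this is a
sketch under that assumption and is not part of the hunk itself.

static unsigned int selinux_ipv4_forward(unsigned int hooknum,
					 struct sk_buff *skb,
					 const struct net_device *in,
					 const struct net_device *out,
					 int (*okfn)(struct sk_buff *))
{
	return selinux_ip_forward(skb, in->ifindex, PF_INET);
}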
++ ++ static unsigned int selinux_ip_output(struct sk_buff *skb,
++ ++ u16 family)
++ ++ {
++ ++ u32 sid;
++ ++
++ ++ if (!netlbl_enabled())
++ ++ return NF_ACCEPT;
++ ++
++ ++ /* we do this in the LOCAL_OUT path and not the POST_ROUTING path
++ ++ * because we want to make sure we apply the necessary labeling
++ ++ * before IPsec is applied so we can leverage AH protection */
++ ++ if (skb->sk) {
++ ++ struct sk_security_struct *sksec = skb->sk->sk_security;
++ ++ sid = sksec->sid;
++ ++ } else
++ ++ sid = SECINITSID_KERNEL;
++ ++ if (selinux_netlbl_skbuff_setsid(skb, family, sid) != 0)
++ ++ return NF_DROP;
++ ++
++ ++ return NF_ACCEPT;
++ ++ }
++ ++
++ ++ static unsigned int selinux_ipv4_output(unsigned int hooknum,
++ ++ struct sk_buff *skb,
++ ++ const struct net_device *in,
++ ++ const struct net_device *out,
++ ++ int (*okfn)(struct sk_buff *))
++ ++ {
++ ++ return selinux_ip_output(skb, PF_INET);
++ ++ }
++ ++
static int selinux_ip_postroute_iptables_compat(struct sock *sk,
int ifindex,
struct avc_audit_data *ad,
static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
int ifindex,
-- -- struct avc_audit_data *ad,
-- -- u16 family,
-- -- char *addrp,
-- -- u8 proto)
++ ++ u16 family)
{
struct sock *sk = skb->sk;
struct sk_security_struct *sksec;
++ ++ struct avc_audit_data ad;
++ ++ char *addrp;
++ ++ u8 proto;
if (sk == NULL)
return NF_ACCEPT;
sksec = sk->sk_security;
++ ++ AVC_AUDIT_DATA_INIT(&ad, NET);
++ ++ ad.u.net.netif = ifindex;
++ ++ ad.u.net.family = family;
++ ++ if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
++ ++ return NF_DROP;
++ ++
if (selinux_compat_net) {
if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex,
-- -- ad, family, addrp))
++ ++ &ad, family, addrp))
return NF_DROP;
} else {
if (avc_has_perm(sksec->sid, skb->secmark,
-- -- SECCLASS_PACKET, PACKET__SEND, ad))
++ ++ SECCLASS_PACKET, PACKET__SEND, &ad))
return NF_DROP;
}
if (selinux_policycap_netpeer)
-- -- if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto))
++ ++ if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto))
return NF_DROP;
return NF_ACCEPT;
struct sock *sk;
struct avc_audit_data ad;
char *addrp;
-- -- u8 proto;
u8 secmark_active;
u8 peerlbl_active;
-- -- AVC_AUDIT_DATA_INIT(&ad, NET);
-- -- ad.u.net.netif = ifindex;
-- -- ad.u.net.family = family;
-- -- if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
-- -- return NF_DROP;
-- --
/* If any sort of compatibility mode is enabled then hand off processing
* to the selinux_ip_postroute_compat() function to deal with the
* special handling. We do this in an attempt to keep this function
* as fast and as clean as possible. */
if (selinux_compat_net || !selinux_policycap_netpeer)
-- -- return selinux_ip_postroute_compat(skb, ifindex, &ad,
-- -- family, addrp, proto);
++ ++ return selinux_ip_postroute_compat(skb, ifindex, family);
/* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
* packet transformation so allow the packet to pass without any checks
if (!secmark_active && !peerlbl_active)
return NF_ACCEPT;
-- -- /* if the packet is locally generated (skb->sk != NULL) then use the
-- -- * socket's label as the peer label, otherwise the packet is being
-- -- * forwarded through this system and we need to fetch the peer label
-- -- * directly from the packet */
++ ++ /* if the packet is being forwarded then get the peer label from the
++ ++ * packet itself; otherwise check whether it comes from a local
++ ++ * application or the kernel: if from an application, get the peer label
++ ++ * from the sending socket, otherwise use the kernel's sid */
sk = skb->sk;
-- -- if (sk) {
++ ++ if (sk == NULL) {
++ ++ switch (family) {
++ ++ case PF_INET:
++ ++ if (IPCB(skb)->flags & IPSKB_FORWARDED)
++ ++ secmark_perm = PACKET__FORWARD_OUT;
++ ++ else
++ ++ secmark_perm = PACKET__SEND;
++ ++ break;
++ ++ case PF_INET6:
++ ++ if (IP6CB(skb)->flags & IP6SKB_FORWARDED)
++ ++ secmark_perm = PACKET__FORWARD_OUT;
++ ++ else
++ ++ secmark_perm = PACKET__SEND;
++ ++ break;
++ ++ default:
++ ++ return NF_DROP;
++ ++ }
++ ++ if (secmark_perm == PACKET__FORWARD_OUT) {
++ ++ if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
++ ++ return NF_DROP;
++ ++ } else
++ ++ peer_sid = SECINITSID_KERNEL;
++ ++ } else {
struct sk_security_struct *sksec = sk->sk_security;
peer_sid = sksec->sid;
secmark_perm = PACKET__SEND;
-- -- } else {
-- -- if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
-- -- return NF_DROP;
-- -- secmark_perm = PACKET__FORWARD_OUT;
}
++ ++ AVC_AUDIT_DATA_INIT(&ad, NET);
++ ++ ad.u.net.netif = ifindex;
++ ++ ad.u.net.family = family;
++ ++ if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL))
++ ++ return NF_DROP;
++ ++
if (secmark_active)
if (avc_has_perm(peer_sid, skb->secmark,
SECCLASS_PACKET, secmark_perm, &ad))
if (sid == 0)
return -EINVAL;
-- --
-- -- /* Only allow single threaded processes to change context */
++ ++ /*
++ ++ * SELinux allows the context to be changed only in the following cases:
++ ++ * - single-threaded processes;
++ ++ * - multi-threaded processes changing into a more restricted domain
++ ++ * (as defined by a TYPEBOUNDS statement).
++ ++ */
if (atomic_read(&p->mm->mm_users) != 1) {
struct task_struct *g, *t;
struct mm_struct *mm = p->mm;
do_each_thread(g, t) {
if (t->mm == mm && t != p) {
read_unlock(&tasklist_lock);
-- -- return -EPERM;
++ ++ error = security_bounded_transition(tsec->sid, sid);
++ ++ if (!error)
++ ++ goto boundary_ok;
++ ++
++ ++ return error;
}
} while_each_thread(g, t);
read_unlock(&tasklist_lock);
}
++ ++ boundary_ok:
/* Check permissions for the transition. */
error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS,
.pf = PF_INET,
.hooknum = NF_INET_FORWARD,
.priority = NF_IP_PRI_SELINUX_FIRST,
++ ++ },
++ ++ {
++ ++ .hook = selinux_ipv4_output,
++ ++ .owner = THIS_MODULE,
++ ++ .pf = PF_INET,
++ ++ .hooknum = NF_INET_LOCAL_OUT,
++ ++ .priority = NF_IP_PRI_SELINUX_FIRST,
}
};
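For context, an ops array like the one above is normally registered once at
boot. The sketch below shows roughly how that could look; the array name
selinux_ipv4_ops and the init function name are assumptions mirroring the
surrounding hooks code, not text taken from this patch.

static int __init selinux_nf_ip_init(void)
{
	int err;

	if (!selinux_enabled)
		return 0;

	/* register every hook in the array in one call */
	err = nf_register_hooks(selinux_ipv4_ops,
				ARRAY_SIZE(selinux_ipv4_ops));
	if (err)
		panic("SELinux: nf_register_hooks: error %d\n", err);

	return 0;
}
__initcall(selinux_nf_ip_init);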