From: Ingo Molnar
Date: Mon, 24 Nov 2008 16:44:55 +0000 (+0100)
Subject: Merge branches 'core/debug', 'core/futexes', 'core/locking', 'core/rcu', 'core/signal...
X-Git-Tag: v2.6.29-rc1~572^2~4
X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=b19b3c74c7bbec45a848631b8f970ac110665a01;hp=-c;p=linux-2.6-omap-h63xx.git

Merge branches 'core/debug', 'core/futexes', 'core/locking', 'core/rcu', 'core/signal', 'core/urgent' and 'core/xen' into core/core
---

b19b3c74c7bbec45a848631b8f970ac110665a01
diff --combined arch/x86/include/asm/uaccess_64.h
index f8cfd00db45,515d4dce96b,515d4dce96b,543ba883cc6,664f15280f1,664f15280f1,f8cfd00db45,c96c1f5d07a..84210c479fc
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@@@@@@@@ -1,5 -1,5 -1,5 -1,5 -1,5 -1,5 -1,5 -1,5 +1,5 @@@@@@@@@
-- #ifndef __X86_64_UACCESS_H
-- #define __X86_64_UACCESS_H
 -#ifndef ASM_X86__UACCESS_64_H
 -#define ASM_X86__UACCESS_64_H
++ +#ifndef _ASM_X86_UACCESS_64_H
++ +#define _ASM_X86_UACCESS_64_H
 /*
  * User space memory access functions
@@@@@@@@@ -7,7 -7,6 -7,6 -7,7 -7,7 -7,7 -7,7 -7,7 +7,7 @@@@@@@@@
 #include
 #include
 #include
++ #include
 #include
 /*
@@@@@@@@@ -29,6 -28,6 -28,6 -29,8 -29,6 -29,6 -29,6 -29,6 +29,8 @@@@@@@@@ static __always_inline __must_chec
 int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
+++ ++++
+++ ++++	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic(dst, (__force void *)src, size);
 	switch (size) {
@@@@@@@@@ -46,7 -45,7 -45,7 -48,7 -46,7 -46,7 -46,7 -46,7 +48,7 @@@@@@@@@
 		return ret;
 	case 10:
 		__get_user_asm(*(u64 *)dst, (u64 __user *)src,
----- -			       ret, "q", "", "=r", 16);
+++++ +			       ret, "q", "", "=r", 10);
 		if (unlikely(ret))
 			return ret;
 		__get_user_asm(*(u16 *)(8 + (char *)dst),
@@@@@@@@@ -71,6 -70,6 -70,6 -73,8 -71,6 -71,6 -71,6 -71,6 +73,8 @@@@@@@@@ static __always_inline __must_chec
 int __copy_to_user(void __user *dst, const void *src, unsigned size)
 {
 	int ret = 0;
+++ ++++
+++ ++++	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic((__force void *)dst, src, size);
 	switch (size) {
@@@@@@@@@ -113,6 -112,6 -112,6 -117,8 -113,6 -113,6 -113,6 -113,6 +117,8 @@@@@@@@@ static __always_inline __must_chec
 int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
+++ ++++
+++ ++++	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic((__force void *)dst,
 					 (__force void *)src, size);
@@@@@@@@@ -199,4 -198,4 -198,4 -205,4 -199,4 -199,4 -199,4 -199,4 +205,4 @@@@@@@@@ static inline int __copy_from_user_inat
 unsigned long
 copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
-- #endif /* __X86_64_UACCESS_H */
 -#endif /* ASM_X86__UACCESS_64_H */
++ +#endif /* _ASM_X86_UACCESS_64_H */
diff --combined include/linux/kernel.h
index dc7e0d0a647,3f30557be2a,2651f805ba6,69a9bfdf9c8,fba141d3ca0,fba141d3ca0,dc7e0d0a647,94d17ff64c5..269df5a17b3
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@@@@@@@@ -16,7 -16,6 -16,6 -16,7 -16,7 -16,7 -16,7 -16,7 +16,7 @@@@@@@@@
 #include
 #include
 #include
++ #include
 #include
 #include
@@@@@@@@@ -116,8 -115,6 -115,6 -116,8 -116,8 -116,8 -116,8 -116,6 +116,8 @@@@@@@@@ extern int _cond_resched(void)
 # define might_resched() do { } while (0)
 #endif
++ +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
++ + void __might_sleep(char *file, int line);
 /**
  * might_sleep - annotation for functions that can sleep
  *
@@@@@@@@@ -128,6 -125,8 -125,8 -128,6 -128,6 -128,6 -128,6 -126,8 +128,6 @@@@@@@@@
  * be bitten later when
the calling function happens to sleep when it is not * supposed to. */ -- -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -- - void __might_sleep(char *file, int line); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) #else @@@@@@@@@ -141,6 -140,6 -140,6 -141,15 -141,6 -141,6 -141,6 -141,6 +141,15 @@@@@@@@@ (__x < 0) ? -__x : __x; \ }) +++ ++++#ifdef CONFIG_PROVE_LOCKING +++ ++++void might_fault(void); +++ ++++#else +++ ++++static inline void might_fault(void) +++ ++++{ +++ ++++ might_sleep(); +++ ++++} +++ ++++#endif +++ ++++ extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(long time); NORET_TYPE void panic(const char * fmt, ...) @@@@@@@@@ -183,38 -182,16 -182,14 -192,38 -183,38 -183,38 -183,38 -183,14 +192,40 @@@@@@@@@ extern int vsscanf(const char *, const extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); -- extern unsigned long long memparse(char *ptr, char **retptr); ++ extern unsigned long long memparse(const char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); + ++++++extern int func_ptr_is_kernel_text(void *ptr); + ++++++ struct pid; extern struct pid *session_of_pgrp(struct pid *pgrp); ++ +/* ++ + * FW_BUG ++ + * Add this to a message where you are sure the firmware is buggy or behaves ++ + * really stupid or out of spec. Be aware that the responsible BIOS developer ++ + * should be able to fix this issue or at least get a concrete idea of the ++ + * problem by reading your message without the need of looking at the kernel ++ + * code. ++ + * ++ + * Use it for definite and high priority BIOS bugs. ++ + * ++ + * FW_WARN ++ + * Use it for not that clear (e.g. could the kernel messed up things already?) ++ + * and medium priority BIOS bugs. ++ + * ++ + * FW_INFO ++ + * Use this one if you want to tell the user or vendor about something ++ + * suspicious, but generally harmless related to the firmware. ++ + * ++ + * Use it for information or very low priority BIOS bugs. 
++ + */ ++ +#define FW_BUG "[Firmware Bug]: " ++ +#define FW_WARN "[Firmware Warn]: " ++ +#define FW_INFO "[Firmware Info]: " ++ + #ifdef CONFIG_PRINTK asmlinkage int vprintk(const char *fmt, va_list args) __attribute__ ((format (printf, 1, 0))); @@@@@@@@@ -238,9 -215,6 -213,6 -247,9 -238,9 -238,9 -238,9 -214,9 +249,9 @@@@@@@@@ static inline bool printk_timed_ratelim { return false; } #endif ++ extern int printk_needs_cpu(int cpu); ++ extern void printk_tick(void); ++ extern void asmlinkage __attribute__((format(printf, 1, 2))) early_printk(const char *fmt, ...); @@@@@@@@@ -263,10 -237,9 -235,9 -272,10 -263,10 -263,10 -263,10 -239,10 +274,10 @@@@@@@@@ extern int oops_in_progress; /* If set extern int panic_timeout; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; -- extern int tainted; extern const char *print_tainted(void); -- extern void add_taint(unsigned); ++ extern void add_taint(unsigned flag); ++ extern int test_taint(unsigned flag); ++ extern unsigned long get_taint(void); extern int root_mountflags; /* Values used for system_state */ @@@@@@@@@ -279,17 -252,16 -250,16 -288,17 -279,17 -279,17 -279,17 -255,17 +290,17 @@@@@@@@@ extern enum system_states SYSTEM_SUSPEND_DISK, } system_state; -- #define TAINT_PROPRIETARY_MODULE (1<<0) -- #define TAINT_FORCED_MODULE (1<<1) -- #define TAINT_UNSAFE_SMP (1<<2) -- #define TAINT_FORCED_RMMOD (1<<3) -- #define TAINT_MACHINE_CHECK (1<<4) -- #define TAINT_BAD_PAGE (1<<5) -- #define TAINT_USER (1<<6) -- #define TAINT_DIE (1<<7) -- #define TAINT_OVERRIDDEN_ACPI_TABLE (1<<8) -- #define TAINT_WARN (1<<9) ++ #define TAINT_PROPRIETARY_MODULE 0 ++ #define TAINT_FORCED_MODULE 1 ++ #define TAINT_UNSAFE_SMP 2 ++ #define TAINT_FORCED_RMMOD 3 ++ #define TAINT_MACHINE_CHECK 4 ++ #define TAINT_BAD_PAGE 5 ++ #define TAINT_USER 6 ++ #define TAINT_DIE 7 ++ #define TAINT_OVERRIDDEN_ACPI_TABLE 8 ++ #define TAINT_WARN 9 ++ #define TAINT_CRAP 10 extern void dump_stack(void) __cold; @@@@@@@@@ -318,36 -290,28 -288,28 -327,32 -318,32 -318,32 -318,36 -294,32 +329,36 @@@@@@@@@ static inline char *pack_hex_byte(char return buf; } ----- -#define pr_emerg(fmt, arg...) \ ----- - printk(KERN_EMERG fmt, ##arg) ----- -#define pr_alert(fmt, arg...) \ ----- - printk(KERN_ALERT fmt, ##arg) ----- -#define pr_crit(fmt, arg...) \ ----- - printk(KERN_CRIT fmt, ##arg) ----- -#define pr_err(fmt, arg...) \ ----- - printk(KERN_ERR fmt, ##arg) ----- -#define pr_warning(fmt, arg...) \ ----- - printk(KERN_WARNING fmt, ##arg) ----- -#define pr_notice(fmt, arg...) \ ----- - printk(KERN_NOTICE fmt, ##arg) ----- -#define pr_info(fmt, arg...) \ ----- - printk(KERN_INFO fmt, ##arg) -- -- #ifdef DEBUG +++++ +#ifndef pr_fmt +++++ +#define pr_fmt(fmt) fmt +++++ +#endif +++++ + +++++ +#define pr_emerg(fmt, ...) \ +++++ + printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +++++ +#define pr_alert(fmt, ...) \ +++++ + printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +++++ +#define pr_crit(fmt, ...) \ +++++ + printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +++++ +#define pr_err(fmt, ...) \ +++++ + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +++++ +#define pr_warning(fmt, ...) \ +++++ + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +++++ +#define pr_notice(fmt, ...) \ +++++ + printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +++++ +#define pr_info(fmt, ...) \ +++++ + printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) ++ /* If you are writing a driver, please use dev_dbg instead */ -- #define pr_debug(fmt, arg...) 
\ -- printk(KERN_DEBUG fmt, ##arg) ++ #if defined(CONFIG_DYNAMIC_PRINTK_DEBUG) ++ #define pr_debug(fmt, ...) do { \ --- - dynamic_pr_debug(fmt, ##__VA_ARGS__); \ +++++ + dynamic_pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ ++ } while (0) ++ #elif defined(DEBUG) --- -#define pr_debug(fmt, arg...) \ --- - printk(KERN_DEBUG fmt, ##arg) +++++ +#define pr_debug(fmt, ...) \ +++++ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else ----- -#define pr_debug(fmt, arg...) \ ----- - ({ if (0) printk(KERN_DEBUG fmt, ##arg); 0; }) +++++ +#define pr_debug(fmt, ...) \ +++++ + ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) #endif /* @@@@@@@@@ -524,9 -488,4 -486,4 -529,9 -520,9 -520,9 -524,9 -496,9 +535,9 @@@@@@@@@ struct sysinfo #define NUMA_BUILD 0 #endif ++ /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ ++ #ifdef CONFIG_FTRACE_MCOUNT_RECORD ++ # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD ++ #endif ++ #endif diff --combined kernel/exit.c index 2d8be7ebb0f,16395644a98,85a83c83185,ae2b92be5fa,80137a5d946,b9c4d8bb72e,2d8be7ebb0f,80137a5d946..30fcdf16737 --- a/kernel/exit.c +++ b/kernel/exit.c @@@@@@@@@ -40,13 -40,13 -40,13 -40,14 -40,14 -40,14 -40,13 -40,14 +40,13 @@@@@@@@@ #include #include #include ----- -#include #include #include /* for audit_free() */ #include #include #include #include ++ #include #include #include @@@@@@@@@ -112,6 -112,8 -112,8 -113,6 -113,6 -113,6 -112,6 -113,6 +112,6 @@@@@@@@@ static void __exit_signal(struct task_s * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ -- sig->utime = cputime_add(sig->utime, task_utime(tsk)); -- sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@@@@@@@@ -120,6 -122,7 -122,7 -121,6 -121,6 -121,6 -120,6 -121,6 +120,6 @@@@@@@@@ sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@@@@@@@@ -140,21 -143,13 -143,13 -141,21 -141,16 -141,21 -140,21 -141,16 +140,21 @@@@@@@@@ if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); ++ + + /* ++ + + * Make sure ->signal can't go away under rq->lock, ++ + + * see account_group_exec_runtime(). ++ + + */ ++ + + task_rq_unlock_wait(tsk); __cleanup_signal(sig); } } static void delayed_put_task_struct(struct rcu_head *rhp) { -- put_task_struct(container_of(rhp, struct task_struct, rcu)); ++ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); ++ ++ trace_sched_process_free(tsk); ++ put_task_struct(tsk); } @@@@@@@@@ -588,6 -583,8 -583,6 -589,6 -584,6 -589,6 -588,6 -584,6 +588,6 @@@@@@@@@ mm_need_new_owner(struct mm_struct *mm * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@@@@@@@@ -630,38 -627,29 -625,39 -631,38 -626,38 -631,38 -630,38 -626,38 +630,38 @@@@@@@@@ retry } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. 
+ */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); ++ read_unlock(&tasklist_lock); ++ down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); -- /* -- * Delay read_unlock() till we have the task_lock() -- * to ensure that c does not slip away underneath us -- */ -- read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); ++ up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); ++ up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ @@@@@@@@@ -1058,6 -1046,14 -1054,14 -1059,14 -1054,14 -1059,14 -1058,6 -1054,14 +1058,6 @@@@@@@@@ NORET_TYPE void do_exit(long code exit_itimers(tsk->signal); } acct_collect(code, group_dead); ----- -#ifdef CONFIG_FUTEX ----- - if (unlikely(tsk->robust_list)) ----- - exit_robust_list(tsk); ----- -#ifdef CONFIG_COMPAT ----- - if (unlikely(tsk->compat_robust_list)) ----- - compat_exit_robust_list(tsk); ----- -#endif ----- -#endif if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) @@@@@@@@@ -1070,8 -1066,6 -1074,6 -1079,8 -1074,8 -1079,8 -1070,8 -1074,8 +1070,8 @@@@@@@@@ if (group_dead) acct_process(); ++ trace_sched_process_exit(tsk); ++ exit_sem(tsk); exit_files(tsk); exit_fs(tsk); @@@@@@@@@ -1300,7 -1294,6 -1302,6 -1309,7 -1304,7 -1309,7 -1300,7 -1304,7 +1300,7 @@@@@@@@@ static int wait_task_zombie(struct task if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; ++ struct task_cputime cputime; /* * The resource counters for the group leader are in its @@@@@@@@@ -1316,23 -1309,20 -1317,20 -1325,23 -1320,23 -1325,23 -1316,23 -1320,23 +1316,23 @@@@@@@@@ * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. ++ * ++ * We use thread_group_cputime() to get times for the thread ++ * group, which consolidates times for all threads in the ++ * group including the group leader. 
*/ +++++ ++ thread_group_cputime(p, &cputime); spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; - -- -- thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, -- cputime_add(p->utime, -- cputime_add(sig->utime, -- sig->cutime))); ++ cputime_add(cputime.utime, ++ sig->cutime)); psig->cstime = cputime_add(psig->cstime, -- cputime_add(p->stime, -- cputime_add(sig->stime, -- sig->cstime))); ++ cputime_add(cputime.stime, ++ sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, @@@@@@@@@ -1677,8 -1667,6 -1675,6 -1686,8 -1681,8 -1686,8 -1677,8 -1681,8 +1677,8 @@@@@@@@@ static long do_wait(enum pid_type type struct task_struct *tsk; int retval; ++ trace_sched_process_wait(pid); ++ add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: /* diff --combined kernel/futex.c index 8af10027514,7d1136e97c1,62cbd648e28,8af10027514,8af10027514,8af10027514,8af10027514,7d1136e97c1..e10c5c8786a --- a/kernel/futex.c +++ b/kernel/futex.c @@@@@@@@@ -122,24 -122,24 -122,6 -122,24 -122,24 -122,24 -122,24 -122,24 +122,6 @@@@@@@@@ struct futex_hash_bucket static struct futex_hash_bucket futex_queues[1<mmap_sem, when futex is shared -- ----- */ -- -----static inline void futex_lock_mm(struct rw_semaphore *fshared) -- -----{ -- ----- if (fshared) -- ----- down_read(fshared); -- -----} -- ----- -- -----/* -- ----- * Release mm->mmap_sem, when the futex is shared -- ----- */ -- -----static inline void futex_unlock_mm(struct rw_semaphore *fshared) -- -----{ -- ----- if (fshared) -- ----- up_read(fshared); -- -----} -- ----- /* * We hash on the keys returned from get_futex_key (see below). */ @@@@@@@@@ -161,6 -161,6 -143,45 -161,6 -161,6 -161,6 -161,6 -161,6 +143,45 @@@@@@@@@ static inline int match_futex(union fut && key1->both.offset == key2->both.offset); } ++ +++++/* ++ +++++ * Take a reference to the resource addressed by a key. ++ +++++ * Can be called while holding spinlocks. ++ +++++ * ++ +++++ */ ++ +++++static void get_futex_key_refs(union futex_key *key) ++ +++++{ ++ +++++ if (!key->both.ptr) ++ +++++ return; ++ +++++ ++ +++++ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { ++ +++++ case FUT_OFF_INODE: ++ +++++ atomic_inc(&key->shared.inode->i_count); ++ +++++ break; ++ +++++ case FUT_OFF_MMSHARED: ++ +++++ atomic_inc(&key->private.mm->mm_count); ++ +++++ break; ++ +++++ } ++ +++++} ++ +++++ ++ +++++/* ++ +++++ * Drop a reference to the resource addressed by a key. ++ +++++ * The hash bucket spinlock must not be held. ++ +++++ */ ++ +++++static void drop_futex_key_refs(union futex_key *key) ++ +++++{ ++ +++++ if (!key->both.ptr) ++ +++++ return; ++ +++++ ++ +++++ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { ++ +++++ case FUT_OFF_INODE: ++ +++++ iput(key->shared.inode); ++ +++++ break; ++ +++++ case FUT_OFF_MMSHARED: ++ +++++ mmdrop(key->private.mm); ++ +++++ break; ++ +++++ } ++ +++++} ++ +++++ /** * get_futex_key - Get parameters which are the keys for a futex. * @uaddr: virtual address of the futex @@@@@@@@@ -179,12 -179,12 -200,10 -179,12 -179,12 -179,12 -179,12 -179,12 +200,10 @@@@@@@@@ * For other futexes, it points to ¤t->mm->mmap_sem and * caller must have taken the reader lock. but NOT any spinlocks. 
*/ -- -----static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, -- ----- union futex_key *key) ++ +++++static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; -- ----- struct vm_area_struct *vma; struct page *page; int err; @@@@@@@@@ -208,100 -208,100 -227,50 -208,100 -208,100 -208,100 -208,100 -208,100 +227,50 @@@@@@@@@ return -EFAULT; key->private.mm = mm; key->private.address = address; ++ +++++ get_futex_key_refs(key); return 0; } -- ----- /* -- ----- * The futex is hashed differently depending on whether -- ----- * it's in a shared or private mapping. So check vma first. -- ----- */ -- ----- vma = find_extend_vma(mm, address); -- ----- if (unlikely(!vma)) -- ----- return -EFAULT; -- ----- /* -- ----- * Permissions. -- ----- */ -- ----- if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) -- ----- return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; ++ +++++again: ++ +++++ err = get_user_pages_fast(address, 1, 0, &page); ++ +++++ if (err < 0) ++ +++++ return err; ++ +++++ ++ +++++ lock_page(page); ++ +++++ if (!page->mapping) { ++ +++++ unlock_page(page); ++ +++++ put_page(page); ++ +++++ goto again; ++ +++++ } /* * Private mappings are handled in a simple way. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to -- ----- * the object not the particular process. Therefore we use -- ----- * VM_MAYSHARE here, not VM_SHARED which is restricted to shared -- ----- * mappings of _writable_ handles. ++ +++++ * the object not the particular process. */ -- ----- if (likely(!(vma->vm_flags & VM_MAYSHARE))) { -- ----- key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ ++ +++++ if (PageAnon(page)) { ++ +++++ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; - - return 0; - - } - - - - /* - - * Linear file mappings are also simple. - - */ - - key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - - key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ - - if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - - key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) - - + vma->vm_pgoff); -- ----- return 0; ++ +++++ } else { ++ +++++ key->both.offset |= FUT_OFF_INODE; /* inode-based key */ ++ +++++ key->shared.inode = page->mapping->host; ++ +++++ key->shared.pgoff = page->index; } -- ----- /* - ---- * Linear file mappings are also simple. - - * We could walk the page table to read the non-linear - - * pte, and get the page index without fetching the page - - * from swap. But that's a lot of code to duplicate here - - * for a rare case, so we simply fetch the page. -- ----- */ - ---- key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - ---- key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ - ---- if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - ---- key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) - ---- + vma->vm_pgoff); - - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - - if (err >= 0) { - - key->shared.pgoff = - - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - put_page(page); -- ----- return 0; -- ----- } - - return err; - -} ++ +++++ get_futex_key_refs(key); - ---- /* - ---- * We could walk the page table to read the non-linear - ---- * pte, and get the page index without fetching the page - ---- * from swap. 
But that's a lot of code to duplicate here - ---- * for a rare case, so we simply fetch the page. - ---- */ - ---- err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - ---- if (err >= 0) { - ---- key->shared.pgoff = - ---- page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - ---- put_page(page); - ---- return 0; - ---- } - ---- return err; - ---- } - ---- -- -----/* -- ----- * Take a reference to the resource addressed by a key. -- ----- * Can be called while holding spinlocks. -- ----- * -- ----- */ -- -----static void get_futex_key_refs(union futex_key *key) -- -----{ -- ----- if (key->both.ptr == NULL) -- ----- return; -- ----- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { -- ----- case FUT_OFF_INODE: -- ----- atomic_inc(&key->shared.inode->i_count); -- ----- break; -- ----- case FUT_OFF_MMSHARED: -- ----- atomic_inc(&key->private.mm->mm_count); -- ----- break; -- ----- } ++ +++++ unlock_page(page); ++ +++++ put_page(page); ++ +++++ return 0; } -- -----/* -- ----- * Drop a reference to the resource addressed by a key. -- ----- * The hash bucket spinlock must not be held. -- ----- */ -- -----static void drop_futex_key_refs(union futex_key *key) ++ +++++static inline ++ +++++void put_futex_key(int fshared, union futex_key *key) { -- ----- if (!key->both.ptr) -- ----- return; -- ----- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { -- ----- case FUT_OFF_INODE: -- ----- iput(key->shared.inode); -- ----- break; -- ----- case FUT_OFF_MMSHARED: -- ----- mmdrop(key->private.mm); -- ----- break; -- ----- } ++ +++++ drop_futex_key_refs(key); } static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) @@@@@@@@@ -328,10 -328,10 -297,8 -328,10 -328,10 -328,10 -328,10 -328,10 +297,8 @@@@@@@@@ static int get_futex_value_locked(u32 * /* * Fault handling. 
-- ----- * if fshared is non NULL, current->mm->mmap_sem is already held */ -- -----static int futex_handle_fault(unsigned long address, -- ----- struct rw_semaphore *fshared, int attempt) ++ +++++static int futex_handle_fault(unsigned long address, int attempt) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; @@@@@@@@@ -340,8 -340,8 -307,7 -340,8 -340,8 -340,8 -340,8 -340,8 +307,7 @@@@@@@@@ if (attempt > 2) return ret; -- ----- if (!fshared) -- ----- down_read(&mm->mmap_sem); ++ +++++ down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (vma && address >= vma->vm_start && (vma->vm_flags & VM_WRITE)) { @@@@@@@@@ -361,8 -361,8 -327,7 -361,8 -361,8 -361,8 -361,8 -361,8 +327,7 @@@@@@@@@ current->min_flt++; } } -- ----- if (!fshared) -- ----- up_read(&mm->mmap_sem); ++ +++++ up_read(&mm->mmap_sem); return ret; } @@@@@@@@@ -385,6 -385,6 -350,7 -385,6 -385,6 -385,6 -385,6 -385,6 +350,7 @@@@@@@@@ static int refill_pi_state_cache(void /* pi_mutex gets initialized later */ pi_state->owner = NULL; atomic_set(&pi_state->refcount, 1); ++ +++++ pi_state->key = FUTEX_KEY_INIT; current->pi_state_cache = pi_state; @@@@@@@@@ -462,7 -462,7 -428,7 -462,7 -462,7 -462,7 -462,7 -462,7 +428,7 @@@@@@@@@ void exit_pi_state_list(struct task_str struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; -- ----- union futex_key key; ++ +++++ union futex_key key = FUTEX_KEY_INIT; if (!futex_cmpxchg_enabled) return; @@@@@@@@@ -719,20 -719,20 -685,17 -719,20 -719,20 -719,20 -719,20 -719,20 +685,17 @@@@@@@@@ double_lock_hb(struct futex_hash_bucke * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -- -----static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, -- ----- int nr_wake, u32 bitset) ++ +++++static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; struct plist_head *head; -- ----- union futex_key key; ++ +++++ union futex_key key = FUTEX_KEY_INIT; int ret; if (!bitset) return -EINVAL; -- ----- futex_lock_mm(fshared); -- ----- ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@@@@@@@@ -760,7 -760,7 -723,7 -760,7 -760,7 -760,7 -760,7 -760,7 +723,7 @@@@@@@@@ spin_unlock(&hb->lock); out: -- ----- futex_unlock_mm(fshared); ++ +++++ put_futex_key(fshared, &key); return ret; } @@@@@@@@@ -769,19 -769,19 -732,16 -769,19 -769,19 -769,19 -769,19 -769,19 +732,16 @@@@@@@@@ * to this virtual address: */ static int -- -----futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, -- ----- u32 __user *uaddr2, ++ +++++futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { -- ----- union futex_key key1, key2; ++ +++++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; int ret, op_ret, attempt = 0; retryfull: -- ----- futex_lock_mm(fshared); -- ----- ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@@@@@@@@ -826,18 -826,18 -786,12 -826,18 -826,18 -826,18 -826,18 -826,18 +786,12 @@@@@@@@@ retry */ if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr2, -- ----- fshared, attempt); ++ +++++ attempt); if (ret) goto out; goto retry; } -- ----- /* -- ----- * If we would have faulted, release mmap_sem, -- ----- * fault it in and start all over again. 
-- ----- */ -- ----- futex_unlock_mm(fshared); -- ----- ret = get_user(dummy, uaddr2); if (ret) return ret; @@@@@@@@@ -873,7 -873,7 -827,8 -873,7 -873,7 -873,7 -873,7 -873,7 +827,8 @@@@@@@@@ if (hb1 != hb2) spin_unlock(&hb2->lock); out: -- ----- futex_unlock_mm(fshared); ++ +++++ put_futex_key(fshared, &key2); ++ +++++ put_futex_key(fshared, &key1); return ret; } @@@@@@@@@ -882,19 -882,19 -837,16 -882,19 -882,19 -882,19 -882,19 -882,19 +837,16 @@@@@@@@@ * Requeue all waiters hashed on one physical page to another * physical page. */ -- -----static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, -- ----- u32 __user *uaddr2, ++ +++++static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { -- ----- union futex_key key1, key2; ++ +++++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; retry: -- ----- futex_lock_mm(fshared); -- ----- ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@@@@@@@@ -917,12 -917,12 -869,6 -917,12 -917,12 -917,12 -917,12 -917,12 +869,6 @@@@@@@@@ if (hb1 != hb2) spin_unlock(&hb2->lock); -- ----- /* -- ----- * If we would have faulted, release mmap_sem, fault -- ----- * it in and start all over again. -- ----- */ -- ----- futex_unlock_mm(fshared); -- ----- ret = get_user(curval, uaddr1); if (!ret) @@@@@@@@@ -974,7 -974,7 -920,8 -974,7 -974,7 -974,7 -974,7 -974,7 +920,8 @@@@@@@@@ out_unlock drop_futex_key_refs(&key1); out: -- ----- futex_unlock_mm(fshared); ++ +++++ put_futex_key(fshared, &key2); ++ +++++ put_futex_key(fshared, &key1); return ret; } @@@@@@@@@ -1096,8 -1096,8 -1043,7 -1096,8 -1096,8 -1096,8 -1096,8 -1096,8 +1043,7 @@@@@@@@@ static void unqueue_me_pi(struct futex_ * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, -- ----- struct task_struct *newowner, -- ----- struct rw_semaphore *fshared) ++ +++++ struct task_struct *newowner, int fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@@@@@@@@ -1176,7 -1176,7 -1122,7 -1176,7 -1176,7 -1176,7 -1176,7 -1176,7 +1122,7 @@@@@@@@@ retry handle_fault: spin_unlock(q->lock_ptr); -- ----- ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); ++ +++++ ret = futex_handle_fault((unsigned long)uaddr, attempt++); spin_lock(q->lock_ptr); @@@@@@@@@ -1200,7 -1200,7 -1146,7 -1200,7 -1200,7 -1200,7 -1200,7 -1200,7 +1146,7 @@@@@@@@@ static long futex_wait_restart(struct restart_block *restart); -- -----static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, ++ +++++static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset) { struct task_struct *curr = current; @@@@@@@@@ -1218,8 -1218,8 -1164,7 -1218,8 -1218,8 -1218,8 -1218,8 -1218,8 +1164,7 @@@@@@@@@ q.pi_state = NULL; q.bitset = bitset; retry: -- ----- futex_lock_mm(fshared); -- ----- ++ +++++ q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@@@@@@@@ -1251,12 -1251,12 -1196,6 -1251,12 -1251,12 -1251,12 -1251,12 -1251,12 +1196,6 @@@@@@@@@ if (unlikely(ret)) { queue_unlock(&q, hb); -- ----- /* -- ----- * If we would have faulted, release mmap_sem, fault it in and -- ----- * start all over again. 
-- ----- */ -- ----- futex_unlock_mm(fshared); -- ----- ret = get_user(uval, uaddr); if (!ret) @@@@@@@@@ -1270,12 -1270,12 -1209,6 -1270,12 -1270,12 -1270,12 -1270,12 -1270,12 +1209,6 @@@@@@@@@ /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); -- ----- /* -- ----- * Now the futex is queued and we have checked the data, we -- ----- * don't want to hold mmap_sem while we sleep. -- ----- */ -- ----- futex_unlock_mm(fshared); -- ----- /* * There might have been scheduling since the queue_me(), as we * cannot hold a spinlock across the get_user() in case it @@@@@@@@@ -1296,16 -1296,13 -1229,13 -1296,16 -1296,16 -1296,16 -1296,16 -1296,13 +1229,16 @@@@@@@@@ if (!abs_time) schedule(); else { ++ + unsigned long slack; ++ + slack = current->timer_slack_ns; ++ + if (rt_task(current)) ++ + slack = 0; hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); -- - t.timer.expires = *abs_time; ++ + hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); -- - hrtimer_start(&t.timer, t.timer.expires, -- - HRTIMER_MODE_ABS); ++ + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) t.task = NULL; @@@@@@@@@ -1363,7 -1360,7 -1293,7 -1363,7 -1363,7 -1363,7 -1363,7 -1360,7 +1296,7 @@@@@@@@@ queue_unlock(&q, hb); out_release_sem: -- ----- futex_unlock_mm(fshared); ++ +++++ put_futex_key(fshared, &q.key); return ret; } @@@@@@@@@ -1371,13 -1368,13 -1301,13 -1371,13 -1371,13 -1371,13 -1371,13 -1368,13 +1304,13 @@@@@@@@@ static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; -- ----- struct rw_semaphore *fshared = NULL; ++ +++++ int fshared = 0; ktime_t t; t.tv64 = restart->futex.time; restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) -- ----- fshared = ¤t->mm->mmap_sem; ++ +++++ fshared = 1; return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, restart->futex.bitset); } @@@@@@@@@ -1389,7 -1386,7 -1319,7 -1389,7 -1389,7 -1389,7 -1389,7 -1386,7 +1322,7 @@@@@@@@@ * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -- -----static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, ++ +++++static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@@@@@@@@ -1407,13 -1404,13 -1337,12 -1407,13 -1407,13 -1407,13 -1407,13 -1404,13 +1340,12 @@@@@@@@@ hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); -- - to->timer.expires = *time; ++ + hrtimer_set_expires(&to->timer, *time); } q.pi_state = NULL; retry: -- ----- futex_lock_mm(fshared); -- ----- ++ +++++ q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@@@@@@@@ -1502,7 -1499,7 -1431,6 -1502,7 -1502,7 -1502,7 -1502,7 -1499,7 +1434,6 @@@@@@@@@ * exit to complete. */ queue_unlock(&q, hb); -- ----- futex_unlock_mm(fshared); cond_resched(); goto retry; @@@@@@@@@ -1534,12 -1531,12 -1462,6 -1534,12 -1534,12 -1534,12 -1534,12 -1531,12 +1465,6 @@@@@@@@@ */ queue_me(&q, hb); -- ----- /* -- ----- * Now the futex is queued and we have checked the data, we -- ----- * don't want to hold mmap_sem while we sleep. 
-- ----- */ -- ----- futex_unlock_mm(fshared); -- ----- WARN_ON(!q.pi_state); /* * Block on the PI mutex: @@@@@@@@@ -1552,7 -1549,7 -1474,6 -1552,7 -1552,7 -1552,7 -1552,7 -1549,7 +1477,6 @@@@@@@@@ ret = ret ? 0 : -EWOULDBLOCK; } -- ----- futex_lock_mm(fshared); spin_lock(q.lock_ptr); if (!ret) { @@@@@@@@@ -1618,7 -1615,7 -1539,6 -1618,7 -1618,7 -1618,7 -1618,7 -1615,7 +1542,6 @@@@@@@@@ /* Unqueue and drop the lock */ unqueue_me_pi(&q); -- ----- futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); @@@@@@@@@ -1628,7 -1625,7 -1548,7 -1628,7 -1628,7 -1628,7 -1628,7 -1625,7 +1551,7 @@@@@@@@@ queue_unlock(&q, hb); out_release_sem: -- ----- futex_unlock_mm(fshared); ++ +++++ put_futex_key(fshared, &q.key); if (to) destroy_hrtimer_on_stack(&to->timer); return ret; @@@@@@@@@ -1645,15 -1642,15 -1565,12 -1645,15 -1645,15 -1645,15 -1645,15 -1642,15 +1568,12 @@@@@@@@@ queue_unlock(&q, hb); if (attempt++) { -- ----- ret = futex_handle_fault((unsigned long)uaddr, fshared, -- ----- attempt); ++ +++++ ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out_release_sem; goto retry_unlocked; } -- ----- futex_unlock_mm(fshared); -- ----- ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; @@@@@@@@@ -1668,13 -1665,13 -1585,13 -1668,13 -1668,13 -1668,13 -1668,13 -1665,13 +1588,13 @@@@@@@@@ * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -- -----static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) ++ +++++static int futex_unlock_pi(u32 __user *uaddr, int fshared) { struct futex_hash_bucket *hb; struct futex_q *this, *next; u32 uval; struct plist_head *head; -- ----- union futex_key key; ++ +++++ union futex_key key = FUTEX_KEY_INIT; int ret, attempt = 0; retry: @@@@@@@@@ -1685,10 -1682,10 -1602,6 -1685,10 -1685,10 -1685,10 -1685,10 -1682,10 +1605,6 @@@@@@@@@ */ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; -- ----- /* -- ----- * First take all the futex related locks: -- ----- */ -- ----- futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@@@@@@@@ -1747,7 -1744,7 -1660,7 -1747,7 -1747,7 -1747,7 -1747,7 -1744,7 +1663,7 @@@@@@@@@ retry_unlocked out_unlock: spin_unlock(&hb->lock); out: -- ----- futex_unlock_mm(fshared); ++ +++++ put_futex_key(fshared, &key); return ret; @@@@@@@@@ -1763,16 -1760,16 -1676,13 -1763,16 -1763,16 -1763,16 -1763,16 -1760,16 +1679,13 @@@@@@@@@ pi_faulted spin_unlock(&hb->lock); if (attempt++) { -- ----- ret = futex_handle_fault((unsigned long)uaddr, fshared, -- ----- attempt); ++ +++++ ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out; uval = 0; goto retry_unlocked; } -- ----- futex_unlock_mm(fshared); -- ----- ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; @@@@@@@@@ -1898,8 -1895,8 -1808,7 -1898,8 -1898,8 -1898,8 -1898,8 -1895,8 +1811,7 @@@@@@@@@ retry * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) -- ----- futex_wake(uaddr, &curr->mm->mmap_sem, 1, -- ----- FUTEX_BITSET_MATCH_ANY); ++ +++++ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } @@@@@@@@@ -1995,10 -1992,10 -1904,10 -1995,10 -1995,10 -1995,10 -1995,10 -1992,10 +1907,10 @@@@@@@@@ long do_futex(u32 __user *uaddr, int op { int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; -- ----- struct rw_semaphore *fshared = NULL; ++ +++++ int fshared = 0; if (!(op & FUTEX_PRIVATE_FLAG)) -- ----- fshared = ¤t->mm->mmap_sem; ++ +++++ fshared = 1; switch 
(cmd) { case FUTEX_WAIT: diff --combined kernel/lockdep.c index 06e157119d2,dbda475b13b,dbda475b13b,a4285830323,06e157119d2,06e157119d2,46a404173db,dbda475b13b..e4bdda8dcf0 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@@@@@@@@ -136,16 -136,16 -136,16 -136,16 -136,16 -136,16 -136,16 -136,16 +136,16 @@@@@@@@@ static inline struct lock_class *hlock_ #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); --- ----static int lock_contention_point(struct lock_class *class, unsigned long ip) +++ ++++static int lock_point(unsigned long points[], unsigned long ip) { int i; --- ---- for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { --- ---- if (class->contention_point[i] == 0) { --- ---- class->contention_point[i] = ip; +++ ++++ for (i = 0; i < LOCKSTAT_POINTS; i++) { +++ ++++ if (points[i] == 0) { +++ ++++ points[i] = ip; break; } --- ---- if (class->contention_point[i] == ip) +++ ++++ if (points[i] == ip) break; } @@@@@@@@@ -185,6 -185,6 -185,6 -185,9 -185,6 -185,6 -185,6 -185,6 +185,9 @@@@@@@@@ struct lock_class_stats lock_stats(stru for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) stats.contention_point[i] += pcs->contention_point[i]; +++ ++++ for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) +++ ++++ stats.contending_point[i] += pcs->contending_point[i]; +++ ++++ lock_time_add(&pcs->read_waittime, &stats.read_waittime); lock_time_add(&pcs->write_waittime, &stats.write_waittime); @@@@@@@@@ -209,6 -209,6 -209,6 -212,7 -209,6 -209,6 -209,6 -209,6 +212,7 @@@@@@@@@ void clear_lock_stats(struct lock_clas memset(cpu_stats, 0, sizeof(struct lock_class_stats)); } memset(class->contention_point, 0, sizeof(class->contention_point)); +++ ++++ memset(class->contending_point, 0, sizeof(class->contending_point)); } static struct lock_class_stats *get_lock_stats(struct lock_class *class) @@@@@@@@@ -2169,11 -2169,12 -2169,12 -2173,11 -2169,11 -2169,11 -2169,11 -2169,12 +2173,11 @@@@@@@@@ void early_boot_irqs_on(void /* * Hardirqs will be enabled: */ -- -void trace_hardirqs_on_caller(unsigned long a0) ++ +void trace_hardirqs_on_caller(unsigned long ip) { struct task_struct *curr = current; -- - unsigned long ip; -- - time_hardirqs_on(CALLER_ADDR0, a0); ++ + time_hardirqs_on(CALLER_ADDR0, ip); if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@@@@@@@@ -2187,6 -2188,7 -2188,7 -2191,6 -2187,6 -2187,6 -2187,6 -2188,7 +2191,6 @@@@@@@@@ } /* we'll do an OFF -> ON transition: */ curr->hardirqs_enabled = 1; -- - ip = (unsigned long) __builtin_return_address(0); if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@@@@@@@@ -2222,11 -2224,11 -2224,11 -2226,11 -2222,11 -2222,11 -2222,11 -2224,11 +2226,11 @@@@@@@@@ EXPORT_SYMBOL(trace_hardirqs_on) /* * Hardirqs were disabled: */ -- -void trace_hardirqs_off_caller(unsigned long a0) ++ +void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; -- - time_hardirqs_off(CALLER_ADDR0, a0); ++ + time_hardirqs_off(CALLER_ADDR0, ip); if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@@@@@@@@ -2239,7 -2241,7 -2241,7 -2243,7 -2239,7 -2239,7 -2239,7 -2241,7 +2243,7 @@@@@@@@@ * We have done an ON -> OFF transition: */ curr->hardirqs_enabled = 0; -- - curr->hardirq_disable_ip = _RET_IP_; ++ + curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_off_events); } else @@@@@@@@@ -2999,7 -3001,7 -3001,7 -3003,7 -2999,7 -2999,7 -2999,7 -3001,7 +3003,7 @@@@@@@@@ __lock_contended(struct lockdep_map *lo struct 
held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; --- ---- int i, point; +++ ++++ int i, contention_point, contending_point; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) @@@@@@@@@ -3023,18 -3025,18 -3025,18 -3027,22 -3023,18 -3023,18 -3023,18 -3025,18 +3027,22 @@@@@@@@@ found_it: hlock->waittime_stamp = sched_clock(); --- ---- point = lock_contention_point(hlock_class(hlock), ip); +++ ++++ contention_point = lock_point(hlock_class(hlock)->contention_point, ip); +++ ++++ contending_point = lock_point(hlock_class(hlock)->contending_point, +++ ++++ lock->ip); stats = get_lock_stats(hlock_class(hlock)); --- ---- if (point < ARRAY_SIZE(stats->contention_point)) --- ---- stats->contention_point[point]++; +++ ++++ if (contention_point < LOCKSTAT_POINTS) +++ ++++ stats->contention_point[contention_point]++; +++ ++++ if (contending_point < LOCKSTAT_POINTS) +++ ++++ stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); } static void --- ----__lock_acquired(struct lockdep_map *lock) +++ ++++__lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; @@@@@@@@@ -3083,6 -3085,6 -3085,6 -3091,7 -3083,6 -3083,6 -3083,6 -3085,6 +3091,7 @@@@@@@@@ found_it put_lock_stats(stats); lock->cpu = cpu; +++ ++++ lock->ip = ip; } void lock_contended(struct lockdep_map *lock, unsigned long ip) @@@@@@@@@ -3104,7 -3106,7 -3106,7 -3113,7 -3104,7 -3104,7 -3104,7 -3106,7 +3113,7 @@@@@@@@@ } EXPORT_SYMBOL_GPL(lock_contended); --- ----void lock_acquired(struct lockdep_map *lock) +++ ++++void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; @@@@@@@@@ -3117,7 -3119,7 -3119,7 -3126,7 -3117,7 -3117,7 -3117,7 -3119,7 +3126,7 @@@@@@@@@ raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; --- ---- __lock_acquired(lock); +++ ++++ __lock_acquired(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@@@@@@@@ -3276,10 -3278,10 -3278,10 -3285,10 -3276,10 -3276,10 -3276,10 -3278,10 +3285,10 @@@@@@@@@ void __init lockdep_info(void { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); ------ - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); ++++++ + printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); ------ - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); ++++++ + printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); printk("... 
CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); @@@@@@@@@ -3415,10 -3417,9 -3417,9 -3424,10 -3415,10 -3415,10 -3415,10 -3417,9 +3424,10 @@@@@@@@@ retry } printk(" ignoring it.\n"); unlock = 0; ++ + } else { ++ + if (count != 10) ++ + printk(KERN_CONT " locked it.\n"); } -- - if (count != 10) -- - printk(" locked it.\n"); do_each_thread(g, p) { /* diff --combined kernel/notifier.c index 4282c0a40a5,0f39e398ef6,823be11584e,4282c0a40a5,4282c0a40a5,4282c0a40a5,4282c0a40a5,4282c0a40a5..61d5aa5eced --- a/kernel/notifier.c +++ b/kernel/notifier.c @@@@@@@@@ -82,6 -82,14 -82,6 -82,6 -82,6 -82,6 -82,6 -82,6 +82,14 @@@@@@@@@ static int __kprobes notifier_call_chai while (nb && nr_to_call) { next_nb = rcu_dereference(nb->next); + ++++++ + ++++++#ifdef CONFIG_DEBUG_NOTIFIERS + ++++++ if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { + ++++++ WARN(1, "Invalid notifier called!"); + ++++++ nb = next_nb; + ++++++ continue; + ++++++ } + ++++++#endif ret = nb->notifier_call(nb, val, v); if (nr_calls) @@@@@@@@@ -550,7 -558,7 -550,7 -550,7 -550,7 -550,7 -550,7 -550,7 +558,7 @@@@@@@@@ EXPORT_SYMBOL(unregister_reboot_notifie static ATOMIC_NOTIFIER_HEAD(die_chain); -- int notify_die(enum die_val val, const char *str, ++ int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { diff --combined kernel/sched.c index 9b1e79371c2,cc1f81b50b8,13dd2db9fb2,2a106b6b78b,e8819bc6f46,b388c9b243e,9b1e79371c2,d906f72b42d..558e5f28426 --- a/kernel/sched.c +++ b/kernel/sched.c @@@@@@@@@ -55,7 -55,6 -55,6 -55,7 -55,7 -55,7 -55,7 -55,6 +55,7 @@@@@@@@@ #include #include #include ++ +#include #include #include #include @@@@@@@@@ -72,7 -71,6 -71,6 -72,7 -72,7 -72,7 -72,7 -71,7 +72,7 @@@@@@@@@ #include #include #include ++ #include #include #include @@@@@@@@@ -203,19 -201,14 -201,14 -203,19 -203,19 -203,19 -203,19 -202,19 +203,19 @@@@@@@@@ void init_rt_bandwidth(struct rt_bandwi hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; -- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; ++ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; ++ } ++ ++ static inline int rt_bandwidth_enabled(void) ++ { ++ return sysctl_sched_rt_runtime >= 0; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; -- if (rt_b->rt_runtime == RUNTIME_INF) ++ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@@@@@@@@ -228,8 -221,9 -221,9 -228,8 -228,8 -228,8 -228,8 -227,9 +228,8 @@@@@@@@@ now = hrtimer_cb_get_time(&rt_b->rt_period_timer); hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); -- - hrtimer_start(&rt_b->rt_period_timer, -- - rt_b->rt_period_timer.expires, -- - HRTIMER_MODE_ABS); ++ + hrtimer_start_expires(&rt_b->rt_period_timer, ++ + HRTIMER_MODE_ABS); } spin_unlock(&rt_b->rt_runtime_lock); } @@@@@@@@@ -304,9 -298,9 -298,9 -304,9 -304,9 -304,9 -304,9 -304,9 +304,9 @@@@@@@@@ static DEFINE_PER_CPU(struct cfs_rq, in static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -- #else /* !CONFIG_FAIR_GROUP_SCHED */ ++ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -- #endif /* CONFIG_FAIR_GROUP_SCHED */ ++ #endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. 
@@@@@@@@@ -386,6 -380,7 -380,7 -386,6 -386,6 -386,6 -386,6 -386,7 +386,6 @@@@@@@@@ struct cfs_rq u64 exec_clock; u64 min_vruntime; -- - u64 pair_start; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; @@@@@@@@@ -397,9 -392,9 -392,9 -397,9 -397,9 -397,9 -397,9 -398,9 +397,9 @@@@@@@@@ * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ -- - - struct sched_entity *curr, *next; ++ + + struct sched_entity *curr, *next, *last; -- - - unsigned long nr_spread_over; ++ + + unsigned int nr_spread_over; #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ @@@@@@@@@ -609,9 -604,9 -604,9 -609,9 -609,9 -609,9 -609,9 -610,9 +609,9 @@@@@@@@@ struct rq static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { -- rq->curr->sched_class->check_preempt_curr(rq, p); ++ rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@@@@@@@@ -817,13 -812,6 -812,6 -817,13 -817,13 -817,13 -817,13 -818,6 +817,13 @@@@@@@@@ const_debug unsigned int sysctl_sched_n */ unsigned int sysctl_sched_shares_ratelimit = 250000; ++ +/* ++ + * Inject some fuzzyness into changing the per-cpu group shares ++ + * this avoids remote rq-locks at the expense of fairness. ++ + * default: 4 ++ + */ ++ +unsigned int sysctl_sched_shares_thresh = 4; ++ + /* * period over which we measure -rt task cpu usage in us. * default: 1s @@@@@@@@@ -969,14 -957,6 -957,6 -969,14 -969,6 -969,14 -969,14 -963,6 +969,14 @@@@@@@@@ static struct rq *task_rq_lock(struct t } } ++ + +void task_rq_unlock_wait(struct task_struct *p) ++ + +{ ++ + + struct rq *rq = task_rq(p); ++ + + ++ + + smp_mb(); /* spin-unlock-wait is not a full memory barrier */ ++ + + spin_unlock_wait(&rq->lock); ++ + +} ++ + + static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { @@@@@@@@@ -1078,7 -1058,7 -1058,7 -1078,7 -1070,7 -1078,7 -1078,7 -1064,7 +1078,7 @@@@@@@@@ static void hrtick_start(struct rq *rq struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = ktime_add_ns(timer->base->get_time(), delay); -- - timer->expires = time; ++ + hrtimer_set_expires(timer, time); if (rq == this_rq()) { hrtimer_restart(timer); @@@@@@@@@ -1107,7 -1087,7 -1087,7 -1107,7 -1099,7 -1107,7 -1107,7 -1093,7 +1107,7 @@@@@@@@@ hotplug_hrtick(struct notifier_block *n return NOTIFY_DONE; } - static void init_hrtick(void) + static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } @@@@@@@@@ -1122,7 -1102,7 -1102,7 -1122,7 -1114,7 -1122,7 -1122,7 -1108,7 +1122,7 @@@@@@@@@ static void hrtick_start(struct rq *rq hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -- static void init_hrtick(void) ++ static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@@@@@@@@ -1139,9 -1119,9 -1119,9 -1139,9 -1131,9 -1139,9 -1139,9 -1125,9 +1139,9 @@@@@@@@@ static void init_rq_hrtick(struct rq *r hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; -- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; ++ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -- #else ++ #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@@@@@@@@ -1153,7 -1133,7 -1133,7 -1153,7 -1145,7 -1153,7 -1153,7 -1139,7 +1153,7 @@@@@@@@@ static inline void 
init_rq_hrtick(struc static inline void init_hrtick(void) { } -- #endif ++ #endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@@@@@@@@ -1400,24 -1380,38 -1380,38 -1400,24 -1392,24 -1400,24 -1400,24 -1386,24 +1400,24 @@@@@@@@@ static inline void dec_cpu_load(struct update_load_sub(&rq->load, load); } -- #ifdef CONFIG_SMP -- static unsigned long source_load(int cpu, int type); -- static unsigned long target_load(int cpu, int type); -- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); -- -- static unsigned long cpu_avg_load_per_task(int cpu) -- { -- struct rq *rq = cpu_rq(cpu); -- -- if (rq->nr_running) -- rq->avg_load_per_task = rq->load.weight / rq->nr_running; -- -- return rq->avg_load_per_task; -- } -- -- #ifdef CONFIG_FAIR_GROUP_SCHED -- -- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); ++ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) ++ typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -- static void -- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) ++ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; ++ int ret; rcu_read_lock(); parent = &root_task_group; down: -- (*down)(parent, cpu, sd); ++ ret = (*down)(parent, data); ++ if (ret) ++ goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@@@@@@@@ -1425,53 -1419,23 -1419,23 -1425,51 -1417,51 -1425,53 -1425,53 -1411,51 +1425,53 @@@@@@@@@ up: continue; } -- (*up)(parent, cpu, sd); ++ ret = (*up)(parent, data); ++ if (ret) ++ goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; ++ out_unlock: rcu_read_unlock(); ++ ++ return ret; ++ } ++ ++ static int tg_nop(struct task_group *tg, void *data) ++ { ++ return 0; ++ } ++ #endif ++ ++ #ifdef CONFIG_SMP ++ static unsigned long source_load(int cpu, int type); ++ static unsigned long target_load(int cpu, int type); ++ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); ++ ++ static unsigned long cpu_avg_load_per_task(int cpu) ++ { ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (rq->nr_running) ++ rq->avg_load_per_task = rq->load.weight / rq->nr_running; ++++ + else ++++ + rq->avg_load_per_task = 0; ++ ++ return rq->avg_load_per_task; } ++ #ifdef CONFIG_FAIR_GROUP_SCHED ++ static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* * Calculate and set the cpu's group shares. */ static void -- -__update_group_shares_cpu(struct task_group *tg, int cpu, -- - unsigned long sd_shares, unsigned long sd_rq_weight) ++ +update_group_shares_cpu(struct task_group *tg, int cpu, ++ + unsigned long sd_shares, unsigned long sd_rq_weight) { int boost = 0; unsigned long shares; @@@@@@@@@ -1502,23 -1466,19 -1466,19 -1500,23 -1492,23 -1502,23 -1502,23 -1486,19 +1502,23 @@@@@@@@@ * */ shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); ++ + shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); -- - /* -- - * record the actual number of shares, not the boosted amount. -- - */ -- - tg->cfs_rq[cpu]->shares = boost ? 
0 : shares; -- - tg->cfs_rq[cpu]->rq_weight = rq_weight; ++ + if (abs(shares - tg->se[cpu]->load.weight) > ++ + sysctl_sched_shares_thresh) { ++ + struct rq *rq = cpu_rq(cpu); ++ + unsigned long flags; -- - if (shares < MIN_SHARES) -- - shares = MIN_SHARES; -- - else if (shares > MAX_SHARES) -- - shares = MAX_SHARES; ++ + spin_lock_irqsave(&rq->lock, flags); ++ + /* ++ + * record the actual number of shares, not the boosted amount. ++ + */ ++ + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; ++ + tg->cfs_rq[cpu]->rq_weight = rq_weight; -- - __set_se_shares(tg->se[cpu], shares); ++ + __set_se_shares(tg->se[cpu], shares); ++ + spin_unlock_irqrestore(&rq->lock, flags); ++ + } } /* @@@@@@@@@ -1526,11 -1486,11 -1486,11 -1524,11 -1516,11 -1526,11 -1526,11 -1506,11 +1526,11 @@@@@@@@@ * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -- static void -- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) ++ static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; ++ struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@@@@@@@@ -1547,10 -1507,14 -1507,14 -1545,10 -1537,10 -1547,10 -1547,10 -1527,16 +1547,10 @@@@@@@@@ if (!rq_weight) rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; -- - for_each_cpu_mask(i, sd->span) { -- - struct rq *rq = cpu_rq(i); -- - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, i, shares, rq_weight); - spin_unlock_irqrestore(&rq->lock, flags); - } ++ + for_each_cpu_mask(i, sd->span) ++ + update_group_shares_cpu(tg, i, shares, rq_weight); -- spin_lock_irqsave(&rq->lock, flags); -- __update_group_shares_cpu(tg, i, shares, rq_weight); -- spin_unlock_irqrestore(&rq->lock, flags); -- } ++ return 0; } /* @@@@@@@@@ -1558,10 -1522,10 -1522,10 -1556,10 -1548,10 -1558,10 -1558,10 -1544,10 +1558,10 @@@@@@@@@ * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. 
*/ -- static void -- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) ++ static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; ++ long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@@@@@@@@ -1572,8 -1536,11 -1536,11 -1570,8 -1562,8 -1572,8 -1572,8 -1558,8 +1572,8 @@@@@@@@@ } tg->cfs_rq[cpu]->h_load = load; -- } -- static void -- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -- { ++ return 0; } static void update_shares(struct sched_domain *sd) @@@@@@@@@ -1583,7 -1550,7 -1550,7 -1581,7 -1573,7 -1583,7 -1583,7 -1569,7 +1583,7 @@@@@@@@@ if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; -- walk_tg_tree(tg_nop, tg_shares_up, 0, sd); ++ walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@@@@@@@@ -1594,9 -1561,9 -1561,9 -1592,9 -1584,9 -1594,9 -1594,9 -1580,9 +1594,9 @@@@@@@@@ static void update_shares_locked(struc spin_lock(&rq->lock); } -- static void update_h_load(int cpu) ++ static void update_h_load(long cpu) { -- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); ++ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@@@@@@@@ -1815,9 -1782,7 -1782,7 -1813,9 -1805,7 -1815,9 -1815,9 -1801,7 +1815,9 @@@@@@@@@ task_hot(struct task_struct *p, u64 now /* * Buddy candidates are cache hot: */ -- - - if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) ++ + + if (sched_feat(CACHE_HOT_BUDDY) && ++ + + (&p->se == cfs_rq_of(&p->se)->next || ++ + + &p->se == cfs_rq_of(&p->se)->last)) return 1; if (p->sched_class != &fair_sched_class) @@@@@@@@@ -1953,12 -1918,14 -1918,14 -1951,12 -1941,12 -1953,12 -1953,12 -1937,12 +1953,12 @@@@@@@@@ unsigned long wait_task_inactive(struc * just go back and repeat. */ rq = task_rq_lock(p, &flags); ++ trace_sched_wait_task(rq, p); running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; -- if (!match_state || p->state == match_state) { -- ncsw = p->nivcsw + p->nvcsw; -- if (unlikely(!ncsw)) -- ncsw = 1; -- } ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* @@@@@@@@@ -2315,8 -2282,10 -2282,10 -2313,8 -2303,8 -2315,8 -2315,8 -2299,8 +2315,8 @@@@@@@@@ out_activate success = 1; out_running: -- trace_mark(kernel_sched_wakeup, -- "pid %d state %ld ## rq %p task %p rq->curr %p", -- p->pid, p->state, rq, p, rq->curr); -- check_preempt_curr(rq, p); ++ trace_sched_wakeup(rq, p); ++ check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@@@@@@@@ -2448,8 -2417,10 -2417,10 -2446,8 -2436,8 -2448,8 -2448,8 -2432,8 +2448,8 @@@@@@@@@ void wake_up_new_task(struct task_struc p->sched_class->task_new(rq, p); inc_nr_running(rq); } -- trace_mark(kernel_sched_wakeup_new, -- "pid %d state %ld ## rq %p task %p rq->curr %p", -- p->pid, p->state, rq, p, rq->curr); -- check_preempt_curr(rq, p); ++ trace_sched_wakeup_new(rq, p); ++ check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@@@@@@@@ -2621,7 -2592,11 -2592,11 -2619,7 -2609,7 -2621,7 -2621,7 -2605,7 +2621,7 @@@@@@@@@ context_switch(struct rq *rq, struct ta struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); -- trace_mark(kernel_sched_schedule, -- "prev_pid %d next_pid %d prev_state %ld " -- "## rq %p prev %p next %p", -- prev->pid, next->pid, prev->state, -- rq, prev, next); ++ trace_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@@@@@@@@ -2861,7 -2836,6 -2836,6 -2859,7 -2849,7 -2861,7 -2861,7 
-2845,7 +2861,7 @@@@@@@@@ static void sched_migrate_task(struct t || unlikely(!cpu_active(dest_cpu))) goto out; ++ trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@@@@@@@@ -2906,7 -2880,7 -2880,7 -2904,7 -2894,7 -2906,7 -2906,7 -2890,7 +2906,7 @@@@@@@@@ static void pull_task(struct rq *src_rq * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ -- check_preempt_curr(this_rq, p); ++ check_preempt_curr(this_rq, p, 0); } /* @@@@@@@@@ -3355,7 -3329,7 -3329,7 -3353,7 -3343,7 -3355,7 -3355,7 -3339,7 +3355,7 @@@@@@@@@ small_imbalance } else this_load_per_task = cpu_avg_load_per_task(this_cpu); -- - if (max_load - this_load + 2*busiest_load_per_task >= ++ + if (max_load - this_load + busiest_load_per_task >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; @@@@@@@@@ -4063,26 -4037,23 -4037,23 -4061,26 -4051,26 -4063,26 -4063,26 -4047,26 +4063,26 @@@@@@@@@ DEFINE_PER_CPU(struct kernel_stat, ksta EXPORT_PER_CPU_SYMBOL(kstat); /* -- * Return p->sum_exec_runtime plus any more ns on the sched_clock -- * that have not yet been banked in case the task is currently running. ++ * Return any ns on the sched_clock that have not yet been banked in ++ * @p in case that task is currently running. */ -- unsigned long long task_sched_runtime(struct task_struct *p) ++ unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; -- u64 ns, delta_exec; struct rq *rq; ++ u64 ns = 0; rq = task_rq_lock(p, &flags); -- ns = p->se.sum_exec_runtime; ++ if (task_current(rq, p)) { ++ u64 delta_exec; ++ update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) -- ns += delta_exec; ++ ns = delta_exec; } ++ task_rq_unlock(rq, &flags); return ns; @@@@@@@@@ -4099,7 -4070,6 -4070,6 -4097,7 -4087,7 -4099,7 -4099,7 -4083,7 +4099,7 @@@@@@@@@ void account_user_time(struct task_stru cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); ++ account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@@@@@@@@ -4124,7 -4094,6 -4094,6 -4122,7 -4112,7 -4124,7 -4124,7 -4108,7 +4124,7 @@@@@@@@@ static void account_guest_time(struct t tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); ++ account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@@@@@@@@ -4160,7 -4129,6 -4129,6 -4158,7 -4148,7 -4160,7 -4160,7 -4144,7 +4160,7 @@@@@@@@@ void account_system_time(struct task_st } p->stime = cputime_add(p->stime, cputime); ++ account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@@@@@@@@ -4202,7 -4170,6 -4170,6 -4200,7 -4190,7 -4202,6 -4202,7 -4186,7 +4202,6 @@@@@@@@@ void account_steal_time(struct task_str if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); - -- -- account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else @@@@@@@@@ -4338,7 -4305,7 -4305,7 -4336,7 -4326,7 -4337,7 -4338,7 -4322,7 +4337,7 @@@@@@@@@ void __kprobes sub_preempt_count(int va /* * Underflow? */ --- ---- if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) +++ ++++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) return; /* * Is the spinlock portion underflowing? 
@@@@@@@@@ -4459,8 -4426,12 -4426,12 -4457,8 -4447,8 -4458,8 -4459,8 -4443,12 +4458,8 @@@@@@@@@ need_resched_nonpreemptible if (sched_feat(HRTICK)) hrtick_clear(rq); -- - /* -- - * Do the rq-clock update outside the rq lock: -- - */ -- - local_irq_disable(); ++ + spin_lock_irq(&rq->lock); update_rq_clock(rq); -- - spin_lock(&rq->lock); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@@@@@@@@ -4656,15 -4627,6 -4627,6 -4654,15 -4644,15 -4655,15 -4656,15 -4644,15 +4655,15 @@@@@@@@@ __wake_up_sync(wait_queue_head_t *q, un } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ ++ /** ++ * complete: - signals a single thread waiting on this completion ++ * @x: holds the state of this particular completion ++ * ++ * This will wake up a single thread waiting on this completion. Threads will be ++ * awakened in the same order in which they were queued. ++ * ++ * See also complete_all(), wait_for_completion() and related routines. ++ */ void complete(struct completion *x) { unsigned long flags; @@@@@@@@@ -4676,12 -4638,6 -4638,6 -4674,12 -4664,12 -4675,12 -4676,12 -4664,12 +4675,12 @@@@@@@@@ } EXPORT_SYMBOL(complete); ++ /** ++ * complete_all: - signals all threads waiting on this completion ++ * @x: holds the state of this particular completion ++ * ++ * This will wake up all threads waiting on this particular completion event. ++ */ void complete_all(struct completion *x) { unsigned long flags; @@@@@@@@@ -4702,7 -4658,10 -4658,10 -4700,7 -4690,7 -4701,7 -4702,7 -4690,7 +4701,7 @@@@@@@@@ do_wait_for_common(struct completion *x wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { -- if ((state == TASK_INTERRUPTIBLE && -- signal_pending(current)) || -- (state == TASK_KILLABLE && -- fatal_signal_pending(current))) { ++ if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@@@@@@@@ -4730,31 -4689,12 -4689,12 -4728,31 -4718,31 -4729,31 -4730,31 -4718,31 +4729,31 @@@@@@@@@ wait_for_common(struct completion *x, l return timeout; } ++ /** ++ * wait_for_completion: - waits for completion of a task ++ * @x: holds the state of this particular completion ++ * ++ * This waits to be signaled for completion of a specific task. It is NOT ++ * interruptible and there is no timeout. ++ * ++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout ++ * and interrupt capability. Also see complete(). ++ */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); ++ /** ++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout) ++ * @x: holds the state of this particular completion ++ * @timeout: timeout value in jiffies ++ * ++ * This waits for either a completion of a specific task to be signaled or for a ++ * specified timeout to expire. The timeout is in jiffies. It is not ++ * interruptible. ++ */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@@@@@@@@ -4762,13 -4702,6 -4702,6 -4760,13 -4750,13 -4761,13 -4762,13 -4750,13 +4761,13 @@@@@@@@@ } EXPORT_SYMBOL(wait_for_completion_timeout); ++ /** ++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr) ++ * @x: holds the state of this particular completion ++ * ++ * This waits for completion of a specific task to be signaled. It is ++ * interruptible. 
++ */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@@@@@@@@ -4778,14 -4711,6 -4711,6 -4776,14 -4766,14 -4777,14 -4778,14 -4766,14 +4777,14 @@@@@@@@@ } EXPORT_SYMBOL(wait_for_completion_interruptible); ++ /** ++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) ++ * @x: holds the state of this particular completion ++ * @timeout: timeout value in jiffies ++ * ++ * This waits for either a completion of a specific task to be signaled or for a ++ * specified timeout to expire. It is interruptible. The timeout is in jiffies. ++ */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@@@@@@@@ -4794,13 -4719,6 -4719,6 -4792,13 -4782,13 -4793,13 -4794,13 -4782,13 +4793,13 @@@@@@@@@ } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); ++ /** ++ * wait_for_completion_killable: - waits for completion of a task (killable) ++ * @x: holds the state of this particular completion ++ * ++ * This waits to be signaled for completion of a specific task. It can be ++ * interrupted by a kill signal. ++ */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@@@@@@@@ -5203,8 -5121,7 -5121,7 -5201,8 -5191,8 -5202,8 -5203,8 -5191,8 +5202,8 @@@@@@@@@ recheck * Do not allow realtime tasks into groups that have no runtime * assigned. */ -- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) ++ if (rt_bandwidth_enabled() && rt_policy(policy) && ++ task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@@@@@@@@ -5870,8 -5787,6 -5787,6 -5868,6 -5858,6 -5869,8 -5870,8 -5858,6 +5869,8 @@@@@@@@@ void __cpuinit init_idle(struct task_st struct rq *rq = cpu_rq(cpu); unsigned long flags; ++++ + spin_lock_irqsave(&rq->lock, flags); ++++ + __sched_fork(idle); idle->se.exec_start = sched_clock(); @@@@@@@@@ -5879,6 -5794,7 -5794,7 -5875,7 -5865,7 -5878,6 -5879,6 -5865,7 +5878,6 @@@@@@@@@ idle->cpus_allowed = cpumask_of_cpu(cpu); __set_task_cpu(idle, cpu); ---- - spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; @@@@@@@@@ -6041,7 -5957,7 -5957,7 -6038,7 -6028,7 -6040,7 -6041,7 -6028,7 +6040,7 @@@@@@@@@ static int __migrate_task(struct task_s set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); -- check_preempt_curr(rq_dest, p); ++ check_preempt_curr(rq_dest, p, 0); } done: ret = 1; @@@@@@@@@ -6366,7 -6282,7 -6282,7 -6363,7 -6353,7 -6365,7 -6366,7 -6353,7 +6365,7 @@@@@@@@@ set_table_entry(struct ctl_table *entry static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { -- struct ctl_table *table = sd_alloc_ctl_entry(12); ++ struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@@@@@@@@ -6394,9 -6310,7 -6310,7 -6391,9 -6381,9 -6393,9 -6394,9 -6381,9 +6393,9 @@@@@@@@@ sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); -- /* &table[11] is terminator */ ++ set_table_entry(&table[11], "name", sd->name, ++ CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[12] is terminator */ return table; } @@@@@@@@@ -6888,17 -6802,15 -6802,15 -6885,17 -6875,15 -6887,17 -6888,17 -6875,15 +6887,17 @@@@@@@@@ cpu_attach_domain(struct sched_domain * struct sched_domain *tmp; /* Remove the sched domains which do not 
contribute to scheduling. */ -- - - for (tmp = sd; tmp; tmp = tmp->parent) { ++ + + for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; if (!parent) break; ++ + + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; -- - - } ++ + + } else ++ + + tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { @@@@@@@@@ -7282,21 -7194,13 -7194,13 -7279,21 -7267,21 -7281,21 -7282,21 -7267,21 +7281,21 @@@@@@@@@ static void init_sched_groups_power(in * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ ++ #ifdef CONFIG_SCHED_DEBUG ++ # define SD_INIT_NAME(sd, type) sd->name = #type ++ #else ++ # define SD_INIT_NAME(sd, type) do { } while (0) ++ #endif ++ #define SD_INIT(sd, type) sd_init_##type(sd) ++ #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ ++ SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@@@@@@@@ -7687,7 -7591,6 -7591,6 -7684,7 -7672,6 -7686,7 -7687,7 -7672,6 +7686,7 @@@@@@@@@ static int __build_sched_domains(const error: free_sched_groups(cpu_map, tmpmask); SCHED_CPUMASK_FREE((void *)allmasks); ++ + + kfree(rd); return -ENOMEM; #endif } @@@@@@@@@ -7789,14 -7692,13 -7692,13 -7786,13 -7773,13 -7788,13 -7789,14 -7773,13 +7788,14 @@@@@@@@@ static int dattrs_equal(struct sched_do * * The passed in 'doms_new' should be kmalloc'd. This routine takes * ownership of it and will kfree it when done with it. If the caller ----- - * failed the kmalloc call, then it can pass in doms_new == NULL, ----- - * and partition_sched_domains() will fallback to the single partition ----- - * 'fallback_doms', it also forces the domains to be rebuilt. +++++ + * failed the kmalloc call, then it can pass in doms_new == NULL && +++++ + * ndoms_new == 1, and partition_sched_domains() will fallback to +++++ + * the single partition 'fallback_doms', it also forces the domains +++++ + * to be rebuilt. * ----- - * If doms_new==NULL it will be replaced with cpu_online_map. ----- - * ndoms_new==0 is a special case for destroying existing domains. ----- - * It will not create the default domain. +++++ + * If doms_new == NULL it will be replaced with cpu_online_map. +++++ + * ndoms_new == 0 is a special case for destroying existing domains, +++++ + * and it will not create the default domain. 
* * Call with hotplug lock held */ @@@@@@@@@ -8340,25 -8242,20 -8242,20 -8336,25 -8323,25 -8338,25 -8340,25 -8323,25 +8339,25 @@@@@@@@@ void __might_sleep(char *file, int line #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ -- if ((in_atomic() || irqs_disabled()) && -- system_state == SYSTEM_RUNNING && !oops_in_progress) { -- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -- return; -- prev_jiffy = jiffies; -- printk(KERN_ERR "BUG: sleeping function called from invalid" -- " context at %s:%d\n", file, line); -- printk("in_atomic():%d, irqs_disabled():%d\n", -- in_atomic(), irqs_disabled()); -- debug_show_held_locks(current); -- if (irqs_disabled()) -- print_irqtrace_events(current); -- dump_stack(); -- } ++ if ((!in_atomic() && !irqs_disabled()) || ++ system_state != SYSTEM_RUNNING || oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); @@@@@@@@@ -8856,95 -8753,73 -8753,73 -8852,95 -8839,95 -8854,95 -8856,95 -8839,95 +8855,95 @@@@@@@@@ static DEFINE_MUTEX(rt_constraints_mute static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) -- return 1ULL << 16; ++ return 1ULL << 20; -- return div64_u64(runtime << 16, period); ++ return div64_u64(runtime << 20, period); } -- #ifdef CONFIG_CGROUP_SCHED -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) ++ /* Must be called with tasklist_lock held */ ++ static inline int tg_has_rt_tasks(struct task_group *tg) { -- struct task_group *tgi, *parent = tg->parent; -- unsigned long total = 0; ++ struct task_struct *g, *p; -- if (!parent) { -- if (global_rt_period() < period) -- return 0; ++ do_each_thread(g, p) { ++ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) ++ return 1; ++ } while_each_thread(g, p); -- return to_ratio(period, runtime) < -- to_ratio(global_rt_period(), global_rt_runtime()); -- } ++ return 0; ++ } -- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) -- return 0; ++ struct rt_schedulable_data { ++ struct task_group *tg; ++ u64 rt_period; ++ u64 rt_runtime; ++ }; -- rcu_read_lock(); -- list_for_each_entry_rcu(tgi, &parent->children, siblings) { -- if (tgi == tg) -- continue; ++ static int tg_schedulable(struct task_group *tg, void *data) ++ { ++ struct rt_schedulable_data *d = data; ++ struct task_group *child; ++ unsigned long total, sum = 0; ++ u64 period, runtime; -- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), -- tgi->rt_bandwidth.rt_runtime); ++ period = ktime_to_ns(tg->rt_bandwidth.rt_period); ++ runtime = tg->rt_bandwidth.rt_runtime; ++ ++ if (tg == d->tg) { ++ period = d->rt_period; ++ runtime = d->rt_runtime; } -- rcu_read_unlock(); -- return total + to_ratio(period, runtime) <= -- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), -- parent->rt_bandwidth.rt_runtime); -- } -- #elif defined CONFIG_USER_SCHED -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -- { -- struct task_group *tgi; -- unsigned long total = 0; -- unsigned long global_ratio = -- to_ratio(global_rt_period(), global_rt_runtime()); ++ /* ++ * Cannot 
have more runtime than the period. ++ */ ++ if (runtime > period && runtime != RUNTIME_INF) ++ return -EINVAL; -- rcu_read_lock(); -- list_for_each_entry_rcu(tgi, &task_groups, list) { -- if (tgi == tg) -- continue; ++ /* ++ * Ensure we don't starve existing RT tasks. ++ */ ++ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) ++ return -EBUSY; ++ ++ total = to_ratio(period, runtime); -- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), -- tgi->rt_bandwidth.rt_runtime); ++ /* ++ * Nobody can have more than the global setting allows. ++ */ ++ if (total > to_ratio(global_rt_period(), global_rt_runtime())) ++ return -EINVAL; ++ ++ /* ++ * The sum of our children's runtime should not exceed our own. ++ */ ++ list_for_each_entry_rcu(child, &tg->children, siblings) { ++ period = ktime_to_ns(child->rt_bandwidth.rt_period); ++ runtime = child->rt_bandwidth.rt_runtime; ++ ++ if (child == d->tg) { ++ period = d->rt_period; ++ runtime = d->rt_runtime; ++ } ++ ++ sum += to_ratio(period, runtime); } -- rcu_read_unlock(); -- return total + to_ratio(period, runtime) < global_ratio; ++ if (sum > total) ++ return -EINVAL; ++ ++ return 0; } -- #endif -- /* Must be called with tasklist_lock held */ -- static inline int tg_has_rt_tasks(struct task_group *tg) ++ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { -- struct task_struct *g, *p; -- do_each_thread(g, p) { -- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) -- return 1; -- } while_each_thread(g, p); -- return 0; ++ struct rt_schedulable_data data = { ++ .tg = tg, ++ .rt_period = period, ++ .rt_runtime = runtime, ++ }; ++ ++ return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@@@@@@@@ -8954,9 -8829,14 -8829,14 -8950,9 -8937,9 -8952,9 -8954,9 -8937,9 +8953,9 @@@@@@@@@ mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); -- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { -- err = -EBUSY; -- goto unlock; -- } -- if (!__rt_schedulable(tg, rt_period, rt_runtime)) { -- err = -EINVAL; ++ err = __rt_schedulable(tg, rt_period, rt_runtime); ++ if (err) goto unlock; -- } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@@@@@@@@ -9025,25 -8905,16 -8905,19 -9021,25 -9008,25 -9023,25 -9025,25 -9008,25 +9024,25 @@@@@@@@@ long sched_group_rt_period(struct task_ static int sched_rt_global_constraints(void) { -- struct task_group *tg = &root_task_group; -- u64 rt_runtime, rt_period; ++ u64 runtime, period; int ret = 0; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; ++ runtime = global_rt_runtime(); ++ period = global_rt_period(); ++ ++ /* ++ * Sanity check on the sysctl variables. 
++ */ ++ if (runtime > period && runtime != RUNTIME_INF) ++ return -EINVAL; mutex_lock(&rt_constraints_mutex); -- if (!__rt_schedulable(tg, rt_period, rt_runtime)) -- ret = -EINVAL; ++ read_lock(&tasklist_lock); ++ ret = __rt_schedulable(NULL, 0, 0); ++ read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; @@@@@@@@@ -9054,9 -8925,6 -8928,9 -9050,9 -9037,9 -9052,9 -9054,9 -9037,9 +9053,9 @@@@@@@@@ static int sched_rt_global_constraints( unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@@@@@@@@ -9117,6 -8985,7 -8991,7 -9113,6 -9100,6 -9115,6 -9117,6 -9100,6 +9116,6 @@@@@@@@@ cpu_cgroup_create(struct cgroup_subsys if (!cgrp->parent) { /* This is early initialization for the top cgroup */ -- init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@@@@@@@@ -9125,6 -8994,9 -9000,9 -9121,6 -9108,6 -9123,6 -9125,6 -9108,6 +9124,6 @@@@@@@@@ if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); -- /* Bind the cgroup to task_group object we just created */ -- tg->css.cgroup = cgrp; -- return &tg->css; } diff --combined kernel/softlockup.c index 3953e4aed73,b9a528f2273,cb838ee93a8,3953e4aed73,3953e4aed73,3953e4aed73,3953e4aed73,3953e4aed73..884e6cd2769 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@@@@@@@@ -164,7 -164,7 -164,7 -164,7 -164,7 -164,7 -164,7 -164,7 +164,7 @@@@@@@@@ unsigned long __read_mostly sysctl_hung /* * Zero means infinite timeout - no checking done: */ - ------unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; + ++++++unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; unsigned long __read_mostly sysctl_hung_task_warnings = 10; @@@@@@@@@ -226,7 -226,7 -226,7 -226,7 -226,7 -226,7 -226,7 -226,7 +226,7 @@@@@@@@@ static void check_hung_uninterruptible_ * If the system crashed already then all bets are off, * do not report extra hung tasks: */ -- if ((tainted & TAINT_DIE) || did_panic) ++ if (test_taint(TAINT_DIE) || did_panic) return; read_lock(&tasklist_lock); diff --combined lib/Kconfig.debug index b0f239e443b,4116e10ea14,0b504814e37,b0f239e443b,b0f239e443b,b0f239e443b,b0f239e443b,b0f239e443b..1e3fd3e3436 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@@@@@@@@ -495,15 -495,6 -495,6 -495,15 -495,15 -495,15 -495,15 -495,15 +495,15 @@@@@@@@@ config DEBUG_V If unsure, say N. ++ config DEBUG_VIRTUAL ++ bool "Debug VM translations" ++ depends on DEBUG_KERNEL && X86 ++ help ++ Enable some costly sanity checks in virtual to page code. This can ++ catch mistakes with virt_to_page() and friends. ++ ++ If unsure, say N. ++ config DEBUG_WRITECOUNT bool "Debug filesystem writers count" depends on DEBUG_KERNEL @@@@@@@@@ -545,6 -536,16 -536,6 -545,6 -545,6 -545,6 -545,6 -545,6 +545,16 @@@@@@@@@ config DEBUG_S If unsure, say N. + ++++++config DEBUG_NOTIFIERS + ++++++ bool "Debug notifier call chains" + ++++++ depends on DEBUG_KERNEL + ++++++ help + ++++++ Enable this to turn on sanity checking for notifier call chains. + ++++++ This is most useful for kernel developers to make sure that + ++++++ modules properly unregister themselves from notifier chains. + ++++++ This is a relatively cheap check but if you care about maximum + ++++++ performance, say N. 
+ ++++++ config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && \ @@@@@@@@@ -606,19 -607,6 -597,6 -606,19 -606,19 -606,19 -606,19 -606,19 +616,19 @@@@@@@@@ config RCU_TORTURE_TEST_RUNNABL Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. ++ config RCU_CPU_STALL_DETECTOR ++ bool "Check for stalled CPUs delaying RCU grace periods" ++ depends on CLASSIC_RCU ++ default n ++ help ++ This option causes RCU to printk information on which ++ CPUs are delaying the current grace period, but only when ++ the grace period extends for excessive time periods. ++ ++ Say Y if you want RCU to perform such checks. ++ ++ Say N if you are unsure. ++ config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL @@@@@@@@@ -646,33 -634,6 -624,6 -646,33 -646,33 -646,33 -646,33 -646,33 +656,33 @@@@@@@@@ config BACKTRACE_SELF_TES Say N if you are unsure. ++ config DEBUG_BLOCK_EXT_DEVT ++ bool "Force extended block device numbers and spread them" ++ depends on DEBUG_KERNEL ++ depends on BLOCK ++ default n ++ help ++ BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON ++ SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT ++ YOU ARE DOING. Distros, please enable this and fix whatever ++ is broken. ++ ++ Conventionally, block device numbers are allocated from ++ predetermined contiguous area. However, extended block area ++ may introduce non-contiguous block device numbers. This ++ option forces most block device numbers to be allocated from ++ the extended space and spreads them to discover kernel or ++ userland code paths which assume predetermined contiguous ++ device number allocation. ++ ++ Note that turning on this debug option shuffles all the ++ device numbers for all IDE and SCSI devices including libata ++ ones, so root partition specified using device number ++ directly (via rdev or root=MAJ:MIN) won't work anymore. ++ Textual device names (root=/dev/sdXn) will continue to work. ++ ++ Say N if you are unsure. ++ config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL @@@@@@@@@ -710,21 -671,10 -661,10 -710,21 -710,21 -710,21 -710,21 -710,21 +720,21 @@@@@@@@@ config FAIL_PAGE_ALLO config FAIL_MAKE_REQUEST bool "Fault-injection capability for disk IO" -- depends on FAULT_INJECTION ++ depends on FAULT_INJECTION && BLOCK help Provide fault-injection capability for disk IO. ++ config FAIL_IO_TIMEOUT ++ bool "Faul-injection capability for faking disk interrupts" ++ depends on FAULT_INJECTION && BLOCK ++ help ++ Provide fault-injection capability on end IO handling. This ++ will make the block layer "forget" an interrupt as configured, ++ thus exercising the error handling. ++ ++ Only works with drivers that use the generic timeout handling, ++ for others it wont do anything. ++ config FAULT_INJECTION_DEBUG_FS bool "Debugfs entries for fault-injection capabilities" depends on FAULT_INJECTION && SYSFS && DEBUG_FS @@@@@@@@@ -812,61 -762,6 -752,6 -812,61 -812,61 -812,61 -812,61 -812,61 +822,61 @@@@@@@@@ menuconfig BUILD_DOCSR Say N if you are unsure. ++ config DYNAMIC_PRINTK_DEBUG ++ bool "Enable dynamic printk() call support" ++ default n ++ depends on PRINTK ++ select PRINTK_DEBUG ++ help ++ ++ Compiles debug level messages into the kernel, which would not ++ otherwise be available at runtime. These messages can then be ++ enabled/disabled on a per module basis. This mechanism implicitly ++ enables all pr_debug() and dev_dbg() calls. 
The impact of this ++ compile option is a larger kernel text size of about 2%. ++ ++ Usage: ++ ++ Dynamic debugging is controlled by the debugfs file, ++ dynamic_printk/modules. This file contains a list of the modules that ++ can be enabled. The format of the file is the module name, followed ++ by a set of flags that can be enabled. The first flag is always the ++ 'enabled' flag. For example: ++ ++ ++ . ++ . ++ . ++ ++ : Name of the module in which the debug call resides ++ : whether the messages are enabled or not ++ ++ From a live system: ++ ++ snd_hda_intel enabled=0 ++ fixup enabled=0 ++ driver enabled=0 ++ ++ Enable a module: ++ ++ $echo "set enabled=1 " > dynamic_printk/modules ++ ++ Disable a module: ++ ++ $echo "set enabled=0 " > dynamic_printk/modules ++ ++ Enable all modules: ++ ++ $echo "set enabled=1 all" > dynamic_printk/modules ++ ++ Disable all modules: ++ ++ $echo "set enabled=0 all" > dynamic_printk/modules ++ ++ Finally, passing "dynamic_printk" at the command line enables ++ debugging for all modules. This mode can be turned off via the above ++ disable command. ++ source "samples/Kconfig" source "lib/Kconfig.kgdb"
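
A quick illustration of the walk_tg_tree() change folded in above: the scheduler hunks replace the old per-domain tg_shares_up()/tg_load_down() signatures with int-returning callbacks of the form fn(struct task_group *tg, void *data), driven by walk_tg_tree(down, up, data). What follows is a stand-alone user-space sketch of that down/up walk, not kernel code; the struct, the toy tree and the callbacks are invented purely to show the shape of the API, and only the control flow (call "down" on the way towards the leaves, "up" on the way back, abort as soon as a callback returns non-zero) mirrors the patch.

/*
 * Sketch of a callback-driven group-tree walk in the style of the
 * walk_tg_tree() API above.  All types, names and callbacks here are
 * made up for illustration only.
 */
#include <stdio.h>

struct group {
	const char *name;
	int weight;
	struct group *children[4];
	int nr_children;
};

typedef int (*tg_visit_t)(struct group *g, void *data);

/*
 * Call "down" before descending and "up" once all children are done;
 * propagate the first non-zero return value and stop the walk there.
 */
static int walk_group_tree(struct group *g, tg_visit_t down, tg_visit_t up,
			   void *data)
{
	int ret, i;

	ret = down(g, data);		/* top-down pass, cf. tg_load_down() */
	if (ret)
		return ret;

	for (i = 0; i < g->nr_children; i++) {
		ret = walk_group_tree(g->children[i], down, up, data);
		if (ret)
			return ret;
	}

	return up(g, data);		/* bottom-up pass, cf. tg_shares_up() */
}

static int nop(struct group *g, void *data)
{
	(void)g;
	(void)data;
	return 0;			/* plays the role of tg_nop() */
}

static int sum_weight(struct group *g, void *data)
{
	*(int *)data += g->weight;	/* toy bottom-up aggregation */
	return 0;
}

int main(void)
{
	struct group leaf1 = { "leaf1",  512, { NULL }, 0 };
	struct group leaf2 = { "leaf2", 1024, { NULL }, 0 };
	struct group root  = { "root",  1024, { &leaf1, &leaf2 }, 2 };
	int total = 0;

	walk_group_tree(&root, nop, sum_weight, &total);
	printf("aggregated weight: %d\n", total);	/* prints 2560 */
	return 0;
}

The abort-on-non-zero behaviour is what lets the new __rt_schedulable() in the hunks above simply return walk_tg_tree(tg_schedulable, tg_nop, &data) and have an -EBUSY or -EINVAL from tg_schedulable() fall straight out of the walk.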