/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
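/*
 * A minimal sketch of a consumer (hypothetical names, for illustration
 * only): the callback's 'action' argument is IDLE_START or IDLE_END.
 *
 *	static int my_idle_cb(struct notifier_block *nb,
 *			      unsigned long action, void *unused)
 *	{
 *		return NOTIFY_OK;
 *	}
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_cb,
 *	};
 *	...
 *	idle_notifier_register(&my_idle_nb);
 */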
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	c1e_remove_cpu(raw_smp_processor_id());

	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
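/*
 * wbinvd_halt() matches the comment above literally: it executes WBINVD
 * to write back and invalidate this CPU's caches and then halts, so a
 * CPU that is about to be physically removed holds no dirty cache lines.
 */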
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
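/*
 * TS_POLLING, set at the top of cpu_idle(), tells the scheduler that
 * this CPU polls need_resched() while idle, so a remote wakeup only has
 * to set TIF_NEED_RESCHED instead of sending a reschedule IPI.
 */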
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}

#ifdef CONFIG_X86_DS
	/* Free any DS contexts that have not been properly released. */
	if (unlikely(t->ds_ctx)) {
		/* we clear debugctl to make sure DS is not used. */
		update_debugctlmsr(0);
		ds_free(t->ds_ctx);
	}
#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;

	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
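/*
 * These two helpers reuse the thread's GDT TLS slots: a segment base
 * that fits in 32 bits can be installed as a TLS descriptor and
 * switched via load_TLS() instead of a slower MSR write, which is how
 * do_arch_prctl() below uses them for small FS/GS bases.
 */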
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
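/*
 * Note the single error exit above: any failure after the I/O bitmap
 * has been duplicated must funnel through the out: label so the child's
 * bitmap copy is freed rather than leaked.
 */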
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;	/* IF set: start with interrupts enabled */
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}
int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
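/*
 * For example, loaddebug(next, 7) token-pastes the register number into
 * the field name and expands to set_debugreg(next->debugreg7, 7).
 */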
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;

#ifdef CONFIG_X86_DS
	{
		unsigned long ds_prev = 0, ds_next = 0;

		if (prev->ds_ctx)
			ds_prev = (unsigned long)prev->ds_ctx->ds;
		if (next->ds_ctx)
			ds_next = (unsigned long)next->ds_ctx->ds;

		if (ds_next != ds_prev) {
			/*
			 * We clear debugctl to make sure DS
			 * is not in use when we change it:
			 */
			debugctl = 0;
			update_debugctlmsr(0);
			wrmsrl(MSR_IA32_DS_AREA, ds_next);
		}
	}
#endif /* CONFIG_X86_DS */

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef CONFIG_X86_PTRACE_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);
	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);
	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();
	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/* Must be after DS reload */
	unlazy_fpu(prev_p);
	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif
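	/*
	 * (gcc's x86-64 stack-protector code reads the canary directly
	 * from %gs:40, i.e. out of the pda, which is why the field's
	 * offset is a hard requirement and checked at build time.)
	 */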
	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);
	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();

	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
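/*
 * The walk above depends on frame pointers: each iteration reads the
 * return address at fp+8 and the caller's frame pointer at *fp, and the
 * 16-frame limit bounds the loop on a corrupted stack.
 */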
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
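/*
 * A sketch of a hypothetical userspace caller (illustration only); the
 * kernel is reached through the arch_prctl(2) syscall:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *
 * Note that for ARCH_GET_FS/ARCH_GET_GS, 'addr' is the user pointer the
 * base is stored through, not a base value.
 */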
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
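/*
 * Example: with sp == 0x7fffffffe000 and a random offset of 0x123, the
 * subtraction yields 0x7fffffffdedd and 'sp & ~0xf' rounds it down to
 * 0x7fffffffded0, preserving the 16-byte alignment the ABI expects.
 */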
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}