/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
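
/*
 * Note: the idle notifier chain above gives interested subsystems a
 * callback at IDLE_START/IDLE_END, so they can react to a CPU going
 * idle (or waking up) without hooking the idle loop itself.
 */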
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	c1e_remove_cpu(raw_smp_processor_id());

	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {
			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
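
/*
 * Design note: need_resched() is polled with TS_POLLING set, which
 * tells the scheduler that this CPU watches the reschedule flag and
 * does not always need a wakeup IPI; interrupts are disabled before
 * entering the low-level idle routine so that an idle callback cannot
 * fire between the need_resched() check and the actual halt.
 */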
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
#ifdef CONFIG_X86_DS
	/* Free any DS contexts that have not been properly released. */
	if (unlikely(t->ds_ctx)) {
		/* we clear debugctl to make sure DS is not used. */
		update_debugctlmsr(0);
		ds_free(t->ds_ctx);
	}
#endif /* CONFIG_X86_DS */
}
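
/*
 * The 0xff fill above is deliberate: in the TSS I/O bitmap a set bit
 * means "no access", so writing all ones revokes every port this
 * thread had been granted via ioperm().
 */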
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr	= addr,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};
	struct desc_struct *desc = t->thread.tls_array;

	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
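
/*
 * These helpers park a 32-bit base in one of the GDT TLS slots (the
 * same slots set_thread_area() manages), so a small FS/GS base can be
 * reloaded with a cheap segment load instead of a wrmsr on every
 * context switch; see the addr <= 0xffffffff paths in do_arch_prctl().
 */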
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = X86_EFLAGS_IF;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
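
/*
 * start_thread() is what binfmt loaders (e.g. load_elf_binary()) call
 * once the new executable image is mapped: it resets the segment
 * registers and points the saved user context at the fresh entry
 * point and stack, with only IF set in EFLAGS.
 */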
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}
int get_tsc_mode(unsigned long adr)
{
	unsigned int val = PR_TSC_ENABLE;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
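
/*
 * These back the PR_GET_TSC/PR_SET_TSC prctl(2) options. A user-space
 * sketch (error handling omitted):
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	(make rdtsc fault)
 *	prctl(PR_SET_TSC, PR_TSC_ENABLE);	(allow rdtsc again)
 */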
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
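
/*
 * Token pasting makes, for example, loaddebug(next, 7) expand to
 * set_debugreg(next->debugreg7, 7), i.e. it loads the saved value
 * into hardware debug register %db7.
 */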
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;

#ifdef CONFIG_X86_DS
	{
		unsigned long ds_prev = 0, ds_next = 0;

		if (prev->ds_ctx)
			ds_prev = (unsigned long)prev->ds_ctx->ds;
		if (next->ds_ctx)
			ds_next = (unsigned long)next->ds_ctx->ds;

		if (ds_next != ds_prev) {
			/*
			 * We clear debugctl to make sure DS
			 * is not in use when we change it:
			 */
			debugctl = 0;
			update_debugctlmsr(0);
			wrmsrl(MSR_IA32_DS_AREA, ds_next);
		}
	}
#endif /* CONFIG_X86_DS */

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef CONFIG_X86_PTRACE_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif /* CONFIG_X86_PTRACE_BTS */
}
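
/*
 * __switch_to_xtra() is the slow path of the context switch: it runs
 * only when prev or next has one of the _TIF_WORK_CTXSW flags set
 * (debug registers, TSC restrictions, I/O bitmaps, BTS tracing), so
 * the common case pays nothing for these features.
 */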
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);
	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();
	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif
	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();

	return prev_p;
}
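
/*
 * Returning prev_p is part of the switch_to() contract: after the
 * stack switch, execution resumes in the *next* task, and the
 * returned pointer is how that task learns which task it switched
 * away from.
 */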
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
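
/*
 * The walk above relies on frame-pointer call frames: the saved %rbp
 * sits at *fp and the return address at fp + 8, so each iteration
 * unwinds one frame; the count < 16 bound keeps a corrupted stack
 * from looping forever.
 */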
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
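
/*
 * Entry point for the arch_prctl(2) system call. User space typically
 * issues the raw syscall (sketch; threading libraries use ARCH_SET_FS
 * to point %fs at a TLS block):
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls);
 */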
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
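
/*
 * i.e. up to 8KB of randomization is subtracted from the initial
 * stack pointer, and the result is re-aligned to 16 bytes as the
 * x86-64 ABI requires.
 */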
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}