2 * Copyright (C) 1995 Linus Torvalds
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
10 * CPU hotplug support - ashok.raj@intel.com
14 * This file handles the architecture-dependent parts of process handling..
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
23 #include <linux/kernel.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 #include <linux/prctl.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
45 #include <asm/processor.h>
47 #include <asm/mmu_context.h>
49 #include <asm/prctl.h>
51 #include <asm/proto.h>
/*
 * File-scope declarations: ret_from_fork is declared extern here,
 * kernel_thread_flags is initialised to CLONE_VM | CLONE_UNTRACED, and
 * boot_option_idle_override starts at 0 and is exported to modules.
 */
55 asmlinkage extern void ret_from_fork(void);
57 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59 unsigned long boot_option_idle_override = 0;
60 EXPORT_SYMBOL(boot_option_idle_override);
63 * Power management idle function, if any.
/*
 * pm_idle is an exported function pointer that is not given an
 * initialiser here. idle_notifier is the file-local atomic notifier
 * chain that the IDLE_START / IDLE_END calls in this file go through.
 */
65 void (*pm_idle)(void);
66 EXPORT_SYMBOL(pm_idle);
68 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
/*
 * idle_notifier_register - add notifier block n to the idle_notifier
 * chain.
 *
 * NOTE(review): the IDLE_START call_chain line at the end of this span
 * follows a gap in the listing; it presumably belongs to a separate
 * function whose header is not visible here - confirm against the
 * full file.
 */
70 void idle_notifier_register(struct notifier_block *n)
72 atomic_notifier_chain_register(&idle_notifier, n);
78 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
/*
 * __exit_idle - atomically test and clear bit 0 of the PDA isidle
 * field, then run the idle_notifier chain with IDLE_END.
 *
 * NOTE(review): the statement guarded by the "== 0" test is not
 * visible in this listing; presumably an early return so IDLE_END is
 * only sent when the bit had been set - confirm.
 */
81 static void __exit_idle(void)
83 if (test_and_clear_bit_pda(0, isidle) == 0)
85 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
88 /* Called from interrupts to signify idle end */
91 /* idle loop has pid 0 */
/*
 * default_idle - fallback idle routine.
 *
 * Visible steps: clear TS_POLLING in the current thread_info status,
 * halt via safe_halt() (which per its inline comment enables
 * interrupts racelessly), then set TS_POLLING again. Other statements
 * between these lines are not visible in this listing.
 */
98 * We use this if we don't have any better
101 void default_idle(void)
103 current_thread_info()->status &= ~TS_POLLING;
105 * TS_POLLING-cleared state must be visible before we
110 safe_halt(); /* enables interrupts racelessly */
113 current_thread_info()->status |= TS_POLLING;
/*
 * play_dead - two variants selected by CONFIG_HOTPLUG_CPU.
 *
 * The first variant records CPU_DEAD in this CPU's per-cpu cpu_state.
 * The body of the second variant and the rest of the first are not
 * visible in this listing.
 */
116 #ifdef CONFIG_HOTPLUG_CPU
117 DECLARE_PER_CPU(int, cpu_state);
120 /* We halt the CPU with physical CPU hotplug */
121 static inline void play_dead(void)
127 __get_cpu_var(cpu_state) = CPU_DEAD;
134 static inline void play_dead(void)
138 #endif /* CONFIG_HOTPLUG_CPU */
/*
 * Idle-thread loop (the enclosing function's header is not visible in
 * this listing).
 *
 * Visible steps: mark the thread as polling (TS_POLLING), stop the
 * scheduler tick, spin in an inner loop while no reschedule is
 * pending - checking whether this CPU has gone offline - then restart
 * the tick and re-enable preemption without rescheduling.
 * NOTE(review): the actions taken inside the inner loop (idle routine
 * call, offline handling) are on lines missing from this listing.
 */
141 * The idle thread. There's no useful work to be
142 * done, so just try to conserve power and have a
143 * low exit latency (ie sit in a loop waiting for
144 * somebody to say that they'd like to reschedule)
148 current_thread_info()->status |= TS_POLLING;
149 /* endless idle loop with no priority at all */
151 tick_nohz_stop_sched_tick();
152 while (!need_resched()) {
159 if (cpu_is_offline(smp_processor_id()))
162 * Idle routines should keep interrupts disabled
163 * from here on, until they go to idle.
164 * Otherwise, idle callbacks can misfire.
169 /* In many cases the interrupt that ended idle
170 has already called exit_idle. But some idle
171 loops can be woken up without interrupt. */
175 tick_nohz_restart_sched_tick();
176 preempt_enable_no_resched();
/*
 * __show_regs - dump the register state in regs plus live CPU state.
 *
 * Prints the current pid/comm, taint string and kernel release and
 * version, then RIP, RSP/EFLAGS and the general-purpose registers
 * taken from regs. The segment selectors are read from the CPU with
 * inline asm and the FS/GS/kernel-GS bases with rdmsrl, so those
 * values describe the CPU at the time of the call rather than regs.
 * NOTE(review): cr0-cr4 are initialised to 0 at declaration; the
 * statements that fill them and d0-d7 are not visible in this listing.
 */
182 /* Prints also some state that isn't saved in the pt_regs */
183 void __show_regs(struct pt_regs * regs)
185 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
186 unsigned long d0, d1, d2, d3, d6, d7;
187 unsigned int fsindex, gsindex;
188 unsigned int ds, cs, es;
192 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
193 current->pid, current->comm, print_tainted(),
194 init_utsname()->release,
195 (int)strcspn(init_utsname()->version, " "),
196 init_utsname()->version);
197 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
198 printk_address(regs->ip, 1);
199 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
201 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
202 regs->ax, regs->bx, regs->cx);
203 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
204 regs->dx, regs->si, regs->di);
205 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
206 regs->bp, regs->r8, regs->r9);
207 printk("R10: %016lx R11: %016lx R12: %016lx\n",
208 regs->r10, regs->r11, regs->r12);
209 printk("R13: %016lx R14: %016lx R15: %016lx\n",
210 regs->r13, regs->r14, regs->r15);
212 asm("movl %%ds,%0" : "=r" (ds));
213 asm("movl %%cs,%0" : "=r" (cs));
214 asm("movl %%es,%0" : "=r" (es));
215 asm("movl %%fs,%0" : "=r" (fsindex));
216 asm("movl %%gs,%0" : "=r" (gsindex));
218 rdmsrl(MSR_FS_BASE, fs);
219 rdmsrl(MSR_GS_BASE, gs);
220 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
227 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
228 fs,fsindex,gs,gsindex,shadowgs);
229 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
230 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
235 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
239 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
/*
 * show_regs - print the id of the executing CPU, then a stack trace
 * for regs. The trace is started at the address just past the pt_regs
 * structure (regs + 1) with regs->bp as the frame pointer argument.
 * Any statement between the two visible calls is missing from this
 * listing.
 */
242 void show_regs(struct pt_regs *regs)
244 printk("CPU %d:", smp_processor_id());
246 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
/*
 * exit_thread - release the current task's I/O permission bitmap.
 *
 * When the task owns a bitmap: look up the init_tss of the CPU
 * returned by get_cpu(), free the bitmap, clear the pointer and the
 * TIF_IO_BITMAP flag, overwrite the first io_bitmap_max bytes of the
 * TSS copy with 0xff, and reset io_bitmap_max to 0. The memset must
 * run before io_bitmap_max is zeroed because it uses it as the length.
 */
250 * Free current thread data structures etc..
252 void exit_thread(void)
254 struct task_struct *me = current;
255 struct thread_struct *t = &me->thread;
257 if (me->thread.io_bitmap_ptr) {
258 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
260 kfree(t->io_bitmap_ptr);
261 t->io_bitmap_ptr = NULL;
262 clear_thread_flag(TIF_IO_BITMAP);
264 * Careful, clear this in the TSS too:
266 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
267 t->io_bitmap_max = 0;
/*
 * flush_thread - reset per-thread architecture state of the current
 * task.
 *
 * If TIF_ABI_PENDING is set it is cleared and TIF_IA32 is switched:
 * the visible lines clear TIF_IA32 when it is set, and set TIF_IA32
 * plus TS_COMPAT otherwise (NOTE(review): the "else" joining the two
 * arms is on a line missing from this listing - confirm). Afterwards
 * TIF_DEBUG is cleared, debug registers 0-3, 6 and 7 saved in the
 * thread struct are zeroed, and the TLS descriptor array is wiped.
 */
272 void flush_thread(void)
274 struct task_struct *tsk = current;
276 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
277 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
278 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
279 clear_tsk_thread_flag(tsk, TIF_IA32);
281 set_tsk_thread_flag(tsk, TIF_IA32);
282 current_thread_info()->status |= TS_COMPAT;
285 clear_tsk_thread_flag(tsk, TIF_DEBUG);
287 tsk->thread.debugreg0 = 0;
288 tsk->thread.debugreg1 = 0;
289 tsk->thread.debugreg2 = 0;
290 tsk->thread.debugreg3 = 0;
291 tsk->thread.debugreg6 = 0;
292 tsk->thread.debugreg7 = 0;
293 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
295 * Forget coprocessor state..
/*
 * release_thread - sanity check on a dead task: if its mm still
 * reports a non-zero LDT size (mm->context.size), print a warning
 * with the LDT pointer and size.
 * NOTE(review): dead_task->mm is dereferenced on the visible lines;
 * any NULL guard around it would be on a line missing from this
 * listing - confirm.
 */
301 void release_thread(struct task_struct *dead_task)
304 if (dead_task->mm->context.size) {
305 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
307 dead_task->mm->context.ldt,
308 dead_task->mm->context.size);
/*
 * set_32bit_tls - install a TLS descriptor for task t.
 *
 * Builds a struct user_desc and takes a pointer into
 * t->thread.tls_array. The user_desc initialiser fields and the
 * statements that write the descriptor are not visible in this
 * listing; presumably addr becomes the base of slot tls - confirm.
 */
314 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
316 struct user_desc ud = {
323 struct desc_struct *desc = t->thread.tls_array;
/*
 * read_32bit_tls - return the base address stored in descriptor slot
 * tls of t's TLS array, as extracted by get_desc_base().
 */
328 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
330 return get_desc_base(&t->thread.tls_array[tls]);
/*
 * prepare_to_copy - hook run on tsk before a new thread is allocated
 * and the current task copied into it (per the original comment
 * below). The function body is not visible in this listing.
 */
334 * This gets called before we allocate a new thread and copy
335 * the current task into it.
337 void prepare_to_copy(struct task_struct *tsk)
/*
 * copy_thread - set up the architecture state of new task p.
 *
 * Visible steps:
 *  - place childregs as the last pt_regs-sized slot at the top of p's
 *    stack area (THREAD_SIZE above task_stack_page(p));
 *  - point thread.sp at childregs and thread.sp0 just above it, and
 *    inherit usersp, fs and gs from the current task;
 *  - flag the child with TIF_FORK;
 *  - store the live gs/fs/es/ds selectors into p->thread;
 *  - if the parent has TIF_IO_BITMAP, kmalloc IO_BITMAP_BYTES for the
 *    child and copy the parent's bitmap (io_bitmap_max is zeroed on
 *    allocation failure);
 *  - for CLONE_SETTLS, set the child's TLS through
 *    do_set_thread_area() (IA32 task, descriptor pointer in
 *    childregs->si) or do_arch_prctl(ARCH_SET_FS) with childregs->r8;
 *  - on error, free the child's bitmap and zero io_bitmap_max.
 * NOTE(review): conditions around "childregs->sp = childregs", the
 * return statements and the declaration of err are on lines missing
 * from this listing.
 */
342 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
343 unsigned long unused,
344 struct task_struct * p, struct pt_regs * regs)
347 struct pt_regs * childregs;
348 struct task_struct *me = current;
350 childregs = ((struct pt_regs *)
351 (THREAD_SIZE + task_stack_page(p))) - 1;
357 childregs->sp = (unsigned long)childregs;
359 p->thread.sp = (unsigned long) childregs;
360 p->thread.sp0 = (unsigned long) (childregs+1);
361 p->thread.usersp = me->thread.usersp;
363 set_tsk_thread_flag(p, TIF_FORK);
365 p->thread.fs = me->thread.fs;
366 p->thread.gs = me->thread.gs;
368 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
369 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
370 asm("mov %%es,%0" : "=m" (p->thread.es));
371 asm("mov %%ds,%0" : "=m" (p->thread.ds));
373 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
374 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
375 if (!p->thread.io_bitmap_ptr) {
376 p->thread.io_bitmap_max = 0;
379 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
381 set_tsk_thread_flag(p, TIF_IO_BITMAP);
385 * Set a new TLS for the child thread?
387 if (clone_flags & CLONE_SETTLS) {
388 #ifdef CONFIG_IA32_EMULATION
389 if (test_thread_flag(TIF_IA32))
390 err = do_set_thread_area(p, -1,
391 (struct user_desc __user *)childregs->si, 0);
394 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
400 if (err && p->thread.io_bitmap_ptr) {
401 kfree(p->thread.io_bitmap_ptr);
402 p->thread.io_bitmap_max = 0;
/*
 * start_thread - prepare regs for entry to a new user program.
 *
 * Visible steps: load 0 into the fs, es and ds segment registers,
 * record new_sp in the PDA oldrsp slot, set cs/ss in regs to the user
 * code and data selectors, and free the current task's extended FPU
 * state. Exported to GPL modules.
 * NOTE(review): the use of new_ip and any other writes to regs are on
 * lines missing from this listing.
 */
408 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
410 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
414 write_pda(oldrsp, new_sp);
415 regs->cs = __USER_CS;
416 regs->ss = __USER_DS;
420 * Free the old FP and other extended state
422 free_thread_xstate(current);
424 EXPORT_SYMBOL_GPL(start_thread);
/* hard_disable_TSC - set the X86_CR4_TSD bit in CR4 on this CPU. */
426 static void hard_disable_TSC(void)
428 write_cr4(read_cr4() | X86_CR4_TSD);
/*
 * disable_TSC - atomically set TIF_NOTSC on the current thread and
 * act only if the flag was previously clear.
 * NOTE(review): the guarded statement is not visible in this listing;
 * presumably it calls hard_disable_TSC() - confirm.
 */
431 void disable_TSC(void)
434 if (!test_and_set_thread_flag(TIF_NOTSC))
436 * Must flip the CPU state synchronously with
437 * TIF_NOTSC in the current running context.
/* hard_enable_TSC - clear the X86_CR4_TSD bit in CR4 on this CPU. */
443 static void hard_enable_TSC(void)
445 write_cr4(read_cr4() & ~X86_CR4_TSD);
/*
 * enable_TSC - atomically clear TIF_NOTSC on the current thread and
 * act only if the flag was previously set.
 * NOTE(review): the guarded statement is not visible in this listing;
 * presumably it calls hard_enable_TSC() - confirm.
 */
448 static void enable_TSC(void)
451 if (test_and_clear_thread_flag(TIF_NOTSC))
453 * Must flip the CPU state synchronously with
454 * TIF_NOTSC in the current running context.
/*
 * get_tsc_mode - report the current thread's TSC mode to user space.
 *
 * val is PR_TSC_SIGSEGV when TIF_NOTSC is set (the value used
 * otherwise is on a line missing from this listing). The result is
 * written as an unsigned int to user address adr; the put_user()
 * status is returned.
 */
460 int get_tsc_mode(unsigned long adr)
464 if (test_thread_flag(TIF_NOTSC))
465 val = PR_TSC_SIGSEGV;
469 return put_user(val, (unsigned int __user *)adr);
/*
 * set_tsc_mode - select the TSC mode for the current thread; val is
 * compared against PR_TSC_SIGSEGV and PR_TSC_ENABLE.
 * NOTE(review): the action for each value and the return statements
 * are on lines missing from this listing.
 */
472 int set_tsc_mode(unsigned int val)
474 if (val == PR_TSC_SIGSEGV)
476 else if (val == PR_TSC_ENABLE)
/*
 * loaddebug(thread, r) - expands to set_debugreg() with the saved
 * value thread->debugreg<r> and the register number r; r is pasted
 * into the field name, so it must be a literal digit.
 */
485 * This special macro can be used to load a debugging register
487 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
/*
 * __switch_to_xtra - rarely needed parts of a context switch from
 * prev_p to next_p.
 *
 * Visible steps:
 *  - if the two tasks use different DS areas, clear the debug-control
 *    MSR before writing MSR_IA32_DS_AREA, then program next's
 *    debugctlmsr if it differs from the value being tracked;
 *  - handle TIF_DEBUG for next_p (body not visible in this listing);
 *  - when exactly one of the tasks has TIF_NOTSC, switch the TSC
 *    state according to next_p (calls not visible);
 *  - if next_p has an I/O bitmap, copy max(prev, next) io_bitmap_max
 *    bytes of it into the TSS; otherwise, if prev_p had one, fill
 *    prev's range in the TSS with 0xff;
 *  - take BTS departure/arrival timestamps for tasks flagged with
 *    TIF_BTS_TRACE_TS.
 */
489 static inline void __switch_to_xtra(struct task_struct *prev_p,
490 struct task_struct *next_p,
491 struct tss_struct *tss)
493 struct thread_struct *prev, *next;
494 unsigned long debugctl;
496 prev = &prev_p->thread,
497 next = &next_p->thread;
499 debugctl = prev->debugctlmsr;
500 if (next->ds_area_msr != prev->ds_area_msr) {
501 /* we clear debugctl to make sure DS
502 * is not in use when we change it */
504 update_debugctlmsr(0);
505 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
508 if (next->debugctlmsr != debugctl)
509 update_debugctlmsr(next->debugctlmsr);
511 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
521 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
522 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
523 /* prev and next are different */
524 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
530 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
532 * Copy the relevant range of the IO bitmap.
533 * Normally this is 128 bytes or less:
535 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
536 max(prev->io_bitmap_max, next->io_bitmap_max));
537 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
539 * Clear any possible leftover bits:
541 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
545 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
546 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
548 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
549 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
/*
 * __switch_to - architecture half of a task switch from prev_p to
 * next_p on the current CPU.
 *
 * Visible steps, in order:
 *  - prefetch next's FPU state when its fpu_counter exceeds 5;
 *  - save the live es and ds selectors into prev and reload them from
 *    next when either side is non-zero;
 *  - read the live fs selector; reload fs when the live selector,
 *    next's selector or prev's 64-bit base is non-zero, write next's
 *    base to MSR_FS_BASE, and record the live selector in prev;
 *  - do the same for gs through load_gs_index() and
 *    MSR_KERNEL_GS_BASE;
 *  - swap the PDA's oldrsp, pcurrent and kernelstack (and the stack
 *    canary under CONFIG_CC_STACKPROTECTOR) over to next_p;
 *  - call __switch_to_xtra() when either task has context-switch work
 *    flags set;
 *  - restore the math state right away when next_p has used math and
 *    its fpu_counter exceeds 5.
 * NOTE(review): several guards (for example around the MSR writes)
 * and the return statement are on lines missing from this listing.
 */
554 * switch_to(x,y) should switch tasks from x to y.
556 * This could still be optimized:
557 * - fold all the options into a flag word and test it with a single test.
558 * - could test fs/gs bitsliced
560 * Kprobes not supported here. Set the probe on schedule instead.
563 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
565 struct thread_struct *prev = &prev_p->thread,
566 *next = &next_p->thread;
567 int cpu = smp_processor_id();
568 struct tss_struct *tss = &per_cpu(init_tss, cpu);
570 /* we're going to use this soon, after a few expensive things */
571 if (next_p->fpu_counter>5)
572 prefetch(next->xstate);
575 * Reload esp0, LDT and the page table pointer:
581 * This won't pick up thread selector changes, but I guess that is ok.
583 asm volatile("mov %%es,%0" : "=m" (prev->es));
584 if (unlikely(next->es | prev->es))
585 loadsegment(es, next->es);
587 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
588 if (unlikely(next->ds | prev->ds))
589 loadsegment(ds, next->ds);
598 asm volatile("movl %%fs,%0" : "=r" (fsindex));
599 /* segment register != 0 always requires a reload.
600 also reload when it has changed.
601 when prev process used 64bit base always reload
602 to avoid an information leak. */
603 if (unlikely(fsindex | next->fsindex | prev->fs)) {
604 loadsegment(fs, next->fsindex);
605 /* check if the user used a selector != 0
606 * if yes clear 64bit base, since overloaded base
607 * is always mapped to the Null selector
612 /* when next process has a 64bit base use it */
614 wrmsrl(MSR_FS_BASE, next->fs);
615 prev->fsindex = fsindex;
619 asm volatile("movl %%gs,%0" : "=r" (gsindex));
620 if (unlikely(gsindex | next->gsindex | prev->gs)) {
621 load_gs_index(next->gsindex);
626 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
627 prev->gsindex = gsindex;
630 /* Must be after DS reload */
634 * Switch the PDA and FPU contexts.
636 prev->usersp = read_pda(oldrsp);
637 write_pda(oldrsp, next->usersp);
638 write_pda(pcurrent, next_p);
640 write_pda(kernelstack,
641 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
642 #ifdef CONFIG_CC_STACKPROTECTOR
643 write_pda(stack_canary, next_p->stack_canary);
645 * Build time only check to make sure the stack_canary is at
646 * offset 40 in the pda; this is a gcc ABI requirement
648 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
652 * Now maybe reload the debug registers and handle I/O bitmaps
654 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
655 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
656 __switch_to_xtra(prev_p, next_p, tss);
658 /* If the task has used fpu the last 5 timeslices, just do a full
659 * restore of the math state immediately to avoid the trap; the
660 * chances of needing FPU soon are obviously high now
662 * tsk_used_math() checks prevent calling math_state_restore(),
663 * which can sleep in the case of !tsk_used_math()
665 if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
666 math_state_restore();
/*
 * sys_execve - execve system call entry.
 *
 * Copies the path with getname(); error is preloaded with
 * PTR_ERR(filename) so it is already set if IS_ERR() reports failure.
 * Otherwise the result of do_execve() becomes the error value.
 * NOTE(review): the failure branch, the release of filename and the
 * return statement are on lines missing from this listing.
 */
671 * sys_execve() executes a new program.
674 long sys_execve(char __user *name, char __user * __user *argv,
675 char __user * __user *envp, struct pt_regs *regs)
680 filename = getname(name);
681 error = PTR_ERR(filename);
682 if (IS_ERR(filename))
684 error = do_execve(filename, argv, envp, regs);
/*
 * set_personality_64bit - switch the current thread to 64-bit mode:
 * clears TIF_IA32 and drops READ_IMPLIES_EXEC from the task's
 * personality.
 */
689 void set_personality_64bit(void)
691 /* inherit personality from parent */
693 /* Make sure to be in 64bit mode */
694 clear_thread_flag(TIF_IA32);
696 /* TBD: overwrites user setup. Should have two bits.
697 But 64bit processes have always behaved this way,
698 so it's not too bad. The main problem is just that
699 32bit children are affected again. */
700 current->personality &= ~READ_IMPLIES_EXEC;
/*
 * sys_fork - fork system call entry: do_fork() with SIGCHLD as the
 * only clone flag, the caller's own stack pointer (regs->sp), and no
 * parent/child tid pointers.
 */
703 asmlinkage long sys_fork(struct pt_regs *regs)
705 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
/*
 * sys_clone - clone system call entry: forwards clone_flags, newsp
 * and the tid pointers to do_fork().
 * NOTE(review): lines between the signature and the do_fork() call
 * are missing from this listing; any adjustment of newsp made there
 * is not visible.
 */
709 sys_clone(unsigned long clone_flags, unsigned long newsp,
710 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
714 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
/*
 * sys_vfork - vfork system call entry: do_fork() with
 * CLONE_VFORK | CLONE_VM | SIGCHLD and the caller's stack pointer.
 * The trailing do_fork() arguments are on a line missing from this
 * listing.
 */
718 * This is trivial, and on the face of it looks like it
719 * could equally well be done in user mode.
721 * Not so, for quite unobvious reasons - register pressure.
722 * In user mode vfork() cannot have a stack frame, and if
723 * done by calling the "clone()" system call directly, you
724 * do not have enough call-clobbered registers to hold all
725 * the information you need.
727 asmlinkage long sys_vfork(struct pt_regs *regs)
729 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
/*
 * get_wchan - walk the saved frame pointers of a task that is not
 * running and look for the first return address outside the scheduler
 * functions.
 *
 * Nothing is walked for a NULL task, for current, or for a task in
 * TASK_RUNNING state. The walk is capped at 17 frames (count 0..16).
 *
 * The task stack occupies [stack, stack + THREAD_SIZE). The saved
 * stack pointer and every frame pointer are dereferenced as u64, so
 * both bounds checks reject an address equal to stack + THREAD_SIZE
 * (">=" rather than ">"): accepting it would read past the end of the
 * stack area.
 * NOTE(review): the local declarations, the early-return statements
 * and the loads through fp are on lines missing from this listing.
 */
733 unsigned long get_wchan(struct task_struct *p)
739 if (!p || p == current || p->state==TASK_RUNNING)
741 stack = (unsigned long)task_stack_page(p);
742 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
744 fp = *(u64 *)(p->thread.sp);
746 if (fp < (unsigned long)stack ||
747 fp >= (unsigned long)stack+THREAD_SIZE)
750 if (!in_sched_functions(ip))
753 } while (count++ < 16);
/*
 * do_arch_prctl - get or set the FS/GS base of task.
 *
 * doit is true when task is the current task; presumably it gates the
 * statements that touch live CPU state (segment loads, MSR accesses)
 * - the guards themselves are on lines missing from this listing.
 *
 * Setting a base (visible lines):
 *  - addresses at or above TASK_SIZE_OF(task) are checked for (the
 *    rejection itself is on a missing line);
 *  - a base that fits in 32 bits is installed as a TLS descriptor
 *    (GS_TLS / FS_TLS) and the matching selector recorded in
 *    thread.gsindex / thread.fsindex;
 *  - a larger base is stored in thread.gs / thread.fs with the
 *    selector index set to 0 and written with checking_wrmsrl() to
 *    MSR_KERNEL_GS_BASE or MSR_FS_BASE.
 * Reading a base: taken from the TLS descriptor when the saved
 * selector is the TLS one, otherwise from the MSR or the saved
 * thread.fs / thread.gs value, and copied to user address addr with
 * put_user().
 * NOTE(review): the switch on code, its case labels and the return
 * statement are not visible in this listing.
 */
757 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
760 int doit = task == current;
765 if (addr >= TASK_SIZE_OF(task))
768 /* handle small bases via the GDT because that's faster to
770 if (addr <= 0xffffffff) {
771 set_32bit_tls(task, GS_TLS, addr);
773 load_TLS(&task->thread, cpu);
774 load_gs_index(GS_TLS_SEL);
776 task->thread.gsindex = GS_TLS_SEL;
779 task->thread.gsindex = 0;
780 task->thread.gs = addr;
783 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
789 /* Not strictly needed for fs, but do it for symmetry
791 if (addr >= TASK_SIZE_OF(task))
794 /* handle small bases via the GDT because that's faster to
796 if (addr <= 0xffffffff) {
797 set_32bit_tls(task, FS_TLS, addr);
799 load_TLS(&task->thread, cpu);
800 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
802 task->thread.fsindex = FS_TLS_SEL;
805 task->thread.fsindex = 0;
806 task->thread.fs = addr;
808 /* set the selector to 0 to not confuse
810 asm volatile("movl %0,%%fs" :: "r" (0));
811 ret = checking_wrmsrl(MSR_FS_BASE, addr);
818 if (task->thread.fsindex == FS_TLS_SEL)
819 base = read_32bit_tls(task, FS_TLS);
821 rdmsrl(MSR_FS_BASE, base);
823 base = task->thread.fs;
824 ret = put_user(base, (unsigned long __user *)addr);
830 if (task->thread.gsindex == GS_TLS_SEL)
831 base = read_32bit_tls(task, GS_TLS);
833 asm("movl %%gs,%0" : "=r" (gsindex));
835 rdmsrl(MSR_KERNEL_GS_BASE, base);
837 base = task->thread.gs;
840 base = task->thread.gs;
841 ret = put_user(base, (unsigned long __user *)addr);
/*
 * sys_arch_prctl - arch_prctl system call entry: applies
 * do_arch_prctl() to the current task and returns its result.
 */
853 long sys_arch_prctl(int code, unsigned long addr)
855 return do_arch_prctl(current, code, addr);
/*
 * arch_align_stack - randomise an initial stack pointer.
 *
 * Unless the task's personality has ADDR_NO_RANDOMIZE or
 * randomize_va_space is 0, sp is lowered by a random amount in
 * 0..8191 bytes. The return statement (and any alignment applied
 * there) is on a line missing from this listing.
 */
858 unsigned long arch_align_stack(unsigned long sp)
860 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
861 sp -= get_random_int() % 8192;
/*
 * arch_randomize_brk - pick a randomised program break for mm.
 *
 * Asks randomize_range() for an address between mm->brk and
 * mm->brk + 0x02000000 (32 MiB above it); if that returns 0 the
 * unmodified mm->brk is returned instead.
 */
865 unsigned long arch_randomize_brk(struct mm_struct *mm)
867 unsigned long range_end = mm->brk + 0x02000000;
868 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;