/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 *
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/proto.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
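/*
 * Register a callback on the idle notifier chain; registered callbacks are
 * invoked with IDLE_START when a CPU enters the idle loop and IDLE_END when
 * it leaves it.
 */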
void idle_notifier_register(struct notifier_block *n)
	atomic_notifier_chain_register(&idle_notifier, n);

	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);

static void __exit_idle(void)
	if (test_and_clear_bit_pda(0, isidle) == 0)
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);

/* Called from interrupts to signify idle end */
/* idle loop has pid 0 */

/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	if (!need_resched()) {
		safe_halt();	/* enables interrupts racelessly */
	current_thread_info()->status |= TS_POLLING;

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
	__get_cpu_var(cpu_state) = CPU_DEAD;

#else
static inline void play_dead(void)
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			if (cpu_is_offline(smp_processor_id()))
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
static void do_nothing(void *unused)

/*
 * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
 * pm_idle and update to new pm_idle value. Required while changing pm_idle
 * handler on SMP systems.
 *
 * Caller must have changed pm_idle to the new value before the call. Old
 * pm_idle value will not be used by any CPU after the return of this function.
 */
void cpu_idle_wait(void)
	/* kick all the CPUs so that they exit out of pm_idle */
	smp_call_function(do_nothing, NULL, 0, 1);

EXPORT_SYMBOL_GPL(cpu_idle_wait);
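/*
 * A minimal usage sketch (hypothetical caller, not part of this file): an
 * idle-handler module would install its routine first and then flush the
 * old handler off every CPU before it may be freed or unloaded:
 *
 *	static void my_idle(void)		(hypothetical handler)
 *	{
 *		safe_halt();
 *	}
 *	...
 *	pm_idle = my_idle;
 *	cpu_idle_wait();	- no CPU uses the old handler once this returns
 */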
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
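/*
 * In mwait_idle_with_hints() below, "ax" is the MWAIT hint placed in EAX
 * (the target C-state/sub-state) and "cx" is the MWAIT extension word
 * placed in ECX.
 */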
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);

static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
	/* Any C1 states supported? */
	return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
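/*
 * Pick the idle routine for this CPU: prefer mwait_idle when MONITOR/MWAIT
 * is present and usable, unless the "idle=" boot option has already
 * overridden the choice.
 */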
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_SMP
	if (pm_idle == poll_idle && smp_num_siblings > 1) {
		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
			" performance may degrade.\n");
	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait
		 */
			printk(KERN_INFO "using mwait in idle threads.\n");
			pm_idle = mwait_idle;

static int __init idle_setup(char *str)
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
	} else if (!strcmp(str, "mwait"))

	boot_option_idle_override = 1;

early_param("idle", idle_setup);
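/*
 * Usage: the idle routine can be forced from the kernel command line with
 * "idle=poll" (busy-poll need_resched) or "idle=mwait"; either path sets
 * boot_option_idle_override so select_idle_routine() leaves it alone.
 */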
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
	       regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);
329 asm("movl %%ds,%0" : "=r" (ds));
330 asm("movl %%cs,%0" : "=r" (cs));
331 asm("movl %%es,%0" : "=r" (es));
332 asm("movl %%fs,%0" : "=r" (fsindex));
333 asm("movl %%gs,%0" : "=r" (gsindex));
335 rdmsrl(MSR_FS_BASE, fs);
336 rdmsrl(MSR_GS_BASE, gs);
337 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
344 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
345 fs,fsindex,gs,gsindex,shadowgs);
346 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
347 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
352 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
356 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
359 void show_regs(struct pt_regs *regs)
361 printk("CPU %d:", smp_processor_id());
363 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
void flush_thread(void)
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
void release_thread(struct task_struct *dead_task)
	if (dead_task->mm->context.size) {
		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
			dead_task->comm,
			dead_task->mm->context.ldt,
			dead_task->mm->context.size);
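/*
 * Install a 32-bit base address into one of the task's TLS (GDT) descriptor
 * slots; used by do_arch_prctl() below for the small-base fs/gs case.
 */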
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
	struct user_desc ud = {
	struct desc_struct *desc = t->thread.tls_array;

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
	return get_desc_base(&t->thread.tls_array[tls]);

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct * p, struct pt_regs * regs)
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;

	childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
		set_tsk_thread_flag(p, TIF_IO_BITMAP);

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);

	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
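/*
 * Set up the registers of a freshly exec'ed task: reset the data segment
 * registers, point the user code and stack selectors at the new image, and
 * drop any extended FP state inherited from the old one.
 */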
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
	asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);

EXPORT_SYMBOL_GPL(start_thread);
static void hard_disable_TSC(void)
	write_cr4(read_cr4() | X86_CR4_TSD);

void disable_TSC(void)
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */

static void hard_enable_TSC(void)
	write_cr4(read_cr4() & ~X86_CR4_TSD);

void enable_TSC(void)
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */

int get_tsc_mode(unsigned long adr)
	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;

	return put_user(val, (unsigned int __user *)adr);

int set_tsc_mode(unsigned int val)
	if (val == PR_TSC_SIGSEGV)
	else if (val == PR_TSC_ENABLE)
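/*
 * A minimal user-space sketch (hypothetical program, not part of this file):
 * these helpers back the PR_GET_TSC/PR_SET_TSC prctls, so a process can ask
 * for RDTSC to fault:
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);   - rdtsc now raises SIGSEGV
 *	prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);    - rdtsc allowed again
 */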
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
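/* For example, loaddebug(next, 7) expands to set_debugreg(next->debugreg7, 7). */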
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread,
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		update_debugctlmsr(0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);

	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */

	/*
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	asm volatile("movl %%fs,%0" : "=r" (fsindex));
	/* segment register != 0 always requires a reload.
	   also reload when it has changed.
	   when prev process used 64bit base always reload
	   to avoid an information leak. */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/* check if the user used a selector != 0
		 * if yes clear 64bit base, since overloaded base
		 * is always mapped to the Null selector
		 */

	/* when next process has a 64bit base use it */
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	asm volatile("movl %%gs,%0" : "=r" (gsindex));
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);

		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/* Must be after DS reload */

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
/*
 * sys_execve() executes a new program.
 */
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
	error = do_execve(filename, argv, envp, regs);
void set_personality_64bit(void)
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;

asmlinkage long sys_fork(struct pt_regs *regs)
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);

sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
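/*
 * Report where a sleeping task is waiting: walk the saved frame pointers on
 * its kernel stack (bounded to 16 frames) until we leave the scheduler
 * functions.
 */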
unsigned long get_wchan(struct task_struct *p)
	if (!p || p == current || p->state == TASK_RUNNING)
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
	fp = *(u64 *)(p->thread.sp);
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
		if (!in_sched_functions(ip))
	} while (count++ < 16);
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
	int doit = task == current;

		if (addr >= TASK_SIZE_OF(task))
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gsindex = 0;
			task->thread.gs = addr;
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);

		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fsindex = 0;
			task->thread.fs = addr;
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
			rdmsrl(MSR_FS_BASE, base);
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
			asm("movl %%gs,%0" : "=r" (gsindex));
			rdmsrl(MSR_KERNEL_GS_BASE, base);
			base = task->thread.gs;
		base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
long sys_arch_prctl(int code, unsigned long addr)
	return do_arch_prctl(current, code, addr);
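/*
 * A minimal user-space sketch (hypothetical program, not part of this file):
 * the ARCH_SET_FS/ARCH_GET_FS codes handled above are reached through the
 * arch_prctl() system call, e.g.:
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, tls_block);   - set the FS base
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);       - read it back
 */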
unsigned long arch_align_stack(unsigned long sp)
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
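/*
 * Place the randomized brk somewhere in the 32MB (0x02000000) window above
 * the current brk, falling back to the unrandomized value if that fails.
 */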
unsigned long arch_randomize_brk(struct mm_struct *mm)
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;