/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 *
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/proto.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */

/* idle loop has pid 0 */
/*
 * We use this if we don't have any better
 * idle routine.
 */
void default_idle(void)
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	if (!need_resched()) {
		t0n = ktime_to_ns(t0);
		safe_halt();	/* enables interrupts racelessly */
		t1n = ktime_to_ns(t1);
		sched_clock_idle_wakeup_event(t1n - t0n);
	}
	current_thread_info()->status |= TS_POLLING;
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
	__get_cpu_var(cpu_state) = CPU_DEAD;
#else
static inline void play_dead(void)
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	tick_nohz_stop_sched_tick();
	while (!need_resched()) {
		if (cpu_is_offline(smp_processor_id()))
			play_dead();
		/*
		 * Idle routines should keep interrupts disabled
		 * from here on, until they go to idle.
		 * Otherwise, idle callbacks can misfire.
		 */
		/* In many cases the interrupt that ended idle
		   has already called exit_idle. But some idle
		   loops can be woken up without interrupt. */
	}
	tick_nohz_restart_sched_tick();
	preempt_enable_no_resched();
static void do_nothing(void *unused)
{
}

/*
 * cpu_idle_wait - ensure that all CPUs discard the old value of pm_idle
 * and pick up the new one. Required while changing the pm_idle handler
 * on SMP systems.
 *
 * The caller must have changed pm_idle to the new value before the call.
 * The old pm_idle value will not be used by any CPU after this function
 * returns.
 */
void cpu_idle_wait(void)
{
	/* kick all the CPUs so that they exit out of pm_idle */
	smp_call_function(do_nothing, NULL, 0, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
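
/*
 * Illustrative usage sketch (not part of the original file): a caller is
 * expected to publish its replacement idle routine first and only then call
 * cpu_idle_wait(), so that no CPU keeps running the old handler.
 * my_custom_idle() is a hypothetical example routine.
 *
 *	static void my_custom_idle(void)
 *	{
 *		local_irq_enable();
 *		cpu_relax();
 *	}
 *
 *	static void install_my_idle(void)
 *	{
 *		pm_idle = my_custom_idle;	// publish the new handler first
 *		cpu_idle_wait();		// then wait for stale users to drop out
 *	}
 */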
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
	/* Any C1 states supported? */
	return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
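
/*
 * Note on the check above: CPUID leaf 5 (MONITOR/MWAIT) encodes the number
 * of MWAIT sub C-states per C-state in EDX, four bits per C-state (bits 3:0
 * for C0, bits 7:4 for C1, and so on), so (cpuid_edx(5) >> 4) & 0xf is the
 * C1 sub-state count.
 */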
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_SMP
	if (pm_idle == poll_idle && smp_num_siblings > 1) {
		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
			" performance may degrade.\n");
	}
#endif
	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait
		 */
		printk(KERN_INFO "using mwait in idle threads.\n");
		pm_idle = mwait_idle;
	}
static int __init idle_setup(char *str)
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
	} else if (!strcmp(str, "mwait"))

	boot_option_idle_override = 1;

early_param("idle", idle_setup);
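
/*
 * Usage note (illustrative): idle_setup() is bound to the early "idle="
 * kernel parameter above, so booting with e.g. "idle=poll" selects the
 * busy-polling routine and "idle=mwait" forces MONITOR/MWAIT based idle;
 * either setting marks boot_option_idle_override so later CPU setup does
 * not override the choice.
 */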
/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
323 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
325 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
326 regs->ax, regs->bx, regs->cx);
327 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
328 regs->dx, regs->si, regs->di);
329 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
330 regs->bp, regs->r8, regs->r9);
331 printk("R10: %016lx R11: %016lx R12: %016lx\n",
332 regs->r10, regs->r11, regs->r12);
333 printk("R13: %016lx R14: %016lx R15: %016lx\n",
334 regs->r13, regs->r14, regs->r15);
336 asm("movl %%ds,%0" : "=r" (ds));
337 asm("movl %%cs,%0" : "=r" (cs));
338 asm("movl %%es,%0" : "=r" (es));
339 asm("movl %%fs,%0" : "=r" (fsindex));
340 asm("movl %%gs,%0" : "=r" (gsindex));
342 rdmsrl(MSR_FS_BASE, fs);
343 rdmsrl(MSR_GS_BASE, gs);
344 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
351 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
352 fs,fsindex,gs,gsindex,shadowgs);
353 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
354 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
359 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
363 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
void show_regs(struct pt_regs *regs)
	printk("CPU %d:", smp_processor_id());
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
void flush_thread(void)
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
void release_thread(struct task_struct *dead_task)
	if (dead_task->mm->context.size) {
		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
			dead_task->comm,
			dead_task->mm->context.ldt,
			dead_task->mm->context.size);
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
	struct user_desc ud = {
	struct desc_struct *desc = t->thread.tls_array;

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
	return get_desc_base(&t->thread.tls_array[tls]);
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;

	childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}
	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
	}

	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
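
/*
 * For example, loaddebug(next, 7) expands to set_debugreg(next->debugreg7, 7),
 * i.e. it writes the saved shadow value for debug register 7 back into the
 * hardware register.
 */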
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */

	/*
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	asm volatile("movl %%fs,%0" : "=r" (fsindex));
	/* segment register != 0 always requires a reload.
	   also reload when it has changed.
	   when prev process used 64bit base always reload
	   to avoid an information leak. */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/* check if the user used a selector != 0
		 * if yes clear 64bit base, since overloaded base
		 * is always mapped to the Null selector
		 */
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	asm volatile("movl %%gs,%0" : "=r" (gsindex));
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/* Must be after DS reload */

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif
	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
/*
 * sys_execve() executes a new program.
 */
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
void set_personality_64bit(void)
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
asmlinkage long sys_fork(struct pt_regs *regs)
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
unsigned long get_wchan(struct task_struct *p)
	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
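	/*
	 * thread.sp is the kernel stack pointer saved at the last context
	 * switch; the loop below treats the word it points at as a saved
	 * frame pointer and walks the frame chain until it finds a return
	 * address outside the scheduler.
	 */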
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		if (!in_sched_functions(ip))
			return ip;
	} while (count++ < 16);
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
	int doit = task == current;

	if (addr >= TASK_SIZE_OF(task))
		return -EPERM;
	/* handle small bases via the GDT because that's faster to
	   switch. */
	if (addr <= 0xffffffff) {
		set_32bit_tls(task, GS_TLS, addr);
		load_TLS(&task->thread, cpu);
		load_gs_index(GS_TLS_SEL);
		task->thread.gsindex = GS_TLS_SEL;
	} else {
		task->thread.gsindex = 0;
		task->thread.gs = addr;
		ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
	}
	/* Not strictly needed for fs, but do it for symmetry
	   with gs */
	if (addr >= TASK_SIZE_OF(task))
		return -EPERM;
	/* handle small bases via the GDT because that's faster to
	   switch. */
	if (addr <= 0xffffffff) {
		set_32bit_tls(task, FS_TLS, addr);
		load_TLS(&task->thread, cpu);
		asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
		task->thread.fsindex = FS_TLS_SEL;
	} else {
		task->thread.fsindex = 0;
		task->thread.fs = addr;
		/* set the selector to 0 to not confuse
		   __switch_to */
		asm volatile("movl %0,%%fs" :: "r" (0));
		ret = checking_wrmsrl(MSR_FS_BASE, addr);
	}
	if (task->thread.fsindex == FS_TLS_SEL)
		base = read_32bit_tls(task, FS_TLS);
	else if (doit)
		rdmsrl(MSR_FS_BASE, base);
	else
		base = task->thread.fs;
	ret = put_user(base, (unsigned long __user *)addr);

	if (task->thread.gsindex == GS_TLS_SEL)
		base = read_32bit_tls(task, GS_TLS);
	else if (doit) {
		asm("movl %%gs,%0" : "=r" (gsindex));
		if (gsindex)
			rdmsrl(MSR_KERNEL_GS_BASE, base);
		else
			base = task->thread.gs;
	} else
		base = task->thread.gs;
	ret = put_user(base, (unsigned long __user *)addr);
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
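
/*
 * Illustrative userspace usage (not part of the original file): the syscall
 * is typically reached via syscall(2), e.g.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);	// read the FS base
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, new_base);	// new_base: hypothetical caller-chosen address
 */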
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
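	/* 0x02000000 is 32 MiB: the new brk is picked within 32 MiB above mm->brk. */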
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}