/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);

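/*
 * Sketch of how a user of the idle notifiers might look. This is
 * illustrative only; my_idle_event/my_idle_nb are made-up names and
 * not part of this file:
 *
 *	static int my_idle_event(struct notifier_block *nb,
 *				 unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			;	// this CPU is about to go idle
 *		else if (action == IDLE_END)
 *			;	// this CPU is leaving idle
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_event,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */
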
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine.
 */
static void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		/* Enables interrupts one instruction before HLT.
		   x86 special cases this so there is no race. */
		safe_halt();
	} else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}

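/*
 * cpu_idle_wait() only returns once every online CPU has left the idle
 * routine it was running when the call was made. It is meant to be
 * called after pm_idle has been changed, so the old idle function can
 * safely go away (e.g. when it lives in a module).
 */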
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
			    !per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}

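/*
 * Note the pattern in both routines above: MONITOR arms the CPU on the
 * thread-flags cache line, the barrier makes a concurrently set
 * TIF_NEED_RESCHED visible, and only then is MWAIT entered. A wakeup
 * that slips in after the second check still writes the monitored
 * line, so MWAIT falls through immediately instead of sleeping.
 */
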
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;

	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait
		 */
		if (!pm_idle) {
			if (!printed) {
				printk(KERN_INFO "using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}

static int __init idle_setup(char *str)
{
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);

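/*
 * Example: booting with "idle=poll" selects poll_idle() above, while
 * "idle=mwait" forces MWAIT-based idle even when the default policy
 * would not pick it. Either option also sets boot_option_idle_override
 * so later code knows the user made an explicit choice.
 */
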
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

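/*
 * flush_thread() runs on exec: it settles a pending 32/64bit ABI
 * switch, clears the debug registers and TLS slots, and drops the
 * coprocessor state of the old program.
 */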
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

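/*
 * The two helpers below back the small-base fast path of
 * do_arch_prctl(): a TLS base below 4GB is kept in a real GDT
 * descriptor (set_32bit_tls) and read back by reassembling the three
 * split base fields of that descriptor (read_32bit_tls).
 */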
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
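/* e.g. loaddebug(next, 7) expands to set_debugreg(next->debugreg7, 7) */
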
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;

		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;

		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid,
	  struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		       NULL, NULL);
}

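/*
 * get_wchan() follows up to 16 saved frame pointers up @p's kernel
 * stack and returns the first return address outside the scheduler,
 * i.e. the place where the task went to sleep. This assumes the
 * kernel is built to save frame pointers; without them the walk is
 * unreliable.
 */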
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.rsp < stack || p->thread.rsp > stack + THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack + THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp + 8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

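/*
 * do_arch_prctl() backs the arch_prctl(2) system call. A 64bit
 * threading library typically installs its TLS block with
 *
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tls);
 *
 * Bases below 4GB take the GDT descriptor path because reloading a
 * segment register is cheaper than the MSR write.
 */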
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}

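/*
 * Randomize the top of the exec-time stack by up to 8KB while keeping
 * the 16 byte alignment the x86-64 ABI requires.
 */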
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}