/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
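
/* Entry point (in entry.S) that a newly forked task returns through. */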
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
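
/*
 * enter_idle()/exit_idle() bracket an idle period and fire the
 * IDLE_START/IDLE_END notifications on the chain registered above.
 */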
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (read_pda(isidle) == 0)
		return;
	write_pda(isidle, 0);
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine.
 */
static void default_idle(void)
{
	local_irq_enable();

	current_thread_info()->status &= ~TS_POLLING;
	smp_mb__after_clear_bit();
	while (!need_resched()) {
		local_irq_disable();
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();

	asm volatile(
		"2:"
		"testl %0,%1;"
		"rep; nop;"
		"je 2b;"
		: :
		"i" (_TIF_NEED_RESCHED),
		"m" (current_thread_info()->flags));
}
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
	local_irq_enable();

	while (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (need_resched())
			break;
		__mwait(0, 0);
	}
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait
		 */
		if (!pm_idle) {
			if (!printed) {
				printk("using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}

static int __init idle_setup(char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}

	boot_option_idle_override = 1;
	return 1;
}

__setup("idle=", idle_setup);
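
/*
 * Example: booting with "idle=poll" makes idle_setup() install poll_idle
 * above and marks the idle routine as overridden by a boot option.
 */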

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
		regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
		regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
		regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
		regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
		regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
		fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

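	/*
	 * An exec that switches between the 64-bit and 32-bit ABIs defers
	 * the switch to this point: flip the IA32 flag together with the
	 * pending bit so both stay consistent.
	 */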
	if (t->flags & _TIF_ABI_PENDING) {
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
		if (t->flags & _TIF_IA32)
			current_thread_info()->status |= TS_COMPAT;
	}
	t->flags &= ~_TIF_DEBUG;

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}
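
/*
 * Helpers for the 32-bit TLS entries in the GDT that back small (<4GB)
 * FS/GS bases for compat tasks; used by do_arch_prctl() below.
 */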
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
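
/*
 * Set up the kernel stack and thread state of a newly forked child.
 * The child starts in ret_from_fork (selected by TIF_FORK in the
 * switch_to path) with the user-mode pt_regs copied here.
 */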
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
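
/*
 * __switch_to_xtra() handles the slow-path parts of a context switch:
 * reloading the debug registers for a traced task and keeping the TSS
 * I/O permission bitmap in sync with the incoming task.
 */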
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		       NULL, NULL);
}
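
/*
 * Walk the saved frame pointers of a sleeping task to find the address
 * it is blocked at (reported via /proc/<pid>/wchan).
 */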
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
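
/*
 * do_arch_prctl() implements ARCH_SET_FS/ARCH_SET_GS and
 * ARCH_GET_FS/ARCH_GET_GS: it sets or reads a task's FS/GS base, via a
 * GDT TLS entry for bases below 4GB or via the FS_BASE/KERNEL_GS_BASE
 * MSRs otherwise.
 */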
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
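
/*
 * Illustrative userspace usage (not part of this file): the call is
 * normally made through syscall(2), e.g.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 */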

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}