/*
 * linux/arch/x86-64/kernel/process.c
 *
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 *
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/proto.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static atomic_t hlt_counter = ATOMIC_INIT(0);

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
{
        atomic_inc(&hlt_counter);
}
EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
        atomic_dec(&hlt_counter);
}
EXPORT_SYMBOL(enable_hlt);
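/*
 * Illustrative use: code that cannot tolerate HLT in the idle loop
 * (for example around a timing-critical DMA transfer) brackets the
 * region with disable_hlt()/enable_hlt().  While hlt_counter is
 * non-zero, default_idle() below busy-waits instead of halting.
 */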
/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
{
        local_irq_enable();

        if (!atomic_read(&hlt_counter)) {
                clear_thread_flag(TIF_POLLING_NRFLAG);
                smp_mb__after_clear_bit();
                while (!need_resched()) {
                        local_irq_disable();
                        if (!need_resched())
                                safe_halt();
                        else
                                local_irq_enable();
                }
                set_thread_flag(TIF_POLLING_NRFLAG);
        } else {
                while (!need_resched())
                        cpu_relax();
        }
}
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
        local_irq_enable();

        asm volatile(
                "2:"
                "testl %0,%1;"
                "rep; nop;"
                "je 2b;"
                : :
                "i" (_TIF_NEED_RESCHED),
                "m" (current_thread_info()->flags));
}
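/*
 * Note that poll_idle() spins on the need_resched bit itself, so a remote
 * CPU that sets the flag wakes the poller without a reschedule IPI;
 * TIF_POLLING_NRFLAG (managed in cpu_idle() below) is what tells the
 * scheduler that skipping the IPI is safe.
 */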
void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                            !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
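/*
 * cpu_idle_wait() is typically called after pm_idle has been changed: it
 * returns only once each online CPU is known to have passed through the
 * top of its idle loop (clearing cpu_idle_state) and thus reloaded
 * pm_idle at least once.
 */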
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We don't actually take CPU down, just spin without interrupts. */
static inline void play_dead(void)
{
        /* Ack it */
        __get_cpu_var(cpu_state) = CPU_DEAD;

        local_irq_disable();
        while (1)
                cpu_relax();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        set_thread_flag(TIF_POLLING_NRFLAG);

        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        void (*idle)(void);

                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        rmb();
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        idle();
                }

                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
        local_irq_enable();

        while (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (need_resched())
                        break;
                __mwait(0, 0);
        }
}
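/*
 * The ordering above matters: MONITOR arms the wakeup address first,
 * need_resched() is re-checked, and only then does MWAIT sleep, so a
 * store to the flags word between the check and the MWAIT still
 * terminates the wait.
 */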
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
        static int printed;

        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                /*
                 * Skip, if setup has overridden idle.
                 * One CPU supports mwait => all CPUs support mwait.
                 */
                if (!boot_option_idle_override) {
                        if (!printed) {
                                printk("using mwait in idle threads.\n");
                                printed = 1;
                        }
                        pm_idle = mwait_idle;
                }
        }
}
static int __init idle_setup(char *str)
{
        if (!strncmp(str, "poll", 4)) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        }

        boot_option_idle_override = 1;
        return 1;
}
__setup("idle=", idle_setup);
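/*
 * Example: booting with "idle=poll" selects poll_idle above; any idle=
 * option also sets boot_option_idle_override, which keeps
 * select_idle_routine() from replacing the choice with mwait_idle.
 */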
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                system_utsname.release,
                (int)strcspn(system_utsname.version, " "),
                system_utsname.version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
                regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
                regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
                regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
                regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
                regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
                regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        asm("movq %%cr0, %0": "=r" (cr0));
        asm("movq %%cr2, %0": "=r" (cr2));
        asm("movq %%cr3, %0": "=r" (cr3));
        asm("movq %%cr4, %0": "=r" (cr4));

        printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
                fs, fsindex, gs, gsindex, shadowgs);
        printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(&regs->rsp);
}
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        /*
         * Remove function-return probe instances associated with this task
         * and put them back on the free list. Do not insert an exit probe for
         * this function, it will be disabled by kprobe_flush_task if you do.
         */
        kprobe_flush_task(me);

        if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
        }
}
void flush_thread(void)
{
        struct task_struct *tsk = current;
        struct thread_info *t = current_thread_info();

        if (t->flags & _TIF_ABI_PENDING)
                t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
                (((u32)desc->base1) << 16) |
                (((u32)desc->base2) << 24);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_ti_thread_flag(p->thread_info, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                        IO_BITMAP_BYTES);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
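/*
 * For example, loaddebug(next, 7) expands (via token pasting) to
 * set_debug(next->debugreg7, 7); the debug-register reload in
 * __switch_to() below relies on this.
 */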
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                             *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        /*
         * Reload esp0, LDT and the page table pointer:
         */
        tss->rsp0 = next->rsp0;

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        asm volatile("mov %%es,%0" : "=m" (prev->es));
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);
        {
                unsigned fsindex;
                asm volatile("movl %%fs,%0" : "=r" (fsindex));
                /* segment register != 0 always requires a reload.
                   also reload when it has changed.
                   when prev process used 64bit base always reload
                   to avoid an information leak. */
                if (unlikely(fsindex | next->fsindex | prev->fs)) {
                        loadsegment(fs, next->fsindex);
                        /* check if the user used a selector != 0
                         * if yes clear 64bit base, since overloaded base
                         * is always mapped to the Null selector
                         */
                        if (fsindex)
                                prev->fs = 0;
                }
                /* when next process has a 64bit base use it */
                if (next->fs)
                        wrmsrl(MSR_FS_BASE, next->fs);
                prev->fsindex = fsindex;
        }
        {
                unsigned gsindex;
                asm volatile("movl %%gs,%0" : "=r" (gsindex));
                if (unlikely(gsindex | next->gsindex | prev->gs)) {
                        load_gs_index(next->gsindex);
                        if (gsindex)
                                prev->gs = 0;
                }
                if (next->gs)
                        wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
                prev->gsindex = gsindex;
        }
        /*
         * Switch the PDA context.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);
        write_pda(kernelstack,
                  (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
        /*
         * Now maybe reload the debug registers
         */
        if (unlikely(next->debugreg7)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }
        /*
         * Handle the IO bitmap
         */
        if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
                if (next->io_bitmap_ptr)
                        /*
                         * Copy the relevant range of the IO bitmap.
                         * Normally this is 128 bytes or less:
                         */
                        memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                                max(prev->io_bitmap_max, next->io_bitmap_max));
                else {
                        /*
                         * Clear any possible leftover bits:
                         */
                        memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
                }
        }

        return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}
void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                       NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)p->thread_info;
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}
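/*
 * The walk above assumes the usual frame layout with frame pointers
 * saved: *fp is the caller's frame pointer and *(fp+8) is the return
 * address, which is reported once it falls outside the scheduler.
 */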
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit)
                        rdmsrl(MSR_KERNEL_GS_BASE, base);
                else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
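/*
 * Illustrative user-space call (constants from <asm/prctl.h>, raw
 * syscall(2) since there is no dedicated libc wrapper; tls_block is a
 * hypothetical buffer):
 *
 *      unsigned long base;
 *      syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *      syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 */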
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = (struct pt_regs *)(tsk->thread.rsp0);
        --pp;

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}
unsigned long arch_align_stack(unsigned long sp)
{
        if (randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}
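/*
 * Worked example: with randomization enabled, a starting sp of
 * 0x7fffffffe000 can move down by up to 8191 bytes (to 0x7fffffffc001),
 * and the final mask re-aligns the result to a 16-byte boundary.
 */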