2 * linux/arch/x86-64/kernel/process.c
4 * Copyright (C) 1995 Linus Torvalds
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
12 * CPU hotplug support - ashok.raj@intel.com
13 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
17 * This file handles the architecture-dependent parts of process handling..
22 #include <linux/cpu.h>
23 #include <linux/errno.h>
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
27 #include <linux/elfcore.h>
28 #include <linux/smp.h>
29 #include <linux/slab.h>
30 #include <linux/user.h>
31 #include <linux/module.h>
32 #include <linux/a.out.h>
33 #include <linux/interrupt.h>
34 #include <linux/delay.h>
35 #include <linux/ptrace.h>
36 #include <linux/utsname.h>
37 #include <linux/random.h>
38 #include <linux/kprobes.h>
40 #include <asm/uaccess.h>
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
44 #include <asm/processor.h>
46 #include <asm/mmu_context.h>
48 #include <asm/prctl.h>
49 #include <asm/kdebug.h>
51 #include <asm/proto.h>
/* Entry point a freshly forked task returns through; declared asmlinkage,
 * presumably defined in assembly (entry.S) — confirm against full source. */
54 asmlinkage extern void ret_from_fork(void);
/* Default clone flags applied when spawning kernel threads. */
56 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
/* Set to 1 by the "idle=" boot-option handler (idle_setup below) so later
 * autodetection knows the user forced an idle routine. */
58 unsigned long boot_option_idle_override = 0;
59 EXPORT_SYMBOL(boot_option_idle_override);
62 * Powermanagement idle function, if any..
/* Hook for an overriding idle routine (e.g. mwait_idle); NULL if none. */
64 void (*pm_idle)(void);
/* Per-CPU flag used by cpu_idle_wait() to handshake with idling CPUs. */
65 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
68 * We use this if we don't have any better
/*
 * default_idle - architecture-default idle routine.
 * NOTE(review): this listing is missing original lines 72-74 and 78-83
 * (including the loop body with the actual halt); treat as incomplete.
 */
71 void default_idle(void)
/* Stop advertising ->flags polling; wakers must now use a real IPI
 * (see the polling-vs-IPI comment above poll_idle). */
75 clear_thread_flag(TIF_POLLING_NRFLAG);
/* Order the flag clear before the need_resched() test below. */
76 smp_mb__after_clear_bit();
77 while (!need_resched()) {
/* Restore polling mode on exit from the idle loop. */
84 set_thread_flag(TIF_POLLING_NRFLAG);
88 * On SMP it's slightly faster (but much more power-consuming!)
89 * to poll the ->need_resched flag instead of waiting for the
90 * cross-CPU IPI to arrive. Use this option with caution.
/* Busy-poll idle loop, selected by "idle=poll" (see idle_setup).
 * NOTE(review): the inline-asm polling loop (original lines 93-101) is
 * absent from this listing; only its operand list survives below. */
92 static void poll_idle (void)
102 "i" (_TIF_NEED_RESCHED),
103 "m" (current_thread_info()->flags));
/*
 * cpu_idle_wait - handshake with every online CPU's idle loop: raise the
 * per-cpu cpu_idle_state flag everywhere, then spin until each flagged CPU
 * has cleared it.  (Listing has gaps; the local cpumask declarations and
 * the retry loop head are missing.)
 */
106 void cpu_idle_wait(void)
108 unsigned int cpu, this_cpu = get_cpu();
/* Pin ourselves to this CPU for the duration of the handshake. */
111 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
/* Flag every online CPU as "must acknowledge". */
115 for_each_online_cpu(cpu) {
116 per_cpu(cpu_idle_state, cpu) = 1;
/* This CPU is obviously not idling; clear our own flag immediately. */
120 __get_cpu_var(cpu_idle_state) = 0;
/* Poll until each remaining online CPU has cleared its flag. */
125 for_each_online_cpu(cpu) {
126 if (cpu_isset(cpu, map) &&
127 !per_cpu(cpu_idle_state, cpu))
/* Drop CPUs that went offline meanwhile so the loop can terminate. */
130 cpus_and(map, map, cpu_online_map);
131 } while (!cpus_empty(map));
133 EXPORT_SYMBOL_GPL(cpu_idle_wait);
135 #ifdef CONFIG_HOTPLUG_CPU
136 DECLARE_PER_CPU(int, cpu_state);
139 /* We halt the CPU with physical CPU hotplug */
/* Hotplug variant: mark this CPU dead; the halt itself (per the comment
 * above) is in lines missing from this listing. */
140 static inline void play_dead(void)
146 __get_cpu_var(cpu_state) = CPU_DEAD;
/* Non-hotplug stub; its body (original lines 154-156) is absent here. */
153 static inline void play_dead(void)
157 #endif /* CONFIG_HOTPLUG_CPU */
160 * The idle thread. There's no useful work to be
161 * done, so just try to conserve power and have a
162 * low exit latency (ie sit in a loop waiting for
163 * somebody to say that they'd like to reschedule)
/* NOTE(review): the function header itself (original lines 164-166,
 * presumably "void cpu_idle(void)") is missing from this listing. */
167 set_thread_flag(TIF_POLLING_NRFLAG);
169 /* endless idle loop with no priority at all */
171 while (!need_resched()) {
/* Acknowledge a pending cpu_idle_wait() handshake. */
174 if (__get_cpu_var(cpu_idle_state))
175 __get_cpu_var(cpu_idle_state) = 0;
/* Offlined CPU: presumably falls into play_dead() on the missing next
 * line — confirm against full source. */
181 if (cpu_is_offline(smp_processor_id()))
186 preempt_enable_no_resched();
193 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
194 * which can obviate IPI to trigger checking of need_resched.
195 * We execute MONITOR against need_resched and enter optimized wait state
196 * through MWAIT. Whenever someone changes need_resched, we would be woken
197 * up from MWAIT (without an IPI).
199 static void mwait_idle(void)
203 while (!need_resched()) {
/* NOTE(review): "¤t" below is mojibake for "&current" (an HTML
 * "&curren;" entity corruption); restore before compiling.  Arms the
 * monitor on this thread's flags word so MWAIT (in a missing line)
 * wakes when need_resched is set. */
204 __monitor((void *)¤t_thread_info()->flags, 0, 0);
/*
 * select_idle_routine - install mwait_idle as pm_idle on CPUs with the
 * MWAIT feature, unless a boot option already overrode the choice (per
 * the "Skip, if setup has overridden idle" comment below).
 */
212 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
215 if (cpu_has(c, X86_FEATURE_MWAIT)) {
217 * Skip, if setup has overridden idle.
218 * One CPU supports mwait => All CPUs supports mwait
222 printk("using mwait in idle threads.\n");
225 pm_idle = mwait_idle;
/* Parse the "idle=" kernel boot parameter; "idle=poll" selects the
 * polling idle loop (pm_idle assignment is in a missing line — confirm). */
230 static int __init idle_setup (char *str)
232 if (!strncmp(str, "poll", 4)) {
233 printk("using polling idle threads.\n");
/* Record the override so select_idle_routine() leaves pm_idle alone. */
237 boot_option_idle_override = 1;
241 __setup("idle=", idle_setup);
243 /* Prints also some state that isn't saved in the pt_regs */
/*
 * __show_regs - dump the register state from @regs plus live segment
 * selectors, fs/gs base MSRs and control registers read directly off
 * the current CPU.
 */
244 void __show_regs(struct pt_regs * regs)
246 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
247 unsigned int fsindex,gsindex;
248 unsigned int ds,cs,es;
/* Identify the task and kernel version first. */
252 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
253 current->pid, current->comm, print_tainted(),
254 system_utsname.release,
255 (int)strcspn(system_utsname.version, " "),
256 system_utsname.version);
257 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
258 printk_address(regs->rip);
259 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
261 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
262 regs->rax, regs->rbx, regs->rcx);
263 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
264 regs->rdx, regs->rsi, regs->rdi);
265 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
266 regs->rbp, regs->r8, regs->r9);
267 printk("R10: %016lx R11: %016lx R12: %016lx\n",
268 regs->r10, regs->r11, regs->r12);
269 printk("R13: %016lx R14: %016lx R15: %016lx\n",
270 regs->r13, regs->r14, regs->r15);
/* Segment selectors are not in pt_regs; read them off the CPU. */
272 asm("movl %%ds,%0" : "=r" (ds));
273 asm("movl %%cs,%0" : "=r" (cs));
274 asm("movl %%es,%0" : "=r" (es));
275 asm("movl %%fs,%0" : "=r" (fsindex));
276 asm("movl %%gs,%0" : "=r" (gsindex));
/* The fs/gs base addresses live in MSRs on x86-64, not in pt_regs. */
278 rdmsrl(MSR_FS_BASE, fs);
279 rdmsrl(MSR_GS_BASE, gs);
280 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
/* Control registers are likewise read live. */
282 asm("movq %%cr0, %0": "=r" (cr0));
283 asm("movq %%cr2, %0": "=r" (cr2));
284 asm("movq %%cr3, %0": "=r" (cr3));
285 asm("movq %%cr4, %0": "=r" (cr4));
287 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
288 fs,fsindex,gs,gsindex,shadowgs);
289 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
290 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
/* Public wrapper: print the CPU id, then (in a missing line, presumably
 * via __show_regs) the register dump, then a stack trace. */
293 void show_regs(struct pt_regs *regs)
295 printk("CPU %d:", smp_processor_id());
/* NOTE(review): "®s" is mojibake for "&regs" (an HTML "&reg;" entity
 * corruption); restore before compiling. */
297 show_trace(®s->rsp);
301 * Free current thread data structures etc..
303 void exit_thread(void)
305 struct task_struct *me = current;
306 struct thread_struct *t = &me->thread;
309 * Remove function-return probe instances associated with this task
310 * and put them back on the free list. Do not insert an exit probe for
311 * this function, it will be disabled by kprobe_flush_task if you do.
313 kprobe_flush_task(me);
/* Release the task's private I/O permission bitmap, if it has one. */
315 if (me->thread.io_bitmap_ptr) {
316 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
318 kfree(t->io_bitmap_ptr);
319 t->io_bitmap_ptr = NULL;
321 * Careful, clear this in the TSS too:
/* 0xff = all bits set; in the x86 TSS I/O bitmap a set bit denies port
 * access, so this revokes the range this task had mapped in. */
323 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
324 t->io_bitmap_max = 0;
/* Reset per-thread state for a fresh executable image (called at exec). */
329 void flush_thread(void)
331 struct task_struct *tsk = current;
332 struct thread_info *t = current_thread_info();
/* Apply a pending ABI switch: the XOR clears _TIF_ABI_PENDING and
 * simultaneously toggles _TIF_IA32. */
334 if (t->flags & _TIF_ABI_PENDING)
335 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
/* Clear all saved hardware debug registers for the new image. */
337 tsk->thread.debugreg0 = 0;
338 tsk->thread.debugreg1 = 0;
339 tsk->thread.debugreg2 = 0;
340 tsk->thread.debugreg3 = 0;
341 tsk->thread.debugreg6 = 0;
342 tsk->thread.debugreg7 = 0;
/* Wipe inherited TLS descriptors. */
343 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
345 * Forget coprocessor state..
/* Final per-task cleanup; warn if the dead task still owns an LDT.
 * (Listing is missing the lines around the printk arguments.) */
351 void release_thread(struct task_struct *dead_task)
354 if (dead_task->mm->context.size) {
355 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
357 dead_task->mm->context.ldt,
358 dead_task->mm->context.size);
/* Install a 32-bit TLS descriptor for base @addr into the task's
 * tls_array.  NOTE(review): the user_desc initializer fields and the
 * "+ tls" slot selection are in lines missing from this listing. */
364 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
366 struct user_desc ud = {
373 struct n_desc_struct *desc = (void *)t->thread.tls_array;
/* Encode the descriptor words from the user_desc template. */
375 desc->a = LDT_entry_a(&ud);
376 desc->b = LDT_entry_b(&ud);
/* Reassemble the 32-bit base address stored in one of the task's TLS
 * descriptors.  NOTE(review): the low 16 bits (base0 term) and the
 * "+ tls" slot selection are in lines missing from this listing. */
379 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
381 struct desc_struct *desc = (void *)t->thread.tls_array;
384 (((u32)desc->base1) << 16) |
385 (((u32)desc->base2) << 24);
389 * This gets called before we allocate a new thread and copy
390 * the current task into it.
/* NOTE(review): the body (original lines 393-395) is absent from this
 * listing. */
392 void prepare_to_copy(struct task_struct *tsk)
/*
 * copy_thread - arch-specific part of fork: lay out the child's kernel
 * stack and pt_regs, copy segment state from the parent, duplicate the
 * I/O permission bitmap, and install a new TLS when CLONE_SETTLS is set.
 * Returns 0 on success or a negative errno.  (Listing has gaps.)
 */
397 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
398 unsigned long unused,
399 struct task_struct * p, struct pt_regs * regs)
402 struct pt_regs * childregs;
403 struct task_struct *me = current;
/* The child's pt_regs frame sits at the top of its kernel stack. */
405 childregs = ((struct pt_regs *)
406 (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
407 *childregs = *regs;
/* NOTE(review): line above reconstructs nothing — kept as listed. */
410 childregs->rsp = rsp;
/* (missing conditional) when no user stack was given, the child's stack
 * is the regs frame itself — kernel-thread case. */
412 childregs->rsp = (unsigned long)childregs;
414 p->thread.rsp = (unsigned long) childregs;
415 p->thread.rsp0 = (unsigned long) (childregs+1);
416 p->thread.userrsp = me->thread.userrsp;
418 set_ti_thread_flag(p->thread_info, TIF_FORK);
420 p->thread.fs = me->thread.fs;
421 p->thread.gs = me->thread.gs;
/* Snapshot the live segment selectors into the child's thread struct. */
423 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
424 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
425 asm("mov %%es,%0" : "=m" (p->thread.es));
426 asm("mov %%ds,%0" : "=m" (p->thread.ds));
/* Give the child its own copy of the parent's I/O permission bitmap. */
428 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
429 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
430 if (!p->thread.io_bitmap_ptr) {
431 p->thread.io_bitmap_max = 0;
434 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
439 * Set a new TLS for the child thread?
441 if (clone_flags & CLONE_SETTLS) {
442 #ifdef CONFIG_IA32_EMULATION
443 if (test_thread_flag(TIF_IA32))
444 err = ia32_child_tls(p, childregs);
/* 64-bit path: the TLS base arrives in the child's r8. */
447 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
/* On error, release the half-initialized I/O bitmap. */
453 if (err && p->thread.io_bitmap_ptr) {
454 kfree(p->thread.io_bitmap_ptr);
455 p->thread.io_bitmap_max = 0;
461 * This special macro can be used to load a debugging register
/* Expands to set_debug(thread->debugregN, N) for register number r. */
463 #define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
466 * switch_to(x,y) should switch tasks from x to y.
468 * This could still be optimized:
469 * - fold all the options into a flag word and test it with a single test.
470 * - could test fs/gs bitsliced
/* NOTE(review): the return-type/prototype line preceding __switch_to
 * (original ~line 471-472) is missing from this listing. */
473 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
475 struct thread_struct *prev = &prev_p->thread,
476 *next = &next_p->thread;
477 int cpu = smp_processor_id();
478 struct tss_struct *tss = &per_cpu(init_tss, cpu);
483 * Reload esp0, LDT and the page table pointer:
485 tss->rsp0 = next->rsp0;
489 * This won't pick up thread selector changes, but I guess that is ok.
/* Save outgoing es/ds; reload only when either side is non-zero, since
 * identical zero selectors need no segment load. */
491 asm volatile("mov %%es,%0" : "=m" (prev->es));
492 if (unlikely(next->es | prev->es))
493 loadsegment(es, next->es);
495 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
496 if (unlikely(next->ds | prev->ds))
497 loadsegment(ds, next->ds);
/* fs handling: selector and 64-bit MSR base are managed separately. */
506 asm volatile("movl %%fs,%0" : "=r" (fsindex));
507 /* segment register != 0 always requires a reload.
508 also reload when it has changed.
509 when prev process used 64bit base always reload
510 to avoid an information leak. */
511 if (unlikely(fsindex | next->fsindex | prev->fs)) {
512 loadsegment(fs, next->fsindex);
513 /* check if the user used a selector != 0
514 * if yes clear 64bit base, since overloaded base
515 * is always mapped to the Null selector
520 /* when next process has a 64bit base use it */
522 wrmsrl(MSR_FS_BASE, next->fs);
523 prev->fsindex = fsindex;
/* gs handled the same way, via the KERNEL_GS_BASE MSR. */
527 asm volatile("movl %%gs,%0" : "=r" (gsindex));
528 if (unlikely(gsindex | next->gsindex | prev->gs)) {
529 load_gs_index(next->gsindex);
534 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
535 prev->gsindex = gsindex;
539 * Switch the PDA context.
541 prev->userrsp = read_pda(oldrsp);
542 write_pda(oldrsp, next->userrsp);
543 write_pda(pcurrent, next_p);
544 write_pda(kernelstack,
545 (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
548 * Now maybe reload the debug registers
/* Only bother when the incoming task has breakpoints armed (dr7 != 0);
 * the loaddebug() calls are in lines missing from this listing. */
550 if (unlikely(next->debugreg7)) {
562 * Handle the IO bitmap
564 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
565 if (next->io_bitmap_ptr)
567 * Copy the relevant range of the IO bitmap.
568 * Normally this is 128 bytes or less:
570 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
571 max(prev->io_bitmap_max, next->io_bitmap_max));
574 * Clear any possible leftover bits:
/* 0xff = all ports denied; scrub the range prev had mapped in. */
576 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
584 * sys_execve() executes a new program.
587 long sys_execve(char __user *name, char __user * __user *argv,
588 char __user * __user *envp, struct pt_regs regs)
/* Copy the pathname in from user space; getname returns an ERR_PTR on
 * failure. */
593 filename = getname(name);
594 error = PTR_ERR(filename);
595 if (IS_ERR(filename))
/* NOTE(review): "®s" is mojibake for "&regs" (an HTML "&reg;" entity
 * corruption); restore before compiling. */
597 error = do_execve(filename, argv, envp, ®s);
/* (missing success-check above) clear PT_DTRACE under the task lock. */
600 current->ptrace &= ~PT_DTRACE;
601 task_unlock(current);
/* Put the current task into native 64-bit mode at exec time. */
607 void set_personality_64bit(void)
609 /* inherit personality from parent */
611 /* Make sure to be in 64bit mode */
612 clear_thread_flag(TIF_IA32);
614 /* TBD: overwrites user setup. Should have two bits.
615 But 64bit processes have always behaved this way,
616 so it's not too bad. The main problem is just that
617 32bit childs are affected again. */
618 current->personality &= ~READ_IMPLIES_EXEC;
/* fork(2): delegate to do_fork with SIGCHLD, using the caller's rsp. */
621 asmlinkage long sys_fork(struct pt_regs *regs)
623 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
/* clone(2): forward to do_fork.  NOTE(review): the return-type line and
 * the lines between the prototype and the call (presumably defaulting a
 * zero newsp to regs->rsp — confirm) are missing from this listing. */
627 sys_clone(unsigned long clone_flags, unsigned long newsp,
628 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
632 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
636 * This is trivial, and on the face of it looks like it
637 * could equally well be done in user mode.
639 * Not so, for quite unobvious reasons - register pressure.
640 * In user mode vfork() cannot have a stack frame, and if
641 * done by calling the "clone()" system call directly, you
642 * do not have enough call-clobbered registers to hold all
643 * the information you need.
/* vfork(2): CLONE_VFORK|CLONE_VM blocks the parent and shares the mm. */
645 asmlinkage long sys_vfork(struct pt_regs *regs)
647 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
/*
 * get_wchan - walk @p's saved stack frames to find where a sleeping task
 * is blocked: the first return address outside scheduler code.  Returns
 * 0-ish for tasks with no stable wchan (listing is missing the local
 * declarations and the return statements).
 */
651 unsigned long get_wchan(struct task_struct *p)
/* No stable wchan for a missing, current, or runnable task. */
657 if (!p || p == current || p->state==TASK_RUNNING)
659 stack = (unsigned long)p->thread_info;
/* Sanity-check that the saved rsp lies within the task's kernel stack. */
660 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
662 fp = *(u64 *)(p->thread.rsp);
/* Each saved frame pointer must also stay within the stack. */
664 if (fp < (unsigned long)stack ||
665 fp > (unsigned long)stack+THREAD_SIZE)
/* Return address sits just above the saved frame pointer. */
667 rip = *(u64 *)(fp+8);
668 if (!in_sched_functions(rip))
/* Bound the walk to 16 frames in case the stack is corrupt. */
671 } while (count++ < 16);
/*
 * do_arch_prctl - set or query @task's fs/gs base.  NOTE(review): the
 * switch statement and its ARCH_SET_GS / ARCH_SET_FS / ARCH_GET_FS /
 * ARCH_GET_GS case labels are missing from this listing; the four
 * bodies below are identifiable by the MSR and TLS slot they touch.
 */
675 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
/* Only touch live CPU state when operating on the current task. */
678 int doit = task == current;
/* --- ARCH_SET_GS path (GS_TLS / MSR_KERNEL_GS_BASE) --- */
683 if (addr >= TASK_SIZE_OF(task))
686 /* handle small bases via the GDT because that's faster to
/* Bases that fit in 32 bits go through a TLS descriptor in the GDT. */
688 if (addr <= 0xffffffff) {
689 set_32bit_tls(task, GS_TLS, addr);
691 load_TLS(&task->thread, cpu);
692 load_gs_index(GS_TLS_SEL);
694 task->thread.gsindex = GS_TLS_SEL;
/* Large bases: null selector plus the 64-bit MSR base. */
697 task->thread.gsindex = 0;
698 task->thread.gs = addr;
701 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
/* --- ARCH_SET_FS path (FS_TLS / MSR_FS_BASE), symmetric to GS --- */
707 /* Not strictly needed for fs, but do it for symmetry
709 if (addr >= TASK_SIZE_OF(task))
712 /* handle small bases via the GDT because that's faster to
714 if (addr <= 0xffffffff) {
715 set_32bit_tls(task, FS_TLS, addr);
717 load_TLS(&task->thread, cpu);
718 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
720 task->thread.fsindex = FS_TLS_SEL;
723 task->thread.fsindex = 0;
724 task->thread.fs = addr;
726 /* set the selector to 0 to not confuse
728 asm volatile("movl %0,%%fs" :: "r" (0));
729 ret = checking_wrmsrl(MSR_FS_BASE, addr);
/* --- ARCH_GET_FS path: report the base back through *addr --- */
736 if (task->thread.fsindex == FS_TLS_SEL)
737 base = read_32bit_tls(task, FS_TLS);
739 rdmsrl(MSR_FS_BASE, base);
741 base = task->thread.fs;
742 ret = put_user(base, (unsigned long __user *)addr);
/* --- ARCH_GET_GS path, symmetric --- */
747 if (task->thread.gsindex == GS_TLS_SEL)
748 base = read_32bit_tls(task, GS_TLS);
750 rdmsrl(MSR_KERNEL_GS_BASE, base);
752 base = task->thread.gs;
753 ret = put_user(base, (unsigned long __user *)addr);
/* arch_prctl(2) entry point: operate on the calling task. */
765 long sys_arch_prctl(int code, unsigned long addr)
767 return do_arch_prctl(current, code, addr);
771 * Capture the user space registers if the task is not running (in user space)
773 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
775 struct pt_regs *pp, ptregs;
/* User-mode pt_regs sit at the top of the kernel stack (rsp0); the
 * adjustment/copy into ptregs (original lines 778-783) is missing here. */
777 pp = (struct pt_regs *)(tsk->thread.rsp0);
784 elf_core_copy_regs(regs, &ptregs);
789 unsigned long arch_align_stack(unsigned long sp)
791 if (randomize_va_space)
792 sp -= get_random_int() % 8192;