/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 *
 *  This file handles the architecture-dependent parts of process handling..
 */
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/proto.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static atomic_t hlt_counter = ATOMIC_INIT(0);

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
{
	atomic_inc(&hlt_counter);
}

EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
	atomic_dec(&hlt_counter);
}

EXPORT_SYMBOL(enable_hlt);
/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	if (!atomic_read(&hlt_counter)) {
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	/*
	 * Deal with another CPU just having chosen a thread to
	 * run here:
	 */
	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

	set_thread_flag(TIF_POLLING_NRFLAG);
	asm volatile(
		"2:"
		"testl %0,%1;"
		"rep; nop;"
		"je 2b;"
		: :
		"i" (_TIF_NEED_RESCHED),
		"m" (current_thread_info()->flags));
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	do {
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
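/*
 * Added note: cpu_idle_wait() is used after pm_idle is changed. It waits
 * until every online CPU has passed through its idle loop once more
 * (clearing its cpu_idle_state flag), so no CPU can still be executing the
 * old idle routine when the caller returns.
 */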
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
	local_irq_enable();

	if (!need_resched()) {
		set_thread_flag(TIF_POLLING_NRFLAG);
		do {
			__monitor((void *)&current_thread_info()->flags, 0, 0);
			if (need_resched())
				break;
			__mwait(0, 0);
		} while (!need_resched());
		clear_thread_flag(TIF_POLLING_NRFLAG);
	}
}
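/*
 * Added note: the need_resched() check sits between __monitor() and
 * __mwait() on purpose. Once the monitor is armed on the thread flags, a
 * write that sets TIF_NEED_RESCHED either makes the check bail out or
 * wakes the subsequent MWAIT, so no wakeup can be lost.
 */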
void __init select_idle_routine(const struct cpuinfo_x86 *c)
{
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait.
		 */
		if (!boot_option_idle_override) {
			printk("using mwait in idle threads.\n");
			pm_idle = mwait_idle;
		}
	}
}

static int __init idle_setup (char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}
	boot_option_idle_override = 1;
	return 1;
}

__setup("idle=", idle_setup);
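/*
 * Example: booting with "idle=poll" on the kernel command line installs
 * poll_idle() as pm_idle and sets boot_option_idle_override, so
 * select_idle_routine() will not replace it with mwait_idle() later.
 */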
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex,gsindex;
	unsigned int ds,cs,es;

	printk("Pid: %d, comm: %.20s %s %s\n",
		current->pid, current->comm, print_tainted(), system_utsname.release);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
		regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
		regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
		regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
		regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
		regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
		fs,fsindex,gs,gsindex,shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
void show_regs(struct pt_regs *regs)
{
	__show_regs(regs);
	show_trace(&regs->rsp);
}
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

	if (t->flags & _TIF_ABI_PENDING)
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm->context.size) {
		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}
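/*
 * Example: for a 32-bit base of 0x12345678 the descriptor fields hold
 * base0 = 0x5678 (bits 0-15), base1 = 0x34 (bits 16-23) and base2 = 0x12
 * (bits 24-31); the expression above reassembles the full base address.
 */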
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct * p, struct pt_regs * regs)
{
	int err = 0;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;

	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_ti_thread_flag(p->thread_info, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
	}

	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
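/*
 * For example, loaddebug(next, 7) token-pastes the register number into
 * the field name and expands to set_debug(next->debugreg7, 7).
 */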
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 */
struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);
	asm volatile("movl %%fs,%0" : "=r" (fsindex));
	/* segment register != 0 always requires a reload.
	   also reload when it has changed.
	   when prev process used 64bit base always reload
	   to avoid an information leak. */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/* check if the user used a selector != 0
		 * if yes clear 64bit base, since overloaded base
		 * is always mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	asm volatile("movl %%gs,%0" : "=r" (gsindex));
	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/*
	 * Switch the PDA context.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);
	write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
	/*
	 * Now maybe reload the debug registers
	 */
	if (unlikely(next->debugreg7)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	/*
	 * Handle the IO bitmap
	 */
	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
		if (next->io_bitmap_ptr)
			/*
			 * Copy the relevant range of the IO bitmap.
			 * Normally this is 128 bytes or less:
			 */
			memcpy(tss->io_bitmap, next->io_bitmap_ptr,
				max(prev->io_bitmap_max, next->io_bitmap_max));
		else {
			/*
			 * Clear any possible leftover bits:
			 */
			memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
		}
	}

	return prev_p;
}
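/*
 * Added note on the I/O bitmap switch above: copying max(prev, next) bytes
 * both installs the incoming task's bitmap and overwrites any bytes a
 * larger previous bitmap left behind; when the next task has no bitmap,
 * filling with 0xff marks every port as denied in the TSS.
 */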
/*
 * sys_execve() executes a new program.
 */
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)p->thread_info;
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
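/*
 * Added note: the walk above relies on the standard frame-pointer layout.
 * At a saved frame pointer fp, *(fp) holds the caller's frame pointer and
 * *(fp + 8) holds the return address, which is why rip is read from fp+8
 * before following fp to the next frame.
 */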
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE)
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit)
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE)
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit)
			rdmsrl(MSR_KERNEL_GS_BASE, base);
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
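/*
 * Illustrative use from user space (a sketch, not part of this file):
 * a 64-bit thread library could set its TLS base with
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *
 * and read it back with ARCH_GET_FS, which stores the base through the
 * user pointer passed as addr.
 */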
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = (struct pt_regs *)(tsk->thread.rsp0);
	pp--;
	ptregs = *pp;

	elf_core_copy_regs(regs, &ptregs);
	return 1;
}
unsigned long arch_align_stack(unsigned long sp)
{
	if (randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
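/*
 * Added note: the subtraction randomizes the initial user stack top by up
 * to 8 KB; masking with ~0xf afterwards keeps the result 16-byte aligned,
 * matching the x86-64 ABI's stack alignment.
 */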