/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
 *
 *  This file handles the architecture-dependent parts of process handling.
 */
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/proto.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static atomic_t hlt_counter = ATOMIC_INIT(0);

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
{
	atomic_inc(&hlt_counter);
}

EXPORT_SYMBOL(disable_hlt);

void enable_hlt(void)
{
	atomic_dec(&hlt_counter);
}

EXPORT_SYMBOL(enable_hlt);
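/*
 * Usage sketch (illustrative, not a caller in this file): a driver with a
 * timing-critical window can keep CPUs out of HLT around it, assuming a
 * hypothetical my_device_program() doing the latency-sensitive work:
 *
 *	disable_hlt();
 *	my_device_program();	(must not be delayed by HLT wakeup latency)
 *	enable_hlt();
 *
 * Calls must be balanced: default_idle() uses HLT again only once every
 * disable_hlt() has been paired with an enable_hlt().
 */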
/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	if (!atomic_read(&hlt_counter)) {
		local_irq_disable();
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
}
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	int oldval;

	local_irq_enable();

	/*
	 * Deal with another CPU just having chosen a thread to
	 * run here:
	 */
	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);

	if (!oldval) {
		set_thread_flag(TIF_POLLING_NRFLAG);
		asm volatile("2: testl %0,%1; rep; nop; je 2b;"
			: :
			"i" (_TIF_NEED_RESCHED),
			"m" (current_thread_info()->flags));
	} else
		set_need_resched();
}
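/*
 * Background note: TIF_POLLING_NRFLAG advertises that this idle loop polls
 * need_resched itself, so the scheduler's resched_task() can usually skip
 * the cross-CPU wakeup IPI and just set the flag; the testl/je spin above
 * then notices it on its own.
 */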
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
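/*
 * Typical use (sketch, based on how pm code elsewhere in the tree drives
 * this export): after switching the idle handler, wait until every CPU has
 * left the old one before it may be freed or patched away:
 *
 *	pm_idle = my_new_idle;		(my_new_idle is a hypothetical handler)
 *	cpu_idle_wait();
 */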
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;
			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			idle();
		}
		schedule();
	}
}
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
	local_irq_enable();

	if (!need_resched()) {
		set_thread_flag(TIF_POLLING_NRFLAG);
		do {
			__monitor((void *)&current_thread_info()->flags, 0, 0);
			if (need_resched())
				break;
			__mwait(0, 0);
		} while (!need_resched());
		clear_thread_flag(TIF_POLLING_NRFLAG);
	}
}
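/*
 * Background note: __monitor() arms address monitoring on the cache line
 * that holds the thread flags (both extension and hint arguments are 0),
 * and __mwait(0, 0) then idles until that line is written, for example by
 * a remote CPU setting TIF_NEED_RESCHED, or until an interrupt arrives,
 * so no wakeup IPI is needed.
 */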
void __init select_idle_routine(const struct cpuinfo_x86 *c)
{
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => All CPUs support mwait
		 */
		if (!pm_idle) {
			printk("using mwait in idle threads.\n");
			pm_idle = mwait_idle;
		}
	}
}
static int __init idle_setup(char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}

	boot_option_idle_override = 1;
	return 1;
}

__setup("idle=", idle_setup);
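/*
 * Command-line example: booting with "idle=poll" selects poll_idle() above;
 * boot_option_idle_override records that the user forced an idle routine so
 * that code elsewhere (ACPI idle setup, for instance) can see the override.
 */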
/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("Pid: %d, comm: %.20s %s %s\n",
	       current->pid, current->comm, print_tainted(), system_utsname.release);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
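/*
 * Background note: the "knlGS" value is whatever sits in MSR_KERNEL_GS_BASE,
 * i.e. the inactive GS base that swapgs will exchange with the active one on
 * the next kernel entry or exit; while running in the kernel this is normally
 * the user-space GS base.
 */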
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(&regs->rsp);
}
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	/*
	 * Remove function-return probe instances associated with this task
	 * and put them back on the free list. Do not insert an exit probe for
	 * this function, it will be disabled by kprobe_flush_task if you do.
	 */
	kprobe_flush_task(me);

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

	/*
	 * Remove function-return probe instances associated with this task
	 * and put them back on the free list. Do not insert an exit probe for
	 * this function, it will be disabled by kprobe_flush_task if you do.
	 */
	kprobe_flush_task(tsk);

	if (t->flags & _TIF_ABI_PENDING)
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}
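/*
 * Background note: a GDT/TLS descriptor stores its 32-bit base address in
 * three pieces (base0 = bits 0-15, base1 = bits 16-23, base2 = bits 24-31),
 * which is why read_32bit_tls() has to reassemble it with shifts and ORs.
 */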
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;

	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_ti_thread_flag(p->thread_info, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
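/*
 * Usage sketch: inside __switch_to() below the incoming thread's saved
 * debug registers are restored one by one, e.g.
 *
 *	loaddebug(next, 0);
 *	...
 *	loaddebug(next, 7);
 *
 * which the token pasting above turns into set_debug(next->debugreg0, 0)
 * and so on.
 */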
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 */
struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	unlazy_fpu(prev_p);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;
	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);
	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}
	/*
	 * Switch the PDA context.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);
	write_pda(kernelstack,
		  (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
	/*
	 * Now maybe reload the debug registers
	 */
	if (unlikely(next->debugreg7)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}
	/*
	 * Handle the IO bitmap
	 */
	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
		if (next->io_bitmap_ptr)
			/*
			 * Copy the relevant range of the IO bitmap.
			 * Normally this is 128 bytes or less:
			 */
			memcpy(tss->io_bitmap, next->io_bitmap_ptr,
				max(prev->io_bitmap_max, next->io_bitmap_max));
		else {
			/*
			 * Clear any possible leftover bits:
			 */
			memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
		}
	}

	return prev_p;
}
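/*
 * Background note: the TSS I/O permission bitmap handled above is what the
 * CPU consults when user space executes in/out after ioperm(); a set bit
 * means "fault", so filling it with 0xff revokes every port for tasks that
 * have no bitmap of their own.
 */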
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)p->thread_info;
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
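/*
 * Background note: this walk relies on the kernel being built with frame
 * pointers. Each frame stores the caller's %rbp at fp and the return address
 * at fp+8, so the loop above follows the rbp chain until it finds a return
 * address outside the scheduler (the "wait channel" reported through
 * /proc/<pid>/wchan), giving up after 16 frames.
 */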
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit)
			rdmsrl(MSR_KERNEL_GS_BASE, base);
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
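/*
 * User-space example (sketch): on x86-64 a threading library typically sets
 * its TLS base through this syscall, e.g.
 *
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tls_block);
 *
 * (tls_block being a hypothetical per-thread buffer), which ends up in
 * do_arch_prctl() above with task == current.
 */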
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = (struct pt_regs *)(tsk->thread.rsp0);
	--pp;

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}
unsigned long arch_align_stack(unsigned long sp)
{
	if (randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}