2 * Copyright (C) 1995 Linus Torvalds
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
10 * CPU hotplug support - ashok.raj@intel.com
14 * This file handles the architecture-dependent parts of process handling..
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
23 #include <linux/kernel.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/a.out.h>
30 #include <linux/interrupt.h>
31 #include <linux/utsname.h>
32 #include <linux/delay.h>
33 #include <linux/module.h>
34 #include <linux/ptrace.h>
35 #include <linux/random.h>
36 #include <linux/notifier.h>
37 #include <linux/kprobes.h>
38 #include <linux/kdebug.h>
39 #include <linux/tick.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/system.h>
45 #include <asm/processor.h>
47 #include <asm/mmu_context.h>
49 #include <asm/prctl.h>
51 #include <asm/proto.h>
/*
 * File-scope state for the x86-64 process/idle code.
 * NOTE(review): this is a sampled excerpt -- the embedded line numbers
 * show gaps, so some declarations/comment delimiters are missing here.
 */
55 asmlinkage extern void ret_from_fork(void);
/* Default clone flags for kernel threads: share mm, never ptraced. */
57 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
/* Set non-zero when "idle=" on the command line overrides idle selection. */
59 unsigned long boot_option_idle_override = 0;
60 EXPORT_SYMBOL(boot_option_idle_override);
63 * Powermanagement idle function, if any..
65 void (*pm_idle)(void);
66 EXPORT_SYMBOL(pm_idle);
/* Per-CPU flag used by cpu_idle_wait() to detect entry into the idle loop. */
67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
/* Notifier chain fired on idle entry/exit (IDLE_START / IDLE_END below). */
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
/*
 * Idle notifier plumbing.
 * NOTE(review): fragmentary excerpt -- braces and several statements of
 * the original functions (enter_idle/exit_idle) are missing from view.
 */
71 void idle_notifier_register(struct notifier_block *n)
/* Add n to the atomic idle_notifier chain. */
73 atomic_notifier_chain_register(&idle_notifier, n);
/* Fired when a CPU enters idle (presumably from enter_idle() -- its
 * signature is not visible in this excerpt). */
79 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
82 static void __exit_idle(void)
/* Only notify once: bail if the per-CPU isidle bit was already clear. */
84 if (test_and_clear_bit_pda(0, isidle) == 0)
86 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
89 /* Called from interrupts to signify idle end */
92 /* idle loop has pid 0 */
/*
 * default_idle -- HLT-based idle routine, used when no better (MWAIT,
 * poll) routine was selected.
 * NOTE(review): fragmentary excerpt -- the ktime_get() calls that fill
 * t0/t1 and the memory barrier are among the missing lines.
 */
99 * We use this if we don't have any better
102 void default_idle(void)
/* Clear TS_POLLING: we are about to HLT, so a wakeup needs a real IPI. */
104 current_thread_info()->status &= ~TS_POLLING;
106 * TS_POLLING-cleared state must be visible before we
111 if (!need_resched()) {
116 t0n = ktime_to_ns(t0);
117 safe_halt(); /* enables interrupts racelessly */
120 t1n = ktime_to_ns(t1);
/* Report how long we slept to the scheduler clock. */
121 sched_clock_idle_wakeup_event(t1n - t0n);
124 current_thread_info()->status |= TS_POLLING;
/*
 * poll_idle and the CPU-hotplug play_dead pair.
 * NOTE(review): fragmentary excerpt -- poll_idle's body and the #else
 * branch separating the two play_dead definitions are missing lines.
 */
128 * On SMP it's slightly faster (but much more power-consuming!)
129 * to poll the ->need_resched flag instead of waiting for the
130 * cross-CPU IPI to arrive. Use this option with caution.
132 static void poll_idle(void)
138 #ifdef CONFIG_HOTPLUG_CPU
139 DECLARE_PER_CPU(int, cpu_state);
142 /* We halt the CPU with physical CPU hotplug */
143 static inline void play_dead(void)
/* Advertise this CPU as dead so the hotplug code can proceed. */
149 __get_cpu_var(cpu_state) = CPU_DEAD;
/* Non-hotplug stub (presumably BUG()s -- body not visible here). */
156 static inline void play_dead(void)
160 #endif /* CONFIG_HOTPLUG_CPU */
/*
 * cpu_idle -- the per-CPU idle loop (pid 0).
 * NOTE(review): fragmentary excerpt -- the outer for(;;), rmb(), the
 * pm_idle selection/call, enter_idle()/__exit_idle() calls and the
 * schedule() call are among the missing lines.
 */
163 * The idle thread. There's no useful work to be
164 * done, so just try to conserve power and have a
165 * low exit latency (ie sit in a loop waiting for
166 * somebody to say that they'd like to reschedule)
170 current_thread_info()->status |= TS_POLLING;
171 /* endless idle loop with no priority at all */
173 while (!need_resched()) {
/* Acknowledge cpu_idle_wait(): clear the per-CPU flag it set. */
176 if (__get_cpu_var(cpu_idle_state))
177 __get_cpu_var(cpu_idle_state) = 0;
179 tick_nohz_stop_sched_tick();
/* An offlined CPU never returns from play_dead (hotplug path). */
185 if (cpu_is_offline(smp_processor_id()))
188 * Idle routines should keep interrupts disabled
189 * from here on, until they go to idle.
190 * Otherwise, idle callbacks can misfire.
195 /* In many cases the interrupt that ended idle
196 has already called exit_idle. But some idle
197 loops can be woken up without interrupt. */
201 tick_nohz_restart_sched_tick();
202 preempt_enable_no_resched();
/*
 * cpu_idle_wait -- wait until every online CPU has (re)entered its idle
 * loop, so a new pm_idle handler is guaranteed to be in effect everywhere.
 * NOTE(review): fragmentary excerpt -- the put_cpu(), the smp_mb()/IPI
 * kick that starts the handshake, and the per-iteration msleep are
 * among the missing lines.
 */
208 static void do_nothing(void *unused)
212 void cpu_idle_wait(void)
214 unsigned int cpu, this_cpu = get_cpu();
215 cpumask_t map, tmp = current->cpus_allowed;
/* Pin ourselves to this CPU for the duration of the handshake. */
217 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
/* Arm the flag each idle loop clears when it next runs. */
221 for_each_online_cpu(cpu) {
222 per_cpu(cpu_idle_state, cpu) = 1;
226 __get_cpu_var(cpu_idle_state) = 0;
231 for_each_online_cpu(cpu) {
232 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
235 cpus_and(map, map, cpu_online_map);
237 * We waited 1 sec, if a CPU still did not call idle
238 * it may be because it is in idle and not waking up
239 * because it has nothing to do.
240 * Give all the remaining CPUS a kick.
242 smp_call_function_mask(map, do_nothing, 0, 0);
243 } while (!cpus_empty(map));
/* Restore the caller's original CPU affinity. */
245 set_cpus_allowed(current, tmp);
247 EXPORT_SYMBOL_GPL(cpu_idle_wait);
250 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
251 * which can obviate IPI to trigger checking of need_resched.
252 * We execute MONITOR against need_resched and enter optimized wait state
253 * through MWAIT. Whenever someone changes need_resched, we would be woken
254 * up from MWAIT (without an IPI).
256 * New with Core Duo processors, MWAIT can take some hints based on CPU
259 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
261 if (!need_resched()) {
262 __monitor((void *)¤t_thread_info()->flags, 0, 0);
269 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
270 static void mwait_idle(void)
272 if (!need_resched()) {
273 __monitor((void *)¤t_thread_info()->flags, 0, 0);
/*
 * Boot-time idle routine selection.
 * NOTE(review): fragmentary excerpt -- the printk_once guard, the
 * poll/mwait assignments in idle_setup, and its return statement are
 * among the missing lines.
 */
284 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
287 if (cpu_has(c, X86_FEATURE_MWAIT)) {
289 * Skip, if setup has overridden idle.
290 * One CPU supports mwait => All CPUs supports mwait
294 printk(KERN_INFO "using mwait in idle threads.\n");
297 pm_idle = mwait_idle;
/* Parse the "idle=" kernel command-line parameter. */
302 static int __init idle_setup(char *str)
304 if (!strcmp(str, "poll")) {
305 printk("using polling idle threads.\n");
307 } else if (!strcmp(str, "mwait"))
/* Remember the override so select_idle_routine() won't undo it. */
312 boot_option_idle_override = 1;
315 early_param("idle", idle_setup);
/*
 * __show_regs / show_regs -- dump the register state (pt_regs plus
 * control, segment and debug registers read directly from the CPU).
 * NOTE(review): fragmentary excerpt -- the read_crN()/get_debugreg()
 * calls that fill cr0..cr4 and d0..d7 are among the missing lines.
 */
317 /* Prints also some state that isn't saved in the pt_regs */
318 void __show_regs(struct pt_regs * regs)
320 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
321 unsigned long d0, d1, d2, d3, d6, d7;
322 unsigned int fsindex, gsindex;
323 unsigned int ds, cs, es;
327 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
328 current->pid, current->comm, print_tainted(),
329 init_utsname()->release,
330 (int)strcspn(init_utsname()->version, " "),
331 init_utsname()->version);
332 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
333 printk_address(regs->ip);
334 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
336 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
337 regs->ax, regs->bx, regs->cx);
338 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
339 regs->dx, regs->si, regs->di);
340 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
341 regs->bp, regs->r8, regs->r9);
342 printk("R10: %016lx R11: %016lx R12: %016lx\n",
343 regs->r10, regs->r11, regs->r12);
344 printk("R13: %016lx R14: %016lx R15: %016lx\n",
345 regs->r13, regs->r14, regs->r15);
/* Segment selectors are read live from the CPU, not from pt_regs. */
347 asm("movl %%ds,%0" : "=r" (ds));
348 asm("movl %%cs,%0" : "=r" (cs));
349 asm("movl %%es,%0" : "=r" (es));
350 asm("movl %%fs,%0" : "=r" (fsindex));
351 asm("movl %%gs,%0" : "=r" (gsindex));
/* The 64-bit FS/GS bases live in MSRs, not in the selectors. */
353 rdmsrl(MSR_FS_BASE, fs);
354 rdmsrl(MSR_GS_BASE, gs);
355 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
362 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
363 fs,fsindex,gs,gsindex,shadowgs);
364 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
365 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
370 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
374 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
/* Public wrapper: prefix with the CPU id and append a stack trace. */
377 void show_regs(struct pt_regs *regs)
379 printk("CPU %d:", smp_processor_id());
381 show_trace(NULL, regs, (void *)(regs + 1));
/*
 * Thread teardown and flush paths.
 * NOTE(review): fragmentary excerpt -- put_cpu(), clear_fpu()/
 * clear_used_math(), and several closing braces are among the
 * missing lines.
 */
385 * Free current thread data structures etc..
387 void exit_thread(void)
389 struct task_struct *me = current;
390 struct thread_struct *t = &me->thread;
392 if (me->thread.io_bitmap_ptr) {
393 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
/* Free the ioperm bitmap and drop the per-task flag. */
395 kfree(t->io_bitmap_ptr);
396 t->io_bitmap_ptr = NULL;
397 clear_thread_flag(TIF_IO_BITMAP);
399 * Careful, clear this in the TSS too:
401 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
402 t->io_bitmap_max = 0;
/* Called at execve(): reset per-thread state for the new image. */
407 void flush_thread(void)
409 struct task_struct *tsk = current;
/* Apply a pending 32/64-bit ABI switch requested by the exec path. */
411 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
412 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
413 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
414 clear_tsk_thread_flag(tsk, TIF_IA32);
416 set_tsk_thread_flag(tsk, TIF_IA32);
417 current_thread_info()->status |= TS_COMPAT;
420 clear_tsk_thread_flag(tsk, TIF_DEBUG);
/* Zero all hardware debug registers and the TLS slots. */
422 tsk->thread.debugreg0 = 0;
423 tsk->thread.debugreg1 = 0;
424 tsk->thread.debugreg2 = 0;
425 tsk->thread.debugreg3 = 0;
426 tsk->thread.debugreg6 = 0;
427 tsk->thread.debugreg7 = 0;
428 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
430 * Forget coprocessor state..
/* Warn if a dying task still owns an LDT (it should have been freed). */
436 void release_thread(struct task_struct *dead_task)
439 if (dead_task->mm->context.size) {
440 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
442 dead_task->mm->context.ldt,
443 dead_task->mm->context.size);
/*
 * 32-bit TLS descriptor helpers used by do_arch_prctl() for small
 * (<4GB) FS/GS bases.
 * NOTE(review): fragmentary excerpt -- the user_desc initializer
 * fields and the fill_ldt/descriptor-store lines are missing here.
 */
449 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
451 struct user_desc ud = {
458 struct desc_struct *desc = (void *)t->thread.tls_array;
/* Read back the base address stored in a TLS descriptor slot. */
463 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
465 return get_desc_base(&t->thread.tls_array[tls]);
469 * This gets called before we allocate a new thread and copy
470 * the current task into it.
472 void prepare_to_copy(struct task_struct *tsk)
/*
 * copy_thread -- set up the new task's kernel stack, registers and
 * per-thread state at fork/clone time.
 * NOTE(review): fragmentary excerpt -- the *childregs = *regs copy,
 * childregs->ax = 0, the savesegment of the child's ss, the
 * io_bitmap_max assignment, CLONE_SETTLS error paths and the final
 * return are among the missing lines.
 */
477 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
478 unsigned long unused,
479 struct task_struct * p, struct pt_regs * regs)
482 struct pt_regs * childregs;
483 struct task_struct *me = current;
/* Child's pt_regs live at the top of its kernel stack. */
485 childregs = ((struct pt_regs *)
486 (THREAD_SIZE + task_stack_page(p))) - 1;
492 childregs->sp = (unsigned long)childregs;
494 p->thread.sp = (unsigned long) childregs;
495 p->thread.sp0 = (unsigned long) (childregs+1);
496 p->thread.usersp = me->thread.usersp;
/* Mark the task so ret_from_fork knows it is a freshly forked child. */
498 set_tsk_thread_flag(p, TIF_FORK);
500 p->thread.fs = me->thread.fs;
501 p->thread.gs = me->thread.gs;
/* Snapshot the parent's live segment selectors for the child. */
503 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
504 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
505 asm("mov %%es,%0" : "=m" (p->thread.es));
506 asm("mov %%ds,%0" : "=m" (p->thread.ds));
/* Duplicate the parent's ioperm bitmap, if it has one. */
508 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
509 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
510 if (!p->thread.io_bitmap_ptr) {
511 p->thread.io_bitmap_max = 0;
514 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
516 set_tsk_thread_flag(p, TIF_IO_BITMAP);
520 * Set a new TLS for the child thread?
522 if (clone_flags & CLONE_SETTLS) {
523 #ifdef CONFIG_IA32_EMULATION
524 if (test_thread_flag(TIF_IA32))
525 err = do_set_thread_area(p, -1,
526 (struct user_desc __user *)childregs->si, 0);
529 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
/* On error, undo the io_bitmap allocation made above. */
535 if (err && p->thread.io_bitmap_ptr) {
536 kfree(p->thread.io_bitmap_ptr);
537 p->thread.io_bitmap_max = 0;
/*
 * __switch_to_xtra -- slow-path context-switch work: debugctl/DS-area
 * MSRs, hardware debug registers, I/O bitmaps and BTS timestamps.
 * Only called when either task has a _TIF_WORK_CTXSW flag set.
 * NOTE(review): fragmentary excerpt -- the loaddebug() calls inside
 * the TIF_DEBUG branch are among the missing lines.
 */
543 * This special macro can be used to load a debugging register
545 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
547 static inline void __switch_to_xtra(struct task_struct *prev_p,
548 struct task_struct *next_p,
549 struct tss_struct *tss)
551 struct thread_struct *prev, *next;
552 unsigned long debugctl;
554 prev = &prev_p->thread,
555 next = &next_p->thread;
557 debugctl = prev->debugctlmsr;
558 if (next->ds_area_msr != prev->ds_area_msr) {
559 /* we clear debugctl to make sure DS
560 * is not in use when we change it */
562 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
563 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
566 if (next->debugctlmsr != debugctl)
567 wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
569 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
579 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
581 * Copy the relevant range of the IO bitmap.
582 * Normally this is 128 bytes or less:
584 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
585 max(prev->io_bitmap_max, next->io_bitmap_max));
586 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
588 * Clear any possible leftover bits:
590 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
/* Branch-trace-store bookkeeping for ptrace BTS users. */
593 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
594 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
596 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
597 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
/*
 * __switch_to -- the x86-64 context switch core.  Saves the outgoing
 * task's segment state, loads the incoming task's, switches the PDA
 * and kernel stack, and defers slow-path work to __switch_to_xtra().
 * NOTE(review): fragmentary excerpt -- load_sp0()/load_TLS(), the
 * fsindex/gsindex declarations, the prev->fs/gs clearing inside the
 * reload branches, unlazy_fpu(), and the final "return prev_p" are
 * among the missing lines.  This code is highly order-sensitive
 * (comments below in the original say so), hence no restyle.
 */
601 * switch_to(x,y) should switch tasks from x to y.
603 * This could still be optimized:
604 * - fold all the options into a flag word and test it with a single test.
605 * - could test fs/gs bitsliced
607 * Kprobes not supported here. Set the probe on schedule instead.
610 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
612 struct thread_struct *prev = &prev_p->thread,
613 *next = &next_p->thread;
614 int cpu = smp_processor_id();
615 struct tss_struct *tss = &per_cpu(init_tss, cpu);
617 /* we're going to use this soon, after a few expensive things */
618 if (next_p->fpu_counter>5)
619 prefetch(&next->i387.fxsave);
622 * Reload esp0, LDT and the page table pointer:
628 * This won't pick up thread selector changes, but I guess that is ok.
630 asm volatile("mov %%es,%0" : "=m" (prev->es));
631 if (unlikely(next->es | prev->es))
632 loadsegment(es, next->es);
634 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
635 if (unlikely(next->ds | prev->ds))
636 loadsegment(ds, next->ds);
645 asm volatile("movl %%fs,%0" : "=r" (fsindex));
646 /* segment register != 0 always requires a reload.
647 also reload when it has changed.
648 when prev process used 64bit base always reload
649 to avoid an information leak. */
650 if (unlikely(fsindex | next->fsindex | prev->fs)) {
651 loadsegment(fs, next->fsindex);
652 /* check if the user used a selector != 0
653 * if yes clear 64bit base, since overloaded base
654 * is always mapped to the Null selector
659 /* when next process has a 64bit base use it */
661 wrmsrl(MSR_FS_BASE, next->fs);
662 prev->fsindex = fsindex;
666 asm volatile("movl %%gs,%0" : "=r" (gsindex));
667 if (unlikely(gsindex | next->gsindex | prev->gs)) {
668 load_gs_index(next->gsindex);
673 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
674 prev->gsindex = gsindex;
677 /* Must be after DS reload */
681 * Switch the PDA and FPU contexts.
683 prev->usersp = read_pda(oldrsp);
684 write_pda(oldrsp, next->usersp);
685 write_pda(pcurrent, next_p);
687 write_pda(kernelstack,
688 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
689 #ifdef CONFIG_CC_STACKPROTECTOR
690 write_pda(stack_canary, next_p->stack_canary);
692 * Build time only check to make sure the stack_canary is at
693 * offset 40 in the pda; this is a gcc ABI requirement
695 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
699 * Now maybe reload the debug registers and handle I/O bitmaps
701 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
702 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
703 __switch_to_xtra(prev_p, next_p, tss);
705 /* If the task has used fpu the last 5 timeslices, just do a full
706 * restore of the math state immediately to avoid the trap; the
707 * chances of needing FPU soon are obviously high now
709 if (next_p->fpu_counter>5)
710 math_state_restore();
715 * sys_execve() executes a new program.
718 long sys_execve(char __user *name, char __user * __user *argv,
719 char __user * __user *envp, struct pt_regs regs)
724 filename = getname(name);
725 error = PTR_ERR(filename);
726 if (IS_ERR(filename))
728 error = do_execve(filename, argv, envp, ®s);
/*
 * Personality setup and the fork/clone/vfork syscall entry points --
 * all thin wrappers around do_fork().
 * NOTE(review): fragmentary excerpt -- braces, the "asmlinkage long"
 * line of sys_clone, the "if (!newsp) newsp = regs->sp;" default, and
 * sys_vfork's trailing "NULL, NULL);" are among the missing lines.
 */
733 void set_personality_64bit(void)
735 /* inherit personality from parent */
737 /* Make sure to be in 64bit mode */
738 clear_thread_flag(TIF_IA32);
740 /* TBD: overwrites user setup. Should have two bits.
741 But 64bit processes have always behaved this way,
742 so it's not too bad. The main problem is just that
743 32bit childs are affected again. */
744 current->personality &= ~READ_IMPLIES_EXEC;
747 asmlinkage long sys_fork(struct pt_regs *regs)
749 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
753 sys_clone(unsigned long clone_flags, unsigned long newsp,
754 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
758 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
762 * This is trivial, and on the face of it looks like it
763 * could equally well be done in user mode.
765 * Not so, for quite unobvious reasons - register pressure.
766 * In user mode vfork() cannot have a stack frame, and if
767 * done by calling the "clone()" system call directly, you
768 * do not have enough call-clobbered registers to hold all
769 * the information you need.
771 asmlinkage long sys_vfork(struct pt_regs *regs)
773 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
/*
 * get_wchan -- walk a sleeping task's saved frame pointers (up to 16
 * frames) to find the first return address outside the scheduler,
 * i.e. where the task is blocked.  Returns 0 if not determinable.
 * NOTE(review): fragmentary excerpt -- the local declarations, the
 * ip = *(u64 *)(fp+8) read, fp = *(u64 *)fp advance, and the return
 * statements are among the missing lines.
 */
777 unsigned long get_wchan(struct task_struct *p)
/* Only meaningful for a task that is asleep and not ourselves. */
783 if (!p || p == current || p->state==TASK_RUNNING)
785 stack = (unsigned long)task_stack_page(p);
/* Sanity: saved sp must lie within the task's own kernel stack. */
786 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
788 fp = *(u64 *)(p->thread.sp);
790 if (fp < (unsigned long)stack ||
791 fp > (unsigned long)stack+THREAD_SIZE)
794 if (!in_sched_functions(ip))
797 } while (count++ < 16);
/*
 * do_arch_prctl -- implement ARCH_SET_FS/ARCH_SET_GS/ARCH_GET_FS/
 * ARCH_GET_GS: set or read a task's 64-bit FS/GS base.  Small bases
 * (<= 0xffffffff) go through a GDT TLS descriptor (faster to switch);
 * large bases go through the FS_BASE/KERNEL_GS_BASE MSRs.
 * NOTE(review): fragmentary excerpt -- the switch(code) statement,
 * get_cpu()/put_cpu() pairs, several else branches and the default/
 * return paths are among the missing lines.
 */
801 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
/* MSR writes below can only target the currently running task. */
804 int doit = task == current;
809 if (addr >= TASK_SIZE_OF(task))
812 /* handle small bases via the GDT because that's faster to
814 if (addr <= 0xffffffff) {
815 set_32bit_tls(task, GS_TLS, addr);
817 load_TLS(&task->thread, cpu);
818 load_gs_index(GS_TLS_SEL);
820 task->thread.gsindex = GS_TLS_SEL;
823 task->thread.gsindex = 0;
824 task->thread.gs = addr;
827 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
833 /* Not strictly needed for fs, but do it for symmetry
835 if (addr >= TASK_SIZE_OF(task))
838 /* handle small bases via the GDT because that's faster to
840 if (addr <= 0xffffffff) {
841 set_32bit_tls(task, FS_TLS, addr);
843 load_TLS(&task->thread, cpu);
844 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
846 task->thread.fsindex = FS_TLS_SEL;
849 task->thread.fsindex = 0;
850 task->thread.fs = addr;
852 /* set the selector to 0 to not confuse
854 asm volatile("movl %0,%%fs" :: "r" (0));
855 ret = checking_wrmsrl(MSR_FS_BASE, addr);
/* ARCH_GET_FS: read the base back out to user space at *addr. */
862 if (task->thread.fsindex == FS_TLS_SEL)
863 base = read_32bit_tls(task, FS_TLS);
865 rdmsrl(MSR_FS_BASE, base);
867 base = task->thread.fs;
868 ret = put_user(base, (unsigned long __user *)addr);
/* ARCH_GET_GS: same, for the GS base. */
874 if (task->thread.gsindex == GS_TLS_SEL)
875 base = read_32bit_tls(task, GS_TLS);
877 asm("movl %%gs,%0" : "=r" (gsindex));
879 rdmsrl(MSR_KERNEL_GS_BASE, base);
881 base = task->thread.gs;
884 base = task->thread.gs;
885 ret = put_user(base, (unsigned long __user *)addr);
/* Syscall entry point: operate on the current task. */
897 long sys_arch_prctl(int code, unsigned long addr)
899 return do_arch_prctl(current, code, addr);
/*
 * Stack/brk randomization helpers.
 * NOTE(review): fragmentary excerpt -- arch_align_stack's final
 * "return sp & ~0xf;" alignment line is missing from view.
 */
902 unsigned long arch_align_stack(unsigned long sp)
/* Randomize the initial stack pointer by up to 8KB unless disabled. */
904 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
905 sp -= get_random_int() % 8192;
/* Randomize the heap start within 32MB above the current brk. */
909 unsigned long arch_randomize_brk(struct mm_struct *mm)
911 unsigned long range_end = mm->brk + 0x02000000;
912 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;