/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
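
/*
 * Example (illustrative sketch, not part of this file): a driver can hook
 * the idle notifier chain above to learn when this CPU enters and leaves
 * idle.  The names below ("my_idle_notify", "my_idle_nb") are made up for
 * the example.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		switch (action) {
 *		case IDLE_START:
 *			break;		// CPU is about to go idle
 *		case IDLE_END:
 *			break;		// CPU left idle
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	// in driver init code:
 *	idle_notifier_register(&my_idle_nb);
 *
 * Note that the chain is called from enter_idle()/__exit_idle() with
 * interrupts disabled, so the callback must not sleep.
 */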
/*
 * We use this if we don't have any better idle routine.
 */
void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	if (!need_resched())
		safe_halt();	/* enables interrupts racelessly */
	else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick();
		while (!need_resched()) {
			void (*idle)(void);

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle.  But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
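
/*
 * The io_bitmap_ptr freed above exists only for tasks that requested I/O
 * port access.  A minimal userspace sketch of how such a bitmap comes into
 * being (illustrative only; the port number is arbitrary and CAP_SYS_RAWIO
 * is required):
 *
 *	#include <sys/io.h>
 *
 *	int main(void)
 *	{
 *		// ask the kernel for access to I/O port 0x70; this makes
 *		// sys_ioperm() allocate the per-thread I/O bitmap that
 *		// exit_thread() later frees
 *		if (ioperm(0x70, 1, 1) < 0)
 *			return 1;
 *		return 0;
 *	}
 */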
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;	/* IF set: user code runs with interrupts on */
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
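
/*
 * start_thread() is invoked by the binary format loaders once a new
 * executable image has been set up.  The call-site shape, as a hedged
 * sketch (in-tree users include load_elf_binary() in fs/binfmt_elf.c):
 *
 *	start_thread(regs, elf_entry, bprm->p);
 *
 * i.e. the new instruction pointer is the image entry point and the new
 * stack pointer is the top of the freshly built argument stack.
 */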
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}
int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
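
/*
 * get_tsc_mode()/set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctls.
 * A minimal userspace sketch (illustrative only) that makes RDTSC fault
 * for the calling task and then reads the mode back:
 *
 *	#include <sys/prctl.h>
 *
 *	int main(void)
 *	{
 *		int mode;
 *
 *		// after this, executing RDTSC raises SIGSEGV; lands in
 *		// set_tsc_mode() above
 *		if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0) < 0)
 *			return 1;
 *		// reads the mode back via get_tsc_mode()
 *		if (prctl(PR_GET_TSC, &mode, 0, 0, 0) < 0)
 *			return 1;
 *		return mode == PR_TSC_SIGSEGV ? 0 : 1;
 *	}
 */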
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/*
		 * We clear debugctl to make sure DS
		 * is not in use when we change it:
		 */
		debugctl = 0;
		update_debugctlmsr(0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef X86_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif
}
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;

		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/*
		 * Segment register != 0 always requires a reload.
		 * Also reload when it has changed.
		 * When the prev process used a 64bit base always reload
		 * to avoid an information leak.
		 */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/*
			 * Check if the user used a selector != 0; if yes
			 * clear the 64bit base, since an overloaded base
			 * is always mapped to the Null selector.
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;

		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * If the task has used the fpu in the last 5 timeslices, just do a
	 * full restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now.
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math().
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();

	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: This overwrites the user's setup and should really use
	 * two bits.  But 64bit processes have always behaved this way,
	 * so it's not too bad.  The main problem is just that 32bit
	 * children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid,
	  struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
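
/*
 * get_wchan() is what backs /proc/<pid>/wchan: the address where a
 * sleeping task is blocked, resolved to a symbol name by procfs.  A
 * hedged userspace sketch (illustrative only; pid 1 is arbitrary):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char sym[128];
 *		FILE *f = fopen("/proc/1/wchan", "r");
 *
 *		if (f && fgets(sym, sizeof(sym), f))
 *			printf("blocked in: %s\n", sym);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 */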
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
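
/*
 * sys_arch_prctl() is reached from userspace via arch_prctl(2).  A
 * minimal sketch (illustrative only) that reads the current FS base,
 * exercising the ARCH_GET_FS path above:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long fsbase;
 *
 *		// ARCH_GET_FS stores the FS base through the pointer
 *		if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) < 0)
 *			return 1;
 *		printf("fs base: %#lx\n", fsbase);
 *		return 0;
 *	}
 */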
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
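
/*
 * Worked example of the randomization above: get_random_int() % 8192
 * yields an offset in [0, 8191], so the starting stack pointer moves
 * down by at most 8 KiB - 1 bytes; the final "& ~0xf" then rounds the
 * result down to the 16-byte alignment the x86-64 ABI requires.
 */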
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}