/*
 * Copyright (C) 1995  Linus Torvalds
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 * CPU hotplug support - ashok.raj@intel.com
 *
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/syscalls.h>
asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
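/*
 * Illustrative only (not part of this file): a driver that wants to know
 * when a CPU enters or leaves the idle loop registers a callback roughly
 * like the following, where my_idle_notify() is a hypothetical handler
 * that looks at whether action is IDLE_START or IDLE_END and returns
 * NOTIFY_OK:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */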
	/* enter_idle(): mark this CPU idle and fire IDLE_START */
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
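/*
 * The is_idle flag pairs the two notifications: enter_idle() sets it
 * before firing IDLE_START, and __exit_idle() atomically test-and-clears
 * it, so IDLE_END is delivered at most once per idle period and only on
 * a CPU that actually announced IDLE_START.
 */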
/* Called from interrupts to signify idle end */

	/* idle loop has pid 0 */

static inline void play_dead(void)

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */

	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */

		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */

			/* Don't trace irqs off for idle */
			stop_critical_timings();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
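	/*
	 * Shape of one pass through the idle loop: the periodic tick is
	 * stopped, the CPU polls need_resched() and, while there is nothing
	 * to run, enters the platform idle routine with irqs-off tracing
	 * suppressed (hence the stop_critical_timings()/
	 * start_critical_timings() bracket).  Once work shows up, the tick
	 * is restarted, preemption is re-enabled without an immediate
	 * reschedule, and the outer loop falls through to schedule().
	 */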
/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	board = dmi_get_system_info(DMI_PRODUCT_NAME);

	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
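	/*
	 * On x86-64 the fs/gs selector values alone say little: the bases
	 * used for fs:/gs: addressing live in MSR_FS_BASE and MSR_GS_BASE,
	 * while MSR_KERNEL_GS_BASE holds the inactive base that the next
	 * swapgs will switch in.  All three are dumped below next to the
	 * selectors.
	 */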
	printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
	       es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
	       cr4);

	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);

	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);

void show_regs(struct pt_regs *regs)
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;

	ds_exit_thread(current);

void flush_thread(void)
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	tsk->fpu_counter = 0;

void release_thread(struct task_struct *dead_task)
	if (dead_task->mm->context.size) {
		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
	struct user_desc ud = {

	struct desc_struct *desc = t->thread.tls_array;

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
	return get_desc_base(&t->thread.tls_array[tls]);
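/*
 * Helpers for the "small base" trick used by do_arch_prctl() below: a
 * base address that fits in 32 bits can be encoded in one of the GDT TLS
 * descriptors, so FS/GS can later be switched with a cheap segment load
 * instead of a wrmsr of the full 64-bit base.
 */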
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;

	childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}
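	/*
	 * For CLONE_SETTLS the new TLS value comes straight from the child's
	 * register frame: 32-bit callers pass a struct user_desc pointer in
	 * %esi (childregs->si), 64-bit callers pass the FS base in %r8
	 * (childregs->r8); that is what the code below consumes.
	 */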
	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);

	ds_copy_thread(p, me);

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
	percpu_write(old_rsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
EXPORT_SYMBOL_GPL(start_thread);
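/*
 * start_thread() is called by the binfmt loaders (e.g. load_elf_binary())
 * at execve() time: it points the saved user register frame at the new
 * program's entry point and initial stack, selects the 64-bit user code
 * and stack segments, and drops any FPU/extended state inherited from the
 * old image.
 */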
static void hard_disable_TSC(void)
	write_cr4(read_cr4() | X86_CR4_TSD);

void disable_TSC(void)
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();

static void hard_enable_TSC(void)
	write_cr4(read_cr4() & ~X86_CR4_TSD);

static void enable_TSC(void)
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();

int get_tsc_mode(unsigned long adr)
	unsigned int val = PR_TSC_ENABLE;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;

	return put_user(val, (unsigned int __user *)adr);
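/*
 * get_tsc_mode()/set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctl()s.
 * Illustrative only: a sandbox that wants RDTSC to fault would issue
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *
 * after which a user-mode RDTSC is trapped (CR4.TSD is set for this task)
 * and delivered as SIGSEGV, until PR_TSC_ENABLE is restored.
 */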
int set_tsc_mode(unsigned int val)
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
		ds_switch_to(prev_p, next_p);
	else if (next->debugctlmsr != prev->debugctlmsr)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
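		/*
		 * A set bit in the TSS I/O bitmap means "no access", so the
		 * 0xff fill above revokes every port the previous task had
		 * opened without touching the rest of the bitmap.
		 */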
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);
	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When the prev process used a
	 * 64bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */

	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);

	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		     (unsigned long)task_stack_page(next_p) +
		     THREAD_SIZE - KERNEL_STACK_OFFSET);
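	/*
	 * From here on the per-CPU view describes next_p: old_rsp holds its
	 * saved user stack pointer for syscall entry, current_task is the
	 * new task, and kernel_stack is what the syscall/interrupt entry
	 * code will use as the top of its kernel stack.
	 */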
	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();

/*
 * sys_execve() executes a new program.
 */
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);

void set_personality_64bit(void)
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		if (!in_sched_functions(ip))
			return ip;
	} while (count++ < 16);
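/*
 * get_wchan() walks the sleeping task's saved frame pointers (at most 16
 * frames) and returns the first return address that is not within the
 * scheduler; this is what /proc/<pid>/wchan reports.
 */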
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
	int doit = task == current;

		if (addr >= TASK_SIZE_OF(task))

		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			task->thread.gsindex = GS_TLS_SEL;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);

		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))

		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			task->thread.fsindex = FS_TLS_SEL;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			/* set the selector to 0 to not confuse
			   __switch_to */
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
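/*
 * Illustrative only: user space (typically the threading library) sets a
 * thread's TLS base with
 *
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tls_block);
 *
 * and reads it back with arch_prctl(ARCH_GET_FS, (unsigned long)&base);
 * ARCH_SET_GS/ARCH_GET_GS do the same for the GS base.  tls_block and
 * base are placeholder names.
 */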
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
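/*
 * The heap start is randomized within a 32 MB (0x02000000) window above
 * the unrandomized brk; randomize_range() returns 0 when it cannot pick a
 * value in that window, in which case the original brk is kept.
 */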