/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
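/*
 * Illustration (not part of the original file): a subsystem that wants to
 * know when this CPU enters or leaves idle registers a notifier_block
 * whose callback receives IDLE_START/IDLE_END as the action. The callback
 * and variable names below are hypothetical; a minimal sketch:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		pr_debug("idle %s\n",
 *			 action == IDLE_START ? "start" : "end");
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */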
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
static inline void play_dead(void)
{
	BUG();
}
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {
			rmb();
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
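/*
 * Note (not in the original file): pm_idle above is the arch-selected
 * idle-routine pointer. Depending on the CPU and configuration it ends up
 * as default_idle() (a hlt loop), mwait_idle(), or a cpuidle/C-state
 * driver's routine; the surrounding stop/start_critical_timings() calls
 * keep the irqs-off tracer from charging idle time as a latency.
 */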
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too (an all-ones
		 * bitmap means every port access traps):
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}

	ds_exit_thread(current);
}
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
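/*
 * Note (not in the original file): these helpers reuse the same GDT slots
 * that set_thread_area() manages (thread.tls_array), so a small (< 4GB)
 * fs/gs base can be expressed as a plain segment selector instead of an
 * MSR write; see the ARCH_SET_FS/ARCH_SET_GS cases in do_arch_prctl()
 * below for where that trade-off is made.
 */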
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	ds_copy_thread(p, me);

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
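/*
 * Note (not in the original file): start_thread() is what the binfmt
 * loaders call once a new executable image is mapped, roughly:
 *
 *	start_thread(regs, elf_entry, bprm->p);
 *
 * where elf_entry and bprm->p come from fs/binfmt_elf.c and are shown
 * here only for illustration; they set up the first user-mode ip/sp of
 * the freshly exec'd process.
 */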
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}
int get_tsc_mode(unsigned long adr)
{
	unsigned int val = PR_TSC_ENABLE;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;

	return put_user(val, (unsigned int __user *)adr);
}
int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
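/*
 * Illustration (not part of the original file): userspace reaches
 * get_tsc_mode()/set_tsc_mode() through prctl(2), e.g.
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV);
 *
 * after which executing rdtsc raises SIGSEGV in that task until the mode
 * is set back to PR_TSC_ENABLE; the CR4.TSD flip above is what makes the
 * instruction fault outside ring 0.
 */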
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
		ds_switch_to(prev_p, next_p);
	else if (next->debugctlmsr != prev->debugctlmsr)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);
	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();
	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		     (unsigned long)task_stack_page(next_p) +
		     THREAD_SIZE - KERNEL_STACK_OFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif
	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
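/*
 * Note (not in the original file): get_wchan() is what feeds
 * /proc/<pid>/wchan. It walks the sleeping task's saved frame pointers
 * and returns the first return address outside the scheduler; the
 * 16-frame cap bounds the walk in case the stack is corrupted.
 */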
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
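/*
 * Illustration (not part of the original file): a 64-bit process moves
 * its segment bases through the arch_prctl(2) system call, e.g.
 *
 *	arch_prctl(ARCH_SET_GS, (unsigned long)tls_block);
 *	arch_prctl(ARCH_GET_GS, (unsigned long)&base);
 *
 * (tls_block and base are hypothetical userspace variables.) For the
 * ARCH_GET_* codes, addr is the address of the variable that receives
 * the base, which is why do_arch_prctl() ends those cases in put_user().
 */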
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	/* randomize the heap start within 32MB above the unrandomized brk */
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}