/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 *
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
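/*
 * Illustrative sketch, not part of the original file: a driver that wants
 * to be told when a CPU enters or leaves the idle loop can hook this chain
 * roughly as below; the callback and variable names are hypothetical.
 * "action" is IDLE_START when the CPU is about to idle and IDLE_END when
 * it leaves the idle loop.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */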
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <linux/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
		regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
		regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
		regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
		regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
		regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
		regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
		fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
		es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
		cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr	= addr,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}
int get_tsc_mode(unsigned long adr)
{
	unsigned int val = 0;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
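/*
 * Illustrative sketch, not from the original source: userspace reaches
 * get_tsc_mode()/set_tsc_mode() through prctl(2). After switching to
 * PR_TSC_SIGSEGV, any RDTSC executed by the task raises SIGSEGV; PR_GET_TSC
 * writes the current mode through the pointer passed as the second argument.
 *
 *	#include <sys/prctl.h>
 *
 *	int mode;
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *	prctl(PR_GET_TSC, &mode, 0, 0, 0);
 */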
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
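/*
 * For example, loaddebug(next, 7) expands via token pasting to
 * set_debugreg(next->debugreg7, 7).
 */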
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		debugctl = 0;
		update_debugctlmsr(0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef X86_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif
}
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);
	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);
	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;
	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif
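/*
 * Note, not from the original source: the fixed offset matters because with
 * -fstack-protector gcc on x86-64 emits canary accesses against a hard-coded
 * %gs:40, e.g. "movq %gs:40, %rax" in the prologue and a compare against
 * %gs:40 before returning. In the kernel %gs points at the PDA, so
 * stack_canary has to sit exactly 40 bytes into it.
 */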
	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
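/*
 * Note, not from the original source: on x86-64 the raw clone syscall
 * arguments arrive in %rdi, %rsi, %rdx, %r10 and %r8, i.e.
 * (clone_flags, newsp, parent_tid, child_tid, tls). That fifth argument is
 * why copy_thread() above reads the new thread's TLS pointer from
 * childregs->r8 when CLONE_SETTLS is set.
 */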
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
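/*
 * Illustrative sketch, not from the original source: a 64-bit process can
 * reach do_arch_prctl() directly via syscall(2), since glibc traditionally
 * ships no arch_prctl() wrapper:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long fsbase, gsbase;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase);
 *
 * ARCH_SET_FS/ARCH_SET_GS take the new base address as the second argument
 * and fail with -EPERM if it lies at or above TASK_SIZE_OF(task).
 */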
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
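/*
 * Note, not from the original source: 0x02000000 is 32 MiB, so the brk base
 * lands at a page-aligned random offset of up to 32 MiB above mm->brk; if
 * randomize_range() returns 0 the "?:" falls back to the unrandomized
 * mm->brk.
 */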