/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);
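
/*
 * Default flags for kernel_thread(): the child shares the kernel's
 * address space (CLONE_VM) and ptrace cannot force tracing on it
 * (CLONE_UNTRACED).
 */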
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	c1e_remove_cpu(raw_smp_processor_id());

	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
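	/*
	 * TS_POLLING tells the scheduler that this CPU polls
	 * need_resched() in its idle loop, so it does not always need
	 * a reschedule IPI to notice a wakeup.
	 */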
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
170 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
171 printk_address(regs->ip, 1);
172 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
174 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
175 regs->ax, regs->bx, regs->cx);
176 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
177 regs->dx, regs->si, regs->di);
178 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
179 regs->bp, regs->r8, regs->r9);
180 printk("R10: %016lx R11: %016lx R12: %016lx\n",
181 regs->r10, regs->r11, regs->r12);
182 printk("R13: %016lx R14: %016lx R15: %016lx\n",
183 regs->r13, regs->r14, regs->r15);
185 asm("movl %%ds,%0" : "=r" (ds));
186 asm("movl %%cs,%0" : "=r" (cs));
187 asm("movl %%es,%0" : "=r" (es));
188 asm("movl %%fs,%0" : "=r" (fsindex));
189 asm("movl %%gs,%0" : "=r" (gsindex));
191 rdmsrl(MSR_FS_BASE, fs);
192 rdmsrl(MSR_GS_BASE, gs);
193 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
200 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
201 fs,fsindex,gs,gsindex,shadowgs);
202 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
203 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
208 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
212 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}
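
/*
 * Install a 32-bit flat descriptor (base = addr, 4GB limit) into the
 * given slot of the task's TLS array in the GDT.
 */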
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

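	/*
	 * Inherit the parent's I/O permission bitmap, if any.
	 * IO_BITMAP_BYTES covers all 65536 I/O ports, one bit per port.
	 */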
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
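
/*
 * CR4.TSD ("TSC disable") makes RDTSC a privileged instruction, so a
 * user-mode RDTSC faults; the helpers below flip that bit to back the
 * PR_SET_TSC prctl.
 */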
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
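
/*
 * These helpers back the generic prctl(2) interface.  For example, a
 * process that wants user-mode RDTSC to fault would run (userspace
 * sketch):
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 */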

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
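/* e.g. loaddebug(next, 7) expands to set_debugreg(next->debugreg7, 7) */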

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;
	if (next->ds_area_msr != prev->ds_area_msr) {
		/* we clear debugctl to make sure DS
		 * is not in use when we change it */
		debugctl = 0;
		update_debugctlmsr(0);
		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
	}

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef X86_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}
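
/*
 * get_wchan() walks the sleeping task's frame-pointer chain: each
 * frame holds the caller's saved %rbp at fp and the return address at
 * fp+8.  The first return address outside the scheduler is where the
 * task blocked; give up after 16 frames.
 */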
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;

		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;

		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
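
/*
 * Userspace reaches do_arch_prctl() through the arch_prctl(2)
 * syscall; for example, a threading library can set its TLS base with
 * (userspace sketch):
 *
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tls_block);
 */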
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
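
/*
 * With address-space randomization enabled, lower the initial user
 * stack pointer by a random amount of up to 8kB, keeping the result
 * 16-byte aligned as the ABI requires.
 */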
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
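
/*
 * Randomize the heap start: pick an address between mm->brk and
 * mm->brk + 32MB, falling back to the unrandomized brk.
 */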
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}