/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

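/*
 * Code that needs to react to the CPU entering or leaving the idle loop
 * registers an atomic notifier here, along these lines (illustrative
 * sketch only; my_idle_notify is not part of this file):
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		return NOTIFY_OK;
 *	}
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *	idle_notifier_register(&my_idle_nb);
 *
 * The callback is invoked with action IDLE_START or IDLE_END by the
 * enter_idle()/__exit_idle() helpers below.
 */
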
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the PDA stack
	 * canary up for us - and if we are the boot CPU we have
	 * a 0 stack canary. This is a good place for updating
	 * it, as we wont ever return from this function (so the
	 * invalid canaries already on the stack wont ever
	 * trigger):
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

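/*
 * pm_idle above is a function pointer selected at boot (default_idle, an
 * mwait-based idle routine, or poll idle), so the actual low-power
 * instruction executed in this loop depends on the CPU and on command-line
 * options such as idle=poll.
 */
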
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}

	ds_exit_thread(current);
}

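/*
 * The get_cpu()/put_cpu() pair above disables preemption around the TSS
 * update, so the task cannot migrate while its I/O bitmap slot in the
 * per-CPU TSS is being invalidated.
 */
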
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

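/*
 * These helpers back the "small base" fast path in do_arch_prctl() below:
 * a base address that fits in 32 bits can be described by a TLS descriptor
 * in the GDT and loaded with a cheap segment register write, while larger
 * bases have to go through the (slower) FS/GS base MSRs.
 */
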
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	ds_copy_thread(p, me);

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

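/*
 * The child set up here eventually returns to user mode through
 * ret_from_fork (declared at the top of this file) with its pt_regs copied
 * from the parent and ->ax cleared to 0, which is why fork() and clone()
 * return 0 in the child.
 */
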
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs		= __USER_CS;
	regs->ss		= __USER_DS;
	regs->flags		= 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

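/*
 * start_thread() is called by the binfmt loaders at execve() time: it
 * resets the user segment registers, points the saved pt_regs at the new
 * program's entry point and stack, and enables interrupts for user mode
 * (0x200 is X86_EFLAGS_IF).
 */
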
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val = PR_TSC_ENABLE;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

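/*
 * get_tsc_mode()/set_tsc_mode() are reached from the generic prctl()
 * syscall, so a task can opt out of RDTSC with, for example:
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *
 * after which any RDTSC executed by that task faults (CR4.TSD stays set
 * while the task runs; __switch_to_xtra() below flips it on context
 * switch).
 */
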
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
		ds_switch_to(prev_p, next_p);
	else if (next->debugctlmsr != prev->debugctlmsr)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

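/*
 * __switch_to_xtra() is only called from __switch_to() when one of the
 * tasks has a _TIF_WORK_CTXSW flag set (hardware breakpoints, I/O bitmap,
 * TSC restrictions, DS area), which keeps the common context-switch path
 * cheap.
 */
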
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

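/*
 * get_wchan() walks what it assumes is a conventional %rbp frame chain on
 * the sleeping task's kernel stack, and gives up after 16 frames or as
 * soon as it finds a return address outside the scheduler functions.
 */
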
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

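/*
 * From user space these cases are reached through the arch_prctl(2)
 * system call, e.g. syscall(SYS_arch_prctl, ARCH_SET_FS, base), which is
 * how 64-bit threading libraries typically install a thread's TLS base.
 */
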
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
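
/*
 * The two helpers above provide the per-exec randomization: the initial
 * stack pointer is shifted down by up to 8 kB and re-aligned to 16 bytes,
 * and the heap start is placed somewhere in a 32 MB (0x02000000) window
 * above mm->brk, falling back to mm->brk itself if randomize_range()
 * cannot find a slot.
 */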