/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>

asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
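/*
 * Usage sketch (not part of this file; my_idle_notify and my_idle_nb are
 * made-up names): a subsystem that wants to know when this CPU enters or
 * leaves idle hooks the chain with an ordinary notifier block and is called
 * with IDLE_START/IDLE_END:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			;	 (CPU is about to go idle)
 *		else if (action == IDLE_END)
 *			;	 (CPU has come back out of idle)
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */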
void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm && dead_task->mm->context.size) {
		printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
		BUG();
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr	= addr,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};
	struct desc_struct *desc = t->thread.tls_array;

	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	regs->ip = new_ip;
	regs->sp = new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
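/*
 * Caller note: start_thread() is invoked by the binary-format loaders
 * (for example load_elf_binary()) once the new mm has been set up, so the
 * first return to user mode lands on the new program's entry point with
 * the freshly built user stack.
 */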
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer is not supported either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		     (unsigned long)task_stack_page(next_p) +
		     THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();

	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
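/*
 * Userspace usage sketch (illustrative only; new_gs_base is a made-up
 * variable): a 64-bit process reaches this syscall through glibc's generic
 * syscall() wrapper, e.g.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, new_gs_base);
 *
 * The SET cases above refuse addresses at or beyond TASK_SIZE_OF(task) and
 * install bases below 4GB through the GDT instead of the MSRs.
 */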
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
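/*
 * Note: 0x02000000 is 32 MB, so the heap break is placed at a random,
 * page-aligned address in [mm->brk, mm->brk + 32 MB); if randomize_range()
 * returns 0 (no room found), the unrandomized mm->brk is used instead.
 */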