/*
 * linux/arch/x86_64/entry.S
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * NOTE: This code handles signal recognition, which happens every time
 * after an interrupt and after each system call.
 *
 * Normal syscalls and interrupts don't save a full stack frame; that is
 * only done for syscall tracing, signals or fork/exec et al.
 *
 * A note on terminology:
 * - top of stack: architecture-defined interrupt frame from SS to RIP
 *   at the top of the kernel process stack.
 * - partial stack frame: partially saved registers up to R11.
 * - full stack frame: like the partial stack frame, but with all
 *   registers saved (see the pt_regs sketch below).
 *
 * TODO:
 * - schedule it carefully for the final hardware.
 */
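/*
 * Illustrative sketch (not part of this file): the frame terminology
 * above maps onto the C-visible struct pt_regs layout, roughly as
 * follows. This is a hedged reconstruction of the usual x86-64
 * ordering; the authoritative definition lives in asm/ptrace.h and the
 * generated asm-offsets.
 *
 *	struct pt_regs {
 *		unsigned long r15, r14, r13, r12, rbp, rbx, r11, r10;
 *		unsigned long r9, r8, rax, rcx, rdx, rsi, rdi;
 *		unsigned long orig_rax;		// syscall nr / error code
 *		unsigned long rip, cs, eflags;	// hardware frame, RIP..SS
 *		unsigned long rsp, ss;		// the "top of stack"
 *	};
 *
 * A partial frame only fills the slots from RDI up to R11 plus the
 * hardware frame; RBX, RBP and R12-R15 are saved on demand.
 */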
#include <linux/config.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
#include <asm/asm-offsets.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>

#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
/*
 * C code is not supposed to know about the undefined top of stack.
 * Every time a C function with a pt_regs argument is called from the
 * SYSCALL-based fast path, FIXUP_TOP_OF_STACK is needed.
 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible
 * ptregs manipulation.
 */

	/* %rsp: at FRAMEEND */
	.macro FIXUP_TOP_OF_STACK tmp
	movq	%gs:pda_oldrsp,\tmp
	movq	$__USER_DS,SS(%rsp)
	movq	$__USER_CS,CS(%rsp)
	movq	R11(%rsp),\tmp	/* get eflags */
	movq	\tmp,EFLAGS(%rsp)

	.macro RESTORE_TOP_OF_STACK tmp,offset=0
	movq	RSP-\offset(%rsp),\tmp
	movq	\tmp,%gs:pda_oldrsp
	movq	EFLAGS-\offset(%rsp),\tmp
	movq	\tmp,R11-\offset(%rsp)
	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	CFI_ADJUST_CFA_OFFSET 8
	CFI_ADJUST_CFA_OFFSET 8
	pushq	$(1<<9)		/* eflags - interrupts on */
	CFI_ADJUST_CFA_OFFSET 8
	pushq	$__KERNEL_CS	/* cs */
	CFI_ADJUST_CFA_OFFSET 8
	pushq	\child_rip	/* rip */
	CFI_ADJUST_CFA_OFFSET 8
	pushq	%rax		/* orig rax */
	CFI_ADJUST_CFA_OFFSET 8

	.macro UNFAKE_STACK_FRAME
	CFI_ADJUST_CFA_OFFSET -(6*8)
	.macro CFI_DEFAULT_STACK
	CFI_ADJUST_CFA_OFFSET (SS)
	CFI_OFFSET	r15,R15-SS
	CFI_OFFSET	r14,R14-SS
	CFI_OFFSET	r13,R13-SS
	CFI_OFFSET	r12,R12-SS
	CFI_OFFSET	rbp,RBP-SS
	CFI_OFFSET	rbx,RBX-SS
	CFI_OFFSET	r11,R11-SS
	CFI_OFFSET	r10,R10-SS
	CFI_OFFSET	rax,RAX-SS
	CFI_OFFSET	rcx,RCX-SS
	CFI_OFFSET	rdx,RDX-SS
	CFI_OFFSET	rsi,RSI-SS
	CFI_OFFSET	rdi,RDI-SS
	CFI_OFFSET	rsp,RSP-SS
	CFI_OFFSET	rip,RIP-SS
/*
 * A newly forked process directly context switches into this.
 */
	GET_THREAD_INFO(%rcx)
	testl	$(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
	testl	$3,CS-ARGOFFSET(%rsp)		# from kernel_thread?
	je	int_ret_from_sys_call
	testl	$_TIF_IA32,threadinfo_flags(%rcx)
	jnz	int_ret_from_sys_call
	RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
	jmp	ret_from_sys_call

	call	syscall_trace_leave
	GET_THREAD_INFO(%rcx)
/*
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 *
 * rax	system call number
 * rcx	return address for syscall/sysret, C arg3
 * r10	arg3	(--> moved to rcx for C)
 * r11	eflags for syscall/sysret, temporary for C
 * r12-r15,rbp,rbx	saved by C code, not touched.
 *
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX	if we had a free scratch register we could save the RSP into the
 *	stack frame and report it properly in ps. Unfortunately we don't
 *	have one.
 */
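/*
 * For illustration only (hypothetical user-space code, not part of the
 * kernel build): how the register convention above looks from the
 * caller's side when issuing SYSCALL directly from C. Assumes the
 * standard __NR_write number from <asm/unistd.h>.
 *
 *	#include <asm/unistd.h>
 *
 *	static long my_write(int fd, const void *buf, unsigned long len)
 *	{
 *		long ret;
 *		// rax = nr, rdi/rsi/rdx = args 1-3; SYSCALL clobbers rcx/r11
 *		asm volatile ("syscall"
 *			: "=a" (ret)
 *			: "0" (__NR_write), "D" (fd), "S" (buf), "d" (len)
 *			: "rcx", "r11", "memory");
 *		return ret;
 *	}
 */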
	movq	%rsp,%gs:pda_oldrsp
	movq	%gs:pda_kernelstack,%rsp
	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq	%rcx,RIP-ARGOFFSET(%rsp)
	GET_THREAD_INFO(%rcx)
	testl	$(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
	cmpq	$__NR_syscall_max,%rax
	call	*sys_call_table(,%rax,8)	# XXX: rip relative
	movq	%rax,RAX-ARGOFFSET(%rsp)
/*
 * Syscall return path ending with SYSRET (fast path).
 * Has an incomplete stack frame and an undefined top of stack.
 */
	.globl ret_from_sys_call
	movl	$_TIF_ALLWORK_MASK,%edi
	GET_THREAD_INFO(%rcx)
	movl	threadinfo_flags(%rcx),%edx
	movq	RIP-ARGOFFSET(%rsp),%rcx
	RESTORE_ARGS 0,-ARG_SKIP,1
	movq	%gs:pda_oldrsp,%rsp
	/* Handle reschedules */
	/* edx: work, edi: workmask */
	bt	$TIF_NEED_RESCHED,%edx

	/* Handle a signal */
	testl	$(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
	/* Really a signal */
	/* edx: work flags (arg3) */
	leaq	do_notify_resume(%rip),%rax
	leaq	-ARGOFFSET(%rsp),%rdi	# &pt_regs -> arg1
	xorl	%esi,%esi		# oldset -> arg2
	call	ptregscall_common
1:	movl	$_TIF_NEED_RESCHED,%edi

	/* Do syscall tracing */
	movq	$-ENOSYS,RAX(%rsp)
	FIXUP_TOP_OF_STACK %rdi
	call	syscall_trace_enter
	LOAD_ARGS ARGOFFSET	/* reload args from stack in case ptrace changed them */
	cmpq	$__NR_syscall_max,%rax
	movq	%r10,%rcx	/* fixup for C */
	call	*sys_call_table(,%rax,8)
	movq	%rax,RAX-ARGOFFSET(%rsp)
	call	syscall_trace_leave
	RESTORE_TOP_OF_STACK %rbx
	jmp	ret_from_sys_call

	movq	$-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp	ret_from_sys_call
/*
 * Syscall return path ending with IRET.
 * Has a correct top of stack, but only a partial stack frame.
 */
ENTRY(int_ret_from_sys_call)
	testl	$3,CS-ARGOFFSET(%rsp)
	je	retint_restore_args
	movl	$_TIF_ALLWORK_MASK,%edi
	/* edi: mask to check */
	GET_THREAD_INFO(%rcx)
	movl	threadinfo_flags(%rcx),%edx

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx: work, edi: workmask */
	bt	$TIF_NEED_RESCHED,%edx

	/* handle signals and tracing -- both require a full stack frame */

	/* Check for syscall exit trace */
	testl	$(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
	leaq	8(%rsp),%rdi	# &ptregs -> arg1
	call	syscall_trace_leave
	andl	$~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi

	testl	$(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
	movq	%rsp,%rdi	# &ptregs -> arg1
	xorl	%esi,%esi	# oldset -> arg2
	call	do_notify_resume
1:	movl	$_TIF_NEED_RESCHED,%edi
/*
 * Certain special system calls need to save a complete, full stack frame.
 */
	.macro PTREGSCALL label,func,arg
	leaq	\func(%rip),%rax
	leaq	-ARGOFFSET+8(%rsp),\arg	/* 8 for return address */
	jmp	ptregscall_common

	PTREGSCALL stub_clone, sys_clone, %r8
	PTREGSCALL stub_fork, sys_fork, %rdi
	PTREGSCALL stub_vfork, sys_vfork, %rdi
	PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
	PTREGSCALL stub_iopl, sys_iopl, %rsi
ENTRY(ptregscall_common)
	CFI_ADJUST_CFA_OFFSET -8
	FIXUP_TOP_OF_STACK %r11
	RESTORE_TOP_OF_STACK %r11
	CFI_ADJUST_CFA_OFFSET 8

	CFI_ADJUST_CFA_OFFSET -8
	FIXUP_TOP_OF_STACK %r11
	GET_THREAD_INFO(%rcx)
	bt	$TIF_IA32,threadinfo_flags(%rcx)
	RESTORE_TOP_OF_STACK %r11
	CFI_ADJUST_CFA_OFFSET REST_SKIP
	jmp	int_ret_from_sys_call
/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	FIXUP_TOP_OF_STACK %r11
	call	sys_rt_sigreturn
	movq	%rax,RAX(%rsp)	# fixme, this could be done at the higher layer
	jmp	int_ret_from_sys_call
/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee-clobbered registers in the fast path.
 *
 * Entry runs with interrupts off.
 */
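/*
 * Illustrative sketch (hypothetical driver code, not part of this
 * file): the partial frame built by the interrupt macro below is what
 * eventually reaches a C-level handler as its struct pt_regs argument,
 * e.g. one registered with request_irq() in this kernel generation,
 * where handlers still take a pt_regs pointer:
 *
 *	static irqreturn_t my_irq_handler(int irq, void *dev_id,
 *					  struct pt_regs *regs)
 *	{
 *		// regs points at the frame saved on the way in
 *		return IRQ_HANDLED;
 *	}
 *
 *	// request_irq(MY_IRQ, my_irq_handler, SA_SHIRQ, "mydev", dev);
 */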
	/* 0(%rsp): interrupt number */
	.macro interrupt func
	CFI_DEF_CFA	rsp,(SS-RDI)
	CFI_REL_OFFSET	rsp,(RSP-ORIG_RAX)
	CFI_REL_OFFSET	rip,(RIP-ORIG_RAX)
#ifdef CONFIG_DEBUG_INFO
	/*
	 * Set up a stack frame pointer. This allows gdb to trace
	 * back to the original stack.
	 */
	CFI_DEF_CFA_REGISTER	rbp
	leaq	-ARGOFFSET(%rsp),%rdi	# arg1 for handler
1:	incl	%gs:pda_irqcount	# RED-PEN should check preempt count
	movq	%gs:pda_irqstackptr,%rax
	pushq	%rdi			# save old stack
ENTRY(common_interrupt)
	/* 0(%rsp): oldrsp-ARGOFFSET */
	decl	%gs:pda_irqcount
#ifdef CONFIG_DEBUG_INFO
	leaq	ARGOFFSET(%rdi),%rsp
	GET_THREAD_INFO(%rcx)
	testl	$3,CS-ARGOFFSET(%rsp)
	/* Interrupt came from user space */
	/*
	 * Has a correct top of stack, but a partial stack frame.
	 * %rcx: thread info. Interrupts off.
	 */
retint_with_reschedule:
	movl	$_TIF_WORK_MASK,%edi
	movl	threadinfo_flags(%rcx),%edx

	.section __ex_table,"a"
	.quad	iret_label,bad_iret

	/* force a signal here? this matches i386 behaviour */
	/* running with kernel gs */
	movq	$-9999,%rdi	/* better code? */
	/* edi: workmask, edx: work */
	bt	$TIF_NEED_RESCHED,%edx
	GET_THREAD_INFO(%rcx)

	testl	$(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
	movq	$-1,ORIG_RAX(%rsp)
	xorl	%esi,%esi		# oldset
	movq	%rsp,%rdi		# &pt_regs
	call	do_notify_resume
	movl	$_TIF_NEED_RESCHED,%edi
	GET_THREAD_INFO(%rcx)

#ifdef CONFIG_PREEMPT
	/* Returning to kernel space. Check if we need preemption. */
	/* rcx: threadinfo. Interrupts off. */
	cmpl	$0,threadinfo_preempt_count(%rcx)
	jnz	retint_restore_args
	bt	$TIF_NEED_RESCHED,threadinfo_flags(%rcx)
	jnc	retint_restore_args
	bt	$9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
	jnc	retint_restore_args
	call	preempt_schedule_irq
	.macro apicinterrupt num,func

ENTRY(thermal_interrupt)
	apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt

ENTRY(reschedule_interrupt)
	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt

	.macro INVALIDATE_ENTRY num
ENTRY(invalidate_interrupt\num)
	apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt

ENTRY(call_function_interrupt)
	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt

#ifdef CONFIG_X86_LOCAL_APIC
ENTRY(apic_timer_interrupt)
	apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt

ENTRY(error_interrupt)
	apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt

ENTRY(spurious_interrupt)
	apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
/*
 * Exception entry points.
 */
	pushq	$0	/* push error code/oldrax */
	pushq	%rax	/* push real oldrax to the rdi slot */

	.macro errorentry sym

	/* error code is on the stack already */
	/* handle NMI-like exceptions that can happen everywhere */
	.macro paranoidentry sym
	movl	$MSR_GS_BASE,%ecx
	movq	ORIG_RAX(%rsp),%rsi
	movq	$-1,ORIG_RAX(%rsp)
/*
 * Exception entry point. This expects an error code/orig_rax on the
 * stack and the exception handler in %rax.
 */
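/*
 * For reference (hedged sketch of the C side): handlers invoked through
 * this path take the register frame and the error code as their first
 * two arguments, e.g.
 *
 *	asmlinkage void do_page_fault(struct pt_regs *regs,
 *				      unsigned long error_code);
 *
 * The code below hands the handler &pt_regs in %rdi and the error code
 * (taken from the orig_rax slot) in %rsi.
 */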
	CFI_DEF_CFA	rsp,(SS-RDI)
	CFI_REL_OFFSET	rsp,(RSP-RDI)
	CFI_REL_OFFSET	rip,(RIP-RDI)
	/* rdi slot contains rax, oldrax contains error code */
	CFI_ADJUST_CFA_OFFSET (14*8)
	CFI_REL_OFFSET	rsi,RSI
	movq	14*8(%rsp),%rsi	/* load rax from rdi slot */
	CFI_REL_OFFSET	rdx,RDX
	CFI_REL_OFFSET	rcx,RCX
	movq	%rsi,10*8(%rsp)	/* store rax */
	CFI_REL_OFFSET	rax,RAX
	CFI_REL_OFFSET	r10,R10
	CFI_REL_OFFSET	r11,R11
	CFI_REL_OFFSET	rbx,RBX
	CFI_REL_OFFSET	rbp,RBP
	CFI_REL_OFFSET	r12,R12
	CFI_REL_OFFSET	r13,R13
	CFI_REL_OFFSET	r14,R14
	CFI_REL_OFFSET	r15,R15
	movq	ORIG_RAX(%rsp),%rsi	/* get error code */
	movq	$-1,ORIG_RAX(%rsp)
	/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
	GET_THREAD_INFO(%rcx)
	movl	threadinfo_flags(%rcx),%edx
	movl	$_TIF_WORK_MASK,%edi
	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here. The exception handlers after iret run
	 * with kernel gs again, so don't set the user space flag.
	 * B-stepping K8s sometimes report a truncated RIP for IRET
	 * exceptions returning to compat mode. Check for these here too.
	 */
	leaq	iret_label(%rip),%rbp
	movl	%ebp,%ebp	/* zero extend */
	cmpq	$gs_change,RIP(%rsp)

	/* Reload gs selector with exception handling */
	/* edi: new selector */
2:	mfence		/* workaround */

	.section __ex_table,"a"
	.quad	gs_change,bad_gs

	/* running with kernelgs */
	swapgs		/* switch back to user gs */
/*
 * Create a kernel thread.
 *
 * C extern interface:
 *	extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 *
 * asm input arguments:
 *	rdi: fn, rsi: arg, rdx: flags
 */
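/*
 * Minimal usage sketch (hypothetical caller, not part of this file);
 * the clone flags shown are the conventional ones and should be treated
 * as an assumption:
 *
 *	static int my_thread_fn(void *arg)
 *	{
 *		// runs in the new kernel thread
 *		return 0;
 *	}
 *
 *	// pid = kernel_thread(my_thread_fn, NULL,
 *	//		       CLONE_FS | CLONE_FILES | SIGCHLD);
 */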
	FAKE_STACK_FRAME $child_rip

	# rdi: flags, rsi: usp, rdx: will be &pt_regs
	orq	kernel_thread_flags(%rip),%rdi

	/*
	 * It isn't worth checking for a reschedule here, so internally to
	 * the x86-64 port you can rely on kernel_thread() not rescheduling
	 * the child before returning; this avoids the need for hacks, for
	 * example to fork off the per-CPU idle tasks.
	 * [Hopefully no generic code relies on the reschedule -AK]
	 */

	/*
	 * Here we are in the child and the registers are set as they were
	 * at kernel_thread() invocation in the parent.
	 */
/*
 * execve(). This function needs to use IRET, not SYSRET, to set up all
 * state properly.
 *
 * C extern interface:
 *	extern long execve(char *name, char **argv, char **envp)
 *
 * asm input arguments:
 *	rdi: name, rsi: argv, rdx: envp
 *
 * We want to fall back into:
 *	extern long sys_execve(char *name, char **argv, char **envp, struct pt_regs regs)
 *
 * do_sys_execve asm fallback arguments:
 *	rdi: name, rsi: argv, rdx: envp, fake frame on the stack
 */
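/*
 * Usage sketch (hedged, illustrative only; modeled on how early boot
 * code execs the first user process, with argv_init/envp_init used here
 * purely as example names):
 *
 *	static char *argv_init[] = { "/sbin/init", NULL };
 *	static char *envp_init[] = { "HOME=/", "TERM=linux", NULL };
 *
 *	// error = execve(argv_init[0], argv_init, envp_init);
 *	// On success execve() does not return to the caller; the IRET
 *	// path below installs the new user-space register state instead.
 */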
	je	int_ret_from_sys_call

KPROBE_ENTRY(page_fault)
	errorentry do_page_fault

ENTRY(coprocessor_error)
	zeroentry do_coprocessor_error

ENTRY(simd_coprocessor_error)
	zeroentry do_simd_coprocessor_error

ENTRY(device_not_available)
	zeroentry math_state_restore

	/* runs on exception stack */
	CFI_ADJUST_CFA_OFFSET 8
	paranoidentry do_debug

	/* runs on exception stack */
	CFI_ADJUST_CFA_OFFSET 8
/*
 * "Paranoid" exit path from exception stack.
 * Paranoid because this is used by NMIs and cannot take
 * any kernel state for granted.
 * We don't do kernel preemption checks here, because only
 * NMI should be common and it does not enable IRQs and
 * cannot get reschedule ticks.
 */
	/* ebx: no swapgs flag */
	testl	%ebx,%ebx	/* swapgs needed? */
	jnz	paranoid_userspace
	GET_THREAD_INFO(%rcx)
	movl	threadinfo_flags(%rcx),%ebx
	andl	$_TIF_WORK_MASK,%ebx
	movq	%rsp,%rdi	/* &pt_regs */
	movq	%rax,%rsp	/* switch stack for scheduling */
	testl	$_TIF_NEED_RESCHED,%ebx
	jnz	paranoid_schedule
	movl	%ebx,%edx	/* arg3: thread flags */
	xorl	%esi,%esi	/* arg2: oldset */
	movq	%rsp,%rdi	/* arg1: &pt_regs */
	call	do_notify_resume
	jmp	paranoid_userspace

	jmp	paranoid_userspace
	zeroentry do_overflow

	zeroentry do_invalid_op

ENTRY(coprocessor_segment_overrun)
	zeroentry do_coprocessor_segment_overrun

	zeroentry do_reserved

	/* runs on exception stack */
	paranoidentry do_double_fault

	errorentry do_invalid_TSS

ENTRY(segment_not_present)
	errorentry do_segment_not_present

	/* runs on exception stack */
	paranoidentry do_stack_segment

KPROBE_ENTRY(general_protection)
	errorentry do_general_protection

ENTRY(alignment_check)
	errorentry do_alignment_check

	zeroentry do_divide_error

ENTRY(spurious_interrupt_bug)
	zeroentry do_spurious_interrupt_bug

#ifdef CONFIG_X86_MCE
	/* runs on exception stack */
	CFI_ADJUST_CFA_OFFSET 8
	paranoidentry do_machine_check

	zeroentry do_call_debug

	movq	%gs:pda_irqstackptr,%rax
	incl	%gs:pda_irqcount
	decl	%gs:pda_irqcount