/*
 *  linux/arch/x86_64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>
/* Page fault error code bits */
#define PF_PROT		(1<<0)		/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
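/*
 * These mirror the hardware error code the CPU pushes on a #PF:
 * bit 0 set - the fault was a protection violation (clear: page not present)
 * bit 1 set - the faulting access was a write (clear: a read)
 * bit 2 set - the fault happened in user mode (clear: kernel mode)
 * bit 3 set - a reserved bit was set in a paging-structure entry
 * bit 4 set - the fault was an instruction fetch (NX)
 */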
static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);

int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);

static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	struct die_args args = {
		.regs = regs,
		.str = str,
		.err = err,
		.trapnr = trap,
		.signr = sig
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
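/*
 * Users such as kprobes attach to this chain so they can claim a fault
 * (by returning NOTIFY_STOP) before the normal handling below runs.
 */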
void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;

	if (yes) {
		oops_in_progress = 1;
		return;
	}
#ifdef CONFIG_VT
	unblank_screen();
#endif
	/*
	 * OK, the message is on the console.  Now we call printk()
	 * without oops_in_progress set so that printk will give klogd
	 * a poke.  Hold onto your hats...
	 */
	oops_in_progress = 0;
	console_loglevel = 15;		/* NMI oopser may have shut the console up */
	printk(" ");
	console_loglevel = loglevel_save;
}
/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
			   Need to figure out under what instruction mode the
			   instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well known
			   segments or kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
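/*
 * The scan above is bounded to 15 bytes because that is the architectural
 * maximum length of a single x86 instruction; anything longer cannot be a
 * valid prefix sequence ending in a prefetch opcode.
 */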
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	asm("movq %%cr3,%0" : "=r" (pgd));

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;

	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
}
int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (is_init(tsk))
		return 1;
	if (tsk->ptrace & PT_PTRACED)
		return 0;
	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
}
/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	__flush_tlb_all();
	return 0;
}
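/*
 * Rationale: vmalloc mappings are created only in the init_mm ("reference")
 * page tables.  A process whose top-level PGD predates the mapping takes a
 * fault on first touch; vmalloc_fault() then copies the missing PGD entry
 * from init_mm instead of consulting any VMA, which is why it can run
 * without taking mmap_sem or any other lock.
 */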
int page_fault_trace = 0;
int exception_trace = 1;
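/*
 * page_fault_trace is switched on by the "pagefaulttrace" boot parameter
 * (see the __setup() at the end of this file); exception_trace gates the
 * one-line log message printed for unhandled user-space segfaults.
 */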
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;
	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	__asm__("movq %%cr2,%0":"=r" (address));

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs,
				      error_code, 14, SIGSEGV) == NOTIFY_STOP)
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code,
			      14, SIGSEGV) == NOTIFY_STOP)
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;
 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
			goto bad_area;
	}
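	/*
	 * The slack above comes from the x86 'enter' instruction: it can
	 * allocate a frame of up to 64KB (its 16-bit immediate) and push up
	 * to 31 nested frame pointers below the current stack pointer, so
	 * writes that far under %rsp are still considered legitimate.
	 */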
	if (expand_stack(vma, address))
		goto bad_area;
	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;
	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
			"%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
				tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
				tsk->comm, tsk->pid, address, regs->rip,
				regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}
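	/*
	 * For the user-mode path above, info.si_code was either SEGV_MAPERR
	 * (set at entry: no VMA covered the address) or SEGV_ACCERR (set at
	 * good_area: a VMA existed but refused the access), so the siginfo
	 * delivered to user space distinguishes the two cases.
	 */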
no_context:
	/* Are we prepared to handle this kernel fault?  */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);
	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_exit(SIGKILL);
	goto no_context;
do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			for (page = pgd_list; page;
			     page = (struct page *)page->index) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
				(__START_KERNEL & PGDIR_MASK)));
}
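/*
 * vmalloc_sync_all() is the eager counterpart of vmalloc_fault(): it walks
 * every PGD on pgd_list and copies in any vmalloc-range entries already
 * present in init_mm.  register_page_fault_notifier() calls it up front so
 * that code run from the notifier chain does not itself fault on a
 * not-yet-synced vmalloc mapping while a fault is already being handled.
 */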
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);
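/*
 * Booting with "pagefaulttrace" on the kernel command line therefore makes
 * do_page_fault() printk one line (rip, rsp, cs, ss, address, error code)
 * for every fault taken, which is useful when debugging early page table or
 * exception handling problems but far too verbose for normal operation.
 */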