/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/* Page fault error code bits */
#define PF_PROT		(1<<0)	/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
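/*
 * These mirror the hardware #PF error code the CPU pushes: bit 0 set
 * means a protection violation on a present page (clear means no page
 * found), bit 1 a write access, bit 2 a fault taken in user mode,
 * bit 3 a reserved bit set in a paging-structure entry, and bit 4 an
 * instruction fetch (relevant with NX enabled).
 */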
static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);

int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);

static inline int notify_page_fault(struct pt_regs *regs, long err)
{
	struct die_args args = {
		.regs = regs,
		.str = "page fault",
		.err = err,
		.trapnr = 14,
		.signr = SIGSEGV
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain,
					  DIE_PAGE_FAULT, &args);
}
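/*
 * Sketch of a client (hypothetical, for illustration; kprobes registers
 * this way so it can claim faults on its own breakpoint pages before
 * the normal fault path runs):
 *
 *	static int my_pf_handler(struct notifier_block *nb,
 *				 unsigned long val, void *data)
 *	{
 *		struct die_args *args = data;
 *		if (val == DIE_PAGE_FAULT && fault_is_mine(args->regs))
 *			return NOTIFY_STOP;	// handled; do_page_fault backs off
 *		return NOTIFY_DONE;
 *	}
 *	static struct notifier_block my_pf_nb = { .notifier_call = my_pf_handler };
 *	register_page_fault_notifier(&my_pf_nb);
 *
 * (fault_is_mine() is a made-up predicate; NOTIFY_STOP is checked in
 * do_page_fault() below.)
 */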
/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an exec fault, ignore it */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
	/* x86 instructions are at most 15 bytes long */
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
			   Need to figure out under what instruction mode the
			   instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well-known
			   segments or the kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
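/*
 * Illustration: on affected CPUs, a software prefetch such as
 * "prefetchnta (%rax)" with a garbage pointer in %rax can raise a
 * spurious page fault even though prefetch hints are architecturally
 * defined not to fault; the decoder above spots the 0x0F 0x0D / 0x0F
 * 0x18 opcode at the faulting RIP so the fault can be dropped silently.
 */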
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	asm("movq %%cr3,%0" : "=r" (pgd));

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 erratum #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   Many BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	/* Re-create the full 64bit address and check it against the kernel/module text */
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
}
int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (is_init(tsk))
		return 1;
	if (tsk->ptrace & PT_PTRACED)
		return 0;
	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}
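/*
 * Used below to keep the log quiet for tasks that catch SIGSEGV
 * themselves: the segfault printk fires only when the signal would get
 * the default or ignored disposition and the task is not being ptraced.
 */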
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
}
/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);

	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
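/*
 * Returning -1 sends the fault down the regular path in do_page_fault(),
 * which for a kernel-mode access ends in an oops; returning 0 means the
 * top-level entry was synced from init_mm and the access can simply be
 * retried.
 */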
int page_fault_trace = 0;
int exception_trace = 1;
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	__asm__("movq %%cr2,%0":"=r" (address));

	info.si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
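	/* (In the bit definitions above: 4 is PF_USER, 9 is PF_PROT|PF_RSVD.) */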
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 * ('enter' can allocate up to 65535 bytes of frame and push
		 * up to 31 nested frame pointers, hence this allowance.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in the LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->rip,
					regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_init(current)) {
		/* Never kill init: back off and retry the fault instead */
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}
DEFINE_SPINLOCK(pgd_lock);
/* Assumed definition (dropped from this copy): the list of all process
   PGD pages, which the sync walk below iterates via page->lru. */
LIST_HEAD(pgd_list);
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
				(__START_KERNEL & PGDIR_MASK)));
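	/* The modules area needs no such syncing: it shares its top-level
	   entry with the kernel text mapping, which every PGD gets at
	   creation time, and the assertions above pin down that layout. */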
}

static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);