/*
 * Copyright (C) 1995  Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
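/*
 * Example (illustrative, not from the original source): a user-mode write
 * to a present but read-only page arrives with
 * error_code == (PF_PROT | PF_WRITE | PF_USER) == 0x7, while a user-mode
 * read of an unmapped address shows up as PF_USER alone.
 */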
static inline int notify_page_fault(struct pt_regs *regs)
{
	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
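			/*
			 * kprobe_fault_handler() gets first shot at faults
			 * raised by kprobed kernel code; 14 is the page-fault
			 * vector (#PF). When it handles the fault, this helper
			 * returns nonzero and do_page_fault() bails out early.
			 */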
/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	}
	instr = (unsigned char *)get_segment_eip(regs, &limit);

	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;
	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);

	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;
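	/*
	 * Scan at most 15 bytes (the maximum x86 instruction length) starting
	 * at the faulting instruction pointer: skip over any prefix bytes and
	 * check whether the opcode is a prefetch, whose spurious faults we
	 * want to ignore.
	 */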
	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (instr > (unsigned char *)limit)
			break;
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway.
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;

		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;

		case 0x0F:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
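			/*
			 * (0F 0D is the AMD 3DNow! PREFETCH/PREFETCHW opcode;
			 * 0F 18 encodes the SSE PREFETCHNTA/T0/T1/T2 hints.)
			 */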
			scan_more = 0;
			if (instr > (unsigned char *)limit)
				break;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		}
	}

	return prefetch;
}
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}
static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}
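/*
 * dump_pagetable() below walks the four-level page table for 'address' and
 * prints one entry per level, e.g. (illustrative values only)
 * "PGD 203067 PUD 3f3067 PMD 0"; the walk stops at the first entry that is
 * not present or cannot be read.
 */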
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
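	/*
	 * Erratum #93 clears the upper 32 bits of RIP; OR-ing them back in
	 * and checking whether the result lands in kernel text or the module
	 * area tells us whether this was such a truncated kernel address.
	 */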
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk(errata93_warning);
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
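/*
 * Background: each process has its own PGD, and PGD entries covering the
 * vmalloc area are filled in lazily from the init_mm reference page table.
 * The first access to a freshly vmalloc()ed address through a PGD that has
 * not been synced yet therefore faults, and is repaired here by copying the
 * missing top-level entry.
 */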
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
	/* Below here mismatches are bugs because these lower tables
	   are shared. */
	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();

	return 0;
}
int show_unhandled_signals = 1;
/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
	unsigned long flags;
	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;
	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
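	/*
	 * In PF_* terms: (error_code & 4) is PF_USER and (error_code & 9) is
	 * PF_PROT | PF_RSVD, which matches the
	 * !(error_code & (PF_RSVD | PF_USER | PF_PROT)) test below.
	 */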
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);
	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;
	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
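	/*
	 * The 65536 + 32 * sizeof(unsigned long) cushion above covers the
	 * worst case of "enter $65535, $31", which pushes up to 32 frame
	 * pointers and then subtracts 65535 from the stack pointer before
	 * the new frame is actually touched.
	 */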
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}
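	/*
	 * In other words, error_code & (PF_PROT|PF_WRITE) takes values 0..3:
	 * 3 = write to a present page (default case), PF_WRITE = write to a
	 * missing page, PF_PROT = read from a present page whose VMA lacks
	 * read permission, 0 = read from a missing page.
	 */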
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

	up_read(&mm->mmap_sem);
	return;
/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);
bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;
		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB. We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return. Any code segment in LDT is
		   compatibility mode. */
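		/*
		 * (Bit 2 of a segment selector is the Table Indicator: when
		 * it is set, %cs was loaded from the LDT.)
		 */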
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;
		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
			tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, tsk->pid, address, regs->ip,
			regs->sp, error_code);
		}
		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}
no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;
	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;
/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip, regs->bp);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;
do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}
DEFINE_SPINLOCK(pgd_lock);
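/*
 * vmalloc_sync_all() propagates kernel PGD entries for the vmalloc area into
 * the page tables of every process on pgd_list, so that accesses to
 * already-synced ranges never have to go through vmalloc_fault().
 */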
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic: insync can only get set bits added, and updates to
	   start are only improving performance (without affecting
	   correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}
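/*
 * The BUILD_BUG_ON() checks above verify that the module area falls within
 * the same top-level (PGD) slot as the kernel image mapping, which is always
 * present, so only the vmalloc range ever needs syncing (see the "Only
 * vmalloc may need PML4 syncups" note in do_page_fault()).
 */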