/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 */
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/*
 * Page fault error code bits:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
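
/*
 * These flags mirror the hardware error code that the CPU pushes on the
 * stack for a page fault (exception vector 14), so error_code can be
 * tested directly against the PF_* masks throughout this file.
 */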
static inline int notify_page_fault(struct pt_regs *regs)
{
	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *max_instr;

	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;
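
	/*
	 * Scan at most 15 bytes (the architectural limit on x86 instruction
	 * length) starting at the faulting instruction and stop as soon as a
	 * byte is seen that cannot be an instruction prefix.
	 */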
	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;

		/*
		 * Values 0x26, 0x2E, 0x36, 0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present, so
		 * X86_64 will never get here anyway.
		 */
		scan_more = ((instr_lo & 7) == 0x6);

		/*
		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
		 * Need to figure out under what instruction mode the
		 * instruction was issued.  Could check the LDT for lm,
		 * but for now it's good enough to assume that long
		 * mode only uses well known segments or kernel.
		 */
		scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);

		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		scan_more = (instr_lo & 0xC) == 0x4;

		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		scan_more = !instr_lo || (instr_lo>>1) == 1;

		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			break;
		prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
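		/*
		 * 0x0F 0x0D is the AMD 3DNow! PREFETCH/PREFETCHW opcode and
		 * 0x0F 0x18 is the SSE prefetch-hint group, so only faults
		 * raised by software prefetches are treated as spurious.
		 */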
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
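
/*
 * Dump the page-table entries that map 'address': walk the PGD, PUD, PMD
 * and PTE in turn, printing each entry, and stop early when an entry is
 * not present (or maps a large page at the PMD level).
 */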
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();
	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note that we only handle kernel-mode faults here.
   This does nothing on X86_32. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
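	/*
	 * The fault address equals RIP and its upper 32 bits are zero, which
	 * is the signature of the erratum: restore the upper bits and see
	 * whether the full address would land in kernel or module text.
	 */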
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk(errata93_warning);

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
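
	/*
	 * The variant below (the 64-bit path) instead walks the 'reference'
	 * kernel page table in init_mm level by level: a missing PGD entry
	 * is copied from the reference table, while the lower levels are
	 * shared, so any mismatch there is a bug rather than a missing sync.
	 */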
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */
	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
int show_unhandled_signals = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;
	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * ((error_code & PF_USER) == 0) and that the fault was not a
	 * protection error ((error_code & (PF_PROT|PF_RSVD)) == 0).
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
	vma = find_vma(mm, address);
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	si_code = SEGV_ACCERR;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
	case PF_PROT:	/* read, present */
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
	}
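	/*
	 * Account the fault against the task: a major fault is one that had
	 * to block (e.g. wait for I/O); everything else counts as minor.
	 */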
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}

	up_read(&mm->mmap_sem);
	return;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		if (is_prefetch(regs, address, error_code))
			return;
		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;
		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;
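	/*
	 * No fixup entry for regs->ip means the kernel really did a bad
	 * access: fall through to the CPU/BIOS workarounds and, failing
	 * those, to the oops path below.
	 */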
	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;
	if (is_errata93(regs, address))
		return;
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();
	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip, 1);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
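
/*
 * Ensure that the vmalloc portion of every process's PGD is in sync with
 * the reference kernel page table: any populated kernel PGD entry in that
 * range is copied into each PGD on pgd_list that is still missing it.
 */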
void vmalloc_sync_all(void)
{
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * of course).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;
	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}

	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}