git.oblomov.eu Git - linux-2.6/blob - arch/x86_64/mm/fault.c

   1 /*
   2  *  linux/arch/x86-64/mm/fault.c
   3  *
   4  *  Copyright (C) 1995  Linus Torvalds
   5  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
   6  */
   7
   8 #include <linux/signal.h>
   9 #include <linux/sched.h>
  10 #include <linux/kernel.h>
  11 #include <linux/errno.h>
  12 #include <linux/string.h>
  13 #include <linux/types.h>
  14 #include <linux/ptrace.h>
  15 #include <linux/mman.h>
  16 #include <linux/mm.h>
  17 #include <linux/smp.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/interrupt.h>
  20 #include <linux/init.h>
  21 #include <linux/tty.h>
  22 #include <linux/vt_kern.h>              /* For unblank_screen() */
  23 #include <linux/compiler.h>
  24 #include <linux/vmalloc.h>
  25 #include <linux/module.h>
  26 #include <linux/kprobes.h>
  27 #include <linux/uaccess.h>
  28 #include <linux/kdebug.h>
  29
  30 #include <asm/system.h>
  31 #include <asm/pgalloc.h>
  32 #include <asm/smp.h>
  33 #include <asm/tlbflush.h>
  34 #include <asm/proto.h>
  35 #include <asm-generic/sections.h>
  36
  37 /* Page fault error code bits */
  38 #define PF_PROT (1<<0)          /* or no page found */
  39 #define PF_WRITE        (1<<1)
  40 #define PF_USER (1<<2)
  41 #define PF_RSVD (1<<3)
  42 #define PF_INSTR        (1<<4)
  43
  44 static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
  45
  46 /* Hook to register for page fault notifications */
  47 int register_page_fault_notifier(struct notifier_block *nb)
  48 {
  49         vmalloc_sync_all();
  50         return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
  51 }
  52 EXPORT_SYMBOL_GPL(register_page_fault_notifier);
  53
  54 int unregister_page_fault_notifier(struct notifier_block *nb)
  55 {
  56         return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
  57 }
  58 EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
  59
  60 static inline int notify_page_fault(struct pt_regs *regs, long err)
  61 {
  62         struct die_args args = {
  63                 .regs = regs,
  64                 .str = "page fault",
  65                 .err = err,
  66                 .trapnr = 14,
  67                 .signr = SIGSEGV
  68         };
  69         return atomic_notifier_call_chain(&notify_page_fault_chain,
  70                                           DIE_PAGE_FAULT, &args);
  71 }
  72
  73 /* Sometimes the CPU reports invalid exceptions on prefetch.
  74    Check that here and ignore.
  75    Opcode checker based on code by Richard Brunner */
  76 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
  77                                 unsigned long error_code)
  78 {
  79         unsigned char *instr;
  80         int scan_more = 1;
  81         int prefetch = 0;
  82         unsigned char *max_instr;
  83
  84         /* If it was a exec fault ignore */
  85         if (error_code & PF_INSTR)
  86                 return 0;
  87
  88         instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
  89         max_instr = instr + 15;
  90
  91         if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
  92                 return 0;
  93
  94         while (scan_more && instr < max_instr) {
  95                 unsigned char opcode;
  96                 unsigned char instr_hi;
  97                 unsigned char instr_lo;
  98
  99                 if (probe_kernel_address(instr, opcode))
 100                         break;
 101
 102                 instr_hi = opcode & 0xf0;
 103                 instr_lo = opcode & 0x0f;
 104                 instr++;
 105
 106                 switch (instr_hi) {
 107                 case 0x20:
 108                 case 0x30:
 109                         /* Values 0x26,0x2E,0x36,0x3E are valid x86
 110                            prefixes.  In long mode, the CPU will signal
 111                            invalid opcode if some of these prefixes are
 112                            present so we will never get here anyway */
 113                         scan_more = ((instr_lo & 7) == 0x6);
 114                         break;
 115
 116                 case 0x40:
 117                         /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
 118                            Need to figure out under what instruction mode the
 119                            instruction was issued ... */
 120                         /* Could check the LDT for lm, but for now it's good
 121                            enough to assume that long mode only uses well known
 122                            segments or kernel. */
 123                         scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
 124                         break;
 125
 126                 case 0x60:
 127                         /* 0x64 thru 0x67 are valid prefixes in all modes. */
 128                         scan_more = (instr_lo & 0xC) == 0x4;
 129                         break;
 130                 case 0xF0:
 131                         /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
 132                         scan_more = !instr_lo || (instr_lo>>1) == 1;
 133                         break;
 134                 case 0x00:
 135                         /* Prefetch instruction is 0x0F0D or 0x0F18 */
 136                         scan_more = 0;
 137                         if (probe_kernel_address(instr, opcode))
 138                                 break;
 139                         prefetch = (instr_lo == 0xF) &&
 140                                 (opcode == 0x0D || opcode == 0x18);
 141                         break;
 142                 default:
 143                         scan_more = 0;
 144                         break;
 145                 }
 146         }
 147         return prefetch;
 148 }
 149
 150 static int bad_address(void *p)
 151 {
 152         unsigned long dummy;
 153         return probe_kernel_address((unsigned long *)p, dummy);
 154 }
 155
 156 void dump_pagetable(unsigned long address)
 157 {
 158         pgd_t *pgd;
 159         pud_t *pud;
 160         pmd_t *pmd;
 161         pte_t *pte;
 162
 163         asm("movq %%cr3,%0" : "=r" (pgd));
 164
 165         pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
 166         pgd += pgd_index(address);
 167         if (bad_address(pgd)) goto bad;
 168         printk("PGD %lx ", pgd_val(*pgd));
 169         if (!pgd_present(*pgd)) goto ret;
 170
 171         pud = pud_offset(pgd, address);
 172         if (bad_address(pud)) goto bad;
 173         printk("PUD %lx ", pud_val(*pud));
 174         if (!pud_present(*pud)) goto ret;
 175
 176         pmd = pmd_offset(pud, address);
 177         if (bad_address(pmd)) goto bad;
 178         printk("PMD %lx ", pmd_val(*pmd));
 179         if (!pmd_present(*pmd)) goto ret;
 180
 181         pte = pte_offset_kernel(pmd, address);
 182         if (bad_address(pte)) goto bad;
 183         printk("PTE %lx", pte_val(*pte));
 184 ret:
 185         printk("\n");
 186         return;
 187 bad:
 188         printk("BAD\n");
 189 }
 190
 191 static const char errata93_warning[] =
 192 KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 193 KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
 194 KERN_ERR "******* Please consider a BIOS update.\n"
 195 KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
 196
 197 /* Workaround for K8 erratum #93 & buggy BIOS.
 198    BIOS SMM functions are required to use a specific workaround
 199    to avoid corruption of the 64bit RIP register on C stepping K8.
 200    A lot of BIOS that didn't get tested properly miss this.
 201    The OS sees this as a page fault with the upper 32bits of RIP cleared.
 202    Try to work around it here.
 203    Note we only handle faults in kernel here. */
 204
 205 static int is_errata93(struct pt_regs *regs, unsigned long address)
 206 {
 207         static int warned;
 208         if (address != regs->rip)
 209                 return 0;
 210         if ((address >> 32) != 0)
 211                 return 0;
 212         address |= 0xffffffffUL << 32;
 213         if ((address >= (u64)_stext && address <= (u64)_etext) ||
 214             (address >= MODULES_VADDR && address <= MODULES_END)) {
 215                 if (!warned) {
 216                         printk(errata93_warning);
 217                         warned = 1;
 218                 }
 219                 regs->rip = address;
 220                 return 1;
 221         }
 222         return 0;
 223 }
 224
 225 int unhandled_signal(struct task_struct *tsk, int sig)
 226 {
 227         if (is_init(tsk))
 228                 return 1;
 229         if (tsk->ptrace & PT_PTRACED)
 230                 return 0;
 231         return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
 232                 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
 233 }
 234
 235 static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 236                                  unsigned long error_code)
 237 {
 238         unsigned long flags = oops_begin();
 239         struct task_struct *tsk;
 240
 241         printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 242                current->comm, address);
 243         dump_pagetable(address);
 244         tsk = current;
 245         tsk->thread.cr2 = address;
 246         tsk->thread.trap_no = 14;
 247         tsk->thread.error_code = error_code;
 248         __die("Bad pagetable", regs, error_code);
 249         oops_end(flags);
 250         do_exit(SIGKILL);
 251 }
 252
 253 /*
 254  * Handle a fault on the vmalloc area
 255  *
 256  * This assumes no large pages in there.
 257  */
 258 static int vmalloc_fault(unsigned long address)
 259 {
 260         pgd_t *pgd, *pgd_ref;
 261         pud_t *pud, *pud_ref;
 262         pmd_t *pmd, *pmd_ref;
 263         pte_t *pte, *pte_ref;
 264
 265         /* Copy kernel mappings over when needed. This can also
 266            happen within a race in page table update. In the later
 267            case just flush. */
 268
 269         pgd = pgd_offset(current->mm ?: &init_mm, address);
 270         pgd_ref = pgd_offset_k(address);
 271         if (pgd_none(*pgd_ref))
 272                 return -1;
 273         if (pgd_none(*pgd))
 274                 set_pgd(pgd, *pgd_ref);
 275         else
 276                 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 277
 278         /* Below here mismatches are bugs because these lower tables
 279            are shared */
 280
 281         pud = pud_offset(pgd, address);
 282         pud_ref = pud_offset(pgd_ref, address);
 283         if (pud_none(*pud_ref))
 284                 return -1;
 285         if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
 286                 BUG();
 287         pmd = pmd_offset(pud, address);
 288         pmd_ref = pmd_offset(pud_ref, address);
 289         if (pmd_none(*pmd_ref))
 290                 return -1;
 291         if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
 292                 BUG();
 293         pte_ref = pte_offset_kernel(pmd_ref, address);
 294         if (!pte_present(*pte_ref))
 295                 return -1;
 296         pte = pte_offset_kernel(pmd, address);
 297         /* Don't use pte_page here, because the mappings can point
 298            outside mem_map, and the NUMA hash lookup cannot handle
 299            that. */
 300         if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
 301                 BUG();
 302         return 0;
 303 }
 304
 305 int page_fault_trace = 0;
 306 int exception_trace = 1;
 307
 308 /*
 309  * This routine handles page faults.  It determines the address,
 310  * and the problem, and then passes it off to one of the appropriate
 311  * routines.
 312  */
 313 asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 314                                         unsigned long error_code)
 315 {
 316         struct task_struct *tsk;
 317         struct mm_struct *mm;
 318         struct vm_area_struct * vma;
 319         unsigned long address;
 320         const struct exception_table_entry *fixup;
 321         int write;
 322         unsigned long flags;
 323         siginfo_t info;
 324
 325         tsk = current;
 326         mm = tsk->mm;
 327         prefetchw(&mm->mmap_sem);
 328
 329         /* get the address */
 330         __asm__("movq %%cr2,%0":"=r" (address));
 331
 332         info.si_code = SEGV_MAPERR;
 333
 334
 335         /*
 336          * We fault-in kernel-space virtual memory on-demand. The
 337          * 'reference' page table is init_mm.pgd.
 338          *
 339          * NOTE! We MUST NOT take any locks for this case. We may
 340          * be in an interrupt or a critical region, and should
 341          * only copy the information from the master page table,
 342          * nothing more.
 343          *
 344          * This verifies that the fault happens in kernel space
 345          * (error_code & 4) == 0, and that the fault was not a
 346          * protection error (error_code & 9) == 0.
 347          */
 348         if (unlikely(address >= TASK_SIZE64)) {
 349                 /*
 350                  * Don't check for the module range here: its PML4
 351                  * is always initialized because it's shared with the main
 352                  * kernel text. Only vmalloc may need PML4 syncups.
 353                  */
 354                 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
 355                       ((address >= VMALLOC_START && address < VMALLOC_END))) {
 356                         if (vmalloc_fault(address) >= 0)
 357                                 return;
 358                 }
 359                 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
 360                         return;
 361                 /*
 362                  * Don't take the mm semaphore here. If we fixup a prefetch
 363                  * fault we could otherwise deadlock.
 364                  */
 365                 goto bad_area_nosemaphore;
 366         }
 367
 368         if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
 369                 return;
 370
 371         if (likely(regs->eflags & X86_EFLAGS_IF))
 372                 local_irq_enable();
 373
 374         if (unlikely(page_fault_trace))
 375                 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
 376                        regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
 377
 378         if (unlikely(error_code & PF_RSVD))
 379                 pgtable_bad(address, regs, error_code);
 380
 381         /*
 382          * If we're in an interrupt or have no user
 383          * context, we must not take the fault..
 384          */
 385         if (unlikely(in_atomic() || !mm))
 386                 goto bad_area_nosemaphore;
 387
 388  again:
 389         /* When running in the kernel we expect faults to occur only to
 390          * addresses in user space.  All other faults represent errors in the
 391          * kernel and should generate an OOPS.  Unfortunatly, in the case of an
 392          * erroneous fault occurring in a code path which already holds mmap_sem
 393          * we will deadlock attempting to validate the fault against the
 394          * address space.  Luckily the kernel only validly references user
 395          * space from well defined areas of code, which are listed in the
 396          * exceptions table.
 397          *
 398          * As the vast majority of faults will be valid we will only perform
 399          * the source reference check when there is a possibilty of a deadlock.
 400          * Attempt to lock the address space, if we cannot we then validate the
 401          * source.  If this is invalid we can skip the address space check,
 402          * thus avoiding the deadlock.
 403          */
 404         if (!down_read_trylock(&mm->mmap_sem)) {
 405                 if ((error_code & PF_USER) == 0 &&
 406                     !search_exception_tables(regs->rip))
 407                         goto bad_area_nosemaphore;
 408                 down_read(&mm->mmap_sem);
 409         }
 410
 411         vma = find_vma(mm, address);
 412         if (!vma)
 413                 goto bad_area;
 414         if (likely(vma->vm_start <= address))
 415                 goto good_area;
 416         if (!(vma->vm_flags & VM_GROWSDOWN))
 417                 goto bad_area;
 418         if (error_code & 4) {
 419                 /* Allow userspace just enough access below the stack pointer
 420                  * to let the 'enter' instruction work.
 421                  */
 422                 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
 423                         goto bad_area;
 424         }
 425         if (expand_stack(vma, address))
 426                 goto bad_area;
 427 /*
 428  * Ok, we have a good vm_area for this memory access, so
 429  * we can handle it..
 430  */
 431 good_area:
 432         info.si_code = SEGV_ACCERR;
 433         write = 0;
 434         switch (error_code & (PF_PROT|PF_WRITE)) {
 435                 default:        /* 3: write, present */
 436                         /* fall through */
 437                 case PF_WRITE:          /* write, not present */
 438                         if (!(vma->vm_flags & VM_WRITE))
 439                                 goto bad_area;
 440                         write++;
 441                         break;
 442                 case PF_PROT:           /* read, present */
 443                         goto bad_area;
 444                 case 0:                 /* read, not present */
 445                         if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 446                                 goto bad_area;
 447         }
 448
 449         /*
 450          * If for any reason at all we couldn't handle the fault,
 451          * make sure we exit gracefully rather than endlessly redo
 452          * the fault.
 453          */
 454         switch (handle_mm_fault(mm, vma, address, write)) {
 455         case VM_FAULT_MINOR:
 456                 tsk->min_flt++;
 457                 break;
 458         case VM_FAULT_MAJOR:
 459                 tsk->maj_flt++;
 460                 break;
 461         case VM_FAULT_SIGBUS:
 462                 goto do_sigbus;
 463         default:
 464                 goto out_of_memory;
 465         }
 466
 467         up_read(&mm->mmap_sem);
 468         return;
 469
 470 /*
 471  * Something tried to access memory that isn't in our memory map..
 472  * Fix it, but check if it's kernel or user first..
 473  */
 474 bad_area:
 475         up_read(&mm->mmap_sem);
 476
 477 bad_area_nosemaphore:
 478         /* User mode accesses just cause a SIGSEGV */
 479         if (error_code & PF_USER) {
 480                 if (is_prefetch(regs, address, error_code))
 481                         return;
 482
 483                 /* Work around K8 erratum #100 K8 in compat mode
 484                    occasionally jumps to illegal addresses >4GB.  We
 485                    catch this here in the page fault handler because
 486                    these addresses are not reachable. Just detect this
 487                    case and return.  Any code segment in LDT is
 488                    compatibility mode. */
 489                 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
 490                     (address >> 32))
 491                         return;
 492
 493                 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
 494                         printk(
 495                        "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
 496                                         tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
 497                                         tsk->comm, tsk->pid, address, regs->rip,
 498                                         regs->rsp, error_code);
 499                 }
 500
 501                 tsk->thread.cr2 = address;
 502                 /* Kernel addresses are always protection faults */
 503                 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
 504                 tsk->thread.trap_no = 14;
 505                 info.si_signo = SIGSEGV;
 506                 info.si_errno = 0;
 507                 /* info.si_code has been set above */
 508                 info.si_addr = (void __user *)address;
 509                 force_sig_info(SIGSEGV, &info, tsk);
 510                 return;
 511         }
 512
 513 no_context:
 514
 515         /* Are we prepared to handle this kernel fault?  */
 516         fixup = search_exception_tables(regs->rip);
 517         if (fixup) {
 518                 regs->rip = fixup->fixup;
 519                 return;
 520         }
 521
 522         /*
 523          * Hall of shame of CPU/BIOS bugs.
 524          */
 525
 526         if (is_prefetch(regs, address, error_code))
 527                 return;
 528
 529         if (is_errata93(regs, address))
 530                 return;
 531
 532 /*
 533  * Oops. The kernel tried to access some bad page. We'll have to
 534  * terminate things with extreme prejudice.
 535  */
 536
 537         flags = oops_begin();
 538
 539         if (address < PAGE_SIZE)
 540                 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
 541         else
 542                 printk(KERN_ALERT "Unable to handle kernel paging request");
 543         printk(" at %016lx RIP: \n" KERN_ALERT,address);
 544         printk_address(regs->rip);
 545         dump_pagetable(address);
 546         tsk->thread.cr2 = address;
 547         tsk->thread.trap_no = 14;
 548         tsk->thread.error_code = error_code;
 549         __die("Oops", regs, error_code);
 550         /* Executive summary in case the body of the oops scrolled away */
 551         printk(KERN_EMERG "CR2: %016lx\n", address);
 552         oops_end(flags);
 553         do_exit(SIGKILL);
 554
 555 /*
 556  * We ran out of memory, or some other thing happened to us that made
 557  * us unable to handle the page fault gracefully.
 558  */
 559 out_of_memory:
 560         up_read(&mm->mmap_sem);
 561         if (is_init(current)) {
 562                 yield();
 563                 goto again;
 564         }
 565         printk("VM: killing process %s\n", tsk->comm);
 566         if (error_code & 4)
 567                 do_exit(SIGKILL);
 568         goto no_context;
 569
 570 do_sigbus:
 571         up_read(&mm->mmap_sem);
 572
 573         /* Kernel mode? Handle exceptions or die */
 574         if (!(error_code & PF_USER))
 575                 goto no_context;
 576
 577         tsk->thread.cr2 = address;
 578         tsk->thread.error_code = error_code;
 579         tsk->thread.trap_no = 14;
 580         info.si_signo = SIGBUS;
 581         info.si_errno = 0;
 582         info.si_code = BUS_ADRERR;
 583         info.si_addr = (void __user *)address;
 584         force_sig_info(SIGBUS, &info, tsk);
 585         return;
 586 }
 587
 588 DEFINE_SPINLOCK(pgd_lock);
 589 LIST_HEAD(pgd_list);
 590
 591 void vmalloc_sync_all(void)
 592 {
 593         /* Note that races in the updates of insync and start aren't
 594            problematic:
 595            insync can only get set bits added, and updates to start are only
 596            improving performance (without affecting correctness if undone). */
 597         static DECLARE_BITMAP(insync, PTRS_PER_PGD);
 598         static unsigned long start = VMALLOC_START & PGDIR_MASK;
 599         unsigned long address;
 600
 601         for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
 602                 if (!test_bit(pgd_index(address), insync)) {
 603                         const pgd_t *pgd_ref = pgd_offset_k(address);
 604                         struct page *page;
 605
 606                         if (pgd_none(*pgd_ref))
 607                                 continue;
 608                         spin_lock(&pgd_lock);
 609                         list_for_each_entry(page, &pgd_list, lru) {
 610                                 pgd_t *pgd;
 611                                 pgd = (pgd_t *)page_address(page) + pgd_index(address);
 612                                 if (pgd_none(*pgd))
 613                                         set_pgd(pgd, *pgd_ref);
 614                                 else
 615                                         BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 616                         }
 617                         spin_unlock(&pgd_lock);
 618                         set_bit(pgd_index(address), insync);
 619                 }
 620                 if (address == start)
 621                         start = address + PGDIR_SIZE;
 622         }
 623         /* Check that there is no need to do the same for the modules area. */
 624         BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
 625         BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
 626                                 (__START_KERNEL & PGDIR_MASK)));
 627 }
 628
 629 static int __init enable_pagefaulttrace(char *str)
 630 {
 631         page_fault_trace = 1;
 632         return 1;
 633 }
 634 __setup("pagefaulttrace", enable_pagefaulttrace);