git.oblomov.eu Git - linux-2.6/blob - arch/x86/mm/pageattr.c

   1 /*
   2  * Copyright 2002 Andi Kleen, SuSE Labs.
   3  * Thanks to Ben LaHaise for precious feedback.
   4  */
   5 #include <linux/highmem.h>
   6 #include <linux/bootmem.h>
   7 #include <linux/module.h>
   8 #include <linux/sched.h>
   9 #include <linux/slab.h>
  10 #include <linux/mm.h>
  11 #include <linux/interrupt.h>
  12
  13 #include <asm/e820.h>
  14 #include <asm/processor.h>
  15 #include <asm/tlbflush.h>
  16 #include <asm/sections.h>
  17 #include <asm/uaccess.h>
  18 #include <asm/pgalloc.h>
  19 #include <asm/proto.h>
  20
  21 /*
  22  * The current flushing context - we pass it instead of 5 arguments:
  23  */
  24 struct cpa_data {
  25         unsigned long   vaddr;
  26         pgprot_t        mask_set;
  27         pgprot_t        mask_clr;
  28         int             numpages;
  29         int             flushtlb;
  30         unsigned long   pfn;
  31 };
  32
  33 #ifdef CONFIG_X86_64
  34
  35 static inline unsigned long highmap_start_pfn(void)
  36 {
  37         return __pa(_text) >> PAGE_SHIFT;
  38 }
  39
  40 static inline unsigned long highmap_end_pfn(void)
  41 {
  42         return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
  43 }
  44
  45 #endif
  46
  47 #ifdef CONFIG_DEBUG_PAGEALLOC
  48 # define debug_pagealloc 1
  49 #else
  50 # define debug_pagealloc 0
  51 #endif
  52
  53 static inline int
  54 within(unsigned long addr, unsigned long start, unsigned long end)
  55 {
  56         return addr >= start && addr < end;
  57 }
  58
  59 /*
  60  * Flushing functions
  61  */
  62
  63 /**
  64  * clflush_cache_range - flush a cache range with clflush
  65  * @addr:       virtual start address
  66  * @size:       number of bytes to flush
  67  *
  68  * clflush is an unordered instruction which needs fencing with mfence
  69  * to avoid ordering issues.
  70  */
  71 void clflush_cache_range(void *vaddr, unsigned int size)
  72 {
  73         void *vend = vaddr + size - 1;
  74
  75         mb();
  76
  77         for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
  78                 clflush(vaddr);
  79         /*
  80          * Flush any possible final partial cacheline:
  81          */
  82         clflush(vend);
  83
  84         mb();
  85 }
  86
  87 static void __cpa_flush_all(void *arg)
  88 {
  89         unsigned long cache = (unsigned long)arg;
  90
  91         /*
  92          * Flush all to work around Errata in early athlons regarding
  93          * large page flushing.
  94          */
  95         __flush_tlb_all();
  96
  97         if (cache && boot_cpu_data.x86_model >= 4)
  98                 wbinvd();
  99 }
 100
 101 static void cpa_flush_all(unsigned long cache)
 102 {
 103         BUG_ON(irqs_disabled());
 104
 105         on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1);
 106 }
 107
 108 static void __cpa_flush_range(void *arg)
 109 {
 110         /*
 111          * We could optimize that further and do individual per page
 112          * tlb invalidates for a low number of pages. Caveat: we must
 113          * flush the high aliases on 64bit as well.
 114          */
 115         __flush_tlb_all();
 116 }
 117
 118 static void cpa_flush_range(unsigned long start, int numpages, int cache)
 119 {
 120         unsigned int i, level;
 121         unsigned long addr;
 122
 123         BUG_ON(irqs_disabled());
 124         WARN_ON(PAGE_ALIGN(start) != start);
 125
 126         on_each_cpu(__cpa_flush_range, NULL, 1, 1);
 127
 128         if (!cache)
 129                 return;
 130
 131         /*
 132          * We only need to flush on one CPU,
 133          * clflush is a MESI-coherent instruction that
 134          * will cause all other CPUs to flush the same
 135          * cachelines:
 136          */
 137         for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
 138                 pte_t *pte = lookup_address(addr, &level);
 139
 140                 /*
 141                  * Only flush present addresses:
 142                  */
 143                 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 144                         clflush_cache_range((void *) addr, PAGE_SIZE);
 145         }
 146 }
 147
 148 /*
 149  * Certain areas of memory on x86 require very specific protection flags,
 150  * for example the BIOS area or kernel text. Callers don't always get this
 151  * right (again, ioremap() on BIOS memory is not uncommon) so this function
 152  * checks and fixes these known static required protection bits.
 153  */
 154 static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 155                                    unsigned long pfn)
 156 {
 157         pgprot_t forbidden = __pgprot(0);
 158
 159         /*
 160          * The BIOS area between 640k and 1Mb needs to be executable for
 161          * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 162          */
 163         if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 164                 pgprot_val(forbidden) |= _PAGE_NX;
 165
 166         /*
 167          * The kernel text needs to be executable for obvious reasons
 168          * Does not cover __inittext since that is gone later on. On
 169          * 64bit we do not enforce !NX on the low mapping
 170          */
 171         if (within(address, (unsigned long)_text, (unsigned long)_etext))
 172                 pgprot_val(forbidden) |= _PAGE_NX;
 173
 174         /*
 175          * The .rodata section needs to be read-only. Using the pfn
 176          * catches all aliases.
 177          */
 178         if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
 179                    __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 180                 pgprot_val(forbidden) |= _PAGE_RW;
 181
 182         prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
 183
 184         return prot;
 185 }
 186
 187 /*
 188  * Lookup the page table entry for a virtual address. Return a pointer
 189  * to the entry and the level of the mapping.
 190  *
 191  * Note: We return pud and pmd either when the entry is marked large
 192  * or when the present bit is not set. Otherwise we would return a
 193  * pointer to a nonexisting mapping.
 194  */
 195 pte_t *lookup_address(unsigned long address, unsigned int *level)
 196 {
 197         pgd_t *pgd = pgd_offset_k(address);
 198         pud_t *pud;
 199         pmd_t *pmd;
 200
 201         *level = PG_LEVEL_NONE;
 202
 203         if (pgd_none(*pgd))
 204                 return NULL;
 205
 206         pud = pud_offset(pgd, address);
 207         if (pud_none(*pud))
 208                 return NULL;
 209
 210         *level = PG_LEVEL_1G;
 211         if (pud_large(*pud) || !pud_present(*pud))
 212                 return (pte_t *)pud;
 213
 214         pmd = pmd_offset(pud, address);
 215         if (pmd_none(*pmd))
 216                 return NULL;
 217
 218         *level = PG_LEVEL_2M;
 219         if (pmd_large(*pmd) || !pmd_present(*pmd))
 220                 return (pte_t *)pmd;
 221
 222         *level = PG_LEVEL_4K;
 223
 224         return pte_offset_kernel(pmd, address);
 225 }
 226
 227 /*
 228  * Set the new pmd in all the pgds we know about:
 229  */
 230 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 231 {
 232         /* change init_mm */
 233         set_pte_atomic(kpte, pte);
 234 #ifdef CONFIG_X86_32
 235         if (!SHARED_KERNEL_PMD) {
 236                 struct page *page;
 237
 238                 list_for_each_entry(page, &pgd_list, lru) {
 239                         pgd_t *pgd;
 240                         pud_t *pud;
 241                         pmd_t *pmd;
 242
 243                         pgd = (pgd_t *)page_address(page) + pgd_index(address);
 244                         pud = pud_offset(pgd, address);
 245                         pmd = pmd_offset(pud, address);
 246                         set_pte_atomic((pte_t *)pmd, pte);
 247                 }
 248         }
 249 #endif
 250 }
 251
 252 static int
 253 try_preserve_large_page(pte_t *kpte, unsigned long address,
 254                         struct cpa_data *cpa)
 255 {
 256         unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
 257         pte_t new_pte, old_pte, *tmp;
 258         pgprot_t old_prot, new_prot;
 259         int i, do_split = 1;
 260         unsigned int level;
 261
 262         spin_lock_irqsave(&pgd_lock, flags);
 263         /*
 264          * Check for races, another CPU might have split this page
 265          * up already:
 266          */
 267         tmp = lookup_address(address, &level);
 268         if (tmp != kpte)
 269                 goto out_unlock;
 270
 271         switch (level) {
 272         case PG_LEVEL_2M:
 273                 psize = PMD_PAGE_SIZE;
 274                 pmask = PMD_PAGE_MASK;
 275                 break;
 276 #ifdef CONFIG_X86_64
 277         case PG_LEVEL_1G:
 278                 psize = PUD_PAGE_SIZE;
 279                 pmask = PUD_PAGE_MASK;
 280                 break;
 281 #endif
 282         default:
 283                 do_split = -EINVAL;
 284                 goto out_unlock;
 285         }
 286
 287         /*
 288          * Calculate the number of pages, which fit into this large
 289          * page starting at address:
 290          */
 291         nextpage_addr = (address + psize) & pmask;
 292         numpages = (nextpage_addr - address) >> PAGE_SHIFT;
 293         if (numpages < cpa->numpages)
 294                 cpa->numpages = numpages;
 295
 296         /*
 297          * We are safe now. Check whether the new pgprot is the same:
 298          */
 299         old_pte = *kpte;
 300         old_prot = new_prot = pte_pgprot(old_pte);
 301
 302         pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
 303         pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
 304
 305         /*
 306          * old_pte points to the large page base address. So we need
 307          * to add the offset of the virtual address:
 308          */
 309         pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
 310         cpa->pfn = pfn;
 311
 312         new_prot = static_protections(new_prot, address, pfn);
 313
 314         /*
 315          * We need to check the full range, whether
 316          * static_protection() requires a different pgprot for one of
 317          * the pages in the range we try to preserve:
 318          */
 319         addr = address + PAGE_SIZE;
 320         pfn++;
 321         for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
 322                 pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
 323
 324                 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
 325                         goto out_unlock;
 326         }
 327
 328         /*
 329          * If there are no changes, return. maxpages has been updated
 330          * above:
 331          */
 332         if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
 333                 do_split = 0;
 334                 goto out_unlock;
 335         }
 336
 337         /*
 338          * We need to change the attributes. Check, whether we can
 339          * change the large page in one go. We request a split, when
 340          * the address is not aligned and the number of pages is
 341          * smaller than the number of pages in the large page. Note
 342          * that we limited the number of possible pages already to
 343          * the number of pages in the large page.
 344          */
 345         if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
 346                 /*
 347                  * The address is aligned and the number of pages
 348                  * covers the full page.
 349                  */
 350                 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
 351                 __set_pmd_pte(kpte, address, new_pte);
 352                 cpa->flushtlb = 1;
 353                 do_split = 0;
 354         }
 355
 356 out_unlock:
 357         spin_unlock_irqrestore(&pgd_lock, flags);
 358
 359         return do_split;
 360 }
 361
 362 static LIST_HEAD(page_pool);
 363 static unsigned long pool_size, pool_pages, pool_low;
 364 static unsigned long pool_used, pool_failed;
 365
 366 static void cpa_fill_pool(struct page **ret)
 367 {
 368         gfp_t gfp = GFP_KERNEL;
 369         unsigned long flags;
 370         struct page *p;
 371
 372         /*
 373          * Avoid recursion (on debug-pagealloc) and also signal
 374          * our priority to get to these pagetables:
 375          */
 376         if (current->flags & PF_MEMALLOC)
 377                 return;
 378         current->flags |= PF_MEMALLOC;
 379
 380         /*
 381          * Allocate atomically from atomic contexts:
 382          */
 383         if (in_atomic() || irqs_disabled() || debug_pagealloc)
 384                 gfp =  GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
 385
 386         while (pool_pages < pool_size || (ret && !*ret)) {
 387                 p = alloc_pages(gfp, 0);
 388                 if (!p) {
 389                         pool_failed++;
 390                         break;
 391                 }
 392                 /*
 393                  * If the call site needs a page right now, provide it:
 394                  */
 395                 if (ret && !*ret) {
 396                         *ret = p;
 397                         continue;
 398                 }
 399                 spin_lock_irqsave(&pgd_lock, flags);
 400                 list_add(&p->lru, &page_pool);
 401                 pool_pages++;
 402                 spin_unlock_irqrestore(&pgd_lock, flags);
 403         }
 404
 405         current->flags &= ~PF_MEMALLOC;
 406 }
 407
 408 #define SHIFT_MB                (20 - PAGE_SHIFT)
 409 #define ROUND_MB_GB             ((1 << 10) - 1)
 410 #define SHIFT_MB_GB             10
 411 #define POOL_PAGES_PER_GB       16
 412
 413 void __init cpa_init(void)
 414 {
 415         struct sysinfo si;
 416         unsigned long gb;
 417
 418         si_meminfo(&si);
 419         /*
 420          * Calculate the number of pool pages:
 421          *
 422          * Convert totalram (nr of pages) to MiB and round to the next
 423          * GiB. Shift MiB to Gib and multiply the result by
 424          * POOL_PAGES_PER_GB:
 425          */
 426         if (debug_pagealloc) {
 427                 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
 428                 pool_size = POOL_PAGES_PER_GB * gb;
 429         } else {
 430                 pool_size = 1;
 431         }
 432         pool_low = pool_size;
 433
 434         cpa_fill_pool(NULL);
 435         printk(KERN_DEBUG
 436                "CPA: page pool initialized %lu of %lu pages preallocated\n",
 437                pool_pages, pool_size);
 438 }
 439
 440 static int split_large_page(pte_t *kpte, unsigned long address)
 441 {
 442         unsigned long flags, pfn, pfninc = 1;
 443         unsigned int i, level;
 444         pte_t *pbase, *tmp;
 445         pgprot_t ref_prot;
 446         struct page *base;
 447
 448         /*
 449          * Get a page from the pool. The pool list is protected by the
 450          * pgd_lock, which we have to take anyway for the split
 451          * operation:
 452          */
 453         spin_lock_irqsave(&pgd_lock, flags);
 454         if (list_empty(&page_pool)) {
 455                 spin_unlock_irqrestore(&pgd_lock, flags);
 456                 base = NULL;
 457                 cpa_fill_pool(&base);
 458                 if (!base)
 459                         return -ENOMEM;
 460                 spin_lock_irqsave(&pgd_lock, flags);
 461         } else {
 462                 base = list_first_entry(&page_pool, struct page, lru);
 463                 list_del(&base->lru);
 464                 pool_pages--;
 465
 466                 if (pool_pages < pool_low)
 467                         pool_low = pool_pages;
 468         }
 469
 470         /*
 471          * Check for races, another CPU might have split this page
 472          * up for us already:
 473          */
 474         tmp = lookup_address(address, &level);
 475         if (tmp != kpte)
 476                 goto out_unlock;
 477
 478         pbase = (pte_t *)page_address(base);
 479 #ifdef CONFIG_X86_32
 480         paravirt_alloc_pt(&init_mm, page_to_pfn(base));
 481 #endif
 482         ref_prot = pte_pgprot(pte_clrhuge(*kpte));
 483
 484 #ifdef CONFIG_X86_64
 485         if (level == PG_LEVEL_1G) {
 486                 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
 487                 pgprot_val(ref_prot) |= _PAGE_PSE;
 488         }
 489 #endif
 490
 491         /*
 492          * Get the target pfn from the original entry:
 493          */
 494         pfn = pte_pfn(*kpte);
 495         for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
 496                 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 497
 498         /*
 499          * Install the new, split up pagetable. Important details here:
 500          *
 501          * On Intel the NX bit of all levels must be cleared to make a
 502          * page executable. See section 4.13.2 of Intel 64 and IA-32
 503          * Architectures Software Developer's Manual).
 504          *
 505          * Mark the entry present. The current mapping might be
 506          * set to not present, which we preserved above.
 507          */
 508         ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
 509         pgprot_val(ref_prot) |= _PAGE_PRESENT;
 510         __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
 511         base = NULL;
 512
 513 out_unlock:
 514         /*
 515          * If we dropped out via the lookup_address check under
 516          * pgd_lock then stick the page back into the pool:
 517          */
 518         if (base) {
 519                 list_add(&base->lru, &page_pool);
 520                 pool_pages++;
 521         } else
 522                 pool_used++;
 523         spin_unlock_irqrestore(&pgd_lock, flags);
 524
 525         return 0;
 526 }
 527
 528 static int __change_page_attr(struct cpa_data *cpa, int primary)
 529 {
 530         unsigned long address = cpa->vaddr;
 531         int do_split, err;
 532         unsigned int level;
 533         pte_t *kpte, old_pte;
 534
 535 repeat:
 536         kpte = lookup_address(address, &level);
 537         if (!kpte)
 538                 return primary ? -EINVAL : 0;
 539
 540         old_pte = *kpte;
 541         if (!pte_val(old_pte)) {
 542                 if (!primary)
 543                         return 0;
 544                 printk(KERN_WARNING "CPA: called for zero pte. "
 545                        "vaddr = %lx cpa->vaddr = %lx\n", address,
 546                        cpa->vaddr);
 547                 WARN_ON(1);
 548                 return -EINVAL;
 549         }
 550
 551         if (level == PG_LEVEL_4K) {
 552                 pte_t new_pte;
 553                 pgprot_t new_prot = pte_pgprot(old_pte);
 554                 unsigned long pfn = pte_pfn(old_pte);
 555
 556                 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
 557                 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
 558
 559                 new_prot = static_protections(new_prot, address, pfn);
 560
 561                 /*
 562                  * We need to keep the pfn from the existing PTE,
 563                  * after all we're only going to change it's attributes
 564                  * not the memory it points to
 565                  */
 566                 new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
 567                 cpa->pfn = pfn;
 568                 /*
 569                  * Do we really change anything ?
 570                  */
 571                 if (pte_val(old_pte) != pte_val(new_pte)) {
 572                         set_pte_atomic(kpte, new_pte);
 573                         cpa->flushtlb = 1;
 574                 }
 575                 cpa->numpages = 1;
 576                 return 0;
 577         }
 578
 579         /*
 580          * Check, whether we can keep the large page intact
 581          * and just change the pte:
 582          */
 583         do_split = try_preserve_large_page(kpte, address, cpa);
 584         /*
 585          * When the range fits into the existing large page,
 586          * return. cp->numpages and cpa->tlbflush have been updated in
 587          * try_large_page:
 588          */
 589         if (do_split <= 0)
 590                 return do_split;
 591
 592         /*
 593          * We have to split the large page:
 594          */
 595         err = split_large_page(kpte, address);
 596         if (!err) {
 597                 cpa->flushtlb = 1;
 598                 goto repeat;
 599         }
 600
 601         return err;
 602 }
 603
 604 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
 605
 606 static int cpa_process_alias(struct cpa_data *cpa)
 607 {
 608         struct cpa_data alias_cpa;
 609         int ret = 0;
 610
 611         if (cpa->pfn > max_pfn_mapped)
 612                 return 0;
 613
 614         /*
 615          * No need to redo, when the primary call touched the direct
 616          * mapping already:
 617          */
 618         if (!within(cpa->vaddr, PAGE_OFFSET,
 619                     PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
 620
 621                 alias_cpa = *cpa;
 622                 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
 623
 624                 ret = __change_page_attr_set_clr(&alias_cpa, 0);
 625         }
 626
 627 #ifdef CONFIG_X86_64
 628         if (ret)
 629                 return ret;
 630         /*
 631          * No need to redo, when the primary call touched the high
 632          * mapping already:
 633          */
 634         if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
 635                 return 0;
 636
 637         /*
 638          * If the physical address is inside the kernel map, we need
 639          * to touch the high mapped kernel as well:
 640          */
 641         if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
 642                 return 0;
 643
 644         alias_cpa = *cpa;
 645         alias_cpa.vaddr =
 646                 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
 647
 648         /*
 649          * The high mapping range is imprecise, so ignore the return value.
 650          */
 651         __change_page_attr_set_clr(&alias_cpa, 0);
 652 #endif
 653         return ret;
 654 }
 655
 656 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 657 {
 658         int ret, numpages = cpa->numpages;
 659
 660         while (numpages) {
 661                 /*
 662                  * Store the remaining nr of pages for the large page
 663                  * preservation check.
 664                  */
 665                 cpa->numpages = numpages;
 666
 667                 ret = __change_page_attr(cpa, checkalias);
 668                 if (ret)
 669                         return ret;
 670
 671                 if (checkalias) {
 672                         ret = cpa_process_alias(cpa);
 673                         if (ret)
 674                                 return ret;
 675                 }
 676
 677                 /*
 678                  * Adjust the number of pages with the result of the
 679                  * CPA operation. Either a large page has been
 680                  * preserved or a single page update happened.
 681                  */
 682                 BUG_ON(cpa->numpages > numpages);
 683                 numpages -= cpa->numpages;
 684                 cpa->vaddr += cpa->numpages * PAGE_SIZE;
 685         }
 686         return 0;
 687 }
 688
 689 static inline int cache_attr(pgprot_t attr)
 690 {
 691         return pgprot_val(attr) &
 692                 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
 693 }
 694
 695 static int change_page_attr_set_clr(unsigned long addr, int numpages,
 696                                     pgprot_t mask_set, pgprot_t mask_clr)
 697 {
 698         struct cpa_data cpa;
 699         int ret, cache, checkalias;
 700
 701         /*
 702          * Check, if we are requested to change a not supported
 703          * feature:
 704          */
 705         mask_set = canon_pgprot(mask_set);
 706         mask_clr = canon_pgprot(mask_clr);
 707         if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
 708                 return 0;
 709
 710         /* Ensure we are PAGE_SIZE aligned */
 711         if (addr & ~PAGE_MASK) {
 712                 addr &= PAGE_MASK;
 713                 /*
 714                  * People should not be passing in unaligned addresses:
 715                  */
 716                 WARN_ON_ONCE(1);
 717         }
 718
 719         cpa.vaddr = addr;
 720         cpa.numpages = numpages;
 721         cpa.mask_set = mask_set;
 722         cpa.mask_clr = mask_clr;
 723         cpa.flushtlb = 0;
 724
 725         /* No alias checking for _NX bit modifications */
 726         checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
 727
 728         ret = __change_page_attr_set_clr(&cpa, checkalias);
 729
 730         /*
 731          * Check whether we really changed something:
 732          */
 733         if (!cpa.flushtlb)
 734                 goto out;
 735
 736         /*
 737          * No need to flush, when we did not set any of the caching
 738          * attributes:
 739          */
 740         cache = cache_attr(mask_set);
 741
 742         /*
 743          * On success we use clflush, when the CPU supports it to
 744          * avoid the wbindv. If the CPU does not support it and in the
 745          * error case we fall back to cpa_flush_all (which uses
 746          * wbindv):
 747          */
 748         if (!ret && cpu_has_clflush)
 749                 cpa_flush_range(addr, numpages, cache);
 750         else
 751                 cpa_flush_all(cache);
 752
 753 out:
 754         cpa_fill_pool(NULL);
 755
 756         return ret;
 757 }
 758
 759 static inline int change_page_attr_set(unsigned long addr, int numpages,
 760                                        pgprot_t mask)
 761 {
 762         return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
 763 }
 764
 765 static inline int change_page_attr_clear(unsigned long addr, int numpages,
 766                                          pgprot_t mask)
 767 {
 768         return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
 769 }
 770
 771 int set_memory_uc(unsigned long addr, int numpages)
 772 {
 773         return change_page_attr_set(addr, numpages,
 774                                     __pgprot(_PAGE_PCD));
 775 }
 776 EXPORT_SYMBOL(set_memory_uc);
 777
 778 int set_memory_wb(unsigned long addr, int numpages)
 779 {
 780         return change_page_attr_clear(addr, numpages,
 781                                       __pgprot(_PAGE_PCD | _PAGE_PWT));
 782 }
 783 EXPORT_SYMBOL(set_memory_wb);
 784
 785 int set_memory_x(unsigned long addr, int numpages)
 786 {
 787         return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
 788 }
 789 EXPORT_SYMBOL(set_memory_x);
 790
 791 int set_memory_nx(unsigned long addr, int numpages)
 792 {
 793         return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
 794 }
 795 EXPORT_SYMBOL(set_memory_nx);
 796
 797 int set_memory_ro(unsigned long addr, int numpages)
 798 {
 799         return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
 800 }
 801
 802 int set_memory_rw(unsigned long addr, int numpages)
 803 {
 804         return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
 805 }
 806
 807 int set_memory_np(unsigned long addr, int numpages)
 808 {
 809         return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
 810 }
 811
 812 int set_pages_uc(struct page *page, int numpages)
 813 {
 814         unsigned long addr = (unsigned long)page_address(page);
 815
 816         return set_memory_uc(addr, numpages);
 817 }
 818 EXPORT_SYMBOL(set_pages_uc);
 819
 820 int set_pages_wb(struct page *page, int numpages)
 821 {
 822         unsigned long addr = (unsigned long)page_address(page);
 823
 824         return set_memory_wb(addr, numpages);
 825 }
 826 EXPORT_SYMBOL(set_pages_wb);
 827
 828 int set_pages_x(struct page *page, int numpages)
 829 {
 830         unsigned long addr = (unsigned long)page_address(page);
 831
 832         return set_memory_x(addr, numpages);
 833 }
 834 EXPORT_SYMBOL(set_pages_x);
 835
 836 int set_pages_nx(struct page *page, int numpages)
 837 {
 838         unsigned long addr = (unsigned long)page_address(page);
 839
 840         return set_memory_nx(addr, numpages);
 841 }
 842 EXPORT_SYMBOL(set_pages_nx);
 843
 844 int set_pages_ro(struct page *page, int numpages)
 845 {
 846         unsigned long addr = (unsigned long)page_address(page);
 847
 848         return set_memory_ro(addr, numpages);
 849 }
 850
 851 int set_pages_rw(struct page *page, int numpages)
 852 {
 853         unsigned long addr = (unsigned long)page_address(page);
 854
 855         return set_memory_rw(addr, numpages);
 856 }
 857
 858 #ifdef CONFIG_DEBUG_PAGEALLOC
 859
 860 static int __set_pages_p(struct page *page, int numpages)
 861 {
 862         struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
 863                                 .numpages = numpages,
 864                                 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
 865                                 .mask_clr = __pgprot(0)};
 866
 867         return __change_page_attr_set_clr(&cpa, 1);
 868 }
 869
 870 static int __set_pages_np(struct page *page, int numpages)
 871 {
 872         struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
 873                                 .numpages = numpages,
 874                                 .mask_set = __pgprot(0),
 875                                 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
 876
 877         return __change_page_attr_set_clr(&cpa, 1);
 878 }
 879
 880 void kernel_map_pages(struct page *page, int numpages, int enable)
 881 {
 882         if (PageHighMem(page))
 883                 return;
 884         if (!enable) {
 885                 debug_check_no_locks_freed(page_address(page),
 886                                            numpages * PAGE_SIZE);
 887         }
 888
 889         /*
 890          * If page allocator is not up yet then do not call c_p_a():
 891          */
 892         if (!debug_pagealloc_enabled)
 893                 return;
 894
 895         /*
 896          * The return value is ignored as the calls cannot fail.
 897          * Large pages are kept enabled at boot time, and are
 898          * split up quickly with DEBUG_PAGEALLOC. If a splitup
 899          * fails here (due to temporary memory shortage) no damage
 900          * is done because we just keep the largepage intact up
 901          * to the next attempt when it will likely be split up:
 902          */
 903         if (enable)
 904                 __set_pages_p(page, numpages);
 905         else
 906                 __set_pages_np(page, numpages);
 907
 908         /*
 909          * We should perform an IPI and flush all tlbs,
 910          * but that can deadlock->flush only current cpu:
 911          */
 912         __flush_tlb_all();
 913
 914         /*
 915          * Try to refill the page pool here. We can do this only after
 916          * the tlb flush.
 917          */
 918         cpa_fill_pool(NULL);
 919 }
 920
 921 #ifdef CONFIG_HIBERNATION
 922
 923 bool kernel_page_present(struct page *page)
 924 {
 925         unsigned int level;
 926         pte_t *pte;
 927
 928         if (PageHighMem(page))
 929                 return false;
 930
 931         pte = lookup_address((unsigned long)page_address(page), &level);
 932         return (pte_val(*pte) & _PAGE_PRESENT);
 933 }
 934
 935 #endif /* CONFIG_HIBERNATION */
 936
 937 #endif /* CONFIG_DEBUG_PAGEALLOC */
 938
 939 /*
 940  * The testcases use internal knowledge of the implementation that shouldn't
 941  * be exposed to the rest of the kernel. Include these directly here.
 942  */
 943 #ifdef CONFIG_CPA_DEBUG
 944 #include "pageattr-test.c"
 945 #endif