git.oblomov.eu Git - linux-2.6/blob - arch/powerpc/mm/hugetlbpage.c

   1 /*
   2  * PPC64 (POWER4) Huge TLB Page Support for Kernel.
   3  *
   4  * Copyright (C) 2003 David Gibson, IBM Corporation.
   5  *
   6  * Based on the IA-32 version:
   7  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   8  */
   9
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/hugetlb.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/slab.h>
  16 #include <linux/err.h>
  17 #include <linux/sysctl.h>
  18 #include <asm/mman.h>
  19 #include <asm/pgalloc.h>
  20 #include <asm/tlb.h>
  21 #include <asm/tlbflush.h>
  22 #include <asm/mmu_context.h>
  23 #include <asm/machdep.h>
  24 #include <asm/cputable.h>
  25 #include <asm/spu.h>
  26
/* log2 of the huge page sizes handled in this file: 64K, 16M and 16G. */
#define PAGE_SHIFT_64K  16
#define PAGE_SHIFT_16M  24
#define PAGE_SHIFT_16G  34

/* Number of (1 << SID_SHIFT)-sized areas in the low 4GB (0x100000000). */
#define NUM_LOW_AREAS   (0x100000000UL >> SID_SHIFT)
/* Number of (1 << HTLB_AREA_SHIFT)-sized areas within PGTABLE_RANGE. */
#define NUM_HIGH_AREAS  (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
/* Capacity of gpage_freearray[], the early-boot list of gigantic pages. */
#define MAX_NUMBER_GPAGES       1024
  34
/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  Entries are appended by add_gpage() and
 * consumed, last one first, by alloc_bootmem_huge_page().  */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
/* Number of entries of gpage_freearray[] currently in use. */
static unsigned nr_gpages;

/* Array of valid huge page sizes, indexed by MMU page size - a non-zero
 * value (the hugepte_shift) is stored for the huge page sizes that are
 * valid; zero means the size cannot be used for huge pages.
 */
unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
  44
/* hugepte_shift[psize] is log2 of the number of entries in a hugepte table
 * for that page size.  It is an alias for mmu_huge_psizes[], so a zero
 * entry also means "not a valid huge page size". */
#define hugepte_shift                   mmu_huge_psizes
/* Number of huge PTEs held by one hugepte table. */
#define PTRS_PER_HUGEPTE(psize)         (1 << hugepte_shift[psize])
/* Size in bytes of one hugepte table. */
#define HUGEPTE_TABLE_SIZE(psize)       (sizeof(pte_t) << hugepte_shift[psize])

/* Address range mapped by one hugepte table, as a shift, a size and a mask. */
#define HUGEPD_SHIFT(psize)             (mmu_psize_to_shift(psize) \
                                                + hugepte_shift[psize])
#define HUGEPD_SIZE(psize)              (1UL << HUGEPD_SHIFT(psize))
#define HUGEPD_MASK(psize)              (~(HUGEPD_SIZE(psize)-1))
  53
  54 /* Subtract one from array size because we don't need a cache for 4K since
  55  * is not a huge page size */
  56 #define HUGE_PGTABLE_INDEX(psize)       (HUGEPTE_CACHE_NUM + psize - 1)
  57 #define HUGEPTE_CACHE_NAME(psize)       (huge_pgtable_cache_name[psize])
  58
/* Names handed to kmem_cache_create() by hugetlbpage_init(), indexed by MMU
 * page size.  Only six entries are listed; any further entries up to
 * MMU_PAGE_COUNT are implicitly NULL.
 * NOTE(review): the order presumably follows the MMU_PAGE_* numbering --
 * confirm against its definition. */
static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
        "unused_4K", "hugepte_cache_64K", "unused_64K_AP",
        "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
};
  63
/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK       0x1

/* A page directory slot that refers to a hugepte table: the table address
 * with HUGEPD_OK or'ed in (see __hugepte_alloc()), or 0 when empty. */
typedef struct { unsigned long pd; } hugepd_t;

/* True when no hugepte table is attached to the directory slot. */
#define hugepd_none(hpd)        ((hpd).pd == 0)
  72
  73 static inline int shift_to_mmu_psize(unsigned int shift)
  74 {
  75         switch (shift) {
  76 #ifndef CONFIG_PPC_64K_PAGES
  77         case PAGE_SHIFT_64K:
  78             return MMU_PAGE_64K;
  79 #endif
  80         case PAGE_SHIFT_16M:
  81             return MMU_PAGE_16M;
  82         case PAGE_SHIFT_16G:
  83             return MMU_PAGE_16G;
  84         }
  85         return -1;
  86 }
  87
  88 static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
  89 {
  90         if (mmu_psize_defs[mmu_psize].shift)
  91                 return mmu_psize_defs[mmu_psize].shift;
  92         BUG();
  93 }
  94
  95 static inline pte_t *hugepd_page(hugepd_t hpd)
  96 {
  97         BUG_ON(!(hpd.pd & HUGEPD_OK));
  98         return (pte_t *)(hpd.pd & ~HUGEPD_OK);
  99 }
 100
 101 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 102                                     struct hstate *hstate)
 103 {
 104         unsigned int shift = huge_page_shift(hstate);
 105         int psize = shift_to_mmu_psize(shift);
 106         unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
 107         pte_t *dir = hugepd_page(*hpdp);
 108
 109         return dir + idx;
 110 }
 111
 112 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 113                            unsigned long address, unsigned int psize)
 114 {
 115         pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
 116                                       GFP_KERNEL|__GFP_REPEAT);
 117
 118         if (! new)
 119                 return -ENOMEM;
 120
 121         spin_lock(&mm->page_table_lock);
 122         if (!hugepd_none(*hpdp))
 123                 kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
 124         else
 125                 hpdp->pd = (unsigned long)new | HUGEPD_OK;
 126         spin_unlock(&mm->page_table_lock);
 127         return 0;
 128 }
 129
 130
 131 static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
 132 {
 133         if (huge_page_shift(hstate) < PUD_SHIFT)
 134                 return pud_offset(pgd, addr);
 135         else
 136                 return (pud_t *) pgd;
 137 }
 138 static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
 139                          struct hstate *hstate)
 140 {
 141         if (huge_page_shift(hstate) < PUD_SHIFT)
 142                 return pud_alloc(mm, pgd, addr);
 143         else
 144                 return (pud_t *) pgd;
 145 }
 146 static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
 147 {
 148         if (huge_page_shift(hstate) < PMD_SHIFT)
 149                 return pmd_offset(pud, addr);
 150         else
 151                 return (pmd_t *) pud;
 152 }
 153 static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
 154                          struct hstate *hstate)
 155 {
 156         if (huge_page_shift(hstate) < PMD_SHIFT)
 157                 return pmd_alloc(mm, pud, addr);
 158         else
 159                 return (pmd_t *) pud;
 160 }
 161
 162 /* Build list of addresses of gigantic pages.  This function is used in early
 163  * boot before the buddy or bootmem allocator is setup.
 164  */
 165 void add_gpage(unsigned long addr, unsigned long page_size,
 166         unsigned long number_of_pages)
 167 {
 168         if (!addr)
 169                 return;
 170         while (number_of_pages > 0) {
 171                 gpage_freearray[nr_gpages] = addr;
 172                 nr_gpages++;
 173                 number_of_pages--;
 174                 addr += page_size;
 175         }
 176 }
 177
 178 /* Moves the gigantic page addresses from the temporary list to the
 179  * huge_boot_pages list.
 180  */
 181 int alloc_bootmem_huge_page(struct hstate *hstate)
 182 {
 183         struct huge_bootmem_page *m;
 184         if (nr_gpages == 0)
 185                 return 0;
 186         m = phys_to_virt(gpage_freearray[--nr_gpages]);
 187         gpage_freearray[nr_gpages] = 0;
 188         list_add(&m->list, &huge_boot_pages);
 189         m->hstate = hstate;
 190         return 1;
 191 }
 192
 193
 194 /* Modelled after find_linux_pte() */
 195 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 196 {
 197         pgd_t *pg;
 198         pud_t *pu;
 199         pmd_t *pm;
 200
 201         unsigned int psize;
 202         unsigned int shift;
 203         unsigned long sz;
 204         struct hstate *hstate;
 205         psize = get_slice_psize(mm, addr);
 206         shift = mmu_psize_to_shift(psize);
 207         sz = ((1UL) << shift);
 208         hstate = size_to_hstate(sz);
 209
 210         addr &= hstate->mask;
 211
 212         pg = pgd_offset(mm, addr);
 213         if (!pgd_none(*pg)) {
 214                 pu = hpud_offset(pg, addr, hstate);
 215                 if (!pud_none(*pu)) {
 216                         pm = hpmd_offset(pu, addr, hstate);
 217                         if (!pmd_none(*pm))
 218                                 return hugepte_offset((hugepd_t *)pm, addr,
 219                                                       hstate);
 220                 }
 221         }
 222
 223         return NULL;
 224 }
 225
 226 pte_t *huge_pte_alloc(struct mm_struct *mm,
 227                         unsigned long addr, unsigned long sz)
 228 {
 229         pgd_t *pg;
 230         pud_t *pu;
 231         pmd_t *pm;
 232         hugepd_t *hpdp = NULL;
 233         struct hstate *hstate;
 234         unsigned int psize;
 235         hstate = size_to_hstate(sz);
 236
 237         psize = get_slice_psize(mm, addr);
 238         BUG_ON(!mmu_huge_psizes[psize]);
 239
 240         addr &= hstate->mask;
 241
 242         pg = pgd_offset(mm, addr);
 243         pu = hpud_alloc(mm, pg, addr, hstate);
 244
 245         if (pu) {
 246                 pm = hpmd_alloc(mm, pu, addr, hstate);
 247                 if (pm)
 248                         hpdp = (hugepd_t *)pm;
 249         }
 250
 251         if (! hpdp)
 252                 return NULL;
 253
 254         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
 255                 return NULL;
 256
 257         return hugepte_offset(hpdp, addr, hstate);
 258 }
 259
/*
 * Always returns 0 and does not touch any of its arguments.
 * NOTE(review): presumably this means huge PMD sharing is not implemented
 * for this architecture -- confirm against the generic hugetlb caller.
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        return 0;
}
 264
 265 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
 266                                unsigned int psize)
 267 {
 268         pte_t *hugepte = hugepd_page(*hpdp);
 269
 270         hpdp->pd = 0;
 271         tlb->need_flush = 1;
 272         pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
 273                                                  HUGEPTE_CACHE_NUM+psize-1,
 274                                                  PGF_CACHENUM_MASK));
 275 }
 276
/*
 * Free the hugepte tables hanging off the PMD entries that cover
 * [addr, end) below @pud, then free the PMD page itself if the
 * PUD-aligned range around it lies entirely within [floor, ceiling).
 *
 * @psize is the MMU page size of the huge pages in this range.
 */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling,
                                   unsigned int psize)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                /* "continue" still runs the loop-control expression below */
                if (pmd_none(*pmd))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
        } while (pmd++, addr = next, addr != end);

        /* Only free the PMD page if nothing outside the range can use it. */
        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        /* the "- 1"s make a ceiling of 0 compare as the top of the space */
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd);
}
 310
/*
 * Free the huge page tables below the PUD entries that cover [addr, end)
 * under @pgd, then free the PUD page itself if the PGDIR-aligned range
 * around it lies entirely within [floor, ceiling).
 *
 * The huge page size is taken from the slice containing @addr.  If it is
 * smaller than a PMD the hugepte tables hang off PMD entries and the walk
 * descends one more level; otherwise they hang directly off the PUD
 * entries.
 */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;
        unsigned int shift;
        unsigned int psize = get_slice_psize(tlb->mm, addr);
        shift = mmu_psize_to_shift(psize);

        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (shift < PMD_SHIFT) {
                        /* PUD entry points to a regular PMD page */
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
                                               ceiling, psize);
                } else {
                        /* PUD entry is itself a hugepd pointer */
                        if (pud_none(*pud))
                                continue;
                        free_hugepte_range(tlb, (hugepd_t *)pud, psize);
                }
        } while (pud++, addr = next, addr != end);

        /* Only free the PUD page if nothing outside the range can use it. */
        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud);
}
 353
/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 *
 * [addr, end) is the range being unmapped; [floor, ceiling) bounds the
 * region within which page tables may be freed.  The huge page size is
 * looked up per PGD entry from the slice containing the address.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long start;

        /*
         * Comments below are taken from the normal free_pgd_range().  They
         * apply here too.  The tests against HUGEPD_MASK below are
         * essential, because we *don't* test for this at the bottom
         * level.  Without them we'll attempt to free a hugepte table
         * when we unmap just part of it, even if there are other
         * active mappings using it.
         *
         * The next few lines have given us lots of grief...
         *
         * Why are we testing HUGEPD* at this top level?  Because
         * often there will be no work to do at all, and we'd prefer
         * not to go all the way down to the bottom just to discover
         * that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we
         * must be careful to reject "the opposite 0" before it
         * confuses the subsequent tests.  But what about where end is
         * brought down by HUGEPD_SIZE below? no, end can't go down to
         * 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */
        unsigned int psize = get_slice_psize(tlb->mm, addr);

        /* Shrink the range to whole hugepte tables inside [floor, ceiling). */
        addr &= HUGEPD_MASK(psize);
        if (addr < floor) {
                addr += HUGEPD_SIZE(psize);
                if (!addr)
                        return;
        }
        if (ceiling) {
                ceiling &= HUGEPD_MASK(psize);
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                end -= HUGEPD_SIZE(psize);
        if (addr > end - 1)
                return;

        start = addr;
        pgd = pgd_offset(tlb->mm, addr);
        do {
                /* the page size can differ from one PGD entry to the next */
                psize = get_slice_psize(tlb->mm, addr);
                BUG_ON(!mmu_huge_psizes[psize]);
                next = pgd_addr_end(addr, end);
                if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
                        /* PGD entry points to a regular PUD page */
                        if (pgd_none_or_clear_bad(pgd))
                                continue;
                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
                } else {
                        /* PGD entry is itself a hugepd pointer */
                        if (pgd_none(*pgd))
                                continue;
                        free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
                }
        } while (pgd++, addr = next, addr != end);
}
 436
 437 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 438                      pte_t *ptep, pte_t pte)
 439 {
 440         if (pte_present(*ptep)) {
 441                 /* We open-code pte_clear because we need to pass the right
 442                  * argument to hpte_need_flush (huge / !huge). Might not be
 443                  * necessary anymore if we make hpte_need_flush() get the
 444                  * page size from the slices
 445                  */
 446                 unsigned int psize = get_slice_psize(mm, addr);
 447                 unsigned int shift = mmu_psize_to_shift(psize);
 448                 unsigned long sz = ((1UL) << shift);
 449                 struct hstate *hstate = size_to_hstate(sz);
 450                 pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
 451         }
 452         *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 453 }
 454
 455 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 456                               pte_t *ptep)
 457 {
 458         unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 459         return __pte(old);
 460 }
 461
 462 struct page *
 463 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 464 {
 465         pte_t *ptep;
 466         struct page *page;
 467         unsigned int mmu_psize = get_slice_psize(mm, address);
 468
 469         /* Verify it is a huge page else bail. */
 470         if (!mmu_huge_psizes[mmu_psize])
 471                 return ERR_PTR(-EINVAL);
 472
 473         ptep = huge_pte_offset(mm, address);
 474         page = pte_page(*ptep);
 475         if (page) {
 476                 unsigned int shift = mmu_psize_to_shift(mmu_psize);
 477                 unsigned long sz = ((1UL) << shift);
 478                 page += (address % sz) / PAGE_SIZE;
 479         }
 480
 481         return page;
 482 }
 483
/* Always returns 0: no PMD value is reported as a huge mapping here. */
int pmd_huge(pmd_t pmd)
{
        return 0;
}
 488
/* Always returns 0: no PUD value is reported as a huge mapping here. */
int pud_huge(pud_t pud)
{
        return 0;
}
 493
/*
 * Must never be called: the body is an unconditional BUG().  This is
 * consistent with pmd_huge() in this file always returning 0.  The return
 * statement only satisfies the function's return type.
 */
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}
 501
 502
 503 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 504                                         unsigned long len, unsigned long pgoff,
 505                                         unsigned long flags)
 506 {
 507         struct hstate *hstate = hstate_file(file);
 508         int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 509
 510         if (!mmu_huge_psizes[mmu_psize])
 511                 return -EINVAL;
 512         return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 513 }
 514
 515 /*
 516  * Called by asm hashtable.S for doing lazy icache flush
 517  */
 518 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 519                                         pte_t pte, int trap, unsigned long sz)
 520 {
 521         struct page *page;
 522         int i;
 523
 524         if (!pfn_valid(pte_pfn(pte)))
 525                 return rflags;
 526
 527         page = pte_page(pte);
 528
 529         /* page is dirty */
 530         if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 531                 if (trap == 0x400) {
 532                         for (i = 0; i < (sz / PAGE_SIZE); i++)
 533                                 __flush_dcache_icache(page_address(page+i));
 534                         set_bit(PG_arch_1, &page->flags);
 535                 } else {
 536                         rflags |= HPTE_R_N;
 537                 }
 538         }
 539         return rflags;
 540 }
 541
/*
 * Install or update the hash page table entry (HPTE) for the huge page
 * that maps effective address @ea in @mm.
 *
 * @access: PTE bits the faulting access requires; any bit missing from the
 *          Linux PTE sends the fault back to the caller
 * @vsid:   passed to hpt_va() to build the virtual address
 * @local:  passed through to ppc_md.hpte_updatepp()
 * @trap:   passed to the lazy icache flush helper
 *
 * Returns 0 once the HPTE has been inserted or updated, and 1 when the
 * fault has to be handled elsewhere (slice is not a huge page slice, no
 * PTE, insufficient access rights, or the PTE is marked busy).
 */
int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local,
                   unsigned long trap)
{
        pte_t *ptep;
        unsigned long old_pte, new_pte;
        unsigned long va, rflags, pa, sz;
        long slot;
        int err = 1;
        int ssize = user_segment_size(ea);
        unsigned int mmu_psize;
        int shift;
        mmu_psize = get_slice_psize(mm, ea);

        if (!mmu_huge_psizes[mmu_psize])
                goto out;
        ptep = huge_pte_offset(mm, ea);

        /* Build the va that hpt_hash() and the hpte_* callbacks work on */
        va = hpt_va(ea, vsid, ssize);

        /*
         * If no pte found or not present, send the problem up to
         * do_page_fault
         */
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;

        /*
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
         */
        if (unlikely(access & ~pte_val(*ptep)))
                goto out;
        /*
         * At this point, we have a pte (old_pte) which can be used to build
         * or update an HPTE. There are 2 cases:
         *
         * 1. There is a valid (present) pte with no associated HPTE (this is
         *      the most common case)
         * 2. There is a valid (present) pte with an associated HPTE. The
         *      current values of the pp bits in the HPTE prevent access
         *      because we are doing software DIRTY bit management and the
         *      page is currently not DIRTY.
         */


        /* Atomically mark the PTE busy and accessed; bail out if it is
         * already busy. */
        do {
                old_pte = pte_val(*ptep);
                if (old_pte & _PAGE_BUSY)
                        goto out;
                new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
        } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
                                         old_pte, new_pte));

        /* low bit of rflags is set when the PTE is not writable */
        rflags = 0x2 | (!(new_pte & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
        rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
        shift = mmu_psize_to_shift(mmu_psize);
        sz = ((1UL) << shift);
        if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
                /* No CPU has hugepages but lacks no execute, so we
                 * don't need to worry about that case */
                rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
                                                       trap, sz);

        /* Check if pte already has an hpte (case 2) */
        if (unlikely(old_pte & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                /* note: this "slot" shadows the function-scope one */
                unsigned long hash, slot;

                hash = hpt_hash(va, shift, ssize);
                if (old_pte & _PAGE_F_SECOND)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                /* the index within the group is kept in the PTE at bit 12 */
                slot += (old_pte & _PAGE_F_GIX) >> 12;

                /* on failure, forget the HPTE so that it is re-inserted */
                if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
                                         ssize, local) == -1)
                        old_pte &= ~_PAGE_HPTEFLAGS;
        }

        if (likely(!(old_pte & _PAGE_HASHPTE))) {
                unsigned long hash = hpt_hash(va, shift, ssize);
                unsigned long hpte_group;

                pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;

                /* clear HPTE slot informations in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
                /* Add in WIMG bits */
                rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
                                      _PAGE_COHERENT | _PAGE_GUARDED));

                /* Insert into the hash table, primary slot */
                slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
                                          mmu_psize, ssize);

                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL;
                        slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
                                                  HPTE_V_SECONDARY,
                                                  mmu_psize, ssize);
                        if (slot == -1) {
                                /* Both groups are full: evict an entry from
                                 * the primary group if the low bit of mftb()
                                 * is set, else from the secondary group, and
                                 * retry the insertion. */
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) *
                                                      HPTES_PER_GROUP)&~0x7UL;

                                ppc_md.hpte_remove(hpte_group);
                                goto repeat;
                        }
                }

                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");

                /* record where the HPTE went in the PTE's slot bits */
                new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
        }

        /*
         * No need to use ldarx/stdcx here
         */
        /* storing the new PTE also drops the busy bit set above */
        *ptep = __pte(new_pte & ~_PAGE_BUSY);

        err = 0;

 out:
        return err;
}
 681
/*
 * Enable MMU page size @psize as a huge page size, if possible.
 *
 * When the size is supported and fits the page table layout, an hstate is
 * registered for it and hugepte_shift[psize] (i.e. mmu_huge_psizes[psize])
 * is set to log2 of the number of entries in a hugepte table for that
 * size: the shift of the page table level the table hangs off minus the
 * page shift.  Otherwise the entry is reset to 0.  A size that is already
 * set up, or that equals the base page size, is left untouched.
 *
 * NOTE(review): the switch below has no default case; a shift that passes
 * the outer test but is not 64K/16M/16G would wrap hugepte_shift[psize]
 * below zero.  The callers visible in this file only pass those three
 * sizes -- confirm that no others exist.
 */
static void __init set_huge_psize(int psize)
{
        /* Check that it is a page size supported by the hardware and
         * that it fits within pagetable limits. */
        if (mmu_psize_defs[psize].shift &&
                mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
                (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
                 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
                 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
                /* Return if huge page size has already been setup or is the
                 * same as the base page size. */
                if (mmu_huge_psizes[psize] ||
                   mmu_psize_defs[psize].shift == PAGE_SHIFT)
                        return;
                hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);

                /* First store the shift of the page table level ... */
                switch (mmu_psize_defs[psize].shift) {
                case PAGE_SHIFT_64K:
                    /* We only allow 64k hpages with 4k base page,
                     * which was checked above, and always put them
                     * at the PMD */
                    hugepte_shift[psize] = PMD_SHIFT;
                    break;
                case PAGE_SHIFT_16M:
                    /* 16M pages can be at two different levels
                     * of page tables based on base page size */
                    if (PAGE_SHIFT == PAGE_SHIFT_64K)
                            hugepte_shift[psize] = PMD_SHIFT;
                    else /* 4k base page */
                            hugepte_shift[psize] = PUD_SHIFT;
                    break;
                case PAGE_SHIFT_16G:
                    /* 16G pages are always at PGD level */
                    hugepte_shift[psize] = PGDIR_SHIFT;
                    break;
                }
                /* ... then turn it into log2(entries per hugepte table). */
                hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
        } else
                hugepte_shift[psize] = 0;
}
 722
 723 static int __init hugepage_setup_sz(char *str)
 724 {
 725         unsigned long long size;
 726         int mmu_psize;
 727         int shift;
 728
 729         size = memparse(str, &str);
 730
 731         shift = __ffs(size);
 732         mmu_psize = shift_to_mmu_psize(shift);
 733         if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
 734                 set_huge_psize(mmu_psize);
 735         else
 736                 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 737
 738         return 1;
 739 }
 740 __setup("hugepagesz=", hugepage_setup_sz);
 741
 742 static int __init hugetlbpage_init(void)
 743 {
 744         unsigned int psize;
 745
 746         if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 747                 return -ENODEV;
 748
 749         /* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
 750          * and adjust PTE_NONCACHE_NUM if the number of supported huge page
 751          * sizes changes.
 752          */
 753         set_huge_psize(MMU_PAGE_16M);
 754         set_huge_psize(MMU_PAGE_16G);
 755
 756         /* Temporarily disable support for 64K huge pages when 64K SPU local
 757          * store support is enabled as the current implementation conflicts.
 758          */
 759 #ifndef CONFIG_SPU_FS_64K_LS
 760         set_huge_psize(MMU_PAGE_64K);
 761 #endif
 762
 763         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 764                 if (mmu_huge_psizes[psize]) {
 765                         pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
 766                                 kmem_cache_create(
 767                                         HUGEPTE_CACHE_NAME(psize),
 768                                         HUGEPTE_TABLE_SIZE(psize),
 769                                         HUGEPTE_TABLE_SIZE(psize),
 770                                         0,
 771                                         NULL);
 772                         if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
 773                                 panic("hugetlbpage_init(): could not create %s"\
 774                                       "\n", HUGEPTE_CACHE_NAME(psize));
 775                 }
 776         }
 777
 778         return 0;
 779 }
 780
 781 module_init(hugetlbpage_init);