/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        pte_t *pt;

        BUG_ON(! in_hugepage_area(mm->context, addr));

        addr &= HPAGE_MASK;

        pg = pgd_offset(mm, addr);
        if (pgd_none(*pg))
                return NULL;

        pu = pud_offset(pg, addr);
        if (pud_none(*pu))
                return NULL;

        pm = pmd_offset(pu, addr);
        pt = (pte_t *)pm;
        BUG_ON(!pmd_none(*pm)
               && !(pte_present(*pt) && pte_huge(*pt)));

        return pt;
}

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        pte_t *pt;

        BUG_ON(! in_hugepage_area(mm->context, addr));

        addr &= HPAGE_MASK;

        pg = pgd_offset(mm, addr);
        pu = pud_alloc(mm, pg, addr);
        if (!pu)
                return NULL;

        pm = pmd_alloc(mm, pu, addr);
        if (!pm)
                return NULL;

        pt = (pte_t *)pm;
        BUG_ON(!pmd_none(*pm)
               && !(pte_present(*pt) && pte_huge(*pt)));

        return pt;
}

#define HUGEPTE_BATCH_SIZE (HPAGE_SIZE / PMD_SIZE)
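
/* A single huge page (HPAGE_SIZE) covers several pmd-sized ranges, so the
 * same huge pte value is replicated into each of the HUGEPTE_BATCH_SIZE
 * consecutive pmd slots it spans: set_huge_pte_at() writes every copy and
 * huge_ptep_get_and_clear() clears them all again. */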

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
{
        int i;

        if (pte_present(*ptep)) {
                pte_clear(mm, addr, ptep);
                flush_tlb_pending();
        }

        for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
                *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
                ptep++;
        }
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        unsigned long old = pte_update(ptep, ~0UL);
        int i;

        if (old & _PAGE_HASHPTE)
                hpte_update(mm, addr, old, 0);

        for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
                ptep[i] = __pte(0);

        return __pte(old);
}

/*
 * This function checks for proper alignment of input addr and len parameters.
 */
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
{
        if (len & ~HPAGE_MASK)
                return -EINVAL;
        if (addr & ~HPAGE_MASK)
                return -EINVAL;
        if (! (within_hugepage_low_range(addr, len)
               || within_hugepage_high_range(addr, len)))
                return -EINVAL;

        return 0;
}
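
/* There are two distinct hugepage areas: 32-bit tasks use "low" hugepage
 * segments carved out of their 4GB address space (tracked per-mm in
 * context.htlb_segs and opened on demand below), while 64-bit tasks use the
 * dedicated high range between TASK_HPAGE_BASE and TASK_HPAGE_END. */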

static void flush_segments(void *parm)
{
        u16 segs = (unsigned long) parm;
        unsigned long i;

        asm volatile("isync" : : : "memory");

        for (i = 0; i < 16; i++) {
                if (! (segs & (1U << i)))
                        continue;
                asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
        }

        asm volatile("isync" : : : "memory");
}
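
/* flush_segments() runs on every CPU and uses slbie to drop any SLB entry
 * still cached for the segments that were just converted, so the next SLB
 * miss reloads them with the new (hugepage) settings. */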

static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
{
        unsigned long start = seg << SID_SHIFT;
        unsigned long end = (seg+1) << SID_SHIFT;
        struct vm_area_struct *vma;

        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
        if (vma && (vma->vm_start < end))
                return -EBUSY;

        return 0;
}

static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
{
        unsigned long i;

        newsegs &= ~(mm->context.htlb_segs);
        if (! newsegs)
                return 0; /* The segments we want are already open */

        for (i = 0; i < 16; i++)
                if ((1 << i) & newsegs)
                        if (prepare_low_seg_for_htlb(mm, i) != 0)
                                return -EBUSY;

        mm->context.htlb_segs |= newsegs;

        /* update the paca copy of the context struct */
        get_paca()->context = mm->context;

        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();
        on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);

        return 0;
}
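
/* Each low segment is 1 << SID_SHIFT bytes (256MB) of the 32-bit address
 * space.  For example, open_low_hpage_segs(mm, 1U << 2) tries to convert the
 * segment covering 0x20000000-0x2fffffff to hugepage use, and fails if a
 * normal VMA already lives anywhere in that segment. */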

int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
        if (within_hugepage_high_range(addr, len))
                return 0;
        else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
                int err;
                /* Yes, we need both tests, in case addr+len overflows
                 * 64-bit arithmetic */
                err = open_low_hpage_segs(current->mm,
                                          LOW_ESID_MASK(addr, len));
                if (err)
                        printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
                               " failed (segs: 0x%04hx)\n", addr, len,
                               LOW_ESID_MASK(addr, len));
                return err;
        }

        return -EINVAL;
}
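
/* LOW_ESID_MASK(addr, len) is the bitmask, one bit per 256MB segment, of the
 * low segments touched by [addr, addr+len): exactly the segments that must be
 * opened before the range can be used for huge pages. */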

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;

        if (! in_hugepage_area(mm->context, address))
                return ERR_PTR(-EINVAL);

        ptep = huge_pte_offset(mm, address);
        page = pte_page(*ptep);
        if (page)
                page += (address % HPAGE_SIZE) / PAGE_SIZE;

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
                                     unsigned long len, unsigned long pgoff,
                                     unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > TASK_SIZE)
                return -ENOMEM;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (((TASK_SIZE - len) >= addr)
                    && (!vma || (addr+len) <= vma->vm_start)
                    && !is_hugepage_only_range(mm, addr, len))
                        return addr;
        }
        if (len > mm->cached_hole_size) {
                start_addr = addr = mm->free_area_cache;
        } else {
                start_addr = addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
        }

full_search:
        vma = find_vma(mm, addr);
        while (TASK_SIZE - len >= addr) {
                BUG_ON(vma && (addr >= vma->vm_end));

                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = ALIGN(addr+1, 1<<SID_SHIFT);
                        vma = find_vma(mm, addr);
                        continue;
                }
                if (touches_hugepage_high_range(addr, len)) {
                        addr = TASK_HPAGE_END;
                        vma = find_vma(mm, addr);
                        continue;
                }
                if (!vma || addr + len <= vma->vm_start) {
                        /*
                         * Remember the place where we stopped the search:
                         */
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
                addr = vma->vm_end;
                vma = vma->vm_next;
        }

        /* Make sure we didn't miss any holes */
        if (start_addr != TASK_UNMAPPED_BASE) {
                start_addr = addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
                goto full_search;
        }
        return -ENOMEM;
}
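
/* The bottom-up search above never hands out a range overlapping a hugepage
 * area: when a candidate touches a low hugepage segment it skips to the next
 * 256MB segment boundary, and when it touches the high hugepage range it
 * continues the scan from TASK_HPAGE_END. */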

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                               const unsigned long len, const unsigned long pgoff,
                               const unsigned long flags)
{
        struct vm_area_struct *vma, *prev_vma;
        struct mm_struct *mm = current->mm;
        unsigned long base = mm->mmap_base, addr = addr0;
        unsigned long largest_hole = mm->cached_hole_size;
        int first_time = 1;

        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
                return -ENOMEM;

        /* dont allow allocations above current base */
        if (mm->free_area_cache > base)
                mm->free_area_cache = base;

        /* requesting a specific address */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start)
                    && !is_hugepage_only_range(mm, addr, len))
                        return addr;
        }

        if (len <= largest_hole) {
                largest_hole = 0;
                mm->free_area_cache = base;
        }
try_again:
        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
                goto fail;

        /* either no address requested or cant fit in requested address hole */
        addr = (mm->free_area_cache - len) & PAGE_MASK;
        do {
hugepage_recheck:
                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = (addr & ((~0) << SID_SHIFT)) - len;
                        goto hugepage_recheck;
                } else if (touches_hugepage_high_range(addr, len)) {
                        addr = TASK_HPAGE_BASE - len;
                }

                /*
                 * Lookup failure means no vma is above this address,
                 * i.e. return with success:
                 */
                if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
                        return addr;

                /*
                 * new region fits between prev_vma->vm_end and
                 * vma->vm_start, use it:
                 */
                if (addr+len <= vma->vm_start &&
                    (!prev_vma || (addr >= prev_vma->vm_end))) {
                        /* remember the address as a hint for next time */
                        mm->cached_hole_size = largest_hole;
                        return (mm->free_area_cache = addr);
                } else {
                        /* pull free_area_cache down to the first hole */
                        if (mm->free_area_cache == vma->vm_end) {
                                mm->free_area_cache = vma->vm_start;
                                mm->cached_hole_size = largest_hole;
                        }
                }

                /* remember the largest hole we saw so far */
                if (addr + largest_hole < vma->vm_start)
                        largest_hole = vma->vm_start - addr;

                /* try just below the current vma->vm_start */
                addr = vma->vm_start-len;
        } while (len <= vma->vm_start);

fail:
        /*
         * if hint left us with no space for the requested
         * mapping then try again:
         */
        if (first_time) {
                mm->free_area_cache = base;
                largest_hole = 0;
                first_time = 0;
                goto try_again;
        }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
        addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
        /*
         * Restore the topdown base:
         */
        mm->free_area_cache = base;
        mm->cached_hole_size = ~0UL;

        return addr;
}

static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
        unsigned long addr = 0;
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, addr);
        while (addr + len <= 0x100000000UL) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

                if (! __within_hugepage_low_range(addr, len, segmask)) {
                        addr = ALIGN(addr+1, 1<<SID_SHIFT);
                        vma = find_vma(current->mm, addr);
                        continue;
                }

                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;

                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
                /* Depending on segmask this might not be a confirmed
                 * hugepage region, so the ALIGN could have skipped
                 * some VMAs */
                vma = find_vma(current->mm, addr);
        }

        return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len)
{
        unsigned long addr = TASK_HPAGE_BASE;
        struct vm_area_struct *vma;

        for (vma = find_vma(current->mm, addr);
             addr + len <= TASK_HPAGE_END;
             vma = vma->vm_next) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
                BUG_ON(! within_hugepage_high_range(addr, len));

                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;

                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
                /* Because we're in a hugepage region, this alignment
                 * should not skip us over any VMAs */
        }

        return -ENOMEM;
}
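
/* Both helpers above do a simple first-fit scan: htlb_get_low_area() walks up
 * the 32-bit address space looking for a gap of 'len' bytes lying entirely in
 * segments allowed by 'segmask', htlb_get_high_area() does the same within
 * the dedicated high hugepage range, and both return -ENOMEM when no gap is
 * found. */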

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        if (len & ~HPAGE_MASK)
                return -EINVAL;

        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -EINVAL;

        if (test_thread_flag(TIF_32BIT)) {
                int lastshift = 0;
                u16 segmask, cursegs = current->mm->context.htlb_segs;

                /* First see if we can do the mapping in the existing
                 * low hpage segments */
                addr = htlb_get_low_area(len, cursegs);
                if (addr != -ENOMEM)
                        return addr;

                for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
                     ! lastshift; segmask >>= 1) {
                        if (segmask & 1)
                                lastshift = 1;

                        addr = htlb_get_low_area(len, cursegs | segmask);
                        if ((addr != -ENOMEM)
                            && open_low_hpage_segs(current->mm, segmask) == 0)
                                return addr;
                }
                printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
                       " enough segments\n");
                return -ENOMEM;
        } else {
                return htlb_get_high_area(len);
        }
}
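
/* For 32-bit tasks the loop above slides a candidate block of segments down
 * from the top of the 32-bit address space one segment at a time, starting
 * from LOW_ESID_MASK(0x100000000UL - len, len) and shifting the mask right,
 * and stops at the first position that both has a free gap and can actually
 * be opened with open_low_hpage_segs(). */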

int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local)
{
        pte_t *ptep;
        unsigned long va, vpn;
        pte_t old_pte, new_pte;
        unsigned long rflags, prpn;
        long slot;
        int err = 1;

        spin_lock(&mm->page_table_lock);

        ptep = huge_pte_offset(mm, ea);

        /* Search the Linux page table for a match with va */
        va = (vsid << 28) | (ea & 0x0fffffff);
        vpn = va >> HPAGE_SHIFT;

        /*
         * If no pte found or not present, send the problem up to
         * do_page_fault.
         */
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;

        /* BUG_ON(pte_bad(*ptep)); */

        /*
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
         */
        if (unlikely(access & ~pte_val(*ptep)))
                goto out;

        /*
         * At this point, we have a pte (old_pte) which can be used to build
         * or update an HPTE. There are 2 cases:
         *
         * 1. There is a valid (present) pte with no associated HPTE (this is
         *    the most common case)
         * 2. There is a valid (present) pte with an associated HPTE. The
         *    current values of the pp bits in the HPTE prevent access
         *    because we are doing software DIRTY bit management and the
         *    page is currently not DIRTY.
         */

        old_pte = *ptep;
        new_pte = old_pte;

        rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
        rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
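
        /* rflags collects the HPTE protection bits derived from the Linux
         * pte: the low pp bits leave the page read-only unless _PAGE_RW is
         * set, and HW_NO_EXEC is set whenever the pte does not allow
         * execution. */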

        /* Check if pte already has an hpte (case 2) */
        if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;

                hash = hpt_hash(vpn, 1);
                if (pte_val(old_pte) & _PAGE_SECONDARY)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;

                if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
                        pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
        }
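
        /* The slot of any existing HPTE is reconstructed from the hash and
         * the _PAGE_GROUP_IX bits saved in the pte.  If hpte_updatepp()
         * reports that no matching HPTE is actually there (-1), the stale
         * hash flags are cleared so a fresh HPTE is inserted below. */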

        if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
                unsigned long hash = hpt_hash(vpn, 1);
                unsigned long hpte_group;

                prpn = pte_pfn(old_pte);

repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;

                /* Update the linux pte with the HPTE slot */
                pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
                pte_val(new_pte) |= _PAGE_HASHPTE;

                /* Add in WIMG bits */
                /* XXX We should store these in the pte */
                rflags |= _PAGE_COHERENT;

                slot = ppc_md.hpte_insert(hpte_group, va, prpn,
                                          HPTE_V_LARGE, rflags);

                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
                        pte_val(new_pte) |= _PAGE_SECONDARY;
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL;
                        slot = ppc_md.hpte_insert(hpte_group, va, prpn,
                                                  HPTE_V_LARGE, rflags);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

                                ppc_md.hpte_remove(hpte_group);
                                goto repeat;
                        }
                }

                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");

                pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
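
                /* Record which slot within the HPTE group the entry landed
                 * in, so later updates and invalidates can find it again
                 * without searching the whole group. */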

                /*
                 * No need to use ldarx/stdcx here because all who
                 * might be updating the pte will hold the
                 * page_table_lock
                 */
                *ptep = new_pte;
        }

        err = 0;

out:
        spin_unlock(&mm->page_table_lock);

        return err;
}