/*
 * arch/powerpc/mm/gup.c
 *
 * Lockless get_user_pages_fast for powerpc
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#undef DEBUG

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/vmstat.h>
#include <linux/pagemap.h>
#include <linux/rwsem.h>
#include <asm/pgtable.h>

/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
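/*
 * Walk the PTEs under one pmd entry from addr to end.  For each PTE that
 * passes the present/user/write check we take a speculative reference on
 * its page and then re-check that the PTE has not changed under us; any
 * failure returns 0 so the caller can fall back to the slow path.
 */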
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask, result;
        pte_t *ptep;

        result = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                result |= _PAGE_RW;
        mask = result | _PAGE_SPECIAL;

        ptep = pte_offset_kernel(&pmd, addr);
        do {
                pte_t pte = *ptep;
                struct page *page;

                if ((pte_val(pte) & mask) != result)
                        return 0;
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
                if (!page_cache_get_speculative(page))
                        return 0;
                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                        put_page(page);
                        return 0;
                }
                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);

        return 1;
}

#ifdef CONFIG_HUGETLB_PAGE
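/*
 * Handle one huge-page PTE: record one sub-page per PAGE_SIZE chunk
 * between *addr and end in pages[], take that many speculative references
 * on the head page in one go, and re-check the PTE afterwards.  *addr is
 * advanced as we go so the caller can continue with the next huge PTE.
 */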
static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
                                 unsigned long *addr, unsigned long end,
                                 int write, struct page **pages, int *nr)
{
        unsigned long mask;
        unsigned long pte_end;
        struct page *head, *page;
        pte_t pte;
        int refs;

        pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
        if (pte_end < end)
                end = pte_end;

        pte = *ptep;
        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_val(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);
        page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (*addr += PAGE_SIZE, *addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }
        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                /* Could be optimized better */
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        return 1;
}
#endif /* CONFIG_HUGETLB_PAGE */

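/*
 * Walk the pmd entries under one pud entry, handing each present pmd
 * down to gup_pte_range().
 */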
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset(&pud, addr);
        do {
                pmd_t pmd = *pmdp;

                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd))
                        return 0;
                if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

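/*
 * Walk the pud entries under one pgd entry, handing each present pud
 * down to gup_pmd_range().
 */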
static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset(&pgd, addr);
        do {
                pud_t pud = *pudp;

                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);

        return 1;
}

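/*
 * get_user_pages_fast() - pin user pages in memory without taking mmap_sem
 * @start:    starting user address
 * @nr_pages: number of pages from start to pin
 * @write:    whether the pages will be written to
 * @pages:    array that receives pointers to the pinned pages
 *
 * Attempts to pin the pages with interrupts disabled and without taking
 * any locks; on any difficulty it falls back to the regular
 * get_user_pages() slow path.  Returns the number of pages pinned, or a
 * negative errno if no pages were pinned and the slow path failed.
 */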
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        pgd_t *pgdp;
        int psize, nr = 0;
        unsigned int shift;

        pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;

        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                        start, len)))
                goto slow_irqon;

        pr_debug("  aligned: %lx .. %lx\n", start, end);

#ifdef CONFIG_HUGETLB_PAGE
        /* We bail out on slice boundary crossing when hugetlb is
         * enabled in order to not have to deal with two different
         * page table formats
         */
        if (addr < SLICE_LOW_TOP) {
                if (end > SLICE_LOW_TOP)
                        goto slow_irqon;

                if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
                             GET_LOW_SLICE_INDEX(end - 1)))
                        goto slow_irqon;
        } else {
                if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
                             GET_HIGH_SLICE_INDEX(end - 1)))
                        goto slow_irqon;
        }
#endif /* CONFIG_HUGETLB_PAGE */

        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency
         * needs some instrumenting to determine the common sizes used by
         * important workloads (eg. DB2), and whether limiting the batch size
         * will decrease performance.
         *
         * It seems like we're in the clear for the moment. Direct-IO is
         * the main guy that batches up lots of get_user_pages, and even
         * they are limited to 64-at-a-time which is not so many.
         */
        /*
         * This doesn't prevent pagetable teardown, but does prevent
         * the pagetables from being freed on powerpc.
         *
         * So long as we atomically load page table pointers versus teardown,
         * we can follow the address down to the page and take a ref on it.
         */
        local_irq_disable();

        psize = get_slice_psize(mm, addr);
        shift = mmu_psize_defs[psize].shift;

#ifdef CONFIG_HUGETLB_PAGE
        if (unlikely(mmu_huge_psizes[psize])) {
                pte_t *ptep;
                unsigned long a = addr;
                unsigned long sz = ((1UL) << shift);
                struct hstate *hstate = size_to_hstate(sz);

                BUG_ON(!hstate);
                /*
                 * XXX: could be optimized to avoid hstate
                 * lookup entirely (just use shift)
                 */

                do {
                        VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
                        ptep = huge_pte_offset(mm, a);
                        pr_debug(" %016lx: huge ptep %p\n", a, ptep);
                        if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
                                                   &nr))
                                goto slow;
                } while (a != end);
        } else
#endif /* CONFIG_HUGETLB_PAGE */
        {
                pgdp = pgd_offset(mm, addr);
                do {
                        pgd_t pgd = *pgdp;

                        VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
                        pr_debug("  %016lx: normal pgd %p\n", addr,
                                 (void *)pgd_val(pgd));
                        next = pgd_addr_end(addr, end);
                        if (pgd_none(pgd))
                                goto slow;
                        if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                                goto slow;
                } while (pgdp++, addr = next, addr != end);
        }
        local_irq_enable();

        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
        return nr;

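        /*
         * Slow path, reached only via the "slow" / "slow_irqon" labels:
         * hand the remaining range to the regular get_user_pages() and
         * merge its result with the pages already pinned above.
         */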
        {
                int ret;

slow:
                local_irq_enable();
slow_irqon:
                pr_debug("  slow path ! nr = %d\n", nr);

                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                        (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /* Have to be a bit careful with return values */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }

                return ret;
        }
}