Merge commit 'kumar/next' into next
[linux-2.6] / arch / powerpc / mm / gup.c
1 /*
2  * Lockless get_user_pages_fast for powerpc
3  *
4  * Copyright (C) 2008 Nick Piggin
5  * Copyright (C) 2008 Novell Inc.
6  */
7 #undef DEBUG
8
9 #include <linux/sched.h>
10 #include <linux/mm.h>
11 #include <linux/hugetlb.h>
12 #include <linux/vmstat.h>
13 #include <linux/pagemap.h>
14 #include <linux/rwsem.h>
15 #include <asm/pgtable.h>
16
17 #ifdef __HAVE_ARCH_PTE_SPECIAL
18
19 /*
20  * The performance critical leaf functions are made noinline otherwise gcc
21  * inlines everything into a single function which results in too much
22  * register pressure.
23  */
24 static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
25                 unsigned long end, int write, struct page **pages, int *nr)
26 {
27         unsigned long mask, result;
28         pte_t *ptep;
29
30         result = _PAGE_PRESENT|_PAGE_USER;
31         if (write)
32                 result |= _PAGE_RW;
33         mask = result | _PAGE_SPECIAL;
34
35         ptep = pte_offset_kernel(&pmd, addr);
36         do {
37                 pte_t pte = *ptep;
38                 struct page *page;
39
40                 if ((pte_val(pte) & mask) != result)
41                         return 0;
42                 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
43                 page = pte_page(pte);
44                 if (!page_cache_get_speculative(page))
45                         return 0;
46                 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
47                         put_page(page);
48                         return 0;
49                 }
50                 pages[*nr] = page;
51                 (*nr)++;
52
53         } while (ptep++, addr += PAGE_SIZE, addr != end);
54
55         return 1;
56 }
57
58 #ifdef CONFIG_HUGETLB_PAGE
59 static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
60                                  unsigned long *addr, unsigned long end,
61                                  int write, struct page **pages, int *nr)
62 {
63         unsigned long mask;
64         unsigned long pte_end;
65         struct page *head, *page;
66         pte_t pte;
67         int refs;
68
69         pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
70         if (pte_end < end)
71                 end = pte_end;
72
73         pte = *ptep;
74         mask = _PAGE_PRESENT|_PAGE_USER;
75         if (write)
76                 mask |= _PAGE_RW;
77         if ((pte_val(pte) & mask) != mask)
78                 return 0;
79         /* hugepages are never "special" */
80         VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
81
82         refs = 0;
83         head = pte_page(pte);
84         page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
85         do {
86                 VM_BUG_ON(compound_head(page) != head);
87                 pages[*nr] = page;
88                 (*nr)++;
89                 page++;
90                 refs++;
91         } while (*addr += PAGE_SIZE, *addr != end);
92
93         if (!page_cache_add_speculative(head, refs)) {
94                 *nr -= refs;
95                 return 0;
96         }
97         if (unlikely(pte_val(pte) != pte_val(*ptep))) {
98                 /* Could be optimized better */
99                 while (*nr) {
100                         put_page(page);
101                         (*nr)--;
102                 }
103         }
104
105         return 1;
106 }
107 #endif /* CONFIG_HUGETLB_PAGE */
108
109 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
110                 int write, struct page **pages, int *nr)
111 {
112         unsigned long next;
113         pmd_t *pmdp;
114
115         pmdp = pmd_offset(&pud, addr);
116         do {
117                 pmd_t pmd = *pmdp;
118
119                 next = pmd_addr_end(addr, end);
120                 if (pmd_none(pmd))
121                         return 0;
122                 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
123                         return 0;
124         } while (pmdp++, addr = next, addr != end);
125
126         return 1;
127 }
128
129 static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
130                 int write, struct page **pages, int *nr)
131 {
132         unsigned long next;
133         pud_t *pudp;
134
135         pudp = pud_offset(&pgd, addr);
136         do {
137                 pud_t pud = *pudp;
138
139                 next = pud_addr_end(addr, end);
140                 if (pud_none(pud))
141                         return 0;
142                 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
143                         return 0;
144         } while (pudp++, addr = next, addr != end);
145
146         return 1;
147 }
148
149 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
150                         struct page **pages)
151 {
152         struct mm_struct *mm = current->mm;
153         unsigned long addr, len, end;
154         unsigned long next;
155         pgd_t *pgdp;
156         int nr = 0;
157 #ifdef CONFIG_PPC64
158         unsigned int shift;
159         int psize;
160 #endif
161
162         pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
163
164         start &= PAGE_MASK;
165         addr = start;
166         len = (unsigned long) nr_pages << PAGE_SHIFT;
167         end = start + len;
168
169         if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
170                                         start, len)))
171                 goto slow_irqon;
172
173         pr_debug("  aligned: %lx .. %lx\n", start, end);
174
175 #ifdef CONFIG_HUGETLB_PAGE
176         /* We bail out on slice boundary crossing when hugetlb is
177          * enabled in order to not have to deal with two different
178          * page table formats
179          */
180         if (addr < SLICE_LOW_TOP) {
181                 if (end > SLICE_LOW_TOP)
182                         goto slow_irqon;
183
184                 if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
185                              GET_LOW_SLICE_INDEX(end - 1)))
186                         goto slow_irqon;
187         } else {
188                 if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
189                              GET_HIGH_SLICE_INDEX(end - 1)))
190                         goto slow_irqon;
191         }
192 #endif /* CONFIG_HUGETLB_PAGE */
193
194         /*
195          * XXX: batch / limit 'nr', to avoid large irq off latency
196          * needs some instrumenting to determine the common sizes used by
197          * important workloads (eg. DB2), and whether limiting the batch size
198          * will decrease performance.
199          *
200          * It seems like we're in the clear for the moment. Direct-IO is
201          * the main guy that batches up lots of get_user_pages, and even
202          * they are limited to 64-at-a-time which is not so many.
203          */
204         /*
205          * This doesn't prevent pagetable teardown, but does prevent
206          * the pagetables from being freed on powerpc.
207          *
208          * So long as we atomically load page table pointers versus teardown,
209          * we can follow the address down to the the page and take a ref on it.
210          */
211         local_irq_disable();
212
213 #ifdef CONFIG_PPC64
214         /* Those bits are related to hugetlbfs implementation and only exist
215          * on 64-bit for now
216          */
217         psize = get_slice_psize(mm, addr);
218         shift = mmu_psize_defs[psize].shift;
219 #endif /* CONFIG_PPC64 */
220
221 #ifdef CONFIG_HUGETLB_PAGE
222         if (unlikely(mmu_huge_psizes[psize])) {
223                 pte_t *ptep;
224                 unsigned long a = addr;
225                 unsigned long sz = ((1UL) << shift);
226                 struct hstate *hstate = size_to_hstate(sz);
227
228                 BUG_ON(!hstate);
229                 /*
230                  * XXX: could be optimized to avoid hstate
231                  * lookup entirely (just use shift)
232                  */
233
234                 do {
235                         VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
236                         ptep = huge_pte_offset(mm, a);
237                         pr_debug(" %016lx: huge ptep %p\n", a, ptep);
238                         if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
239                                                    &nr))
240                                 goto slow;
241                 } while (a != end);
242         } else
243 #endif /* CONFIG_HUGETLB_PAGE */
244         {
245                 pgdp = pgd_offset(mm, addr);
246                 do {
247                         pgd_t pgd = *pgdp;
248
249 #ifdef CONFIG_PPC64
250                         VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
251 #endif
252                         pr_debug("  %016lx: normal pgd %p\n", addr,
253                                  (void *)pgd_val(pgd));
254                         next = pgd_addr_end(addr, end);
255                         if (pgd_none(pgd))
256                                 goto slow;
257                         if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
258                                 goto slow;
259                 } while (pgdp++, addr = next, addr != end);
260         }
261         local_irq_enable();
262
263         VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
264         return nr;
265
266         {
267                 int ret;
268
269 slow:
270                 local_irq_enable();
271 slow_irqon:
272                 pr_debug("  slow path ! nr = %d\n", nr);
273
274                 /* Try to get the remaining pages with get_user_pages */
275                 start += nr << PAGE_SHIFT;
276                 pages += nr;
277
278                 down_read(&mm->mmap_sem);
279                 ret = get_user_pages(current, mm, start,
280                         (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
281                 up_read(&mm->mmap_sem);
282
283                 /* Have to be a bit careful with return values */
284                 if (nr > 0) {
285                         if (ret < 0)
286                                 ret = nr;
287                         else
288                                 ret += nr;
289                 }
290
291                 return ret;
292         }
293 }
294
295 #endif /* __HAVE_ARCH_PTE_SPECIAL */