block: add support for IO CPU affinity
[linux-2.6] / mm / mremap.c
1 /*
2  *      mm/mremap.c
3  *
4  *      (C) Copyright 1996 Linus Torvalds
5  *
6  *      Address space accounting code   <alan@redhat.com>
7  *      (C) Copyright 2002 Red Hat Inc, All Rights Reserved
8  */
9
10 #include <linux/mm.h>
11 #include <linux/hugetlb.h>
12 #include <linux/slab.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/swap.h>
16 #include <linux/capability.h>
17 #include <linux/fs.h>
18 #include <linux/highmem.h>
19 #include <linux/security.h>
20 #include <linux/syscalls.h>
21 #include <linux/mmu_notifier.h>
22
23 #include <asm/uaccess.h>
24 #include <asm/cacheflush.h>
25 #include <asm/tlbflush.h>
26
27 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
28 {
29         pgd_t *pgd;
30         pud_t *pud;
31         pmd_t *pmd;
32
33         pgd = pgd_offset(mm, addr);
34         if (pgd_none_or_clear_bad(pgd))
35                 return NULL;
36
37         pud = pud_offset(pgd, addr);
38         if (pud_none_or_clear_bad(pud))
39                 return NULL;
40
41         pmd = pmd_offset(pud, addr);
42         if (pmd_none_or_clear_bad(pmd))
43                 return NULL;
44
45         return pmd;
46 }
47
48 static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
49 {
50         pgd_t *pgd;
51         pud_t *pud;
52         pmd_t *pmd;
53
54         pgd = pgd_offset(mm, addr);
55         pud = pud_alloc(mm, pgd, addr);
56         if (!pud)
57                 return NULL;
58
59         pmd = pmd_alloc(mm, pud, addr);
60         if (!pmd)
61                 return NULL;
62
63         if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
64                 return NULL;
65
66         return pmd;
67 }
68
69 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
70                 unsigned long old_addr, unsigned long old_end,
71                 struct vm_area_struct *new_vma, pmd_t *new_pmd,
72                 unsigned long new_addr)
73 {
74         struct address_space *mapping = NULL;
75         struct mm_struct *mm = vma->vm_mm;
76         pte_t *old_pte, *new_pte, pte;
77         spinlock_t *old_ptl, *new_ptl;
78         unsigned long old_start;
79
80         old_start = old_addr;
81         mmu_notifier_invalidate_range_start(vma->vm_mm,
82                                             old_start, old_end);
83         if (vma->vm_file) {
84                 /*
85                  * Subtle point from Rajesh Venkatasubramanian: before
86                  * moving file-based ptes, we must lock vmtruncate out,
87                  * since it might clean the dst vma before the src vma,
88                  * and we propagate stale pages into the dst afterward.
89                  */
90                 mapping = vma->vm_file->f_mapping;
91                 spin_lock(&mapping->i_mmap_lock);
92                 if (new_vma->vm_truncate_count &&
93                     new_vma->vm_truncate_count != vma->vm_truncate_count)
94                         new_vma->vm_truncate_count = 0;
95         }
96
97         /*
98          * We don't have to worry about the ordering of src and dst
99          * pte locks because exclusive mmap_sem prevents deadlock.
100          */
101         old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
102         new_pte = pte_offset_map_nested(new_pmd, new_addr);
103         new_ptl = pte_lockptr(mm, new_pmd);
104         if (new_ptl != old_ptl)
105                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
106         arch_enter_lazy_mmu_mode();
107
108         for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
109                                    new_pte++, new_addr += PAGE_SIZE) {
110                 if (pte_none(*old_pte))
111                         continue;
112                 pte = ptep_clear_flush(vma, old_addr, old_pte);
113                 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
114                 set_pte_at(mm, new_addr, new_pte, pte);
115         }
116
117         arch_leave_lazy_mmu_mode();
118         if (new_ptl != old_ptl)
119                 spin_unlock(new_ptl);
120         pte_unmap_nested(new_pte - 1);
121         pte_unmap_unlock(old_pte - 1, old_ptl);
122         if (mapping)
123                 spin_unlock(&mapping->i_mmap_lock);
124         mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
125 }
126
127 #define LATENCY_LIMIT   (64 * PAGE_SIZE)
128
129 unsigned long move_page_tables(struct vm_area_struct *vma,
130                 unsigned long old_addr, struct vm_area_struct *new_vma,
131                 unsigned long new_addr, unsigned long len)
132 {
133         unsigned long extent, next, old_end;
134         pmd_t *old_pmd, *new_pmd;
135
136         old_end = old_addr + len;
137         flush_cache_range(vma, old_addr, old_end);
138
139         for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
140                 cond_resched();
141                 next = (old_addr + PMD_SIZE) & PMD_MASK;
142                 if (next - 1 > old_end)
143                         next = old_end;
144                 extent = next - old_addr;
145                 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
146                 if (!old_pmd)
147                         continue;
148                 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
149                 if (!new_pmd)
150                         break;
151                 next = (new_addr + PMD_SIZE) & PMD_MASK;
152                 if (extent > next - new_addr)
153                         extent = next - new_addr;
154                 if (extent > LATENCY_LIMIT)
155                         extent = LATENCY_LIMIT;
156                 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
157                                 new_vma, new_pmd, new_addr);
158         }
159
160         return len + old_addr - old_end;        /* how much done */
161 }
162
163 static unsigned long move_vma(struct vm_area_struct *vma,
164                 unsigned long old_addr, unsigned long old_len,
165                 unsigned long new_len, unsigned long new_addr)
166 {
167         struct mm_struct *mm = vma->vm_mm;
168         struct vm_area_struct *new_vma;
169         unsigned long vm_flags = vma->vm_flags;
170         unsigned long new_pgoff;
171         unsigned long moved_len;
172         unsigned long excess = 0;
173         unsigned long hiwater_vm;
174         int split = 0;
175
176         /*
177          * We'd prefer to avoid failure later on in do_munmap:
178          * which may split one vma into three before unmapping.
179          */
180         if (mm->map_count >= sysctl_max_map_count - 3)
181                 return -ENOMEM;
182
183         new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
184         new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
185         if (!new_vma)
186                 return -ENOMEM;
187
188         moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
189         if (moved_len < old_len) {
190                 /*
191                  * On error, move entries back from new area to old,
192                  * which will succeed since page tables still there,
193                  * and then proceed to unmap new area instead of old.
194                  */
195                 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
196                 vma = new_vma;
197                 old_len = new_len;
198                 old_addr = new_addr;
199                 new_addr = -ENOMEM;
200         }
201
202         /* Conceal VM_ACCOUNT so old reservation is not undone */
203         if (vm_flags & VM_ACCOUNT) {
204                 vma->vm_flags &= ~VM_ACCOUNT;
205                 excess = vma->vm_end - vma->vm_start - old_len;
206                 if (old_addr > vma->vm_start &&
207                     old_addr + old_len < vma->vm_end)
208                         split = 1;
209         }
210
211         /*
212          * If we failed to move page tables we still do total_vm increment
213          * since do_munmap() will decrement it by old_len == new_len.
214          *
215          * Since total_vm is about to be raised artificially high for a
216          * moment, we need to restore high watermark afterwards: if stats
217          * are taken meanwhile, total_vm and hiwater_vm appear too high.
218          * If this were a serious issue, we'd add a flag to do_munmap().
219          */
220         hiwater_vm = mm->hiwater_vm;
221         mm->total_vm += new_len >> PAGE_SHIFT;
222         vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
223
224         if (do_munmap(mm, old_addr, old_len) < 0) {
225                 /* OOM: unable to split vma, just get accounts right */
226                 vm_unacct_memory(excess >> PAGE_SHIFT);
227                 excess = 0;
228         }
229         mm->hiwater_vm = hiwater_vm;
230
231         /* Restore VM_ACCOUNT if one or two pieces of vma left */
232         if (excess) {
233                 vma->vm_flags |= VM_ACCOUNT;
234                 if (split)
235                         vma->vm_next->vm_flags |= VM_ACCOUNT;
236         }
237
238         if (vm_flags & VM_LOCKED) {
239                 mm->locked_vm += new_len >> PAGE_SHIFT;
240                 if (new_len > old_len)
241                         make_pages_present(new_addr + old_len,
242                                            new_addr + new_len);
243         }
244
245         return new_addr;
246 }
247
248 /*
249  * Expand (or shrink) an existing mapping, potentially moving it at the
250  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
251  *
252  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
253  * This option implies MREMAP_MAYMOVE.
254  */
255 unsigned long do_mremap(unsigned long addr,
256         unsigned long old_len, unsigned long new_len,
257         unsigned long flags, unsigned long new_addr)
258 {
259         struct mm_struct *mm = current->mm;
260         struct vm_area_struct *vma;
261         unsigned long ret = -EINVAL;
262         unsigned long charged = 0;
263
264         if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
265                 goto out;
266
267         if (addr & ~PAGE_MASK)
268                 goto out;
269
270         old_len = PAGE_ALIGN(old_len);
271         new_len = PAGE_ALIGN(new_len);
272
273         /*
274          * We allow a zero old-len as a special case
275          * for DOS-emu "duplicate shm area" thing. But
276          * a zero new-len is nonsensical.
277          */
278         if (!new_len)
279                 goto out;
280
281         /* new_addr is only valid if MREMAP_FIXED is specified */
282         if (flags & MREMAP_FIXED) {
283                 if (new_addr & ~PAGE_MASK)
284                         goto out;
285                 if (!(flags & MREMAP_MAYMOVE))
286                         goto out;
287
288                 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
289                         goto out;
290
291                 /* Check if the location we're moving into overlaps the
292                  * old location at all, and fail if it does.
293                  */
294                 if ((new_addr <= addr) && (new_addr+new_len) > addr)
295                         goto out;
296
297                 if ((addr <= new_addr) && (addr+old_len) > new_addr)
298                         goto out;
299
300                 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
301                 if (ret)
302                         goto out;
303
304                 ret = do_munmap(mm, new_addr, new_len);
305                 if (ret)
306                         goto out;
307         }
308
309         /*
310          * Always allow a shrinking remap: that just unmaps
311          * the unnecessary pages..
312          * do_munmap does all the needed commit accounting
313          */
314         if (old_len >= new_len) {
315                 ret = do_munmap(mm, addr+new_len, old_len - new_len);
316                 if (ret && old_len != new_len)
317                         goto out;
318                 ret = addr;
319                 if (!(flags & MREMAP_FIXED) || (new_addr == addr))
320                         goto out;
321                 old_len = new_len;
322         }
323
324         /*
325          * Ok, we need to grow..  or relocate.
326          */
327         ret = -EFAULT;
328         vma = find_vma(mm, addr);
329         if (!vma || vma->vm_start > addr)
330                 goto out;
331         if (is_vm_hugetlb_page(vma)) {
332                 ret = -EINVAL;
333                 goto out;
334         }
335         /* We can't remap across vm area boundaries */
336         if (old_len > vma->vm_end - addr)
337                 goto out;
338         if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
339                 if (new_len > old_len)
340                         goto out;
341         }
342         if (vma->vm_flags & VM_LOCKED) {
343                 unsigned long locked, lock_limit;
344                 locked = mm->locked_vm << PAGE_SHIFT;
345                 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
346                 locked += new_len - old_len;
347                 ret = -EAGAIN;
348                 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
349                         goto out;
350         }
351         if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
352                 ret = -ENOMEM;
353                 goto out;
354         }
355
356         if (vma->vm_flags & VM_ACCOUNT) {
357                 charged = (new_len - old_len) >> PAGE_SHIFT;
358                 if (security_vm_enough_memory(charged))
359                         goto out_nc;
360         }
361
362         /* old_len exactly to the end of the area..
363          * And we're not relocating the area.
364          */
365         if (old_len == vma->vm_end - addr &&
366             !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
367             (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
368                 unsigned long max_addr = TASK_SIZE;
369                 if (vma->vm_next)
370                         max_addr = vma->vm_next->vm_start;
371                 /* can we just expand the current mapping? */
372                 if (max_addr - addr >= new_len) {
373                         int pages = (new_len - old_len) >> PAGE_SHIFT;
374
375                         vma_adjust(vma, vma->vm_start,
376                                 addr + new_len, vma->vm_pgoff, NULL);
377
378                         mm->total_vm += pages;
379                         vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
380                         if (vma->vm_flags & VM_LOCKED) {
381                                 mm->locked_vm += pages;
382                                 make_pages_present(addr + old_len,
383                                                    addr + new_len);
384                         }
385                         ret = addr;
386                         goto out;
387                 }
388         }
389
390         /*
391          * We weren't able to just expand or shrink the area,
392          * we need to create a new one and move it..
393          */
394         ret = -ENOMEM;
395         if (flags & MREMAP_MAYMOVE) {
396                 if (!(flags & MREMAP_FIXED)) {
397                         unsigned long map_flags = 0;
398                         if (vma->vm_flags & VM_MAYSHARE)
399                                 map_flags |= MAP_SHARED;
400
401                         new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
402                                                 vma->vm_pgoff, map_flags);
403                         if (new_addr & ~PAGE_MASK) {
404                                 ret = new_addr;
405                                 goto out;
406                         }
407
408                         ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
409                         if (ret)
410                                 goto out;
411                 }
412                 ret = move_vma(vma, addr, old_len, new_len, new_addr);
413         }
414 out:
415         if (ret & ~PAGE_MASK)
416                 vm_unacct_memory(charged);
417 out_nc:
418         return ret;
419 }
420
421 asmlinkage unsigned long sys_mremap(unsigned long addr,
422         unsigned long old_len, unsigned long new_len,
423         unsigned long flags, unsigned long new_addr)
424 {
425         unsigned long ret;
426
427         down_write(&current->mm->mmap_sem);
428         ret = do_mremap(addr, old_len, new_len, flags, new_addr);
429         up_write(&current->mm->mmap_sem);
430         return ret;
431 }