[PATCH] fix NUMA interleaving for huge pages
/*
 *      linux/mm/msync.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * The msync() system call.
 */
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/writeback.h>
#include <linux/file.h>
#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>

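/*
 * Walk the ptes mapped by one pmd entry over [addr, end), transferring
 * the hardware dirty bits to the corresponding struct pages.  Returns
 * the number of pages newly marked dirty.
 */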
static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end)
{
        pte_t *pte;
        spinlock_t *ptl;
        int progress = 0;
        unsigned long ret = 0;

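        /*
         * The walk is chunked: roughly every 64 ptes we check whether the
         * page table lock is contended or a reschedule is due, and if so
         * drop the lock, cond_resched() and restart at the current addr.
         */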
again:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                struct page *page;

                if (progress >= 64) {
                        progress = 0;
                        if (need_resched() || need_lockbreak(ptl))
                                break;
                }
                progress++;
                if (!pte_present(*pte))
                        continue;
                if (!pte_maybe_dirty(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                if (ptep_clear_flush_dirty(vma, addr, pte) ||
                                page_test_and_clear_dirty(page))
                        ret += set_page_dirty(page);
                progress += 3;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
        if (addr != end)
                goto again;
        return ret;
}

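/*
 * The pmd, pud and pgd walkers below are standard page-table walking
 * boilerplate: skip empty or bad entries and descend one level, summing
 * the per-range dirty-page counts.
 */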
static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
                        pud_t *pud, unsigned long addr, unsigned long end)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long ret = 0;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                ret += msync_pte_range(vma, pmd, addr, next);
        } while (pmd++, addr = next, addr != end);
        return ret;
}

static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
                        pgd_t *pgd, unsigned long addr, unsigned long end)
{
        pud_t *pud;
        unsigned long next;
        unsigned long ret = 0;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                ret += msync_pmd_range(vma, pud, addr, next);
        } while (pud++, addr = next, addr != end);
        return ret;
}

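/*
 * Walk the page tables of one vma over [addr, end) and return how many
 * pages were newly marked dirty.
 */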
static unsigned long msync_page_range(struct vm_area_struct *vma,
                                unsigned long addr, unsigned long end)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long ret = 0;

        /*
         * For hugepages we can't go walking the page table normally,
         * but that's ok, hugetlbfs is memory based, so we don't need
         * to do anything more on an msync().
         */
        if (vma->vm_flags & VM_HUGETLB)
                return 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset(vma->vm_mm, addr);
        flush_cache_range(vma, addr, end);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                ret += msync_pud_range(vma, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
        return ret;
}

/*
 * MS_SYNC syncs the entire file - including mappings.
 *
 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).  Instead, it just
 * marks the relevant pages dirty.  The application may now run fsync() to
 * write out the dirty pages and wait on the writeout and check the result.
 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
 * async writeout immediately.
 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
 * applications.
 */
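/*
 * As an illustrative sketch (not part of this file) of the userspace
 * pattern that enables, assuming fd was mapped MAP_SHARED at addr:
 *
 *      msync(addr, len, MS_ASYNC);     marks dirty pages, starts no I/O
 *      ... other work ...
 *      fsync(fd);                      writes out the dirty pages and waits
 */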
static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long end, int flags,
                        unsigned long *nr_pages_dirtied)
{
        struct file *file = vma->vm_file;

        if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
                return -EBUSY;

        if (file && (vma->vm_flags & VM_SHARED))
                *nr_pages_dirtied = msync_page_range(vma, addr, end);
        return 0;
}

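/*
 * The msync() entry point: validate the arguments, then visit each vma
 * intersecting [start, start+len).  mmap_sem is dropped around operations
 * that may block (throttling after MS_ASYNC dirtying, do_fsync() for
 * MS_SYNC), and the vma is looked up again afterwards.
 */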
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
        unsigned long end;
        struct vm_area_struct *vma;
        int unmapped_error = 0;
        int error = -EINVAL;
        int done = 0;

        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
                goto out;
        if (start & ~PAGE_MASK)
                goto out;
        if ((flags & MS_ASYNC) && (flags & MS_SYNC))
                goto out;
        error = -ENOMEM;
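        /* Round len up to a whole number of pages; -ENOMEM if end wraps. */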
        len = (len + ~PAGE_MASK) & PAGE_MASK;
        end = start + len;
        if (end < start)
                goto out;
        error = 0;
        if (end == start)
                goto out;
        /*
         * If the interval [start,end) covers some unmapped address ranges,
         * just ignore them, but return -ENOMEM at the end.
         */
        down_read(&current->mm->mmap_sem);
        vma = find_vma(current->mm, start);
        if (!vma) {
                error = -ENOMEM;
                goto out_unlock;
        }
        do {
                unsigned long nr_pages_dirtied = 0;
                struct file *file;

                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                }
                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                        if (start < end) {
                                error = msync_interval(vma, start, end, flags,
                                                        &nr_pages_dirtied);
                                if (error)
                                        goto out_unlock;
                        }
                        error = unmapped_error;
                        done = 1;
                } else {
                        /* Here vma->vm_start <= start < vma->vm_end < end. */
                        error = msync_interval(vma, start, vma->vm_end, flags,
                                                &nr_pages_dirtied);
                        if (error)
                                goto out_unlock;
                }
                file = vma->vm_file;
                start = vma->vm_end;
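                /*
                 * Both branches below may block, so they pin the file,
                 * drop mmap_sem and re-find the vma afterwards: MS_ASYNC
                 * throttles a caller that dirtied many pages, MS_SYNC
                 * writes out and waits on the backing file.
                 */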
                if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
                        get_file(file);
                        up_read(&current->mm->mmap_sem);
                        balance_dirty_pages_ratelimited_nr(file->f_mapping,
                                                        nr_pages_dirtied);
                        fput(file);
                        down_read(&current->mm->mmap_sem);
                        vma = find_vma(current->mm, start);
                } else if ((flags & MS_SYNC) && file &&
                                (vma->vm_flags & VM_SHARED)) {
                        get_file(file);
                        up_read(&current->mm->mmap_sem);
                        error = do_fsync(file, 0);
                        fput(file);
                        down_read(&current->mm->mmap_sem);
                        if (error)
                                goto out_unlock;
                        vma = find_vma(current->mm, start);
                } else {
                        vma = vma->vm_next;
                }
        } while (vma && !done);
out_unlock:
        up_read(&current->mm->mmap_sem);
out:
        return error;
}