[linux-2.6] / mm / msync.c
/*
 *	linux/mm/msync.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * The msync() system call.
 */
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/writeback.h>
#include <linux/file.h>
#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>

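/*
 * Walk the ptes in [addr, end) under the page table lock.  For each
 * present, possibly-dirty pte that maps a normal page, transfer the
 * hardware dirty bit to the struct page via set_page_dirty(), and
 * count how many pages were newly dirtied.  The lock is dropped
 * periodically (and the walk restarted) to keep latencies down.
 */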
static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end)
{
	pte_t *pte;
	spinlock_t *ptl;
	int progress = 0;
	unsigned long ret = 0;

again:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;

		if (progress >= 64) {
			progress = 0;
			if (need_resched() || need_lockbreak(ptl))
				break;
		}
		progress++;
		if (!pte_present(*pte))
			continue;
		if (!pte_maybe_dirty(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		if (ptep_clear_flush_dirty(vma, addr, pte) ||
				page_test_and_clear_dirty(page))
			ret += set_page_dirty(page);
		progress += 3;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	if (addr != end)
		goto again;
	return ret;
}

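/*
 * The pmd and pud walkers below just iterate the intermediate page
 * table levels covering [addr, end), skipping empty or bad entries and
 * accumulating the count of pages dirtied by msync_pte_range().
 */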
static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
			pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long ret = 0;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		ret += msync_pte_range(vma, pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
	return ret;
}

static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
			pgd_t *pgd, unsigned long addr, unsigned long end)
{
	pud_t *pud;
	unsigned long next;
	unsigned long ret = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret += msync_pmd_range(vma, pud, addr, next);
	} while (pud++, addr = next, addr != end);
	return ret;
}

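/*
 * Propagate pte dirty bits into the pagecache for every page mapped in
 * [addr, end) of this vma, returning the number of pages dirtied.
 */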
static unsigned long msync_page_range(struct vm_area_struct *vma,
				unsigned long addr, unsigned long end)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long ret = 0;

	/* For hugepages we can't go walking the page table normally,
	 * but that's ok, hugetlbfs is memory based, so we don't need
	 * to do anything more on an msync().
	 */
	if (vma->vm_flags & VM_HUGETLB)
		return 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(vma->vm_mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret += msync_pud_range(vma, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
	return ret;
}

/*
 * MS_SYNC syncs the entire file - including mappings.
 *
 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).  Instead, it just
 * marks the relevant pages dirty.  The application may now run fsync() to
 * write out the dirty pages and wait on the writeout and check the result.
 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
 * async writeout immediately.
 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
 * applications.
 */
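/*
 * For illustration only (not part of this file): a sketch of the
 * userspace pattern the comment above describes, assuming 'buf' is a
 * MAP_SHARED mapping of 'fd' of length 'len':
 *
 *	msync(buf, len, MS_ASYNC);	// propagate dirty bits to pagecache
 *	fsync(fd);			// write out, wait, check the result
 *
 * or, to kick off writeout without waiting:
 *
 *	msync(buf, len, MS_ASYNC);
 *	posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
 */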
static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
			unsigned long end, int flags,
			unsigned long *nr_pages_dirtied)
{
	struct file *file = vma->vm_file;

	if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
		return -EBUSY;

	if (file && (vma->vm_flags & VM_SHARED))
		*nr_pages_dirtied = msync_page_range(vma, addr, end);
	return 0;
}

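/*
 * sys_msync: validate the flags and the page-aligned range, then walk
 * every vma covering [start, start+len).  Each vma is passed to
 * msync_interval(); for MS_ASYNC we throttle against the newly dirtied
 * pages, for MS_SYNC we fsync the backing file of shared mappings.
 * Unmapped holes in the range make the call return -ENOMEM once the
 * rest of the range has been handled.
 */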
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct *vma;
	int unmapped_error = 0;
	int error = -EINVAL;
	int done = 0;

	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	if (start & ~PAGE_MASK)
		goto out;
	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
		goto out;
	error = -ENOMEM;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -ENOMEM at the end.
	 */
	down_read(&current->mm->mmap_sem);
	if (flags & MS_SYNC)
		current->flags |= PF_SYNCWRITE;
	vma = find_vma(current->mm, start);
	if (!vma) {
		error = -ENOMEM;
		goto out_unlock;
	}
	do {
		unsigned long nr_pages_dirtied = 0;
		struct file *file;

		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags,
							&nr_pages_dirtied);
				if (error)
					goto out_unlock;
			}
			error = unmapped_error;
			done = 1;
		} else {
			/* Here vma->vm_start <= start < vma->vm_end < end. */
			error = msync_interval(vma, start, vma->vm_end, flags,
						&nr_pages_dirtied);
			if (error)
				goto out_unlock;
		}
		file = vma->vm_file;
		start = vma->vm_end;
		if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
			get_file(file);
			up_read(&current->mm->mmap_sem);
			balance_dirty_pages_ratelimited_nr(file->f_mapping,
							nr_pages_dirtied);
			fput(file);
			down_read(&current->mm->mmap_sem);
			vma = find_vma(current->mm, start);
		} else if ((flags & MS_SYNC) && file &&
				(vma->vm_flags & VM_SHARED)) {
			get_file(file);
			up_read(&current->mm->mmap_sem);
			error = do_fsync(file, 0);
			fput(file);
			down_read(&current->mm->mmap_sem);
			if (error)
				goto out_unlock;
			vma = find_vma(current->mm, start);
		} else {
			vma = vma->vm_next;
		}
	} while (vma && !done);
out_unlock:
	current->flags &= ~PF_SYNCWRITE;
	up_read(&current->mm->mmap_sem);
out:
	return error;
}