Page allocator: get rid of the list of cold pages
[linux-2.6] / mm / truncate.c
1 /*
2  * mm/truncate.c - code for taking down pages from address_spaces
3  *
4  * Copyright (C) 2002, Linus Torvalds
5  *
6  * 10Sep2002    akpm@zip.com.au
7  *              Initial version.
8  */
9
10 #include <linux/kernel.h>
11 #include <linux/backing-dev.h>
12 #include <linux/mm.h>
13 #include <linux/swap.h>
14 #include <linux/module.h>
15 #include <linux/pagemap.h>
16 #include <linux/highmem.h>
17 #include <linux/pagevec.h>
18 #include <linux/task_io_accounting_ops.h>
19 #include <linux/buffer_head.h>  /* grr. try_to_release_page,
20                                    do_invalidatepage */
21
22
23 /**
24  * do_invalidatepage - invalidate part or all of a page
25  * @page: the page which is affected
26  * @offset: the index of the truncation point
27  *
28  * do_invalidatepage() is called when all or part of the page has become
29  * invalidated by a truncate operation.
30  *
31  * do_invalidatepage() does not have to release all buffers, but it must
32  * ensure that no dirty buffer is left outside @offset and that no I/O
33  * is underway against any of the blocks which are outside the truncation
34  * point.  Because the caller is about to free (and possibly reuse) those
35  * blocks on-disk.
36  */
37 void do_invalidatepage(struct page *page, unsigned long offset)
38 {
39         void (*invalidatepage)(struct page *, unsigned long);
40         invalidatepage = page->mapping->a_ops->invalidatepage;
41 #ifdef CONFIG_BLOCK
42         if (!invalidatepage)
43                 invalidatepage = block_invalidatepage;
44 #endif
45         if (invalidatepage)
46                 (*invalidatepage)(page, offset);
47 }
48
49 static inline void truncate_partial_page(struct page *page, unsigned partial)
50 {
51         zero_user_segment(page, partial, PAGE_CACHE_SIZE);
52         if (PagePrivate(page))
53                 do_invalidatepage(page, partial);
54 }
55
56 /*
57  * This cancels just the dirty bit on the kernel page itself, it
58  * does NOT actually remove dirty bits on any mmap's that may be
59  * around. It also leaves the page tagged dirty, so any sync
60  * activity will still find it on the dirty lists, and in particular,
61  * clear_page_dirty_for_io() will still look at the dirty bits in
62  * the VM.
63  *
64  * Doing this should *normally* only ever be done when a page
65  * is truncated, and is not actually mapped anywhere at all. However,
66  * fs/buffer.c does this when it notices that somebody has cleaned
67  * out all the buffers on a page without actually doing it through
68  * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
69  */
70 void cancel_dirty_page(struct page *page, unsigned int account_size)
71 {
72         if (TestClearPageDirty(page)) {
73                 struct address_space *mapping = page->mapping;
74                 if (mapping && mapping_cap_account_dirty(mapping)) {
75                         dec_zone_page_state(page, NR_FILE_DIRTY);
76                         dec_bdi_stat(mapping->backing_dev_info,
77                                         BDI_RECLAIMABLE);
78                         if (account_size)
79                                 task_io_account_cancelled_write(account_size);
80                 }
81         }
82 }
83 EXPORT_SYMBOL(cancel_dirty_page);
84
85 /*
86  * If truncate cannot remove the fs-private metadata from the page, the page
87  * becomes anonymous.  It will be left on the LRU and may even be mapped into
88  * user pagetables if we're racing with filemap_fault().
89  *
90  * We need to bale out if page->mapping is no longer equal to the original
91  * mapping.  This happens a) when the VM reclaimed the page while we waited on
92  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
93  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
94  */
95 static void
96 truncate_complete_page(struct address_space *mapping, struct page *page)
97 {
98         if (page->mapping != mapping)
99                 return;
100
101         cancel_dirty_page(page, PAGE_CACHE_SIZE);
102
103         if (PagePrivate(page))
104                 do_invalidatepage(page, 0);
105
106         remove_from_page_cache(page);
107         ClearPageUptodate(page);
108         ClearPageMappedToDisk(page);
109         page_cache_release(page);       /* pagecache ref */
110 }
111
112 /*
113  * This is for invalidate_mapping_pages().  That function can be called at
114  * any time, and is not supposed to throw away dirty pages.  But pages can
115  * be marked dirty at any time too, so use remove_mapping which safely
116  * discards clean, unused pages.
117  *
118  * Returns non-zero if the page was successfully invalidated.
119  */
120 static int
121 invalidate_complete_page(struct address_space *mapping, struct page *page)
122 {
123         int ret;
124
125         if (page->mapping != mapping)
126                 return 0;
127
128         if (PagePrivate(page) && !try_to_release_page(page, 0))
129                 return 0;
130
131         ret = remove_mapping(mapping, page);
132
133         return ret;
134 }
135
136 /**
137  * truncate_inode_pages - truncate range of pages specified by start and
138  * end byte offsets
139  * @mapping: mapping to truncate
140  * @lstart: offset from which to truncate
141  * @lend: offset to which to truncate
142  *
143  * Truncate the page cache, removing the pages that are between
144  * specified offsets (and zeroing out partial page
145  * (if lstart is not page aligned)).
146  *
147  * Truncate takes two passes - the first pass is nonblocking.  It will not
148  * block on page locks and it will not block on writeback.  The second pass
149  * will wait.  This is to prevent as much IO as possible in the affected region.
150  * The first pass will remove most pages, so the search cost of the second pass
151  * is low.
152  *
153  * When looking at page->index outside the page lock we need to be careful to
154  * copy it into a local to avoid races (it could change at any time).
155  *
156  * We pass down the cache-hot hint to the page freeing code.  Even if the
157  * mapping is large, it is probably the case that the final pages are the most
158  * recently touched, and freeing happens in ascending file offset order.
159  */
160 void truncate_inode_pages_range(struct address_space *mapping,
161                                 loff_t lstart, loff_t lend)
162 {
163         const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
164         pgoff_t end;
165         const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
166         struct pagevec pvec;
167         pgoff_t next;
168         int i;
169
170         if (mapping->nrpages == 0)
171                 return;
172
173         BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
174         end = (lend >> PAGE_CACHE_SHIFT);
175
176         pagevec_init(&pvec, 0);
177         next = start;
178         while (next <= end &&
179                pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
180                 for (i = 0; i < pagevec_count(&pvec); i++) {
181                         struct page *page = pvec.pages[i];
182                         pgoff_t page_index = page->index;
183
184                         if (page_index > end) {
185                                 next = page_index;
186                                 break;
187                         }
188
189                         if (page_index > next)
190                                 next = page_index;
191                         next++;
192                         if (TestSetPageLocked(page))
193                                 continue;
194                         if (PageWriteback(page)) {
195                                 unlock_page(page);
196                                 continue;
197                         }
198                         if (page_mapped(page)) {
199                                 unmap_mapping_range(mapping,
200                                   (loff_t)page_index<<PAGE_CACHE_SHIFT,
201                                   PAGE_CACHE_SIZE, 0);
202                         }
203                         truncate_complete_page(mapping, page);
204                         unlock_page(page);
205                 }
206                 pagevec_release(&pvec);
207                 cond_resched();
208         }
209
210         if (partial) {
211                 struct page *page = find_lock_page(mapping, start - 1);
212                 if (page) {
213                         wait_on_page_writeback(page);
214                         truncate_partial_page(page, partial);
215                         unlock_page(page);
216                         page_cache_release(page);
217                 }
218         }
219
220         next = start;
221         for ( ; ; ) {
222                 cond_resched();
223                 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
224                         if (next == start)
225                                 break;
226                         next = start;
227                         continue;
228                 }
229                 if (pvec.pages[0]->index > end) {
230                         pagevec_release(&pvec);
231                         break;
232                 }
233                 for (i = 0; i < pagevec_count(&pvec); i++) {
234                         struct page *page = pvec.pages[i];
235
236                         if (page->index > end)
237                                 break;
238                         lock_page(page);
239                         wait_on_page_writeback(page);
240                         if (page_mapped(page)) {
241                                 unmap_mapping_range(mapping,
242                                   (loff_t)page->index<<PAGE_CACHE_SHIFT,
243                                   PAGE_CACHE_SIZE, 0);
244                         }
245                         if (page->index > next)
246                                 next = page->index;
247                         next++;
248                         truncate_complete_page(mapping, page);
249                         unlock_page(page);
250                 }
251                 pagevec_release(&pvec);
252         }
253 }
254 EXPORT_SYMBOL(truncate_inode_pages_range);
255
256 /**
257  * truncate_inode_pages - truncate *all* the pages from an offset
258  * @mapping: mapping to truncate
259  * @lstart: offset from which to truncate
260  *
261  * Called under (and serialised by) inode->i_mutex.
262  */
263 void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
264 {
265         truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
266 }
267 EXPORT_SYMBOL(truncate_inode_pages);
268
269 unsigned long __invalidate_mapping_pages(struct address_space *mapping,
270                                 pgoff_t start, pgoff_t end, bool be_atomic)
271 {
272         struct pagevec pvec;
273         pgoff_t next = start;
274         unsigned long ret = 0;
275         int i;
276
277         pagevec_init(&pvec, 0);
278         while (next <= end &&
279                         pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
280                 for (i = 0; i < pagevec_count(&pvec); i++) {
281                         struct page *page = pvec.pages[i];
282                         pgoff_t index;
283                         int lock_failed;
284
285                         lock_failed = TestSetPageLocked(page);
286
287                         /*
288                          * We really shouldn't be looking at the ->index of an
289                          * unlocked page.  But we're not allowed to lock these
290                          * pages.  So we rely upon nobody altering the ->index
291                          * of this (pinned-by-us) page.
292                          */
293                         index = page->index;
294                         if (index > next)
295                                 next = index;
296                         next++;
297                         if (lock_failed)
298                                 continue;
299
300                         if (PageDirty(page) || PageWriteback(page))
301                                 goto unlock;
302                         if (page_mapped(page))
303                                 goto unlock;
304                         ret += invalidate_complete_page(mapping, page);
305 unlock:
306                         unlock_page(page);
307                         if (next > end)
308                                 break;
309                 }
310                 pagevec_release(&pvec);
311                 if (likely(!be_atomic))
312                         cond_resched();
313         }
314         return ret;
315 }
316
317 /**
318  * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
319  * @mapping: the address_space which holds the pages to invalidate
320  * @start: the offset 'from' which to invalidate
321  * @end: the offset 'to' which to invalidate (inclusive)
322  *
323  * This function only removes the unlocked pages, if you want to
324  * remove all the pages of one inode, you must call truncate_inode_pages.
325  *
326  * invalidate_mapping_pages() will not block on IO activity. It will not
327  * invalidate pages which are dirty, locked, under writeback or mapped into
328  * pagetables.
329  */
330 unsigned long invalidate_mapping_pages(struct address_space *mapping,
331                                 pgoff_t start, pgoff_t end)
332 {
333         return __invalidate_mapping_pages(mapping, start, end, false);
334 }
335 EXPORT_SYMBOL(invalidate_mapping_pages);
336
337 /*
338  * This is like invalidate_complete_page(), except it ignores the page's
339  * refcount.  We do this because invalidate_inode_pages2() needs stronger
340  * invalidation guarantees, and cannot afford to leave pages behind because
341  * shrink_page_list() has a temp ref on them, or because they're transiently
342  * sitting in the lru_cache_add() pagevecs.
343  */
344 static int
345 invalidate_complete_page2(struct address_space *mapping, struct page *page)
346 {
347         if (page->mapping != mapping)
348                 return 0;
349
350         if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
351                 return 0;
352
353         write_lock_irq(&mapping->tree_lock);
354         if (PageDirty(page))
355                 goto failed;
356
357         BUG_ON(PagePrivate(page));
358         __remove_from_page_cache(page);
359         write_unlock_irq(&mapping->tree_lock);
360         ClearPageUptodate(page);
361         page_cache_release(page);       /* pagecache ref */
362         return 1;
363 failed:
364         write_unlock_irq(&mapping->tree_lock);
365         return 0;
366 }
367
368 static int do_launder_page(struct address_space *mapping, struct page *page)
369 {
370         if (!PageDirty(page))
371                 return 0;
372         if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
373                 return 0;
374         return mapping->a_ops->launder_page(page);
375 }
376
377 /**
378  * invalidate_inode_pages2_range - remove range of pages from an address_space
379  * @mapping: the address_space
380  * @start: the page offset 'from' which to invalidate
381  * @end: the page offset 'to' which to invalidate (inclusive)
382  *
383  * Any pages which are found to be mapped into pagetables are unmapped prior to
384  * invalidation.
385  *
386  * Returns -EIO if any pages could not be invalidated.
387  */
388 int invalidate_inode_pages2_range(struct address_space *mapping,
389                                   pgoff_t start, pgoff_t end)
390 {
391         struct pagevec pvec;
392         pgoff_t next;
393         int i;
394         int ret = 0;
395         int did_range_unmap = 0;
396         int wrapped = 0;
397
398         pagevec_init(&pvec, 0);
399         next = start;
400         while (next <= end && !wrapped &&
401                 pagevec_lookup(&pvec, mapping, next,
402                         min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
403                 for (i = 0; i < pagevec_count(&pvec); i++) {
404                         struct page *page = pvec.pages[i];
405                         pgoff_t page_index;
406
407                         lock_page(page);
408                         if (page->mapping != mapping) {
409                                 unlock_page(page);
410                                 continue;
411                         }
412                         page_index = page->index;
413                         next = page_index + 1;
414                         if (next == 0)
415                                 wrapped = 1;
416                         if (page_index > end) {
417                                 unlock_page(page);
418                                 break;
419                         }
420                         wait_on_page_writeback(page);
421                         if (page_mapped(page)) {
422                                 if (!did_range_unmap) {
423                                         /*
424                                          * Zap the rest of the file in one hit.
425                                          */
426                                         unmap_mapping_range(mapping,
427                                            (loff_t)page_index<<PAGE_CACHE_SHIFT,
428                                            (loff_t)(end - page_index + 1)
429                                                         << PAGE_CACHE_SHIFT,
430                                             0);
431                                         did_range_unmap = 1;
432                                 } else {
433                                         /*
434                                          * Just zap this page
435                                          */
436                                         unmap_mapping_range(mapping,
437                                           (loff_t)page_index<<PAGE_CACHE_SHIFT,
438                                           PAGE_CACHE_SIZE, 0);
439                                 }
440                         }
441                         BUG_ON(page_mapped(page));
442                         ret = do_launder_page(mapping, page);
443                         if (ret == 0 && !invalidate_complete_page2(mapping, page))
444                                 ret = -EIO;
445                         unlock_page(page);
446                 }
447                 pagevec_release(&pvec);
448                 cond_resched();
449         }
450         return ret;
451 }
452 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
453
454 /**
455  * invalidate_inode_pages2 - remove all pages from an address_space
456  * @mapping: the address_space
457  *
458  * Any pages which are found to be mapped into pagetables are unmapped prior to
459  * invalidation.
460  *
461  * Returns -EIO if any pages could not be invalidated.
462  */
463 int invalidate_inode_pages2(struct address_space *mapping)
464 {
465         return invalidate_inode_pages2_range(mapping, 0, -1);
466 }
467 EXPORT_SYMBOL_GPL(invalidate_inode_pages2);