4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
52 bh->b_end_io = handler;
53 bh->b_private = private;
/*
 * Bit-wait action used while sleeping on a bit in bh->b_state: recover
 * the buffer_head from the state word and kick the backing device's
 * request queue so outstanding I/O can make progress.
 * NOTE(review): the assignment of 'bd' (presumably from bh->b_bdev) is
 * on a line not visible here -- confirm against the full source.
 */
56 static int sync_buffer(void *word)
58 struct block_device *bd;
59 struct buffer_head *bh
60 = container_of(word, struct buffer_head, b_state);
65 blk_run_address_space(bd->bd_inode->i_mapping);
/*
 * Slow path of lock_buffer(): sleep in TASK_UNINTERRUPTIBLE until
 * BH_Lock is acquired, using sync_buffer() to push queued I/O while
 * we wait.
 */
70 void __lock_buffer(struct buffer_head *bh)
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 TASK_UNINTERRUPTIBLE);
75 EXPORT_SYMBOL(__lock_buffer);
/*
 * Release BH_Lock and wake any waiters.  The barrier after the clear
 * orders the unlock against the waitqueue check inside wake_up_bit();
 * do not reorder these three statements.
 */
77 void unlock_buffer(struct buffer_head *bh)
79 clear_bit_unlock(BH_Lock, &bh->b_state);
80 smp_mb__after_clear_bit();
81 wake_up_bit(&bh->b_state, BH_Lock);
85 * Block until a buffer comes unlocked. This doesn't stop it
86 * from becoming locked again - you have to lock it yourself
87 * if you want to preserve its state.
/* Sleep (uninterruptibly) until BH_Lock clears; does not take the lock. */
89 void __wait_on_buffer(struct buffer_head * bh)
91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
/*
 * Detach buffer bookkeeping from a page: clear PG_private, zero the
 * page->private pointer and drop the page reference that PagePrivate
 * pinned.
 */
95 __clear_page_buffers(struct page *page)
97 ClearPagePrivate(page);
98 set_page_private(page, 0);
99 page_cache_release(page);
/* Log an I/O error for this buffer, naming the device and block number. */
102 static void buffer_io_error(struct buffer_head *bh)
104 char b[BDEVNAME_SIZE];
106 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 bdevname(bh->b_bdev, b),
108 (unsigned long long)bh->b_blocknr);
112 * End-of-IO handler helper function which does not touch the bh after
114 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
115 * a race there is benign: unlock_buffer() only use the bh's address for
116 * hashing after unlocking the buffer, so it doesn't actually touch the bh
/*
 * Read-completion helper: mark the bh uptodate on success, clear the
 * flag on failure.  Must not touch the bh after unlocking it (the
 * unlock is on a line not visible here -- confirm in full source).
 */
119 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
122 set_buffer_uptodate(bh);
124 /* This happens, due to failed READA attempts. */
125 clear_buffer_uptodate(bh);
131 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
132 * unlock the buffer. This is what ll_rw_block uses too.
/* Default synchronous read-completion handler; delegates to the
 * no-touch helper (the bh reference drop is not visible here). */
134 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
136 __end_buffer_read_notouch(bh, uptodate);
/*
 * Default synchronous write-completion handler.  On success mark the
 * buffer uptodate; on failure record a write I/O error on the bh and
 * (rate-limited, unless the error was EOPNOTSUPP) log the lost write.
 */
140 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
142 char b[BDEVNAME_SIZE];
145 set_buffer_uptodate(bh);
147 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
149 printk(KERN_WARNING "lost page write due to "
151 bdevname(bh->b_bdev, b));
153 set_buffer_write_io_error(bh);
154 clear_buffer_uptodate(bh);
161 * Write out and wait upon all the dirty data associated with a block
162 * device via its mapping. Does not take the superblock lock.
/*
 * Write out and wait upon all dirty pagecache pages of a block device's
 * mapping.  Does not take the superblock lock.  Returns 0 or a negative
 * errno from filemap_write_and_wait().
 */
164 int sync_blockdev(struct block_device *bdev)
169 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
172 EXPORT_SYMBOL(sync_blockdev);
175 * Write out and wait upon all dirty data associated with this
176 * device. Filesystem data as well as the underlying block
177 * device. Takes the superblock lock.
/*
 * Sync the filesystem mounted on this device (if any) via fsync_super(),
 * then flush the underlying blockdev mapping itself.
 */
179 int fsync_bdev(struct block_device *bdev)
181 struct super_block *sb = get_super(bdev);
183 int res = fsync_super(sb);
187 return sync_blockdev(bdev);
191 * freeze_bdev -- lock a filesystem and force it into a consistent state
192 * @bdev: blockdevice to lock
194 * This takes the block device bd_mount_sem to make sure no new mounts
195 * happen on bdev until thaw_bdev() is called.
196 * If a superblock is found on this device, we take the s_umount semaphore
197 * on it to make sure nobody unmounts until the snapshot creation is done.
/*
 * Freeze the filesystem on @bdev for snapshotting: block new mounts via
 * bd_mount_sem, raise s_frozen through SB_FREEZE_WRITE/SB_FREEZE_TRANS,
 * flush the device and invoke the filesystem's write_super_lockfs hook.
 * The returned superblock stays locked until thaw_bdev().
 */
199 struct super_block *freeze_bdev(struct block_device *bdev)
201 struct super_block *sb;
203 down(&bdev->bd_mount_sem);
204 sb = get_super(bdev);
205 if (sb && !(sb->s_flags & MS_RDONLY)) {
206 sb->s_frozen = SB_FREEZE_WRITE;
211 sb->s_frozen = SB_FREEZE_TRANS;
214 sync_blockdev(sb->s_bdev);
216 if (sb->s_op->write_super_lockfs)
217 sb->s_op->write_super_lockfs(sb);
221 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
223 EXPORT_SYMBOL(freeze_bdev);
226 * thaw_bdev -- unlock filesystem
227 * @bdev: blockdevice to unlock
228 * @sb: associated superblock
230 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
/*
 * Undo freeze_bdev(): call the filesystem's unlockfs hook, mark the
 * superblock unfrozen, wake writers blocked on s_wait_unfrozen and
 * release bd_mount_sem so mounts may proceed again.
 */
232 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
235 BUG_ON(sb->s_bdev != bdev);
237 if (sb->s_op->unlockfs)
238 sb->s_op->unlockfs(sb);
239 sb->s_frozen = SB_UNFROZEN;
241 wake_up(&sb->s_wait_unfrozen);
245 up(&bdev->bd_mount_sem);
247 EXPORT_SYMBOL(thaw_bdev);
250 * Various filesystems appear to want __find_get_block to be non-blocking.
251 * But it's the page lock which protects the buffers. To get around this,
252 * we get exclusion from try_to_free_buffers with the blockdev mapping's
255 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
256 * may be quite high. This code could TryLock the page, and if that
257 * succeeds, there is no need to take private_lock. (But if
258 * private_lock is contended then so is mapping->tree_lock).
/*
 * Look a block up in the blockdev pagecache without blocking: find the
 * page that should hold it, then walk its buffer ring under the
 * mapping's private_lock looking for a matching b_blocknr.  Returns the
 * bh (details of the ref taken are on lines not visible here) or NULL.
 */
260 static struct buffer_head *
261 __find_get_block_slow(struct block_device *bdev, sector_t block)
263 struct inode *bd_inode = bdev->bd_inode;
264 struct address_space *bd_mapping = bd_inode->i_mapping;
265 struct buffer_head *ret = NULL;
267 struct buffer_head *bh;
268 struct buffer_head *head;
/* One page covers several blocks; convert block number to page index. */
272 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
273 page = find_get_page(bd_mapping, index);
277 spin_lock(&bd_mapping->private_lock);
278 if (!page_has_buffers(page))
280 head = page_buffers(page);
283 if (bh->b_blocknr == block) {
288 if (!buffer_mapped(bh))
290 bh = bh->b_this_page;
291 } while (bh != head);
293 /* we might be here because some of the buffers on this page are
294 * not mapped. This is due to various races between
295 * file io on the block device and getblk. It gets dealt with
296 * elsewhere, don't buffer_error if we had some unmapped buffers
299 printk("__find_get_block_slow() failed. "
300 "block=%llu, b_blocknr=%llu\n",
301 (unsigned long long)block,
302 (unsigned long long)bh->b_blocknr);
303 printk("b_state=0x%08lx, b_size=%zu\n",
304 bh->b_state, bh->b_size);
305 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
308 spin_unlock(&bd_mapping->private_lock);
309 page_cache_release(page);
314 /* If invalidate_buffers() will trash dirty buffers, it means some kind
315 of fs corruption is going on. Trashing dirty data always imply losing
316 information that was supposed to be just stored on the physical layer
319 Thus invalidate_buffers in general usage is not allowed to trash
320 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
321 be preserved. These buffers are simply skipped.
323 We also skip buffers which are still in use. For example this can
324 happen if a userspace program is reading the block device.
326 NOTE: In the case where the user removed a removable-media-disk even if
327 there's still dirty data not synced on disk (due a bug in the device driver
328 or due an error of the user), by not destroying the dirty buffers we could
329 generate corruption also on the next media inserted, thus a parameter is
330 necessary to handle this case in the most safe way possible (trying
331 to not corrupt also the new disk inserted with the data belonging to
332 the old now corrupted disk). Also for the ramdisk the natural thing
333 to do in order to release the ramdisk memory is to destroy dirty buffers.
335 These are two special cases. Normal usage imply the device driver
336 to issue a sync on the device (without waiting I/O completion) and
337 then an invalidate_buffers call that doesn't trash dirty buffers.
339 For handling cache coherency with the blkdev pagecache the 'update' case
340 has been introduced. It is needed to re-read from disk any pinned
341 buffer. NOTE: re-reading from disk is destructive so we can do it only
342 when we assume nobody is changing the buffercache under our I/O and when
343 we think the disk contains more recent information than the buffercache.
344 The update == 1 pass marks the buffers we need to update, the update == 2
345 pass does the actual I/O. */
/*
 * Throw away all cached pages of a block device's mapping (clean ones;
 * see the comment above for the dirty-buffer policy).  Also drains the
 * per-cpu bh LRUs so no stale bh references survive.
 */
346 void invalidate_bdev(struct block_device *bdev)
348 struct address_space *mapping = bdev->bd_inode->i_mapping;
350 if (mapping->nrpages == 0)
353 invalidate_bh_lrus();
354 invalidate_mapping_pages(mapping, 0, -1);
358 * Kick pdflush then try to free up some ZONE_NORMAL memory.
/*
 * Emergency memory-reclaim path for buffer-head allocation failures:
 * kick pdflush to write back dirty data, then directly try to free
 * pages on every online node (GFP_NOFS to avoid fs recursion).
 */
360 static void free_more_memory(void)
365 wakeup_pdflush(1024);
368 for_each_online_node(nid) {
369 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
370 gfp_zone(GFP_NOFS), NULL,
373 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
379 * I/O completion handler for block_read_full_page() - pages
380 * which come unlocked at the end of I/O.
/*
 * Async read completion for one buffer of a page.  Records this bh's
 * uptodate state, then, under BH_Uptodate_Lock (with irqs off, since
 * this runs from interrupt context), checks whether all async-read
 * buffers on the page have now completed; only then may the page be
 * marked uptodate and unlocked.
 */
382 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
385 struct buffer_head *first;
386 struct buffer_head *tmp;
388 int page_uptodate = 1;
390 BUG_ON(!buffer_async_read(bh));
394 set_buffer_uptodate(bh);
396 clear_buffer_uptodate(bh);
397 if (printk_ratelimit())
/*
403 * Be _very_ careful from here on. Bad things can happen if
404 * two buffer heads end IO at almost the same time and both
405 * decide that the page is now completely done.
*/
407 first = page_buffers(page);
408 local_irq_save(flags);
409 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
410 clear_buffer_async_read(bh);
/* Scan the ring: any buffer still under async read means we are not last. */
414 if (!buffer_uptodate(tmp))
416 if (buffer_async_read(tmp)) {
417 BUG_ON(!buffer_locked(tmp));
420 tmp = tmp->b_this_page;
422 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
423 local_irq_restore(flags);
/*
426 * If none of the buffers had errors and they are all
427 * uptodate then we can set the page uptodate.
*/
429 if (page_uptodate && !PageError(page))
430 SetPageUptodate(page);
/* still_busy path: another buffer is outstanding, just drop the lock. */
435 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
436 local_irq_restore(flags);
441 * Completion handler for block_write_full_page() - pages which are unlocked
442 * during I/O, and which have PageWriteback cleared upon I/O completion.
/*
 * Async write completion for one buffer of a page.  On error, record
 * AS_EIO on the mapping and a write error on the bh.  Under
 * BH_Uptodate_Lock (irqs off) check whether all async-write buffers on
 * the page are done; the last completer ends page writeback.
 */
444 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
446 char b[BDEVNAME_SIZE];
448 struct buffer_head *first;
449 struct buffer_head *tmp;
452 BUG_ON(!buffer_async_write(bh));
456 set_buffer_uptodate(bh);
458 if (printk_ratelimit()) {
460 printk(KERN_WARNING "lost page write due to "
462 bdevname(bh->b_bdev, b));
464 set_bit(AS_EIO, &page->mapping->flags);
465 set_buffer_write_io_error(bh);
466 clear_buffer_uptodate(bh);
470 first = page_buffers(page);
471 local_irq_save(flags);
472 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
474 clear_buffer_async_write(bh);
476 tmp = bh->b_this_page;
/* Any remaining async-write buffer means writeback is not finished yet. */
478 if (buffer_async_write(tmp)) {
479 BUG_ON(!buffer_locked(tmp));
482 tmp = tmp->b_this_page;
484 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
485 local_irq_restore(flags);
486 end_page_writeback(page);
/* still_busy path: another buffer is outstanding, just drop the lock. */
490 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
491 local_irq_restore(flags);
496 * If a page's buffers are under async readin (end_buffer_async_read
497 * completion) then there is a possibility that another thread of
498 * control could lock one of the buffers after it has completed
499 * but while some of the other buffers have not completed. This
500 * locked buffer would confuse end_buffer_async_read() into not unlocking
501 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
502 * that this buffer is not under async I/O.
504 * The page comes unlocked when it has no locked buffer_async buffers
507 * PageLocked prevents anyone starting new async I/O reads any of
510 * PageWriteback is used to prevent simultaneous writeout of the same
513 * PageLocked prevents anyone from starting writeback of a page which is
514 * under read I/O (PageWriteback is only ever set against a locked page).
/* Route this bh's completion to end_buffer_async_read() and flag it as
 * being under async read (see the locking comment above). */
516 static void mark_buffer_async_read(struct buffer_head *bh)
518 bh->b_end_io = end_buffer_async_read;
519 set_buffer_async_read(bh);
/* Route this bh's completion to end_buffer_async_write() and flag it as
 * being under async writeout. */
522 void mark_buffer_async_write(struct buffer_head *bh)
524 bh->b_end_io = end_buffer_async_write;
525 set_buffer_async_write(bh);
527 EXPORT_SYMBOL(mark_buffer_async_write);
531 * fs/buffer.c contains helper functions for buffer-backed address space's
532 * fsync functions. A common requirement for buffer-based filesystems is
533 * that certain data from the backing blockdev needs to be written out for
534 * a successful fsync(). For example, ext2 indirect blocks need to be
535 * written back and waited upon before fsync() returns.
537 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
538 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
539 * management of a list of dependent buffers at ->i_mapping->private_list.
541 * Locking is a little subtle: try_to_free_buffers() will remove buffers
542 * from their controlling inode's queue when they are being freed. But
543 * try_to_free_buffers() will be operating against the *blockdev* mapping
544 * at the time, not against the S_ISREG file which depends on those buffers.
545 * So the locking for private_list is via the private_lock in the address_space
546 * which backs the buffers. Which is different from the address_space
547 * against which the buffers are listed. So for a particular address_space,
548 * mapping->private_lock does *not* protect mapping->private_list! In fact,
549 * mapping->private_list will always be protected by the backing blockdev's
552 * Which introduces a requirement: all buffers on an address_space's
553 * ->private_list must be from the same address_space: the blockdev's.
555 * address_spaces which do not place buffers at ->private_list via these
556 * utility functions are free to use private_lock and private_list for
557 * whatever they want. The only requirement is that list_empty(private_list)
558 * be true at clear_inode() time.
560 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
561 * filesystems should do that. invalidate_inode_buffers() should just go
562 * BUG_ON(!list_empty).
564 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
565 * take an address_space, not an inode. And it should be called
566 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
569 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
570 * list if it is already on a list. Because if the buffer is on a list,
571 * it *must* already be on the right one. If not, the filesystem is being
572 * silly. This will save a ton of locking. But first we have to ensure
573 * that buffers are taken *off* the old inode's list when they are freed
574 * (presumably in truncate). That requires careful auditing of all
575 * filesystems (do it inside bforget()). It could also be done by bringing
580 * The buffer's backing address_space's private_lock must be held
/*
 * Unlink a bh from its inode's ->private_list.  If the buffer saw a
 * write error, propagate it to the associated mapping (AS_EIO) so a
 * later fsync() reports it.  Caller holds the backing address_space's
 * private_lock (see comment above).
 */
582 static void __remove_assoc_queue(struct buffer_head *bh)
584 list_del_init(&bh->b_assoc_buffers);
585 WARN_ON(!bh->b_assoc_map);
586 if (buffer_write_io_error(bh))
587 set_bit(AS_EIO, &bh->b_assoc_map->flags);
588 bh->b_assoc_map = NULL;
591 int inode_has_buffers(struct inode *inode)
593 return !list_empty(&inode->i_data.private_list);
597 * osync is designed to support O_SYNC io. It waits synchronously for
598 * all already-submitted IO to complete, but does not queue any new
599 * writes to the disk.
601 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
602 * you dirty the buffers, and then use osync_inode_buffers to wait for
603 * completion. Any other dirty buffers which are not yet queued for
604 * write will not be flushed to disk by the osync.
/*
 * Wait (backwards over the list, oldest first) for already-submitted
 * I/O on the listed buffers to complete; queues no new writes.  Used
 * as the final stage of fsync_buffers_list() -- see comment above.
 */
606 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
608 struct buffer_head *bh;
614 list_for_each_prev(p, list) {
616 if (buffer_locked(bh)) {
/* I/O finished with an error -> report it to the caller. */
620 if (!buffer_uptodate(bh))
632 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
633 * @mapping: the mapping which wants those buffers written
635 * Starts I/O against the buffers at mapping->private_list, and waits upon
638 * Basically, this is a convenience function for fsync().
639 * @mapping is a file or directory which needs those buffers to be written for
640 * a successful fsync().
/*
 * Write and wait upon the mapping's "associated" buffers, taking the
 * backing blockdev mapping's private_lock (which is what actually
 * protects ->private_list -- see the long comment above).
 */
642 int sync_mapping_buffers(struct address_space *mapping)
644 struct address_space *buffer_mapping = mapping->assoc_mapping;
646 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
649 return fsync_buffers_list(&buffer_mapping->private_lock,
650 &mapping->private_list);
652 EXPORT_SYMBOL(sync_mapping_buffers);
655 * Called when we've recently written block `bblock', and it is known that
656 * `bblock' was for a buffer_boundary() buffer. This means that the block at
657 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
658 * dirty, schedule it for IO. So that indirects merge nicely with their data.
/*
 * After writing boundary block @bblock, look up @bblock + 1 (likely a
 * dirty indirect block) and, if dirty, queue it for write so indirects
 * merge nicely with their data.
 */
660 void write_boundary_block(struct block_device *bdev,
661 sector_t bblock, unsigned blocksize)
663 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
665 if (buffer_dirty(bh))
666 ll_rw_block(WRITE, 1, &bh);
/*
 * Dirty the buffer and queue it on @inode's ->private_list so a later
 * fsync() of the inode writes it.  First use pins ->assoc_mapping to
 * the buffer's backing (blockdev) mapping; thereafter all listed
 * buffers must share that mapping (BUG otherwise).  The list move is
 * done under the backing mapping's private_lock.
 */
671 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
673 struct address_space *mapping = inode->i_mapping;
674 struct address_space *buffer_mapping = bh->b_page->mapping;
676 mark_buffer_dirty(bh);
677 if (!mapping->assoc_mapping) {
678 mapping->assoc_mapping = buffer_mapping;
680 BUG_ON(mapping->assoc_mapping != buffer_mapping);
/* Lockless check; fsync_buffers_list() copes with the race via the dirty bit. */
682 if (!bh->b_assoc_map) {
683 spin_lock(&buffer_mapping->private_lock);
684 list_move_tail(&bh->b_assoc_buffers,
685 &mapping->private_list);
686 bh->b_assoc_map = mapping;
687 spin_unlock(&buffer_mapping->private_lock);
690 EXPORT_SYMBOL(mark_buffer_dirty_inode);
693 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
696 * If warn is true, then emit a warning if the page is not uptodate and has
697 * not been truncated.
/*
 * Set the page's dirty bit, tag it dirty in the mapping's radix tree
 * (accounting it against NR_FILE_DIRTY/BDI stats when the mapping
 * tracks dirty memory) and mark the host inode I_DIRTY_PAGES.  The
 * page->mapping recheck under tree_lock guards against truncation.
 */
699 static int __set_page_dirty(struct page *page,
700 struct address_space *mapping, int warn)
702 if (unlikely(!mapping))
703 return !TestSetPageDirty(page);
/* Already dirty: nothing more to do. */
705 if (TestSetPageDirty(page))
708 spin_lock_irq(&mapping->tree_lock);
709 if (page->mapping) { /* Race with truncate? */
710 WARN_ON_ONCE(warn && !PageUptodate(page));
712 if (mapping_cap_account_dirty(mapping)) {
713 __inc_zone_page_state(page, NR_FILE_DIRTY);
714 __inc_bdi_stat(mapping->backing_dev_info,
716 task_io_account_write(PAGE_CACHE_SIZE);
718 radix_tree_tag_set(&mapping->page_tree,
719 page_index(page), PAGECACHE_TAG_DIRTY);
721 spin_unlock_irq(&mapping->tree_lock);
722 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
728 * Add a page to the dirty page list.
730 * It is a sad fact of life that this function is called from several places
731 * deeply under spinlocking. It may not sleep.
733 * If the page has buffers, the uptodate buffers are set dirty, to preserve
734 * dirty-state coherency between the page and the buffers. If the page does
735 * not have buffers then when they are later attached they will all be set
738 * The buffers are dirtied before the page is dirtied. There's a small race
739 * window in which a writepage caller may see the page cleanness but not the
740 * buffer dirtiness. That's fine. If this code were to set the page dirty
741 * before the buffers, a concurrent writepage caller could clear the page dirty
742 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
743 * page on the dirty page list.
745 * We use private_lock to lock against try_to_free_buffers while using the
746 * page's buffer list. Also use this to protect against clean buffers being
747 * added to the page after it was set dirty.
749 * FIXME: may need to call ->reservepage here as well. That's rather up to the
750 * address_space though.
/*
 * ->set_page_dirty for buffer-backed pages: dirty every buffer on the
 * page first (under private_lock, which also excludes
 * try_to_free_buffers), then dirty the page itself.  See the ordering
 * rationale in the comment above.  May not sleep.
 */
752 int __set_page_dirty_buffers(struct page *page)
754 struct address_space *mapping = page_mapping(page);
756 if (unlikely(!mapping))
757 return !TestSetPageDirty(page);
759 spin_lock(&mapping->private_lock);
760 if (page_has_buffers(page)) {
761 struct buffer_head *head = page_buffers(page);
762 struct buffer_head *bh = head;
765 set_buffer_dirty(bh);
766 bh = bh->b_this_page;
767 } while (bh != head);
769 spin_unlock(&mapping->private_lock);
771 return __set_page_dirty(page, mapping, 1);
773 EXPORT_SYMBOL(__set_page_dirty_buffers);
776 * Write out and wait upon a list of buffers.
778 * We have conflicting pressures: we want to make sure that all
779 * initially dirty buffers get waited on, but that any subsequently
780 * dirtied buffers don't. After all, we don't want fsync to last
781 * forever if somebody is actively writing to the file.
783 * Do this in two main stages: first we copy dirty buffers to a
784 * temporary inode list, queueing the writes as we go. Then we clean
785 * up, waiting for those writes to complete.
787 * During this second stage, any subsequent updates to the file may end
788 * up refiling the buffer on the original inode's dirty list again, so
789 * there is a chance we will end up with a buffer queued for write but
790 * not yet completed on that list. So, as a final cleanup we go through
791 * the osync code to catch these locked, dirty buffers without requeuing
792 * any newly dirty buffers for write.
/*
 * Two-stage fsync of an associated-buffer list (see comment above):
 * stage 1 moves dirty/locked buffers to a private tmp list, issuing
 * synchronous writes as it goes; stage 2 walks tmp, requeueing buffers
 * redirtied meanwhile and waiting on the rest.  Finishes with
 * osync_buffers_list() to catch in-flight stragglers.  @lock is the
 * backing mapping's private_lock, held while manipulating the lists
 * (drop/retake points are on lines not visible here).
 */
794 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
796 struct buffer_head *bh;
797 struct list_head tmp;
798 struct address_space *mapping;
801 INIT_LIST_HEAD(&tmp);
/* Stage 1: drain the inode's list, queueing writes for dirty buffers. */
804 while (!list_empty(list)) {
805 bh = BH_ENTRY(list->next);
806 mapping = bh->b_assoc_map;
807 __remove_assoc_queue(bh);
808 /* Avoid race with mark_buffer_dirty_inode() which does
809 * a lockless check and we rely on seeing the dirty bit */
811 if (buffer_dirty(bh) || buffer_locked(bh)) {
812 list_add(&bh->b_assoc_buffers, &tmp);
813 bh->b_assoc_map = mapping;
814 if (buffer_dirty(bh)) {
/*
818 * Ensure any pending I/O completes so that
819 * ll_rw_block() actually writes the current
820 * contents - it is a noop if I/O is still in
821 * flight on potentially older contents.
*/
823 ll_rw_block(SWRITE_SYNC, 1, &bh);
/* Stage 2: wait on everything we queued, newest-first. */
830 while (!list_empty(&tmp)) {
831 bh = BH_ENTRY(tmp.prev);
833 mapping = bh->b_assoc_map;
834 __remove_assoc_queue(bh);
835 /* Avoid race with mark_buffer_dirty_inode() which does
836 * a lockless check and we rely on seeing the dirty bit */
838 if (buffer_dirty(bh)) {
839 list_add(&bh->b_assoc_buffers,
840 &mapping->private_list);
841 bh->b_assoc_map = mapping;
845 if (!buffer_uptodate(bh))
/* Final pass: wait for any remaining in-flight writes on the list. */
852 err2 = osync_buffers_list(lock, list);
860 * Invalidate any and all dirty buffers on a given inode. We are
861 * probably unmounting the fs, but that doesn't mean we have already
862 * done a sync(). Just drop the buffers from the inode list.
864 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
865 * assumes that all the buffers are against the blockdev. Not true
/*
 * Drop every buffer from the inode's associated-buffer list without
 * writing them (typically at unmount).  Takes the backing blockdev
 * mapping's private_lock, per the locking rules described above.
 */
868 void invalidate_inode_buffers(struct inode *inode)
870 if (inode_has_buffers(inode)) {
871 struct address_space *mapping = &inode->i_data;
872 struct list_head *list = &mapping->private_list;
873 struct address_space *buffer_mapping = mapping->assoc_mapping;
875 spin_lock(&buffer_mapping->private_lock);
876 while (!list_empty(list))
877 __remove_assoc_queue(BH_ENTRY(list->next));
878 spin_unlock(&buffer_mapping->private_lock);
883 * Remove any clean buffers from the inode's buffer list. This is called
884 * when we're trying to free the inode itself. Those buffers can pin it.
886 * Returns true if all buffers were removed.
/*
 * Strip only the *clean* buffers from the inode's associated list
 * (called when freeing the inode; those buffers pin it).  Returns
 * true when the list was fully emptied, false if a dirty buffer
 * forced an early stop.
 */
888 int remove_inode_buffers(struct inode *inode)
892 if (inode_has_buffers(inode)) {
893 struct address_space *mapping = &inode->i_data;
894 struct list_head *list = &mapping->private_list;
895 struct address_space *buffer_mapping = mapping->assoc_mapping;
897 spin_lock(&buffer_mapping->private_lock);
898 while (!list_empty(list)) {
899 struct buffer_head *bh = BH_ENTRY(list->next);
900 if (buffer_dirty(bh)) {
904 __remove_assoc_queue(bh);
906 spin_unlock(&buffer_mapping->private_lock);
912 * Create the appropriate buffers when given a page for data area and
913 * the size of each buffer.. Use the bh->b_this_page linked list to
914 * follow the buffers created. Return NULL if unable to create more
917 * The retry flag is used to differentiate async IO (paging, swapping)
918 * which may not fail from ordinary buffer allocations.
/*
 * Allocate a chain of buffer heads covering @page, @size bytes each,
 * linked via b_this_page from the end of the page backwards.  On
 * allocation failure everything is released; async (retry) callers
 * then wait for bh memory to be freed and try again, non-async
 * callers get NULL.
 */
920 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
923 struct buffer_head *bh, *head;
/* Walk offsets from page end down to 0, one bh per block. */
929 while ((offset -= size) >= 0) {
930 bh = alloc_buffer_head(GFP_NOFS);
935 bh->b_this_page = head;
940 atomic_set(&bh->b_count, 0);
941 bh->b_private = NULL;
944 /* Link the buffer to its page */
945 set_bh_page(bh, page, offset);
947 init_buffer(bh, NULL, NULL);
/*
951 * In case anything failed, we just free everything we got.
*/
957 head = head->b_this_page;
958 free_buffer_head(bh);
/*
963 * Return failure for non-async IO requests. Async IO requests
964 * are not allowed to fail, so we have to wait until buffer heads
965 * become available. But we don't want tasks sleeping with
966 * partially complete buffers, so all were released above.
*/
971 /* We're _really_ low on memory. Now we just
972 * wait for old buffer heads to become free due to
973 * finishing IO. Since this is an async request and
974 * the reserve list is empty, we're sure there are
975 * async buffer heads in use.
*/
980 EXPORT_SYMBOL_GPL(alloc_page_buffers);
/*
 * Close the bh chain built by alloc_page_buffers() into a ring
 * (tail->b_this_page = head) and attach it to the page via
 * PagePrivate/page->private.
 */
983 link_dev_buffers(struct page *page, struct buffer_head *head)
985 struct buffer_head *bh, *tail;
990 bh = bh->b_this_page;
992 tail->b_this_page = head;
993 attach_page_buffers(page, head);
997 * Initialise the state of a blockdev page's buffers.
/*
 * Initialise a blockdev page's buffer ring: map each not-yet-mapped
 * buffer to consecutive device blocks starting at @block, inheriting
 * uptodate state from the page.
 */
1000 init_page_buffers(struct page *page, struct block_device *bdev,
1001 sector_t block, int size)
1003 struct buffer_head *head = page_buffers(page);
1004 struct buffer_head *bh = head;
1005 int uptodate = PageUptodate(page);
1008 if (!buffer_mapped(bh)) {
1009 init_buffer(bh, NULL, NULL);
1011 bh->b_blocknr = block;
1013 set_buffer_uptodate(bh);
1014 set_buffer_mapped(bh);
1017 bh = bh->b_this_page;
1018 } while (bh != head);
1022 * Create the page-cache page that contains the requested block.
1024 * This is used purely for blockdev mappings.
/*
 * Find or create the pagecache page that holds @block for @bdev and
 * give it buffers of @size bytes.  Reuses an existing matching buffer
 * ring; otherwise frees the old one and allocates afresh, linking the
 * new buffers under private_lock so __find_get_block() stays safe.
 * Returns the locked page, or NULL on failure (cleanup lines are not
 * all visible here).
 */
1026 static struct page *
1027 grow_dev_page(struct block_device *bdev, sector_t block,
1028 pgoff_t index, int size)
1030 struct inode *inode = bdev->bd_inode;
1032 struct buffer_head *bh;
1034 page = find_or_create_page(inode->i_mapping, index,
1035 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1039 BUG_ON(!PageLocked(page));
1041 if (page_has_buffers(page)) {
1042 bh = page_buffers(page);
1043 if (bh->b_size == size) {
1044 init_page_buffers(page, bdev, block, size);
/* Wrong-size buffers: must free them before re-attaching new ones. */
1047 if (!try_to_free_buffers(page))
/*
1052 * Allocate some buffers for this page
*/
1054 bh = alloc_page_buffers(page, size, 0);
/*
1059 * Link the page to the buffers and initialise them. Take the
1060 * lock to be atomic wrt __find_get_block(), which does not
1061 * run under the page lock.
*/
1063 spin_lock(&inode->i_mapping->private_lock);
1064 link_dev_buffers(page, bh);
1065 init_page_buffers(page, bdev, block, size);
1066 spin_unlock(&inode->i_mapping->private_lock);
1072 page_cache_release(page);
1077 * Create buffers for the specified block device block's page. If
1078 * that page was dirty, the buffers are set dirty also.
/*
 * Create buffers for the page containing @block on @bdev.  Computes
 * how many blocks fit in a page (sizebits), rejects blocks whose page
 * index would overflow pgoff_t, and delegates to grow_dev_page().
 */
1081 grow_buffers(struct block_device *bdev, sector_t block, int size)
1090 } while ((size << sizebits) < PAGE_SIZE);
1092 index = block >> sizebits;
/*
1095 * Check for a block which wants to lie outside our maximum possible
1096 * pagecache index. (this comparison is done using sector_t types).
*/
1098 if (unlikely(index != block >> sizebits)) {
1099 char b[BDEVNAME_SIZE];
1101 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1103 __func__, (unsigned long long)block,
/* Round block down to the first block of its page. */
1107 block = index << sizebits;
1108 /* Create a page with the proper size buffers.. */
1109 page = grow_dev_page(bdev, block, index, size);
1113 page_cache_release(page);
/*
 * Slow path of __getblk(): validate that @size is a multiple of the
 * device's hard sector size and within [512, PAGE_SIZE], then loop
 * growing buffers for the page until __find_get_block() succeeds.
 */
1117 static struct buffer_head *
1118 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1120 /* Size must be multiple of hard sectorsize */
1121 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1122 (size < 512 || size > PAGE_SIZE))) {
1123 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1125 printk(KERN_ERR "hardsect size: %d\n",
1126 bdev_hardsect_size(bdev));
1133 struct buffer_head * bh;
1136 bh = __find_get_block(bdev, block, size);
1140 ret = grow_buffers(bdev, block, size);
1149 * The relationship between dirty buffers and dirty pages:
1151 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1152 * the page is tagged dirty in its radix tree.
1154 * At all times, the dirtiness of the buffers represents the dirtiness of
1155 * subsections of the page. If the page has buffers, the page dirty bit is
1156 * merely a hint about the true dirty state.
1158 * When a page is set dirty in its entirety, all its buffers are marked dirty
1159 * (if the page has buffers).
1161 * When a buffer is marked dirty, its page is dirtied, but the page's other
1164 * Also. When blockdev buffers are explicitly read with bread(), they
1165 * individually become uptodate. But their backing page remains not
1166 * uptodate - even if all of its buffers are uptodate. A subsequent
1167 * block_read_full_page() against that page will discover all the uptodate
1168 * buffers, will set the page uptodate and will perform no I/O.
1172 * mark_buffer_dirty - mark a buffer_head as needing writeout
1173 * @bh: the buffer_head to mark dirty
1175 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1176 * backing page dirty, then tag the page as dirty in its address_space's radix
1177 * tree and then attach the address_space's inode to its superblock's dirty
1180 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1181 * mapping->tree_lock and the global inode_lock.
/*
 * Dirty a buffer and propagate to its page / radix tree / inode (see
 * kerneldoc above).  The already-dirty fast path rechecks after an
 * implied barrier (on a line not visible here) so a racing writeout
 * cannot lose our update.
 */
1183 void mark_buffer_dirty(struct buffer_head *bh)
1185 WARN_ON_ONCE(!buffer_uptodate(bh));
/*
1188 * Very *carefully* optimize the it-is-already-dirty case.
1190 * Don't let the final "is it dirty" escape to before we
1191 * perhaps modified the buffer.
*/
1193 if (buffer_dirty(bh)) {
1195 if (buffer_dirty(bh))
/* We made it dirty first: go dirty the page (warn=0, callers may dirty
 * non-uptodate pages legitimately here). */
1199 if (!test_set_buffer_dirty(bh))
1200 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1204 * Decrement a buffer_head's reference count. If all buffers against a page
1205 * have zero reference count, are clean and unlocked, and if the page is clean
1206 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1207 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1208 * a page but it ends up not being freed, and buffers may later be reattached).
/*
 * Drop one reference on a buffer head; warns (instead of underflowing)
 * if the count is already zero.
 */
1210 void __brelse(struct buffer_head * buf)
1212 if (atomic_read(&buf->b_count)) {
1216 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1220 * bforget() is like brelse(), except it discards any
1221 * potentially dirty data.
1223 void __bforget(struct buffer_head *bh)
/* Discard any dirty data on @bh, detach it from the inode's
 * associated-buffer list under private_lock, then (elided in this
 * listing) drop the reference as brelse() would. */
1225 	clear_buffer_dirty(bh);
1226 	if (bh->b_assoc_map) {
1227 		struct address_space *buffer_mapping = bh->b_page->mapping;
/* private_lock serializes against fsync_buffers_list walking the
 * b_assoc_buffers list. */
1229 		spin_lock(&buffer_mapping->private_lock);
1230 		list_del_init(&bh->b_assoc_buffers);
1231 		bh->b_assoc_map = NULL;
1232 		spin_unlock(&buffer_mapping->private_lock);
1237 static struct buffer_head *__bread_slow(struct buffer_head *bh)
/* Slow path for __bread(): synchronously read @bh from disk.  This
 * listing elides the lock_buffer()/get_bh()/wait_on_buffer() lines;
 * on I/O failure the (elided) tail releases bh and returns NULL. */
1240 	if (buffer_uptodate(bh)) {
1245 	bh->b_end_io = end_buffer_read_sync;
1246 	submit_bh(READ, bh);
1248 	if (buffer_uptodate(bh))
1256 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1257 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1258 * refcount elevated by one when they're in an LRU. A buffer can only appear
1259 * once in a particular CPU's LRU. A single buffer can be present in multiple
1260 * CPU's LRUs at the same time.
1262 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1263 * sb_find_get_block().
1265 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1266 * a local interrupt disable for that.
/* Per-CPU LRU of recently used buffer_heads; see the comment block
 * above.  bh_lru_lock() must disable interrupts only when the LRU can
 * be touched from IRQ context (the #ifdef condition is elided here). */
1269 #define BH_LRU_SIZE	8
1272 	struct buffer_head *bhs[BH_LRU_SIZE];
1275 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1278 #define bh_lru_lock()	local_irq_disable()
1279 #define bh_lru_unlock()	local_irq_enable()
1281 #define bh_lru_lock()	preempt_disable()
1282 #define bh_lru_unlock()	preempt_enable()
/* Sanity check: LRU helpers must not be entered with IRQs off, since
 * bh_lru_lock() may itself disable them. */
1285 static inline void check_irqs_on(void)
1287 #ifdef irqs_disabled
1288 	BUG_ON(irqs_disabled());
1293 * The LRU management algorithm is dopey-but-simple. Sorry.
1295 static void bh_lru_install(struct buffer_head *bh)
/* Move @bh to the front of this CPU's LRU.  Builds a compacted copy
 * in a local bhs[] (new bh at slot 0, duplicates skipped); if the
 * array overflows, the last entry becomes @evictee and its LRU
 * reference is dropped after unlock (those lines are elided here). */
1297 	struct buffer_head *evictee = NULL;
1302 	lru = &__get_cpu_var(bh_lrus);
/* Already at the head: nothing to do. */
1303 	if (lru->bhs[0] != bh) {
1304 		struct buffer_head *bhs[BH_LRU_SIZE];
1310 		for (in = 0; in < BH_LRU_SIZE; in++) {
1311 			struct buffer_head *bh2 = lru->bhs[in];
1316 				if (out >= BH_LRU_SIZE) {
1317 					BUG_ON(evictee != NULL);
/* Pad the remainder with NULLs so the memcpy below is well-defined. */
1324 		while (out < BH_LRU_SIZE)
1326 		memcpy(lru->bhs, bhs, sizeof(bhs));
1335 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
/* Scan this CPU's LRU for (@bdev, @block, @size); on a hit the entry
 * is rotated to the front (the shift loop and get_bh() are partially
 * elided in this listing) and returned with a reference held. */
1337 static struct buffer_head *
1338 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1340 	struct buffer_head *ret = NULL;
1346 	lru = &__get_cpu_var(bh_lrus);
1347 	for (i = 0; i < BH_LRU_SIZE; i++) {
1348 		struct buffer_head *bh = lru->bhs[i];
/* All three of bdev, block number and size must match — the same
 * block may be cached at different sizes. */
1350 		if (bh && bh->b_bdev == bdev &&
1351 				bh->b_blocknr == block && bh->b_size == size) {
/* Shift intervening entries down one slot to make room at the head. */
1354 				lru->bhs[i] = lru->bhs[i - 1];
1369 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1370 * it in the LRU and mark it as accessed. If it is not present then return
/* Pagecache lookup for a buffer: try the per-CPU LRU first, fall back
 * to the pagecache walk, and (elided here) install the result in the
 * LRU and mark it accessed before returning. */
1373 struct buffer_head *
1374 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1376 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1379 		bh = __find_get_block_slow(bdev, block);
1387 EXPORT_SYMBOL(__find_get_block);
1390 * __getblk will locate (and, if necessary, create) the buffer_head
1391 * which corresponds to the passed block_device, block and size. The
1392 * returned buffer has its reference count incremented.
1394 * __getblk() cannot fail - it just keeps trying. If you pass it an
1395 * illegal block number, __getblk() will happily return a buffer_head
1396 * which represents the non-existent block. Very weird.
1398 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1399 * attempt is failing. FIXME, perhaps?
/* Find-or-create the buffer_head for (@bdev, @block, @size); never
 * fails — see the comment block above.  Returns with an elevated
 * reference count. */
1401 struct buffer_head *
1402 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1404 	struct buffer_head *bh = __find_get_block(bdev, block, size);
/* Lookup miss: allocate pages/buffers until it exists. */
1408 		bh = __getblk_slow(bdev, block, size);
1411 EXPORT_SYMBOL(__getblk);
1414 * Do async read-ahead on a buffer..
1416 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
/* Fire-and-forget readahead: submit an async READA and (elided here)
 * drop the reference — the caller never sees the bh. */
1418 	struct buffer_head *bh = __getblk(bdev, block, size);
1420 		ll_rw_block(READA, 1, &bh);
1424 EXPORT_SYMBOL(__breadahead);
1427 * __bread() - reads a specified block and returns the bh
1428 * @bdev: the block_device to read from
1429 * @block: number of block
1430 * @size: size (in bytes) to read
1432 * Reads a specified block, and returns buffer head that contains it.
1433 * It returns NULL if the block was unreadable.
/* Synchronous read of one block; returns an uptodate bh or NULL on
 * I/O error (see __bread_slow). */
1435 struct buffer_head *
1436 __bread(struct block_device *bdev, sector_t block, unsigned size)
1438 	struct buffer_head *bh = __getblk(bdev, block, size);
/* Only hit the disk when the cached copy is not already valid. */
1440 	if (likely(bh) && !buffer_uptodate(bh))
1441 		bh = __bread_slow(bh);
1444 EXPORT_SYMBOL(__bread);
1447 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1448 * This doesn't race because it runs in each cpu either in irq
1449 * or with preempt disabled.
1451 static void invalidate_bh_lru(void *arg)
/* Runs on each CPU (via on_each_cpu): drop every LRU reference and
 * NULL the slots.  The brelse()/NULL-store lines are elided here. */
1453 	struct bh_lru *b = &get_cpu_var(bh_lrus);
1456 	for (i = 0; i < BH_LRU_SIZE; i++) {
1460 	put_cpu_var(bh_lrus);
1463 void invalidate_bh_lrus(void)
/* Flush every CPU's bh LRU; the final '1' waits for completion. */
1465 	on_each_cpu(invalidate_bh_lru, NULL, 1);
1467 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
/* Bind @bh to @page at byte @offset.  For highmem pages there is no
 * permanent kernel mapping, so b_data stores only the offset and the
 * real address is resolved at kmap time. */
1469 void set_bh_page(struct buffer_head *bh,
1470 		struct page *page, unsigned long offset)
1473 	BUG_ON(offset >= PAGE_SIZE);
1474 	if (PageHighMem(page))
1476 		 * This catches illegal uses and preserves the offset:
1478 		bh->b_data = (char *)(0 + offset);
1480 		bh->b_data = page_address(page) + offset;
1482 EXPORT_SYMBOL(set_bh_page);
1485 * Called when truncating a buffer on a page completely.
1487 static void discard_buffer(struct buffer_head * bh)
/* Truncation helper: strip all state from @bh so nothing is ever
 * written back.  This listing elides the surrounding lock_buffer()/
 * b_bdev clear/unlock_buffer() lines. */
1490 	clear_buffer_dirty(bh);
1492 	clear_buffer_mapped(bh);
1493 	clear_buffer_req(bh);
1494 	clear_buffer_new(bh);
1495 	clear_buffer_delay(bh);
1496 	clear_buffer_unwritten(bh);
1501 * block_invalidatepage - invalidate part or all of a buffer-backed page
1503 * @page: the page which is affected
1504 * @offset: the index of the truncation point
1506 * block_invalidatepage() is called when all or part of the page has become
1507 * invalidated by a truncate operation.
1509 * block_invalidatepage() does not have to release all buffers, but it must
1510 * ensure that no dirty buffer is left outside @offset and that no I/O
1511 * is underway against any of the blocks which are outside the truncation
1512 * point. Because the caller is about to free (and possibly reuse) those
1515 void block_invalidatepage(struct page *page, unsigned long offset)
/* Walk the page's buffer ring and discard every buffer wholly past
 * @offset (the discard_buffer() call in the loop body is elided in
 * this listing).  Caller holds the page lock. */
1517 	struct buffer_head *head, *bh, *next;
1518 	unsigned int curr_off = 0;
1520 	BUG_ON(!PageLocked(page));
1521 	if (!page_has_buffers(page))
1524 	head = page_buffers(page);
/* next_off is precomputed because discard may be destructive to bh. */
1527 		unsigned int next_off = curr_off + bh->b_size;
1528 		next = bh->b_this_page;
1531 		 * is this block fully invalidated?
1533 		if (offset <= curr_off)
1535 		curr_off = next_off;
1537 	} while (bh != head);
1540 	 * We release buffers only if the entire page is being invalidated.
1541 	 * The get_block cached value has been unconditionally invalidated,
1542 	 * so real IO is not possible anymore.
/* offset == 0 means the whole page is going away (checked on an
 * elided line) — only then may the buffers themselves be freed. */
1545 		try_to_release_page(page, 0);
1549 EXPORT_SYMBOL(block_invalidatepage);
1552 * We attach and possibly dirty the buffers atomically wrt
1553 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1554 * is already excluded via the page lock.
/* Allocate a full ring of buffers for @page, seed each with @b_state,
 * close the ring, then attach under private_lock — inheriting the
 * page's existing dirty/uptodate state so the buffers never disagree
 * with the page. */
1556 void create_empty_buffers(struct page *page,
1557 			unsigned long blocksize, unsigned long b_state)
1559 	struct buffer_head *bh, *head, *tail;
/* retry=1: alloc_page_buffers loops until the allocation succeeds. */
1561 	head = alloc_page_buffers(page, blocksize, 1);
1564 		bh->b_state |= b_state;
1566 		bh = bh->b_this_page;
/* Convert the NULL-terminated chain into a circular list. */
1568 	tail->b_this_page = head;
1570 	spin_lock(&page->mapping->private_lock);
1571 	if (PageUptodate(page) || PageDirty(page)) {
1574 			if (PageDirty(page))
1575 				set_buffer_dirty(bh);
1576 			if (PageUptodate(page))
1577 				set_buffer_uptodate(bh);
1578 			bh = bh->b_this_page;
1579 		} while (bh != head);
1581 	attach_page_buffers(page, head);
1582 	spin_unlock(&page->mapping->private_lock);
1587 * We are taking a block for data and we don't want any output from any
1588 * buffer-cache aliases starting from return from that function and
1589 * until the moment when something will explicitly mark the buffer
1590 * dirty (hopefully that will not happen until we will free that block ;-)
1591 * We don't even need to mark it not-uptodate - nobody can expect
1592 * anything from a newly allocated buffer anyway. We used to use
1593 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1594 * don't want to mark the alias unmapped, for example - it would confuse
1595 * anyone who might pick it with bread() afterwards...
1597 * Also.. Note that bforget() doesn't lock the buffer. So there can
1598 * be writeout I/O going on against recently-freed buffers. We don't
1599 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1600 * only if we really need to. That happens here.
1602 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
/* Kill any blockdev-mapping alias of a newly allocated data block:
 * find the old bh, clean it, wait out in-flight I/O, and release it
 * (the __brelse() is elided in this listing).  See the long comment
 * above for why the alias must not be written back. */
1604 	struct buffer_head *old_bh;
1608 	old_bh = __find_get_block_slow(bdev, block);
1610 		clear_buffer_dirty(old_bh);
/* Wait for any writeout that was already submitted against the alias. */
1611 		wait_on_buffer(old_bh);
1612 		clear_buffer_req(old_bh);
1616 EXPORT_SYMBOL(unmap_underlying_metadata);
1619 * NOTE! All mapped/uptodate combinations are valid:
1621 * Mapped Uptodate Meaning
1623 * No No "unknown" - must do get_block()
1624 * No Yes "hole" - zero-filled
1625 * Yes No "allocated" - allocated on disk, not read in
1626 * Yes Yes "valid" - allocated and up-to-date in memory.
1628 * "Dirty" is valid only with the last case (mapped+uptodate).
1632 * While block_write_full_page is writing back the dirty buffers under
1633 * the page lock, whoever dirtied the buffers may decide to clean them
1634 * again at any time. We handle that by only looking at the buffer
1635 * state inside lock_buffer().
1637 * If block_write_full_page() is called for regular writeback
1638 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1639 * locked buffer. This only can happen if someone has written the buffer
1640 * directly, with submit_bh(). At the address_space level PageWriteback
1641 * prevents this contention from occurring.
/* Core writepage: map dirty buffers, lock and mark them async-write,
 * set PageWriteback, submit, and handle the get_block-error recovery
 * path at the bottom.  This listing elides a number of original lines
 * (braces, loop openers, some gotos); comments below are written
 * against what is visible. */
1643 static int __block_write_full_page(struct inode *inode, struct page *page,
1644 			get_block_t *get_block, struct writeback_control *wbc)
1648 	sector_t last_block;
1649 	struct buffer_head *bh, *head;
1650 	const unsigned blocksize = 1 << inode->i_blkbits;
1651 	int nr_underway = 0;
1653 	BUG_ON(!PageLocked(page));
1655 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
/* A dirty page with no buffers (e.g. mmap write): create them already
 * dirty+uptodate so the walk below writes the whole page. */
1657 	if (!page_has_buffers(page)) {
1658 		create_empty_buffers(page, blocksize,
1659 					(1 << BH_Dirty)|(1 << BH_Uptodate));
1663 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1664 	 * here, and the (potentially unmapped) buffers may become dirty at
1665 	 * any time.  If a buffer becomes dirty here after we've inspected it
1666 	 * then we just miss that fact, and the page stays dirty.
1668 	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1669 	 * handle that here by just cleaning them.
1672 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1673 	head = page_buffers(page);
1677 	 * Get all the dirty buffers mapped to disk addresses and
1678 	 * handle any aliases from the underlying blockdev's mapping.
/* Pass 1: map every buffer inside i_size; clean ones past EOF. */
1681 		if (block > last_block) {
1683 			 * mapped buffers outside i_size will occur, because
1684 			 * this page can be outside i_size when there is a
1685 			 * truncate in progress.
1688 			 * The buffer was zeroed by block_write_full_page()
1690 			clear_buffer_dirty(bh);
1691 			set_buffer_uptodate(bh);
1692 		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1694 			WARN_ON(bh->b_size != blocksize);
/* create=1: writeback must allocate delalloc/unmapped blocks here. */
1695 			err = get_block(inode, block, bh, 1);
1698 			clear_buffer_delay(bh);
1699 			if (buffer_new(bh)) {
1700 				/* blockdev mappings never come here */
1701 				clear_buffer_new(bh);
1702 				unmap_underlying_metadata(bh->b_bdev,
1706 		bh = bh->b_this_page;
1708 	} while (bh != head);
/* Pass 2: lock each mapped buffer and claim it for async writeout. */
1711 		if (!buffer_mapped(bh))
1714 		 * If it's a fully non-blocking write attempt and we cannot
1715 		 * lock the buffer then redirty the page.  Note that this can
1716 		 * potentially cause a busy-wait loop from pdflush and kswapd
1717 		 * activity, but those code paths have their own higher-level
1720 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1722 		} else if (!trylock_buffer(bh)) {
1723 			redirty_page_for_writepage(wbc, page);
/* Only buffers that were actually dirty get submitted; a buffer that
 * raced clean is just unlocked (on an elided line). */
1726 		if (test_clear_buffer_dirty(bh)) {
1727 			mark_buffer_async_write(bh);
1731 	} while ((bh = bh->b_this_page) != head);
1734 	 * The page and its buffers are protected by PageWriteback(), so we can
1735 	 * drop the bh refcounts early.
1737 	BUG_ON(PageWriteback(page));
1738 	set_page_writeback(page);
/* Pass 3: submit the claimed buffers.  'next' is saved first because
 * end_io may run (and recycle bh) as soon as submit_bh returns. */
1741 		struct buffer_head *next = bh->b_this_page;
1742 		if (buffer_async_write(bh)) {
1743 			submit_bh(WRITE, bh);
1747 	} while (bh != head);
1752 	if (nr_underway == 0) {
1754 		 * The page was marked dirty, but the buffers were
1755 		 * clean.  Someone wrote them back by hand with
1756 		 * ll_rw_block/submit_bh.  A rare case.
1758 		end_page_writeback(page);
1761 		 * The page and buffer_heads can be released at any time from
/* Error recovery (reached via an elided label): get_block failed with
 * ENOSPC or similar after possibly allocating some blocks. */
1769 	 * ENOSPC, or some other error.  We may already have added some
1770 	 * blocks to the file, so we need to write these out to avoid
1771 	 * exposing stale data.
1772 	 * The page is currently locked and not marked for writeback
1775 	/* Recovery: lock and submit the mapped buffers */
1777 		if (buffer_mapped(bh) && buffer_dirty(bh) &&
1778 		    !buffer_delay(bh)) {
1780 			mark_buffer_async_write(bh);
1783 			 * The buffer may have been set dirty during
1784 			 * attachment to a dirty page.
1786 			clear_buffer_dirty(bh);
1788 	} while ((bh = bh->b_this_page) != head);
1790 	BUG_ON(PageWriteback(page));
/* Record the error on the mapping so a later fsync() reports it. */
1791 	mapping_set_error(page->mapping, err);
1792 	set_page_writeback(page);
1794 		struct buffer_head *next = bh->b_this_page;
1795 		if (buffer_async_write(bh)) {
1796 			clear_buffer_dirty(bh);
1797 			submit_bh(WRITE, bh);
1801 	} while (bh != head);
1807 * If a page has any new buffers, zero them out here, and mark them uptodate
1808 * and dirty so they'll be written out (in order to prevent uninitialised
1809 * block data from leaking). And clear the new bit.
1811 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
/* See the comment block above: zero BH_New buffers in [from, to) so
 * uninitialised block contents never leak, mark them uptodate+dirty,
 * and clear the new bit. */
1813 	unsigned int block_start, block_end;
1814 	struct buffer_head *head, *bh;
1816 	BUG_ON(!PageLocked(page));
1817 	if (!page_has_buffers(page))
1820 	bh = head = page_buffers(page);
1823 		block_end = block_start + bh->b_size;
1825 		if (buffer_new(bh)) {
1826 			if (block_end > from && block_start < to) {
/* If the page is already uptodate the data on it is valid and must
 * not be zeroed — only a not-uptodate page needs the scrub. */
1827 				if (!PageUptodate(page)) {
1828 					unsigned start, size;
/* Zero only the intersection of this buffer with [from, to). */
1830 					start = max(from, block_start);
1831 					size = min(to, block_end) - start;
1833 					zero_user(page, start, size);
1834 					set_buffer_uptodate(bh);
1837 				clear_buffer_new(bh);
1838 				mark_buffer_dirty(bh);
1842 		block_start = block_end;
1843 		bh = bh->b_this_page;
1844 	} while (bh != head);
1846 EXPORT_SYMBOL(page_zero_new_buffers);
/* Prepare [from, to) of @page for writing: map buffers via get_block,
 * zero freshly allocated blocks outside the write range, and read in
 * any partially-overwritten blocks that are not yet uptodate.  On
 * error the (elided) tail calls page_zero_new_buffers() to clean up.
 * NOTE(review): several lines (braces, err checks, wait_bh pushes)
 * are elided in this listing. */
1848 static int __block_prepare_write(struct inode *inode, struct page *page,
1849 		unsigned from, unsigned to, get_block_t *get_block)
1851 	unsigned block_start, block_end;
1854 	unsigned blocksize, bbits;
/* At most two reads can be needed: the first and last partial block. */
1855 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1857 	BUG_ON(!PageLocked(page));
1858 	BUG_ON(from > PAGE_CACHE_SIZE);
1859 	BUG_ON(to > PAGE_CACHE_SIZE);
1862 	blocksize = 1 << inode->i_blkbits;
1863 	if (!page_has_buffers(page))
1864 		create_empty_buffers(page, blocksize, 0);
1865 	head = page_buffers(page);
1867 	bbits = inode->i_blkbits;
1868 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1870 	for(bh = head, block_start = 0; bh != head || !block_start;
1871 	    block++, block_start=block_end, bh = bh->b_this_page) {
1872 		block_end = block_start + blocksize;
/* Buffer entirely outside the write range: just sync its uptodate
 * bit with the page's and move on. */
1873 		if (block_end <= from || block_start >= to) {
1874 			if (PageUptodate(page)) {
1875 				if (!buffer_uptodate(bh))
1876 					set_buffer_uptodate(bh);
1881 			clear_buffer_new(bh);
1882 		if (!buffer_mapped(bh)) {
1883 			WARN_ON(bh->b_size != blocksize);
1884 			err = get_block(inode, block, bh, 1);
1887 			if (buffer_new(bh)) {
/* Newly allocated block: kill any stale blockdev alias first. */
1888 				unmap_underlying_metadata(bh->b_bdev,
1890 				if (PageUptodate(page)) {
1891 					clear_buffer_new(bh);
1892 					set_buffer_uptodate(bh);
1893 					mark_buffer_dirty(bh);
/* Zero the parts of a new block that the write won't cover. */
1896 				if (block_end > to || block_start < from)
1897 					zero_user_segments(page,
1903 		if (PageUptodate(page)) {
1904 			if (!buffer_uptodate(bh))
1905 				set_buffer_uptodate(bh);
/* Partially-overwritten, not-uptodate, on-disk block: must be read
 * before the write so the untouched bytes survive. */
1908 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1909 		     !buffer_unwritten(bh) &&
1910 		     (block_start < from || block_end > to)) {
1911 			ll_rw_block(READ, 1, &bh);
1916 	 * If we issued read requests - let them complete.
1918 	while(wait_bh > wait) {
1919 		wait_on_buffer(*--wait_bh);
1920 		if (!buffer_uptodate(*wait_bh))
/* Error path: scrub any new buffers so stale data never leaks. */
1924 		page_zero_new_buffers(page, from, to);
/* Commit phase after a successful copy into [from, to): mark written
 * buffers uptodate+dirty, track whether every buffer ended uptodate,
 * and promote the whole page to uptodate when so. */
1928 static int __block_commit_write(struct inode *inode, struct page *page,
1929 		unsigned from, unsigned to)
1931 	unsigned block_start, block_end;
1934 	struct buffer_head *bh, *head;
1936 	blocksize = 1 << inode->i_blkbits;
1938 	for(bh = head = page_buffers(page), block_start = 0;
1939 	    bh != head || !block_start;
1940 	    block_start=block_end, bh = bh->b_this_page) {
1941 		block_end = block_start + blocksize;
/* Outside the written range: this buffer's uptodate state decides
 * whether the page as a whole can become uptodate (flag on an elided
 * line). */
1942 		if (block_end <= from || block_start >= to) {
1943 			if (!buffer_uptodate(bh))
1946 			set_buffer_uptodate(bh);
1947 			mark_buffer_dirty(bh);
1949 		clear_buffer_new(bh);
1953 	 * If this is a partial write which happened to make all buffers
1954 	 * uptodate then we can optimize away a bogus readpage() for
1955 	 * the next read().  Here we 'discover' whether the page went
1956 	 * uptodate as a result of this (potentially partial) write.
1959 		SetPageUptodate(page);
1964 * block_write_begin takes care of the basic task of block allocation and
1965 * bringing partial write blocks uptodate first.
1967 * If *pagep is not NULL, then block_write_begin uses the locked page
1968 * at *pagep rather than allocating its own. In this case, the page will
1969 * not be unlocked or deallocated on failure.
/* Generic ->write_begin: grab (or reuse *pagep as) a locked page and
 * run __block_prepare_write on the affected range.  On failure the
 * page is unlocked/released only if we allocated it ourselves (see
 * the ownership logic on elided lines), and blocks instantiated past
 * i_size are trimmed. */
1971 int block_write_begin(struct file *file, struct address_space *mapping,
1972 			loff_t pos, unsigned len, unsigned flags,
1973 			struct page **pagep, void **fsdata,
1974 			get_block_t *get_block)
1976 	struct inode *inode = mapping->host;
1980 	unsigned start, end;
1983 	index = pos >> PAGE_CACHE_SHIFT;
1984 	start = pos & (PAGE_CACHE_SIZE - 1);
1990 		page = __grab_cache_page(mapping, index);
1997 	BUG_ON(!PageLocked(page));
1999 	status = __block_prepare_write(inode, page, start, end, get_block);
2000 	if (unlikely(status)) {
2001 		ClearPageUptodate(page);
2005 			page_cache_release(page);
2009 		 * prepare_write() may have instantiated a few blocks
2010 		 * outside i_size.  Trim these off again. Don't need
2011 		 * i_size_read because we hold i_mutex.
2013 		if (pos + len > inode->i_size)
2014 			vmtruncate(inode, inode->i_size);
2022 EXPORT_SYMBOL(block_write_begin);
/* Generic ->write_end worker: handle short copies conservatively,
 * then commit the bytes actually copied.  Returns the (possibly
 * reduced) copied count via an elided return. */
2024 int block_write_end(struct file *file, struct address_space *mapping,
2025 			loff_t pos, unsigned len, unsigned copied,
2026 			struct page *page, void *fsdata)
2028 	struct inode *inode = mapping->host;
2031 	start = pos & (PAGE_CACHE_SIZE - 1);
2033 	if (unlikely(copied < len)) {
2035 		 * The buffers that were written will now be uptodate, so we
2036 		 * don't have to worry about a readpage reading them and
2037 		 * overwriting a partial write. However if we have encountered
2038 		 * a short write and only partially written into a buffer, it
2039 		 * will not be marked uptodate, so a readpage might come in and
2040 		 * destroy our partial write.
2042 		 * Do the simplest thing, and just treat any short write to a
2043 		 * non uptodate page as a zero-length write, and force the
2044 		 * caller to redo the whole thing.
/* copied is zeroed on an elided line when the page isn't uptodate. */
2046 		if (!PageUptodate(page))
2049 		page_zero_new_buffers(page, start+copied, start+len);
2051 	flush_dcache_page(page);
2053 	/* This could be a short (even 0-length) commit */
2054 	__block_commit_write(inode, page, start, start+copied);
2058 EXPORT_SYMBOL(block_write_end);
/* Standard ->write_end: commit via block_write_end, extend i_size
 * under the page lock, then unlock/release the page and dirty the
 * inode only after dropping the lock (see comment below). */
2060 int generic_write_end(struct file *file, struct address_space *mapping,
2061 			loff_t pos, unsigned len, unsigned copied,
2062 			struct page *page, void *fsdata)
2064 	struct inode *inode = mapping->host;
2065 	int i_size_changed = 0;
2067 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2070 	 * No need to use i_size_read() here, the i_size
2071 	 * cannot change under us because we hold i_mutex.
2073 	 * But it's important to update i_size while still holding page lock:
2074 	 * page writeout could otherwise come in and zero beyond i_size.
2076 	if (pos+copied > inode->i_size) {
2077 		i_size_write(inode, pos+copied);
2082 	page_cache_release(page);
2085 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2086 	 * makes the holding time of page lock longer. Second, it forces lock
2087 	 * ordering of page lock and transaction start for journaling
2091 		mark_inode_dirty(inode);
2095 EXPORT_SYMBOL(generic_write_end);
2098 * block_is_partially_uptodate checks whether buffers within a page are
2101 * Returns true if all buffers which correspond to a file portion
2102 * we want to read are uptodate.
/* Return whether all buffers covering the requested read window are
 * uptodate, so a read can be satisfied without readpage().  Bails out
 * (returns 0 via elided lines) when the window touches the first or
 * last partial block of the page. */
2104 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2107 	struct inode *inode = page->mapping->host;
2108 	unsigned block_start, block_end, blocksize;
2110 	struct buffer_head *bh, *head;
2113 	if (!page_has_buffers(page))
2116 	blocksize = 1 << inode->i_blkbits;
/* Clamp the window to what remains of the page. */
2117 	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2119 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2122 	head = page_buffers(page);
2126 		block_end = block_start + blocksize;
2127 		if (block_end > from && block_start < to) {
/* One stale buffer in range defeats the optimization. */
2128 			if (!buffer_uptodate(bh)) {
2132 			if (block_end >= to)
2135 		block_start = block_end;
2136 		bh = bh->b_this_page;
2137 	} while (bh != head);
2144 * Generic "read page" function for block devices that have the normal
2145 * get_block functionality. This is most of the block device filesystems.
2146 * Reads the page asynchronously --- the unlock_buffer() and
2147 * set/clear_buffer_uptodate() functions propagate buffer state into the
2148 * page struct once IO has completed.
2150 int block_read_full_page(struct page *page, get_block_t *get_block)
/* Async readpage (see comment block above).  Stage 1 collects the
 * buffers that need I/O into arr[]; stage 2 locks and marks them
 * async-read; stage 3 submits.  Holes are zero-filled in place.
 * NOTE(review): this listing elides several lines, including the
 * arr[] stores, error handling and the final return. */
2152 	struct inode *inode = page->mapping->host;
2153 	sector_t iblock, lblock;
2154 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2155 	unsigned int blocksize;
2157 	int fully_mapped = 1;
2159 	BUG_ON(!PageLocked(page));
2160 	blocksize = 1 << inode->i_blkbits;
2161 	if (!page_has_buffers(page))
2162 		create_empty_buffers(page, blocksize, 0);
2163 	head = page_buffers(page);
2165 	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
/* lblock = first block beyond EOF, rounding i_size up to a block. */
2166 	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2172 		if (buffer_uptodate(bh))
2175 		if (!buffer_mapped(bh)) {
2179 			if (iblock < lblock) {
2180 				WARN_ON(bh->b_size != blocksize);
/* create=0: reads never allocate blocks. */
2181 				err = get_block(inode, iblock, bh, 0);
/* Still unmapped after get_block: a hole — serve zeros, no I/O. */
2185 			if (!buffer_mapped(bh)) {
2186 				zero_user(page, i * blocksize, blocksize);
2188 					set_buffer_uptodate(bh);
2192 			 * get_block() might have updated the buffer
2195 			if (buffer_uptodate(bh))
2199 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2202 		SetPageMappedToDisk(page);
2206 		 * All buffers are uptodate - we can set the page uptodate
2207 		 * as well. But not if get_block() returned an error.
2209 		if (!PageError(page))
2210 			SetPageUptodate(page);
2215 	/* Stage two: lock the buffers */
2216 	for (i = 0; i < nr; i++) {
2219 		mark_buffer_async_read(bh);
2223 	 * Stage 3: start the IO.  Check for uptodateness
2224 	 * inside the buffer lock in case another process reading
2225 	 * the underlying blockdev brought it uptodate (the sct fix).
2227 	for (i = 0; i < nr; i++) {
2229 		if (buffer_uptodate(bh))
2230 			end_buffer_async_read(bh, 1);
2232 			submit_bh(READ, bh);
2237 /* utility function for filesystems that need to do work on expanding
2238 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2239 * deal with the hole.
2241 int generic_cont_expand_simple(struct inode *inode, loff_t size)
/* Expand @inode to @size via a zero-length pagecache write at the new
 * EOF, enforcing RLIMIT_FSIZE (with SIGXFSZ) and s_maxbytes first.
 * Error returns between checks are on elided lines. */
2243 	struct address_space *mapping = inode->i_mapping;
2246 	unsigned long limit;
2250 	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2251 	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2252 		send_sig(SIGXFSZ, current, 0);
2255 	if (size > inode->i_sb->s_maxbytes)
/* A 0-byte write_begin/write_end pair at offset @size makes the
 * filesystem instantiate and expose the new EOF. */
2258 	err = pagecache_write_begin(NULL, mapping, size, 0,
2259 				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2264 	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
/* Zero-fill the gap between the current end-of-data (*bytes) and @pos
 * for filesystems that cannot represent holes: first whole pages up
 * to the target page, then the leading part of the target page.
 * *bytes is advanced (via the |= rounding and elided updates) as the
 * fill proceeds.  Error/goto lines are elided in this listing. */
2271 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2272 			loff_t pos, loff_t *bytes)
2274 	struct inode *inode = mapping->host;
2275 	unsigned blocksize = 1 << inode->i_blkbits;
2278 	pgoff_t index, curidx;
2280 	unsigned zerofrom, offset, len;
2283 	index = pos >> PAGE_CACHE_SHIFT;
2284 	offset = pos & ~PAGE_CACHE_MASK;
/* Loop one page at a time until we reach the page containing @pos. */
2286 	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2287 		zerofrom = curpos & ~PAGE_CACHE_MASK;
/* Round *bytes up to a block boundary so the partial block at the old
 * EOF is treated as already filled. */
2288 		if (zerofrom & (blocksize-1)) {
2289 			*bytes |= (blocksize-1);
2292 		len = PAGE_CACHE_SIZE - zerofrom;
2294 		err = pagecache_write_begin(file, mapping, curpos, len,
2295 						AOP_FLAG_UNINTERRUPTIBLE,
2299 		zero_user(page, zerofrom, len);
2300 		err = pagecache_write_end(file, mapping, curpos, len, len,
/* Throttle: this loop can dirty a lot of pages in one call. */
2307 		balance_dirty_pages_ratelimited(mapping);
2310 	/* page covers the boundary, find the boundary offset */
2311 	if (index == curidx) {
2312 		zerofrom = curpos & ~PAGE_CACHE_MASK;
2313 		/* if we will expand the thing last block will be filled */
2314 		if (offset <= zerofrom) {
2317 		if (zerofrom & (blocksize-1)) {
2318 			*bytes |= (blocksize-1);
2321 		len = offset - zerofrom;
2323 		err = pagecache_write_begin(file, mapping, curpos, len,
2324 						AOP_FLAG_UNINTERRUPTIBLE,
2328 		zero_user(page, zerofrom, len);
2329 		err = pagecache_write_end(file, mapping, curpos, len, len,
2341 * For moronic filesystems that do not allow holes in file.
2342 * We may have to extend the file.
/* ->write_begin for no-holes filesystems: zero-fill from the current
 * end of data (*bytes) up to @pos, then fall through to the normal
 * block_write_begin. */
2344 int cont_write_begin(struct file *file, struct address_space *mapping,
2345 			loff_t pos, unsigned len, unsigned flags,
2346 			struct page **pagep, void **fsdata,
2347 			get_block_t *get_block, loff_t *bytes)
2349 	struct inode *inode = mapping->host;
2350 	unsigned blocksize = 1 << inode->i_blkbits;
2354 	err = cont_expand_zero(file, mapping, pos, bytes);
2358 	zerofrom = *bytes & ~PAGE_CACHE_MASK;
/* Writing past *bytes into a partial block: round *bytes up so the
 * block is considered fully initialised. */
2359 	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2360 		*bytes |= (blocksize-1);
2365 	err = block_write_begin(file, mapping, pos, len,
2366 				flags, pagep, fsdata, get_block);
/* Legacy prepare_write wrapper around __block_prepare_write; on
 * failure the page must not claim to be uptodate. */
2371 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2372 			get_block_t *get_block)
2374 	struct inode *inode = page->mapping->host;
2375 	int err = __block_prepare_write(inode, page, from, to, get_block);
2377 		ClearPageUptodate(page);
/* Thin public wrapper: commit [from, to) on @page.  The visible
 * __block_commit_write() never reports failure here; the (elided)
 * return is 0. */
2381 int block_commit_write(struct page *page, unsigned from, unsigned to)
2383 	struct inode *inode = page->mapping->host;
2384 	__block_commit_write(inode,page,from,to);
2389 * block_page_mkwrite() is not allowed to change the file size as it gets
2390 * called from a page fault handler when a page is first dirtied. Hence we must
2391 * be careful to check for EOF conditions here. We set the page up correctly
2392 * for a written page which means we get ENOSPC checking when writing into
2393 * holes and correct delalloc and unwritten extent mapping on filesystems that
2394 * support these features.
2396 * We are not allowed to take the i_mutex here so we have to play games to
2397 * protect against truncate races as the page could now be beyond EOF. Because
2398 * vmtruncate() writes the inode size before removing pages, once we have the
2399 * page lock we can determine safely if the page is beyond EOF. If it is not
2400 * beyond EOF, then the page is guaranteed safe against truncation until we
/* ->page_mkwrite helper: with the page locked (lock taken on an
 * elided line), verify it is still attached and inside EOF, then run
 * the prepare/commit pair over the in-EOF portion so a faulting write
 * gets proper block allocation and ENOSPC reporting. */
2404 block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2405 		   get_block_t get_block)
2407 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2413 	size = i_size_read(inode);
2414 	if ((page->mapping != inode->i_mapping) ||
2415 	    (page_offset(page) > size)) {
2416 		/* page got truncated out from underneath us */
2420 	/* page is wholly or partially inside EOF */
2421 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2422 		end = size & ~PAGE_CACHE_MASK;
2424 		end = PAGE_CACHE_SIZE;
2426 	ret = block_prepare_write(page, 0, end, get_block);
2428 		ret = block_commit_write(page, 0, end);
2436 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2437 * immediately, while under the page lock. So it needs a special end_io
2438 * handler which does not touch the bh after unlocking it.
2440 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
/* End-io for nobh prereads: must not touch bh after unlock, since the
 * caller frees these buffer_heads immediately (see comment above). */
2442 	__end_buffer_read_notouch(bh, uptodate);
2446 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2447 * the page (converting it to circular linked list and taking care of page
2450 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
/* Convert nobh_write_begin's NULL-terminated buffer chain into the
 * usual circular ring and attach it to @page under private_lock,
 * propagating the page's dirty bit to each buffer. */
2452 	struct buffer_head *bh;
2454 	BUG_ON(!PageLocked(page));
2456 	spin_lock(&page->mapping->private_lock);
2459 		if (PageDirty(page))
2460 			set_buffer_dirty(bh);
/* Last buffer in the chain: close the ring back to head. */
2461 		if (!bh->b_this_page)
2462 			bh->b_this_page = head;
2463 		bh = bh->b_this_page;
2464 	} while (bh != head);
2465 	attach_page_buffers(page, head);
2466 	spin_unlock(&page->mapping->private_lock);
2470 * On entry, the page is fully not uptodate.
2471 * On exit the page is fully uptodate in the areas outside (from,to)
/* write_begin that avoids permanently attaching buffer_heads: a
 * temporary NULL-terminated buffer chain tracks per-block state and
 * is freed on success, or attached to the page on error so writeout
 * error handling still works.  NOTE(review): this listing elides a
 * number of lines (braces, error gotos, the success-path frees). */
2473 int nobh_write_begin(struct file *file, struct address_space *mapping,
2474 			loff_t pos, unsigned len, unsigned flags,
2475 			struct page **pagep, void **fsdata,
2476 			get_block_t *get_block)
2478 	struct inode *inode = mapping->host;
2479 	const unsigned blkbits = inode->i_blkbits;
2480 	const unsigned blocksize = 1 << blkbits;
2481 	struct buffer_head *head, *bh;
2485 	unsigned block_in_page;
2486 	unsigned block_start, block_end;
2487 	sector_t block_in_file;
2490 	int is_mapped_to_disk = 1;
2492 	index = pos >> PAGE_CACHE_SHIFT;
2493 	from = pos & (PAGE_CACHE_SIZE - 1);
2496 	page = __grab_cache_page(mapping, index);
/* Page already has buffers (e.g. from a previous short write): the
 * nobh trick doesn't apply; fall back to the buffered path. */
2502 	if (page_has_buffers(page)) {
2504 		page_cache_release(page);
2506 		return block_write_begin(file, mapping, pos, len, flags, pagep,
2510 	if (PageMappedToDisk(page))
2514 	 * Allocate buffers so that we can keep track of state, and potentially
2515 	 * attach them to the page if an error occurs. In the common case of
2516 	 * no error, they will just be freed again without ever being attached
2517 	 * to the page (which is all OK, because we're under the page lock).
2519 	 * Be careful: the buffer linked list is a NULL terminated one, rather
2520 	 * than the circular one we're used to.
2522 	head = alloc_page_buffers(page, blocksize, 0);
2528 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2531 	 * We loop across all blocks in the page, whether or not they are
2532 	 * part of the affected region.  This is so we can discover if the
2533 	 * page is fully mapped-to-disk.
2535 	for (block_start = 0, block_in_page = 0, bh = head;
2536 		  block_start < PAGE_CACHE_SIZE;
2537 		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2540 		block_end = block_start + blocksize;
2543 		if (block_start >= to)
/* create=1: allocate blocks for the region being written. */
2545 		ret = get_block(inode, block_in_file + block_in_page,
2549 		if (!buffer_mapped(bh))
2550 			is_mapped_to_disk = 0;
/* Newly allocated block: kill stale blockdev aliases first. */
2552 			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2553 		if (PageUptodate(page)) {
2554 			set_buffer_uptodate(bh);
/* New or hole blocks: zero the parts outside [from, to) in place. */
2557 		if (buffer_new(bh) || !buffer_mapped(bh)) {
2558 			zero_user_segments(page, block_start, from,
2562 		if (buffer_uptodate(bh))
2563 			continue;	/* reiserfs does this */
/* Partial overwrite of an on-disk block: read it in asynchronously
 * with the no-touch end_io (buffers are freed right after). */
2564 		if (block_start < from || block_end > to) {
2566 			bh->b_end_io = end_buffer_read_nobh;
2567 			submit_bh(READ, bh);
2574 		 * The page is locked, so these buffers are protected from
2575 		 * any VM or truncate activity.  Hence we don't need to care
2576 		 * for the buffer_head refcounts.
2578 		for (bh = head; bh; bh = bh->b_this_page) {
2580 			if (!buffer_uptodate(bh))
2587 	if (is_mapped_to_disk)
2588 		SetPageMappedToDisk(page);
/* Hand the chain to nobh_write_end via fsdata for cleanup. */
2590 	*fsdata = head; /* to be released by nobh_write_end */
2597 	 * Error recovery is a bit difficult. We need to zero out blocks that
2598 	 * were newly allocated, and dirty them to ensure they get written out.
2599 	 * Buffers need to be attached to the page at this point, otherwise
2600 	 * the handling of potential IO errors during writeout would be hard
2601 	 * (could try doing synchronous writeout, but what if that fails too?)
2603 	attach_nobh_buffers(page, head);
2604 	page_zero_new_buffers(page, from, to);
2608 	page_cache_release(page);
/* Trim blocks instantiated beyond i_size by the failed attempt. */
2611 	if (pos + len > inode->i_size)
2612 		vmtruncate(inode, inode->i_size);
2616 EXPORT_SYMBOL(nobh_write_begin);
/*
 * ->write_end for the nobh (no attached buffer_head) write path.
 * In the common case it marks the whole page uptodate+dirty, updates
 * i_size, and frees the temporary buffer list that nobh_write_begin()
 * handed us via fsdata.  On a short copy it falls back to the regular
 * buffered path by attaching the buffers and calling generic_write_end().
 * NOTE(review): this excerpt is a sampled subset of the file — several
 * interior lines (braces, unlock/return statements, the loop header that
 * walks "head") are not visible here.
 */
2618 int nobh_write_end(struct file *file, struct address_space *mapping,
2619 loff_t pos, unsigned len, unsigned copied,
2620 struct page *page, void *fsdata)
2622 struct inode *inode = page->mapping->host;
2623 struct buffer_head *head = fsdata;
2624 struct buffer_head *bh;
2625 BUG_ON(fsdata != NULL && page_has_buffers(page));
/* Short copy: attach the buffers so the generic path can zero new ones. */
2627 if (unlikely(copied < len) && !page_has_buffers(page))
2628 attach_nobh_buffers(page, head);
2629 if (page_has_buffers(page))
2630 return generic_write_end(file, mapping, pos, len,
2631 copied, page, fsdata);
2633 SetPageUptodate(page);
2634 set_page_dirty(page);
/* Extend i_size if this write went past the current end of file. */
2635 if (pos+copied > inode->i_size) {
2636 i_size_write(inode, pos+copied);
2637 mark_inode_dirty(inode);
2641 page_cache_release(page);
/* Free the unattached buffer_head list built by nobh_write_begin(). */
2645 head = head->b_this_page;
2646 free_buffer_head(bh);
2651 EXPORT_SYMBOL(nobh_write_end);
2654 * nobh_writepage() - based on block_write_full_page() except
2655 * that it tries to operate without attaching bufferheads to
/*
 * the page.  Writepage for the nobh path: handles the i_size boundary
 * cases itself, then delegates to mpage_writepage() (with
 * __block_write_full_page() visible below as the fallback call).
 * NOTE(review): sampled excerpt — some interior lines (braces, the
 * "goto"/return after the inside-i_size check, unlock calls) are missing.
 */
2658 int nobh_writepage(struct page *page, get_block_t *get_block,
2659 struct writeback_control *wbc)
2661 struct inode * const inode = page->mapping->host;
2662 loff_t i_size = i_size_read(inode);
2663 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2667 /* Is the page fully inside i_size? */
2668 if (page->index < end_index)
2671 /* Is the page fully outside i_size? (truncate in progress) */
2672 offset = i_size & (PAGE_CACHE_SIZE-1);
2673 if (page->index >= end_index+1 || !offset) {
2675 * The page may have dirty, unmapped buffers. For example,
2676 * they may have been added in ext3_writepage(). Make them
2677 * freeable here, so the page does not leak.
2680 /* Not really sure about this - do we need this ? */
2681 if (page->mapping->a_ops->invalidatepage)
2682 page->mapping->a_ops->invalidatepage(page, offset);
2685 return 0; /* don't care */
2689 * The page straddles i_size. It must be zeroed out on each and every
2690 * writepage invocation because it may be mmapped. "A file is mapped
2691 * in multiples of the page size. For a file that is not a multiple of
2692 * the page size, the remaining memory is zeroed when mapped, and
2693 * writes to that region are not written out to the file."
2695 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2697 ret = mpage_writepage(page, get_block, wbc);
2699 ret = __block_write_full_page(inode, page, get_block, wbc);
2702 EXPORT_SYMBOL(nobh_writepage);
/*
 * Zero the partial block at the new EOF for the nobh path, using an
 * on-stack buffer_head (map_bh) instead of page buffers.  If the page
 * already has buffers attached, punt to block_truncate_page().
 * NOTE(review): sampled excerpt — declarations of map_bh setup, the
 * "has_buffers:" label, pos/iblock loop increments and the unlock/out
 * paths are not visible here.
 */
2704 int nobh_truncate_page(struct address_space *mapping,
2705 loff_t from, get_block_t *get_block)
2707 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2708 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2711 unsigned length, pos;
2712 struct inode *inode = mapping->host;
2714 struct buffer_head map_bh;
2717 blocksize = 1 << inode->i_blkbits;
2718 length = offset & (blocksize - 1);
2720 /* Block boundary? Nothing to do */
2724 length = blocksize - length;
2725 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2727 page = grab_cache_page(mapping, index);
2732 if (page_has_buffers(page)) {
2735 page_cache_release(page);
/* Page already has buffers: use the regular buffer-head truncate path. */
2736 return block_truncate_page(mapping, from, get_block);
2739 /* Find the buffer that contains "offset" */
2741 while (offset >= pos) {
2746 err = get_block(inode, iblock, &map_bh, 0);
2749 /* unmapped? It's a hole - nothing to do */
2750 if (!buffer_mapped(&map_bh))
2753 /* Ok, it's mapped. Make sure it's up-to-date */
2754 if (!PageUptodate(page)) {
2755 err = mapping->a_ops->readpage(NULL, page);
2757 page_cache_release(page);
/* readpage() unlocks the page; re-check it actually became uptodate. */
2761 if (!PageUptodate(page)) {
/* readpage may have attached buffers behind our back — recheck. */
2765 if (page_has_buffers(page))
2768 zero_user(page, offset, length);
2769 set_page_dirty(page);
2774 page_cache_release(page);
2778 EXPORT_SYMBOL(nobh_truncate_page);
/*
 * Zero the partial block containing "from" (the new EOF) so stale data
 * beyond i_size is not exposed via mmap or later extension.  Works on
 * the page's attached buffer_heads, creating empty ones if needed.
 * NOTE(review): sampled excerpt — the early-return on block boundary,
 * pos/iblock loop increments, error labels and unlock paths are missing.
 */
2780 int block_truncate_page(struct address_space *mapping,
2781 loff_t from, get_block_t *get_block)
2783 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2784 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2787 unsigned length, pos;
2788 struct inode *inode = mapping->host;
2790 struct buffer_head *bh;
2793 blocksize = 1 << inode->i_blkbits;
2794 length = offset & (blocksize - 1);
2796 /* Block boundary? Nothing to do */
2800 length = blocksize - length;
2801 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2803 page = grab_cache_page(mapping, index);
2808 if (!page_has_buffers(page))
2809 create_empty_buffers(page, blocksize, 0);
2811 /* Find the buffer that contains "offset" */
2812 bh = page_buffers(page);
2814 while (offset >= pos) {
2815 bh = bh->b_this_page;
/* Map the buffer if the filesystem hasn't done so yet. */
2821 if (!buffer_mapped(bh)) {
2822 WARN_ON(bh->b_size != blocksize);
2823 err = get_block(inode, iblock, bh, 0);
2826 /* unmapped? It's a hole - nothing to do */
2827 if (!buffer_mapped(bh))
2831 /* Ok, it's mapped. Make sure it's up-to-date */
2832 if (PageUptodate(page))
2833 set_buffer_uptodate(bh);
/* delay/unwritten buffers have no on-disk data worth reading. */
2835 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2837 ll_rw_block(READ, 1, &bh);
2839 /* Uhhuh. Read error. Complain and punt. */
2840 if (!buffer_uptodate(bh))
2844 zero_user(page, offset, length);
2845 mark_buffer_dirty(bh);
2850 page_cache_release(page);
2856 * The generic ->writepage function for buffer-backed address_spaces
/*
 * Handles the three i_size cases — page fully inside, fully outside
 * (truncate racing), or straddling EOF — then hands real work to
 * __block_write_full_page().
 */
2858 int block_write_full_page(struct page *page, get_block_t *get_block,
2859 struct writeback_control *wbc)
2861 struct inode * const inode = page->mapping->host;
2862 loff_t i_size = i_size_read(inode);
2863 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2866 /* Is the page fully inside i_size? */
2867 if (page->index < end_index)
2868 return __block_write_full_page(inode, page, get_block, wbc);
2870 /* Is the page fully outside i_size? (truncate in progress) */
2871 offset = i_size & (PAGE_CACHE_SIZE-1);
2872 if (page->index >= end_index+1 || !offset) {
2874 * The page may have dirty, unmapped buffers. For example,
2875 * they may have been added in ext3_writepage(). Make them
2876 * freeable here, so the page does not leak.
2878 do_invalidatepage(page, 0);
2880 return 0; /* don't care */
2884 * The page straddles i_size. It must be zeroed out on each and every
2885 * writepage invocation because it may be mmapped. "A file is mapped
2886 * in multiples of the page size. For a file that is not a multiple of
2887 * the page size, the remaining memory is zeroed when mapped, and
2888 * writes to that region are not written out to the file."
2890 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2891 return __block_write_full_page(inode, page, get_block, wbc);
2894 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2895 get_block_t *get_block)
2897 struct buffer_head tmp;
2898 struct inode *inode = mapping->host;
2901 tmp.b_size = 1 << inode->i_blkbits;
2902 get_block(inode, block, &tmp, 0);
2903 return tmp.b_blocknr;
/*
 * bio completion handler used by submit_bh(): translate the bio result
 * onto the buffer_head and invoke its b_end_io callback.
 * NOTE(review): sampled excerpt — braces and the trailing bio_put() are
 * not visible here.
 */
2906 static void end_bio_bh_io_sync(struct bio *bio, int err)
2908 struct buffer_head *bh = bio->bi_private;
/* Propagate "barrier not supported" so callers can retry without it. */
2910 if (err == -EOPNOTSUPP) {
2911 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2912 set_bit(BH_Eopnotsupp, &bh->b_state);
2915 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
/*
 * Package a single locked, mapped buffer_head into a one-segment bio and
 * submit it to the block layer.  The buffer must already have b_end_io
 * set; completion is routed through end_bio_bh_io_sync().
 * NOTE(review): sampled excerpt — the bio_get()/bio_put() reference
 * dance around submit_bio() and the final return are not visible here.
 */
2919 int submit_bh(int rw, struct buffer_head * bh)
2924 BUG_ON(!buffer_locked(bh));
2925 BUG_ON(!buffer_mapped(bh));
2926 BUG_ON(!bh->b_end_io);
2929 * Mask in barrier bit for a write (could be either a WRITE or a
2932 if (buffer_ordered(bh) && (rw & WRITE))
2933 rw |= WRITE_BARRIER;
2936 * Only clear out a write error when rewriting
2938 if (test_set_buffer_req(bh) && (rw & WRITE))
2939 clear_buffer_write_io_error(bh);
2942 * from here on down, it's all bio -- do the initial mapping,
2943 * submit_bio -> generic_make_request may further map this bio around
2945 bio = bio_alloc(GFP_NOIO, 1);
2947 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2948 bio->bi_bdev = bh->b_bdev;
2949 bio->bi_io_vec[0].bv_page = bh->b_page;
2950 bio->bi_io_vec[0].bv_len = bh->b_size;
2951 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2955 bio->bi_size = bh->b_size;
2957 bio->bi_end_io = end_bio_bh_io_sync;
2958 bio->bi_private = bh;
2961 submit_bio(rw, bio);
/* Report -EOPNOTSUPP back synchronously (e.g. barriers unsupported). */
2963 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2971 * ll_rw_block: low-level access to block devices (DEPRECATED)
2972 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2973 * @nr: number of &struct buffer_heads in the array
2974 * @bhs: array of pointers to &struct buffer_head
2976 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2977 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2978 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2979 * are sent to disk. The fourth %READA option is described in the documentation
2980 * for generic_make_request() which ll_rw_block() calls.
2982 * This function drops any buffer that it cannot get a lock on (with the
2983 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2984 * clean when doing a write request, and any buffer that appears to be
2985 * up-to-date when doing read request. Further it marks as clean buffers that
2986 * are processed for writing (the buffer cache won't assume that they are
2987 * actually clean until the buffer gets unlocked).
2989 * ll_rw_block sets b_end_io to simple completion handler that marks
2990 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2993 * All of the buffers must be for the same device, and must also be a
2994 * multiple of the current approved size for the device.
2996 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3000 for (i = 0; i < nr; i++) {
3001 struct buffer_head *bh = bhs[i];
/* SWRITE/SWRITE_SYNC block on the lock; plain READ/WRITE skip busy bhs. */
3003 if (rw == SWRITE || rw == SWRITE_SYNC)
3005 else if (!trylock_buffer(bh))
3008 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
3009 if (test_clear_buffer_dirty(bh)) {
3010 bh->b_end_io = end_buffer_write_sync;
3012 if (rw == SWRITE_SYNC)
3013 submit_bh(WRITE_SYNC, bh);
3015 submit_bh(WRITE, bh);
/* Read side: only issue I/O for buffers not already uptodate. */
3019 if (!buffer_uptodate(bh)) {
3020 bh->b_end_io = end_buffer_read_sync;
3031 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3032 * and then start new I/O and then wait upon it. The caller must have a ref on
/*
 * the buffer.  Returns 0 on success, negative errno on write failure.
 * NOTE(review): sampled excerpt — the lock_buffer()/wait_on_buffer()
 * calls and the clean-buffer else-branch are not visible here.
 */
3035 int sync_dirty_buffer(struct buffer_head *bh)
3039 WARN_ON(atomic_read(&bh->b_count) < 1);
3041 if (test_clear_buffer_dirty(bh)) {
3043 bh->b_end_io = end_buffer_write_sync;
3044 ret = submit_bh(WRITE_SYNC, bh);
/* Barrier write rejected by the device: clear the sticky flag. */
3046 if (buffer_eopnotsupp(bh)) {
3047 clear_buffer_eopnotsupp(bh);
3050 if (!ret && !buffer_uptodate(bh))
3059 * try_to_free_buffers() checks if all the buffers on this particular page
3060 * are unused, and releases them if so.
3062 * Exclusion against try_to_free_buffers may be obtained by either
3063 * locking the page or by holding its mapping's private_lock.
3065 * If the page is dirty but all the buffers are clean then we need to
3066 * be sure to mark the page clean as well. This is because the page
3067 * may be against a block device, and a later reattachment of buffers
3068 * to a dirty page will set *all* buffers dirty. Which would corrupt
3069 * filesystem data on the same device.
3071 * The same applies to regular filesystem pages: if all the buffers are
3072 * clean then we set the page clean and proceed. To do that, we require
3073 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3076 * try_to_free_buffers() is non-blocking.
3078 static inline int buffer_busy(struct buffer_head *bh)
3080 return atomic_read(&bh->b_count) |
3081 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
/*
 * Detach and hand back this page's buffer ring if and only if every
 * buffer is unused (not referenced, dirty, or locked).  On success the
 * ring is returned through *buffers_to_free and the page's buffer
 * association is cleared.  Caller holds the page lock or private_lock.
 * NOTE(review): sampled excerpt — the first do-loop header, the
 * "failed:" error path and the return statements are not visible here.
 */
3085 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3087 struct buffer_head *head = page_buffers(page);
3088 struct buffer_head *bh;
/* First pass: propagate write errors and bail if any buffer is busy. */
3092 if (buffer_write_io_error(bh) && page->mapping)
3093 set_bit(AS_EIO, &page->mapping->flags);
3094 if (buffer_busy(bh))
3096 bh = bh->b_this_page;
3097 } while (bh != head);
/* Second pass: unhook each buffer from its inode's associated list. */
3100 struct buffer_head *next = bh->b_this_page;
3102 if (bh->b_assoc_map)
3103 __remove_assoc_queue(bh);
3105 } while (bh != head);
3106 *buffers_to_free = head;
3107 __clear_page_buffers(page);
/*
 * Release a page's buffers if they are all unused; see the long comment
 * above for the page-dirtiness subtleties.  Buffer freeing is deferred
 * until after private_lock is dropped because free_buffer_head() takes
 * locks of its own.  Returns nonzero when the buffers were dropped.
 * NOTE(review): sampled excerpt — some braces, the "out:" label and the
 * final return are not visible here.
 */
3113 int try_to_free_buffers(struct page *page)
3115 struct address_space * const mapping = page->mapping;
3116 struct buffer_head *buffers_to_free = NULL;
3119 BUG_ON(!PageLocked(page));
3120 if (PageWriteback(page))
3123 if (mapping == NULL) { /* can this still happen? */
3124 ret = drop_buffers(page, &buffers_to_free);
3128 spin_lock(&mapping->private_lock);
3129 ret = drop_buffers(page, &buffers_to_free);
3132 * If the filesystem writes its buffers by hand (eg ext3)
3133 * then we can have clean buffers against a dirty page. We
3134 * clean the page here; otherwise the VM will never notice
3135 * that the filesystem did any IO at all.
3137 * Also, during truncate, discard_buffer will have marked all
3138 * the page's buffers clean. We discover that here and clean
3141 * private_lock must be held over this entire operation in order
3142 * to synchronise against __set_page_dirty_buffers and prevent the
3143 * dirty bit from being lost.
3146 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3147 spin_unlock(&mapping->private_lock);
/* Free the detached ring outside private_lock. */
3149 if (buffers_to_free) {
3150 struct buffer_head *bh = buffers_to_free;
3153 struct buffer_head *next = bh->b_this_page;
3154 free_buffer_head(bh);
3156 } while (bh != buffers_to_free);
3160 EXPORT_SYMBOL(try_to_free_buffers);
/*
 * ->sync_page: kick the backing device's request queue so I/O for this
 * page makes progress before the caller sleeps on it.
 * NOTE(review): sampled excerpt — the smp_mb() and the NULL-mapping
 * guard around the blk_run call are not visible here.
 */
3162 void block_sync_page(struct page *page)
3164 struct address_space *mapping;
3167 mapping = page_mapping(page);
3169 blk_run_backing_dev(mapping->backing_dev_info, page);
3173 * There are no bdflush tunables left. But distributions are
3174 * still running obsolete flush daemons, so we terminate them here.
3176 * Use of bdflush() is deprecated and will be removed in a future kernel.
3177 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
/* Compatibility stub: warns (rate-limited) and requires CAP_SYS_ADMIN. */
3179 asmlinkage long sys_bdflush(int func, long data)
3181 static int msg_count;
3183 if (!capable(CAP_SYS_ADMIN))
/* Nag at most five times per boot to avoid log spam. */
3186 if (msg_count < 5) {
3189 "warning: process `%s' used the obsolete bdflush"
3190 " system call\n", current->comm);
3191 printk(KERN_INFO "Fix your initscripts?\n");
3200 * Buffer-head allocation
/* SLAB cache backing all struct buffer_head allocations. */
3202 static struct kmem_cache *bh_cachep;
3205 * Once the number of bh's in the machine exceeds this level, we start
3206 * stripping them in writeback.
3208 static int max_buffer_heads;
/* Set by recalc_bh_state(); read by writeback to decide on stripping. */
3210 int buffer_heads_over_limit;
/* Per-CPU counters so bh alloc/free doesn't bounce a global cacheline. */
3212 struct bh_accounting {
3213 int nr; /* Number of live bh's */
3214 int ratelimit; /* Limit cacheline bouncing */
3217 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
/*
 * Periodically (every ~4096 local alloc/free events) sum the per-CPU
 * live-bh counters and update the global over-limit flag.  The
 * ratelimit keeps this O(nr_cpus) walk off the fast path.
 */
3219 static void recalc_bh_state(void)
3224 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3226 __get_cpu_var(bh_accounting).ratelimit = 0;
3227 for_each_online_cpu(i)
3228 tot += per_cpu(bh_accounting, i).nr;
3229 buffer_heads_over_limit = (tot > max_buffer_heads);
/*
 * Allocate a buffer_head from the SLAB cache and account it against the
 * per-CPU live counter.  Returns NULL on allocation failure.
 * NOTE(review): sampled excerpt — the NULL check around the accounting
 * and the recalc_bh_state() call are not visible here.
 */
3232 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3234 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3236 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3237 get_cpu_var(bh_accounting).nr++;
3239 put_cpu_var(bh_accounting);
3243 EXPORT_SYMBOL(alloc_buffer_head);
/*
 * Return a buffer_head to the SLAB cache.  The bh must already be off
 * any inode's associated-buffer list (BUG otherwise), and the per-CPU
 * live counter is decremented to mirror alloc_buffer_head().
 */
3245 void free_buffer_head(struct buffer_head *bh)
3247 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3248 kmem_cache_free(bh_cachep, bh);
3249 get_cpu_var(bh_accounting).nr--;
3251 put_cpu_var(bh_accounting);
3253 EXPORT_SYMBOL(free_buffer_head);
/*
 * CPU-hotplug teardown: drop the dead CPU's per-CPU bh LRU entries and
 * fold its live-bh count into the current CPU's counter so the global
 * total stays accurate.
 * NOTE(review): sampled excerpt — the brelse()/NULL-ing of each LRU
 * slot inside the loop is not visible here.
 */
3255 static void buffer_exit_cpu(int cpu)
3258 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3260 for (i = 0; i < BH_LRU_SIZE; i++) {
3264 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3265 per_cpu(bh_accounting, cpu).nr = 0;
3266 put_cpu_var(bh_accounting);
/*
 * Hotplug notifier callback: clean up buffer-head state when a CPU
 * goes offline (CPU_DEAD / CPU_DEAD_FROZEN); ignore all other events.
 */
3269 static int buffer_cpu_notify(struct notifier_block *self,
3270 unsigned long action, void *hcpu)
3272 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3273 buffer_exit_cpu((unsigned long)hcpu);
3278 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3279 * @bh: struct buffer_head
3281 * Return true if the buffer is up-to-date and false,
3282 * with the buffer locked, if not.
/*
 * NOTE(review): sampled excerpt — the lock_buffer() call and the
 * unlock on the re-check-uptodate path are not visible here.
 */
3284 int bh_uptodate_or_lock(struct buffer_head *bh)
3286 if (!buffer_uptodate(bh)) {
/* Re-check after taking the lock: I/O may have completed meanwhile. */
3288 if (!buffer_uptodate(bh))
3294 EXPORT_SYMBOL(bh_uptodate_or_lock);
3297 * bh_submit_read - Submit a locked buffer for reading
3298 * @bh: struct buffer_head
3300 * Returns zero on success and -EIO on error.
/*
 * Caller must hold the buffer lock; the buffer is unlocked on the
 * already-uptodate fast path and by I/O completion otherwise.
 * NOTE(review): sampled excerpt — the unlock_buffer()/return on the
 * fast path and the wait_on_buffer() before the final check are not
 * visible here.
 */
3302 int bh_submit_read(struct buffer_head *bh)
3304 BUG_ON(!buffer_locked(bh));
3306 if (buffer_uptodate(bh)) {
3312 bh->b_end_io = end_buffer_read_sync;
3313 submit_bh(READ, bh);
3315 if (buffer_uptodate(bh))
3319 EXPORT_SYMBOL(bh_submit_read);
/*
 * SLAB constructor for bh_cachep: zero the object and initialize its
 * associated-buffers list head so free_buffer_head()'s BUG_ON holds.
 */
3322 init_buffer_head(void *data)
3324 struct buffer_head *bh = data;
3326 memset(bh, 0, sizeof(*bh));
3327 INIT_LIST_HEAD(&bh->b_assoc_buffers);
/*
 * Boot-time setup: create the buffer_head SLAB cache, size the global
 * bh limit to ~10% of ZONE_NORMAL pages, and register the CPU-hotplug
 * notifier for per-CPU cleanup.
 */
3330 void __init buffer_init(void)
3334 bh_cachep = kmem_cache_create("buffer_head",
3335 sizeof(struct buffer_head), 0,
3336 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3341 * Limit the bh occupancy to 10% of ZONE_NORMAL
3343 nrpages = (nr_free_buffer_pages() * 10) / 100;
3344 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3345 hotcpu_notifier(buffer_cpu_notify, 0);
/* Public buffer-cache API exported to modules, alphabetically ordered. */
3348 EXPORT_SYMBOL(__bforget);
3349 EXPORT_SYMBOL(__brelse);
3350 EXPORT_SYMBOL(__wait_on_buffer);
3351 EXPORT_SYMBOL(block_commit_write);
3352 EXPORT_SYMBOL(block_prepare_write);
3353 EXPORT_SYMBOL(block_page_mkwrite);
3354 EXPORT_SYMBOL(block_read_full_page);
3355 EXPORT_SYMBOL(block_sync_page);
3356 EXPORT_SYMBOL(block_truncate_page);
3357 EXPORT_SYMBOL(block_write_full_page);
3358 EXPORT_SYMBOL(cont_write_begin);
3359 EXPORT_SYMBOL(end_buffer_read_sync);
3360 EXPORT_SYMBOL(end_buffer_write_sync);
3361 EXPORT_SYMBOL(file_fsync);
3362 EXPORT_SYMBOL(fsync_bdev);
3363 EXPORT_SYMBOL(generic_block_bmap);
3364 EXPORT_SYMBOL(generic_cont_expand_simple);
3365 EXPORT_SYMBOL(init_buffer);
3366 EXPORT_SYMBOL(invalidate_bdev);
3367 EXPORT_SYMBOL(ll_rw_block);
3368 EXPORT_SYMBOL(mark_buffer_dirty);
3369 EXPORT_SYMBOL(submit_bh);
3370 EXPORT_SYMBOL(sync_dirty_buffer);
3371 EXPORT_SYMBOL(unlock_buffer);