Merge branch 'linux-2.6'
[linux-2.6] / fs / buffer.c
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required by older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/capability.h>
29 #include <linux/blkdev.h>
30 #include <linux/file.h>
31 #include <linux/quotaops.h>
32 #include <linux/highmem.h>
33 #include <linux/module.h>
34 #include <linux/writeback.h>
35 #include <linux/hash.h>
36 #include <linux/suspend.h>
37 #include <linux/buffer_head.h>
38 #include <linux/task_io_accounting_ops.h>
39 #include <linux/bio.h>
40 #include <linux/notifier.h>
41 #include <linux/cpu.h>
42 #include <linux/bitops.h>
43 #include <linux/mpage.h>
44 #include <linux/bit_spinlock.h>
45
46 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 static void invalidate_bh_lrus(void);
48
49 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
50
51 inline void
52 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
53 {
54         bh->b_end_io = handler;
55         bh->b_private = private;
56 }
57
58 static int sync_buffer(void *word)
59 {
60         struct block_device *bd;
61         struct buffer_head *bh
62                 = container_of(word, struct buffer_head, b_state);
63
64         smp_mb();
65         bd = bh->b_bdev;
66         if (bd)
67                 blk_run_address_space(bd->bd_inode->i_mapping);
68         io_schedule();
69         return 0;
70 }
71
/*
 * Take the buffer lock by waiting, in TASK_UNINTERRUPTIBLE state, on the
 * BH_Lock bit of bh->b_state.  sync_buffer() is the action run while
 * waiting.
 */
void fastcall __lock_buffer(struct buffer_head *bh)
{
        wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
                                                        TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);
78
/*
 * Release the buffer lock: clear BH_Lock and wake anyone waiting on that
 * bit of bh->b_state.
 */
void fastcall unlock_buffer(struct buffer_head *bh)
{
        clear_buffer_locked(bh);
        /* Barrier between clearing the bit and issuing the wakeup. */
        smp_mb__after_clear_bit();
        wake_up_bit(&bh->b_state, BH_Lock);
}
85
/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 *
 * The wait is on the BH_Lock bit of bh->b_state, in TASK_UNINTERRUPTIBLE
 * state, with sync_buffer() as the wait action.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
        wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
}
95
/*
 * Detach the private (buffer) data from @page: clear PagePrivate, zero
 * the page's private word, and drop one page reference.
 */
static void
__clear_page_buffers(struct page *page)
{
        ClearPagePrivate(page);
        set_page_private(page, 0);
        /* NOTE(review): presumably pairs with a reference taken when the
         * buffers were attached - confirm against the attach path. */
        page_cache_release(page);
}
103
104 static void buffer_io_error(struct buffer_head *bh)
105 {
106         char b[BDEVNAME_SIZE];
107
108         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
109                         bdevname(bh->b_bdev, b),
110                         (unsigned long long)bh->b_blocknr);
111 }
112
/*
 * Default synchronous end-of-IO handler for reads (also what ll_rw_block
 * uses).  Records the outcome in the buffer's uptodate bit, unlocks the
 * buffer and drops one reference on it.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                /* This happens, due to failed READA attempts. */
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
        put_bh(bh);
}
128
129 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
130 {
131         char b[BDEVNAME_SIZE];
132
133         if (uptodate) {
134                 set_buffer_uptodate(bh);
135         } else {
136                 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
137                         buffer_io_error(bh);
138                         printk(KERN_WARNING "lost page write due to "
139                                         "I/O error on %s\n",
140                                        bdevname(bh->b_bdev, b));
141                 }
142                 set_buffer_write_io_error(bh);
143                 clear_buffer_uptodate(bh);
144         }
145         unlock_buffer(bh);
146         put_bh(bh);
147 }
148
149 /*
150  * Write out and wait upon all the dirty data associated with a block
151  * device via its mapping.  Does not take the superblock lock.
152  */
153 int sync_blockdev(struct block_device *bdev)
154 {
155         int ret = 0;
156
157         if (bdev)
158                 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
159         return ret;
160 }
161 EXPORT_SYMBOL(sync_blockdev);
162
/*
 * Write out and wait upon all dirty data associated with this
 * device.   Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 *
 * When get_super() finds no superblock for @bdev, only the block
 * device itself is synced via sync_blockdev().
 */
int fsync_bdev(struct block_device *bdev)
{
        struct super_block *sb;
        int res;

        sb = get_super(bdev);
        if (!sb)
                return sync_blockdev(bdev);

        res = fsync_super(sb);
        drop_super(sb);
        return res;
}
178
179 /**
180  * freeze_bdev  --  lock a filesystem and force it into a consistent state
181  * @bdev:       blockdevice to lock
182  *
183  * This takes the block device bd_mount_sem to make sure no new mounts
184  * happen on bdev until thaw_bdev() is called.
185  * If a superblock is found on this device, we take the s_umount semaphore
186  * on it to make sure nobody unmounts until the snapshot creation is done.
187  */
struct super_block *freeze_bdev(struct block_device *bdev)
{
        struct super_block *sb;

        /* Held until thaw_bdev() releases it. */
        down(&bdev->bd_mount_sem);
        sb = get_super(bdev);
        /* Only a writable filesystem goes through the freeze sequence. */
        if (sb && !(sb->s_flags & MS_RDONLY)) {
                /* Stage 1: publish the WRITE freeze level, then sync the fs. */
                sb->s_frozen = SB_FREEZE_WRITE;
                smp_wmb();

                __fsync_super(sb);

                /* Stage 2: publish the TRANS freeze level, then sync the
                 * superblock's block device. */
                sb->s_frozen = SB_FREEZE_TRANS;
                smp_wmb();

                sync_blockdev(sb->s_bdev);

                /* Optional filesystem hook, called last. */
                if (sb->s_op->write_super_lockfs)
                        sb->s_op->write_super_lockfs(sb);
        }

        /* Always sync the device itself, with or without a superblock. */
        sync_blockdev(bdev);
        return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
}
EXPORT_SYMBOL(freeze_bdev);
213
214 /**
215  * thaw_bdev  -- unlock filesystem
216  * @bdev:       blockdevice to unlock
217  * @sb:         associated superblock
218  *
219  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
220  */
void thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
        /* @sb may be NULL; then only bd_mount_sem is released below. */
        if (sb) {
                BUG_ON(sb->s_bdev != bdev);

                /* Optional filesystem hook, called before the state flips. */
                if (sb->s_op->unlockfs)
                        sb->s_op->unlockfs(sb);
                sb->s_frozen = SB_UNFROZEN;
                /* Publish the new state before waking s_wait_unfrozen. */
                smp_wmb();
                wake_up(&sb->s_wait_unfrozen);
                drop_super(sb);
        }

        /* Pairs with the down() in freeze_bdev(). */
        up(&bdev->bd_mount_sem);
}
EXPORT_SYMBOL(thaw_bdev);
237
238 /*
239  * Various filesystems appear to want __find_get_block to be non-blocking.
240  * But it's the page lock which protects the buffers.  To get around this,
241  * we get exclusion from try_to_free_buffers with the blockdev mapping's
242  * private_lock.
243  *
244  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
245  * may be quite high.  This code could TryLock the page, and if that
246  * succeeds, there is no need to take private_lock. (But if
247  * private_lock is contended then so is mapping->tree_lock).
248  */
249 static struct buffer_head *
250 __find_get_block_slow(struct block_device *bdev, sector_t block)
251 {
252         struct inode *bd_inode = bdev->bd_inode;
253         struct address_space *bd_mapping = bd_inode->i_mapping;
254         struct buffer_head *ret = NULL;
255         pgoff_t index;
256         struct buffer_head *bh;
257         struct buffer_head *head;
258         struct page *page;
259         int all_mapped = 1;
260
261         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
262         page = find_get_page(bd_mapping, index);
263         if (!page)
264                 goto out;
265
266         spin_lock(&bd_mapping->private_lock);
267         if (!page_has_buffers(page))
268                 goto out_unlock;
269         head = page_buffers(page);
270         bh = head;
271         do {
272                 if (bh->b_blocknr == block) {
273                         ret = bh;
274                         get_bh(bh);
275                         goto out_unlock;
276                 }
277                 if (!buffer_mapped(bh))
278                         all_mapped = 0;
279                 bh = bh->b_this_page;
280         } while (bh != head);
281
282         /* we might be here because some of the buffers on this page are
283          * not mapped.  This is due to various races between
284          * file io on the block device and getblk.  It gets dealt with
285          * elsewhere, don't buffer_error if we had some unmapped buffers
286          */
287         if (all_mapped) {
288                 printk("__find_get_block_slow() failed. "
289                         "block=%llu, b_blocknr=%llu\n",
290                         (unsigned long long)block,
291                         (unsigned long long)bh->b_blocknr);
292                 printk("b_state=0x%08lx, b_size=%zu\n",
293                         bh->b_state, bh->b_size);
294                 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
295         }
296 out_unlock:
297         spin_unlock(&bd_mapping->private_lock);
298         page_cache_release(page);
299 out:
300         return ret;
301 }
302
303 /* If invalidate_buffers() will trash dirty buffers, it means some kind
304    of fs corruption is going on. Trashing dirty data always imply losing
305    information that was supposed to be just stored on the physical layer
306    by the user.
307
308    Thus invalidate_buffers in general usage is not allowed to trash
309    dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
310    be preserved.  These buffers are simply skipped.
311   
312    We also skip buffers which are still in use.  For example this can
313    happen if a userspace program is reading the block device.
314
315    NOTE: In the case where the user removed a removable-media-disk even if
316    there's still dirty data not synced on disk (due to a bug in the device driver
317    or due to an error of the user), by not destroying the dirty buffers we could
318    generate corruption also on the next media inserted, thus a parameter is
319    necessary to handle this case in the most safe way possible (trying
320    to not corrupt also the new disk inserted with the data belonging to
321    the old now corrupted disk). Also for the ramdisk the natural thing
322    to do in order to release the ramdisk memory is to destroy dirty buffers.
323
324    These are two special cases. Normal usage imply the device driver
325    to issue a sync on the device (without waiting I/O completion) and
326    then an invalidate_buffers call that doesn't trash dirty buffers.
327
328    For handling cache coherency with the blkdev pagecache the 'update' case
329    has been introduced. It is needed to re-read from disk any pinned
330    buffer. NOTE: re-reading from disk is destructive so we can do it only
331    when we assume nobody is changing the buffercache under our I/O and when
332    we think the disk contains more recent information than the buffercache.
333    The update == 1 pass marks the buffers we need to update, the update == 2
334    pass does the actual I/O. */
335 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
336 {
337         struct address_space *mapping = bdev->bd_inode->i_mapping;
338
339         if (mapping->nrpages == 0)
340                 return;
341
342         invalidate_bh_lrus();
343         /*
344          * FIXME: what about destroy_dirty_buffers?
345          * We really want to use invalidate_inode_pages2() for
346          * that, but not until that's cleaned up.
347          */
348         invalidate_inode_pages(mapping);
349 }
350
351 /*
352  * Kick pdflush then try to free up some ZONE_NORMAL memory.
353  */
354 static void free_more_memory(void)
355 {
356         struct zone **zones;
357         pg_data_t *pgdat;
358
359         wakeup_pdflush(1024);
360         yield();
361
362         for_each_online_pgdat(pgdat) {
363                 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
364                 if (*zones)
365                         try_to_free_pages(zones, GFP_NOFS);
366         }
367 }
368
369 /*
370  * I/O completion handler for block_read_full_page() - pages
371  * which come unlocked at the end of I/O.
372  */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;
        int page_uptodate = 1;

        /* Only buffers marked by mark_buffer_async_read() may end up here. */
        BUG_ON(!buffer_async_read(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                /* Read failed: record it on both the buffer and the page. */
                clear_buffer_uptodate(bh);
                if (printk_ratelimit())
                        buffer_io_error(bh);
                SetPageError(page);
        }

        /*
         * Be _very_ careful from here on. Bad things can happen if
         * two buffer heads end IO at almost the same time and both
         * decide that the page is now completely done.
         */
        first = page_buffers(page);
        /*
         * The BH_Uptodate_Lock bit in the *first* buffer's b_state acts as
         * a per-page spinlock for the scan below; it is taken with local
         * interrupts disabled.
         */
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        /*
         * Walk the page's buffer ring starting from this buffer.  Any
         * buffer still flagged async_read has I/O outstanding, so the page
         * is not finished yet.
         */
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        page_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);

        /*
         * If none of the buffers had errors and they are all
         * uptodate then we can set the page uptodate.
         */
        if (page_uptodate && !PageError(page))
                SetPageUptodate(page);
        /* This was the last outstanding buffer: the page comes unlocked. */
        unlock_page(page);
        return;

still_busy:
        /* Another buffer on the page is still in flight; leave the page
         * locked for its completion to finish up. */
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        return;
}
430
431 /*
432  * Completion handler for block_write_full_page() - pages which are unlocked
433  * during I/O, and which have PageWriteback cleared upon I/O completion.
434  */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        char b[BDEVNAME_SIZE];
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;

        /* Only buffers marked by mark_buffer_async_write() may end up here. */
        BUG_ON(!buffer_async_write(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                if (printk_ratelimit()) {
                        buffer_io_error(bh);
                        printk(KERN_WARNING "lost page write due to "
                                        "I/O error on %s\n",
                               bdevname(bh->b_bdev, b));
                }
                /* Record the failure on the mapping, the buffer and the page. */
                set_bit(AS_EIO, &page->mapping->flags);
                set_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                SetPageError(page);
        }

        first = page_buffers(page);
        /*
         * The BH_Uptodate_Lock bit in the first buffer's b_state serialises
         * the scan below; it is taken with local interrupts disabled.
         */
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        /*
         * Check every *other* buffer on the page's ring; one still flagged
         * async_write means writeback of the page has not finished.
         */
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        /* Last buffer completed: end writeback on the page. */
        end_page_writeback(page);
        return;

still_busy:
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        return;
}
485
486 /*
487  * If a page's buffers are under async readin (end_buffer_async_read
488  * completion) then there is a possibility that another thread of
489  * control could lock one of the buffers after it has completed
490  * but while some of the other buffers have not completed.  This
491  * locked buffer would confuse end_buffer_async_read() into not unlocking
492  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
493  * that this buffer is not under async I/O.
494  *
495  * The page comes unlocked when it has no locked buffer_async buffers
496  * left.
497  *
498  * PageLocked prevents anyone starting new async I/O reads any of
499  * the buffers.
500  *
501  * PageWriteback is used to prevent simultaneous writeout of the same
502  * page.
503  *
504  * PageLocked prevents anyone from starting writeback of a page which is
505  * under read I/O (PageWriteback is only ever set against a locked page).
506  */
/* Route @bh's completion to end_buffer_async_read() and set its
 * async_read flag. */
static void mark_buffer_async_read(struct buffer_head *bh)
{
        bh->b_end_io = end_buffer_async_read;
        set_buffer_async_read(bh);
}
512
/* Route @bh's completion to end_buffer_async_write() and set its
 * async_write flag. */
void mark_buffer_async_write(struct buffer_head *bh)
{
        bh->b_end_io = end_buffer_async_write;
        set_buffer_async_write(bh);
}
EXPORT_SYMBOL(mark_buffer_async_write);
519
520
521 /*
522  * fs/buffer.c contains helper functions for buffer-backed address space's
523  * fsync functions.  A common requirement for buffer-based filesystems is
524  * that certain data from the backing blockdev needs to be written out for
525  * a successful fsync().  For example, ext2 indirect blocks need to be
526  * written back and waited upon before fsync() returns.
527  *
528  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
529  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
530  * management of a list of dependent buffers at ->i_mapping->private_list.
531  *
532  * Locking is a little subtle: try_to_free_buffers() will remove buffers
533  * from their controlling inode's queue when they are being freed.  But
534  * try_to_free_buffers() will be operating against the *blockdev* mapping
535  * at the time, not against the S_ISREG file which depends on those buffers.
536  * So the locking for private_list is via the private_lock in the address_space
537  * which backs the buffers.  Which is different from the address_space 
538  * against which the buffers are listed.  So for a particular address_space,
539  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
540  * mapping->private_list will always be protected by the backing blockdev's
541  * ->private_lock.
542  *
543  * Which introduces a requirement: all buffers on an address_space's
544  * ->private_list must be from the same address_space: the blockdev's.
545  *
546  * address_spaces which do not place buffers at ->private_list via these
547  * utility functions are free to use private_lock and private_list for
548  * whatever they want.  The only requirement is that list_empty(private_list)
549  * be true at clear_inode() time.
550  *
551  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
552  * filesystems should do that.  invalidate_inode_buffers() should just go
553  * BUG_ON(!list_empty).
554  *
555  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
556  * take an address_space, not an inode.  And it should be called
557  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
558  * queued up.
559  *
560  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
561  * list if it is already on a list.  Because if the buffer is on a list,
562  * it *must* already be on the right one.  If not, the filesystem is being
563  * silly.  This will save a ton of locking.  But first we have to ensure
564  * that buffers are taken *off* the old inode's list when they are freed
565  * (presumably in truncate).  That requires careful auditing of all
566  * filesystems (do it inside bforget()).  It could also be done by bringing
567  * b_inode back.
568  */
569
570 /*
571  * The buffer's backing address_space's private_lock must be held
572  */
573 static inline void __remove_assoc_queue(struct buffer_head *bh)
574 {
575         list_del_init(&bh->b_assoc_buffers);
576         WARN_ON(!bh->b_assoc_map);
577         if (buffer_write_io_error(bh))
578                 set_bit(AS_EIO, &bh->b_assoc_map->flags);
579         bh->b_assoc_map = NULL;
580 }
581
582 int inode_has_buffers(struct inode *inode)
583 {
584         return !list_empty(&inode->i_data.private_list);
585 }
586
587 /*
588  * osync is designed to support O_SYNC io.  It waits synchronously for
589  * all already-submitted IO to complete, but does not queue any new
590  * writes to the disk.
591  *
592  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
593  * you dirty the buffers, and then use osync_inode_buffers to wait for
594  * completion.  Any other dirty buffers which are not yet queued for
595  * write will not be flushed to disk by the osync.
596  */
/*
 * @lock: spinlock protecting @list
 * @list: list of buffers linked through b_assoc_buffers
 *
 * Returns 0, or -EIO if any buffer waited upon was not uptodate afterwards.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head *p;
        int err = 0;

        spin_lock(lock);
repeat:
        /* Scan from the tail for a buffer that is still locked. */
        list_for_each_prev(p, list) {
                bh = BH_ENTRY(p);
                if (buffer_locked(bh)) {
                        /* Hold a reference across the unlocked sleep. */
                        get_bh(bh);
                        spin_unlock(lock);
                        wait_on_buffer(bh);
                        if (!buffer_uptodate(bh))
                                err = -EIO;
                        brelse(bh);
                        spin_lock(lock);
                        /* The lock was dropped while waiting, so the scan is
                         * restarted from the beginning. */
                        goto repeat;
                }
        }
        spin_unlock(lock);
        return err;
}
621
622 /**
623  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
624  *                        buffers
625  * @mapping: the mapping which wants those buffers written
626  *
627  * Starts I/O against the buffers at mapping->private_list, and waits upon
628  * that I/O.
629  *
630  * Basically, this is a convenience function for fsync().
631  * @mapping is a file or directory which needs those buffers to be written for
632  * a successful fsync().
633  */
634 int sync_mapping_buffers(struct address_space *mapping)
635 {
636         struct address_space *buffer_mapping = mapping->assoc_mapping;
637
638         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
639                 return 0;
640
641         return fsync_buffers_list(&buffer_mapping->private_lock,
642                                         &mapping->private_list);
643 }
644 EXPORT_SYMBOL(sync_mapping_buffers);
645
646 /*
647  * Called when we've recently written block `bblock', and it is known that
648  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
649  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
650  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
651  */
652 void write_boundary_block(struct block_device *bdev,
653                         sector_t bblock, unsigned blocksize)
654 {
655         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
656         if (bh) {
657                 if (buffer_dirty(bh))
658                         ll_rw_block(WRITE, 1, &bh);
659                 put_bh(bh);
660         }
661 }
662
663 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
664 {
665         struct address_space *mapping = inode->i_mapping;
666         struct address_space *buffer_mapping = bh->b_page->mapping;
667
668         mark_buffer_dirty(bh);
669         if (!mapping->assoc_mapping) {
670                 mapping->assoc_mapping = buffer_mapping;
671         } else {
672                 BUG_ON(mapping->assoc_mapping != buffer_mapping);
673         }
674         if (list_empty(&bh->b_assoc_buffers)) {
675                 spin_lock(&buffer_mapping->private_lock);
676                 list_move_tail(&bh->b_assoc_buffers,
677                                 &mapping->private_list);
678                 bh->b_assoc_map = mapping;
679                 spin_unlock(&buffer_mapping->private_lock);
680         }
681 }
682 EXPORT_SYMBOL(mark_buffer_dirty_inode);
683
684 /*
685  * Add a page to the dirty page list.
686  *
687  * It is a sad fact of life that this function is called from several places
688  * deeply under spinlocking.  It may not sleep.
689  *
690  * If the page has buffers, the uptodate buffers are set dirty, to preserve
691  * dirty-state coherency between the page and the buffers.  If the page does
692  * not have buffers then when they are later attached they will all be set
693  * dirty.
694  *
695  * The buffers are dirtied before the page is dirtied.  There's a small race
696  * window in which a writepage caller may see the page cleanness but not the
697  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
698  * before the buffers, a concurrent writepage caller could clear the page dirty
699  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
700  * page on the dirty page list.
701  *
702  * We use private_lock to lock against try_to_free_buffers while using the
703  * page's buffer list.  Also use this to protect against clean buffers being
704  * added to the page after it was set dirty.
705  *
706  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
707  * address_space though.
708  */
/*
 * Returns 1 if this call changed the page from clean to dirty, 0 if the
 * page was already dirty.
 */
int __set_page_dirty_buffers(struct page *page)
{
        struct address_space * const mapping = page_mapping(page);

        /* No mapping: only the page flag can be updated. */
        if (unlikely(!mapping))
                return !TestSetPageDirty(page);

        /* Dirty every buffer on the page first, under private_lock. */
        spin_lock(&mapping->private_lock);
        if (page_has_buffers(page)) {
                struct buffer_head *head = page_buffers(page);
                struct buffer_head *bh = head;

                do {
                        set_buffer_dirty(bh);
                        bh = bh->b_this_page;
                } while (bh != head);
        }
        spin_unlock(&mapping->private_lock);

        /* Page was already dirty: accounting and tagging are already done. */
        if (TestSetPageDirty(page))
                return 0;

        write_lock_irq(&mapping->tree_lock);
        if (page->mapping) {    /* Race with truncate? */
                if (mapping_cap_account_dirty(mapping)) {
                        __inc_zone_page_state(page, NR_FILE_DIRTY);
                        task_io_account_write(PAGE_CACHE_SIZE);
                }
                /* Tag the page dirty in the mapping's radix tree. */
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
        }
        write_unlock_irq(&mapping->tree_lock);
        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        return 1;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
745
746 /*
747  * Write out and wait upon a list of buffers.
748  *
749  * We have conflicting pressures: we want to make sure that all
750  * initially dirty buffers get waited on, but that any subsequently
751  * dirtied buffers don't.  After all, we don't want fsync to last
752  * forever if somebody is actively writing to the file.
753  *
754  * Do this in two main stages: first we copy dirty buffers to a
755  * temporary inode list, queueing the writes as we go.  Then we clean
756  * up, waiting for those writes to complete.
757  * 
758  * During this second stage, any subsequent updates to the file may end
759  * up refiling the buffer on the original inode's dirty list again, so
760  * there is a chance we will end up with a buffer queued for write but
761  * not yet completed on that list.  So, as a final cleanup we go through
762  * the osync code to catch these locked, dirty buffers without requeuing
763  * any newly dirty buffers for write.
764  */
/*
 * @lock: spinlock protecting @list
 * @list: list of buffers linked through b_assoc_buffers
 *
 * Returns -EIO if a buffer waited upon in the second stage was not
 * uptodate; otherwise the result of the final osync_buffers_list() pass.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head tmp;
        int err = 0, err2;

        INIT_LIST_HEAD(&tmp);

        /*
         * Stage 1: empty @list.  Buffers that are dirty or locked are
         * parked on the private tmp list; dirty ones also get a write
         * submitted.
         */
        spin_lock(lock);
        while (!list_empty(list)) {
                bh = BH_ENTRY(list->next);
                /* Note: this clears bh->b_assoc_map as well. */
                __remove_assoc_queue(bh);
                if (buffer_dirty(bh) || buffer_locked(bh)) {
                        list_add(&bh->b_assoc_buffers, &tmp);
                        if (buffer_dirty(bh)) {
                                /* Pin the buffer while the lock is dropped. */
                                get_bh(bh);
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
                                 * ll_rw_block() actually writes the current
                                 * contents - it is a noop if I/O is still in
                                 * flight on potentially older contents.
                                 */
                                ll_rw_block(SWRITE, 1, &bh);
                                brelse(bh);
                                spin_lock(lock);
                        }
                }
        }

        /*
         * Stage 2: wait on every parked buffer.  list_add() above inserted
         * at the head of tmp, so taking tmp.prev handles the buffers in the
         * order they were parked.
         */
        while (!list_empty(&tmp)) {
                bh = BH_ENTRY(tmp.prev);
                list_del_init(&bh->b_assoc_buffers);
                get_bh(bh);
                spin_unlock(lock);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        err = -EIO;
                brelse(bh);
                spin_lock(lock);
        }

        spin_unlock(lock);
        /* Final pass: wait for buffers re-queued on @list in the meantime. */
        err2 = osync_buffers_list(lock, list);
        /* An error from stage 2 takes precedence over the osync result. */
        if (err)
                return err;
        else
                return err2;
}
814
815 /*
816  * Invalidate any and all dirty buffers on a given inode.  We are
817  * probably unmounting the fs, but that doesn't mean we have already
818  * done a sync().  Just drop the buffers from the inode list.
819  *
820  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
821  * assumes that all the buffers are against the blockdev.  Not true
822  * for reiserfs.
823  */
824 void invalidate_inode_buffers(struct inode *inode)
825 {
826         if (inode_has_buffers(inode)) {
827                 struct address_space *mapping = &inode->i_data;
828                 struct list_head *list = &mapping->private_list;
829                 struct address_space *buffer_mapping = mapping->assoc_mapping;
830
831                 spin_lock(&buffer_mapping->private_lock);
832                 while (!list_empty(list))
833                         __remove_assoc_queue(BH_ENTRY(list->next));
834                 spin_unlock(&buffer_mapping->private_lock);
835         }
836 }
837
838 /*
839  * Remove any clean buffers from the inode's buffer list.  This is called
840  * when we're trying to free the inode itself.  Those buffers can pin it.
841  *
842  * Returns true if all buffers were removed.
843  */
844 int remove_inode_buffers(struct inode *inode)
845 {
846         int ret = 1;
847
848         if (inode_has_buffers(inode)) {
849                 struct address_space *mapping = &inode->i_data;
850                 struct list_head *list = &mapping->private_list;
851                 struct address_space *buffer_mapping = mapping->assoc_mapping;
852
853                 spin_lock(&buffer_mapping->private_lock);
854                 while (!list_empty(list)) {
855                         struct buffer_head *bh = BH_ENTRY(list->next);
856                         if (buffer_dirty(bh)) {
857                                 ret = 0;
858                                 break;
859                         }
860                         __remove_assoc_queue(bh);
861                 }
862                 spin_unlock(&buffer_mapping->private_lock);
863         }
864         return ret;
865 }
866
867 /*
868  * Create the appropriate buffers when given a page for data area and
869  * the size of each buffer.. Use the bh->b_this_page linked list to
870  * follow the buffers created.  Return NULL if unable to create more
871  * buffers.
872  *
873  * The retry flag is used to differentiate async IO (paging, swapping)
874  * which may not fail from ordinary buffer allocations.
875  */
876 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
877                 int retry)
878 {
879         struct buffer_head *bh, *head;
880         long offset;
881
882 try_again:
883         head = NULL;
884         offset = PAGE_SIZE;
885         while ((offset -= size) >= 0) {
886                 bh = alloc_buffer_head(GFP_NOFS);
887                 if (!bh)
888                         goto no_grow;
889
890                 bh->b_bdev = NULL;
891                 bh->b_this_page = head;
892                 bh->b_blocknr = -1;
893                 head = bh;
894
895                 bh->b_state = 0;
896                 atomic_set(&bh->b_count, 0);
897                 bh->b_private = NULL;
898                 bh->b_size = size;
899
900                 /* Link the buffer to its page */
901                 set_bh_page(bh, page, offset);
902
903                 init_buffer(bh, NULL, NULL);
904         }
905         return head;
906 /*
907  * In case anything failed, we just free everything we got.
908  */
909 no_grow:
910         if (head) {
911                 do {
912                         bh = head;
913                         head = head->b_this_page;
914                         free_buffer_head(bh);
915                 } while (head);
916         }
917
918         /*
919          * Return failure for non-async IO requests.  Async IO requests
920          * are not allowed to fail, so we have to wait until buffer heads
921          * become available.  But we don't want tasks sleeping with 
922          * partially complete buffers, so all were released above.
923          */
924         if (!retry)
925                 return NULL;
926
927         /* We're _really_ low on memory. Now we just
928          * wait for old buffer heads to become free due to
929          * finishing IO.  Since this is an async request and
930          * the reserve list is empty, we're sure there are 
931          * async buffer heads in use.
932          */
933         free_more_memory();
934         goto try_again;
935 }
936 EXPORT_SYMBOL_GPL(alloc_page_buffers);
937
938 static inline void
939 link_dev_buffers(struct page *page, struct buffer_head *head)
940 {
941         struct buffer_head *bh, *tail;
942
943         bh = head;
944         do {
945                 tail = bh;
946                 bh = bh->b_this_page;
947         } while (bh);
948         tail->b_this_page = head;
949         attach_page_buffers(page, head);
950 }
951
952 /*
953  * Initialise the state of a blockdev page's buffers.
954  */ 
955 static void
956 init_page_buffers(struct page *page, struct block_device *bdev,
957                         sector_t block, int size)
958 {
959         struct buffer_head *head = page_buffers(page);
960         struct buffer_head *bh = head;
961         int uptodate = PageUptodate(page);
962
963         do {
964                 if (!buffer_mapped(bh)) {
965                         init_buffer(bh, NULL, NULL);
966                         bh->b_bdev = bdev;
967                         bh->b_blocknr = block;
968                         if (uptodate)
969                                 set_buffer_uptodate(bh);
970                         set_buffer_mapped(bh);
971                 }
972                 block++;
973                 bh = bh->b_this_page;
974         } while (bh != head);
975 }
976
977 /*
978  * Create the page-cache page that contains the requested block.
979  *
980  * This is user purely for blockdev mappings.
981  */
982 static struct page *
983 grow_dev_page(struct block_device *bdev, sector_t block,
984                 pgoff_t index, int size)
985 {
986         struct inode *inode = bdev->bd_inode;
987         struct page *page;
988         struct buffer_head *bh;
989
990         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
991         if (!page)
992                 return NULL;
993
994         BUG_ON(!PageLocked(page));
995
996         if (page_has_buffers(page)) {
997                 bh = page_buffers(page);
998                 if (bh->b_size == size) {
999                         init_page_buffers(page, bdev, block, size);
1000                         return page;
1001                 }
1002                 if (!try_to_free_buffers(page))
1003                         goto failed;
1004         }
1005
1006         /*
1007          * Allocate some buffers for this page
1008          */
1009         bh = alloc_page_buffers(page, size, 0);
1010         if (!bh)
1011                 goto failed;
1012
1013         /*
1014          * Link the page to the buffers and initialise them.  Take the
1015          * lock to be atomic wrt __find_get_block(), which does not
1016          * run under the page lock.
1017          */
1018         spin_lock(&inode->i_mapping->private_lock);
1019         link_dev_buffers(page, bh);
1020         init_page_buffers(page, bdev, block, size);
1021         spin_unlock(&inode->i_mapping->private_lock);
1022         return page;
1023
1024 failed:
1025         BUG();
1026         unlock_page(page);
1027         page_cache_release(page);
1028         return NULL;
1029 }
1030
1031 /*
1032  * Create buffers for the specified block device block's page.  If
1033  * that page was dirty, the buffers are set dirty also.
1034  *
1035  * Except that's a bug.  Attaching dirty buffers to a dirty
1036  * blockdev's page can result in filesystem corruption, because
1037  * some of those buffers may be aliases of filesystem data.
1038  * grow_dev_page() will go BUG() if this happens.
1039  */
1040 static int
1041 grow_buffers(struct block_device *bdev, sector_t block, int size)
1042 {
1043         struct page *page;
1044         pgoff_t index;
1045         int sizebits;
1046
1047         sizebits = -1;
1048         do {
1049                 sizebits++;
1050         } while ((size << sizebits) < PAGE_SIZE);
1051
1052         index = block >> sizebits;
1053
1054         /*
1055          * Check for a block which wants to lie outside our maximum possible
1056          * pagecache index.  (this comparison is done using sector_t types).
1057          */
1058         if (unlikely(index != block >> sizebits)) {
1059                 char b[BDEVNAME_SIZE];
1060
1061                 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1062                         "device %s\n",
1063                         __FUNCTION__, (unsigned long long)block,
1064                         bdevname(bdev, b));
1065                 return -EIO;
1066         }
1067         block = index << sizebits;
1068         /* Create a page with the proper size buffers.. */
1069         page = grow_dev_page(bdev, block, index, size);
1070         if (!page)
1071                 return 0;
1072         unlock_page(page);
1073         page_cache_release(page);
1074         return 1;
1075 }
1076
1077 static struct buffer_head *
1078 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1079 {
1080         /* Size must be multiple of hard sectorsize */
1081         if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1082                         (size < 512 || size > PAGE_SIZE))) {
1083                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1084                                         size);
1085                 printk(KERN_ERR "hardsect size: %d\n",
1086                                         bdev_hardsect_size(bdev));
1087
1088                 dump_stack();
1089                 return NULL;
1090         }
1091
1092         for (;;) {
1093                 struct buffer_head * bh;
1094                 int ret;
1095
1096                 bh = __find_get_block(bdev, block, size);
1097                 if (bh)
1098                         return bh;
1099
1100                 ret = grow_buffers(bdev, block, size);
1101                 if (ret < 0)
1102                         return NULL;
1103                 if (ret == 0)
1104                         free_more_memory();
1105         }
1106 }
1107
1108 /*
1109  * The relationship between dirty buffers and dirty pages:
1110  *
1111  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1112  * the page is tagged dirty in its radix tree.
1113  *
1114  * At all times, the dirtiness of the buffers represents the dirtiness of
1115  * subsections of the page.  If the page has buffers, the page dirty bit is
1116  * merely a hint about the true dirty state.
1117  *
1118  * When a page is set dirty in its entirety, all its buffers are marked dirty
1119  * (if the page has buffers).
1120  *
1121  * When a buffer is marked dirty, its page is dirtied, but the page's other
1122  * buffers are not.
1123  *
1124  * Also.  When blockdev buffers are explicitly read with bread(), they
1125  * individually become uptodate.  But their backing page remains not
1126  * uptodate - even if all of its buffers are uptodate.  A subsequent
1127  * block_read_full_page() against that page will discover all the uptodate
1128  * buffers, will set the page uptodate and will perform no I/O.
1129  */
1130
1131 /**
1132  * mark_buffer_dirty - mark a buffer_head as needing writeout
1133  * @bh: the buffer_head to mark dirty
1134  *
1135  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1136  * backing page dirty, then tag the page as dirty in its address_space's radix
1137  * tree and then attach the address_space's inode to its superblock's dirty
1138  * inode list.
1139  *
1140  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1141  * mapping->tree_lock and the global inode_lock.
1142  */
1143 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1144 {
1145         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1146                 __set_page_dirty_nobuffers(bh->b_page);
1147 }
1148
1149 /*
1150  * Decrement a buffer_head's reference count.  If all buffers against a page
1151  * have zero reference count, are clean and unlocked, and if the page is clean
1152  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1153  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1154  * a page but it ends up not being freed, and buffers may later be reattached).
1155  */
1156 void __brelse(struct buffer_head * buf)
1157 {
1158         if (atomic_read(&buf->b_count)) {
1159                 put_bh(buf);
1160                 return;
1161         }
1162         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1163         WARN_ON(1);
1164 }
1165
1166 /*
1167  * bforget() is like brelse(), except it discards any
1168  * potentially dirty data.
1169  */
1170 void __bforget(struct buffer_head *bh)
1171 {
1172         clear_buffer_dirty(bh);
1173         if (!list_empty(&bh->b_assoc_buffers)) {
1174                 struct address_space *buffer_mapping = bh->b_page->mapping;
1175
1176                 spin_lock(&buffer_mapping->private_lock);
1177                 list_del_init(&bh->b_assoc_buffers);
1178                 bh->b_assoc_map = NULL;
1179                 spin_unlock(&buffer_mapping->private_lock);
1180         }
1181         __brelse(bh);
1182 }
1183
1184 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1185 {
1186         lock_buffer(bh);
1187         if (buffer_uptodate(bh)) {
1188                 unlock_buffer(bh);
1189                 return bh;
1190         } else {
1191                 get_bh(bh);
1192                 bh->b_end_io = end_buffer_read_sync;
1193                 submit_bh(READ, bh);
1194                 wait_on_buffer(bh);
1195                 if (buffer_uptodate(bh))
1196                         return bh;
1197         }
1198         brelse(bh);
1199         return NULL;
1200 }
1201
1202 /*
1203  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1204  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1205  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1206  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1207  * CPU's LRUs at the same time.
1208  *
1209  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1210  * sb_find_get_block().
1211  *
1212  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1213  * a local interrupt disable for that.
1214  */
1215
1216 #define BH_LRU_SIZE     8
1217
1218 struct bh_lru {
1219         struct buffer_head *bhs[BH_LRU_SIZE];
1220 };
1221
1222 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1223
1224 #ifdef CONFIG_SMP
1225 #define bh_lru_lock()   local_irq_disable()
1226 #define bh_lru_unlock() local_irq_enable()
1227 #else
1228 #define bh_lru_lock()   preempt_disable()
1229 #define bh_lru_unlock() preempt_enable()
1230 #endif
1231
static inline void check_irqs_on(void)
{
	/*
	 * Only checkable where irqs_disabled is provided as a macro;
	 * otherwise this function compiles to nothing.
	 */
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1238
1239 /*
1240  * The LRU management algorithm is dopey-but-simple.  Sorry.
1241  */
1242 static void bh_lru_install(struct buffer_head *bh)
1243 {
1244         struct buffer_head *evictee = NULL;
1245         struct bh_lru *lru;
1246
1247         check_irqs_on();
1248         bh_lru_lock();
1249         lru = &__get_cpu_var(bh_lrus);
1250         if (lru->bhs[0] != bh) {
1251                 struct buffer_head *bhs[BH_LRU_SIZE];
1252                 int in;
1253                 int out = 0;
1254
1255                 get_bh(bh);
1256                 bhs[out++] = bh;
1257                 for (in = 0; in < BH_LRU_SIZE; in++) {
1258                         struct buffer_head *bh2 = lru->bhs[in];
1259
1260                         if (bh2 == bh) {
1261                                 __brelse(bh2);
1262                         } else {
1263                                 if (out >= BH_LRU_SIZE) {
1264                                         BUG_ON(evictee != NULL);
1265                                         evictee = bh2;
1266                                 } else {
1267                                         bhs[out++] = bh2;
1268                                 }
1269                         }
1270                 }
1271                 while (out < BH_LRU_SIZE)
1272                         bhs[out++] = NULL;
1273                 memcpy(lru->bhs, bhs, sizeof(bhs));
1274         }
1275         bh_lru_unlock();
1276
1277         if (evictee)
1278                 __brelse(evictee);
1279 }
1280
1281 /*
1282  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1283  */
1284 static struct buffer_head *
1285 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1286 {
1287         struct buffer_head *ret = NULL;
1288         struct bh_lru *lru;
1289         int i;
1290
1291         check_irqs_on();
1292         bh_lru_lock();
1293         lru = &__get_cpu_var(bh_lrus);
1294         for (i = 0; i < BH_LRU_SIZE; i++) {
1295                 struct buffer_head *bh = lru->bhs[i];
1296
1297                 if (bh && bh->b_bdev == bdev &&
1298                                 bh->b_blocknr == block && bh->b_size == size) {
1299                         if (i) {
1300                                 while (i) {
1301                                         lru->bhs[i] = lru->bhs[i - 1];
1302                                         i--;
1303                                 }
1304                                 lru->bhs[0] = bh;
1305                         }
1306                         get_bh(bh);
1307                         ret = bh;
1308                         break;
1309                 }
1310         }
1311         bh_lru_unlock();
1312         return ret;
1313 }
1314
1315 /*
1316  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1317  * it in the LRU and mark it as accessed.  If it is not present then return
1318  * NULL
1319  */
1320 struct buffer_head *
1321 __find_get_block(struct block_device *bdev, sector_t block, int size)
1322 {
1323         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1324
1325         if (bh == NULL) {
1326                 bh = __find_get_block_slow(bdev, block);
1327                 if (bh)
1328                         bh_lru_install(bh);
1329         }
1330         if (bh)
1331                 touch_buffer(bh);
1332         return bh;
1333 }
1334 EXPORT_SYMBOL(__find_get_block);
1335
1336 /*
1337  * __getblk will locate (and, if necessary, create) the buffer_head
1338  * which corresponds to the passed block_device, block and size. The
1339  * returned buffer has its reference count incremented.
1340  *
1341  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1342  * illegal block number, __getblk() will happily return a buffer_head
1343  * which represents the non-existent block.  Very weird.
1344  *
1345  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1346  * attempt is failing.  FIXME, perhaps?
1347  */
1348 struct buffer_head *
1349 __getblk(struct block_device *bdev, sector_t block, int size)
1350 {
1351         struct buffer_head *bh = __find_get_block(bdev, block, size);
1352
1353         might_sleep();
1354         if (bh == NULL)
1355                 bh = __getblk_slow(bdev, block, size);
1356         return bh;
1357 }
1358 EXPORT_SYMBOL(__getblk);
1359
1360 /*
1361  * Do async read-ahead on a buffer..
1362  */
1363 void __breadahead(struct block_device *bdev, sector_t block, int size)
1364 {
1365         struct buffer_head *bh = __getblk(bdev, block, size);
1366         if (likely(bh)) {
1367                 ll_rw_block(READA, 1, &bh);
1368                 brelse(bh);
1369         }
1370 }
1371 EXPORT_SYMBOL(__breadahead);
1372
1373 /**
1374  *  __bread() - reads a specified block and returns the bh
1375  *  @bdev: the block_device to read from
1376  *  @block: number of block
1377  *  @size: size (in bytes) to read
1378  * 
1379  *  Reads a specified block, and returns buffer head that contains it.
1380  *  It returns NULL if the block was unreadable.
1381  */
1382 struct buffer_head *
1383 __bread(struct block_device *bdev, sector_t block, int size)
1384 {
1385         struct buffer_head *bh = __getblk(bdev, block, size);
1386
1387         if (likely(bh) && !buffer_uptodate(bh))
1388                 bh = __bread_slow(bh);
1389         return bh;
1390 }
1391 EXPORT_SYMBOL(__bread);
1392
1393 /*
1394  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1395  * This doesn't race because it runs in each cpu either in irq
1396  * or with preempt disabled.
1397  */
1398 static void invalidate_bh_lru(void *arg)
1399 {
1400         struct bh_lru *b = &get_cpu_var(bh_lrus);
1401         int i;
1402
1403         for (i = 0; i < BH_LRU_SIZE; i++) {
1404                 brelse(b->bhs[i]);
1405                 b->bhs[i] = NULL;
1406         }
1407         put_cpu_var(bh_lrus);
1408 }
1409         
1410 static void invalidate_bh_lrus(void)
1411 {
1412         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1413 }
1414
1415 void set_bh_page(struct buffer_head *bh,
1416                 struct page *page, unsigned long offset)
1417 {
1418         bh->b_page = page;
1419         BUG_ON(offset >= PAGE_SIZE);
1420         if (PageHighMem(page))
1421                 /*
1422                  * This catches illegal uses and preserves the offset:
1423                  */
1424                 bh->b_data = (char *)(0 + offset);
1425         else
1426                 bh->b_data = page_address(page) + offset;
1427 }
1428 EXPORT_SYMBOL(set_bh_page);
1429
1430 /*
1431  * Called when truncating a buffer on a page completely.
1432  */
1433 static void discard_buffer(struct buffer_head * bh)
1434 {
1435         lock_buffer(bh);
1436         clear_buffer_dirty(bh);
1437         bh->b_bdev = NULL;
1438         clear_buffer_mapped(bh);
1439         clear_buffer_req(bh);
1440         clear_buffer_new(bh);
1441         clear_buffer_delay(bh);
1442         unlock_buffer(bh);
1443 }
1444
1445 /**
1446  * block_invalidatepage - invalidate part or all of a buffer-backed page
1447  *
1448  * @page: the page which is affected
1449  * @offset: the index of the truncation point
1450  *
1451  * block_invalidatepage() is called when all or part of the page has become
1452  * invalidated by a truncate operation.
1453  *
1454  * block_invalidatepage() does not have to release all buffers, but it must
1455  * ensure that no dirty buffer is left outside @offset and that no I/O
1456  * is underway against any of the blocks which are outside the truncation
1457  * point.  Because the caller is about to free (and possibly reuse) those
1458  * blocks on-disk.
1459  */
1460 void block_invalidatepage(struct page *page, unsigned long offset)
1461 {
1462         struct buffer_head *head, *bh, *next;
1463         unsigned int curr_off = 0;
1464
1465         BUG_ON(!PageLocked(page));
1466         if (!page_has_buffers(page))
1467                 goto out;
1468
1469         head = page_buffers(page);
1470         bh = head;
1471         do {
1472                 unsigned int next_off = curr_off + bh->b_size;
1473                 next = bh->b_this_page;
1474
1475                 /*
1476                  * is this block fully invalidated?
1477                  */
1478                 if (offset <= curr_off)
1479                         discard_buffer(bh);
1480                 curr_off = next_off;
1481                 bh = next;
1482         } while (bh != head);
1483
1484         /*
1485          * We release buffers only if the entire page is being invalidated.
1486          * The get_block cached value has been unconditionally invalidated,
1487          * so real IO is not possible anymore.
1488          */
1489         if (offset == 0)
1490                 try_to_release_page(page, 0);
1491 out:
1492         return;
1493 }
1494 EXPORT_SYMBOL(block_invalidatepage);
1495
1496 /*
1497  * We attach and possibly dirty the buffers atomically wrt
1498  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1499  * is already excluded via the page lock.
1500  */
1501 void create_empty_buffers(struct page *page,
1502                         unsigned long blocksize, unsigned long b_state)
1503 {
1504         struct buffer_head *bh, *head, *tail;
1505
1506         head = alloc_page_buffers(page, blocksize, 1);
1507         bh = head;
1508         do {
1509                 bh->b_state |= b_state;
1510                 tail = bh;
1511                 bh = bh->b_this_page;
1512         } while (bh);
1513         tail->b_this_page = head;
1514
1515         spin_lock(&page->mapping->private_lock);
1516         if (PageUptodate(page) || PageDirty(page)) {
1517                 bh = head;
1518                 do {
1519                         if (PageDirty(page))
1520                                 set_buffer_dirty(bh);
1521                         if (PageUptodate(page))
1522                                 set_buffer_uptodate(bh);
1523                         bh = bh->b_this_page;
1524                 } while (bh != head);
1525         }
1526         attach_page_buffers(page, head);
1527         spin_unlock(&page->mapping->private_lock);
1528 }
1529 EXPORT_SYMBOL(create_empty_buffers);
1530
1531 /*
1532  * We are taking a block for data and we don't want any output from any
1533  * buffer-cache aliases starting from return from that function and
1534  * until the moment when something will explicitly mark the buffer
1535  * dirty (hopefully that will not happen until we will free that block ;-)
1536  * We don't even need to mark it not-uptodate - nobody can expect
1537  * anything from a newly allocated buffer anyway. We used to use
1538  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1539  * don't want to mark the alias unmapped, for example - it would confuse
1540  * anyone who might pick it with bread() afterwards...
1541  *
1542  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1543  * be writeout I/O going on against recently-freed buffers.  We don't
1544  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1545  * only if we really need to.  That happens here.
1546  */
1547 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1548 {
1549         struct buffer_head *old_bh;
1550
1551         might_sleep();
1552
1553         old_bh = __find_get_block_slow(bdev, block);
1554         if (old_bh) {
1555                 clear_buffer_dirty(old_bh);
1556                 wait_on_buffer(old_bh);
1557                 clear_buffer_req(old_bh);
1558                 __brelse(old_bh);
1559         }
1560 }
1561 EXPORT_SYMBOL(unmap_underlying_metadata);
1562
1563 /*
1564  * NOTE! All mapped/uptodate combinations are valid:
1565  *
1566  *      Mapped  Uptodate        Meaning
1567  *
1568  *      No      No              "unknown" - must do get_block()
1569  *      No      Yes             "hole" - zero-filled
1570  *      Yes     No              "allocated" - allocated on disk, not read in
1571  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1572  *
1573  * "Dirty" is valid only with the last case (mapped+uptodate).
1574  */
1575
1576 /*
1577  * While block_write_full_page is writing back the dirty buffers under
1578  * the page lock, whoever dirtied the buffers may decide to clean them
1579  * again at any time.  We handle that by only looking at the buffer
1580  * state inside lock_buffer().
1581  *
1582  * If block_write_full_page() is called for regular writeback
1583  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1584  * locked buffer.   This only can happen if someone has written the buffer
1585  * directly, with submit_bh().  At the address_space level PageWriteback
1586  * prevents this contention from occurring.
1587  */
1588 static int __block_write_full_page(struct inode *inode, struct page *page,
1589                         get_block_t *get_block, struct writeback_control *wbc)
1590 {
1591         int err;
1592         sector_t block;
1593         sector_t last_block;
1594         struct buffer_head *bh, *head;
1595         const unsigned blocksize = 1 << inode->i_blkbits;
1596         int nr_underway = 0;
1597
1598         BUG_ON(!PageLocked(page));
1599
1600         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1601
1602         if (!page_has_buffers(page)) {
1603                 create_empty_buffers(page, blocksize,
1604                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1605         }
1606
1607         /*
1608          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1609          * here, and the (potentially unmapped) buffers may become dirty at
1610          * any time.  If a buffer becomes dirty here after we've inspected it
1611          * then we just miss that fact, and the page stays dirty.
1612          *
1613          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1614          * handle that here by just cleaning them.
1615          */
1616
1617         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1618         head = page_buffers(page);
1619         bh = head;
1620
1621         /*
1622          * Get all the dirty buffers mapped to disk addresses and
1623          * handle any aliases from the underlying blockdev's mapping.
1624          */
1625         do {
1626                 if (block > last_block) {
1627                         /*
1628                          * mapped buffers outside i_size will occur, because
1629                          * this page can be outside i_size when there is a
1630                          * truncate in progress.
1631                          */
1632                         /*
1633                          * The buffer was zeroed by block_write_full_page()
1634                          */
1635                         clear_buffer_dirty(bh);
1636                         set_buffer_uptodate(bh);
1637                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1638                         WARN_ON(bh->b_size != blocksize);
1639                         err = get_block(inode, block, bh, 1);
1640                         if (err)
1641                                 goto recover;
1642                         if (buffer_new(bh)) {
1643                                 /* blockdev mappings never come here */
1644                                 clear_buffer_new(bh);
1645                                 unmap_underlying_metadata(bh->b_bdev,
1646                                                         bh->b_blocknr);
1647                         }
1648                 }
1649                 bh = bh->b_this_page;
1650                 block++;
1651         } while (bh != head);
1652
1653         do {
1654                 if (!buffer_mapped(bh))
1655                         continue;
1656                 /*
1657                  * If it's a fully non-blocking write attempt and we cannot
1658                  * lock the buffer then redirty the page.  Note that this can
1659                  * potentially cause a busy-wait loop from pdflush and kswapd
1660                  * activity, but those code paths have their own higher-level
1661                  * throttling.
1662                  */
1663                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1664                         lock_buffer(bh);
1665                 } else if (test_set_buffer_locked(bh)) {
1666                         redirty_page_for_writepage(wbc, page);
1667                         continue;
1668                 }
1669                 if (test_clear_buffer_dirty(bh)) {
1670                         mark_buffer_async_write(bh);
1671                 } else {
1672                         unlock_buffer(bh);
1673                 }
1674         } while ((bh = bh->b_this_page) != head);
1675
1676         /*
1677          * The page and its buffers are protected by PageWriteback(), so we can
1678          * drop the bh refcounts early.
1679          */
1680         BUG_ON(PageWriteback(page));
1681         set_page_writeback(page);
1682
1683         do {
1684                 struct buffer_head *next = bh->b_this_page;
1685                 if (buffer_async_write(bh)) {
1686                         submit_bh(WRITE, bh);
1687                         nr_underway++;
1688                 }
1689                 bh = next;
1690         } while (bh != head);
1691         unlock_page(page);
1692
1693         err = 0;
1694 done:
1695         if (nr_underway == 0) {
1696                 /*
1697                  * The page was marked dirty, but the buffers were
1698                  * clean.  Someone wrote them back by hand with
1699                  * ll_rw_block/submit_bh.  A rare case.
1700                  */
1701                 int uptodate = 1;
1702                 do {
1703                         if (!buffer_uptodate(bh)) {
1704                                 uptodate = 0;
1705                                 break;
1706                         }
1707                         bh = bh->b_this_page;
1708                 } while (bh != head);
1709                 if (uptodate)
1710                         SetPageUptodate(page);
1711                 end_page_writeback(page);
1712                 /*
1713                  * The page and buffer_heads can be released at any time from
1714                  * here on.
1715                  */
1716                 wbc->pages_skipped++;   /* We didn't write this page */
1717         }
1718         return err;
1719
1720 recover:
1721         /*
1722          * ENOSPC, or some other error.  We may already have added some
1723          * blocks to the file, so we need to write these out to avoid
1724          * exposing stale data.
1725          * The page is currently locked and not marked for writeback
1726          */
1727         bh = head;
1728         /* Recovery: lock and submit the mapped buffers */
1729         do {
1730                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1731                         lock_buffer(bh);
1732                         mark_buffer_async_write(bh);
1733                 } else {
1734                         /*
1735                          * The buffer may have been set dirty during
1736                          * attachment to a dirty page.
1737                          */
1738                         clear_buffer_dirty(bh);
1739                 }
1740         } while ((bh = bh->b_this_page) != head);
1741         SetPageError(page);
1742         BUG_ON(PageWriteback(page));
1743         set_page_writeback(page);
1744         unlock_page(page);
1745         do {
1746                 struct buffer_head *next = bh->b_this_page;
1747                 if (buffer_async_write(bh)) {
1748                         clear_buffer_dirty(bh);
1749                         submit_bh(WRITE, bh);
1750                         nr_underway++;
1751                 }
1752                 bh = next;
1753         } while (bh != head);
1754         goto done;
1755 }
1756
1757 static int __block_prepare_write(struct inode *inode, struct page *page,
1758                 unsigned from, unsigned to, get_block_t *get_block)
1759 {
1760         unsigned block_start, block_end;
1761         sector_t block;
1762         int err = 0;
1763         unsigned blocksize, bbits;
1764         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1765
1766         BUG_ON(!PageLocked(page));
1767         BUG_ON(from > PAGE_CACHE_SIZE);
1768         BUG_ON(to > PAGE_CACHE_SIZE);
1769         BUG_ON(from > to);
1770
1771         blocksize = 1 << inode->i_blkbits;
1772         if (!page_has_buffers(page))
1773                 create_empty_buffers(page, blocksize, 0);
1774         head = page_buffers(page);
1775
1776         bbits = inode->i_blkbits;
1777         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1778
1779         for(bh = head, block_start = 0; bh != head || !block_start;
1780             block++, block_start=block_end, bh = bh->b_this_page) {
1781                 block_end = block_start + blocksize;
1782                 if (block_end <= from || block_start >= to) {
1783                         if (PageUptodate(page)) {
1784                                 if (!buffer_uptodate(bh))
1785                                         set_buffer_uptodate(bh);
1786                         }
1787                         continue;
1788                 }
1789                 if (buffer_new(bh))
1790                         clear_buffer_new(bh);
1791                 if (!buffer_mapped(bh)) {
1792                         WARN_ON(bh->b_size != blocksize);
1793                         err = get_block(inode, block, bh, 1);
1794                         if (err)
1795                                 break;
1796                         if (buffer_new(bh)) {
1797                                 unmap_underlying_metadata(bh->b_bdev,
1798                                                         bh->b_blocknr);
1799                                 if (PageUptodate(page)) {
1800                                         set_buffer_uptodate(bh);
1801                                         continue;
1802                                 }
1803                                 if (block_end > to || block_start < from) {
1804                                         void *kaddr;
1805
1806                                         kaddr = kmap_atomic(page, KM_USER0);
1807                                         if (block_end > to)
1808                                                 memset(kaddr+to, 0,
1809                                                         block_end-to);
1810                                         if (block_start < from)
1811                                                 memset(kaddr+block_start,
1812                                                         0, from-block_start);
1813                                         flush_dcache_page(page);
1814                                         kunmap_atomic(kaddr, KM_USER0);
1815                                 }
1816                                 continue;
1817                         }
1818                 }
1819                 if (PageUptodate(page)) {
1820                         if (!buffer_uptodate(bh))
1821                                 set_buffer_uptodate(bh);
1822                         continue; 
1823                 }
1824                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1825                      (block_start < from || block_end > to)) {
1826                         ll_rw_block(READ, 1, &bh);
1827                         *wait_bh++=bh;
1828                 }
1829         }
1830         /*
1831          * If we issued read requests - let them complete.
1832          */
1833         while(wait_bh > wait) {
1834                 wait_on_buffer(*--wait_bh);
1835                 if (!buffer_uptodate(*wait_bh))
1836                         err = -EIO;
1837         }
1838         if (!err) {
1839                 bh = head;
1840                 do {
1841                         if (buffer_new(bh))
1842                                 clear_buffer_new(bh);
1843                 } while ((bh = bh->b_this_page) != head);
1844                 return 0;
1845         }
1846         /* Error case: */
1847         /*
1848          * Zero out any newly allocated blocks to avoid exposing stale
1849          * data.  If BH_New is set, we know that the block was newly
1850          * allocated in the above loop.
1851          */
1852         bh = head;
1853         block_start = 0;
1854         do {
1855                 block_end = block_start+blocksize;
1856                 if (block_end <= from)
1857                         goto next_bh;
1858                 if (block_start >= to)
1859                         break;
1860                 if (buffer_new(bh)) {
1861                         void *kaddr;
1862
1863                         clear_buffer_new(bh);
1864                         kaddr = kmap_atomic(page, KM_USER0);
1865                         memset(kaddr+block_start, 0, bh->b_size);
1866                         flush_dcache_page(page);
1867                         kunmap_atomic(kaddr, KM_USER0);
1868                         set_buffer_uptodate(bh);
1869                         mark_buffer_dirty(bh);
1870                 }
1871 next_bh:
1872                 block_start = block_end;
1873                 bh = bh->b_this_page;
1874         } while (bh != head);
1875         return err;
1876 }
1877
1878 static int __block_commit_write(struct inode *inode, struct page *page,
1879                 unsigned from, unsigned to)
1880 {
1881         unsigned block_start, block_end;
1882         int partial = 0;
1883         unsigned blocksize;
1884         struct buffer_head *bh, *head;
1885
1886         blocksize = 1 << inode->i_blkbits;
1887
1888         for(bh = head = page_buffers(page), block_start = 0;
1889             bh != head || !block_start;
1890             block_start=block_end, bh = bh->b_this_page) {
1891                 block_end = block_start + blocksize;
1892                 if (block_end <= from || block_start >= to) {
1893                         if (!buffer_uptodate(bh))
1894                                 partial = 1;
1895                 } else {
1896                         set_buffer_uptodate(bh);
1897                         mark_buffer_dirty(bh);
1898                 }
1899         }
1900
1901         /*
1902          * If this is a partial write which happened to make all buffers
1903          * uptodate then we can optimize away a bogus readpage() for
1904          * the next read(). Here we 'discover' whether the page went
1905          * uptodate as a result of this (potentially partial) write.
1906          */
1907         if (!partial)
1908                 SetPageUptodate(page);
1909         return 0;
1910 }
1911
1912 /*
1913  * Generic "read page" function for block devices that have the normal
1914  * get_block functionality. This is most of the block device filesystems.
1915  * Reads the page asynchronously --- the unlock_buffer() and
1916  * set/clear_buffer_uptodate() functions propagate buffer state into the
1917  * page struct once IO has completed.
1918  */
1919 int block_read_full_page(struct page *page, get_block_t *get_block)
1920 {
1921         struct inode *inode = page->mapping->host;
1922         sector_t iblock, lblock;
1923         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1924         unsigned int blocksize;
1925         int nr, i;
1926         int fully_mapped = 1;
1927
1928         BUG_ON(!PageLocked(page));
1929         blocksize = 1 << inode->i_blkbits;
1930         if (!page_has_buffers(page))
1931                 create_empty_buffers(page, blocksize, 0);
1932         head = page_buffers(page);
1933
1934         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1935         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
1936         bh = head;
1937         nr = 0;
1938         i = 0;
1939
1940         do {
1941                 if (buffer_uptodate(bh))
1942                         continue;
1943
1944                 if (!buffer_mapped(bh)) {
1945                         int err = 0;
1946
1947                         fully_mapped = 0;
1948                         if (iblock < lblock) {
1949                                 WARN_ON(bh->b_size != blocksize);
1950                                 err = get_block(inode, iblock, bh, 0);
1951                                 if (err)
1952                                         SetPageError(page);
1953                         }
1954                         if (!buffer_mapped(bh)) {
1955                                 void *kaddr = kmap_atomic(page, KM_USER0);
1956                                 memset(kaddr + i * blocksize, 0, blocksize);
1957                                 flush_dcache_page(page);
1958                                 kunmap_atomic(kaddr, KM_USER0);
1959                                 if (!err)
1960                                         set_buffer_uptodate(bh);
1961                                 continue;
1962                         }
1963                         /*
1964                          * get_block() might have updated the buffer
1965                          * synchronously
1966                          */
1967                         if (buffer_uptodate(bh))
1968                                 continue;
1969                 }
1970                 arr[nr++] = bh;
1971         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1972
1973         if (fully_mapped)
1974                 SetPageMappedToDisk(page);
1975
1976         if (!nr) {
1977                 /*
1978                  * All buffers are uptodate - we can set the page uptodate
1979                  * as well. But not if get_block() returned an error.
1980                  */
1981                 if (!PageError(page))
1982                         SetPageUptodate(page);
1983                 unlock_page(page);
1984                 return 0;
1985         }
1986
1987         /* Stage two: lock the buffers */
1988         for (i = 0; i < nr; i++) {
1989                 bh = arr[i];
1990                 lock_buffer(bh);
1991                 mark_buffer_async_read(bh);
1992         }
1993
1994         /*
1995          * Stage 3: start the IO.  Check for uptodateness
1996          * inside the buffer lock in case another process reading
1997          * the underlying blockdev brought it uptodate (the sct fix).
1998          */
1999         for (i = 0; i < nr; i++) {
2000                 bh = arr[i];
2001                 if (buffer_uptodate(bh))
2002                         end_buffer_async_read(bh, 1);
2003                 else
2004                         submit_bh(READ, bh);
2005         }
2006         return 0;
2007 }
2008
2009 /* utility function for filesystems that need to do work on expanding
2010  * truncates.  Uses prepare/commit_write to allow the filesystem to
2011  * deal with the hole.  
2012  */
2013 static int __generic_cont_expand(struct inode *inode, loff_t size,
2014                                  pgoff_t index, unsigned int offset)
2015 {
2016         struct address_space *mapping = inode->i_mapping;
2017         struct page *page;
2018         unsigned long limit;
2019         int err;
2020
2021         err = -EFBIG;
2022         limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2023         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2024                 send_sig(SIGXFSZ, current, 0);
2025                 goto out;
2026         }
2027         if (size > inode->i_sb->s_maxbytes)
2028                 goto out;
2029
2030         err = -ENOMEM;
2031         page = grab_cache_page(mapping, index);
2032         if (!page)
2033                 goto out;
2034         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2035         if (err) {
2036                 /*
2037                  * ->prepare_write() may have instantiated a few blocks
2038                  * outside i_size.  Trim these off again.
2039                  */
2040                 unlock_page(page);
2041                 page_cache_release(page);
2042                 vmtruncate(inode, inode->i_size);
2043                 goto out;
2044         }
2045
2046         err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2047
2048         unlock_page(page);
2049         page_cache_release(page);
2050         if (err > 0)
2051                 err = 0;
2052 out:
2053         return err;
2054 }
2055
2056 int generic_cont_expand(struct inode *inode, loff_t size)
2057 {
2058         pgoff_t index;
2059         unsigned int offset;
2060
2061         offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2062
2063         /* ugh.  in prepare/commit_write, if from==to==start of block, we
2064         ** skip the prepare.  make sure we never send an offset for the start
2065         ** of a block
2066         */
2067         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2068                 /* caller must handle this extra byte. */
2069                 offset++;
2070         }
2071         index = size >> PAGE_CACHE_SHIFT;
2072
2073         return __generic_cont_expand(inode, size, index, offset);
2074 }
2075
2076 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2077 {
2078         loff_t pos = size - 1;
2079         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2080         unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2081
2082         /* prepare/commit_write can handle even if from==to==start of block. */
2083         return __generic_cont_expand(inode, size, index, offset);
2084 }
2085
2086 /*
2087  * For moronic filesystems that do not allow holes in file.
2088  * We may have to extend the file.
2089  */
2090
2091 int cont_prepare_write(struct page *page, unsigned offset,
2092                 unsigned to, get_block_t *get_block, loff_t *bytes)
2093 {
2094         struct address_space *mapping = page->mapping;
2095         struct inode *inode = mapping->host;
2096         struct page *new_page;
2097         pgoff_t pgpos;
2098         long status;
2099         unsigned zerofrom;
2100         unsigned blocksize = 1 << inode->i_blkbits;
2101         void *kaddr;
2102
2103         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2104                 status = -ENOMEM;
2105                 new_page = grab_cache_page(mapping, pgpos);
2106                 if (!new_page)
2107                         goto out;
2108                 /* we might sleep */
2109                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2110                         unlock_page(new_page);
2111                         page_cache_release(new_page);
2112                         continue;
2113                 }
2114                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2115                 if (zerofrom & (blocksize-1)) {
2116                         *bytes |= (blocksize-1);
2117                         (*bytes)++;
2118                 }
2119                 status = __block_prepare_write(inode, new_page, zerofrom,
2120                                                 PAGE_CACHE_SIZE, get_block);
2121                 if (status)
2122                         goto out_unmap;
2123                 kaddr = kmap_atomic(new_page, KM_USER0);
2124                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2125                 flush_dcache_page(new_page);
2126                 kunmap_atomic(kaddr, KM_USER0);
2127                 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2128                 unlock_page(new_page);
2129                 page_cache_release(new_page);
2130         }
2131
2132         if (page->index < pgpos) {
2133                 /* completely inside the area */
2134                 zerofrom = offset;
2135         } else {
2136                 /* page covers the boundary, find the boundary offset */
2137                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2138
2139                 /* if we will expand the thing last block will be filled */
2140                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2141                         *bytes |= (blocksize-1);
2142                         (*bytes)++;
2143                 }
2144
2145                 /* starting below the boundary? Nothing to zero out */
2146                 if (offset <= zerofrom)
2147                         zerofrom = offset;
2148         }
2149         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2150         if (status)
2151                 goto out1;
2152         if (zerofrom < offset) {
2153                 kaddr = kmap_atomic(page, KM_USER0);
2154                 memset(kaddr+zerofrom, 0, offset-zerofrom);
2155                 flush_dcache_page(page);
2156                 kunmap_atomic(kaddr, KM_USER0);
2157                 __block_commit_write(inode, page, zerofrom, offset);
2158         }
2159         return 0;
2160 out1:
2161         ClearPageUptodate(page);
2162         return status;
2163
2164 out_unmap:
2165         ClearPageUptodate(new_page);
2166         unlock_page(new_page);
2167         page_cache_release(new_page);
2168 out:
2169         return status;
2170 }
2171
2172 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2173                         get_block_t *get_block)
2174 {
2175         struct inode *inode = page->mapping->host;
2176         int err = __block_prepare_write(inode, page, from, to, get_block);
2177         if (err)
2178                 ClearPageUptodate(page);
2179         return err;
2180 }
2181
2182 int block_commit_write(struct page *page, unsigned from, unsigned to)
2183 {
2184         struct inode *inode = page->mapping->host;
2185         __block_commit_write(inode,page,from,to);
2186         return 0;
2187 }
2188
2189 int generic_commit_write(struct file *file, struct page *page,
2190                 unsigned from, unsigned to)
2191 {
2192         struct inode *inode = page->mapping->host;
2193         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2194         __block_commit_write(inode,page,from,to);
2195         /*
2196          * No need to use i_size_read() here, the i_size
2197          * cannot change under us because we hold i_mutex.
2198          */
2199         if (pos > inode->i_size) {
2200                 i_size_write(inode, pos);
2201                 mark_inode_dirty(inode);
2202         }
2203         return 0;
2204 }
2205
2206
/*
 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
 * immediately, while under the page lock.  So it needs a special end_io
 * handler which does not touch the bh after unlocking it.
 *
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	/* A failed read happens, due to failed READA attempts. */
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
2227
2228 /*
2229  * On entry, the page is fully not uptodate.
2230  * On exit the page is fully uptodate in the areas outside (from,to)
2231  */
2232 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2233                         get_block_t *get_block)
2234 {
2235         struct inode *inode = page->mapping->host;
2236         const unsigned blkbits = inode->i_blkbits;
2237         const unsigned blocksize = 1 << blkbits;
2238         struct buffer_head map_bh;
2239         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2240         unsigned block_in_page;
2241         unsigned block_start;
2242         sector_t block_in_file;
2243         char *kaddr;
2244         int nr_reads = 0;
2245         int i;
2246         int ret = 0;
2247         int is_mapped_to_disk = 1;
2248         int dirtied_it = 0;
2249
2250         if (PageMappedToDisk(page))
2251                 return 0;
2252
2253         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2254         map_bh.b_page = page;
2255
2256         /*
2257          * We loop across all blocks in the page, whether or not they are
2258          * part of the affected region.  This is so we can discover if the
2259          * page is fully mapped-to-disk.
2260          */
2261         for (block_start = 0, block_in_page = 0;
2262                   block_start < PAGE_CACHE_SIZE;
2263                   block_in_page++, block_start += blocksize) {
2264                 unsigned block_end = block_start + blocksize;
2265                 int create;
2266
2267                 map_bh.b_state = 0;
2268                 create = 1;
2269                 if (block_start >= to)
2270                         create = 0;
2271                 map_bh.b_size = blocksize;
2272                 ret = get_block(inode, block_in_file + block_in_page,
2273                                         &map_bh, create);
2274                 if (ret)
2275                         goto failed;
2276                 if (!buffer_mapped(&map_bh))
2277                         is_mapped_to_disk = 0;
2278                 if (buffer_new(&map_bh))
2279                         unmap_underlying_metadata(map_bh.b_bdev,
2280                                                         map_bh.b_blocknr);
2281                 if (PageUptodate(page))
2282                         continue;
2283                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2284                         kaddr = kmap_atomic(page, KM_USER0);
2285                         if (block_start < from) {
2286                                 memset(kaddr+block_start, 0, from-block_start);
2287                                 dirtied_it = 1;
2288                         }
2289                         if (block_end > to) {
2290                                 memset(kaddr + to, 0, block_end - to);
2291                                 dirtied_it = 1;
2292                         }
2293                         flush_dcache_page(page);
2294                         kunmap_atomic(kaddr, KM_USER0);
2295                         continue;
2296                 }
2297                 if (buffer_uptodate(&map_bh))
2298                         continue;       /* reiserfs does this */
2299                 if (block_start < from || block_end > to) {
2300                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2301
2302                         if (!bh) {
2303                                 ret = -ENOMEM;
2304                                 goto failed;
2305                         }
2306                         bh->b_state = map_bh.b_state;
2307                         atomic_set(&bh->b_count, 0);
2308                         bh->b_this_page = NULL;
2309                         bh->b_page = page;
2310                         bh->b_blocknr = map_bh.b_blocknr;
2311                         bh->b_size = blocksize;
2312                         bh->b_data = (char *)(long)block_start;
2313                         bh->b_bdev = map_bh.b_bdev;
2314                         bh->b_private = NULL;
2315                         read_bh[nr_reads++] = bh;
2316                 }
2317         }
2318
2319         if (nr_reads) {
2320                 struct buffer_head *bh;
2321
2322                 /*
2323                  * The page is locked, so these buffers are protected from
2324                  * any VM or truncate activity.  Hence we don't need to care
2325                  * for the buffer_head refcounts.
2326                  */
2327                 for (i = 0; i < nr_reads; i++) {
2328                         bh = read_bh[i];
2329                         lock_buffer(bh);
2330                         bh->b_end_io = end_buffer_read_nobh;
2331                         submit_bh(READ, bh);
2332                 }
2333                 for (i = 0; i < nr_reads; i++) {
2334                         bh = read_bh[i];
2335                         wait_on_buffer(bh);
2336                         if (!buffer_uptodate(bh))
2337                                 ret = -EIO;
2338                         free_buffer_head(bh);
2339                         read_bh[i] = NULL;
2340                 }
2341                 if (ret)
2342                         goto failed;
2343         }
2344
2345         if (is_mapped_to_disk)
2346                 SetPageMappedToDisk(page);
2347         SetPageUptodate(page);
2348
2349         /*
2350          * Setting the page dirty here isn't necessary for the prepare_write
2351          * function - commit_write will do that.  But if/when this function is
2352          * used within the pagefault handler to ensure that all mmapped pages
2353          * have backing space in the filesystem, we will need to dirty the page
2354          * if its contents were altered.
2355          */
2356         if (dirtied_it)
2357                 set_page_dirty(page);
2358
2359         return 0;
2360
2361 failed:
2362         for (i = 0; i < nr_reads; i++) {
2363                 if (read_bh[i])
2364                         free_buffer_head(read_bh[i]);
2365         }
2366
2367         /*
2368          * Error recovery is pretty slack.  Clear the page and mark it dirty
2369          * so we'll later zero out any blocks which _were_ allocated.
2370          */
2371         kaddr = kmap_atomic(page, KM_USER0);
2372         memset(kaddr, 0, PAGE_CACHE_SIZE);
2373         flush_dcache_page(page);
2374         kunmap_atomic(kaddr, KM_USER0);
2375         SetPageUptodate(page);
2376         set_page_dirty(page);
2377         return ret;
2378 }
2379 EXPORT_SYMBOL(nobh_prepare_write);
2380
2381 int nobh_commit_write(struct file *file, struct page *page,
2382                 unsigned from, unsigned to)
2383 {
2384         struct inode *inode = page->mapping->host;
2385         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2386
2387         set_page_dirty(page);
2388         if (pos > inode->i_size) {
2389                 i_size_write(inode, pos);
2390                 mark_inode_dirty(inode);
2391         }
2392         return 0;
2393 }
2394 EXPORT_SYMBOL(nobh_commit_write);
2395
2396 /*
2397  * nobh_writepage() - based on block_full_write_page() except
2398  * that it tries to operate without attaching bufferheads to
2399  * the page.
2400  */
2401 int nobh_writepage(struct page *page, get_block_t *get_block,
2402                         struct writeback_control *wbc)
2403 {
2404         struct inode * const inode = page->mapping->host;
2405         loff_t i_size = i_size_read(inode);
2406         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2407         unsigned offset;
2408         void *kaddr;
2409         int ret;
2410
2411         /* Is the page fully inside i_size? */
2412         if (page->index < end_index)
2413                 goto out;
2414
2415         /* Is the page fully outside i_size? (truncate in progress) */
2416         offset = i_size & (PAGE_CACHE_SIZE-1);
2417         if (page->index >= end_index+1 || !offset) {
2418                 /*
2419                  * The page may have dirty, unmapped buffers.  For example,
2420                  * they may have been added in ext3_writepage().  Make them
2421                  * freeable here, so the page does not leak.
2422                  */
2423 #if 0
2424                 /* Not really sure about this  - do we need this ? */
2425                 if (page->mapping->a_ops->invalidatepage)
2426                         page->mapping->a_ops->invalidatepage(page, offset);
2427 #endif
2428                 unlock_page(page);
2429                 return 0; /* don't care */
2430         }
2431
2432         /*
2433          * The page straddles i_size.  It must be zeroed out on each and every
2434          * writepage invocation because it may be mmapped.  "A file is mapped
2435          * in multiples of the page size.  For a file that is not a multiple of
2436          * the  page size, the remaining memory is zeroed when mapped, and
2437          * writes to that region are not written out to the file."
2438          */
2439         kaddr = kmap_atomic(page, KM_USER0);
2440         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2441         flush_dcache_page(page);
2442         kunmap_atomic(kaddr, KM_USER0);
2443 out:
2444         ret = mpage_writepage(page, get_block, wbc);
2445         if (ret == -EAGAIN)
2446                 ret = __block_write_full_page(inode, page, get_block, wbc);
2447         return ret;
2448 }
2449 EXPORT_SYMBOL(nobh_writepage);
2450
2451 /*
2452  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2453  */
2454 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2455 {
2456         struct inode *inode = mapping->host;
2457         unsigned blocksize = 1 << inode->i_blkbits;
2458         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2459         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2460         unsigned to;
2461         struct page *page;
2462         const struct address_space_operations *a_ops = mapping->a_ops;
2463         char *kaddr;
2464         int ret = 0;
2465
2466         if ((offset & (blocksize - 1)) == 0)
2467                 goto out;
2468
2469         ret = -ENOMEM;
2470         page = grab_cache_page(mapping, index);
2471         if (!page)
2472                 goto out;
2473
2474         to = (offset + blocksize) & ~(blocksize - 1);
2475         ret = a_ops->prepare_write(NULL, page, offset, to);
2476         if (ret == 0) {
2477                 kaddr = kmap_atomic(page, KM_USER0);
2478                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2479                 flush_dcache_page(page);
2480                 kunmap_atomic(kaddr, KM_USER0);
2481                 set_page_dirty(page);
2482         }
2483         unlock_page(page);
2484         page_cache_release(page);
2485 out:
2486         return ret;
2487 }
2488 EXPORT_SYMBOL(nobh_truncate_page);
2489
2490 int block_truncate_page(struct address_space *mapping,
2491                         loff_t from, get_block_t *get_block)
2492 {
2493         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2494         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2495         unsigned blocksize;
2496         sector_t iblock;
2497         unsigned length, pos;
2498         struct inode *inode = mapping->host;
2499         struct page *page;
2500         struct buffer_head *bh;
2501         void *kaddr;
2502         int err;
2503
2504         blocksize = 1 << inode->i_blkbits;
2505         length = offset & (blocksize - 1);
2506
2507         /* Block boundary? Nothing to do */
2508         if (!length)
2509                 return 0;
2510
2511         length = blocksize - length;
2512         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2513         
2514         page = grab_cache_page(mapping, index);
2515         err = -ENOMEM;
2516         if (!page)
2517                 goto out;
2518
2519         if (!page_has_buffers(page))
2520                 create_empty_buffers(page, blocksize, 0);
2521
2522         /* Find the buffer that contains "offset" */
2523         bh = page_buffers(page);
2524         pos = blocksize;
2525         while (offset >= pos) {
2526                 bh = bh->b_this_page;
2527                 iblock++;
2528                 pos += blocksize;
2529         }
2530
2531         err = 0;
2532         if (!buffer_mapped(bh)) {
2533                 WARN_ON(bh->b_size != blocksize);
2534                 err = get_block(inode, iblock, bh, 0);
2535                 if (err)
2536                         goto unlock;
2537                 /* unmapped? It's a hole - nothing to do */
2538                 if (!buffer_mapped(bh))
2539                         goto unlock;
2540         }
2541
2542         /* Ok, it's mapped. Make sure it's up-to-date */
2543         if (PageUptodate(page))
2544                 set_buffer_uptodate(bh);
2545
2546         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2547                 err = -EIO;
2548                 ll_rw_block(READ, 1, &bh);
2549                 wait_on_buffer(bh);
2550                 /* Uhhuh. Read error. Complain and punt. */
2551                 if (!buffer_uptodate(bh))
2552                         goto unlock;
2553         }
2554
2555         kaddr = kmap_atomic(page, KM_USER0);
2556         memset(kaddr + offset, 0, length);
2557         flush_dcache_page(page);
2558         kunmap_atomic(kaddr, KM_USER0);
2559
2560         mark_buffer_dirty(bh);
2561         err = 0;
2562
2563 unlock:
2564         unlock_page(page);
2565         page_cache_release(page);
2566 out:
2567         return err;
2568 }
2569
2570 /*
2571  * The generic ->writepage function for buffer-backed address_spaces
2572  */
2573 int block_write_full_page(struct page *page, get_block_t *get_block,
2574                         struct writeback_control *wbc)
2575 {
2576         struct inode * const inode = page->mapping->host;
2577         loff_t i_size = i_size_read(inode);
2578         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2579         unsigned offset;
2580         void *kaddr;
2581
2582         /* Is the page fully inside i_size? */
2583         if (page->index < end_index)
2584                 return __block_write_full_page(inode, page, get_block, wbc);
2585
2586         /* Is the page fully outside i_size? (truncate in progress) */
2587         offset = i_size & (PAGE_CACHE_SIZE-1);
2588         if (page->index >= end_index+1 || !offset) {
2589                 /*
2590                  * The page may have dirty, unmapped buffers.  For example,
2591                  * they may have been added in ext3_writepage().  Make them
2592                  * freeable here, so the page does not leak.
2593                  */
2594                 do_invalidatepage(page, 0);
2595                 unlock_page(page);
2596                 return 0; /* don't care */
2597         }
2598
2599         /*
2600          * The page straddles i_size.  It must be zeroed out on each and every
2601          * writepage invokation because it may be mmapped.  "A file is mapped
2602          * in multiples of the page size.  For a file that is not a multiple of
2603          * the  page size, the remaining memory is zeroed when mapped, and
2604          * writes to that region are not written out to the file."
2605          */
2606         kaddr = kmap_atomic(page, KM_USER0);
2607         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2608         flush_dcache_page(page);
2609         kunmap_atomic(kaddr, KM_USER0);
2610         return __block_write_full_page(inode, page, get_block, wbc);
2611 }
2612
2613 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2614                             get_block_t *get_block)
2615 {
2616         struct buffer_head tmp;
2617         struct inode *inode = mapping->host;
2618         tmp.b_state = 0;
2619         tmp.b_blocknr = 0;
2620         tmp.b_size = 1 << inode->i_blkbits;
2621         get_block(inode, block, &tmp, 0);
2622         return tmp.b_blocknr;
2623 }
2624
2625 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2626 {
2627         struct buffer_head *bh = bio->bi_private;
2628
2629         if (bio->bi_size)
2630                 return 1;
2631
2632         if (err == -EOPNOTSUPP) {
2633                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2634                 set_bit(BH_Eopnotsupp, &bh->b_state);
2635         }
2636
2637         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2638         bio_put(bio);
2639         return 0;
2640 }
2641
2642 int submit_bh(int rw, struct buffer_head * bh)
2643 {
2644         struct bio *bio;
2645         int ret = 0;
2646
2647         BUG_ON(!buffer_locked(bh));
2648         BUG_ON(!buffer_mapped(bh));
2649         BUG_ON(!bh->b_end_io);
2650
2651         if (buffer_ordered(bh) && (rw == WRITE))
2652                 rw = WRITE_BARRIER;
2653
2654         /*
2655          * Only clear out a write error when rewriting, should this
2656          * include WRITE_SYNC as well?
2657          */
2658         if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2659                 clear_buffer_write_io_error(bh);
2660
2661         /*
2662          * from here on down, it's all bio -- do the initial mapping,
2663          * submit_bio -> generic_make_request may further map this bio around
2664          */
2665         bio = bio_alloc(GFP_NOIO, 1);
2666
2667         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2668         bio->bi_bdev = bh->b_bdev;
2669         bio->bi_io_vec[0].bv_page = bh->b_page;
2670         bio->bi_io_vec[0].bv_len = bh->b_size;
2671         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2672
2673         bio->bi_vcnt = 1;
2674         bio->bi_idx = 0;
2675         bio->bi_size = bh->b_size;
2676
2677         bio->bi_end_io = end_bio_bh_io_sync;
2678         bio->bi_private = bh;
2679
2680         bio_get(bio);
2681         submit_bio(rw, bio);
2682
2683         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2684                 ret = -EOPNOTSUPP;
2685
2686         bio_put(bio);
2687         return ret;
2688 }
2689
2690 /**
2691  * ll_rw_block: low-level access to block devices (DEPRECATED)
2692  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2693  * @nr: number of &struct buffer_heads in the array
2694  * @bhs: array of pointers to &struct buffer_head
2695  *
2696  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2697  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2698  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2699  * are sent to disk. The fourth %READA option is described in the documentation
2700  * for generic_make_request() which ll_rw_block() calls.
2701  *
2702  * This function drops any buffer that it cannot get a lock on (with the
2703  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2704  * clean when doing a write request, and any buffer that appears to be
2705  * up-to-date when doing read request.  Further it marks as clean buffers that
2706  * are processed for writing (the buffer cache won't assume that they are
2707  * actually clean until the buffer gets unlocked).
2708  *
2709  * ll_rw_block sets b_end_io to simple completion handler that marks
2710  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2711  * any waiters. 
2712  *
2713  * All of the buffers must be for the same device, and must also be a
2714  * multiple of the current approved size for the device.
2715  */
2716 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2717 {
2718         int i;
2719
2720         for (i = 0; i < nr; i++) {
2721                 struct buffer_head *bh = bhs[i];
2722
2723                 if (rw == SWRITE)
2724                         lock_buffer(bh);
2725                 else if (test_set_buffer_locked(bh))
2726                         continue;
2727
2728                 if (rw == WRITE || rw == SWRITE) {
2729                         if (test_clear_buffer_dirty(bh)) {
2730                                 bh->b_end_io = end_buffer_write_sync;
2731                                 get_bh(bh);
2732                                 submit_bh(WRITE, bh);
2733                                 continue;
2734                         }
2735                 } else {
2736                         if (!buffer_uptodate(bh)) {
2737                                 bh->b_end_io = end_buffer_read_sync;
2738                                 get_bh(bh);
2739                                 submit_bh(rw, bh);
2740                                 continue;
2741                         }
2742                 }
2743                 unlock_buffer(bh);
2744         }
2745 }
2746
2747 /*
2748  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2749  * and then start new I/O and then wait upon it.  The caller must have a ref on
2750  * the buffer_head.
2751  */
2752 int sync_dirty_buffer(struct buffer_head *bh)
2753 {
2754         int ret = 0;
2755
2756         WARN_ON(atomic_read(&bh->b_count) < 1);
2757         lock_buffer(bh);
2758         if (test_clear_buffer_dirty(bh)) {
2759                 get_bh(bh);
2760                 bh->b_end_io = end_buffer_write_sync;
2761                 ret = submit_bh(WRITE, bh);
2762                 wait_on_buffer(bh);
2763                 if (buffer_eopnotsupp(bh)) {
2764                         clear_buffer_eopnotsupp(bh);
2765                         ret = -EOPNOTSUPP;
2766                 }
2767                 if (!ret && !buffer_uptodate(bh))
2768                         ret = -EIO;
2769         } else {
2770                 unlock_buffer(bh);
2771         }
2772         return ret;
2773 }
2774
2775 /*
2776  * try_to_free_buffers() checks if all the buffers on this particular page
2777  * are unused, and releases them if so.
2778  *
2779  * Exclusion against try_to_free_buffers may be obtained by either
2780  * locking the page or by holding its mapping's private_lock.
2781  *
2782  * If the page is dirty but all the buffers are clean then we need to
2783  * be sure to mark the page clean as well.  This is because the page
2784  * may be against a block device, and a later reattachment of buffers
2785  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2786  * filesystem data on the same device.
2787  *
2788  * The same applies to regular filesystem pages: if all the buffers are
2789  * clean then we set the page clean and proceed.  To do that, we require
2790  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2791  * private_lock.
2792  *
2793  * try_to_free_buffers() is non-blocking.
2794  */
2795 static inline int buffer_busy(struct buffer_head *bh)
2796 {
2797         return atomic_read(&bh->b_count) |
2798                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2799 }
2800
2801 static int
2802 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2803 {
2804         struct buffer_head *head = page_buffers(page);
2805         struct buffer_head *bh;
2806
2807         bh = head;
2808         do {
2809                 if (buffer_write_io_error(bh) && page->mapping)
2810                         set_bit(AS_EIO, &page->mapping->flags);
2811                 if (buffer_busy(bh))
2812                         goto failed;
2813                 bh = bh->b_this_page;
2814         } while (bh != head);
2815
2816         do {
2817                 struct buffer_head *next = bh->b_this_page;
2818
2819                 if (!list_empty(&bh->b_assoc_buffers))
2820                         __remove_assoc_queue(bh);
2821                 bh = next;
2822         } while (bh != head);
2823         *buffers_to_free = head;
2824         __clear_page_buffers(page);
2825         return 1;
2826 failed:
2827         return 0;
2828 }
2829
2830 int try_to_free_buffers(struct page *page)
2831 {
2832         struct address_space * const mapping = page->mapping;
2833         struct buffer_head *buffers_to_free = NULL;
2834         int ret = 0;
2835
2836         BUG_ON(!PageLocked(page));
2837         if (PageWriteback(page))
2838                 return 0;
2839
2840         if (mapping == NULL) {          /* can this still happen? */
2841                 ret = drop_buffers(page, &buffers_to_free);
2842                 goto out;
2843         }
2844
2845         spin_lock(&mapping->private_lock);
2846         ret = drop_buffers(page, &buffers_to_free);
2847
2848         /*
2849          * If the filesystem writes its buffers by hand (eg ext3)
2850          * then we can have clean buffers against a dirty page.  We
2851          * clean the page here; otherwise the VM will never notice
2852          * that the filesystem did any IO at all.
2853          *
2854          * Also, during truncate, discard_buffer will have marked all
2855          * the page's buffers clean.  We discover that here and clean
2856          * the page also.
2857          *
2858          * private_lock must be held over this entire operation in order
2859          * to synchronise against __set_page_dirty_buffers and prevent the
2860          * dirty bit from being lost.
2861          */
2862         if (ret)
2863                 cancel_dirty_page(page, PAGE_CACHE_SIZE);
2864         spin_unlock(&mapping->private_lock);
2865 out:
2866         if (buffers_to_free) {
2867                 struct buffer_head *bh = buffers_to_free;
2868
2869                 do {
2870                         struct buffer_head *next = bh->b_this_page;
2871                         free_buffer_head(bh);
2872                         bh = next;
2873                 } while (bh != buffers_to_free);
2874         }
2875         return ret;
2876 }
2877 EXPORT_SYMBOL(try_to_free_buffers);
2878
2879 void block_sync_page(struct page *page)
2880 {
2881         struct address_space *mapping;
2882
2883         smp_mb();
2884         mapping = page_mapping(page);
2885         if (mapping)
2886                 blk_run_backing_dev(mapping->backing_dev_info, page);
2887 }
2888
2889 /*
2890  * There are no bdflush tunables left.  But distributions are
2891  * still running obsolete flush daemons, so we terminate them here.
2892  *
2893  * Use of bdflush() is deprecated and will be removed in a future kernel.
2894  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2895  */
2896 asmlinkage long sys_bdflush(int func, long data)
2897 {
2898         static int msg_count;
2899
2900         if (!capable(CAP_SYS_ADMIN))
2901                 return -EPERM;
2902
2903         if (msg_count < 5) {
2904                 msg_count++;
2905                 printk(KERN_INFO
2906                         "warning: process `%s' used the obsolete bdflush"
2907                         " system call\n", current->comm);
2908                 printk(KERN_INFO "Fix your initscripts?\n");
2909         }
2910
2911         if (func == 1)
2912                 do_exit(0);
2913         return 0;
2914 }
2915
2916 /*
2917  * Buffer-head allocation
2918  */
2919 static struct kmem_cache *bh_cachep;
2920
2921 /*
2922  * Once the number of bh's in the machine exceeds this level, we start
2923  * stripping them in writeback.
2924  */
2925 static int max_buffer_heads;
2926
2927 int buffer_heads_over_limit;
2928
2929 struct bh_accounting {
2930         int nr;                 /* Number of live bh's */
2931         int ratelimit;          /* Limit cacheline bouncing */
2932 };
2933
2934 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2935
2936 static void recalc_bh_state(void)
2937 {
2938         int i;
2939         int tot = 0;
2940
2941         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2942                 return;
2943         __get_cpu_var(bh_accounting).ratelimit = 0;
2944         for_each_online_cpu(i)
2945                 tot += per_cpu(bh_accounting, i).nr;
2946         buffer_heads_over_limit = (tot > max_buffer_heads);
2947 }
2948         
2949 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2950 {
2951         struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
2952         if (ret) {
2953                 get_cpu_var(bh_accounting).nr++;
2954                 recalc_bh_state();
2955                 put_cpu_var(bh_accounting);
2956         }
2957         return ret;
2958 }
2959 EXPORT_SYMBOL(alloc_buffer_head);
2960
2961 void free_buffer_head(struct buffer_head *bh)
2962 {
2963         BUG_ON(!list_empty(&bh->b_assoc_buffers));
2964         kmem_cache_free(bh_cachep, bh);
2965         get_cpu_var(bh_accounting).nr--;
2966         recalc_bh_state();
2967         put_cpu_var(bh_accounting);
2968 }
2969 EXPORT_SYMBOL(free_buffer_head);
2970
2971 static void
2972 init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
2973 {
2974         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2975                             SLAB_CTOR_CONSTRUCTOR) {
2976                 struct buffer_head * bh = (struct buffer_head *)data;
2977
2978                 memset(bh, 0, sizeof(*bh));
2979                 INIT_LIST_HEAD(&bh->b_assoc_buffers);
2980         }
2981 }
2982
2983 static void buffer_exit_cpu(int cpu)
2984 {
2985         int i;
2986         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
2987
2988         for (i = 0; i < BH_LRU_SIZE; i++) {
2989                 brelse(b->bhs[i]);
2990                 b->bhs[i] = NULL;
2991         }
2992         get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
2993         per_cpu(bh_accounting, cpu).nr = 0;
2994         put_cpu_var(bh_accounting);
2995 }
2996
2997 static int buffer_cpu_notify(struct notifier_block *self,
2998                               unsigned long action, void *hcpu)
2999 {
3000         if (action == CPU_DEAD)
3001                 buffer_exit_cpu((unsigned long)hcpu);
3002         return NOTIFY_OK;
3003 }
3004
3005 void __init buffer_init(void)
3006 {
3007         int nrpages;
3008
3009         bh_cachep = kmem_cache_create("buffer_head",
3010                                         sizeof(struct buffer_head), 0,
3011                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3012                                         SLAB_MEM_SPREAD),
3013                                         init_buffer_head,
3014                                         NULL);
3015
3016         /*
3017          * Limit the bh occupancy to 10% of ZONE_NORMAL
3018          */
3019         nrpages = (nr_free_buffer_pages() * 10) / 100;
3020         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3021         hotcpu_notifier(buffer_cpu_notify, 0);
3022 }
3023
3024 EXPORT_SYMBOL(__bforget);
3025 EXPORT_SYMBOL(__brelse);
3026 EXPORT_SYMBOL(__wait_on_buffer);
3027 EXPORT_SYMBOL(block_commit_write);
3028 EXPORT_SYMBOL(block_prepare_write);
3029 EXPORT_SYMBOL(block_read_full_page);
3030 EXPORT_SYMBOL(block_sync_page);
3031 EXPORT_SYMBOL(block_truncate_page);
3032 EXPORT_SYMBOL(block_write_full_page);
3033 EXPORT_SYMBOL(cont_prepare_write);
3034 EXPORT_SYMBOL(end_buffer_read_sync);
3035 EXPORT_SYMBOL(end_buffer_write_sync);
3036 EXPORT_SYMBOL(file_fsync);
3037 EXPORT_SYMBOL(fsync_bdev);
3038 EXPORT_SYMBOL(generic_block_bmap);
3039 EXPORT_SYMBOL(generic_commit_write);
3040 EXPORT_SYMBOL(generic_cont_expand);
3041 EXPORT_SYMBOL(generic_cont_expand_simple);
3042 EXPORT_SYMBOL(init_buffer);
3043 EXPORT_SYMBOL(invalidate_bdev);
3044 EXPORT_SYMBOL(ll_rw_block);
3045 EXPORT_SYMBOL(mark_buffer_dirty);
3046 EXPORT_SYMBOL(submit_bh);
3047 EXPORT_SYMBOL(sync_dirty_buffer);
3048 EXPORT_SYMBOL(unlock_buffer);