1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
44
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48
49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 {
52         bh->b_end_io = handler;
53         bh->b_private = private;
54 }
55
56 static int sync_buffer(void *word)
57 {
58         struct block_device *bd;
59         struct buffer_head *bh
60                 = container_of(word, struct buffer_head, b_state);
61
62         smp_mb();
63         bd = bh->b_bdev;
64         if (bd)
65                 blk_run_address_space(bd->bd_inode->i_mapping);
66         io_schedule();
67         return 0;
68 }
69
70 void fastcall __lock_buffer(struct buffer_head *bh)
71 {
72         wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73                                                         TASK_UNINTERRUPTIBLE);
74 }
75 EXPORT_SYMBOL(__lock_buffer);
76
77 void fastcall unlock_buffer(struct buffer_head *bh)
78 {
79         smp_mb__before_clear_bit();
80         clear_buffer_locked(bh);
81         smp_mb__after_clear_bit();
82         wake_up_bit(&bh->b_state, BH_Lock);
83 }
84
85 /*
86  * Block until a buffer comes unlocked.  This doesn't stop it
87  * from becoming locked again - you have to lock it yourself
88  * if you want to preserve its state.
89  */
90 void __wait_on_buffer(struct buffer_head * bh)
91 {
92         wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
93 }
94
95 static void
96 __clear_page_buffers(struct page *page)
97 {
98         ClearPagePrivate(page);
99         set_page_private(page, 0);
100         page_cache_release(page);
101 }
102
103 static void buffer_io_error(struct buffer_head *bh)
104 {
105         char b[BDEVNAME_SIZE];
106
107         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
108                         bdevname(bh->b_bdev, b),
109                         (unsigned long long)bh->b_blocknr);
110 }
111
112 /*
113  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
114  * unlock the buffer. This is what ll_rw_block uses too.
115  */
116 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
117 {
118         if (uptodate) {
119                 set_buffer_uptodate(bh);
120         } else {
121                 /* This happens due to failed READA attempts. */
122                 clear_buffer_uptodate(bh);
123         }
124         unlock_buffer(bh);
125         put_bh(bh);
126 }
127
128 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
129 {
130         char b[BDEVNAME_SIZE];
131
132         if (uptodate) {
133                 set_buffer_uptodate(bh);
134         } else {
135                 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
136                         buffer_io_error(bh);
137                         printk(KERN_WARNING "lost page write due to "
138                                         "I/O error on %s\n",
139                                        bdevname(bh->b_bdev, b));
140                 }
141                 set_buffer_write_io_error(bh);
142                 clear_buffer_uptodate(bh);
143         }
144         unlock_buffer(bh);
145         put_bh(bh);
146 }
147
148 /*
149  * Write out and wait upon all the dirty data associated with a block
150  * device via its mapping.  Does not take the superblock lock.
151  */
152 int sync_blockdev(struct block_device *bdev)
153 {
154         int ret = 0;
155
156         if (bdev)
157                 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
158         return ret;
159 }
160 EXPORT_SYMBOL(sync_blockdev);
161
162 /*
163  * Write out and wait upon all dirty data associated with this
164  * device.   Filesystem data as well as the underlying block
165  * device.  Takes the superblock lock.
166  */
167 int fsync_bdev(struct block_device *bdev)
168 {
169         struct super_block *sb = get_super(bdev);
170         if (sb) {
171                 int res = fsync_super(sb);
172                 drop_super(sb);
173                 return res;
174         }
175         return sync_blockdev(bdev);
176 }
177
178 /**
179  * freeze_bdev  --  lock a filesystem and force it into a consistent state
180  * @bdev:       blockdevice to lock
181  *
182  * This takes the block device bd_mount_sem to make sure no new mounts
183  * happen on bdev until thaw_bdev() is called.
184  * If a superblock is found on this device, we take the s_umount semaphore
185  * on it to make sure nobody unmounts until the snapshot creation is done.
186  */
187 struct super_block *freeze_bdev(struct block_device *bdev)
188 {
189         struct super_block *sb;
190
191         down(&bdev->bd_mount_sem);
192         sb = get_super(bdev);
193         if (sb && !(sb->s_flags & MS_RDONLY)) {
194                 sb->s_frozen = SB_FREEZE_WRITE;
195                 smp_wmb();
196
197                 __fsync_super(sb);
198
199                 sb->s_frozen = SB_FREEZE_TRANS;
200                 smp_wmb();
201
202                 sync_blockdev(sb->s_bdev);
203
204                 if (sb->s_op->write_super_lockfs)
205                         sb->s_op->write_super_lockfs(sb);
206         }
207
208         sync_blockdev(bdev);
209         return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
210 }
211 EXPORT_SYMBOL(freeze_bdev);
212
213 /**
214  * thaw_bdev  -- unlock filesystem
215  * @bdev:       blockdevice to unlock
216  * @sb:         associated superblock
217  *
218  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
219  */
220 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
221 {
222         if (sb) {
223                 BUG_ON(sb->s_bdev != bdev);
224
225                 if (sb->s_op->unlockfs)
226                         sb->s_op->unlockfs(sb);
227                 sb->s_frozen = SB_UNFROZEN;
228                 smp_wmb();
229                 wake_up(&sb->s_wait_unfrozen);
230                 drop_super(sb);
231         }
232
233         up(&bdev->bd_mount_sem);
234 }
235 EXPORT_SYMBOL(thaw_bdev);
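/*
 * Illustrative sketch, not part of the original file: a block-level
 * snapshot driver would typically bracket its work with these two calls.
 * "my_take_snapshot" is a hypothetical helper standing in for whatever
 * actually copies the device state.
 *
 *	struct super_block *sb = freeze_bdev(bdev);
 *	my_take_snapshot(bdev);
 *	thaw_bdev(bdev, sb);
 *
 * freeze_bdev() may return NULL when nothing is mounted on bdev;
 * thaw_bdev() copes with a NULL superblock.
 */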
236
237 /*
238  * Various filesystems appear to want __find_get_block to be non-blocking.
239  * But it's the page lock which protects the buffers.  To get around this,
240  * we get exclusion from try_to_free_buffers with the blockdev mapping's
241  * private_lock.
242  *
243  * Hack idea: for the blockdev mapping, private_lock contention
244  * may be quite high.  This code could TryLock the page, and if that
245  * succeeds, there is no need to take private_lock. (But if
246  * private_lock is contended then so is mapping->tree_lock).
247  */
248 static struct buffer_head *
249 __find_get_block_slow(struct block_device *bdev, sector_t block)
250 {
251         struct inode *bd_inode = bdev->bd_inode;
252         struct address_space *bd_mapping = bd_inode->i_mapping;
253         struct buffer_head *ret = NULL;
254         pgoff_t index;
255         struct buffer_head *bh;
256         struct buffer_head *head;
257         struct page *page;
258         int all_mapped = 1;
259
260         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
261         page = find_get_page(bd_mapping, index);
262         if (!page)
263                 goto out;
264
265         spin_lock(&bd_mapping->private_lock);
266         if (!page_has_buffers(page))
267                 goto out_unlock;
268         head = page_buffers(page);
269         bh = head;
270         do {
271                 if (bh->b_blocknr == block) {
272                         ret = bh;
273                         get_bh(bh);
274                         goto out_unlock;
275                 }
276                 if (!buffer_mapped(bh))
277                         all_mapped = 0;
278                 bh = bh->b_this_page;
279         } while (bh != head);
280
281         /* we might be here because some of the buffers on this page are
282          * not mapped.  This is due to various races between
283          * file io on the block device and getblk.  It gets dealt with
284          * elsewhere, don't buffer_error if we had some unmapped buffers
285          */
286         if (all_mapped) {
287                 printk("__find_get_block_slow() failed. "
288                         "block=%llu, b_blocknr=%llu\n",
289                         (unsigned long long)block,
290                         (unsigned long long)bh->b_blocknr);
291                 printk("b_state=0x%08lx, b_size=%zu\n",
292                         bh->b_state, bh->b_size);
293                 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
294         }
295 out_unlock:
296         spin_unlock(&bd_mapping->private_lock);
297         page_cache_release(page);
298 out:
299         return ret;
300 }
301
302 /* If invalidate_buffers() trashes dirty buffers, it means some kind
303    of fs corruption is going on. Trashing dirty data always implies losing
304    information that was supposed to be just stored on the physical layer
305    by the user.
306
307    Thus invalidate_buffers in general usage is not allowed to trash
308    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
309    be preserved.  These buffers are simply skipped.
310
311    We also skip buffers which are still in use.  For example this can
312    happen if a userspace program is reading the block device.
313
314    NOTE: if the user removes a removable-media disk while there is still
315    dirty data that has not been synced to disk (due to a bug in the device
316    driver or to a user error), leaving those dirty buffers around could
317    corrupt the next medium that is inserted, so a parameter is needed to
318    handle this case as safely as possible (trying not to corrupt the
319    newly inserted disk with data belonging to the old, now corrupted one).
320    For the ramdisk, on the other hand, the natural way to release the
321    ramdisk memory is precisely to destroy its dirty buffers.
322
323    These are two special cases. Normal usage implies that the device driver
324    issues a sync on the device (without waiting for I/O completion) and
325    then calls invalidate_buffers, which does not trash dirty buffers.
326
327    For handling cache coherency with the blkdev pagecache the 'update' case
328    has been introduced. It is needed to re-read from disk any pinned
329    buffer. NOTE: re-reading from disk is destructive so we can do it only
330    when we assume nobody is changing the buffercache under our I/O and when
331    we think the disk contains more recent information than the buffercache.
332    The update == 1 pass marks the buffers we need to update, the update == 2
333    pass does the actual I/O. */
334 void invalidate_bdev(struct block_device *bdev)
335 {
336         struct address_space *mapping = bdev->bd_inode->i_mapping;
337
338         if (mapping->nrpages == 0)
339                 return;
340
341         invalidate_bh_lrus();
342         invalidate_mapping_pages(mapping, 0, -1);
343 }
344
345 /*
346  * Kick pdflush then try to free up some ZONE_NORMAL memory.
347  */
348 static void free_more_memory(void)
349 {
350         struct zone **zones;
351         pg_data_t *pgdat;
352
353         wakeup_pdflush(1024);
354         yield();
355
356         for_each_online_pgdat(pgdat) {
357                 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
358                 if (*zones)
359                         try_to_free_pages(zones, GFP_NOFS);
360         }
361 }
362
363 /*
364  * I/O completion handler for block_read_full_page() - pages
365  * which come unlocked at the end of I/O.
366  */
367 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
368 {
369         unsigned long flags;
370         struct buffer_head *first;
371         struct buffer_head *tmp;
372         struct page *page;
373         int page_uptodate = 1;
374
375         BUG_ON(!buffer_async_read(bh));
376
377         page = bh->b_page;
378         if (uptodate) {
379                 set_buffer_uptodate(bh);
380         } else {
381                 clear_buffer_uptodate(bh);
382                 if (printk_ratelimit())
383                         buffer_io_error(bh);
384                 SetPageError(page);
385         }
386
387         /*
388          * Be _very_ careful from here on. Bad things can happen if
389          * two buffer heads end IO at almost the same time and both
390          * decide that the page is now completely done.
391          */
392         first = page_buffers(page);
393         local_irq_save(flags);
394         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
395         clear_buffer_async_read(bh);
396         unlock_buffer(bh);
397         tmp = bh;
398         do {
399                 if (!buffer_uptodate(tmp))
400                         page_uptodate = 0;
401                 if (buffer_async_read(tmp)) {
402                         BUG_ON(!buffer_locked(tmp));
403                         goto still_busy;
404                 }
405                 tmp = tmp->b_this_page;
406         } while (tmp != bh);
407         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
408         local_irq_restore(flags);
409
410         /*
411          * If none of the buffers had errors and they are all
412          * uptodate then we can set the page uptodate.
413          */
414         if (page_uptodate && !PageError(page))
415                 SetPageUptodate(page);
416         unlock_page(page);
417         return;
418
419 still_busy:
420         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
421         local_irq_restore(flags);
422         return;
423 }
424
425 /*
426  * Completion handler for block_write_full_page() - pages which are unlocked
427  * during I/O, and which have PageWriteback cleared upon I/O completion.
428  */
429 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
430 {
431         char b[BDEVNAME_SIZE];
432         unsigned long flags;
433         struct buffer_head *first;
434         struct buffer_head *tmp;
435         struct page *page;
436
437         BUG_ON(!buffer_async_write(bh));
438
439         page = bh->b_page;
440         if (uptodate) {
441                 set_buffer_uptodate(bh);
442         } else {
443                 if (printk_ratelimit()) {
444                         buffer_io_error(bh);
445                         printk(KERN_WARNING "lost page write due to "
446                                         "I/O error on %s\n",
447                                bdevname(bh->b_bdev, b));
448                 }
449                 set_bit(AS_EIO, &page->mapping->flags);
450                 set_buffer_write_io_error(bh);
451                 clear_buffer_uptodate(bh);
452                 SetPageError(page);
453         }
454
455         first = page_buffers(page);
456         local_irq_save(flags);
457         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
458
459         clear_buffer_async_write(bh);
460         unlock_buffer(bh);
461         tmp = bh->b_this_page;
462         while (tmp != bh) {
463                 if (buffer_async_write(tmp)) {
464                         BUG_ON(!buffer_locked(tmp));
465                         goto still_busy;
466                 }
467                 tmp = tmp->b_this_page;
468         }
469         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
470         local_irq_restore(flags);
471         end_page_writeback(page);
472         return;
473
474 still_busy:
475         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
476         local_irq_restore(flags);
477         return;
478 }
479
480 /*
481  * If a page's buffers are under async read-in (end_buffer_async_read
482  * completion) then there is a possibility that another thread of
483  * control could lock one of the buffers after it has completed
484  * but while some of the other buffers have not completed.  This
485  * locked buffer would confuse end_buffer_async_read() into not unlocking
486  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
487  * that this buffer is not under async I/O.
488  *
489  * The page comes unlocked when it has no locked buffer_async buffers
490  * left.
491  *
492  * PageLocked prevents anyone from starting new async I/O reads against
493  * any of the buffers.
494  *
495  * PageWriteback is used to prevent simultaneous writeout of the same
496  * page.
497  *
498  * PageLocked prevents anyone from starting writeback of a page which is
499  * under read I/O (PageWriteback is only ever set against a locked page).
500  */
501 static void mark_buffer_async_read(struct buffer_head *bh)
502 {
503         bh->b_end_io = end_buffer_async_read;
504         set_buffer_async_read(bh);
505 }
506
507 void mark_buffer_async_write(struct buffer_head *bh)
508 {
509         bh->b_end_io = end_buffer_async_write;
510         set_buffer_async_write(bh);
511 }
512 EXPORT_SYMBOL(mark_buffer_async_write);
513
514
515 /*
516  * fs/buffer.c contains helper functions for buffer-backed address space's
517  * fsync functions.  A common requirement for buffer-based filesystems is
518  * that certain data from the backing blockdev needs to be written out for
519  * a successful fsync().  For example, ext2 indirect blocks need to be
520  * written back and waited upon before fsync() returns.
521  *
522  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
523  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
524  * management of a list of dependent buffers at ->i_mapping->private_list.
525  *
526  * Locking is a little subtle: try_to_free_buffers() will remove buffers
527  * from their controlling inode's queue when they are being freed.  But
528  * try_to_free_buffers() will be operating against the *blockdev* mapping
529  * at the time, not against the S_ISREG file which depends on those buffers.
530  * So the locking for private_list is via the private_lock in the address_space
531  * which backs the buffers.  Which is different from the address_space 
532  * against which the buffers are listed.  So for a particular address_space,
533  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
534  * mapping->private_list will always be protected by the backing blockdev's
535  * ->private_lock.
536  *
537  * Which introduces a requirement: all buffers on an address_space's
538  * ->private_list must be from the same address_space: the blockdev's.
539  *
540  * address_spaces which do not place buffers at ->private_list via these
541  * utility functions are free to use private_lock and private_list for
542  * whatever they want.  The only requirement is that list_empty(private_list)
543  * be true at clear_inode() time.
544  *
545  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
546  * filesystems should do that.  invalidate_inode_buffers() should just go
547  * BUG_ON(!list_empty).
548  *
549  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
550  * take an address_space, not an inode.  And it should be called
551  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
552  * queued up.
553  *
554  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
555  * list if it is already on a list.  Because if the buffer is on a list,
556  * it *must* already be on the right one.  If not, the filesystem is being
557  * silly.  This will save a ton of locking.  But first we have to ensure
558  * that buffers are taken *off* the old inode's list when they are freed
559  * (presumably in truncate).  That requires careful auditing of all
560  * filesystems (do it inside bforget()).  It could also be done by bringing
561  * b_inode back.
562  */
563
564 /*
565  * The buffer's backing address_space's private_lock must be held
566  */
567 static inline void __remove_assoc_queue(struct buffer_head *bh)
568 {
569         list_del_init(&bh->b_assoc_buffers);
570         WARN_ON(!bh->b_assoc_map);
571         if (buffer_write_io_error(bh))
572                 set_bit(AS_EIO, &bh->b_assoc_map->flags);
573         bh->b_assoc_map = NULL;
574 }
575
576 int inode_has_buffers(struct inode *inode)
577 {
578         return !list_empty(&inode->i_data.private_list);
579 }
580
581 /*
582  * osync is designed to support O_SYNC io.  It waits synchronously for
583  * all already-submitted IO to complete, but does not queue any new
584  * writes to the disk.
585  *
586  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
587  * you dirty the buffers, and then use osync_inode_buffers to wait for
588  * completion.  Any other dirty buffers which are not yet queued for
589  * write will not be flushed to disk by the osync.
590  */
591 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
592 {
593         struct buffer_head *bh;
594         struct list_head *p;
595         int err = 0;
596
597         spin_lock(lock);
598 repeat:
599         list_for_each_prev(p, list) {
600                 bh = BH_ENTRY(p);
601                 if (buffer_locked(bh)) {
602                         get_bh(bh);
603                         spin_unlock(lock);
604                         wait_on_buffer(bh);
605                         if (!buffer_uptodate(bh))
606                                 err = -EIO;
607                         brelse(bh);
608                         spin_lock(lock);
609                         goto repeat;
610                 }
611         }
612         spin_unlock(lock);
613         return err;
614 }
615
616 /**
617  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
618  *                        buffers
619  * @mapping: the mapping which wants those buffers written
620  *
621  * Starts I/O against the buffers at mapping->private_list, and waits upon
622  * that I/O.
623  *
624  * Basically, this is a convenience function for fsync().
625  * @mapping is a file or directory which needs those buffers to be written for
626  * a successful fsync().
627  */
628 int sync_mapping_buffers(struct address_space *mapping)
629 {
630         struct address_space *buffer_mapping = mapping->assoc_mapping;
631
632         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
633                 return 0;
634
635         return fsync_buffers_list(&buffer_mapping->private_lock,
636                                         &mapping->private_list);
637 }
638 EXPORT_SYMBOL(sync_mapping_buffers);
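/*
 * Illustrative sketch, not taken from this file: a minimal ->fsync
 * implementation for a filesystem that queues its metadata buffers on the
 * mapping's private_list.  "myfs_fsync" and the inode write step are
 * hypothetical placeholders.
 *
 *	static int myfs_fsync(struct file *file, struct dentry *dentry,
 *			      int datasync)
 *	{
 *		struct inode *inode = dentry->d_inode;
 *		int err = sync_mapping_buffers(inode->i_mapping);
 *
 *		...write the inode itself here if it is dirty...
 *		return err;
 *	}
 */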
639
640 /*
641  * Called when we've recently written block `bblock', and it is known that
642  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
643  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
644  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
645  */
646 void write_boundary_block(struct block_device *bdev,
647                         sector_t bblock, unsigned blocksize)
648 {
649         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
650         if (bh) {
651                 if (buffer_dirty(bh))
652                         ll_rw_block(WRITE, 1, &bh);
653                 put_bh(bh);
654         }
655 }
656
657 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
658 {
659         struct address_space *mapping = inode->i_mapping;
660         struct address_space *buffer_mapping = bh->b_page->mapping;
661
662         mark_buffer_dirty(bh);
663         if (!mapping->assoc_mapping) {
664                 mapping->assoc_mapping = buffer_mapping;
665         } else {
666                 BUG_ON(mapping->assoc_mapping != buffer_mapping);
667         }
668         if (list_empty(&bh->b_assoc_buffers)) {
669                 spin_lock(&buffer_mapping->private_lock);
670                 list_move_tail(&bh->b_assoc_buffers,
671                                 &mapping->private_list);
672                 bh->b_assoc_map = mapping;
673                 spin_unlock(&buffer_mapping->private_lock);
674         }
675 }
676 EXPORT_SYMBOL(mark_buffer_dirty_inode);
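/*
 * Illustrative sketch, not part of the original file: after modifying an
 * indirect (metadata) block on behalf of a regular file, a filesystem
 * queues the buffer against the file's mapping so that a later fsync()
 * will find it via sync_mapping_buffers().  "indirect_blk", "slot" and
 * "new_blk" are hypothetical names.
 *
 *	struct buffer_head *bh = sb_bread(inode->i_sb, indirect_blk);
 *
 *	if (bh) {
 *		((__le32 *)bh->b_data)[slot] = cpu_to_le32(new_blk);
 *		mark_buffer_dirty_inode(bh, inode);
 *		brelse(bh);
 *	}
 */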
677
678 /*
679  * Add a page to the dirty page list.
680  *
681  * It is a sad fact of life that this function is called from several places
682  * deeply under spinlocking.  It may not sleep.
683  *
684  * If the page has buffers, the uptodate buffers are set dirty, to preserve
685  * dirty-state coherency between the page and the buffers.  If the page does
686  * not have buffers then when they are later attached they will all be set
687  * dirty.
688  *
689  * The buffers are dirtied before the page is dirtied.  There's a small race
690  * window in which a writepage caller may see the page cleanness but not the
691  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
692  * before the buffers, a concurrent writepage caller could clear the page dirty
693  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
694  * page on the dirty page list.
695  *
696  * We use private_lock to lock against try_to_free_buffers while using the
697  * page's buffer list.  Also use this to protect against clean buffers being
698  * added to the page after it was set dirty.
699  *
700  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
701  * address_space though.
702  */
703 int __set_page_dirty_buffers(struct page *page)
704 {
705         struct address_space * const mapping = page_mapping(page);
706
707         if (unlikely(!mapping))
708                 return !TestSetPageDirty(page);
709
710         spin_lock(&mapping->private_lock);
711         if (page_has_buffers(page)) {
712                 struct buffer_head *head = page_buffers(page);
713                 struct buffer_head *bh = head;
714
715                 do {
716                         set_buffer_dirty(bh);
717                         bh = bh->b_this_page;
718                 } while (bh != head);
719         }
720         spin_unlock(&mapping->private_lock);
721
722         if (TestSetPageDirty(page))
723                 return 0;
724
725         write_lock_irq(&mapping->tree_lock);
726         if (page->mapping) {    /* Race with truncate? */
727                 if (mapping_cap_account_dirty(mapping)) {
728                         __inc_zone_page_state(page, NR_FILE_DIRTY);
729                         task_io_account_write(PAGE_CACHE_SIZE);
730                 }
731                 radix_tree_tag_set(&mapping->page_tree,
732                                 page_index(page), PAGECACHE_TAG_DIRTY);
733         }
734         write_unlock_irq(&mapping->tree_lock);
735         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
736         return 1;
737 }
738 EXPORT_SYMBOL(__set_page_dirty_buffers);
739
740 /*
741  * Write out and wait upon a list of buffers.
742  *
743  * We have conflicting pressures: we want to make sure that all
744  * initially dirty buffers get waited on, but that any subsequently
745  * dirtied buffers don't.  After all, we don't want fsync to last
746  * forever if somebody is actively writing to the file.
747  *
748  * Do this in two main stages: first we copy dirty buffers to a
749  * temporary inode list, queueing the writes as we go.  Then we clean
750  * up, waiting for those writes to complete.
751  * 
752  * During this second stage, any subsequent updates to the file may end
753  * up refiling the buffer on the original inode's dirty list again, so
754  * there is a chance we will end up with a buffer queued for write but
755  * not yet completed on that list.  So, as a final cleanup we go through
756  * the osync code to catch these locked, dirty buffers without requeuing
757  * any newly dirty buffers for write.
758  */
759 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
760 {
761         struct buffer_head *bh;
762         struct list_head tmp;
763         int err = 0, err2;
764
765         INIT_LIST_HEAD(&tmp);
766
767         spin_lock(lock);
768         while (!list_empty(list)) {
769                 bh = BH_ENTRY(list->next);
770                 __remove_assoc_queue(bh);
771                 if (buffer_dirty(bh) || buffer_locked(bh)) {
772                         list_add(&bh->b_assoc_buffers, &tmp);
773                         if (buffer_dirty(bh)) {
774                                 get_bh(bh);
775                                 spin_unlock(lock);
776                                 /*
777                                  * Ensure any pending I/O completes so that
778                                  * ll_rw_block() actually writes the current
779                                  * contents - it is a noop if I/O is still in
780                                  * flight on potentially older contents.
781                                  */
782                                 ll_rw_block(SWRITE, 1, &bh);
783                                 brelse(bh);
784                                 spin_lock(lock);
785                         }
786                 }
787         }
788
789         while (!list_empty(&tmp)) {
790                 bh = BH_ENTRY(tmp.prev);
791                 list_del_init(&bh->b_assoc_buffers);
792                 get_bh(bh);
793                 spin_unlock(lock);
794                 wait_on_buffer(bh);
795                 if (!buffer_uptodate(bh))
796                         err = -EIO;
797                 brelse(bh);
798                 spin_lock(lock);
799         }
800         
801         spin_unlock(lock);
802         err2 = osync_buffers_list(lock, list);
803         if (err)
804                 return err;
805         else
806                 return err2;
807 }
808
809 /*
810  * Invalidate any and all dirty buffers on a given inode.  We are
811  * probably unmounting the fs, but that doesn't mean we have already
812  * done a sync().  Just drop the buffers from the inode list.
813  *
814  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
815  * assumes that all the buffers are against the blockdev.  Not true
816  * for reiserfs.
817  */
818 void invalidate_inode_buffers(struct inode *inode)
819 {
820         if (inode_has_buffers(inode)) {
821                 struct address_space *mapping = &inode->i_data;
822                 struct list_head *list = &mapping->private_list;
823                 struct address_space *buffer_mapping = mapping->assoc_mapping;
824
825                 spin_lock(&buffer_mapping->private_lock);
826                 while (!list_empty(list))
827                         __remove_assoc_queue(BH_ENTRY(list->next));
828                 spin_unlock(&buffer_mapping->private_lock);
829         }
830 }
831
832 /*
833  * Remove any clean buffers from the inode's buffer list.  This is called
834  * when we're trying to free the inode itself.  Those buffers can pin it.
835  *
836  * Returns true if all buffers were removed.
837  */
838 int remove_inode_buffers(struct inode *inode)
839 {
840         int ret = 1;
841
842         if (inode_has_buffers(inode)) {
843                 struct address_space *mapping = &inode->i_data;
844                 struct list_head *list = &mapping->private_list;
845                 struct address_space *buffer_mapping = mapping->assoc_mapping;
846
847                 spin_lock(&buffer_mapping->private_lock);
848                 while (!list_empty(list)) {
849                         struct buffer_head *bh = BH_ENTRY(list->next);
850                         if (buffer_dirty(bh)) {
851                                 ret = 0;
852                                 break;
853                         }
854                         __remove_assoc_queue(bh);
855                 }
856                 spin_unlock(&buffer_mapping->private_lock);
857         }
858         return ret;
859 }
860
861 /*
862  * Create the appropriate buffers when given a page for the data area and
863  * the size of each buffer.  Use the bh->b_this_page linked list to
864  * follow the buffers created.  Return NULL if unable to create more
865  * buffers.
866  *
867  * The retry flag is used to differentiate async IO (paging, swapping),
868  * which may not fail, from ordinary buffer allocations.
869  */
870 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
871                 int retry)
872 {
873         struct buffer_head *bh, *head;
874         long offset;
875
876 try_again:
877         head = NULL;
878         offset = PAGE_SIZE;
879         while ((offset -= size) >= 0) {
880                 bh = alloc_buffer_head(GFP_NOFS);
881                 if (!bh)
882                         goto no_grow;
883
884                 bh->b_bdev = NULL;
885                 bh->b_this_page = head;
886                 bh->b_blocknr = -1;
887                 head = bh;
888
889                 bh->b_state = 0;
890                 atomic_set(&bh->b_count, 0);
891                 bh->b_private = NULL;
892                 bh->b_size = size;
893
894                 /* Link the buffer to its page */
895                 set_bh_page(bh, page, offset);
896
897                 init_buffer(bh, NULL, NULL);
898         }
899         return head;
900 /*
901  * In case anything failed, we just free everything we got.
902  */
903 no_grow:
904         if (head) {
905                 do {
906                         bh = head;
907                         head = head->b_this_page;
908                         free_buffer_head(bh);
909                 } while (head);
910         }
911
912         /*
913          * Return failure for non-async IO requests.  Async IO requests
914          * are not allowed to fail, so we have to wait until buffer heads
915          * become available.  But we don't want tasks sleeping with 
916          * partially complete buffers, so all were released above.
917          */
918         if (!retry)
919                 return NULL;
920
921         /* We're _really_ low on memory. Now we just
922          * wait for old buffer heads to become free due to
923          * finishing IO.  Since this is an async request and
924          * the reserve list is empty, we're sure there are 
925          * async buffer heads in use.
926          */
927         free_more_memory();
928         goto try_again;
929 }
930 EXPORT_SYMBOL_GPL(alloc_page_buffers);
931
932 static inline void
933 link_dev_buffers(struct page *page, struct buffer_head *head)
934 {
935         struct buffer_head *bh, *tail;
936
937         bh = head;
938         do {
939                 tail = bh;
940                 bh = bh->b_this_page;
941         } while (bh);
942         tail->b_this_page = head;
943         attach_page_buffers(page, head);
944 }
945
946 /*
947  * Initialise the state of a blockdev page's buffers.
948  */ 
949 static void
950 init_page_buffers(struct page *page, struct block_device *bdev,
951                         sector_t block, int size)
952 {
953         struct buffer_head *head = page_buffers(page);
954         struct buffer_head *bh = head;
955         int uptodate = PageUptodate(page);
956
957         do {
958                 if (!buffer_mapped(bh)) {
959                         init_buffer(bh, NULL, NULL);
960                         bh->b_bdev = bdev;
961                         bh->b_blocknr = block;
962                         if (uptodate)
963                                 set_buffer_uptodate(bh);
964                         set_buffer_mapped(bh);
965                 }
966                 block++;
967                 bh = bh->b_this_page;
968         } while (bh != head);
969 }
970
971 /*
972  * Create the page-cache page that contains the requested block.
973  *
974  * This is used purely for blockdev mappings.
975  */
976 static struct page *
977 grow_dev_page(struct block_device *bdev, sector_t block,
978                 pgoff_t index, int size)
979 {
980         struct inode *inode = bdev->bd_inode;
981         struct page *page;
982         struct buffer_head *bh;
983
984         page = find_or_create_page(inode->i_mapping, index,
985                 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
986         if (!page)
987                 return NULL;
988
989         BUG_ON(!PageLocked(page));
990
991         if (page_has_buffers(page)) {
992                 bh = page_buffers(page);
993                 if (bh->b_size == size) {
994                         init_page_buffers(page, bdev, block, size);
995                         return page;
996                 }
997                 if (!try_to_free_buffers(page))
998                         goto failed;
999         }
1000
1001         /*
1002          * Allocate some buffers for this page
1003          */
1004         bh = alloc_page_buffers(page, size, 0);
1005         if (!bh)
1006                 goto failed;
1007
1008         /*
1009          * Link the page to the buffers and initialise them.  Take the
1010          * lock to be atomic wrt __find_get_block(), which does not
1011          * run under the page lock.
1012          */
1013         spin_lock(&inode->i_mapping->private_lock);
1014         link_dev_buffers(page, bh);
1015         init_page_buffers(page, bdev, block, size);
1016         spin_unlock(&inode->i_mapping->private_lock);
1017         return page;
1018
1019 failed:
1020         BUG();
1021         unlock_page(page);
1022         page_cache_release(page);
1023         return NULL;
1024 }
1025
1026 /*
1027  * Create buffers for the specified block device block's page.  If
1028  * that page was dirty, the buffers are set dirty also.
1029  *
1030  * Except that's a bug.  Attaching dirty buffers to a dirty
1031  * blockdev's page can result in filesystem corruption, because
1032  * some of those buffers may be aliases of filesystem data.
1033  * grow_dev_page() will go BUG() if this happens.
1034  */
1035 static int
1036 grow_buffers(struct block_device *bdev, sector_t block, int size)
1037 {
1038         struct page *page;
1039         pgoff_t index;
1040         int sizebits;
1041
1042         sizebits = -1;
1043         do {
1044                 sizebits++;
1045         } while ((size << sizebits) < PAGE_SIZE);
1046
1047         index = block >> sizebits;
1048
1049         /*
1050          * Check for a block which wants to lie outside our maximum possible
1051          * pagecache index.  (this comparison is done using sector_t types).
1052          */
1053         if (unlikely(index != block >> sizebits)) {
1054                 char b[BDEVNAME_SIZE];
1055
1056                 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1057                         "device %s\n",
1058                         __FUNCTION__, (unsigned long long)block,
1059                         bdevname(bdev, b));
1060                 return -EIO;
1061         }
1062         block = index << sizebits;
1063         /* Create a page with the proper size buffers.. */
1064         page = grow_dev_page(bdev, block, index, size);
1065         if (!page)
1066                 return 0;
1067         unlock_page(page);
1068         page_cache_release(page);
1069         return 1;
1070 }
1071
1072 static struct buffer_head *
1073 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1074 {
1075         /* Size must be multiple of hard sectorsize */
1076         if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1077                         (size < 512 || size > PAGE_SIZE))) {
1078                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1079                                         size);
1080                 printk(KERN_ERR "hardsect size: %d\n",
1081                                         bdev_hardsect_size(bdev));
1082
1083                 dump_stack();
1084                 return NULL;
1085         }
1086
1087         for (;;) {
1088                 struct buffer_head * bh;
1089                 int ret;
1090
1091                 bh = __find_get_block(bdev, block, size);
1092                 if (bh)
1093                         return bh;
1094
1095                 ret = grow_buffers(bdev, block, size);
1096                 if (ret < 0)
1097                         return NULL;
1098                 if (ret == 0)
1099                         free_more_memory();
1100         }
1101 }
1102
1103 /*
1104  * The relationship between dirty buffers and dirty pages:
1105  *
1106  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1107  * the page is tagged dirty in its radix tree.
1108  *
1109  * At all times, the dirtiness of the buffers represents the dirtiness of
1110  * subsections of the page.  If the page has buffers, the page dirty bit is
1111  * merely a hint about the true dirty state.
1112  *
1113  * When a page is set dirty in its entirety, all its buffers are marked dirty
1114  * (if the page has buffers).
1115  *
1116  * When a buffer is marked dirty, its page is dirtied, but the page's other
1117  * buffers are not.
1118  *
1119  * Also.  When blockdev buffers are explicitly read with bread(), they
1120  * individually become uptodate.  But their backing page remains not
1121  * uptodate - even if all of its buffers are uptodate.  A subsequent
1122  * block_read_full_page() against that page will discover all the uptodate
1123  * buffers, will set the page uptodate and will perform no I/O.
1124  */
1125
1126 /**
1127  * mark_buffer_dirty - mark a buffer_head as needing writeout
1128  * @bh: the buffer_head to mark dirty
1129  *
1130  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1131  * backing page dirty, then tag the page as dirty in its address_space's radix
1132  * tree and then attach the address_space's inode to its superblock's dirty
1133  * inode list.
1134  *
1135  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1136  * mapping->tree_lock and the global inode_lock.
1137  */
1138 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1139 {
1140         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1141                 __set_page_dirty_nobuffers(bh->b_page);
1142 }
1143
1144 /*
1145  * Decrement a buffer_head's reference count.  If all buffers against a page
1146  * have zero reference count, are clean and unlocked, and if the page is clean
1147  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1148  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1149  * a page but it ends up not being freed, and buffers may later be reattached).
1150  */
1151 void __brelse(struct buffer_head * buf)
1152 {
1153         if (atomic_read(&buf->b_count)) {
1154                 put_bh(buf);
1155                 return;
1156         }
1157         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1158         WARN_ON(1);
1159 }
1160
1161 /*
1162  * bforget() is like brelse(), except it discards any
1163  * potentially dirty data.
1164  */
1165 void __bforget(struct buffer_head *bh)
1166 {
1167         clear_buffer_dirty(bh);
1168         if (!list_empty(&bh->b_assoc_buffers)) {
1169                 struct address_space *buffer_mapping = bh->b_page->mapping;
1170
1171                 spin_lock(&buffer_mapping->private_lock);
1172                 list_del_init(&bh->b_assoc_buffers);
1173                 bh->b_assoc_map = NULL;
1174                 spin_unlock(&buffer_mapping->private_lock);
1175         }
1176         __brelse(bh);
1177 }
1178
1179 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1180 {
1181         lock_buffer(bh);
1182         if (buffer_uptodate(bh)) {
1183                 unlock_buffer(bh);
1184                 return bh;
1185         } else {
1186                 get_bh(bh);
1187                 bh->b_end_io = end_buffer_read_sync;
1188                 submit_bh(READ, bh);
1189                 wait_on_buffer(bh);
1190                 if (buffer_uptodate(bh))
1191                         return bh;
1192         }
1193         brelse(bh);
1194         return NULL;
1195 }
1196
1197 /*
1198  * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block().
1199  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1200  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1201  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1202  * CPU's LRUs at the same time.
1203  *
1204  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1205  * sb_find_get_block().
1206  *
1207  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1208  * a local interrupt disable for that.
1209  */
1210
1211 #define BH_LRU_SIZE     8
1212
1213 struct bh_lru {
1214         struct buffer_head *bhs[BH_LRU_SIZE];
1215 };
1216
1217 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1218
1219 #ifdef CONFIG_SMP
1220 #define bh_lru_lock()   local_irq_disable()
1221 #define bh_lru_unlock() local_irq_enable()
1222 #else
1223 #define bh_lru_lock()   preempt_disable()
1224 #define bh_lru_unlock() preempt_enable()
1225 #endif
1226
1227 static inline void check_irqs_on(void)
1228 {
1229 #ifdef irqs_disabled
1230         BUG_ON(irqs_disabled());
1231 #endif
1232 }
1233
1234 /*
1235  * The LRU management algorithm is dopey-but-simple.  Sorry.
1236  */
1237 static void bh_lru_install(struct buffer_head *bh)
1238 {
1239         struct buffer_head *evictee = NULL;
1240         struct bh_lru *lru;
1241
1242         check_irqs_on();
1243         bh_lru_lock();
1244         lru = &__get_cpu_var(bh_lrus);
1245         if (lru->bhs[0] != bh) {
1246                 struct buffer_head *bhs[BH_LRU_SIZE];
1247                 int in;
1248                 int out = 0;
1249
1250                 get_bh(bh);
1251                 bhs[out++] = bh;
1252                 for (in = 0; in < BH_LRU_SIZE; in++) {
1253                         struct buffer_head *bh2 = lru->bhs[in];
1254
1255                         if (bh2 == bh) {
1256                                 __brelse(bh2);
1257                         } else {
1258                                 if (out >= BH_LRU_SIZE) {
1259                                         BUG_ON(evictee != NULL);
1260                                         evictee = bh2;
1261                                 } else {
1262                                         bhs[out++] = bh2;
1263                                 }
1264                         }
1265                 }
1266                 while (out < BH_LRU_SIZE)
1267                         bhs[out++] = NULL;
1268                 memcpy(lru->bhs, bhs, sizeof(bhs));
1269         }
1270         bh_lru_unlock();
1271
1272         if (evictee)
1273                 __brelse(evictee);
1274 }
1275
1276 /*
1277  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1278  */
1279 static struct buffer_head *
1280 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1281 {
1282         struct buffer_head *ret = NULL;
1283         struct bh_lru *lru;
1284         unsigned int i;
1285
1286         check_irqs_on();
1287         bh_lru_lock();
1288         lru = &__get_cpu_var(bh_lrus);
1289         for (i = 0; i < BH_LRU_SIZE; i++) {
1290                 struct buffer_head *bh = lru->bhs[i];
1291
1292                 if (bh && bh->b_bdev == bdev &&
1293                                 bh->b_blocknr == block && bh->b_size == size) {
1294                         if (i) {
1295                                 while (i) {
1296                                         lru->bhs[i] = lru->bhs[i - 1];
1297                                         i--;
1298                                 }
1299                                 lru->bhs[0] = bh;
1300                         }
1301                         get_bh(bh);
1302                         ret = bh;
1303                         break;
1304                 }
1305         }
1306         bh_lru_unlock();
1307         return ret;
1308 }
1309
1310 /*
1311  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1312  * it in the LRU and mark it as accessed.  If it is not present then return
1313  * NULL.
1314  */
1315 struct buffer_head *
1316 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1317 {
1318         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1319
1320         if (bh == NULL) {
1321                 bh = __find_get_block_slow(bdev, block);
1322                 if (bh)
1323                         bh_lru_install(bh);
1324         }
1325         if (bh)
1326                 touch_buffer(bh);
1327         return bh;
1328 }
1329 EXPORT_SYMBOL(__find_get_block);
1330
1331 /*
1332  * __getblk will locate (and, if necessary, create) the buffer_head
1333  * which corresponds to the passed block_device, block and size. The
1334  * returned buffer has its reference count incremented.
1335  *
1336  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1337  * illegal block number, __getblk() will happily return a buffer_head
1338  * which represents the non-existent block.  Very weird.
1339  *
1340  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1341  * attempt is failing.  FIXME, perhaps?
1342  */
1343 struct buffer_head *
1344 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1345 {
1346         struct buffer_head *bh = __find_get_block(bdev, block, size);
1347
1348         might_sleep();
1349         if (bh == NULL)
1350                 bh = __getblk_slow(bdev, block, size);
1351         return bh;
1352 }
1353 EXPORT_SYMBOL(__getblk);
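/*
 * Illustrative sketch, an assumption about typical use rather than code
 * from this file: the usual pattern for initialising a brand-new metadata
 * block is to grab it with sb_getblk() (a wrapper around __getblk()),
 * fill it while locked, then mark it dirty.  "blocknr" is a hypothetical
 * block number.
 *
 *	struct buffer_head *bh = sb_getblk(sb, blocknr);
 *
 *	lock_buffer(bh);
 *	memset(bh->b_data, 0, sb->s_blocksize);
 *	set_buffer_uptodate(bh);
 *	unlock_buffer(bh);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 */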
1354
1355 /*
1356  * Do async read-ahead on a buffer..
1357  */
1358 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1359 {
1360         struct buffer_head *bh = __getblk(bdev, block, size);
1361         if (likely(bh)) {
1362                 ll_rw_block(READA, 1, &bh);
1363                 brelse(bh);
1364         }
1365 }
1366 EXPORT_SYMBOL(__breadahead);
1367
1368 /**
1369  *  __bread() - reads a specified block and returns the bh
1370  *  @bdev: the block_device to read from
1371  *  @block: number of block
1372  *  @size: size (in bytes) to read
1373  * 
1374  *  Reads a specified block, and returns buffer head that contains it.
1375  *  It returns NULL if the block was unreadable.
1376  */
1377 struct buffer_head *
1378 __bread(struct block_device *bdev, sector_t block, unsigned size)
1379 {
1380         struct buffer_head *bh = __getblk(bdev, block, size);
1381
1382         if (likely(bh) && !buffer_uptodate(bh))
1383                 bh = __bread_slow(bh);
1384         return bh;
1385 }
1386 EXPORT_SYMBOL(__bread);
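/*
 * Illustrative sketch, not from this file: reading a metadata block
 * through the sb_bread() wrapper and checking for I/O errors.  "blocknr"
 * is a hypothetical block number.
 *
 *	struct buffer_head *bh = sb_bread(sb, blocknr);
 *
 *	if (!bh)
 *		return -EIO;
 *	...use bh->b_data...
 *	brelse(bh);
 */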
1387
1388 /*
1389  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1390  * This doesn't race because it runs on each CPU either in irq context
1391  * or with preemption disabled.
1392  */
1393 static void invalidate_bh_lru(void *arg)
1394 {
1395         struct bh_lru *b = &get_cpu_var(bh_lrus);
1396         int i;
1397
1398         for (i = 0; i < BH_LRU_SIZE; i++) {
1399                 brelse(b->bhs[i]);
1400                 b->bhs[i] = NULL;
1401         }
1402         put_cpu_var(bh_lrus);
1403 }
1404         
1405 void invalidate_bh_lrus(void)
1406 {
1407         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1408 }
1409
1410 void set_bh_page(struct buffer_head *bh,
1411                 struct page *page, unsigned long offset)
1412 {
1413         bh->b_page = page;
1414         BUG_ON(offset >= PAGE_SIZE);
1415         if (PageHighMem(page))
1416                 /*
1417                  * This catches illegal uses and preserves the offset:
1418                  */
1419                 bh->b_data = (char *)(0 + offset);
1420         else
1421                 bh->b_data = page_address(page) + offset;
1422 }
1423 EXPORT_SYMBOL(set_bh_page);
1424
1425 /*
1426  * Called when truncating a buffer on a page completely.
1427  */
1428 static void discard_buffer(struct buffer_head * bh)
1429 {
1430         lock_buffer(bh);
1431         clear_buffer_dirty(bh);
1432         bh->b_bdev = NULL;
1433         clear_buffer_mapped(bh);
1434         clear_buffer_req(bh);
1435         clear_buffer_new(bh);
1436         clear_buffer_delay(bh);
1437         clear_buffer_unwritten(bh);
1438         unlock_buffer(bh);
1439 }
1440
1441 /**
1442  * block_invalidatepage - invalidate part or all of a buffer-backed page
1443  *
1444  * @page: the page which is affected
1445  * @offset: the index of the truncation point
1446  *
1447  * block_invalidatepage() is called when all or part of the page has become
1448  * invalidated by a truncate operation.
1449  *
1450  * block_invalidatepage() does not have to release all buffers, but it must
1451  * ensure that no dirty buffer is left outside @offset and that no I/O
1452  * is underway against any of the blocks which are outside the truncation
1453  * point, because the caller is about to free (and possibly reuse) those
1454  * blocks on-disk.
1455  */
1456 void block_invalidatepage(struct page *page, unsigned long offset)
1457 {
1458         struct buffer_head *head, *bh, *next;
1459         unsigned int curr_off = 0;
1460
1461         BUG_ON(!PageLocked(page));
1462         if (!page_has_buffers(page))
1463                 goto out;
1464
1465         head = page_buffers(page);
1466         bh = head;
1467         do {
1468                 unsigned int next_off = curr_off + bh->b_size;
1469                 next = bh->b_this_page;
1470
1471                 /*
1472                  * is this block fully invalidated?
1473                  */
1474                 if (offset <= curr_off)
1475                         discard_buffer(bh);
1476                 curr_off = next_off;
1477                 bh = next;
1478         } while (bh != head);
1479
1480         /*
1481          * We release buffers only if the entire page is being invalidated.
1482          * The get_block cached value has been unconditionally invalidated,
1483          * so real IO is not possible anymore.
1484          */
1485         if (offset == 0)
1486                 try_to_release_page(page, 0);
1487 out:
1488         return;
1489 }
1490 EXPORT_SYMBOL(block_invalidatepage);
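/*
 * Illustrative sketch, an assumption rather than code from this file:
 * a buffer-head based filesystem usually wires block_invalidatepage()
 * directly into its address_space_operations.  "myfs" and its readpage/
 * writepage methods are hypothetical.
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.readpage	= myfs_readpage,
 *		.writepage	= myfs_writepage,
 *		.invalidatepage	= block_invalidatepage,
 *	};
 */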
1491
1492 /*
1493  * We attach and possibly dirty the buffers atomically wrt
1494  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1495  * is already excluded via the page lock.
1496  */
1497 void create_empty_buffers(struct page *page,
1498                         unsigned long blocksize, unsigned long b_state)
1499 {
1500         struct buffer_head *bh, *head, *tail;
1501
1502         head = alloc_page_buffers(page, blocksize, 1);
1503         bh = head;
1504         do {
1505                 bh->b_state |= b_state;
1506                 tail = bh;
1507                 bh = bh->b_this_page;
1508         } while (bh);
1509         tail->b_this_page = head;
1510
1511         spin_lock(&page->mapping->private_lock);
1512         if (PageUptodate(page) || PageDirty(page)) {
1513                 bh = head;
1514                 do {
1515                         if (PageDirty(page))
1516                                 set_buffer_dirty(bh);
1517                         if (PageUptodate(page))
1518                                 set_buffer_uptodate(bh);
1519                         bh = bh->b_this_page;
1520                 } while (bh != head);
1521         }
1522         attach_page_buffers(page, head);
1523         spin_unlock(&page->mapping->private_lock);
1524 }
1525 EXPORT_SYMBOL(create_empty_buffers);
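/*
 * Illustrative sketch (not part of this file): callers make sure a page has
 * buffers before walking them, the same way the helpers below do:
 *
 *	if (!page_has_buffers(page))
 *		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 *	head = page_buffers(page);
 *
 * The third argument seeds b_state, e.g. (1 << BH_Dirty)|(1 << BH_Uptodate)
 * when the page itself is already known to be dirty and uptodate.
 */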
1526
1527 /*
1528  * We are taking a block for data and we don't want any output from any
1529  * buffer-cache aliases from the moment this function returns until
1530  * something explicitly marks the buffer dirty (hopefully that will not
1531  * happen until we free that block ;-)
1532  * We don't even need to mark it not-uptodate - nobody can expect
1533  * anything from a newly allocated buffer anyway. We used to use
1534  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1535  * don't want to mark the alias unmapped, for example - it would confuse
1536  * anyone who might pick it up with bread() afterwards...
1537  *
1538  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1539  * be writeout I/O going on against recently-freed buffers.  We don't
1540  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1541  * only if we really need to.  That happens here.
1542  */
1543 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1544 {
1545         struct buffer_head *old_bh;
1546
1547         might_sleep();
1548
1549         old_bh = __find_get_block_slow(bdev, block);
1550         if (old_bh) {
1551                 clear_buffer_dirty(old_bh);
1552                 wait_on_buffer(old_bh);
1553                 clear_buffer_req(old_bh);
1554                 __brelse(old_bh);
1555         }
1556 }
1557 EXPORT_SYMBOL(unmap_underlying_metadata);
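/*
 * Illustrative sketch (not part of this file): the typical caller is a
 * get_block-based write path that has just allocated a new block, exactly as
 * __block_write_full_page() and __block_prepare_write() below do:
 *
 *	err = get_block(inode, block, bh, 1);
 *	if (!err && buffer_new(bh))
 *		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 */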
1558
1559 /*
1560  * NOTE! All mapped/uptodate combinations are valid:
1561  *
1562  *      Mapped  Uptodate        Meaning
1563  *
1564  *      No      No              "unknown" - must do get_block()
1565  *      No      Yes             "hole" - zero-filled
1566  *      Yes     No              "allocated" - allocated on disk, not read in
1567  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1568  *
1569  * "Dirty" is valid only with the last case (mapped+uptodate).
1570  */
1571
1572 /*
1573  * While block_write_full_page is writing back the dirty buffers under
1574  * the page lock, whoever dirtied the buffers may decide to clean them
1575  * again at any time.  We handle that by only looking at the buffer
1576  * state inside lock_buffer().
1577  *
1578  * If block_write_full_page() is called for regular writeback
1579  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1580  * locked buffer.  This can only happen if someone has written the buffer
1581  * directly, with submit_bh().  At the address_space level PageWriteback
1582  * prevents this contention from occurring.
1583  */
1584 static int __block_write_full_page(struct inode *inode, struct page *page,
1585                         get_block_t *get_block, struct writeback_control *wbc)
1586 {
1587         int err;
1588         sector_t block;
1589         sector_t last_block;
1590         struct buffer_head *bh, *head;
1591         const unsigned blocksize = 1 << inode->i_blkbits;
1592         int nr_underway = 0;
1593
1594         BUG_ON(!PageLocked(page));
1595
1596         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1597
1598         if (!page_has_buffers(page)) {
1599                 create_empty_buffers(page, blocksize,
1600                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1601         }
1602
1603         /*
1604          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1605          * here, and the (potentially unmapped) buffers may become dirty at
1606          * any time.  If a buffer becomes dirty here after we've inspected it
1607          * then we just miss that fact, and the page stays dirty.
1608          *
1609          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1610          * handle that here by just cleaning them.
1611          */
1612
1613         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1614         head = page_buffers(page);
1615         bh = head;
1616
1617         /*
1618          * Get all the dirty buffers mapped to disk addresses and
1619          * handle any aliases from the underlying blockdev's mapping.
1620          */
1621         do {
1622                 if (block > last_block) {
1623                         /*
1624                          * mapped buffers outside i_size will occur, because
1625                          * this page can be outside i_size when there is a
1626                          * truncate in progress.
1627                          */
1628                         /*
1629                          * The buffer was zeroed by block_write_full_page()
1630                          */
1631                         clear_buffer_dirty(bh);
1632                         set_buffer_uptodate(bh);
1633                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1634                         WARN_ON(bh->b_size != blocksize);
1635                         err = get_block(inode, block, bh, 1);
1636                         if (err)
1637                                 goto recover;
1638                         if (buffer_new(bh)) {
1639                                 /* blockdev mappings never come here */
1640                                 clear_buffer_new(bh);
1641                                 unmap_underlying_metadata(bh->b_bdev,
1642                                                         bh->b_blocknr);
1643                         }
1644                 }
1645                 bh = bh->b_this_page;
1646                 block++;
1647         } while (bh != head);
1648
1649         do {
1650                 if (!buffer_mapped(bh))
1651                         continue;
1652                 /*
1653                  * If it's a fully non-blocking write attempt and we cannot
1654                  * lock the buffer then redirty the page.  Note that this can
1655                  * potentially cause a busy-wait loop from pdflush and kswapd
1656                  * activity, but those code paths have their own higher-level
1657                  * throttling.
1658                  */
1659                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1660                         lock_buffer(bh);
1661                 } else if (test_set_buffer_locked(bh)) {
1662                         redirty_page_for_writepage(wbc, page);
1663                         continue;
1664                 }
1665                 if (test_clear_buffer_dirty(bh)) {
1666                         mark_buffer_async_write(bh);
1667                 } else {
1668                         unlock_buffer(bh);
1669                 }
1670         } while ((bh = bh->b_this_page) != head);
1671
1672         /*
1673          * The page and its buffers are protected by PageWriteback(), so we can
1674          * drop the bh refcounts early.
1675          */
1676         BUG_ON(PageWriteback(page));
1677         set_page_writeback(page);
1678
1679         do {
1680                 struct buffer_head *next = bh->b_this_page;
1681                 if (buffer_async_write(bh)) {
1682                         submit_bh(WRITE, bh);
1683                         nr_underway++;
1684                 }
1685                 bh = next;
1686         } while (bh != head);
1687         unlock_page(page);
1688
1689         err = 0;
1690 done:
1691         if (nr_underway == 0) {
1692                 /*
1693                  * The page was marked dirty, but the buffers were
1694                  * clean.  Someone wrote them back by hand with
1695                  * ll_rw_block/submit_bh.  A rare case.
1696                  */
1697                 end_page_writeback(page);
1698
1699                 /*
1700                  * The page and buffer_heads can be released at any time from
1701                  * here on.
1702                  */
1703                 wbc->pages_skipped++;   /* We didn't write this page */
1704         }
1705         return err;
1706
1707 recover:
1708         /*
1709          * ENOSPC, or some other error.  We may already have added some
1710          * blocks to the file, so we need to write these out to avoid
1711          * exposing stale data.
1712          * The page is currently locked and not marked for writeback
1713          */
1714         bh = head;
1715         /* Recovery: lock and submit the mapped buffers */
1716         do {
1717                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1718                         lock_buffer(bh);
1719                         mark_buffer_async_write(bh);
1720                 } else {
1721                         /*
1722                          * The buffer may have been set dirty during
1723                          * attachment to a dirty page.
1724                          */
1725                         clear_buffer_dirty(bh);
1726                 }
1727         } while ((bh = bh->b_this_page) != head);
1728         SetPageError(page);
1729         BUG_ON(PageWriteback(page));
1730         mapping_set_error(page->mapping, err);
1731         set_page_writeback(page);
1732         do {
1733                 struct buffer_head *next = bh->b_this_page;
1734                 if (buffer_async_write(bh)) {
1735                         clear_buffer_dirty(bh);
1736                         submit_bh(WRITE, bh);
1737                         nr_underway++;
1738                 }
1739                 bh = next;
1740         } while (bh != head);
1741         unlock_page(page);
1742         goto done;
1743 }
1744
1745 static int __block_prepare_write(struct inode *inode, struct page *page,
1746                 unsigned from, unsigned to, get_block_t *get_block)
1747 {
1748         unsigned block_start, block_end;
1749         sector_t block;
1750         int err = 0;
1751         unsigned blocksize, bbits;
1752         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1753
1754         BUG_ON(!PageLocked(page));
1755         BUG_ON(from > PAGE_CACHE_SIZE);
1756         BUG_ON(to > PAGE_CACHE_SIZE);
1757         BUG_ON(from > to);
1758
1759         blocksize = 1 << inode->i_blkbits;
1760         if (!page_has_buffers(page))
1761                 create_empty_buffers(page, blocksize, 0);
1762         head = page_buffers(page);
1763
1764         bbits = inode->i_blkbits;
1765         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1766
1767         for(bh = head, block_start = 0; bh != head || !block_start;
1768             block++, block_start=block_end, bh = bh->b_this_page) {
1769                 block_end = block_start + blocksize;
1770                 if (block_end <= from || block_start >= to) {
1771                         if (PageUptodate(page)) {
1772                                 if (!buffer_uptodate(bh))
1773                                         set_buffer_uptodate(bh);
1774                         }
1775                         continue;
1776                 }
1777                 if (buffer_new(bh))
1778                         clear_buffer_new(bh);
1779                 if (!buffer_mapped(bh)) {
1780                         WARN_ON(bh->b_size != blocksize);
1781                         err = get_block(inode, block, bh, 1);
1782                         if (err)
1783                                 break;
1784                         if (buffer_new(bh)) {
1785                                 unmap_underlying_metadata(bh->b_bdev,
1786                                                         bh->b_blocknr);
1787                                 if (PageUptodate(page)) {
1788                                         set_buffer_uptodate(bh);
1789                                         continue;
1790                                 }
1791                                 if (block_end > to || block_start < from) {
1792                                         void *kaddr;
1793
1794                                         kaddr = kmap_atomic(page, KM_USER0);
1795                                         if (block_end > to)
1796                                                 memset(kaddr+to, 0,
1797                                                         block_end-to);
1798                                         if (block_start < from)
1799                                                 memset(kaddr+block_start,
1800                                                         0, from-block_start);
1801                                         flush_dcache_page(page);
1802                                         kunmap_atomic(kaddr, KM_USER0);
1803                                 }
1804                                 continue;
1805                         }
1806                 }
1807                 if (PageUptodate(page)) {
1808                         if (!buffer_uptodate(bh))
1809                                 set_buffer_uptodate(bh);
1810                         continue; 
1811                 }
1812                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1813                     !buffer_unwritten(bh) &&
1814                      (block_start < from || block_end > to)) {
1815                         ll_rw_block(READ, 1, &bh);
1816                         *wait_bh++=bh;
1817                 }
1818         }
1819         /*
1820          * If we issued read requests - let them complete.
1821          */
1822         while(wait_bh > wait) {
1823                 wait_on_buffer(*--wait_bh);
1824                 if (!buffer_uptodate(*wait_bh))
1825                         err = -EIO;
1826         }
1827         if (!err) {
1828                 bh = head;
1829                 do {
1830                         if (buffer_new(bh))
1831                                 clear_buffer_new(bh);
1832                 } while ((bh = bh->b_this_page) != head);
1833                 return 0;
1834         }
1835         /* Error case: */
1836         /*
1837          * Zero out any newly allocated blocks to avoid exposing stale
1838          * data.  If BH_New is set, we know that the block was newly
1839          * allocated in the above loop.
1840          */
1841         bh = head;
1842         block_start = 0;
1843         do {
1844                 block_end = block_start+blocksize;
1845                 if (block_end <= from)
1846                         goto next_bh;
1847                 if (block_start >= to)
1848                         break;
1849                 if (buffer_new(bh)) {
1850                         clear_buffer_new(bh);
1851                         zero_user_page(page, block_start, bh->b_size, KM_USER0);
1852                         set_buffer_uptodate(bh);
1853                         mark_buffer_dirty(bh);
1854                 }
1855 next_bh:
1856                 block_start = block_end;
1857                 bh = bh->b_this_page;
1858         } while (bh != head);
1859         return err;
1860 }
1861
1862 static int __block_commit_write(struct inode *inode, struct page *page,
1863                 unsigned from, unsigned to)
1864 {
1865         unsigned block_start, block_end;
1866         int partial = 0;
1867         unsigned blocksize;
1868         struct buffer_head *bh, *head;
1869
1870         blocksize = 1 << inode->i_blkbits;
1871
1872         for(bh = head = page_buffers(page), block_start = 0;
1873             bh != head || !block_start;
1874             block_start=block_end, bh = bh->b_this_page) {
1875                 block_end = block_start + blocksize;
1876                 if (block_end <= from || block_start >= to) {
1877                         if (!buffer_uptodate(bh))
1878                                 partial = 1;
1879                 } else {
1880                         set_buffer_uptodate(bh);
1881                         mark_buffer_dirty(bh);
1882                 }
1883         }
1884
1885         /*
1886          * If this is a partial write which happened to make all buffers
1887          * uptodate then we can optimize away a bogus readpage() for
1888          * the next read(). Here we 'discover' whether the page went
1889          * uptodate as a result of this (potentially partial) write.
1890          */
1891         if (!partial)
1892                 SetPageUptodate(page);
1893         return 0;
1894 }
1895
1896 /*
1897  * Generic "read page" function for block devices that have the normal
1898  * get_block functionality. This is most of the block device filesystems.
1899  * Reads the page asynchronously --- the unlock_buffer() and
1900  * set/clear_buffer_uptodate() functions propagate buffer state into the
1901  * page struct once IO has completed.
1902  */
1903 int block_read_full_page(struct page *page, get_block_t *get_block)
1904 {
1905         struct inode *inode = page->mapping->host;
1906         sector_t iblock, lblock;
1907         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1908         unsigned int blocksize;
1909         int nr, i;
1910         int fully_mapped = 1;
1911
1912         BUG_ON(!PageLocked(page));
1913         blocksize = 1 << inode->i_blkbits;
1914         if (!page_has_buffers(page))
1915                 create_empty_buffers(page, blocksize, 0);
1916         head = page_buffers(page);
1917
1918         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1919         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
1920         bh = head;
1921         nr = 0;
1922         i = 0;
1923
1924         do {
1925                 if (buffer_uptodate(bh))
1926                         continue;
1927
1928                 if (!buffer_mapped(bh)) {
1929                         int err = 0;
1930
1931                         fully_mapped = 0;
1932                         if (iblock < lblock) {
1933                                 WARN_ON(bh->b_size != blocksize);
1934                                 err = get_block(inode, iblock, bh, 0);
1935                                 if (err)
1936                                         SetPageError(page);
1937                         }
1938                         if (!buffer_mapped(bh)) {
1939                                 zero_user_page(page, i * blocksize, blocksize,
1940                                                 KM_USER0);
1941                                 if (!err)
1942                                         set_buffer_uptodate(bh);
1943                                 continue;
1944                         }
1945                         /*
1946                          * get_block() might have updated the buffer
1947                          * synchronously
1948                          */
1949                         if (buffer_uptodate(bh))
1950                                 continue;
1951                 }
1952                 arr[nr++] = bh;
1953         } while (i++, iblock++, (bh = bh->b_this_page) != head);
1954
1955         if (fully_mapped)
1956                 SetPageMappedToDisk(page);
1957
1958         if (!nr) {
1959                 /*
1960                  * All buffers are uptodate - we can set the page uptodate
1961                  * as well. But not if get_block() returned an error.
1962                  */
1963                 if (!PageError(page))
1964                         SetPageUptodate(page);
1965                 unlock_page(page);
1966                 return 0;
1967         }
1968
1969         /* Stage two: lock the buffers */
1970         for (i = 0; i < nr; i++) {
1971                 bh = arr[i];
1972                 lock_buffer(bh);
1973                 mark_buffer_async_read(bh);
1974         }
1975
1976         /*
1977          * Stage 3: start the IO.  Check for uptodateness
1978          * inside the buffer lock in case another process reading
1979          * the underlying blockdev brought it uptodate (the sct fix).
1980          */
1981         for (i = 0; i < nr; i++) {
1982                 bh = arr[i];
1983                 if (buffer_uptodate(bh))
1984                         end_buffer_async_read(bh, 1);
1985                 else
1986                         submit_bh(READ, bh);
1987         }
1988         return 0;
1989 }
1990
1991 /* utility function for filesystems that need to do work on expanding
1992  * truncates.  Uses prepare/commit_write to allow the filesystem to
1993  * deal with the hole.  
1994  */
1995 static int __generic_cont_expand(struct inode *inode, loff_t size,
1996                                  pgoff_t index, unsigned int offset)
1997 {
1998         struct address_space *mapping = inode->i_mapping;
1999         struct page *page;
2000         unsigned long limit;
2001         int err;
2002
2003         err = -EFBIG;
2004         limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2005         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2006                 send_sig(SIGXFSZ, current, 0);
2007                 goto out;
2008         }
2009         if (size > inode->i_sb->s_maxbytes)
2010                 goto out;
2011
2012         err = -ENOMEM;
2013         page = grab_cache_page(mapping, index);
2014         if (!page)
2015                 goto out;
2016         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2017         if (err) {
2018                 /*
2019                  * ->prepare_write() may have instantiated a few blocks
2020                  * outside i_size.  Trim these off again.
2021                  */
2022                 unlock_page(page);
2023                 page_cache_release(page);
2024                 vmtruncate(inode, inode->i_size);
2025                 goto out;
2026         }
2027
2028         err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2029
2030         unlock_page(page);
2031         page_cache_release(page);
2032         if (err > 0)
2033                 err = 0;
2034 out:
2035         return err;
2036 }
2037
2038 int generic_cont_expand(struct inode *inode, loff_t size)
2039 {
2040         pgoff_t index;
2041         unsigned int offset;
2042
2043         offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2044
2045         /* ugh.  in prepare/commit_write, if from==to==start of block, we
2046          * skip the prepare.  Make sure we never send an offset for the start
2047          * of a block.
2048          */
2049         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2050                 /* caller must handle this extra byte. */
2051                 offset++;
2052         }
2053         index = size >> PAGE_CACHE_SHIFT;
2054
2055         return __generic_cont_expand(inode, size, index, offset);
2056 }
2057
2058 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2059 {
2060         loff_t pos = size - 1;
2061         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2062         unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2063
2064         /* prepare/commit_write can handle even if from==to==start of block. */
2065         return __generic_cont_expand(inode, size, index, offset);
2066 }
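/*
 * Illustrative sketch (not part of this file): a filesystem that cannot
 * represent holes may extend the file from its setattr path before a
 * size-increasing truncate; "attr" is the usual struct iattr argument:
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
 *		err = generic_cont_expand_simple(inode, attr->ia_size);
 *		if (err)
 *			return err;
 *	}
 */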
2067
2068 /*
2069  * For moronic filesystems that do not allow holes in files.
2070  * We may have to extend the file.
2071  */
2072
2073 int cont_prepare_write(struct page *page, unsigned offset,
2074                 unsigned to, get_block_t *get_block, loff_t *bytes)
2075 {
2076         struct address_space *mapping = page->mapping;
2077         struct inode *inode = mapping->host;
2078         struct page *new_page;
2079         pgoff_t pgpos;
2080         long status;
2081         unsigned zerofrom;
2082         unsigned blocksize = 1 << inode->i_blkbits;
2083
2084         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2085                 status = -ENOMEM;
2086                 new_page = grab_cache_page(mapping, pgpos);
2087                 if (!new_page)
2088                         goto out;
2089                 /* we might sleep */
2090                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2091                         unlock_page(new_page);
2092                         page_cache_release(new_page);
2093                         continue;
2094                 }
2095                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2096                 if (zerofrom & (blocksize-1)) {
2097                         *bytes |= (blocksize-1);
2098                         (*bytes)++;
2099                 }
2100                 status = __block_prepare_write(inode, new_page, zerofrom,
2101                                                 PAGE_CACHE_SIZE, get_block);
2102                 if (status)
2103                         goto out_unmap;
2104                 zero_user_page(new_page, zerofrom, PAGE_CACHE_SIZE - zerofrom,
2105                                 KM_USER0);
2106                 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2107                 unlock_page(new_page);
2108                 page_cache_release(new_page);
2109         }
2110
2111         if (page->index < pgpos) {
2112                 /* completely inside the area */
2113                 zerofrom = offset;
2114         } else {
2115                 /* page covers the boundary, find the boundary offset */
2116                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2117
2118                 /* if we will expand the thing last block will be filled */
2119                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2120                         *bytes |= (blocksize-1);
2121                         (*bytes)++;
2122                 }
2123
2124                 /* starting below the boundary? Nothing to zero out */
2125                 if (offset <= zerofrom)
2126                         zerofrom = offset;
2127         }
2128         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2129         if (status)
2130                 goto out1;
2131         if (zerofrom < offset) {
2132                 zero_user_page(page, zerofrom, offset - zerofrom, KM_USER0);
2133                 __block_commit_write(inode, page, zerofrom, offset);
2134         }
2135         return 0;
2136 out1:
2137         ClearPageUptodate(page);
2138         return status;
2139
2140 out_unmap:
2141         ClearPageUptodate(new_page);
2142         unlock_page(new_page);
2143         page_cache_release(new_page);
2144 out:
2145         return status;
2146 }
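/*
 * Illustrative sketch (not part of this file): a hole-less filesystem wires
 * this up as its ->prepare_write, passing a per-inode byte count that tracks
 * how far the file has been initialised on disk; the "foo"/"FOO_I" names and
 * the mmu_private field are hypothetical:
 *
 *	static int foo_prepare_write(struct file *file, struct page *page,
 *				     unsigned from, unsigned to)
 *	{
 *		return cont_prepare_write(page, from, to, foo_get_block,
 *				&FOO_I(page->mapping->host)->mmu_private);
 *	}
 */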
2147
2148 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2149                         get_block_t *get_block)
2150 {
2151         struct inode *inode = page->mapping->host;
2152         int err = __block_prepare_write(inode, page, from, to, get_block);
2153         if (err)
2154                 ClearPageUptodate(page);
2155         return err;
2156 }
2157
2158 int block_commit_write(struct page *page, unsigned from, unsigned to)
2159 {
2160         struct inode *inode = page->mapping->host;
2161         __block_commit_write(inode,page,from,to);
2162         return 0;
2163 }
2164
2165 int generic_commit_write(struct file *file, struct page *page,
2166                 unsigned from, unsigned to)
2167 {
2168         struct inode *inode = page->mapping->host;
2169         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2170         __block_commit_write(inode,page,from,to);
2171         /*
2172          * No need to use i_size_read() here, the i_size
2173          * cannot change under us because we hold i_mutex.
2174          */
2175         if (pos > inode->i_size) {
2176                 i_size_write(inode, pos);
2177                 mark_inode_dirty(inode);
2178         }
2179         return 0;
2180 }
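/*
 * Illustrative sketch (not part of this file): the classic buffered-write
 * wiring for a get_block-based filesystem pairs the two helpers above in its
 * address_space_operations; the "foo" names are hypothetical:
 *
 *	static int foo_prepare_write(struct file *file, struct page *page,
 *				     unsigned from, unsigned to)
 *	{
 *		return block_prepare_write(page, from, to, foo_get_block);
 *	}
 *
 *	static const struct address_space_operations foo_aops = {
 *		.prepare_write	= foo_prepare_write,
 *		.commit_write	= generic_commit_write,
 *		...
 *	};
 */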
2181
2182
2183 /*
2184  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2185  * immediately, while under the page lock.  So it needs a special end_io
2186  * handler which does not touch the bh after unlocking it.
2187  *
2188  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2189  * a race there is benign: unlock_buffer() only uses the bh's address for
2190  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2191  * itself.
2192  */
2193 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2194 {
2195         if (uptodate) {
2196                 set_buffer_uptodate(bh);
2197         } else {
2198                 /* This happens, due to failed READA attempts. */
2199                 clear_buffer_uptodate(bh);
2200         }
2201         unlock_buffer(bh);
2202 }
2203
2204 /*
2205  * On entry, the page is fully not uptodate.
2206  * On exit the page is fully uptodate in the areas outside (from,to)
2207  */
2208 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2209                         get_block_t *get_block)
2210 {
2211         struct inode *inode = page->mapping->host;
2212         const unsigned blkbits = inode->i_blkbits;
2213         const unsigned blocksize = 1 << blkbits;
2214         struct buffer_head map_bh;
2215         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2216         unsigned block_in_page;
2217         unsigned block_start;
2218         sector_t block_in_file;
2219         char *kaddr;
2220         int nr_reads = 0;
2221         int i;
2222         int ret = 0;
2223         int is_mapped_to_disk = 1;
2224
2225         if (PageMappedToDisk(page))
2226                 return 0;
2227
2228         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2229         map_bh.b_page = page;
2230
2231         /*
2232          * We loop across all blocks in the page, whether or not they are
2233          * part of the affected region.  This is so we can discover if the
2234          * page is fully mapped-to-disk.
2235          */
2236         for (block_start = 0, block_in_page = 0;
2237                   block_start < PAGE_CACHE_SIZE;
2238                   block_in_page++, block_start += blocksize) {
2239                 unsigned block_end = block_start + blocksize;
2240                 int create;
2241
2242                 map_bh.b_state = 0;
2243                 create = 1;
2244                 if (block_start >= to)
2245                         create = 0;
2246                 map_bh.b_size = blocksize;
2247                 ret = get_block(inode, block_in_file + block_in_page,
2248                                         &map_bh, create);
2249                 if (ret)
2250                         goto failed;
2251                 if (!buffer_mapped(&map_bh))
2252                         is_mapped_to_disk = 0;
2253                 if (buffer_new(&map_bh))
2254                         unmap_underlying_metadata(map_bh.b_bdev,
2255                                                         map_bh.b_blocknr);
2256                 if (PageUptodate(page))
2257                         continue;
2258                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2259                         kaddr = kmap_atomic(page, KM_USER0);
2260                         if (block_start < from)
2261                                 memset(kaddr+block_start, 0, from-block_start);
2262                         if (block_end > to)
2263                                 memset(kaddr + to, 0, block_end - to);
2264                         flush_dcache_page(page);
2265                         kunmap_atomic(kaddr, KM_USER0);
2266                         continue;
2267                 }
2268                 if (buffer_uptodate(&map_bh))
2269                         continue;       /* reiserfs does this */
2270                 if (block_start < from || block_end > to) {
2271                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2272
2273                         if (!bh) {
2274                                 ret = -ENOMEM;
2275                                 goto failed;
2276                         }
2277                         bh->b_state = map_bh.b_state;
2278                         atomic_set(&bh->b_count, 0);
2279                         bh->b_this_page = NULL;
2280                         bh->b_page = page;
2281                         bh->b_blocknr = map_bh.b_blocknr;
2282                         bh->b_size = blocksize;
2283                         bh->b_data = (char *)(long)block_start;
2284                         bh->b_bdev = map_bh.b_bdev;
2285                         bh->b_private = NULL;
2286                         read_bh[nr_reads++] = bh;
2287                 }
2288         }
2289
2290         if (nr_reads) {
2291                 struct buffer_head *bh;
2292
2293                 /*
2294                  * The page is locked, so these buffers are protected from
2295                  * any VM or truncate activity.  Hence we don't need to care
2296                  * for the buffer_head refcounts.
2297                  */
2298                 for (i = 0; i < nr_reads; i++) {
2299                         bh = read_bh[i];
2300                         lock_buffer(bh);
2301                         bh->b_end_io = end_buffer_read_nobh;
2302                         submit_bh(READ, bh);
2303                 }
2304                 for (i = 0; i < nr_reads; i++) {
2305                         bh = read_bh[i];
2306                         wait_on_buffer(bh);
2307                         if (!buffer_uptodate(bh))
2308                                 ret = -EIO;
2309                         free_buffer_head(bh);
2310                         read_bh[i] = NULL;
2311                 }
2312                 if (ret)
2313                         goto failed;
2314         }
2315
2316         if (is_mapped_to_disk)
2317                 SetPageMappedToDisk(page);
2318
2319         return 0;
2320
2321 failed:
2322         for (i = 0; i < nr_reads; i++) {
2323                 if (read_bh[i])
2324                         free_buffer_head(read_bh[i]);
2325         }
2326
2327         /*
2328          * Error recovery is pretty slack.  Clear the page and mark it dirty
2329          * so we'll later zero out any blocks which _were_ allocated.
2330          */
2331         zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
2332         SetPageUptodate(page);
2333         set_page_dirty(page);
2334         return ret;
2335 }
2336 EXPORT_SYMBOL(nobh_prepare_write);
2337
2338 /*
2339  * Make sure any changes to nobh_commit_write() are reflected in
2340  * nobh_truncate_page(), since it doesn't call commit_write().
2341  */
2342 int nobh_commit_write(struct file *file, struct page *page,
2343                 unsigned from, unsigned to)
2344 {
2345         struct inode *inode = page->mapping->host;
2346         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2347
2348         SetPageUptodate(page);
2349         set_page_dirty(page);
2350         if (pos > inode->i_size) {
2351                 i_size_write(inode, pos);
2352                 mark_inode_dirty(inode);
2353         }
2354         return 0;
2355 }
2356 EXPORT_SYMBOL(nobh_commit_write);
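/*
 * Illustrative sketch (not part of this file): the nobh variants are used as
 * drop-in replacements for the usual prepare/commit pair, typically behind a
 * "nobh"-style mount option; the "foo" names are hypothetical:
 *
 *	static int foo_nobh_prepare_write(struct file *file, struct page *page,
 *					  unsigned from, unsigned to)
 *	{
 *		return nobh_prepare_write(page, from, to, foo_get_block);
 *	}
 *
 *	.prepare_write	= foo_nobh_prepare_write,
 *	.commit_write	= nobh_commit_write,
 */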
2357
2358 /*
2359  * nobh_writepage() - based on block_write_full_page() except
2360  * that it tries to operate without attaching bufferheads to
2361  * the page.
2362  */
2363 int nobh_writepage(struct page *page, get_block_t *get_block,
2364                         struct writeback_control *wbc)
2365 {
2366         struct inode * const inode = page->mapping->host;
2367         loff_t i_size = i_size_read(inode);
2368         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2369         unsigned offset;
2370         int ret;
2371
2372         /* Is the page fully inside i_size? */
2373         if (page->index < end_index)
2374                 goto out;
2375
2376         /* Is the page fully outside i_size? (truncate in progress) */
2377         offset = i_size & (PAGE_CACHE_SIZE-1);
2378         if (page->index >= end_index+1 || !offset) {
2379                 /*
2380                  * The page may have dirty, unmapped buffers.  For example,
2381                  * they may have been added in ext3_writepage().  Make them
2382                  * freeable here, so the page does not leak.
2383                  */
2384 #if 0
2385                 /* Not really sure about this  - do we need this ? */
2386                 if (page->mapping->a_ops->invalidatepage)
2387                         page->mapping->a_ops->invalidatepage(page, offset);
2388 #endif
2389                 unlock_page(page);
2390                 return 0; /* don't care */
2391         }
2392
2393         /*
2394          * The page straddles i_size.  It must be zeroed out on each and every
2395          * writepage invocation because it may be mmapped.  "A file is mapped
2396          * in multiples of the page size.  For a file that is not a multiple of
2397          * the  page size, the remaining memory is zeroed when mapped, and
2398          * writes to that region are not written out to the file."
2399          */
2400         zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
2401 out:
2402         ret = mpage_writepage(page, get_block, wbc);
2403         if (ret == -EAGAIN)
2404                 ret = __block_write_full_page(inode, page, get_block, wbc);
2405         return ret;
2406 }
2407 EXPORT_SYMBOL(nobh_writepage);
2408
2409 /*
2410  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2411  */
2412 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2413 {
2414         struct inode *inode = mapping->host;
2415         unsigned blocksize = 1 << inode->i_blkbits;
2416         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2417         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2418         unsigned to;
2419         struct page *page;
2420         const struct address_space_operations *a_ops = mapping->a_ops;
2421         int ret = 0;
2422
2423         if ((offset & (blocksize - 1)) == 0)
2424                 goto out;
2425
2426         ret = -ENOMEM;
2427         page = grab_cache_page(mapping, index);
2428         if (!page)
2429                 goto out;
2430
2431         to = (offset + blocksize) & ~(blocksize - 1);
2432         ret = a_ops->prepare_write(NULL, page, offset, to);
2433         if (ret == 0) {
2434                 zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
2435                                 KM_USER0);
2436                 /*
2437                  * It would be more correct to call aops->commit_write()
2438                  * here, but this is more efficient.
2439                  */
2440                 SetPageUptodate(page);
2441                 set_page_dirty(page);
2442         }
2443         unlock_page(page);
2444         page_cache_release(page);
2445 out:
2446         return ret;
2447 }
2448 EXPORT_SYMBOL(nobh_truncate_page);
2449
2450 int block_truncate_page(struct address_space *mapping,
2451                         loff_t from, get_block_t *get_block)
2452 {
2453         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2454         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2455         unsigned blocksize;
2456         sector_t iblock;
2457         unsigned length, pos;
2458         struct inode *inode = mapping->host;
2459         struct page *page;
2460         struct buffer_head *bh;
2461         int err;
2462
2463         blocksize = 1 << inode->i_blkbits;
2464         length = offset & (blocksize - 1);
2465
2466         /* Block boundary? Nothing to do */
2467         if (!length)
2468                 return 0;
2469
2470         length = blocksize - length;
2471         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2472         
2473         page = grab_cache_page(mapping, index);
2474         err = -ENOMEM;
2475         if (!page)
2476                 goto out;
2477
2478         if (!page_has_buffers(page))
2479                 create_empty_buffers(page, blocksize, 0);
2480
2481         /* Find the buffer that contains "offset" */
2482         bh = page_buffers(page);
2483         pos = blocksize;
2484         while (offset >= pos) {
2485                 bh = bh->b_this_page;
2486                 iblock++;
2487                 pos += blocksize;
2488         }
2489
2490         err = 0;
2491         if (!buffer_mapped(bh)) {
2492                 WARN_ON(bh->b_size != blocksize);
2493                 err = get_block(inode, iblock, bh, 0);
2494                 if (err)
2495                         goto unlock;
2496                 /* unmapped? It's a hole - nothing to do */
2497                 if (!buffer_mapped(bh))
2498                         goto unlock;
2499         }
2500
2501         /* Ok, it's mapped. Make sure it's up-to-date */
2502         if (PageUptodate(page))
2503                 set_buffer_uptodate(bh);
2504
2505         if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2506                 err = -EIO;
2507                 ll_rw_block(READ, 1, &bh);
2508                 wait_on_buffer(bh);
2509                 /* Uhhuh. Read error. Complain and punt. */
2510                 if (!buffer_uptodate(bh))
2511                         goto unlock;
2512         }
2513
2514         zero_user_page(page, offset, length, KM_USER0);
2515         mark_buffer_dirty(bh);
2516         err = 0;
2517
2518 unlock:
2519         unlock_page(page);
2520         page_cache_release(page);
2521 out:
2522         return err;
2523 }
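/*
 * Illustrative sketch (not part of this file): ->truncate implementations
 * normally zero the partial final block before trimming their block mapping;
 * "foo_get_block" is a hypothetical get_block_t:
 *
 *	block_truncate_page(inode->i_mapping, inode->i_size, foo_get_block);
 */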
2524
2525 /*
2526  * The generic ->writepage function for buffer-backed address_spaces
2527  */
2528 int block_write_full_page(struct page *page, get_block_t *get_block,
2529                         struct writeback_control *wbc)
2530 {
2531         struct inode * const inode = page->mapping->host;
2532         loff_t i_size = i_size_read(inode);
2533         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2534         unsigned offset;
2535
2536         /* Is the page fully inside i_size? */
2537         if (page->index < end_index)
2538                 return __block_write_full_page(inode, page, get_block, wbc);
2539
2540         /* Is the page fully outside i_size? (truncate in progress) */
2541         offset = i_size & (PAGE_CACHE_SIZE-1);
2542         if (page->index >= end_index+1 || !offset) {
2543                 /*
2544                  * The page may have dirty, unmapped buffers.  For example,
2545                  * they may have been added in ext3_writepage().  Make them
2546                  * freeable here, so the page does not leak.
2547                  */
2548                 do_invalidatepage(page, 0);
2549                 unlock_page(page);
2550                 return 0; /* don't care */
2551         }
2552
2553         /*
2554          * The page straddles i_size.  It must be zeroed out on each and every
2555  * writepage invocation because it may be mmapped.  "A file is mapped
2556          * in multiples of the page size.  For a file that is not a multiple of
2557          * the  page size, the remaining memory is zeroed when mapped, and
2558          * writes to that region are not written out to the file."
2559          */
2560         zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
2561         return __block_write_full_page(inode, page, get_block, wbc);
2562 }
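/*
 * Illustrative sketch (not part of this file): most get_block-based
 * filesystems implement ->writepage as a one-line wrapper around this
 * function; the "foo" names are hypothetical:
 *
 *	static int foo_writepage(struct page *page, struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, foo_get_block, wbc);
 *	}
 */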
2563
2564 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2565                             get_block_t *get_block)
2566 {
2567         struct buffer_head tmp;
2568         struct inode *inode = mapping->host;
2569         tmp.b_state = 0;
2570         tmp.b_blocknr = 0;
2571         tmp.b_size = 1 << inode->i_blkbits;
2572         get_block(inode, block, &tmp, 0);
2573         return tmp.b_blocknr;
2574 }
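/*
 * Illustrative sketch (not part of this file): ->bmap (used by the FIBMAP
 * ioctl and swap-file setup) is usually a thin wrapper over this helper;
 * the "foo" names are hypothetical:
 *
 *	static sector_t foo_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, foo_get_block);
 *	}
 */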
2575
2576 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2577 {
2578         struct buffer_head *bh = bio->bi_private;
2579
2580         if (bio->bi_size)
2581                 return 1;
2582
2583         if (err == -EOPNOTSUPP) {
2584                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2585                 set_bit(BH_Eopnotsupp, &bh->b_state);
2586         }
2587
2588         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2589         bio_put(bio);
2590         return 0;
2591 }
2592
2593 int submit_bh(int rw, struct buffer_head * bh)
2594 {
2595         struct bio *bio;
2596         int ret = 0;
2597
2598         BUG_ON(!buffer_locked(bh));
2599         BUG_ON(!buffer_mapped(bh));
2600         BUG_ON(!bh->b_end_io);
2601
2602         if (buffer_ordered(bh) && (rw == WRITE))
2603                 rw = WRITE_BARRIER;
2604
2605         /*
2606          * Only clear out a write error when rewriting, should this
2607          * include WRITE_SYNC as well?
2608          */
2609         if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2610                 clear_buffer_write_io_error(bh);
2611
2612         /*
2613          * from here on down, it's all bio -- do the initial mapping,
2614          * submit_bio -> generic_make_request may further map this bio around
2615          */
2616         bio = bio_alloc(GFP_NOIO, 1);
2617
2618         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2619         bio->bi_bdev = bh->b_bdev;
2620         bio->bi_io_vec[0].bv_page = bh->b_page;
2621         bio->bi_io_vec[0].bv_len = bh->b_size;
2622         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2623
2624         bio->bi_vcnt = 1;
2625         bio->bi_idx = 0;
2626         bio->bi_size = bh->b_size;
2627
2628         bio->bi_end_io = end_bio_bh_io_sync;
2629         bio->bi_private = bh;
2630
2631         bio_get(bio);
2632         submit_bio(rw, bio);
2633
2634         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2635                 ret = -EOPNOTSUPP;
2636
2637         bio_put(bio);
2638         return ret;
2639 }
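/*
 * Illustrative sketch (not part of this file): a caller of submit_bh() must
 * hold the buffer lock, have the buffer mapped and supply a completion
 * handler.  Starting an asynchronous write of a dirty buffer, in the same
 * way ll_rw_block() and sync_dirty_buffer() below do, looks like:
 *
 *	lock_buffer(bh);
 *	if (test_clear_buffer_dirty(bh)) {
 *		get_bh(bh);
 *		bh->b_end_io = end_buffer_write_sync;
 *		submit_bh(WRITE, bh);
 *	} else
 *		unlock_buffer(bh);
 */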
2640
2641 /**
2642  * ll_rw_block: low-level access to block devices (DEPRECATED)
2643  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2644  * @nr: number of &struct buffer_heads in the array
2645  * @bhs: array of pointers to &struct buffer_head
2646  *
2647  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2648  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2649  * %SWRITE is like %WRITE except that we make sure the *current* data in
2650  * the buffers is sent to disk. The fourth %READA option is described in the
2651  * documentation for generic_make_request(), which ll_rw_block() calls.
2652  *
2653  * This function drops any buffer that it cannot get a lock on (with the
2654  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2655  * clean when doing a write request, and any buffer that appears to be
2656  * up-to-date when doing read request.  Further it marks as clean buffers that
2657  * are processed for writing (the buffer cache won't assume that they are
2658  * actually clean until the buffer gets unlocked).
2659  *
2660  * ll_rw_block sets b_end_io to simple completion handler that marks
2661  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2662  * any waiters. 
2663  *
2664  * All of the buffers must be for the same device, and must also be a
2665  * multiple of the current approved size for the device.
2666  */
2667 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2668 {
2669         int i;
2670
2671         for (i = 0; i < nr; i++) {
2672                 struct buffer_head *bh = bhs[i];
2673
2674                 if (rw == SWRITE)
2675                         lock_buffer(bh);
2676                 else if (test_set_buffer_locked(bh))
2677                         continue;
2678
2679                 if (rw == WRITE || rw == SWRITE) {
2680                         if (test_clear_buffer_dirty(bh)) {
2681                                 bh->b_end_io = end_buffer_write_sync;
2682                                 get_bh(bh);
2683                                 submit_bh(WRITE, bh);
2684                                 continue;
2685                         }
2686                 } else {
2687                         if (!buffer_uptodate(bh)) {
2688                                 bh->b_end_io = end_buffer_read_sync;
2689                                 get_bh(bh);
2690                                 submit_bh(rw, bh);
2691                                 continue;
2692                         }
2693                 }
2694                 unlock_buffer(bh);
2695         }
2696 }
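/*
 * Illustrative sketch (not part of this file): the common synchronous-read
 * idiom built on ll_rw_block(), as also used by the helpers above:
 *
 *	ll_rw_block(READ, 1, &bh);
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		return -EIO;
 */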
2697
2698 /*
2699  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2700  * and then start new I/O and then wait upon it.  The caller must have a ref on
2701  * the buffer_head.
2702  */
2703 int sync_dirty_buffer(struct buffer_head *bh)
2704 {
2705         int ret = 0;
2706
2707         WARN_ON(atomic_read(&bh->b_count) < 1);
2708         lock_buffer(bh);
2709         if (test_clear_buffer_dirty(bh)) {
2710                 get_bh(bh);
2711                 bh->b_end_io = end_buffer_write_sync;
2712                 ret = submit_bh(WRITE, bh);
2713                 wait_on_buffer(bh);
2714                 if (buffer_eopnotsupp(bh)) {
2715                         clear_buffer_eopnotsupp(bh);
2716                         ret = -EOPNOTSUPP;
2717                 }
2718                 if (!ret && !buffer_uptodate(bh))
2719                         ret = -EIO;
2720         } else {
2721                 unlock_buffer(bh);
2722         }
2723         return ret;
2724 }
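/*
 * Illustrative sketch (not part of this file): metadata updates that must
 * reach disk before the caller proceeds are typically done as:
 *
 *	memcpy(bh->b_data + offset, data, len);
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);
 *	brelse(bh);
 *
 * where the memcpy() stands in for whatever metadata change the caller makes
 * (offset, data and len are hypothetical).
 */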
2725
2726 /*
2727  * try_to_free_buffers() checks if all the buffers on this particular page
2728  * are unused, and releases them if so.
2729  *
2730  * Exclusion against try_to_free_buffers may be obtained by either
2731  * locking the page or by holding its mapping's private_lock.
2732  *
2733  * If the page is dirty but all the buffers are clean then we need to
2734  * be sure to mark the page clean as well.  This is because the page
2735  * may be against a block device, and a later reattachment of buffers
2736  * to a dirty page will set *all* buffers dirty, which would corrupt
2737  * filesystem data on the same device.
2738  *
2739  * The same applies to regular filesystem pages: if all the buffers are
2740  * clean then we set the page clean and proceed.  To do that, we require
2741  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2742  * private_lock.
2743  *
2744  * try_to_free_buffers() is non-blocking.
2745  */
2746 static inline int buffer_busy(struct buffer_head *bh)
2747 {
2748         return atomic_read(&bh->b_count) |
2749                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2750 }
2751
2752 static int
2753 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2754 {
2755         struct buffer_head *head = page_buffers(page);
2756         struct buffer_head *bh;
2757
2758         bh = head;
2759         do {
2760                 if (buffer_write_io_error(bh) && page->mapping)
2761                         set_bit(AS_EIO, &page->mapping->flags);
2762                 if (buffer_busy(bh))
2763                         goto failed;
2764                 bh = bh->b_this_page;
2765         } while (bh != head);
2766
2767         do {
2768                 struct buffer_head *next = bh->b_this_page;
2769
2770                 if (!list_empty(&bh->b_assoc_buffers))
2771                         __remove_assoc_queue(bh);
2772                 bh = next;
2773         } while (bh != head);
2774         *buffers_to_free = head;
2775         __clear_page_buffers(page);
2776         return 1;
2777 failed:
2778         return 0;
2779 }
2780
2781 int try_to_free_buffers(struct page *page)
2782 {
2783         struct address_space * const mapping = page->mapping;
2784         struct buffer_head *buffers_to_free = NULL;
2785         int ret = 0;
2786
2787         BUG_ON(!PageLocked(page));
2788         if (PageWriteback(page))
2789                 return 0;
2790
2791         if (mapping == NULL) {          /* can this still happen? */
2792                 ret = drop_buffers(page, &buffers_to_free);
2793                 goto out;
2794         }
2795
2796         spin_lock(&mapping->private_lock);
2797         ret = drop_buffers(page, &buffers_to_free);
2798
2799         /*
2800          * If the filesystem writes its buffers by hand (eg ext3)
2801          * then we can have clean buffers against a dirty page.  We
2802          * clean the page here; otherwise the VM will never notice
2803          * that the filesystem did any IO at all.
2804          *
2805          * Also, during truncate, discard_buffer will have marked all
2806          * the page's buffers clean.  We discover that here and clean
2807          * the page also.
2808          *
2809          * private_lock must be held over this entire operation in order
2810          * to synchronise against __set_page_dirty_buffers and prevent the
2811          * dirty bit from being lost.
2812          */
2813         if (ret)
2814                 cancel_dirty_page(page, PAGE_CACHE_SIZE);
2815         spin_unlock(&mapping->private_lock);
2816 out:
2817         if (buffers_to_free) {
2818                 struct buffer_head *bh = buffers_to_free;
2819
2820                 do {
2821                         struct buffer_head *next = bh->b_this_page;
2822                         free_buffer_head(bh);
2823                         bh = next;
2824                 } while (bh != buffers_to_free);
2825         }
2826         return ret;
2827 }
2828 EXPORT_SYMBOL(try_to_free_buffers);
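/*
 * Illustrative sketch (not part of this file): filesystems that need extra
 * checks before letting buffers go (journalled ones, for instance) wrap this
 * in their ->releasepage; others fall back to it via try_to_release_page().
 * The "foo" names are hypothetical:
 *
 *	static int foo_releasepage(struct page *page, gfp_t gfp)
 *	{
 *		if (foo_page_has_pinned_buffers(page))
 *			return 0;
 *		return try_to_free_buffers(page);
 *	}
 */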
2829
2830 void block_sync_page(struct page *page)
2831 {
2832         struct address_space *mapping;
2833
2834         smp_mb();
2835         mapping = page_mapping(page);
2836         if (mapping)
2837                 blk_run_backing_dev(mapping->backing_dev_info, page);
2838 }
2839
2840 /*
2841  * There are no bdflush tunables left.  But distributions are
2842  * still running obsolete flush daemons, so we terminate them here.
2843  *
2844  * Use of bdflush() is deprecated and will be removed in a future kernel.
2845  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2846  */
2847 asmlinkage long sys_bdflush(int func, long data)
2848 {
2849         static int msg_count;
2850
2851         if (!capable(CAP_SYS_ADMIN))
2852                 return -EPERM;
2853
2854         if (msg_count < 5) {
2855                 msg_count++;
2856                 printk(KERN_INFO
2857                         "warning: process `%s' used the obsolete bdflush"
2858                         " system call\n", current->comm);
2859                 printk(KERN_INFO "Fix your initscripts?\n");
2860         }
2861
2862         if (func == 1)
2863                 do_exit(0);
2864         return 0;
2865 }
2866
2867 /*
2868  * Buffer-head allocation
2869  */
2870 static struct kmem_cache *bh_cachep;
2871
2872 /*
2873  * Once the number of bh's in the machine exceeds this level, we start
2874  * stripping them in writeback.
2875  */
2876 static int max_buffer_heads;
2877
2878 int buffer_heads_over_limit;
2879
2880 struct bh_accounting {
2881         int nr;                 /* Number of live bh's */
2882         int ratelimit;          /* Limit cacheline bouncing */
2883 };
2884
2885 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2886
2887 static void recalc_bh_state(void)
2888 {
2889         int i;
2890         int tot = 0;
2891
2892         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2893                 return;
2894         __get_cpu_var(bh_accounting).ratelimit = 0;
2895         for_each_online_cpu(i)
2896                 tot += per_cpu(bh_accounting, i).nr;
2897         buffer_heads_over_limit = (tot > max_buffer_heads);
2898 }
2899         
2900 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2901 {
2902         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
2903         if (ret) {
2904                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
2905                 get_cpu_var(bh_accounting).nr++;
2906                 recalc_bh_state();
2907                 put_cpu_var(bh_accounting);
2908         }
2909         return ret;
2910 }
2911 EXPORT_SYMBOL(alloc_buffer_head);
2912
2913 void free_buffer_head(struct buffer_head *bh)
2914 {
2915         BUG_ON(!list_empty(&bh->b_assoc_buffers));
2916         kmem_cache_free(bh_cachep, bh);
2917         get_cpu_var(bh_accounting).nr--;
2918         recalc_bh_state();
2919         put_cpu_var(bh_accounting);
2920 }
2921 EXPORT_SYMBOL(free_buffer_head);
2922
2923 static void buffer_exit_cpu(int cpu)
2924 {
2925         int i;
2926         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
2927
2928         for (i = 0; i < BH_LRU_SIZE; i++) {
2929                 brelse(b->bhs[i]);
2930                 b->bhs[i] = NULL;
2931         }
2932         get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
2933         per_cpu(bh_accounting, cpu).nr = 0;
2934         put_cpu_var(bh_accounting);
2935 }
2936
2937 static int buffer_cpu_notify(struct notifier_block *self,
2938                               unsigned long action, void *hcpu)
2939 {
2940         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
2941                 buffer_exit_cpu((unsigned long)hcpu);
2942         return NOTIFY_OK;
2943 }
2944
2945 void __init buffer_init(void)
2946 {
2947         int nrpages;
2948
2949         bh_cachep = KMEM_CACHE(buffer_head,
2950                         SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
2951
2952         /*
2953          * Limit the bh occupancy to 10% of ZONE_NORMAL
2954          */
2955         nrpages = (nr_free_buffer_pages() * 10) / 100;
2956         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
2957         hotcpu_notifier(buffer_cpu_notify, 0);
2958 }
2959
2960 EXPORT_SYMBOL(__bforget);
2961 EXPORT_SYMBOL(__brelse);
2962 EXPORT_SYMBOL(__wait_on_buffer);
2963 EXPORT_SYMBOL(block_commit_write);
2964 EXPORT_SYMBOL(block_prepare_write);
2965 EXPORT_SYMBOL(block_read_full_page);
2966 EXPORT_SYMBOL(block_sync_page);
2967 EXPORT_SYMBOL(block_truncate_page);
2968 EXPORT_SYMBOL(block_write_full_page);
2969 EXPORT_SYMBOL(cont_prepare_write);
2970 EXPORT_SYMBOL(end_buffer_read_sync);
2971 EXPORT_SYMBOL(end_buffer_write_sync);
2972 EXPORT_SYMBOL(file_fsync);
2973 EXPORT_SYMBOL(fsync_bdev);
2974 EXPORT_SYMBOL(generic_block_bmap);
2975 EXPORT_SYMBOL(generic_commit_write);
2976 EXPORT_SYMBOL(generic_cont_expand);
2977 EXPORT_SYMBOL(generic_cont_expand_simple);
2978 EXPORT_SYMBOL(init_buffer);
2979 EXPORT_SYMBOL(invalidate_bdev);
2980 EXPORT_SYMBOL(ll_rw_block);
2981 EXPORT_SYMBOL(mark_buffer_dirty);
2982 EXPORT_SYMBOL(submit_bh);
2983 EXPORT_SYMBOL(sync_dirty_buffer);
2984 EXPORT_SYMBOL(unlock_buffer);