/*
 *  linux/fs/ext3/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *      (sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *      (jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/ext3_jbd.h>
#include <linux/jbd.h>
#include <linux/smp_lock.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include "xattr.h"
#include "acl.h"

static int ext3_writepage_trans_blocks(struct inode *inode);

/*
 * Test whether an inode is a fast symlink.
 */
static inline int ext3_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks = EXT3_I(inode)->i_file_acl ?
                (inode->i_sb->s_blocksize >> 9) : 0;

        return (S_ISLNK(inode->i_mode) &&
                inode->i_blocks - ea_blocks == 0);
}

/* The ext3 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (e.g. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 */

int ext3_forget(handle_t *handle, int is_metadata,
                       struct inode *inode, struct buffer_head *bh,
                       int blocknr)
{
        int err;

        might_sleep();

        BUFFER_TRACE(bh, "enter");

        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
                  "data mode %lx\n",
                  bh, is_metadata, inode->i_mode,
                  test_opt(inode->i_sb, DATA_FLAGS));

        /* Never use the revoke function if we are doing full data
         * journaling: there is no need to, and a V1 superblock won't
         * support it.  Otherwise, only skip the revoke on un-journaled
         * data blocks. */

        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
            (!is_metadata && !ext3_should_journal_data(inode))) {
                if (bh) {
                        BUFFER_TRACE(bh, "call journal_forget");
                        return ext3_journal_forget(handle, bh);
                }
                return 0;
        }

        /*
         * data!=journal && (is_metadata || should_journal_data(inode))
         */
        BUFFER_TRACE(bh, "call ext3_journal_revoke");
        err = ext3_journal_revoke(handle, blocknr, bh);
        if (err)
                ext3_abort(inode->i_sb, __FUNCTION__,
                           "error %d when attempting revoke", err);
        BUFFER_TRACE(bh, "exit");
        return err;
}
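
/*
 * A condensed view of the branch above (restating the code, not changing
 * it):
 *
 *      data=journal mount               -> journal_forget (everything is
 *                                          journaled; revoke is unsupported
 *                                          on V1 superblocks)
 *      metadata block                   -> journal_revoke
 *      data block, inode journals data  -> journal_revoke
 *      un-journaled data block          -> journal_forget
 */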

/*
 * Work out how many blocks we need to progress with the next chunk of a
 * truncate transaction.
 */

static unsigned long blocks_for_truncate(struct inode *inode)
{
        unsigned long needed;

        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

        /* Give ourselves just enough room to cope with inodes in which
         * i_blocks is corrupt: we've seen disk corruptions in the past
         * which resulted in random data in an inode which looked enough
         * like a regular file for ext3 to try to delete it.  Things
         * will go a bit crazy if that happens, but at least we should
         * try not to panic the whole kernel. */
        if (needed < 2)
                needed = 2;

        /* But we need to bound the transaction so we don't overflow the
         * journal. */
        if (needed > EXT3_MAX_TRANS_DATA)
                needed = EXT3_MAX_TRANS_DATA;

        return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
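
/*
 * Worked example (illustrative): with 4KB blocks, s_blocksize_bits is 12,
 * so i_blocks (counted in 512-byte sectors) is shifted right by 3.  A file
 * with i_blocks == 8000 yields needed == 1000, which the clamp above cuts
 * down to EXT3_MAX_TRANS_DATA; EXT3_DATA_TRANS_BLOCKS() is then added for
 * the per-transaction metadata (indirect, bitmap, group-descriptor blocks
 * and the like).
 */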

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */

static handle_t *start_transaction(struct inode *inode)
{
        handle_t *result;

        result = ext3_journal_start(inode, blocks_for_truncate(inode));
        if (!IS_ERR(result))
                return result;

        ext3_std_error(inode->i_sb, PTR_ERR(result));
        return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
        if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
                return 0;
        if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
                return 0;
        return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
{
        jbd_debug(2, "restarting handle %p\n", handle);
        return ext3_journal_restart(handle, blocks_for_truncate(inode));
}
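
/*
 * Sketch of the intended caller pattern (simplified; the truncate code that
 * uses these helpers dirties all modified state against the old transaction
 * before restarting it):
 *
 *      if (try_to_extend_transaction(handle, inode)) {
 *              ext3_mark_inode_dirty(handle, inode);
 *              ext3_journal_test_restart(handle, inode);
 *      }
 */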

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext3_delete_inode (struct inode * inode)
{
        handle_t *handle;

        if (is_bad_inode(inode))
                goto no_delete;

        handle = start_transaction(inode);
        if (IS_ERR(handle)) {
                /* If we're going to skip the normal cleanup, we still
                 * need to make sure that the in-core orphan linked list
                 * is properly cleaned up. */
                ext3_orphan_del(NULL, inode);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                handle->h_sync = 1;
        inode->i_size = 0;
        if (inode->i_blocks)
                ext3_truncate(inode);
        /*
         * Kill off the orphan record which ext3_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext3_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext3_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext3_orphan_del(handle, inode);
        EXT3_I(inode)->i_dtime  = get_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext3_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                clear_inode(inode);
        else
                ext3_free_inode(handle, inode);
        ext3_journal_stop(handle);
        return;
no_delete:
        clear_inode(inode);     /* We must guarantee clearing of inode... */
}

static int ext3_alloc_block (handle_t *handle,
                        struct inode * inode, unsigned long goal, int *err)
{
        unsigned long result;

        result = ext3_new_block(handle, inode, goal, err);
        return result;
}

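/*
 * Indirect describes one step in the chain from the inode to a data block:
 * @p is the address of the block-number slot (in i_data or in an indirect
 * block's b_data), @key is the value read from that slot, and @bh is the
 * buffer_head hosting the slot (NULL when the slot lives in the inode
 * itself).
 */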
typedef struct {
        __le32  *p;
        __le32  key;
        struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
        p->key = *(p->p = v);
        p->bh = bh;
}

static inline int verify_chain(Indirect *from, Indirect *to)
{
        while (from <= to && from->key == *from->p)
                from++;
        return (from > to);
}
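
/*
 * verify_chain() is the read side of a lock-free scheme: readers cache each
 * pointer in ->key and later recheck it against the live slot *->p.  A
 * failed check means truncate (or another allocator) changed the tree
 * underneath us, so the caller drops its buffers and re-walks, e.g.
 * (sketch of the retry in ext3_get_block_handle() below):
 *
 *      if (err == -EAGAIN || !verify_chain(chain, partial))
 *              ... release partial..chain, call ext3_get_branch() again ...
 */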

/**
 *      ext3_block_to_path - parse the block number into array of offsets
 *      @inode: inode in question (we are only interested in its superblock)
 *      @i_block: block number to be parsed
 *      @offsets: array to store the offsets in
 *      @boundary: set this non-zero if the referred-to block is likely to be
 *             followed (on disk) by an indirect block.
 *
 *      To store the locations of file's data ext3 uses a data structure common
 *      for UNIX filesystems - tree of pointers anchored in the inode, with
 *      data blocks at leaves and indirect blocks in intermediate nodes.
 *      This function translates the block number into path in that tree -
 *      return value is the path length and @offsets[n] is the offset of
 *      the pointer to the (n+1)-th node in the n-th one. If @block is out
 *      of range (negative or too large), a warning is printed and zero is
 *      returned.
 *
 *      Note: function doesn't find node addresses, so no IO is needed. All
 *      we need to know is the capacity of indirect blocks (taken from the
 *      inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext3_block_to_path(struct inode *inode,
                        long i_block, int offsets[4], int *boundary)
{
        int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
        const long direct_blocks = EXT3_NDIR_BLOCKS,
                indirect_blocks = ptrs,
                double_blocks = (1 << (ptrs_bits * 2));
        int n = 0;
        int final = 0;

        if (i_block < 0) {
                ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
        } else if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ( (i_block -= direct_blocks) < indirect_blocks) {
                offsets[n++] = EXT3_IND_BLOCK;
                offsets[n++] = i_block;
                final = ptrs;
        } else if ((i_block -= indirect_blocks) < double_blocks) {
                offsets[n++] = EXT3_DIND_BLOCK;
                offsets[n++] = i_block >> ptrs_bits;
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
                offsets[n++] = EXT3_TIND_BLOCK;
                offsets[n++] = i_block >> (ptrs_bits * 2);
                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
                ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
        }
        if (boundary)
                *boundary = (i_block & (ptrs - 1)) == (final - 1);
        return n;
}
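
/*
 * Worked example (assuming 4KB blocks, so ptrs == 1024, and
 * EXT3_NDIR_BLOCKS == 12):
 *
 *      i_block == 5    -> offsets = { 5 },                       depth 1
 *      i_block == 12   -> offsets = { EXT3_IND_BLOCK, 0 },       depth 2
 *      i_block == 5000 -> offsets = { EXT3_DIND_BLOCK, 3, 892 }, depth 3
 *
 * (5000 - 12 - 1024 == 3964; 3964 >> 10 == 3 and 3964 & 1023 == 892.)
 */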

/**
 *      ext3_get_branch - read the chain of indirect blocks leading to data
 *      @inode: inode in question
 *      @depth: depth of the chain (1 - direct pointer, etc.)
 *      @offsets: offsets of pointers in inode/indirect blocks
 *      @chain: place to store the result
 *      @err: here we store the error value
 *
 *      Function fills the array of triples <key, p, bh> and returns %NULL
 *      if everything went OK or the pointer to the last filled triple
 *      (incomplete one) otherwise. Upon the return chain[i].key contains
 *      the number of (i+1)-th block in the chain (as it is stored in memory,
 *      i.e. little-endian 32-bit), chain[i].p contains the address of that
 *      number (it points into struct inode for i==0 and into the bh->b_data
 *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *      block for i>0 and NULL for i==0. In other words, it holds the block
 *      numbers of the chain, addresses they were taken from (and where we can
 *      verify that chain did not change) and buffer_heads hosting these
 *      numbers.
 *
 *      Function stops when it stumbles upon zero pointer (absent block)
 *              (pointer to last triple returned, *@err == 0)
 *      or when it gets an IO error reading an indirect block
 *              (ditto, *@err == -EIO)
 *      or when it notices that chain had been changed while it was reading
 *              (ditto, *@err == -EAGAIN)
 *      or when it reads all @depth-1 indirect blocks successfully and finds
 *      the whole chain, all the way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
                                 Indirect chain[4], int *err)
{
        struct super_block *sb = inode->i_sb;
        Indirect *p = chain;
        struct buffer_head *bh;

        *err = 0;
        /* i_data is not going away, no lock needed */
        add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
        if (!p->key)
                goto no_block;
        while (--depth) {
                bh = sb_bread(sb, le32_to_cpu(p->key));
                if (!bh)
                        goto failure;
                /* Reader: pointers */
                if (!verify_chain(chain, p))
                        goto changed;
                add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
                        goto no_block;
        }
        return NULL;

changed:
        brelse(bh);
        *err = -EAGAIN;
        goto no_block;
failure:
        *err = -EIO;
no_block:
        return p;
}

/**
 *      ext3_find_near - find a place for allocation with sufficient locality
 *      @inode: owner
 *      @ind: descriptor of indirect block.
 *
 *      This function returns the preferred place for block allocation.
 *      It is used when the heuristic for sequential allocation fails.
 *      Rules are:
 *        + if there is a block to the left of our position - allocate near it.
 *        + if pointer will live in indirect block - allocate near that block.
 *        + if pointer will live in inode - allocate in the same
 *          cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.   The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 *      Caller must make sure that @ind is valid and will stay that way.
 */

static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
{
        struct ext3_inode_info *ei = EXT3_I(inode);
        __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
        __le32 *p;
        unsigned long bg_start;
        unsigned long colour;

        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--)
                if (*p)
                        return le32_to_cpu(*p);

        /* No such thing, so let's try location of indirect block */
        if (ind->bh)
                return ind->bh->b_blocknr;

        /*
         * Is it going to be referred to from the inode itself? OK, just put
         * it into the same cylinder group then.
         */
        bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
                le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
        colour = (current->pid % 16) *
                        (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        return bg_start + colour;
}
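
/*
 * Example of the colouring above (illustrative numbers): with 32768 blocks
 * per group, a task with pid 1234 gets colour (1234 % 16) * 2048 == 4096,
 * while pid 1235 gets 6144, so concurrent writers to different inodes in
 * the same group start their allocations in different regions.
 */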

/**
 *      ext3_find_goal - find a preferred place for allocation.
 *      @inode: owner
 *      @block:  block we want
 *      @chain:  chain of indirect blocks
 *      @partial: pointer to the last triple within a chain
 *
 *      Returns the preferred place for block allocation.
 */

static unsigned long ext3_find_goal(struct inode *inode, long block,
                Indirect chain[4], Indirect *partial)
{
        struct ext3_block_alloc_info *block_i =  EXT3_I(inode)->i_block_alloc_info;

        /*
         * try the heuristic for sequential allocation,
         * failing that at least try to get decent locality.
         */
        if (block_i && (block == block_i->last_alloc_logical_block + 1)
                && (block_i->last_alloc_physical_block != 0)) {
                return block_i->last_alloc_physical_block + 1;
        }

        return ext3_find_near(inode, partial);
}

/**
 *      ext3_alloc_branch - allocate and set up a chain of blocks.
 *      @inode: owner
 *      @num: depth of the chain (number of blocks to allocate)
 *      @offsets: offsets (in the blocks) to store the pointers to next.
 *      @branch: place to store the chain in.
 *
 *      This function allocates @num blocks, zeroes out all but the last one,
 *      links them into chain and (if we are synchronous) writes them to disk.
 *      In other words, it prepares a branch that can be spliced onto the
 *      inode. It stores the information about that chain in the branch[], in
 *      the same format as ext3_get_branch() would do. We are calling it after
 *      we had read the existing part of chain and partial points to the last
 *      triple of that (one with zero ->key). Upon the exit we have the same
 *      picture as after the successful ext3_get_block(), except that in one
 *      place chain is disconnected - *branch->p is still zero (we did not
 *      set the last link), but branch->key contains the number that should
 *      be placed into *branch->p to fill that gap.
 *
 *      If allocation fails we free all blocks we've allocated (and forget
 *      their buffer_heads) and return the error value from the failed
 *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 *      as described above and return 0.
 */

static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
                             int num,
                             unsigned long goal,
                             int *offsets,
                             Indirect *branch)
{
        int blocksize = inode->i_sb->s_blocksize;
        int n = 0, keys = 0;
        int err = 0;
        int i;
        int parent = ext3_alloc_block(handle, inode, goal, &err);

        branch[0].key = cpu_to_le32(parent);
        if (parent) {
                for (n = 1; n < num; n++) {
                        struct buffer_head *bh;
                        /* Allocate the next block */
                        int nr = ext3_alloc_block(handle, inode, parent, &err);
                        if (!nr)
                                break;
                        branch[n].key = cpu_to_le32(nr);
                        keys = n+1;

                        /*
                         * Get buffer_head for parent block, zero it out
                         * and set the pointer to new one, then send
                         * parent to disk.
                         */
                        bh = sb_getblk(inode->i_sb, parent);
                        branch[n].bh = bh;
                        lock_buffer(bh);
                        BUFFER_TRACE(bh, "call get_create_access");
                        err = ext3_journal_get_create_access(handle, bh);
                        if (err) {
                                unlock_buffer(bh);
                                brelse(bh);
                                break;
                        }

                        memset(bh->b_data, 0, blocksize);
                        branch[n].p = (__le32*) bh->b_data + offsets[n];
                        *branch[n].p = branch[n].key;
                        BUFFER_TRACE(bh, "marking uptodate");
                        set_buffer_uptodate(bh);
                        unlock_buffer(bh);

                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
                        err = ext3_journal_dirty_metadata(handle, bh);
                        if (err)
                                break;

                        parent = nr;
                }
        }
        if (n == num)
                return 0;

        /* Allocation failed, free what we already allocated */
        for (i = 1; i < keys; i++) {
                BUFFER_TRACE(branch[i].bh, "call journal_forget");
                ext3_journal_forget(handle, branch[i].bh);
        }
        for (i = 0; i < keys; i++)
                ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
        return err;
}

/**
 *      ext3_splice_branch - splice the allocated branch onto inode.
 *      @inode: owner
 *      @block: (logical) number of block we are adding
 *      @chain: chain of indirect blocks (with a missing link - see
 *              ext3_alloc_branch)
 *      @where: location of missing link
 *      @num:   number of blocks we are adding
 *
 *      This function fills the missing link and does all housekeeping needed in
 *      inode (->i_blocks, etc.). In case of success we end up with the full
 *      chain to new block and return 0.
 */

static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
                              Indirect chain[4], Indirect *where, int num)
{
        int i;
        int err = 0;
        struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;

        /*
         * If we're splicing into a [td]indirect block (as opposed to the
         * inode) then we need to get write access to the [td]indirect block
         * before the splice.
         */
        if (where->bh) {
                BUFFER_TRACE(where->bh, "get_write_access");
                err = ext3_journal_get_write_access(handle, where->bh);
                if (err)
                        goto err_out;
        }
        /* That's it */

        *where->p = where->key;

        /*
         * Update the most recently allocated logical & physical block
         * in i_block_alloc_info, to assist in finding the proper goal block
         * for the next allocation.
         */
        if (block_i) {
                block_i->last_alloc_logical_block = block;
                block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key);
        }

        /* We are done with atomic stuff, now do the rest of housekeeping */

        inode->i_ctime = CURRENT_TIME_SEC;
        ext3_mark_inode_dirty(handle, inode);

        /* had we spliced it onto indirect block? */
        if (where->bh) {
                /*
                 * akpm: If we spliced it onto an indirect block, we haven't
                 * altered the inode.  Note however that if it is being spliced
                 * onto an indirect block at the very end of the file (the
                 * file is growing) then we *will* alter the inode to reflect
                 * the new i_size.  But that is not done here - it is done in
                 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
                 */
                jbd_debug(5, "splicing indirect only\n");
                BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
                err = ext3_journal_dirty_metadata(handle, where->bh);
                if (err)
                        goto err_out;
        } else {
                /*
                 * OK, we spliced it into the inode itself on a direct block.
                 * Inode was dirtied above.
                 */
                jbd_debug(5, "splicing direct\n");
        }
        return err;

err_out:
        for (i = 1; i < num; i++) {
                BUFFER_TRACE(where[i].bh, "call journal_forget");
                ext3_journal_forget(handle, where[i].bh);
        }
        return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * akpm: `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 */

static int
ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
                struct buffer_head *bh_result, int create, int extend_disksize)
{
        int err = -EIO;
        int offsets[4];
        Indirect chain[4];
        Indirect *partial;
        unsigned long goal;
        int left;
        int boundary = 0;
        const int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
        struct ext3_inode_info *ei = EXT3_I(inode);

        J_ASSERT(handle != NULL || create == 0);

        if (depth == 0)
                goto out;

        partial = ext3_get_branch(inode, depth, offsets, chain, &err);

        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                clear_buffer_new(bh_result);
                goto got_it;
        }

        /* Next simple case - plain lookup or failed read of indirect block */
        if (!create || err == -EIO)
                goto cleanup;

        down(&ei->truncate_sem);

        /*
         * If the indirect block is missing while we are reading
         * the chain (ext3_get_branch() returns -EAGAIN err), or
         * if the chain has been changed after we grab the semaphore,
         * (either because another process truncated this branch, or
         * another get_block allocated this branch) re-grab the chain to see if
         * the requested block has been allocated or not.
         *
         * Since we already block the truncate/other get_block
         * at this point, we will have the current copy of the chain when we
         * splice the branch into the tree.
         */
        if (err == -EAGAIN || !verify_chain(chain, partial)) {
                while (partial > chain) {
                        brelse(partial->bh);
                        partial--;
                }
                partial = ext3_get_branch(inode, depth, offsets, chain, &err);
                if (!partial) {
                        up(&ei->truncate_sem);
                        if (err)
                                goto cleanup;
                        clear_buffer_new(bh_result);
                        goto got_it;
                }
        }

        /*
         * Okay, we need to do block allocation.  Lazily initialize the block
         * allocation info here if necessary.
         */
        if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
                ext3_init_block_alloc_info(inode);

        goal = ext3_find_goal(inode, iblock, chain, partial);

        left = (chain + depth) - partial;

        /*
         * Block out ext3_truncate while we alter the tree
         */
        err = ext3_alloc_branch(handle, inode, left, goal,
                                offsets + (partial - chain), partial);

        /*
         * The ext3_splice_branch call will free and forget any buffers
         * on the new chain if there is a failure, but that risks using
         * up transaction credits, especially for bitmaps where the
         * credits cannot be returned.  Can we handle this somehow?  We
         * may need to return -EAGAIN upwards in the worst case.  --sct
         */
        if (!err)
                err = ext3_splice_branch(handle, inode, iblock, chain,
                                         partial, left);
        /*
         * i_disksize growing is protected by truncate_sem.  Don't forget to
         * protect it if you're about to implement concurrent
         * ext3_get_block() -bzzz
         */
        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
                ei->i_disksize = inode->i_size;
        up(&ei->truncate_sem);
        if (err)
                goto cleanup;

        set_buffer_new(bh_result);
got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (boundary)
                set_buffer_boundary(bh_result);
        /* Clean up and exit */
        partial = chain + depth - 1;    /* the whole chain */
cleanup:
        while (partial > chain) {
                BUFFER_TRACE(partial->bh, "call brelse");
                brelse(partial->bh);
                partial--;
        }
        BUFFER_TRACE(bh_result, "returned");
out:
        return err;
}

static int ext3_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
{
        handle_t *handle = NULL;
        int ret;

        if (create) {
                handle = ext3_journal_current_handle();
                J_ASSERT(handle != 0);
        }
        ret = ext3_get_block_handle(handle, inode, iblock,
                                bh_result, create, 1);
        return ret;
}

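/*
 * Credits reserved for one pass of direct-IO block mapping: the
 * EXT3_RESERVE_TRANS_BLOCKS floor plus headroom for the indirect, bitmap
 * and group-descriptor blocks a single allocation may dirty.  (This is a
 * reading of the definition below, not a formal derivation of the 32.)
 */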
#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)

static int
ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
                unsigned long max_blocks, struct buffer_head *bh_result,
                int create)
{
        handle_t *handle = journal_current_handle();
        int ret = 0;

        if (!handle)
                goto get_block;         /* A read */

        if (handle->h_transaction->t_state == T_LOCKED) {
                /*
                 * Huge direct-io writes can hold off commits for long
                 * periods of time.  Let this commit run.
                 */
                ext3_journal_stop(handle);
                handle = ext3_journal_start(inode, DIO_CREDITS);
                if (IS_ERR(handle))
                        ret = PTR_ERR(handle);
                goto get_block;
        }

        if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
                /*
                 * Getting low on buffer credits...
                 */
                ret = ext3_journal_extend(handle, DIO_CREDITS);
                if (ret > 0) {
                        /*
                         * Couldn't extend the transaction.  Start a new one.
                         */
                        ret = ext3_journal_restart(handle, DIO_CREDITS);
                }
        }

get_block:
        if (ret == 0)
                ret = ext3_get_block_handle(handle, inode, iblock,
                                        bh_result, create, 0);
        bh_result->b_size = (1 << inode->i_blkbits);
        return ret;
}

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
                                long block, int create, int * errp)
{
        struct buffer_head dummy;
        int fatal = 0, err;

        J_ASSERT(handle != NULL || create == 0);

        dummy.b_state = 0;
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
        *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
        if (!*errp && buffer_mapped(&dummy)) {
                struct buffer_head *bh;
                bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
                if (buffer_new(&dummy)) {
                        J_ASSERT(create != 0);
                        J_ASSERT(handle != 0);

                        /* Now that we do not always journal data, we
                           should keep in mind whether this should
                           always journal the new buffer as metadata.
                           For now, regular file writes use
                           ext3_get_block instead, so it's not a
                           problem. */
                        lock_buffer(bh);
                        BUFFER_TRACE(bh, "call get_create_access");
                        fatal = ext3_journal_get_create_access(handle, bh);
                        if (!fatal && !buffer_uptodate(bh)) {
                                memset(bh->b_data, 0, inode->i_sb->s_blocksize);
                                set_buffer_uptodate(bh);
                        }
                        unlock_buffer(bh);
                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
                        err = ext3_journal_dirty_metadata(handle, bh);
                        if (!fatal)
                                fatal = err;
                } else {
                        BUFFER_TRACE(bh, "not a new buffer");
                }
                if (fatal) {
                        *errp = fatal;
                        brelse(bh);
                        bh = NULL;
                }
                return bh;
        }
        return NULL;
}

struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
                               int block, int create, int *err)
{
        struct buffer_head * bh;

        bh = ext3_getblk(handle, inode, block, create, err);
        if (!bh)
                return bh;
        if (buffer_uptodate(bh))
                return bh;
        ll_rw_block(READ, 1, &bh);
        wait_on_buffer(bh);
        if (buffer_uptodate(bh))
                return bh;
        put_bh(bh);
        *err = -EIO;
        return NULL;
}

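/*
 * Apply @fn to each buffer of the page that overlaps [@from, @to).  Buffers
 * wholly outside the range are skipped; if such a buffer is not uptodate,
 * *@partial is set so the caller knows the page cannot simply be marked
 * uptodate.  The walk stops at the first error, which is returned.  Used
 * below with do_journal_get_write_access(), ext3_journal_dirty_data(),
 * commit_write_fn() and friends.
 */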
static int walk_page_buffers(   handle_t *handle,
                                struct buffer_head *head,
                                unsigned from,
                                unsigned to,
                                int *partial,
                                int (*fn)(      handle_t *handle,
                                                struct buffer_head *bh))
{
        struct buffer_head *bh;
        unsigned block_start, block_end;
        unsigned blocksize = head->b_size;
        int err, ret = 0;
        struct buffer_head *next;

        for (   bh = head, block_start = 0;
                ret == 0 && (bh != head || !block_start);
                block_start = block_end, bh = next)
        {
                next = bh->b_this_page;
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        if (partial && !buffer_uptodate(bh))
                                *partial = 1;
                        continue;
                }
                err = (*fn)(handle, bh);
                if (!ret)
                        ret = err;
        }
        return ret;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext3_get_block()
 * and the commit_write().  So doing the journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext3_writepage() ->
 * block_write_full_page(). In that case, we *know* that ext3_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext3 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */

static int do_journal_get_write_access(handle_t *handle,
                                       struct buffer_head *bh)
{
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        return ext3_journal_get_write_access(handle, bh);
}

static int ext3_prepare_write(struct file *file, struct page *page,
                              unsigned from, unsigned to)
{
        struct inode *inode = page->mapping->host;
        int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
        handle_t *handle;
        int retries = 0;

retry:
        handle = ext3_journal_start(inode, needed_blocks);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
        }
        if (test_opt(inode->i_sb, NOBH))
                ret = nobh_prepare_write(page, from, to, ext3_get_block);
        else
                ret = block_prepare_write(page, from, to, ext3_get_block);
        if (ret)
                goto prepare_write_failed;

        if (ext3_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
                                from, to, NULL, do_journal_get_write_access);
        }
prepare_write_failed:
        if (ret)
                ext3_journal_stop(handle);
        if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
out:
        return ret;
}

int
ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
        int err = journal_dirty_data(handle, bh);
        if (err)
                ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
                                                bh, handle, err);
        return err;
}

/* For commit_write() in data=journal mode */
static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
{
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        set_buffer_uptodate(bh);
        return ext3_journal_dirty_metadata(handle, bh);
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - e.g., when called from page_symlink().
 *
 * ext3 never places buffers on inode->i_mapping->private_list.  Metadata
 * buffers are managed internally.
 */

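/*
 * There is one commit_write implementation per data journaling mode:
 * ext3_ordered_commit_write() for data=ordered (file data is queued on the
 * committing transaction ahead of the metadata that references it),
 * ext3_writeback_commit_write() for data=writeback, and
 * ext3_journalled_commit_write() for data=journal (data buffers are
 * journaled like metadata).
 */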
static int ext3_ordered_commit_write(struct file *file, struct page *page,
                             unsigned from, unsigned to)
{
        handle_t *handle = ext3_journal_current_handle();
        struct inode *inode = page->mapping->host;
        int ret = 0, ret2;

        ret = walk_page_buffers(handle, page_buffers(page),
                from, to, NULL, ext3_journal_dirty_data);

        if (ret == 0) {
                /*
                 * generic_commit_write() will run mark_inode_dirty() if i_size
                 * changes.  So let's piggyback the i_disksize mark_inode_dirty
                 * into that.
                 */
                loff_t new_i_size;

                new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
                if (new_i_size > EXT3_I(inode)->i_disksize)
                        EXT3_I(inode)->i_disksize = new_i_size;
                ret = generic_commit_write(file, page, from, to);
        }
        ret2 = ext3_journal_stop(handle);
        if (!ret)
                ret = ret2;
        return ret;
}

static int ext3_writeback_commit_write(struct file *file, struct page *page,
                             unsigned from, unsigned to)
{
        handle_t *handle = ext3_journal_current_handle();
        struct inode *inode = page->mapping->host;
        int ret = 0, ret2;
        loff_t new_i_size;

        new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
        if (new_i_size > EXT3_I(inode)->i_disksize)
                EXT3_I(inode)->i_disksize = new_i_size;

        if (test_opt(inode->i_sb, NOBH))
                ret = nobh_commit_write(file, page, from, to);
        else
                ret = generic_commit_write(file, page, from, to);

        ret2 = ext3_journal_stop(handle);
        if (!ret)
                ret = ret2;
        return ret;
}

static int ext3_journalled_commit_write(struct file *file,
                        struct page *page, unsigned from, unsigned to)
{
        handle_t *handle = ext3_journal_current_handle();
        struct inode *inode = page->mapping->host;
        int ret = 0, ret2;
        int partial = 0;
        loff_t pos;

        /*
         * Here we duplicate the generic_commit_write() functionality
         */
        pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

        ret = walk_page_buffers(handle, page_buffers(page), from,
                                to, &partial, commit_write_fn);
        if (!partial)
                SetPageUptodate(page);
        if (pos > inode->i_size)
                i_size_write(inode, pos);
        EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
        if (inode->i_size > EXT3_I(inode)->i_disksize) {
                EXT3_I(inode)->i_disksize = inode->i_size;
                ret2 = ext3_mark_inode_dirty(handle, inode);
                if (!ret)
                        ret = ret2;
        }
        ret2 = ext3_journal_stop(handle);
        if (!ret)
                ret = ret2;
        return ret;
}

/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext3 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
{
        struct inode *inode = mapping->host;
        journal_t *journal;
        int err;

        if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
                /*
                 * This is a REALLY heavyweight approach, but the use of
                 * bmap on dirty files is expected to be extremely rare:
                 * only if we run lilo or swapon on a freshly made file
                 * do we expect this to happen.
                 *
                 * (bmap requires CAP_SYS_RAWIO so this does not
                 * represent an unprivileged user DOS attack --- we'd be
                 * in trouble if mortal users could trigger this path at
                 * will.)
                 *
                 * NB. EXT3_STATE_JDATA is not set on files other than
                 * regular files.  If somebody wants to bmap a directory
                 * or symlink and gets confused because the buffer
                 * hasn't yet been flushed to disk, they deserve
                 * everything they get.
                 */

                EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
                journal = EXT3_JOURNAL(inode);
                journal_lock_updates(journal);
                err = journal_flush(journal);
                journal_unlock_updates(journal);

                if (err)
                        return 0;
        }

        return generic_block_bmap(mapping, block, ext3_get_block);
}

static int bget_one(handle_t *handle, struct buffer_head *bh)
{
        get_bh(bh);
        return 0;
}

static int bput_one(handle_t *handle, struct buffer_head *bh)
{
        put_bh(bh);
        return 0;
}

static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
        if (buffer_mapped(bh))
                return ext3_journal_dirty_data(handle, bh);
        return 0;
}

/*
 * Note that we always start a transaction even if we're not journalling
 * data.  This is to preserve ordering: any hole instantiation within
 * __block_write_full_page -> ext3_get_block() should be journalled
 * along with the data so we don't crash and then get metadata which
 * refers to old data.
 *
 * In all journalling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
 *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *              ext3_writepage()
 *
 * Similar for:
 *
 *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 *
 * Same applies to ext3_get_block().  We will deadlock on various things like
 * lock_journal and i_truncate_sem.
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
 *
 * 16May01: If we're reentered then journal_current_handle() will be
 *          non-zero. We simply *return*.
 *
 * 1 July 2001: @@@ FIXME:
 *   In journalled data mode, a data buffer may be metadata against the
 *   current transaction.  But the same file is part of a shared mapping
 *   and someone does a writepage() on it.
 *
 *   We will move the buffer onto the async_data list, but *after* it has
 *   been dirtied. So there's a small window where we have dirty data on
 *   BJ_Metadata.
 *
 *   Note that this only applies to the last partial page in the file.  The
 *   bit which block_write_full_page() uses prepare/commit for.  (That's
 *   broken code anyway: it's wrong for msync()).
 *
 *   It's a rare case: it affects the final partial page, for journalled data,
 *   where the file is subject to both write() and writepage() in the same
 *   transaction.  To fix it we'll need a custom block_write_full_page().
 *   We'll probably need that anyway for journalling writepage() output.
 *
 * We don't honour synchronous mounts for writepage().  That would be
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
 *
 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
 * we don't need to open a transaction here.
 */
static int ext3_ordered_writepage(struct page *page,
                        struct writeback_control *wbc)
{
        struct inode *inode = page->mapping->host;
        struct buffer_head *page_bufs;
        handle_t *handle = NULL;
        int ret = 0;
        int err;

        J_ASSERT(PageLocked(page));

        /*
         * We give up here if we're reentered, because it might be for a
         * different filesystem.
         */
        if (ext3_journal_current_handle())
                goto out_fail;

        handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out_fail;
        }

        if (!page_has_buffers(page)) {
                create_empty_buffers(page, inode->i_sb->s_blocksize,
                                (1 << BH_Dirty)|(1 << BH_Uptodate));
        }
        page_bufs = page_buffers(page);
        walk_page_buffers(handle, page_bufs, 0,
                        PAGE_CACHE_SIZE, NULL, bget_one);

        ret = block_write_full_page(page, ext3_get_block, wbc);

        /*
         * The page can become unlocked at any point now, and
         * truncate can then come in and change things.  So we
         * can't touch *page from now on.  But *page_bufs is
         * safe due to elevated refcount.
         */

        /*
         * And attach them to the current transaction.  But only if
         * block_write_full_page() succeeded.  Otherwise they are unmapped,
         * and generally junk.
         */
        if (ret == 0) {
                err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
                                        NULL, journal_dirty_data_fn);
                if (!ret)
                        ret = err;
        }
        walk_page_buffers(handle, page_bufs, 0,
                        PAGE_CACHE_SIZE, NULL, bput_one);
        err = ext3_journal_stop(handle);
        if (!ret)
                ret = err;
        return ret;

out_fail:
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
        return ret;
}

static int ext3_writeback_writepage(struct page *page,
                                struct writeback_control *wbc)
{
        struct inode *inode = page->mapping->host;
        handle_t *handle = NULL;
        int ret = 0;
        int err;

        if (ext3_journal_current_handle())
                goto out_fail;

        handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out_fail;
        }

        if (test_opt(inode->i_sb, NOBH))
                ret = nobh_writepage(page, ext3_get_block, wbc);
        else
                ret = block_write_full_page(page, ext3_get_block, wbc);

        err = ext3_journal_stop(handle);
        if (!ret)
                ret = err;
        return ret;

out_fail:
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
        return ret;
}

static int ext3_journalled_writepage(struct page *page,
                                struct writeback_control *wbc)
{
        struct inode *inode = page->mapping->host;
        handle_t *handle = NULL;
        int ret = 0;
        int err;

        if (ext3_journal_current_handle())
                goto no_write;

        handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto no_write;
        }

        if (!page_has_buffers(page) || PageChecked(page)) {
                /*
                 * It's mmapped pagecache.  Add buffers and journal it.  There
                 * doesn't seem much point in redirtying the page here.
                 */
                ClearPageChecked(page);
                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
                                        ext3_get_block);
                if (ret != 0)
                        goto out_unlock;
                ret = walk_page_buffers(handle, page_buffers(page), 0,
                        PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);

                err = walk_page_buffers(handle, page_buffers(page), 0,
                                PAGE_CACHE_SIZE, NULL, commit_write_fn);
                if (ret == 0)
                        ret = err;
                EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
                unlock_page(page);
        } else {
                /*
                 * It may be a page full of checkpoint-mode buffers.  We don't
                 * really know unless we go poke around in the buffer_heads.
                 * But block_write_full_page will do the right thing.
                 */
                ret = block_write_full_page(page, ext3_get_block, wbc);
        }
        err = ext3_journal_stop(handle);
        if (!ret)
                ret = err;
out:
        return ret;

no_write:
        redirty_page_for_writepage(wbc, page);
out_unlock:
        unlock_page(page);
        goto out;
}
1409
1410 static int ext3_readpage(struct file *file, struct page *page)
1411 {
1412         return mpage_readpage(page, ext3_get_block);
1413 }
1414
1415 static int
1416 ext3_readpages(struct file *file, struct address_space *mapping,
1417                 struct list_head *pages, unsigned nr_pages)
1418 {
1419         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1420 }
1421
1422 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1423 {
1424         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1425
1426         /*
1427          * If it's a full truncate we just forget about the pending dirtying
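              * (i.e. the "checked" bit set by ext3_journalled_set_page_dirty())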
1428          */
1429         if (offset == 0)
1430                 ClearPageChecked(page);
1431
1432         return journal_invalidatepage(journal, page, offset);
1433 }
1434
1435 static int ext3_releasepage(struct page *page, int wait)
1436 {
1437         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1438
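             /*
              * A checked page still has journalled-data dirtying pending;
              * it should have gone through writepage before anyone tries
              * to free its buffers.
              */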
1439         WARN_ON(PageChecked(page));
1440         if (!page_has_buffers(page))
1441                 return 0;
1442         return journal_try_to_free_buffers(journal, page, wait);
1443 }
1444
1445 /*
1446  * If the O_DIRECT write will extend the file then add this inode to the
1447  * orphan list, so that recovery will truncate it back to the original
1448  * size if the machine crashes during the write.
1449  *
1450  * If the O_DIRECT write is instantiating holes inside i_size and the machine
1451  * crashes then stale disk data _may_ be exposed inside the file.
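      *
      * In outline (error paths omitted), the write side below does:
      *
      *      handle = ext3_journal_start(inode, DIO_CREDITS);
      *      ext3_orphan_add(handle, inode);      /* only when extending */
      *      ret = blockdev_direct_IO(...);
      *      ext3_orphan_del(handle, inode);      /* final size now known */
      *      ext3_journal_stop(handle);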
1452  */
1453 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1454                         const struct iovec *iov, loff_t offset,
1455                         unsigned long nr_segs)
1456 {
1457         struct file *file = iocb->ki_filp;
1458         struct inode *inode = file->f_mapping->host;
1459         struct ext3_inode_info *ei = EXT3_I(inode);
1460         handle_t *handle = NULL;
1461         ssize_t ret;
1462         int orphan = 0;
1463         size_t count = iov_length(iov, nr_segs);
1464
1465         if (rw == WRITE) {
1466                 loff_t final_size = offset + count;
1467
1468                 handle = ext3_journal_start(inode, DIO_CREDITS);
1469                 if (IS_ERR(handle)) {
1470                         ret = PTR_ERR(handle);
1471                         goto out;
1472                 }
1473                 if (final_size > inode->i_size) {
1474                         ret = ext3_orphan_add(handle, inode);
1475                         if (ret)
1476                                 goto out_stop;
1477                         orphan = 1;
1478                         ei->i_disksize = inode->i_size;
1479                 }
1480         }
1481
1482         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
1483                                  offset, nr_segs,
1484                                  ext3_direct_io_get_blocks, NULL);
1485
1486         /*
1487          * Reacquire the handle: ext3_direct_io_get_block() can restart the
1488          * transaction
1489          */
1490         handle = journal_current_handle();
1491
1492 out_stop:
1493         if (handle) {
1494                 int err;
1495
1496                 if (orphan && inode->i_nlink)
1497                         ext3_orphan_del(handle, inode);
1498                 if (orphan && ret > 0) {
1499                         loff_t end = offset + ret;
1500                         if (end > inode->i_size) {
1501                                 ei->i_disksize = end;
1502                                 i_size_write(inode, end);
1503                                 /*
1504                                  * We're going to return a positive `ret'
1505                                  * here due to non-zero-length I/O, so there's
1506                                  * no way of reporting error returns from
1507                                  * ext3_mark_inode_dirty() to userspace.  So
1508                                  * ignore it.
1509                                  */
1510                                 ext3_mark_inode_dirty(handle, inode);
1511                         }
1512                 }
1513                 err = ext3_journal_stop(handle);
1514                 if (ret == 0)
1515                         ret = err;
1516         }
1517 out:
1518         return ret;
1519 }
1520
1521 /*
1522  * Pages can be marked dirty completely asynchronously from ext3's journalling
1523  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
1524  * much here because ->set_page_dirty is called under VFS locks.  The page is
1525  * not necessarily locked.
1526  *
1527  * We cannot just dirty the page and leave attached buffers clean, because the
1528  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
1529  * or jbddirty because all the journalling code will explode.
1530  *
1531  * So what we do is to mark the page "pending dirty" and next time writepage
1532  * is called, propagate that into the buffers appropriately.
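      *
      * In outline: ext3_journalled_set_page_dirty() does SetPageChecked();
      * ext3_journalled_writepage() later sees PageChecked(), clears it and
      * journals the page's buffers via do_journal_get_write_access().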
1533  */
1534 static int ext3_journalled_set_page_dirty(struct page *page)
1535 {
1536         SetPageChecked(page);
1537         return __set_page_dirty_nobuffers(page);
1538 }
1539
1540 static struct address_space_operations ext3_ordered_aops = {
1541         .readpage       = ext3_readpage,
1542         .readpages      = ext3_readpages,
1543         .writepage      = ext3_ordered_writepage,
1544         .sync_page      = block_sync_page,
1545         .prepare_write  = ext3_prepare_write,
1546         .commit_write   = ext3_ordered_commit_write,
1547         .bmap           = ext3_bmap,
1548         .invalidatepage = ext3_invalidatepage,
1549         .releasepage    = ext3_releasepage,
1550         .direct_IO      = ext3_direct_IO,
1551 };
1552
1553 static struct address_space_operations ext3_writeback_aops = {
1554         .readpage       = ext3_readpage,
1555         .readpages      = ext3_readpages,
1556         .writepage      = ext3_writeback_writepage,
1557         .sync_page      = block_sync_page,
1558         .prepare_write  = ext3_prepare_write,
1559         .commit_write   = ext3_writeback_commit_write,
1560         .bmap           = ext3_bmap,
1561         .invalidatepage = ext3_invalidatepage,
1562         .releasepage    = ext3_releasepage,
1563         .direct_IO      = ext3_direct_IO,
1564 };
1565
1566 static struct address_space_operations ext3_journalled_aops = {
1567         .readpage       = ext3_readpage,
1568         .readpages      = ext3_readpages,
1569         .writepage      = ext3_journalled_writepage,
1570         .sync_page      = block_sync_page,
1571         .prepare_write  = ext3_prepare_write,
1572         .commit_write   = ext3_journalled_commit_write,
1573         .set_page_dirty = ext3_journalled_set_page_dirty,
1574         .bmap           = ext3_bmap,
1575         .invalidatepage = ext3_invalidatepage,
1576         .releasepage    = ext3_releasepage,
1577 };
1578
1579 void ext3_set_aops(struct inode *inode)
1580 {
1581         if (ext3_should_order_data(inode))
1582                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1583         else if (ext3_should_writeback_data(inode))
1584                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1585         else
1586                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1587 }
1588
1589 /*
1590  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1591  * up to the end of the block which corresponds to `from'.
1592  * This is required during truncate. We need to physically zero the tail end
1593  * of that block so it doesn't yield old data if the file is later grown.
1594  */
1595 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1596                 struct address_space *mapping, loff_t from)
1597 {
1598         unsigned long index = from >> PAGE_CACHE_SHIFT;
1599         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1600         unsigned blocksize, iblock, length, pos;
1601         struct inode *inode = mapping->host;
1602         struct buffer_head *bh;
1603         int err = 0;
1604         void *kaddr;
1605
1606         blocksize = inode->i_sb->s_blocksize;
1607         length = blocksize - (offset & (blocksize - 1));
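             /* iblock: file-relative block number of the first block in this page */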
1608         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1609
1610         /*
1611          * For the "nobh" option, we can only work if we don't need to
1612          * read in the page - otherwise we create buffers to do the IO.
1613          */
1614         if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) {
1615                 if (PageUptodate(page)) {
1616                         kaddr = kmap_atomic(page, KM_USER0);
1617                         memset(kaddr + offset, 0, length);
1618                         flush_dcache_page(page);
1619                         kunmap_atomic(kaddr, KM_USER0);
1620                         set_page_dirty(page);
1621                         goto unlock;
1622                 }
1623         }
1624
1625         if (!page_has_buffers(page))
1626                 create_empty_buffers(page, blocksize, 0);
1627
1628         /* Find the buffer that contains "offset" */
1629         bh = page_buffers(page);
1630         pos = blocksize;
1631         while (offset >= pos) {
1632                 bh = bh->b_this_page;
1633                 iblock++;
1634                 pos += blocksize;
1635         }
1636
1637         err = 0;
1638         if (buffer_freed(bh)) {
1639                 BUFFER_TRACE(bh, "freed: skip");
1640                 goto unlock;
1641         }
1642
1643         if (!buffer_mapped(bh)) {
1644                 BUFFER_TRACE(bh, "unmapped");
1645                 ext3_get_block(inode, iblock, bh, 0);
1646                 /* unmapped? It's a hole - nothing to do */
1647                 if (!buffer_mapped(bh)) {
1648                         BUFFER_TRACE(bh, "still unmapped");
1649                         goto unlock;
1650                 }
1651         }
1652
1653         /* Ok, it's mapped. Make sure it's up-to-date */
1654         if (PageUptodate(page))
1655                 set_buffer_uptodate(bh);
1656
1657         if (!buffer_uptodate(bh)) {
1658                 err = -EIO;
1659                 ll_rw_block(READ, 1, &bh);
1660                 wait_on_buffer(bh);
1661                 /* Uhhuh. Read error. Complain and punt. */
1662                 if (!buffer_uptodate(bh))
1663                         goto unlock;
1664         }
1665
1666         if (ext3_should_journal_data(inode)) {
1667                 BUFFER_TRACE(bh, "get write access");
1668                 err = ext3_journal_get_write_access(handle, bh);
1669                 if (err)
1670                         goto unlock;
1671         }
1672
1673         kaddr = kmap_atomic(page, KM_USER0);
1674         memset(kaddr + offset, 0, length);
1675         flush_dcache_page(page);
1676         kunmap_atomic(kaddr, KM_USER0);
1677
1678         BUFFER_TRACE(bh, "zeroed end of block");
1679
1680         err = 0;
1681         if (ext3_should_journal_data(inode)) {
1682                 err = ext3_journal_dirty_metadata(handle, bh);
1683         } else {
1684                 if (ext3_should_order_data(inode))
1685                         err = ext3_journal_dirty_data(handle, bh);
1686                 mark_buffer_dirty(bh);
1687         }
1688
1689 unlock:
1690         unlock_page(page);
1691         page_cache_release(page);
1692         return err;
1693 }
1694
1695 /*
1696  * Probably it should be a library function... search for first non-zero word
1697  * or memcmp with zero_page, whatever is better for particular architecture.
1698  * Linus?
1699  */
1700 static inline int all_zeroes(__le32 *p, __le32 *q)
1701 {
1702         while (p < q)
1703                 if (*p++)
1704                         return 0;
1705         return 1;
1706 }
1707
1708 /**
1709  *      ext3_find_shared - find the indirect blocks for partial truncation.
1710  *      @inode:   inode in question
1711  *      @depth:   depth of the affected branch
1712  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1713  *      @chain:   place to store the pointers to partial indirect blocks
1714  *      @top:     place to store the (detached) top of the branch
1715  *
1716  *      This is a helper function used by ext3_truncate().
1717  *
1718  *      When we do truncate() we may have to clean the ends of several
1719  *      indirect blocks but leave the blocks themselves alive. Block is
1720  *      partially truncated if some data below the new i_size is referred to
1721  *      from it (and it is on the path to the first completely truncated
1722  *      data block, indeed).  We have to free the top of that path along
1723  *      with everything to the right of the path. Since no allocation
1724  *      past the truncation point is possible until ext3_truncate()
1725  *      finishes, we may safely do the latter, but top of branch may
1726  *      require special attention - pageout below the truncation point
1727  *      might try to populate it.
1728  *
1729  *      We atomically detach the top of branch from the tree, store the
1730  *      block number of its root in *@top, pointers to buffer_heads of
1731  *      partially truncated blocks - in @chain[].bh and pointers to
1732  *      their last elements that should not be removed - in
1733  *      @chain[].p. Return value is the pointer to last filled element
1734  *      of @chain.
1735  *
1736  *      The work left to the caller is the actual freeing of subtrees:
1737  *              a) free the subtree starting from *@top
1738  *              b) free the subtrees whose roots are stored in
1739  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1740  *              c) free the subtrees growing from the inode past the @chain[0].
1741  *                      (no partially truncated stuff there).  */
1742
1743 static Indirect *ext3_find_shared(struct inode *inode,
1744                                 int depth,
1745                                 int offsets[4],
1746                                 Indirect chain[4],
1747                                 __le32 *top)
1748 {
1749         Indirect *partial, *p;
1750         int k, err;
1751
1752         *top = 0;
1753         /* Make k index the deepest non-null offset + 1 */
1754         for (k = depth; k > 1 && !offsets[k-1]; k--)
1755                 ;
1756         partial = ext3_get_branch(inode, k, offsets, chain, &err);
1757         /* Writer: pointers */
1758         if (!partial)
1759                 partial = chain + k-1;
1760         /*
1761          * If the branch acquired a continuation since we last looked at it -
1762          * fine, it should all survive and (new) top doesn't belong to us.
1763          */
1764         if (!partial->key && *partial->p)
1765                 /* Writer: end */
1766                 goto no_top;
1767         for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1768                 ;
1769         /*
1770          * OK, we've found the last block that must survive. The rest of our
1771          * branch should be detached before unlocking. However, if that rest
1772          * of branch is all ours and does not grow immediately from the inode
1773          * it's easier to cheat and just decrement partial->p.
1774          */
1775         if (p == chain + k - 1 && p > chain) {
1776                 p->p--;
1777         } else {
1778                 *top = *p->p;
1779                 /* Nope, don't do this in ext3.  Must leave the tree intact */
1780 #if 0
1781                 *p->p = 0;
1782 #endif
1783         }
1784         /* Writer: end */
1785
1786         while (partial > p)
1787         {
1788                 brelse(partial->bh);
1789                 partial--;
1790         }
1791 no_top:
1792         return partial;
1793 }
1794
1795 /*
1796  * Zero a number of block pointers in either an inode or an indirect block.
1797  * If we restart the transaction we must again get write access to the
1798  * indirect block for further modification.
1799  *
1800  * We release `count' blocks on disk, but (last - first) may be greater
1801  * than `count' because there can be holes in there.
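      * (holes are zero entries in [first, last); the loop below skips them).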
1802  */
1803 static void
1804 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1805                 unsigned long block_to_free, unsigned long count,
1806                 __le32 *first, __le32 *last)
1807 {
1808         __le32 *p;
1809         if (try_to_extend_transaction(handle, inode)) {
1810                 if (bh) {
1811                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1812                         ext3_journal_dirty_metadata(handle, bh);
1813                 }
1814                 ext3_mark_inode_dirty(handle, inode);
1815                 ext3_journal_test_restart(handle, inode);
1816                 if (bh) {
1817                         BUFFER_TRACE(bh, "retaking write access");
1818                         ext3_journal_get_write_access(handle, bh);
1819                 }
1820         }
1821
1822         /*
1823          * Any buffers which are on the journal will be in memory. We find
1824          * them on the hash table so journal_revoke() will run journal_forget()
1825          * on them.  We've already detached each block from the file, so
1826          * bforget() in journal_forget() should be safe.
1827          *
1828          * AKPM: turn on bforget in journal_forget()!!!
1829          */
1830         for (p = first; p < last; p++) {
1831                 u32 nr = le32_to_cpu(*p);
1832                 if (nr) {
1833                         struct buffer_head *bh;
1834
1835                         *p = 0;
1836                         bh = sb_find_get_block(inode->i_sb, nr);
1837                         ext3_forget(handle, 0, inode, bh, nr);
1838                 }
1839         }
1840
1841         ext3_free_blocks(handle, inode, block_to_free, count);
1842 }
1843
1844 /**
1845  * ext3_free_data - free a list of data blocks
1846  * @handle:     handle for this transaction
1847  * @inode:      inode we are dealing with
1848  * @this_bh:    indirect buffer_head which contains *@first and *@last
1849  * @first:      array of block numbers
1850  * @last:       points immediately past the end of array
1851  *
1852  * We are freeing all blocks referred to from that array (numbers are stored as
1853  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1854  *
1855  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1856  * blocks are contiguous then releasing them at one time will only affect one
1857  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1858  * actually use a lot of journal space.
1859  *
1860  * @this_bh will be %NULL if @first and @last point into the inode's direct
1861  * block pointers.
1862  */
1863 static void ext3_free_data(handle_t *handle, struct inode *inode,
1864                            struct buffer_head *this_bh,
1865                            __le32 *first, __le32 *last)
1866 {
1867         unsigned long block_to_free = 0;    /* Starting block # of a run */
1868         unsigned long count = 0;            /* Number of blocks in the run */ 
1869         __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
1870                                                corresponding to
1871                                                block_to_free */
1872         unsigned long nr;                   /* Current block # */
1873         __le32 *p;                          /* Pointer into inode/ind
1874                                                for current block */
1875         int err;
1876
1877         if (this_bh) {                          /* For indirect block */
1878                 BUFFER_TRACE(this_bh, "get_write_access");
1879                 err = ext3_journal_get_write_access(handle, this_bh);
1880                 /* Important: if we can't update the indirect pointers
1881                  * to the blocks, we can't free them. */
1882                 if (err)
1883                         return;
1884         }
1885
1886         for (p = first; p < last; p++) {
1887                 nr = le32_to_cpu(*p);
1888                 if (nr) {
1889                         /* accumulate blocks to free if they're contiguous */
1890                         if (count == 0) {
1891                                 block_to_free = nr;
1892                                 block_to_free_p = p;
1893                                 count = 1;
1894                         } else if (nr == block_to_free + count) {
1895                                 count++;
1896                         } else {
1897                                 ext3_clear_blocks(handle, inode, this_bh, 
1898                                                   block_to_free,
1899                                                   count, block_to_free_p, p);
1900                                 block_to_free = nr;
1901                                 block_to_free_p = p;
1902                                 count = 1;
1903                         }
1904                 }
1905         }
1906
1907         if (count > 0)
1908                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1909                                   count, block_to_free_p, p);
1910
1911         if (this_bh) {
1912                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1913                 ext3_journal_dirty_metadata(handle, this_bh);
1914         }
1915 }
1916
1917 /**
1918  *      ext3_free_branches - free an array of branches
1919  *      @handle: JBD handle for this transaction
1920  *      @inode: inode we are dealing with
1921  *      @parent_bh: the buffer_head which contains *@first and *@last
1922  *      @first: array of block numbers
1923  *      @last:  pointer immediately past the end of array
1924  *      @depth: depth of the branches to free
1925  *
1926  *      We are freeing all blocks referred to from these branches (numbers are
1927  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1928  *      appropriately.
1929  */
1930 static void ext3_free_branches(handle_t *handle, struct inode *inode,
1931                                struct buffer_head *parent_bh,
1932                                __le32 *first, __le32 *last, int depth)
1933 {
1934         unsigned long nr;
1935         __le32 *p;
1936
1937         if (is_handle_aborted(handle))
1938                 return;
1939
1940         if (depth--) {
1941                 struct buffer_head *bh;
1942                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1943                 p = last;
1944                 while (--p >= first) {
1945                         nr = le32_to_cpu(*p);
1946                         if (!nr)
1947                                 continue;               /* A hole */
1948
1949                         /* Go read the buffer for the next level down */
1950                         bh = sb_bread(inode->i_sb, nr);
1951
1952                         /*
1953                          * A read failure? Report error and clear slot
1954                          * (should be rare).
1955                          */
1956                         if (!bh) {
1957                                 ext3_error(inode->i_sb, "ext3_free_branches",
1958                                            "Read failure, inode=%ld, block=%ld",
1959                                            inode->i_ino, nr);
1960                                 continue;
1961                         }
1962
1963                         /* This zaps the entire block.  Bottom up. */
1964                         BUFFER_TRACE(bh, "free child branches");
1965                         ext3_free_branches(handle, inode, bh,
1966                                            (__le32*)bh->b_data,
1967                                            (__le32*)bh->b_data + addr_per_block,
1968                                            depth);
1969
1970                         /*
1971                          * We've probably journalled the indirect block several
1972                          * times during the truncate.  But it's no longer
1973                          * needed and we now drop it from the transaction via
1974                          * journal_revoke().
1975                          *
1976                          * That's easy if it's exclusively part of this
1977                          * transaction.  But if it's part of the committing
1978                          * transaction then journal_forget() will simply
1979                          * brelse() it.  That means that if the underlying
1980                          * block is reallocated in ext3_get_block(),
1981                          * unmap_underlying_metadata() will find this block
1982                          * and will try to get rid of it.  damn, damn.
1983                          *
1984                          * If this block has already been committed to the
1985                          * journal, a revoke record will be written.  And
1986                          * revoke records must be emitted *before* clearing
1987                          * this block's bit in the bitmaps.
1988                          */
1989                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
1990
1991                         /*
1992                          * Everything below this pointer has been
1993                          * released.  Now let this top-of-subtree go.
1994                          *
1995                          * We want the freeing of this indirect block to be
1996                          * atomic in the journal with the updating of the
1997                          * bitmap block which owns it.  So make some room in
1998                          * the journal.
1999                          *
2000                          * We zero the parent pointer *after* freeing its
2001                          * pointee in the bitmaps, so if extend_transaction()
2002                          * for some reason fails to put the bitmap changes and
2003                          * the release into the same transaction, recovery
2004                          * will merely complain about releasing a free block,
2005                          * rather than leaking blocks.
2006                          */
2007                         if (is_handle_aborted(handle))
2008                                 return;
2009                         if (try_to_extend_transaction(handle, inode)) {
2010                                 ext3_mark_inode_dirty(handle, inode);
2011                                 ext3_journal_test_restart(handle, inode);
2012                         }
2013
2014                         ext3_free_blocks(handle, inode, nr, 1);
2015
2016                         if (parent_bh) {
2017                                 /*
2018                                  * The block which we have just freed is
2019                                  * pointed to by an indirect block: journal it
2020                                  */
2021                                 BUFFER_TRACE(parent_bh, "get_write_access");
2022                                 if (!ext3_journal_get_write_access(handle,
2023                                                                    parent_bh)){
2024                                         *p = 0;
2025                                         BUFFER_TRACE(parent_bh,
2026                                         "call ext3_journal_dirty_metadata");
2027                                         ext3_journal_dirty_metadata(handle, 
2028                                                                     parent_bh);
2029                                 }
2030                         }
2031                 }
2032         } else {
2033                 /* We have reached the bottom of the tree. */
2034                 BUFFER_TRACE(parent_bh, "free data blocks");
2035                 ext3_free_data(handle, inode, parent_bh, first, last);
2036         }
2037 }
2038
2039 /*
2040  * ext3_truncate()
2041  *
2042  * We block out ext3_get_block() block instantiations across the entire
2043  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2044  * simultaneously on behalf of the same inode.
2045  *
2046  * As we work through the truncate and commit bits of it to the journal there
2047  * is one core, guiding principle: the file's tree must always be consistent on
2048  * disk.  We must be able to restart the truncate after a crash.
2049  *
2050  * The file's tree may be transiently inconsistent in memory (although it
2051  * probably isn't), but whenever we close off and commit a journal transaction,
2052  * the contents of (the filesystem + the journal) must be consistent and
2053  * restartable.  It's pretty simple, really: bottom up, right to left (although
2054  * left-to-right works OK too).
2055  *
2056  * Note that at recovery time, journal replay occurs *before* the restart of
2057  * truncate against the orphan inode list.
2058  *
2059  * The committed inode has the new, desired i_size (which is the same as
2060  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2061  * that this inode's truncate did not complete and it will again call
2062  * ext3_truncate() to have another go.  So there will be instantiated blocks
2063  * to the right of the truncation point in a crashed ext3 filesystem.  But
2064  * that's fine - as long as they are linked from the inode, the post-crash
2065  * ext3_truncate() run will find them and release them.
2066  */
2067
2068 void ext3_truncate(struct inode * inode)
2069 {
2070         handle_t *handle;
2071         struct ext3_inode_info *ei = EXT3_I(inode);
2072         __le32 *i_data = ei->i_data;
2073         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2074         struct address_space *mapping = inode->i_mapping;
2075         int offsets[4];
2076         Indirect chain[4];
2077         Indirect *partial;
2078         __le32 nr = 0;
2079         int n;
2080         long last_block;
2081         unsigned blocksize = inode->i_sb->s_blocksize;
2082         struct page *page;
2083
2084         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2085             S_ISLNK(inode->i_mode)))
2086                 return;
2087         if (ext3_inode_is_fast_symlink(inode))
2088                 return;
2089         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2090                 return;
2091
2092         /*
2093          * We have to lock the EOF page here, because lock_page() nests
2094          * outside journal_start().
2095          */
2096         if ((inode->i_size & (blocksize - 1)) == 0) {
2097                 /* Block boundary? Nothing to do */
2098                 page = NULL;
2099         } else {
2100                 page = grab_cache_page(mapping,
2101                                 inode->i_size >> PAGE_CACHE_SHIFT);
2102                 if (!page)
2103                         return;
2104         }
2105
2106         handle = start_transaction(inode);
2107         if (IS_ERR(handle)) {
2108                 if (page) {
2109                         clear_highpage(page);
2110                         flush_dcache_page(page);
2111                         unlock_page(page);
2112                         page_cache_release(page);
2113                 }
2114                 return;         /* AKPM: return what? */
2115         }
2116
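             /*
              * last_block is the first file block lying wholly beyond the
              * new EOF; everything from it onwards will be freed.
              */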
2117         last_block = (inode->i_size + blocksize-1)
2118                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2119
2120         if (page)
2121                 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2122
2123         n = ext3_block_to_path(inode, last_block, offsets, NULL);
2124         if (n == 0)
2125                 goto out_stop;  /* error */
2126
2127         /*
2128          * OK.  This truncate is going to happen.  We add the inode to the
2129          * orphan list, so that if this truncate spans multiple transactions,
2130          * and we crash, we will resume the truncate when the filesystem
2131          * recovers.  It also marks the inode dirty, to catch the new size.
2132          *
2133          * Implication: the file must always be in a sane, consistent
2134          * truncatable state while each transaction commits.
2135          */
2136         if (ext3_orphan_add(handle, inode))
2137                 goto out_stop;
2138
2139         /*
2140          * The orphan list entry will now protect us from any crash which
2141          * occurs before the truncate completes, so it is now safe to propagate
2142          * the new, shorter inode size (held for now in i_size) into the
2143          * on-disk inode. We do this via i_disksize, which is the value which
2144          * ext3 *really* writes onto the disk inode.
2145          */
2146         ei->i_disksize = inode->i_size;
2147
2148         /*
2149          * From here we block out all ext3_get_block() callers who want to
2150          * modify the block allocation tree.
2151          */
2152         down(&ei->truncate_sem);
2153
2154         if (n == 1) {           /* direct blocks */
2155                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2156                                i_data + EXT3_NDIR_BLOCKS);
2157                 goto do_indirects;
2158         }
2159
2160         partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2161         /* Kill the top of shared branch (not detached) */
2162         if (nr) {
2163                 if (partial == chain) {
2164                         /* Shared branch grows from the inode */
2165                         ext3_free_branches(handle, inode, NULL,
2166                                            &nr, &nr+1, (chain+n-1) - partial);
2167                         *partial->p = 0;
2168                         /*
2169                          * We mark the inode dirty prior to restart,
2170                          * and prior to stop.  No need for it here.
2171                          */
2172                 } else {
2173                         /* Shared branch grows from an indirect block */
2174                         BUFFER_TRACE(partial->bh, "get_write_access");
2175                         ext3_free_branches(handle, inode, partial->bh,
2176                                         partial->p,
2177                                         partial->p+1, (chain+n-1) - partial);
2178                 }
2179         }
2180         /* Clear the ends of indirect blocks on the shared branch */
2181         while (partial > chain) {
2182                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2183                                    (__le32*)partial->bh->b_data+addr_per_block,
2184                                    (chain+n-1) - partial);
2185                 BUFFER_TRACE(partial->bh, "call brelse");
2186                 brelse (partial->bh);
2187                 partial--;
2188         }
2189 do_indirects:
2190         /* Kill the remaining (whole) subtrees */
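             /*
              * Each case below deliberately falls through to the next:
              * past the branch holding the truncation point, every deeper
              * indirect tree lies wholly beyond EOF and is freed in full.
              */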
2191         switch (offsets[0]) {
2192                 default:
2193                         nr = i_data[EXT3_IND_BLOCK];
2194                         if (nr) {
2195                                 ext3_free_branches(handle, inode, NULL,
2196                                                    &nr, &nr+1, 1);
2197                                 i_data[EXT3_IND_BLOCK] = 0;
2198                         }
2199                 case EXT3_IND_BLOCK:
2200                         nr = i_data[EXT3_DIND_BLOCK];
2201                         if (nr) {
2202                                 ext3_free_branches(handle, inode, NULL,
2203                                                    &nr, &nr+1, 2);
2204                                 i_data[EXT3_DIND_BLOCK] = 0;
2205                         }
2206                 case EXT3_DIND_BLOCK:
2207                         nr = i_data[EXT3_TIND_BLOCK];
2208                         if (nr) {
2209                                 ext3_free_branches(handle, inode, NULL,
2210                                                    &nr, &nr+1, 3);
2211                                 i_data[EXT3_TIND_BLOCK] = 0;
2212                         }
2213                 case EXT3_TIND_BLOCK:
2214                         ;
2215         }
2216
2217         ext3_discard_reservation(inode);
2218
2219         up(&ei->truncate_sem);
2220         inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2221         ext3_mark_inode_dirty(handle, inode);
2222
2223         /* In a multi-transaction truncate, we only make the final
2224          * transaction synchronous */
2225         if (IS_SYNC(inode))
2226                 handle->h_sync = 1;
2227 out_stop:
2228         /*
2229          * If this was a simple ftruncate(), and the file will remain alive
2230          * then we need to clear up the orphan record which we created above.
2231          * However, if this was a real unlink then we were called by
2232          * ext3_delete_inode(), and we allow that function to clean up the
2233          * orphan info for us.
2234          */
2235         if (inode->i_nlink)
2236                 ext3_orphan_del(handle, inode);
2237
2238         ext3_journal_stop(handle);
2239 }
2240
2241 static unsigned long ext3_get_inode_block(struct super_block *sb,
2242                 unsigned long ino, struct ext3_iloc *iloc)
2243 {
2244         unsigned long desc, group_desc, block_group;
2245         unsigned long offset, block;
2246         struct buffer_head *bh;
2247         struct ext3_group_desc * gdp;
2248
2249
2250         if ((ino != EXT3_ROOT_INO &&
2251                 ino != EXT3_JOURNAL_INO &&
2252                 ino != EXT3_RESIZE_INO &&
2253                 ino < EXT3_FIRST_INO(sb)) ||
2254                 ino > le32_to_cpu(
2255                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2256                 ext3_error (sb, "ext3_get_inode_block",
2257                             "bad inode number: %lu", ino);
2258                 return 0;
2259         }
2260         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2261         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2262                 ext3_error (sb, "ext3_get_inode_block",
2263                             "group >= groups count");
2264                 return 0;
2265         }
2266         smp_rmb();
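             /*
              * Split the group number into a descriptor block number and a
              * slot within that block.
              */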
2267         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2268         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2269         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2270         if (!bh) {
2271                 ext3_error (sb, "ext3_get_inode_block",
2272                             "Descriptor not loaded");
2273                 return 0;
2274         }
2275
2276         gdp = (struct ext3_group_desc *) bh->b_data;
2277         /*
2278          * Figure out the offset within the block group inode table
2279          */
2280         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2281                 EXT3_INODE_SIZE(sb);
2282         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2283                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2284
2285         iloc->block_group = block_group;
2286         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2287         return block;
2288 }
2289
2290 /*
2291  * ext3_get_inode_loc returns with an extra refcount against the inode's
2292  * underlying buffer_head on success. If 'in_mem' is true, we have all
2293  * data in memory that is needed to recreate the on-disk version of this
2294  * inode.
2295  */
2296 static int __ext3_get_inode_loc(struct inode *inode,
2297                                 struct ext3_iloc *iloc, int in_mem)
2298 {
2299         unsigned long block;
2300         struct buffer_head *bh;
2301
2302         block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2303         if (!block)
2304                 return -EIO;
2305
2306         bh = sb_getblk(inode->i_sb, block);
2307         if (!bh) {
2308                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2309                                 "unable to read inode block - "
2310                                 "inode=%lu, block=%lu", inode->i_ino, block);
2311                 return -EIO;
2312         }
2313         if (!buffer_uptodate(bh)) {
2314                 lock_buffer(bh);
2315                 if (buffer_uptodate(bh)) {
2316                         /* someone brought it uptodate while we waited */
2317                         unlock_buffer(bh);
2318                         goto has_buffer;
2319                 }
2320
2321                 /*
2322                  * If we have all information of the inode in memory and this
2323                  * is the only valid inode in the block, we need not read the
2324                  * block.
2325                  */
2326                 if (in_mem) {
2327                         struct buffer_head *bitmap_bh;
2328                         struct ext3_group_desc *desc;
2329                         int inodes_per_buffer;
2330                         int inode_offset, i;
2331                         int block_group;
2332                         int start;
2333
2334                         block_group = (inode->i_ino - 1) /
2335                                         EXT3_INODES_PER_GROUP(inode->i_sb);
2336                         inodes_per_buffer = bh->b_size /
2337                                 EXT3_INODE_SIZE(inode->i_sb);
2338                         inode_offset = ((inode->i_ino - 1) %
2339                                         EXT3_INODES_PER_GROUP(inode->i_sb));
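                             /*
                              * Round down to the first inode sharing this
                              * buffer (inodes_per_buffer is a power of two).
                              */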
2340                         start = inode_offset & ~(inodes_per_buffer - 1);
2341
2342                         /* Is the inode bitmap in cache? */
2343                         desc = ext3_get_group_desc(inode->i_sb,
2344                                                 block_group, NULL);
2345                         if (!desc)
2346                                 goto make_io;
2347
2348                         bitmap_bh = sb_getblk(inode->i_sb,
2349                                         le32_to_cpu(desc->bg_inode_bitmap));
2350                         if (!bitmap_bh)
2351                                 goto make_io;
2352
2353                         /*
2354                          * If the inode bitmap isn't in cache then the
2355                          * optimisation may end up performing two reads instead
2356                          * of one, so skip it.
2357                          */
2358                         if (!buffer_uptodate(bitmap_bh)) {
2359                                 brelse(bitmap_bh);
2360                                 goto make_io;
2361                         }
2362                         for (i = start; i < start + inodes_per_buffer; i++) {
2363                                 if (i == inode_offset)
2364                                         continue;
2365                                 if (ext3_test_bit(i, bitmap_bh->b_data))
2366                                         break;
2367                         }
2368                         brelse(bitmap_bh);
2369                         if (i == start + inodes_per_buffer) {
2370                                 /* all other inodes are free, so skip I/O */
2371                                 memset(bh->b_data, 0, bh->b_size);
2372                                 set_buffer_uptodate(bh);
2373                                 unlock_buffer(bh);
2374                                 goto has_buffer;
2375                         }
2376                 }
2377
2378 make_io:
2379                 /*
2380                  * There are other valid inodes in the buffer, this inode
2381                  * has in-inode xattrs, or we don't have this inode in memory.
2382                  * Read the block from disk.
2383                  */
2384                 get_bh(bh);
2385                 bh->b_end_io = end_buffer_read_sync;
2386                 submit_bh(READ, bh);
2387                 wait_on_buffer(bh);
2388                 if (!buffer_uptodate(bh)) {
2389                         ext3_error(inode->i_sb, "ext3_get_inode_loc",
2390                                         "unable to read inode block - "
2391                                         "inode=%lu, block=%lu",
2392                                         inode->i_ino, block);
2393                         brelse(bh);
2394                         return -EIO;
2395                 }
2396         }
2397 has_buffer:
2398         iloc->bh = bh;
2399         return 0;
2400 }
2401
2402 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2403 {
2404         /* We have all inode data except xattrs in memory here. */
2405         return __ext3_get_inode_loc(inode, iloc,
2406                 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2407 }
2408
2409 void ext3_set_inode_flags(struct inode *inode)
2410 {
2411         unsigned int flags = EXT3_I(inode)->i_flags;
2412
2413         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2414         if (flags & EXT3_SYNC_FL)
2415                 inode->i_flags |= S_SYNC;
2416         if (flags & EXT3_APPEND_FL)
2417                 inode->i_flags |= S_APPEND;
2418         if (flags & EXT3_IMMUTABLE_FL)
2419                 inode->i_flags |= S_IMMUTABLE;
2420         if (flags & EXT3_NOATIME_FL)
2421                 inode->i_flags |= S_NOATIME;
2422         if (flags & EXT3_DIRSYNC_FL)
2423                 inode->i_flags |= S_DIRSYNC;
2424 }
2425
2426 void ext3_read_inode(struct inode * inode)
2427 {
2428         struct ext3_iloc iloc;
2429         struct ext3_inode *raw_inode;
2430         struct ext3_inode_info *ei = EXT3_I(inode);
2431         struct buffer_head *bh;
2432         int block;
2433
2434 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2435         ei->i_acl = EXT3_ACL_NOT_CACHED;
2436         ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2437 #endif
2438         ei->i_block_alloc_info = NULL;
2439
2440         if (__ext3_get_inode_loc(inode, &iloc, 0))
2441                 goto bad_inode;
2442         bh = iloc.bh;
2443         raw_inode = ext3_raw_inode(&iloc);
2444         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2445         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2446         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2447         if(!(test_opt (inode->i_sb, NO_UID32))) {
2448                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2449                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2450         }
2451         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2452         inode->i_size = le32_to_cpu(raw_inode->i_size);
2453         inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2454         inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2455         inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2456         inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2457
2458         ei->i_state = 0;
2459         ei->i_dir_start_lookup = 0;
2460         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2461         /* We now have enough fields to check if the inode was active or not.
2462          * This is needed because nfsd might try to access dead inodes;
2463          * the test is the same one that e2fsck uses.
2464          * NeilBrown 1999oct15
2465          */
2466         if (inode->i_nlink == 0) {
2467                 if (inode->i_mode == 0 ||
2468                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2469                         /* this inode is deleted */
2470                         brelse (bh);
2471                         goto bad_inode;
2472                 }
2473                 /* The only unlinked inodes we let through here have
2474                  * valid i_mode and are being read by the orphan
2475                  * recovery code: that's fine, we're about to complete
2476                  * the process of deleting those. */
2477         }
2478         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2479                                          * (for stat), not the fs block
2480                                          * size */  
2481         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2482         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2483 #ifdef EXT3_FRAGMENTS
2484         ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2485         ei->i_frag_no = raw_inode->i_frag;
2486         ei->i_frag_size = raw_inode->i_fsize;
2487 #endif
2488         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
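             /*
              * On disk, i_dir_acl and i_size_high share the same field:
              * only regular files use the high 32 bits of i_size.
              */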
2489         if (!S_ISREG(inode->i_mode)) {
2490                 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2491         } else {
2492                 inode->i_size |=
2493                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2494         }
2495         ei->i_disksize = inode->i_size;
2496         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2497         ei->i_block_group = iloc.block_group;
2498         /*
2499          * NOTE! The in-memory inode i_data array is in little-endian order
2500          * even on big-endian machines: we do NOT byteswap the block numbers!
2501          */
2502         for (block = 0; block < EXT3_N_BLOCKS; block++)
2503                 ei->i_data[block] = raw_inode->i_block[block];
2504         INIT_LIST_HEAD(&ei->i_orphan);
2505
2506         if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2507             EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2508                 /*
2509                  * When mke2fs creates big inodes it does not zero out
2510                  * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2511                  * so ignore those first few inodes.
2512                  */
2513                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2514                 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2515                     EXT3_INODE_SIZE(inode->i_sb))
2516                         goto bad_inode;
2517                 if (ei->i_extra_isize == 0) {
2518                         /* The extra space is currently unused. Use it. */
2519                         ei->i_extra_isize = sizeof(struct ext3_inode) -
2520                                             EXT3_GOOD_OLD_INODE_SIZE;
2521                 } else {
2522                         __le32 *magic = (void *)raw_inode +
2523                                         EXT3_GOOD_OLD_INODE_SIZE +
2524                                         ei->i_extra_isize;
2525                         if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2526                                  ei->i_state |= EXT3_STATE_XATTR;
2527                 }
2528         } else
2529                 ei->i_extra_isize = 0;
2530
2531         if (S_ISREG(inode->i_mode)) {
2532                 inode->i_op = &ext3_file_inode_operations;
2533                 inode->i_fop = &ext3_file_operations;
2534                 ext3_set_aops(inode);
2535         } else if (S_ISDIR(inode->i_mode)) {
2536                 inode->i_op = &ext3_dir_inode_operations;
2537                 inode->i_fop = &ext3_dir_operations;
2538         } else if (S_ISLNK(inode->i_mode)) {
2539                 if (ext3_inode_is_fast_symlink(inode))
2540                         inode->i_op = &ext3_fast_symlink_inode_operations;
2541                 else {
2542                         inode->i_op = &ext3_symlink_inode_operations;
2543                         ext3_set_aops(inode);
2544                 }
2545         } else {
2546                 inode->i_op = &ext3_special_inode_operations;
2547                 if (raw_inode->i_block[0])
2548                         init_special_inode(inode, inode->i_mode,
2549                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2550                 else 
2551                         init_special_inode(inode, inode->i_mode,
2552                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2553         }
2554         brelse (iloc.bh);
2555         ext3_set_inode_flags(inode);
2556         return;
2557
2558 bad_inode:
2559         make_bad_inode(inode);
2560         return;
2561 }
2562
2563 /*
2564  * Post the struct inode info into an on-disk inode location in the
2565  * buffer-cache.  This gobbles the caller's reference to the
2566  * buffer_head in the inode location struct.
2567  *
2568  * The caller must have write access to iloc->bh.
2569  */
2570 static int ext3_do_update_inode(handle_t *handle, 
2571                                 struct inode *inode, 
2572                                 struct ext3_iloc *iloc)
2573 {
2574         struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2575         struct ext3_inode_info *ei = EXT3_I(inode);
2576         struct buffer_head *bh = iloc->bh;
2577         int err = 0, rc, block;
2578
2579         /* For fields not tracked in the in-memory inode,
2580          * initialise them to zero for new inodes. */
2581         if (ei->i_state & EXT3_STATE_NEW)
2582                 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2583
2584         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2585         if(!(test_opt(inode->i_sb, NO_UID32))) {
2586                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2587                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2588 /*
2589  * Fix up interoperability with old kernels. Otherwise, old inodes get
2590  * re-used with the upper 16 bits of the uid/gid intact
2591  */
2592                 if(!ei->i_dtime) {
2593                         raw_inode->i_uid_high =
2594                                 cpu_to_le16(high_16_bits(inode->i_uid));
2595                         raw_inode->i_gid_high =
2596                                 cpu_to_le16(high_16_bits(inode->i_gid));
2597                 } else {
2598                         raw_inode->i_uid_high = 0;
2599                         raw_inode->i_gid_high = 0;
2600                 }
2601         } else {
2602                 raw_inode->i_uid_low =
2603                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
2604                 raw_inode->i_gid_low =
2605                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
2606                 raw_inode->i_uid_high = 0;
2607                 raw_inode->i_gid_high = 0;
2608         }
2609         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2610         raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2611         raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2612         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2613         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2614         raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2615         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2616         raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2617 #ifdef EXT3_FRAGMENTS
2618         raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2619         raw_inode->i_frag = ei->i_frag_no;
2620         raw_inode->i_fsize = ei->i_frag_size;
2621 #endif
2622         raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2623         if (!S_ISREG(inode->i_mode)) {
2624                 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2625         } else {
2626                 raw_inode->i_size_high =
2627                         cpu_to_le32(ei->i_disksize >> 32);
2628                 if (ei->i_disksize > 0x7fffffffULL) {
2629                         struct super_block *sb = inode->i_sb;
2630                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2631                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2632                             EXT3_SB(sb)->s_es->s_rev_level ==
2633                                         cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2634                                /* If this is the first large file
2635                                 * created, add a flag to the superblock.
2636                                 */
2637                                 err = ext3_journal_get_write_access(handle,
2638                                                 EXT3_SB(sb)->s_sbh);
2639                                 if (err)
2640                                         goto out_brelse;
2641                                 ext3_update_dynamic_rev(sb);
2642                                 EXT3_SET_RO_COMPAT_FEATURE(sb,
2643                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2644                                 sb->s_dirt = 1;
2645                                 handle->h_sync = 1;
2646                                 err = ext3_journal_dirty_metadata(handle,
2647                                                 EXT3_SB(sb)->s_sbh);
2648                         }
2649                 }
2650         }
2651         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2652         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2653                 if (old_valid_dev(inode->i_rdev)) {
2654                         raw_inode->i_block[0] =
2655                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
2656                         raw_inode->i_block[1] = 0;
2657                 } else {
2658                         raw_inode->i_block[0] = 0;
2659                         raw_inode->i_block[1] =
2660                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
2661                         raw_inode->i_block[2] = 0;
2662                 }
2663         } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2664                 raw_inode->i_block[block] = ei->i_data[block];
2665
2666         if (ei->i_extra_isize)
2667                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2668
2669         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2670         rc = ext3_journal_dirty_metadata(handle, bh);
2671         if (!err)
2672                 err = rc;
2673         ei->i_state &= ~EXT3_STATE_NEW;
2674
2675 out_brelse:
2676         brelse(bh);
2677         ext3_std_error(inode->i_sb, err);
2678         return err;
2679 }
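
/*
 * Illustrative sketch, not part of the original file: the uid/gid
 * handling above splits a 32-bit id across two little-endian 16-bit
 * on-disk fields so that old 16-bit-uid kernels still see a sensible
 * low half.  The demo_ helpers below are hypothetical stand-ins that
 * show what the real low_16_bits()/high_16_bits() helpers compute.
 */
static inline __u16 demo_low_16_bits(__u32 id)
{
	return id & 0xffff;	/* bits 15..0  -> i_uid_low/i_gid_low   */
}

static inline __u16 demo_high_16_bits(__u32 id)
{
	return id >> 16;	/* bits 31..16 -> i_uid_high/i_gid_high */
}
/*
 * E.g. id 0x00012345 is stored as i_uid_low = 0x2345 and
 * i_uid_high = 0x0001; a pre-32-bit-uid kernel reads only the low half.
 */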
2680
2681 /*
2682  * ext3_write_inode()
2683  *
2684  * We are called from a few places:
2685  *
2686  * - Within generic_file_write() for O_SYNC files.
2687  *   Here, there will be no transaction running. We wait for any running
2688  *   transaction to commit.
2689  *
2690  * - Within sys_sync(), kupdate and such.
2691  *   We wait on commit, if told to.
2692  *
2693  * - Within prune_icache() (PF_MEMALLOC == true)
2694  *   Here we simply return.  We can't afford to block kswapd on the
2695  *   journal commit.
2696  *
2697  * In all cases it is actually safe for us to return without doing anything,
2698  * because the inode has been copied into a raw inode buffer in
2699  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2700  * knfsd.
2701  *
2702  * Note that we are absolutely dependent upon all inode dirtiers doing the
2703  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2704  * which we are interested.
2705  *
2706  * It would be a bug for them to not do this.  The code:
2707  *
2708  *      mark_inode_dirty(inode)
2709  *      stuff();
2710  *      inode->i_size = expr;
2711  *
2712  * is in error because a kswapd-driven write_inode() could occur while
2713  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2714  * will no longer be on the superblock's dirty inode list.
2715  */
2716 int ext3_write_inode(struct inode *inode, int wait)
2717 {
2718         if (current->flags & PF_MEMALLOC)
2719                 return 0;
2720
2721         if (ext3_journal_current_handle()) {
2722                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2723                 dump_stack();
2724                 return -EIO;
2725         }
2726
2727         if (!wait)
2728                 return 0;
2729
2730         return ext3_force_commit(inode->i_sb);
2731 }
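
/*
 * For contrast with the buggy ordering shown in the comment above, a
 * sketch of the correct pattern: update the fields first and mark the
 * inode dirty last, so a concurrent write_inode() can never snapshot a
 * half-updated inode:
 *
 *	inode->i_size = expr;
 *	stuff();
 *	mark_inode_dirty(inode);
 */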
2732
2733 /*
2734  * ext3_setattr()
2735  *
2736  * Called from notify_change.
2737  *
2738  * We want to trap VFS attempts to truncate the file as soon as
2739  * possible.  In particular, we want to make sure that when the VFS
2740  * shrinks i_size, we put the inode on the orphan list and modify
2741  * i_disksize immediately, so that during the subsequent flushing of
2742  * dirty pages and freeing of disk blocks, we can guarantee that any
2743  * commit will leave the blocks being flushed in an unused state on
2744  * disk.  (On recovery, the inode will get truncated and the blocks will
2745  * be freed, so we have a strong guarantee that no future commit will
2746  * leave these blocks visible to the user.)  
2747  *
2748  * Called with inode->sem down.
2749  */
2750 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2751 {
2752         struct inode *inode = dentry->d_inode;
2753         int error, rc = 0;
2754         const unsigned int ia_valid = attr->ia_valid;
2755
2756         error = inode_change_ok(inode, attr);
2757         if (error)
2758                 return error;
2759
2760         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2761                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2762                 handle_t *handle;
2763
2764                 /* (user+group)*(old+new) quota structures, plus the inode
2765                  * write (sb, inode block, ? - but truncate inode update has it) */
2766                 handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
2767                                         EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2768                 if (IS_ERR(handle)) {
2769                         error = PTR_ERR(handle);
2770                         goto err_out;
2771                 }
2772                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2773                 if (error) {
2774                         ext3_journal_stop(handle);
2775                         return error;
2776                 }
2777                 /* Update corresponding info in inode so that everything is in
2778                  * one transaction */
2779                 if (attr->ia_valid & ATTR_UID)
2780                         inode->i_uid = attr->ia_uid;
2781                 if (attr->ia_valid & ATTR_GID)
2782                         inode->i_gid = attr->ia_gid;
2783                 error = ext3_mark_inode_dirty(handle, inode);
2784                 ext3_journal_stop(handle);
2785         }
2786
2787         if (S_ISREG(inode->i_mode) &&
2788             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2789                 handle_t *handle;
2790
2791                 handle = ext3_journal_start(inode, 3);
2792                 if (IS_ERR(handle)) {
2793                         error = PTR_ERR(handle);
2794                         goto err_out;
2795                 }
2796
2797                 error = ext3_orphan_add(handle, inode);
2798                 EXT3_I(inode)->i_disksize = attr->ia_size;
2799                 rc = ext3_mark_inode_dirty(handle, inode);
2800                 if (!error)
2801                         error = rc;
2802                 ext3_journal_stop(handle);
2803         }
2804
2805         rc = inode_setattr(inode, attr);
2806
2807         /* If inode_setattr's call to ext3_truncate failed to get a
2808          * transaction handle at all, we need to clean up the in-core
2809          * orphan list manually. */
2810         if (inode->i_nlink)
2811                 ext3_orphan_del(NULL, inode);
2812
2813         if (!rc && (ia_valid & ATTR_MODE))
2814                 rc = ext3_acl_chmod(inode);
2815
2816 err_out:
2817         ext3_std_error(inode->i_sb, error);
2818         if (!error)
2819                 error = rc;
2820         return error;
2821 }
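
/*
 * A hedged worked example of the quota-transfer credit computation in
 * ext3_setattr() above: with user and group quotas both enabled, a
 * combined chown+chgrp may touch the old and the new quota structure
 * for each of the two id types, hence the
 * 2 * (EXT3_QUOTA_INIT_BLOCKS + EXT3_QUOTA_DEL_BLOCKS) credits, plus 3
 * more for the inode update.  If, say, INIT costs 2 blocks and DEL
 * costs 2 (illustrative numbers only - the real values depend on the
 * quota format), the handle is started with 2 * (2 + 2) + 3 = 11
 * credits.
 */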
2822
2823
2824 /*
2825  * akpm: how many blocks doth make a writepage()?
2826  *
2827  * With N blocks per page, it may be:
2828  * N data blocks
2829  * 2 indirect blocks
2830  * 2 dindirect blocks
2831  * 1 tindirect block
2832  * N+5 bitmap blocks (from the above)
2833  * N+5 group descriptor summary blocks
2834  * 1 inode block
2835  * 1 superblock.
2836  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
2837  *
2838  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2839  *
2840  * With ordered or writeback data it's the same, less the N data blocks.
2841  *
2842  * If the inode's direct blocks can hold an integral number of pages then a
2843  * page cannot straddle two indirect blocks, and we can only touch one indirect
2844  * and dindirect block, and the "5" above becomes "3".
2845  *
2846  * This still overestimates under most circumstances.  If we were to pass the
2847  * start and end offsets in here as well we could do block_to_path() on each
2848  * block and work out the exact number of indirects which are touched.  Pah.
2849  */
2850
2851 static int ext3_writepage_trans_blocks(struct inode *inode)
2852 {
2853         int bpp = ext3_journal_blocks_per_page(inode);
2854         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2855         int ret;
2856
2857         if (ext3_should_journal_data(inode))
2858                 ret = 3 * (bpp + indirects) + 2;
2859         else
2860                 ret = 2 * (bpp + indirects) + 2;
2861
2862 #ifdef CONFIG_QUOTA
2863         /* We know that the structure was already allocated during
2864          * DQUOT_INIT so we will be updating only the data blocks + inodes */
2865         ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
2866 #endif
2867
2868         return ret;
2869 }
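
/*
 * A worked example of the estimate above (illustrative numbers): with
 * 4K pages and 1K filesystem blocks, bpp = 4; EXT3_NDIR_BLOCKS is 12
 * and 12 % 4 == 0, so indirects = 3.  In data=journal mode the result
 * is 3 * (4 + 3) + 2 = 23 credits; in ordered/writeback mode it is
 * 2 * (4 + 3) + 2 = 16, plus the quota credits when CONFIG_QUOTA is
 * set.
 */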
2870
2871 /*
2872  * The caller must have previously called ext3_reserve_inode_write().
2873  * Given this, we know that the caller already has write access to iloc->bh.
2874  */
2875 int ext3_mark_iloc_dirty(handle_t *handle,
2876                 struct inode *inode, struct ext3_iloc *iloc)
2877 {
2878         int err = 0;
2879
2880         /* the do_update_inode consumes one bh->b_count */
2881         get_bh(iloc->bh);
2882
2883         /* ext3_do_update_inode() does journal_dirty_metadata */
2884         err = ext3_do_update_inode(handle, inode, iloc);
2885         put_bh(iloc->bh);
2886         return err;
2887 }
2888
2889 /* 
2890  * On success, we end up with an outstanding reference count against
2891  * iloc->bh.  This _must_ be cleaned up later. 
2892  */
2893
2894 int
2895 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
2896                          struct ext3_iloc *iloc)
2897 {
2898         int err = 0;
2899         if (handle) {
2900                 err = ext3_get_inode_loc(inode, iloc);
2901                 if (!err) {
2902                         BUFFER_TRACE(iloc->bh, "get_write_access");
2903                         err = ext3_journal_get_write_access(handle, iloc->bh);
2904                         if (err) {
2905                                 brelse(iloc->bh);
2906                                 iloc->bh = NULL;
2907                         }
2908                 }
2909         }
2910         ext3_std_error(inode->i_sb, err);
2911         return err;
2912 }
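
/*
 * Typical usage pattern, sketched here for reference (callers such as
 * ext3_mark_inode_dirty() below, and e.g. the xattr code, follow this
 * shape):
 *
 *	struct ext3_iloc iloc;
 *
 *	err = ext3_reserve_inode_write(handle, inode, &iloc);
 *	if (!err) {
 *		... update the in-core inode ...
 *		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 *	}
 *
 * ext3_mark_iloc_dirty() consumes the bh reference taken by
 * ext3_reserve_inode_write(), so no explicit brelse() is needed on the
 * success path.
 */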
2913
2914 /*
2915  * akpm: What we do here is to mark the in-core inode as clean
2916  * with respect to inode dirtiness (it may still be data-dirty).
2917  * This means that the in-core inode may be reaped by prune_icache
2918  * without having to perform any I/O.  This is a very good thing,
2919  * because *any* task may call prune_icache - even ones which
2920  * have a transaction open against a different journal.
2921  *
2922  * Is this cheating?  Not really.  Sure, we haven't written the
2923  * inode out, but prune_icache isn't a user-visible syncing function.
2924  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2925  * we start and wait on commits.
2926  *
2927  * Is this efficient/effective?  Well, we're being nice to the system
2928  * by cleaning up our inodes proactively so they can be reaped
2929  * without I/O.  But we are potentially leaving up to five seconds'
2930  * worth of inodes floating about which prune_icache wants us to
2931  * write out.  One way to fix that would be to get prune_icache()
2932  * to do a write_super() to free up some memory.  It has the desired
2933  * effect.
2934  */
2935 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2936 {
2937         struct ext3_iloc iloc;
2938         int err;
2939
2940         might_sleep();
2941         err = ext3_reserve_inode_write(handle, inode, &iloc);
2942         if (!err)
2943                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2944         return err;
2945 }
2946
2947 /*
2948  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2949  *
2950  * We're really interested in the case where a file is being extended.
2951  * i_size has been changed by generic_commit_write() and we thus need
2952  * to include the updated inode in the current transaction.
2953  *
2954  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
2955  * are allocated to the file.
2956  *
2957  * If the inode is marked synchronous, we don't honour that here - doing
2958  * so would cause a commit on atime updates, which we don't bother doing.
2959  * We handle synchronous inodes at the highest possible level.
2960  */
2961 void ext3_dirty_inode(struct inode *inode)
2962 {
2963         handle_t *current_handle = ext3_journal_current_handle();
2964         handle_t *handle;
2965
2966         handle = ext3_journal_start(inode, 2);
2967         if (IS_ERR(handle))
2968                 goto out;
2969         if (current_handle &&
2970                 current_handle->h_transaction != handle->h_transaction) {
2971                 /* This task has a transaction open against a different fs */
2972                 printk(KERN_EMERG "%s: transactions do not match!\n",
2973                        __FUNCTION__);
2974         } else {
2975                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
2976                                 current_handle);
2977                 ext3_mark_inode_dirty(handle, inode);
2978         }
2979         ext3_journal_stop(handle);
2980 out:
2981         return;
2982 }
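
/*
 * Call-chain sketch for the file-extend case described above (assuming
 * the standard 2.6 VFS paths):
 *
 *	generic_commit_write()
 *	  -> mark_inode_dirty(inode)
 *	    -> __mark_inode_dirty(inode, I_DIRTY)
 *	      -> sb->s_op->dirty_inode(inode)	== ext3_dirty_inode()
 *
 * hence the nested-handle check against h_transaction above.
 */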
2983
2984 #ifdef AKPM
2985 /* 
2986  * Bind an inode's backing buffer_head into this transaction, to prevent
2987  * it from being flushed to disk early.  Unlike
2988  * ext3_reserve_inode_write, this leaves behind no bh reference and
2989  * returns no iloc structure, so the caller needs to repeat the iloc
2990  * lookup to mark the inode dirty later.
2991  */
2992 static inline int
2993 ext3_pin_inode(handle_t *handle, struct inode *inode)
2994 {
2995         struct ext3_iloc iloc;
2996
2997         int err = 0;
2998         if (handle) {
2999                 err = ext3_get_inode_loc(inode, &iloc);
3000                 if (!err) {
3001                         BUFFER_TRACE(iloc.bh, "get_write_access");
3002                         err = journal_get_write_access(handle, iloc.bh);
3003                         if (!err)
3004                                 err = ext3_journal_dirty_metadata(handle, 
3005                                                                   iloc.bh);
3006                         brelse(iloc.bh);
3007                 }
3008         }
3009         ext3_std_error(inode->i_sb, err);
3010         return err;
3011 }
3012 #endif
3013
3014 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3015 {
3016         journal_t *journal;
3017         handle_t *handle;
3018         int err;
3019
3020         /*
3021          * We have to be very careful here: changing a data block's
3022          * journaling status dynamically is dangerous.  If we write a
3023          * data block to the journal, change the status and then delete
3024          * that block, we risk forgetting to revoke the old log record
3025          * from the journal and so a subsequent replay can corrupt data.
3026          * So, first we make sure that the journal is empty and that
3027          * nobody is changing anything.
3028          */
3029
3030         journal = EXT3_JOURNAL(inode);
3031         if (is_journal_aborted(journal) || IS_RDONLY(inode))
3032                 return -EROFS;
3033
3034         journal_lock_updates(journal);
3035         journal_flush(journal);
3036
3037         /*
3038          * OK, there are no updates running now, and all cached data is
3039          * synced to disk.  We are now in a completely consistent state
3040          * which doesn't have anything in the journal, and we know that
3041          * no filesystem updates are running, so it is safe to modify
3042          * the inode's in-core data-journaling state flag now.
3043          */
3044
3045         if (val)
3046                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3047         else
3048                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3049         ext3_set_aops(inode);
3050
3051         journal_unlock_updates(journal);
3052
3053         /* Finally we can mark the inode as dirty. */
3054
3055         handle = ext3_journal_start(inode, 1);
3056         if (IS_ERR(handle))
3057                 return PTR_ERR(handle);
3058
3059         err = ext3_mark_inode_dirty(handle, inode);
3060         handle->h_sync = 1;
3061         ext3_journal_stop(handle);
3062         ext3_std_error(inode->i_sb, err);
3063
3064         return err;
3065 }
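
/*
 * Illustrative caller (a sketch, not a quote of the ioctl code): the
 * EXT3_IOC_SETFLAGS ioctl routes changes to the data-journaling flag
 * through ext3_change_inode_journal_flag() so that the journal is
 * quiesced before the flag flips:
 *
 *	if ((flags ^ oldflags) & EXT3_JOURNAL_DATA_FL) {
 *		err = ext3_change_inode_journal_flag(inode,
 *				flags & EXT3_JOURNAL_DATA_FL);
 *		if (err)
 *			return err;
 *	}
 */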