ext4: Fix lock inversion in ext4_ext_truncate()
fs/ext4/extents.c (linux-2.6)
1 /*
2  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3  * Written by Alex Tomas <alex@clusterfs.com>
4  *
5  * Architecture independence:
6  *   Copyright (c) 2005, Bull S.A.
7  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
21  */
22
23 /*
24  * Extents support for EXT4
25  *
26  * TODO:
27  *   - ext4*_error() should be used in some situations
28  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29  *   - smart tree reduction
30  */
31
32 #include <linux/module.h>
33 #include <linux/fs.h>
34 #include <linux/time.h>
35 #include <linux/jbd2.h>
36 #include <linux/highuid.h>
37 #include <linux/pagemap.h>
38 #include <linux/quotaops.h>
39 #include <linux/string.h>
40 #include <linux/slab.h>
41 #include <linux/falloc.h>
42 #include <asm/uaccess.h>
43 #include "ext4_jbd2.h"
44 #include "ext4_extents.h"
45
46
47 /*
48  * ext_pblock:
49  * combine low and high parts of physical block number into ext4_fsblk_t
50  */
51 static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
52 {
53         ext4_fsblk_t block;
54
55         block = le32_to_cpu(ex->ee_start_lo);
56         block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
57         return block;
58 }
59
60 /*
61  * idx_pblock:
62  * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63  */
64 ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65 {
66         ext4_fsblk_t block;
67
68         block = le32_to_cpu(ix->ei_leaf_lo);
69         block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
70         return block;
71 }
72
73 /*
74  * ext4_ext_store_pblock:
75  * stores a large physical block number into an extent struct,
76  * breaking it into parts
77  */
78 void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79 {
80         ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81         ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
82 }
83
84 /*
85  * ext4_idx_store_pblock:
86  * stores a large physical block number into an index struct,
87  * breaking it into parts
88  */
89 static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
90 {
91         ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
92         ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93 }
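
/*
 * Editor's illustration (not upstream code): ext4_fsblk_t holds a
 * 48-bit physical block number as a 32-bit low part plus a 16-bit
 * high part.  For pb = 0x123456789abc:
 *
 *      ee_start_lo = pb & 0xffffffff = 0x56789abc
 *      ee_start_hi = (pb >> 31) >> 1 = 0x1234
 *      pblock      = lo | ((hi << 31) << 1) = 0x123456789abc again
 *
 * The two-step shift equals a shift by 32 but, presumably, stays
 * well-defined on configurations where the block type is only 32 bits
 * wide and a single shift by 32 would be undefined behaviour.
 */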
94
95 static int ext4_ext_journal_restart(handle_t *handle, int needed)
96 {
97         int err;
98
99         if (handle->h_buffer_credits > needed)
100                 return 0;
101         err = ext4_journal_extend(handle, needed);
102         if (err)
103                 return err;
104         return ext4_journal_restart(handle, needed);
105 }
106
107 /*
108  * could return:
109  *  - EROFS
110  *  - ENOMEM
111  */
112 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
113                                 struct ext4_ext_path *path)
114 {
115         if (path->p_bh) {
116                 /* path points to block */
117                 return ext4_journal_get_write_access(handle, path->p_bh);
118         }
119         /* path points to leaf/index in inode body */
120         /* we use in-core data, no need to protect them */
121         return 0;
122 }
123
124 /*
125  * could return:
126  *  - EROFS
127  *  - ENOMEM
128  *  - EIO
129  */
130 static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
131                                 struct ext4_ext_path *path)
132 {
133         int err;
134         if (path->p_bh) {
135                 /* path points to block */
136                 err = ext4_journal_dirty_metadata(handle, path->p_bh);
137         } else {
138                 /* path points to leaf/index in inode body */
139                 err = ext4_mark_inode_dirty(handle, inode);
140         }
141         return err;
142 }
143
144 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
145                               struct ext4_ext_path *path,
146                               ext4_lblk_t block)
147 {
148         struct ext4_inode_info *ei = EXT4_I(inode);
149         ext4_fsblk_t bg_start;
150         ext4_fsblk_t last_block;
151         ext4_grpblk_t colour;
152         int depth;
153
154         if (path) {
155                 struct ext4_extent *ex;
156                 depth = path->p_depth;
157
158                 /* try to predict block placement */
159                 ex = path[depth].p_ext;
160                 if (ex)
161                         return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
162
163                 /* it looks like the index is empty;
164                  * try to find the starting block from the index itself */
165                 if (path[depth].p_bh)
166                         return path[depth].p_bh->b_blocknr;
167         }
168
169         /* OK. use inode's group */
170         bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
171                 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
172         last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
173
174         if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
175                 colour = (current->pid % 16) *
176                         (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
177         else
178                 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
179         return bg_start + colour + block;
180 }
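
/*
 * Worked example (editor's sketch; numbers assumed): with
 * EXT4_BLOCKS_PER_GROUP == 32768 and a caller whose pid % 16 == 5,
 * a lookup that finds no hint in the path yields
 *
 *      colour = 5 * (32768 / 16) = 10240
 *      goal   = bg_start + 10240 + block
 *
 * so writers with different pids target different sixteenths of the
 * inode's block group, spreading concurrent allocations apart.
 */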
181
182 /*
183  * Allocation for a meta data block
184  */
185 static ext4_fsblk_t
186 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
187                         struct ext4_ext_path *path,
188                         struct ext4_extent *ex, int *err)
189 {
190         ext4_fsblk_t goal, newblock;
191
192         goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
193         newblock = ext4_new_meta_block(handle, inode, goal, err);
194         return newblock;
195 }
196
197 static int ext4_ext_space_block(struct inode *inode)
198 {
199         int size;
200
201         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
202                         / sizeof(struct ext4_extent);
203 #ifdef AGGRESSIVE_TEST
204         if (size > 6)
205                 size = 6;
206 #endif
207         return size;
208 }
209
210 static int ext4_ext_space_block_idx(struct inode *inode)
211 {
212         int size;
213
214         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
215                         / sizeof(struct ext4_extent_idx);
216 #ifdef AGGRESSIVE_TEST
217         if (size > 5)
218                 size = 5;
219 #endif
220         return size;
221 }
222
223 static int ext4_ext_space_root(struct inode *inode)
224 {
225         int size;
226
227         size = sizeof(EXT4_I(inode)->i_data);
228         size -= sizeof(struct ext4_extent_header);
229         size /= sizeof(struct ext4_extent);
230 #ifdef AGGRESSIVE_TEST
231         if (size > 3)
232                 size = 3;
233 #endif
234         return size;
235 }
236
237 static int ext4_ext_space_root_idx(struct inode *inode)
238 {
239         int size;
240
241         size = sizeof(EXT4_I(inode)->i_data);
242         size -= sizeof(struct ext4_extent_header);
243         size /= sizeof(struct ext4_extent_idx);
244 #ifdef AGGRESSIVE_TEST
245         if (size > 4)
246                 size = 4;
247 #endif
248         return size;
249 }
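
/*
 * Capacity arithmetic, for illustration (assuming 4 KiB blocks; the
 * extent header, extent and index records are 12 bytes each on disk,
 * and the in-inode i_data area is 60 bytes):
 *
 *      per index/leaf block: (4096 - 12) / 12 = 340 entries
 *      in the inode root:    (  60 - 12) / 12 =   4 entries
 *
 * The AGGRESSIVE_TEST clamps above shrink these limits so deep trees
 * can be exercised with small files.
 */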
250
251 static int
252 ext4_ext_max_entries(struct inode *inode, int depth)
253 {
254         int max;
255
256         if (depth == ext_depth(inode)) {
257                 if (depth == 0)
258                         max = ext4_ext_space_root(inode);
259                 else
260                         max = ext4_ext_space_root_idx(inode);
261         } else {
262                 if (depth == 0)
263                         max = ext4_ext_space_block(inode);
264                 else
265                         max = ext4_ext_space_block_idx(inode);
266         }
267
268         return max;
269 }
270
271 static int __ext4_ext_check_header(const char *function, struct inode *inode,
272                                         struct ext4_extent_header *eh,
273                                         int depth)
274 {
275         const char *error_msg;
276         int max = 0;
277
278         if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
279                 error_msg = "invalid magic";
280                 goto corrupted;
281         }
282         if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
283                 error_msg = "unexpected eh_depth";
284                 goto corrupted;
285         }
286         if (unlikely(eh->eh_max == 0)) {
287                 error_msg = "invalid eh_max";
288                 goto corrupted;
289         }
290         max = ext4_ext_max_entries(inode, depth);
291         if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
292                 error_msg = "too large eh_max";
293                 goto corrupted;
294         }
295         if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
296                 error_msg = "invalid eh_entries";
297                 goto corrupted;
298         }
299         return 0;
300
301 corrupted:
302         ext4_error(inode->i_sb, function,
303                         "bad header in inode #%lu: %s - magic %x, "
304                         "entries %u, max %u(%u), depth %u(%u)",
305                         inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
306                         le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
307                         max, le16_to_cpu(eh->eh_depth), depth);
308
309         return -EIO;
310 }
311
312 #define ext4_ext_check_header(inode, eh, depth) \
313         __ext4_ext_check_header(__func__, inode, eh, depth)
314
315 #ifdef EXT_DEBUG
316 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
317 {
318         int k, l = path->p_depth;
319
320         ext_debug("path:");
321         for (k = 0; k <= l; k++, path++) {
322                 if (path->p_idx) {
323                         ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
324                                   idx_pblock(path->p_idx));
325                 } else if (path->p_ext) {
326                         ext_debug("  %d:%d:%llu ",
327                                   le32_to_cpu(path->p_ext->ee_block),
328                                   ext4_ext_get_actual_len(path->p_ext),
329                                   ext_pblock(path->p_ext));
330                 } else
331                         ext_debug("  []");
332         }
333         ext_debug("\n");
334 }
335
336 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
337 {
338         int depth = ext_depth(inode);
339         struct ext4_extent_header *eh;
340         struct ext4_extent *ex;
341         int i;
342
343         if (!path)
344                 return;
345
346         eh = path[depth].p_hdr;
347         ex = EXT_FIRST_EXTENT(eh);
348
349         for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
350                 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
351                           ext4_ext_get_actual_len(ex), ext_pblock(ex));
352         }
353         ext_debug("\n");
354 }
355 #else
356 #define ext4_ext_show_path(inode,path)
357 #define ext4_ext_show_leaf(inode,path)
358 #endif
359
360 void ext4_ext_drop_refs(struct ext4_ext_path *path)
361 {
362         int depth = path->p_depth;
363         int i;
364
365         for (i = 0; i <= depth; i++, path++)
366                 if (path->p_bh) {
367                         brelse(path->p_bh);
368                         path->p_bh = NULL;
369                 }
370 }
371
372 /*
373  * ext4_ext_binsearch_idx:
374  * binary search for the closest index of the given block
375  * the header must be checked before calling this
376  */
377 static void
378 ext4_ext_binsearch_idx(struct inode *inode,
379                         struct ext4_ext_path *path, ext4_lblk_t block)
380 {
381         struct ext4_extent_header *eh = path->p_hdr;
382         struct ext4_extent_idx *r, *l, *m;
383
384
385         ext_debug("binsearch for %u(idx):  ", block);
386
387         l = EXT_FIRST_INDEX(eh) + 1;
388         r = EXT_LAST_INDEX(eh);
389         while (l <= r) {
390                 m = l + (r - l) / 2;
391                 if (block < le32_to_cpu(m->ei_block))
392                         r = m - 1;
393                 else
394                         l = m + 1;
395                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
396                                 m, le32_to_cpu(m->ei_block),
397                                 r, le32_to_cpu(r->ei_block));
398         }
399
400         path->p_idx = l - 1;
401         ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
402                   idx_pblock(path->p_idx));
403
404 #ifdef CHECK_BINSEARCH
405         {
406                 struct ext4_extent_idx *chix, *ix;
407                 int k;
408
409                 chix = ix = EXT_FIRST_INDEX(eh);
410                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
411                         if (k != 0 &&
412                             le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
413                                 printk("k=%d, ix=0x%p, first=0x%p\n", k,
414                                         ix, EXT_FIRST_INDEX(eh));
415                                 printk("%u <= %u\n",
416                                        le32_to_cpu(ix->ei_block),
417                                        le32_to_cpu(ix[-1].ei_block));
418                         }
419                         BUG_ON(k && le32_to_cpu(ix->ei_block)
420                                            <= le32_to_cpu(ix[-1].ei_block));
421                         if (block < le32_to_cpu(ix->ei_block))
422                                 break;
423                         chix = ix;
424                 }
425                 BUG_ON(chix != path->p_idx);
426         }
427 #endif
428
429 }
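
/*
 * Example run (editor's sketch): with index keys {0, 100, 200, 300}
 * stored in e[0..3] and a lookup for block 150:
 *
 *      l = &e[1], r = &e[3]
 *      m = &e[2] (key 200): 150 < 200  -> r = &e[1]
 *      m = &e[1] (key 100): 150 >= 100 -> l = &e[2]; loop ends
 *      p_idx = l - 1 = &e[1]
 *
 * i.e. the rightmost entry whose key does not exceed 150.  Starting l
 * at EXT_FIRST_INDEX() + 1 is safe: when every other entry starts too
 * far right, l - 1 falls back to the first entry.
 * ext4_ext_binsearch() below runs the same algorithm on leaf extents.
 */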
430
431 /*
432  * ext4_ext_binsearch:
433  * binary search for closest extent of the given block
434  * the header must be checked before calling this
435  */
436 static void
437 ext4_ext_binsearch(struct inode *inode,
438                 struct ext4_ext_path *path, ext4_lblk_t block)
439 {
440         struct ext4_extent_header *eh = path->p_hdr;
441         struct ext4_extent *r, *l, *m;
442
443         if (eh->eh_entries == 0) {
444                 /*
445                  * this leaf is empty:
446                  * we get such a leaf in split/add case
447                  */
448                 return;
449         }
450
451         ext_debug("binsearch for %u:  ", block);
452
453         l = EXT_FIRST_EXTENT(eh) + 1;
454         r = EXT_LAST_EXTENT(eh);
455
456         while (l <= r) {
457                 m = l + (r - l) / 2;
458                 if (block < le32_to_cpu(m->ee_block))
459                         r = m - 1;
460                 else
461                         l = m + 1;
462                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
463                                 m, le32_to_cpu(m->ee_block),
464                                 r, le32_to_cpu(r->ee_block));
465         }
466
467         path->p_ext = l - 1;
468         ext_debug("  -> %d:%llu:%d ",
469                         le32_to_cpu(path->p_ext->ee_block),
470                         ext_pblock(path->p_ext),
471                         ext4_ext_get_actual_len(path->p_ext));
472
473 #ifdef CHECK_BINSEARCH
474         {
475                 struct ext4_extent *chex, *ex;
476                 int k;
477
478                 chex = ex = EXT_FIRST_EXTENT(eh);
479                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
480                         BUG_ON(k && le32_to_cpu(ex->ee_block)
481                                           <= le32_to_cpu(ex[-1].ee_block));
482                         if (block < le32_to_cpu(ex->ee_block))
483                                 break;
484                         chex = ex;
485                 }
486                 BUG_ON(chex != path->p_ext);
487         }
488 #endif
489
490 }
491
492 int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
493 {
494         struct ext4_extent_header *eh;
495
496         eh = ext_inode_hdr(inode);
497         eh->eh_depth = 0;
498         eh->eh_entries = 0;
499         eh->eh_magic = EXT4_EXT_MAGIC;
500         eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
501         ext4_mark_inode_dirty(handle, inode);
502         ext4_ext_invalidate_cache(inode);
503         return 0;
504 }
505
506 struct ext4_ext_path *
507 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
508                                         struct ext4_ext_path *path)
509 {
510         struct ext4_extent_header *eh;
511         struct buffer_head *bh;
512         short int depth, i, ppos = 0, alloc = 0;
513
514         eh = ext_inode_hdr(inode);
515         depth = ext_depth(inode);
516         if (ext4_ext_check_header(inode, eh, depth))
517                 return ERR_PTR(-EIO);
518
519
520         /* account possible depth increase */
521         if (!path) {
522                 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
523                                 GFP_NOFS);
524                 if (!path)
525                         return ERR_PTR(-ENOMEM);
526                 alloc = 1;
527         }
528         path[0].p_hdr = eh;
529         path[0].p_bh = NULL;
530
531         i = depth;
532         /* walk through the tree */
533         while (i) {
534                 ext_debug("depth %d: num %d, max %d\n",
535                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
536
537                 ext4_ext_binsearch_idx(inode, path + ppos, block);
538                 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
539                 path[ppos].p_depth = i;
540                 path[ppos].p_ext = NULL;
541
542                 bh = sb_bread(inode->i_sb, path[ppos].p_block);
543                 if (!bh)
544                         goto err;
545
546                 eh = ext_block_hdr(bh);
547                 ppos++;
548                 BUG_ON(ppos > depth);
549                 path[ppos].p_bh = bh;
550                 path[ppos].p_hdr = eh;
551                 i--;
552
553                 if (ext4_ext_check_header(inode, eh, i))
554                         goto err;
555         }
556
557         path[ppos].p_depth = i;
558         path[ppos].p_ext = NULL;
559         path[ppos].p_idx = NULL;
560
561         /* find extent */
562         ext4_ext_binsearch(inode, path + ppos, block);
563         /* if not an empty leaf */
564         if (path[ppos].p_ext)
565                 path[ppos].p_block = ext_pblock(path[ppos].p_ext);
566
567         ext4_ext_show_path(inode, path);
568
569         return path;
570
571 err:
572         ext4_ext_drop_refs(path);
573         if (alloc)
574                 kfree(path);
575         return ERR_PTR(-EIO);
576 }
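
/*
 * Resulting path layout, for illustration, after looking up logical
 * block N in a depth-2 tree:
 *
 *      path[0]: p_hdr = root in the inode, p_idx -> chosen root index
 *      path[1]: p_bh = index block, p_idx -> chosen index entry
 *      path[2]: p_bh = leaf block,  p_ext -> extent covering N, or
 *                                   the closest extent to its left
 *
 * The caller must drop the buffer references with ext4_ext_drop_refs()
 * and, if it passed path == NULL, kfree() the array when done.
 */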
577
578 /*
579  * ext4_ext_insert_index:
580  * insert new index [@logical;@ptr] into the block at @curp;
581  * check where to insert: before @curp or after @curp
582  */
583 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
584                                 struct ext4_ext_path *curp,
585                                 int logical, ext4_fsblk_t ptr)
586 {
587         struct ext4_extent_idx *ix;
588         int len, err;
589
590         err = ext4_ext_get_access(handle, inode, curp);
591         if (err)
592                 return err;
593
594         BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
595         len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
596         if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
597                 /* insert after */
598                 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
599                         len = (len - 1) * sizeof(struct ext4_extent_idx);
600                         len = len < 0 ? 0 : len;
601                         ext_debug("insert new index %d after: %llu. "
602                                         "move %d from 0x%p to 0x%p\n",
603                                         logical, ptr, len,
604                                         (curp->p_idx + 1), (curp->p_idx + 2));
605                         memmove(curp->p_idx + 2, curp->p_idx + 1, len);
606                 }
607                 ix = curp->p_idx + 1;
608         } else {
609                 /* insert before */
610                 len = len * sizeof(struct ext4_extent_idx);
611                 len = len < 0 ? 0 : len;
612                 ext_debug("insert new index %d before: %llu. "
613                                 "move %d from 0x%p to 0x%p\n",
614                                 logical, ptr, len,
615                                 curp->p_idx, (curp->p_idx + 1));
616                 memmove(curp->p_idx + 1, curp->p_idx, len);
617                 ix = curp->p_idx;
618         }
619
620         ix->ei_block = cpu_to_le32(logical);
621         ext4_idx_store_pblock(ix, ptr);
622         le16_add_cpu(&curp->p_hdr->eh_entries, 1);
623
624         BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
625                              > le16_to_cpu(curp->p_hdr->eh_max));
626         BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
627
628         err = ext4_ext_dirty(handle, inode, curp);
629         ext4_std_error(inode->i_sb, err);
630
631         return err;
632 }
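
/*
 * Sketch of the "insert after" branch (illustrative keys): with
 * indexes {10, 50, 90}, curp->p_idx at key 50 and a new entry for
 * logical block 70, the tail {90} is memmove()d one slot right and
 * the new entry fills the gap, giving {10, 50, 70, 90}.  The "insert
 * before" branch shifts from curp->p_idx itself when the new key is
 * smaller than the current one.
 */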
633
634 /*
635  * ext4_ext_split:
636  * inserts new subtree into the path, using free index entry
637  * at depth @at:
638  * - allocates all needed blocks (new leaf and all intermediate index blocks)
639  * - makes decision where to split
640  * - moves remaining extents and index entries (right to the split point)
641  *   into the newly allocated blocks
642  * - initializes subtree
643  */
644 static int ext4_ext_split(handle_t *handle, struct inode *inode,
645                                 struct ext4_ext_path *path,
646                                 struct ext4_extent *newext, int at)
647 {
648         struct buffer_head *bh = NULL;
649         int depth = ext_depth(inode);
650         struct ext4_extent_header *neh;
651         struct ext4_extent_idx *fidx;
652         struct ext4_extent *ex;
653         int i = at, k, m, a;
654         ext4_fsblk_t newblock, oldblock;
655         __le32 border;
656         ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
657         int err = 0;
658
659         /* make decision: where to split? */
660         /* FIXME: for now the decision is the simplest one: at the current extent */
661
662         /* if the current leaf is going to be split, we should use
663          * the border from the split point */
664         BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
665         if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
666                 border = path[depth].p_ext[1].ee_block;
667                 ext_debug("leaf will be split."
668                                 " next leaf starts at %d\n",
669                                   le32_to_cpu(border));
670         } else {
671                 border = newext->ee_block;
672                 ext_debug("leaf will be added."
673                                 " next leaf starts at %d\n",
674                                 le32_to_cpu(border));
675         }
676
677         /*
678          * If an error occurs, we abort processing
679          * and mark the filesystem read-only. The index won't
680          * be inserted and the tree will stay in a consistent
681          * state. The next mount will repair the buffers too.
682          */
683
684         /*
685          * Get an array to track all allocated blocks.
686          * We need it to handle errors and to free the
687          * blocks on failure.
688          */
689         ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
690         if (!ablocks)
691                 return -ENOMEM;
692
693         /* allocate all needed blocks */
694         ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
695         for (a = 0; a < depth - at; a++) {
696                 newblock = ext4_ext_new_meta_block(handle, inode, path,
697                                                    newext, &err);
698                 if (newblock == 0)
699                         goto cleanup;
700                 ablocks[a] = newblock;
701         }
702
703         /* initialize new leaf */
704         newblock = ablocks[--a];
705         BUG_ON(newblock == 0);
706         bh = sb_getblk(inode->i_sb, newblock);
707         if (!bh) {
708                 err = -EIO;
709                 goto cleanup;
710         }
711         lock_buffer(bh);
712
713         err = ext4_journal_get_create_access(handle, bh);
714         if (err)
715                 goto cleanup;
716
717         neh = ext_block_hdr(bh);
718         neh->eh_entries = 0;
719         neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
720         neh->eh_magic = EXT4_EXT_MAGIC;
721         neh->eh_depth = 0;
722         ex = EXT_FIRST_EXTENT(neh);
723
724         /* move remainder of path[depth] to the new leaf */
725         BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
726         /* start copy from next extent */
727         /* TODO: we could do it by single memmove */
728         m = 0;
729         path[depth].p_ext++;
730         while (path[depth].p_ext <=
731                         EXT_MAX_EXTENT(path[depth].p_hdr)) {
732                 ext_debug("move %d:%llu:%d in new leaf %llu\n",
733                                 le32_to_cpu(path[depth].p_ext->ee_block),
734                                 ext_pblock(path[depth].p_ext),
735                                 ext4_ext_get_actual_len(path[depth].p_ext),
736                                 newblock);
737                 /*memmove(ex++, path[depth].p_ext++,
738                                 sizeof(struct ext4_extent));
739                 neh->eh_entries++;*/
740                 path[depth].p_ext++;
741                 m++;
742         }
743         if (m) {
744                 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
745                 le16_add_cpu(&neh->eh_entries, m);
746         }
747
748         set_buffer_uptodate(bh);
749         unlock_buffer(bh);
750
751         err = ext4_journal_dirty_metadata(handle, bh);
752         if (err)
753                 goto cleanup;
754         brelse(bh);
755         bh = NULL;
756
757         /* correct old leaf */
758         if (m) {
759                 err = ext4_ext_get_access(handle, inode, path + depth);
760                 if (err)
761                         goto cleanup;
762                 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
763                 err = ext4_ext_dirty(handle, inode, path + depth);
764                 if (err)
765                         goto cleanup;
766
767         }
768
769         /* create intermediate indexes */
770         k = depth - at - 1;
771         BUG_ON(k < 0);
772         if (k)
773                 ext_debug("create %d intermediate indices\n", k);
774         /* insert new index into current index block */
775         /* current depth stored in i var */
776         i = depth - 1;
777         while (k--) {
778                 oldblock = newblock;
779                 newblock = ablocks[--a];
780                 bh = sb_getblk(inode->i_sb, newblock);
781                 if (!bh) {
782                         err = -EIO;
783                         goto cleanup;
784                 }
785                 lock_buffer(bh);
786
787                 err = ext4_journal_get_create_access(handle, bh);
788                 if (err)
789                         goto cleanup;
790
791                 neh = ext_block_hdr(bh);
792                 neh->eh_entries = cpu_to_le16(1);
793                 neh->eh_magic = EXT4_EXT_MAGIC;
794                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
795                 neh->eh_depth = cpu_to_le16(depth - i);
796                 fidx = EXT_FIRST_INDEX(neh);
797                 fidx->ei_block = border;
798                 ext4_idx_store_pblock(fidx, oldblock);
799
800                 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
801                                 i, newblock, le32_to_cpu(border), oldblock);
802                 /* copy indexes */
803                 m = 0;
804                 path[i].p_idx++;
805
806                 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
807                                 EXT_MAX_INDEX(path[i].p_hdr));
808                 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
809                                 EXT_LAST_INDEX(path[i].p_hdr));
810                 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
811                         ext_debug("%d: move %d:%llu in new index %llu\n", i,
812                                         le32_to_cpu(path[i].p_idx->ei_block),
813                                         idx_pblock(path[i].p_idx),
814                                         newblock);
815                         /*memmove(++fidx, path[i].p_idx++,
816                                         sizeof(struct ext4_extent_idx));
817                         neh->eh_entries++;
818                         BUG_ON(neh->eh_entries > neh->eh_max);*/
819                         path[i].p_idx++;
820                         m++;
821                 }
822                 if (m) {
823                         memmove(++fidx, path[i].p_idx - m,
824                                 sizeof(struct ext4_extent_idx) * m);
825                         le16_add_cpu(&neh->eh_entries, m);
826                 }
827                 set_buffer_uptodate(bh);
828                 unlock_buffer(bh);
829
830                 err = ext4_journal_dirty_metadata(handle, bh);
831                 if (err)
832                         goto cleanup;
833                 brelse(bh);
834                 bh = NULL;
835
836                 /* correct old index */
837                 if (m) {
838                         err = ext4_ext_get_access(handle, inode, path + i);
839                         if (err)
840                                 goto cleanup;
841                         le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
842                         err = ext4_ext_dirty(handle, inode, path + i);
843                         if (err)
844                                 goto cleanup;
845                 }
846
847                 i--;
848         }
849
850         /* insert new index */
851         err = ext4_ext_insert_index(handle, inode, path + at,
852                                     le32_to_cpu(border), newblock);
853
854 cleanup:
855         if (bh) {
856                 if (buffer_locked(bh))
857                         unlock_buffer(bh);
858                 brelse(bh);
859         }
860
861         if (err) {
862                 /* free all allocated blocks in error case */
863                 for (i = 0; i < depth; i++) {
864                         if (!ablocks[i])
865                                 continue;
866                         ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
867                 }
868         }
869         kfree(ablocks);
870
871         return err;
872 }
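
/*
 * Example (editor's walkthrough): depth == 2 with a free index slot
 * at level at == 1.  Then depth - at == 1 block is allocated (the new
 * leaf only), the extents to the right of path[2].p_ext move into it,
 * k == depth - at - 1 == 0 so no intermediate index blocks are built,
 * and ext4_ext_insert_index() hooks the new leaf into path[1] under
 * the key 'border'.  With at == 0 the same loop would also build
 * depth - 1 new index blocks, each pointing one level down.
 */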
873
874 /*
875  * ext4_ext_grow_indepth:
876  * implements tree growing procedure:
877  * - allocates new block
878  * - moves top-level data (index block or leaf) into the new block
879  * - initializes new top-level, creating index that points to the
880  *   just created block
881  */
882 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
883                                         struct ext4_ext_path *path,
884                                         struct ext4_extent *newext)
885 {
886         struct ext4_ext_path *curp = path;
887         struct ext4_extent_header *neh;
888         struct ext4_extent_idx *fidx;
889         struct buffer_head *bh;
890         ext4_fsblk_t newblock;
891         int err = 0;
892
893         newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
894         if (newblock == 0)
895                 return err;
896
897         bh = sb_getblk(inode->i_sb, newblock);
898         if (!bh) {
899                 err = -EIO;
900                 ext4_std_error(inode->i_sb, err);
901                 return err;
902         }
903         lock_buffer(bh);
904
905         err = ext4_journal_get_create_access(handle, bh);
906         if (err) {
907                 unlock_buffer(bh);
908                 goto out;
909         }
910
911         /* move top-level index/leaf into new block */
912         memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
913
914         /* set size of new block */
915         neh = ext_block_hdr(bh);
916         /* the old root could hold indexes or leaves,
917          * so calculate eh_max the right way */
918         if (ext_depth(inode))
919                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
920         else
921                 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
922         neh->eh_magic = EXT4_EXT_MAGIC;
923         set_buffer_uptodate(bh);
924         unlock_buffer(bh);
925
926         err = ext4_journal_dirty_metadata(handle, bh);
927         if (err)
928                 goto out;
929
930         /* create index in new top-level index: num,max,pointer */
931         err = ext4_ext_get_access(handle, inode, curp);
932         if (err)
933                 goto out;
934
935         curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
936         curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
937         curp->p_hdr->eh_entries = cpu_to_le16(1);
938         curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
939
940         if (path[0].p_hdr->eh_depth)
941                 curp->p_idx->ei_block =
942                         EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
943         else
944                 curp->p_idx->ei_block =
945                         EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
946         ext4_idx_store_pblock(curp->p_idx, newblock);
947
948         neh = ext_inode_hdr(inode);
949         fidx = EXT_FIRST_INDEX(neh);
950         ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
951                   le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
952                   le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
953
954         neh->eh_depth = cpu_to_le16(path->p_depth + 1);
955         err = ext4_ext_dirty(handle, inode, curp);
956 out:
957         brelse(bh);
958
959         return err;
960 }
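
/*
 * Effect, for illustration: if the root was a full depth-0 leaf with
 * 4 extents in the inode body, those 4 extents now live in the newly
 * allocated block and the root turns into a single index entry that
 * points at it, with eh_depth bumped from 0 to 1.  The logical start
 * of the copied data (its first extent or first index) becomes the
 * key of that lone index entry.
 */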
961
962 /*
963  * ext4_ext_create_new_leaf:
964  * finds an empty index slot and adds a new leaf;
965  * if no free index is found, it grows the tree in depth instead.
966  */
967 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
968                                         struct ext4_ext_path *path,
969                                         struct ext4_extent *newext)
970 {
971         struct ext4_ext_path *curp;
972         int depth, i, err = 0;
973
974 repeat:
975         i = depth = ext_depth(inode);
976
977         /* walk up the tree and look for a free index entry */
978         curp = path + depth;
979         while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
980                 i--;
981                 curp--;
982         }
983
984         /* we use already allocated block for index block,
985          * so subsequent data blocks should be contiguous */
986         if (EXT_HAS_FREE_INDEX(curp)) {
987                 /* if we found index with free entry, then use that
988                  * entry: create all needed subtree and add new leaf */
989                 err = ext4_ext_split(handle, inode, path, newext, i);
990                 if (err)
991                         goto out;
992
993                 /* refill path */
994                 ext4_ext_drop_refs(path);
995                 path = ext4_ext_find_extent(inode,
996                                     (ext4_lblk_t)le32_to_cpu(newext->ee_block),
997                                     path);
998                 if (IS_ERR(path))
999                         err = PTR_ERR(path);
1000         } else {
1001                 /* tree is full, time to grow in depth */
1002                 err = ext4_ext_grow_indepth(handle, inode, path, newext);
1003                 if (err)
1004                         goto out;
1005
1006                 /* refill path */
1007                 ext4_ext_drop_refs(path);
1008                 path = ext4_ext_find_extent(inode,
1009                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1010                                     path);
1011                 if (IS_ERR(path)) {
1012                         err = PTR_ERR(path);
1013                         goto out;
1014                 }
1015
1016                 /*
1017                  * only the first grow (depth 0 -> 1) produces free space;
1018                  * in all other cases we have to split the grown tree
1019                  */
1020                 depth = ext_depth(inode);
1021                 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1022                         /* now we need to split */
1023                         goto repeat;
1024                 }
1025         }
1026
1027 out:
1028         return err;
1029 }
1030
1031 /*
1032  * search the closest allocated block to the left of *logical
1033  * and return it at @logical together with its physical address at @phys;
1034  * if *logical is the smallest allocated block, the function
1035  * returns 0 at @phys
1036  * the return value is 0 (success) or an error code
1037  */
1038 int
1039 ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1040                         ext4_lblk_t *logical, ext4_fsblk_t *phys)
1041 {
1042         struct ext4_extent_idx *ix;
1043         struct ext4_extent *ex;
1044         int depth, ee_len;
1045
1046         BUG_ON(path == NULL);
1047         depth = path->p_depth;
1048         *phys = 0;
1049
1050         if (depth == 0 && path->p_ext == NULL)
1051                 return 0;
1052
1053         /* usually the extent in the path covers blocks smaller
1054          * than *logical, but it can be that the extent is the
1055          * first one in the file */
1056
1057         ex = path[depth].p_ext;
1058         ee_len = ext4_ext_get_actual_len(ex);
1059         if (*logical < le32_to_cpu(ex->ee_block)) {
1060                 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1061                 while (--depth >= 0) {
1062                         ix = path[depth].p_idx;
1063                         BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1064                 }
1065                 return 0;
1066         }
1067
1068         BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1069
1070         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1071         *phys = ext_pblock(ex) + ee_len - 1;
1072         return 0;
1073 }
1074
1075 /*
1076  * search the closest allocated block to the right of *logical
1077  * and return it at @logical together with its physical address at @phys;
1078  * if *logical is the largest allocated block, the function
1079  * returns 0 at @phys
1080  * the return value is 0 (success) or an error code
1081  */
1082 int
1083 ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1084                         ext4_lblk_t *logical, ext4_fsblk_t *phys)
1085 {
1086         struct buffer_head *bh = NULL;
1087         struct ext4_extent_header *eh;
1088         struct ext4_extent_idx *ix;
1089         struct ext4_extent *ex;
1090         ext4_fsblk_t block;
1091         int depth, ee_len;
1092
1093         BUG_ON(path == NULL);
1094         depth = path->p_depth;
1095         *phys = 0;
1096
1097         if (depth == 0 && path->p_ext == NULL)
1098                 return 0;
1099
1100         /* usually the extent in the path covers blocks smaller
1101          * than *logical, but it can be that the extent is the
1102          * first one in the file */
1103
1104         ex = path[depth].p_ext;
1105         ee_len = ext4_ext_get_actual_len(ex);
1106         if (*logical < le32_to_cpu(ex->ee_block)) {
1107                 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
1108                 while (--depth >= 0) {
1109                         ix = path[depth].p_idx;
1110                         BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
1111                 }
1112                 *logical = le32_to_cpu(ex->ee_block);
1113                 *phys = ext_pblock(ex);
1114                 return 0;
1115         }
1116
1117         BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
1118
1119         if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1120                 /* next allocated block in this leaf */
1121                 ex++;
1122                 *logical = le32_to_cpu(ex->ee_block);
1123                 *phys = ext_pblock(ex);
1124                 return 0;
1125         }
1126
1127         /* go up and search for index to the right */
1128         while (--depth >= 0) {
1129                 ix = path[depth].p_idx;
1130                 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1131                         break;
1132         }
1133
1134         if (depth < 0) {
1135                 /* we've gone up to the root and
1136                  * found no index to the right */
1137                 return 0;
1138         }
1139
1140         /* we've found index to the right, let's
1141          * follow it and find the closest allocated
1142          * block to the right */
1143         ix++;
1144         block = idx_pblock(ix);
1145         while (++depth < path->p_depth) {
1146                 bh = sb_bread(inode->i_sb, block);
1147                 if (bh == NULL)
1148                         return -EIO;
1149                 eh = ext_block_hdr(bh);
1150                 if (ext4_ext_check_header(inode, eh, depth)) {
1151                         put_bh(bh);
1152                         return -EIO;
1153                 }
1154                 ix = EXT_FIRST_INDEX(eh);
1155                 block = idx_pblock(ix);
1156                 put_bh(bh);
1157         }
1158
1159         bh = sb_bread(inode->i_sb, block);
1160         if (bh == NULL)
1161                 return -EIO;
1162         eh = ext_block_hdr(bh);
1163         if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
1164                 put_bh(bh);
1165                 return -EIO;
1166         }
1167         ex = EXT_FIRST_EXTENT(eh);
1168         *logical = le32_to_cpu(ex->ee_block);
1169         *phys = ext_pblock(ex);
1170         put_bh(bh);
1171         return 0;
1172
1173 }
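
/*
 * Usage sketch (editor's illustration): both helpers expect a path
 * obtained from ext4_ext_find_extent(), roughly
 *
 *      ext4_lblk_t lblk = target;
 *      ext4_fsblk_t pblk;
 *
 *      err = ext4_ext_search_left(inode, path, &lblk, &pblk);
 *
 * and *phys == 0 afterwards means "no allocated neighbour on that
 * side".  The right-hand search may have to read index and leaf
 * blocks of a neighbouring subtree, so unlike the left-hand one it
 * can fail with -EIO.
 */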
1174
1175 /*
1176  * ext4_ext_next_allocated_block:
1177  * returns the first allocated block of the next extent, or EXT_MAX_BLOCK.
1178  * NOTE: it considers the block number from an index entry as an
1179  * allocated block. Thus, index entries have to be consistent
1180  * with the leaves.
1181  */
1182 static ext4_lblk_t
1183 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1184 {
1185         int depth;
1186
1187         BUG_ON(path == NULL);
1188         depth = path->p_depth;
1189
1190         if (depth == 0 && path->p_ext == NULL)
1191                 return EXT_MAX_BLOCK;
1192
1193         while (depth >= 0) {
1194                 if (depth == path->p_depth) {
1195                         /* leaf */
1196                         if (path[depth].p_ext !=
1197                                         EXT_LAST_EXTENT(path[depth].p_hdr))
1198                                 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1199                 } else {
1200                         /* index */
1201                         if (path[depth].p_idx !=
1202                                         EXT_LAST_INDEX(path[depth].p_hdr))
1203                                 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1204                 }
1205                 depth--;
1206         }
1207
1208         return EXT_MAX_BLOCK;
1209 }
1210
1211 /*
1212  * ext4_ext_next_leaf_block:
1213  * returns first allocated block from next leaf or EXT_MAX_BLOCK
1214  */
1215 static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1216                                         struct ext4_ext_path *path)
1217 {
1218         int depth;
1219
1220         BUG_ON(path == NULL);
1221         depth = path->p_depth;
1222
1223         /* zero-tree has no leaf blocks at all */
1224         if (depth == 0)
1225                 return EXT_MAX_BLOCK;
1226
1227         /* go to index block */
1228         depth--;
1229
1230         while (depth >= 0) {
1231                 if (path[depth].p_idx !=
1232                                 EXT_LAST_INDEX(path[depth].p_hdr))
1233                         return (ext4_lblk_t)
1234                                 le32_to_cpu(path[depth].p_idx[1].ei_block);
1235                 depth--;
1236         }
1237
1238         return EXT_MAX_BLOCK;
1239 }
1240
1241 /*
1242  * ext4_ext_correct_indexes:
1243  * if leaf gets modified and modified extent is first in the leaf,
1244  * then we have to correct all indexes above.
1245  * TODO: do we need to correct tree in all cases?
1246  */
1247 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1248                                 struct ext4_ext_path *path)
1249 {
1250         struct ext4_extent_header *eh;
1251         int depth = ext_depth(inode);
1252         struct ext4_extent *ex;
1253         __le32 border;
1254         int k, err = 0;
1255
1256         eh = path[depth].p_hdr;
1257         ex = path[depth].p_ext;
1258         BUG_ON(ex == NULL);
1259         BUG_ON(eh == NULL);
1260
1261         if (depth == 0) {
1262                 /* there is no tree at all */
1263                 return 0;
1264         }
1265
1266         if (ex != EXT_FIRST_EXTENT(eh)) {
1267                 /* we correct tree if first leaf got modified only */
1268                 return 0;
1269         }
1270
1271         /*
1272          * TODO: we need correction if border is smaller than current one
1273          */
1274         k = depth - 1;
1275         border = path[depth].p_ext->ee_block;
1276         err = ext4_ext_get_access(handle, inode, path + k);
1277         if (err)
1278                 return err;
1279         path[k].p_idx->ei_block = border;
1280         err = ext4_ext_dirty(handle, inode, path + k);
1281         if (err)
1282                 return err;
1283
1284         while (k--) {
1285                 /* change all left-side indexes */
1286                 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1287                         break;
1288                 err = ext4_ext_get_access(handle, inode, path + k);
1289                 if (err)
1290                         break;
1291                 path[k].p_idx->ei_block = border;
1292                 err = ext4_ext_dirty(handle, inode, path + k);
1293                 if (err)
1294                         break;
1295         }
1296
1297         return err;
1298 }
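
/*
 * Example (editor's sketch): at depth 2, the first extent of a leaf
 * changes its start from logical block 100 to 96.  path[1].p_idx gets
 * the key 96, and if that index is itself the first entry of its
 * block, the root index in path[0] is rewritten to 96 as well; the
 * loop stops at the first non-first index met on the way up.
 */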
1299
1300 static int
1301 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1302                                 struct ext4_extent *ex2)
1303 {
1304         unsigned short ext1_ee_len, ext2_ee_len, max_len;
1305
1306         /*
1307          * Make sure that either both extents are uninitialized, or
1308          * both are _not_.
1309          */
1310         if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1311                 return 0;
1312
1313         if (ext4_ext_is_uninitialized(ex1))
1314                 max_len = EXT_UNINIT_MAX_LEN;
1315         else
1316                 max_len = EXT_INIT_MAX_LEN;
1317
1318         ext1_ee_len = ext4_ext_get_actual_len(ex1);
1319         ext2_ee_len = ext4_ext_get_actual_len(ex2);
1320
1321         if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1322                         le32_to_cpu(ex2->ee_block))
1323                 return 0;
1324
1325         /*
1326          * To allow future support for preallocated extents to be added
1327          * as an RO_COMPAT feature, refuse to merge two extents if
1328          * this can result in the top bit of ee_len being set.
1329          */
1330         if (ext1_ee_len + ext2_ee_len > max_len)
1331                 return 0;
1332 #ifdef AGGRESSIVE_TEST
1333         if (ext1_ee_len >= 4)
1334                 return 0;
1335 #endif
1336
1337         if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1338                 return 1;
1339         return 0;
1340 }
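
/*
 * Mergeability by example (illustrative numbers): ex1 = logical
 * [100, +8) at physical 5000 and ex2 = [108, +4) at physical 5008 are
 * contiguous in both spaces and equally initialized, and 8 + 4 stays
 * under max_len, so they can fuse into [100, +12) at 5000.  A logical
 * hole (next extent at 110) or a physical gap (ex2 at 5010) makes the
 * answer 0.
 */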
1341
1342 /*
1343  * This function tries to merge the "ex" extent to the next extent in the tree.
1344  * It always tries to merge towards the right. If you want to merge towards
1345  * the left, pass "ex - 1" as the argument instead of "ex".
1346  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1347  * 1 if they got merged.
1348  */
1349 int ext4_ext_try_to_merge(struct inode *inode,
1350                           struct ext4_ext_path *path,
1351                           struct ext4_extent *ex)
1352 {
1353         struct ext4_extent_header *eh;
1354         unsigned int depth, len;
1355         int merge_done = 0;
1356         int uninitialized = 0;
1357
1358         depth = ext_depth(inode);
1359         BUG_ON(path[depth].p_hdr == NULL);
1360         eh = path[depth].p_hdr;
1361
1362         while (ex < EXT_LAST_EXTENT(eh)) {
1363                 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1364                         break;
1365                 /* merge with next extent! */
1366                 if (ext4_ext_is_uninitialized(ex))
1367                         uninitialized = 1;
1368                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1369                                 + ext4_ext_get_actual_len(ex + 1));
1370                 if (uninitialized)
1371                         ext4_ext_mark_uninitialized(ex);
1372
1373                 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1374                         len = (EXT_LAST_EXTENT(eh) - ex - 1)
1375                                 * sizeof(struct ext4_extent);
1376                         memmove(ex + 1, ex + 2, len);
1377                 }
1378                 le16_add_cpu(&eh->eh_entries, -1);
1379                 merge_done = 1;
1380                 WARN_ON(eh->eh_entries == 0);
1381                 if (!eh->eh_entries)
1382                         ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
1383                            "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
1384         }
1385
1386         return merge_done;
1387 }
1388
1389 /*
1390  * check if a portion of the "newext" extent overlaps with an
1391  * existing extent.
1392  *
1393  * If there is an overlap discovered, it updates the length of the newext
1394  * such that there will be no overlap, and then returns 1.
1395  * If there is no overlap found, it returns 0.
1396  */
1397 unsigned int ext4_ext_check_overlap(struct inode *inode,
1398                                     struct ext4_extent *newext,
1399                                     struct ext4_ext_path *path)
1400 {
1401         ext4_lblk_t b1, b2;
1402         unsigned int depth, len1;
1403         unsigned int ret = 0;
1404
1405         b1 = le32_to_cpu(newext->ee_block);
1406         len1 = ext4_ext_get_actual_len(newext);
1407         depth = ext_depth(inode);
1408         if (!path[depth].p_ext)
1409                 goto out;
1410         b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1411
1412         /*
1413          * get the next allocated block if the extent in the path
1414          * is before the requested block(s)
1415          */
1416         if (b2 < b1) {
1417                 b2 = ext4_ext_next_allocated_block(path);
1418                 if (b2 == EXT_MAX_BLOCK)
1419                         goto out;
1420         }
1421
1422         /* check for wrap through zero on extent logical start block*/
1423         if (b1 + len1 < b1) {
1424                 len1 = EXT_MAX_BLOCK - b1;
1425                 newext->ee_len = cpu_to_le16(len1);
1426                 ret = 1;
1427         }
1428
1429         /* check for overlap */
1430         if (b1 + len1 > b2) {
1431                 newext->ee_len = cpu_to_le16(b2 - b1);
1432                 ret = 1;
1433         }
1434 out:
1435         return ret;
1436 }
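
/*
 * Trim example (illustrative): newext = [50, +20) while the tree
 * already holds an extent starting at logical block 60.  Since
 * b1 + len1 == 70 > b2 == 60, ee_len is cut to b2 - b1 == 10 and 1 is
 * returned, so the caller proceeds with only the non-overlapping
 * front part [50, +10).
 */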
1437
1438 /*
1439  * ext4_ext_insert_extent:
1440  * tries to merge the requested extent into an existing extent, or
1441  * inserts the requested extent as a new one into the tree,
1442  * creating a new leaf in the no-space case.
1443  */
1444 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1445                                 struct ext4_ext_path *path,
1446                                 struct ext4_extent *newext)
1447 {
1448         struct ext4_extent_header * eh;
1449         struct ext4_extent *ex, *fex;
1450         struct ext4_extent *nearex; /* nearest extent */
1451         struct ext4_ext_path *npath = NULL;
1452         int depth, len, err;
1453         ext4_lblk_t next;
1454         unsigned uninitialized = 0;
1455
1456         BUG_ON(ext4_ext_get_actual_len(newext) == 0);
1457         depth = ext_depth(inode);
1458         ex = path[depth].p_ext;
1459         BUG_ON(path[depth].p_hdr == NULL);
1460
1461         /* try to insert block into found extent and return */
1462         if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1463                 ext_debug("append %d block to %d:%d (from %llu)\n",
1464                                 ext4_ext_get_actual_len(newext),
1465                                 le32_to_cpu(ex->ee_block),
1466                                 ext4_ext_get_actual_len(ex), ext_pblock(ex));
1467                 err = ext4_ext_get_access(handle, inode, path + depth);
1468                 if (err)
1469                         return err;
1470
1471                 /*
1472                  * ext4_can_extents_be_merged should have checked that either
1473                  * both extents are uninitialized, or both aren't. Thus we
1474                  * need to check only one of them here.
1475                  */
1476                 if (ext4_ext_is_uninitialized(ex))
1477                         uninitialized = 1;
1478                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1479                                         + ext4_ext_get_actual_len(newext));
1480                 if (uninitialized)
1481                         ext4_ext_mark_uninitialized(ex);
1482                 eh = path[depth].p_hdr;
1483                 nearex = ex;
1484                 goto merge;
1485         }
1486
1487 repeat:
1488         depth = ext_depth(inode);
1489         eh = path[depth].p_hdr;
1490         if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1491                 goto has_space;
1492
1493         /* probably next leaf has space for us? */
1494         fex = EXT_LAST_EXTENT(eh);
1495         next = ext4_ext_next_leaf_block(inode, path);
1496         if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1497             && next != EXT_MAX_BLOCK) {
1498                 ext_debug("next leaf block - %d\n", next);
1499                 BUG_ON(npath != NULL);
1500                 npath = ext4_ext_find_extent(inode, next, NULL);
1501                 if (IS_ERR(npath))
1502                         return PTR_ERR(npath);
1503                 BUG_ON(npath->p_depth != path->p_depth);
1504                 eh = npath[depth].p_hdr;
1505                 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1506                         ext_debug("next leaf isn't full(%d)\n",
1507                                   le16_to_cpu(eh->eh_entries));
1508                         path = npath;
1509                         goto repeat;
1510                 }
1511                 ext_debug("next leaf has no free space(%d,%d)\n",
1512                           le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1513         }
1514
1515         /*
1516          * There is no free space in the found leaf.
1517          * We're gonna add a new leaf in the tree.
1518          */
1519         err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1520         if (err)
1521                 goto cleanup;
1522         depth = ext_depth(inode);
1523         eh = path[depth].p_hdr;
1524
1525 has_space:
1526         nearex = path[depth].p_ext;
1527
1528         err = ext4_ext_get_access(handle, inode, path + depth);
1529         if (err)
1530                 goto cleanup;
1531
1532         if (!nearex) {
1533                 /* there is no extent in this leaf, create first one */
1534                 ext_debug("first extent in the leaf: %d:%llu:%d\n",
1535                                 le32_to_cpu(newext->ee_block),
1536                                 ext_pblock(newext),
1537                                 ext4_ext_get_actual_len(newext));
1538                 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1539         } else if (le32_to_cpu(newext->ee_block)
1540                            > le32_to_cpu(nearex->ee_block)) {
1541 /*              BUG_ON(newext->ee_block == nearex->ee_block); */
1542                 if (nearex != EXT_LAST_EXTENT(eh)) {
1543                         len = EXT_MAX_EXTENT(eh) - nearex;
1544                         len = (len - 1) * sizeof(struct ext4_extent);
1545                         len = len < 0 ? 0 : len;
1546                         ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1547                                         "move %d from 0x%p to 0x%p\n",
1548                                         le32_to_cpu(newext->ee_block),
1549                                         ext_pblock(newext),
1550                                         ext4_ext_get_actual_len(newext),
1551                                         nearex, len, nearex + 1, nearex + 2);
1552                         memmove(nearex + 2, nearex + 1, len);
1553                 }
1554                 path[depth].p_ext = nearex + 1;
1555         } else {
1556                 BUG_ON(newext->ee_block == nearex->ee_block);
1557                 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1558                 len = len < 0 ? 0 : len;
1559                 ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1560                                 "move %d from 0x%p to 0x%p\n",
1561                                 le32_to_cpu(newext->ee_block),
1562                                 ext_pblock(newext),
1563                                 ext4_ext_get_actual_len(newext),
1564                                 nearex, len, nearex + 1, nearex + 2);
1565                 memmove(nearex + 1, nearex, len);
1566                 path[depth].p_ext = nearex;
1567         }
1568
1569         le16_add_cpu(&eh->eh_entries, 1);
1570         nearex = path[depth].p_ext;
1571         nearex->ee_block = newext->ee_block;
1572         ext4_ext_store_pblock(nearex, ext_pblock(newext));
1573         nearex->ee_len = newext->ee_len;
1574
1575 merge:
1576         /* try to merge extents to the right */
1577         ext4_ext_try_to_merge(inode, path, nearex);
1578
1579         /* try to merge extents to the left */
1580
1581         /* time to correct all indexes above */
1582         err = ext4_ext_correct_indexes(handle, inode, path);
1583         if (err)
1584                 goto cleanup;
1585
1586         err = ext4_ext_dirty(handle, inode, path + depth);
1587
1588 cleanup:
1589         if (npath) {
1590                 ext4_ext_drop_refs(npath);
1591                 kfree(npath);
1592         }
1593         ext4_ext_tree_changed(inode);
1594         ext4_ext_invalidate_cache(inode);
1595         return err;
1596 }
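/*
 * Editor's sketch (not part of the original source): a minimal caller of
 * ext4_ext_insert_extent(), with hypothetical block values and error
 * handling trimmed. It assumes the caller already holds a journal handle
 * and the appropriate i_data_sem.
 */
#if 0   /* illustrative only */
static int example_insert(handle_t *handle, struct inode *inode,
                          ext4_lblk_t lblock, ext4_fsblk_t pblock,
                          unsigned short len)
{
        struct ext4_ext_path *path;
        struct ext4_extent newex;
        int err;

        path = ext4_ext_find_extent(inode, lblock, NULL);
        if (IS_ERR(path))
                return PTR_ERR(path);

        newex.ee_block = cpu_to_le32(lblock);
        ext4_ext_store_pblock(&newex, pblock);
        newex.ee_len = cpu_to_le16(len);

        err = ext4_ext_insert_extent(handle, inode, path, &newex);

        ext4_ext_drop_refs(path);
        kfree(path);
        return err;
}
#endif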
1597
1598 static void
1599 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1600                         __u32 len, ext4_fsblk_t start, int type)
1601 {
1602         struct ext4_ext_cache *cex;
1603         BUG_ON(len == 0);
1604         cex = &EXT4_I(inode)->i_cached_extent;
1605         cex->ec_type = type;
1606         cex->ec_block = block;
1607         cex->ec_len = len;
1608         cex->ec_start = start;
1609 }
1610
1611 /*
1612  * ext4_ext_put_gap_in_cache:
1613  * calculate boundaries of the gap that the requested block fits into
1614  * and cache this gap
1615  */
1616 static void
1617 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1618                                 ext4_lblk_t block)
1619 {
1620         int depth = ext_depth(inode);
1621         unsigned long len;
1622         ext4_lblk_t lblock;
1623         struct ext4_extent *ex;
1624
1625         ex = path[depth].p_ext;
1626         if (ex == NULL) {
1627                 /* there is no extent yet, so gap is [0;-] */
1628                 lblock = 0;
1629                 len = EXT_MAX_BLOCK;
1630                 ext_debug("cache gap(whole file):");
1631         } else if (block < le32_to_cpu(ex->ee_block)) {
1632                 lblock = block;
1633                 len = le32_to_cpu(ex->ee_block) - block;
1634                 ext_debug("cache gap(before): %u [%u:%u]",
1635                                 block,
1636                                 le32_to_cpu(ex->ee_block),
1637                                  ext4_ext_get_actual_len(ex));
1638         } else if (block >= le32_to_cpu(ex->ee_block)
1639                         + ext4_ext_get_actual_len(ex)) {
1640                 ext4_lblk_t next;
1641                 lblock = le32_to_cpu(ex->ee_block)
1642                         + ext4_ext_get_actual_len(ex);
1643
1644                 next = ext4_ext_next_allocated_block(path);
1645                 ext_debug("cache gap(after): [%u:%u] %u",
1646                                 le32_to_cpu(ex->ee_block),
1647                                 ext4_ext_get_actual_len(ex),
1648                                 block);
1649                 BUG_ON(next == lblock);
1650                 len = next - lblock;
1651         } else {
1652                 lblock = len = 0;
1653                 BUG();
1654         }
1655
1656         ext_debug(" -> %u:%lu\n", lblock, len);
1657         ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1658 }
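/*
 * Editor's note: a worked example of the gap computation above
 * (hypothetical values). Suppose the leaf holds one extent covering
 * logical blocks [100, 108) and the lookup asked for block 50: the
 * "before" branch caches lblock = 50, len = 100 - 50 = 50. Asking for
 * block 120 instead takes the "after" branch: lblock = 100 + 8 = 108,
 * and len runs up to the next allocated block returned by
 * ext4_ext_next_allocated_block().
 */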
1659
1660 static int
1661 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1662                         struct ext4_extent *ex)
1663 {
1664         struct ext4_ext_cache *cex;
1665
1666         cex = &EXT4_I(inode)->i_cached_extent;
1667
1668         /* has cache valid data? */
1669         if (cex->ec_type == EXT4_EXT_CACHE_NO)
1670                 return EXT4_EXT_CACHE_NO;
1671
1672         BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1673                         cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1674         if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1675                 ex->ee_block = cpu_to_le32(cex->ec_block);
1676                 ext4_ext_store_pblock(ex, cex->ec_start);
1677                 ex->ee_len = cpu_to_le16(cex->ec_len);
1678                 ext_debug("%u cached by %u:%u:%llu\n",
1679                                 block,
1680                                 cex->ec_block, cex->ec_len, cex->ec_start);
1681                 return cex->ec_type;
1682         }
1683
1684         /* not in cache */
1685         return EXT4_EXT_CACHE_NO;
1686 }
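/*
 * Editor's note (hypothetical values): with a cached extent of
 * ec_block = 100, ec_len = 8, ec_start = 5000, a query for block 103
 * falls inside [100, 108), so *ex is filled with the whole cached range
 * (ee_block = 100, length 8, pblock 5000) and ec_type is returned; the
 * caller then maps block 103 to 5000 + (103 - 100) itself, as
 * ext4_ext_get_blocks() does below.
 */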
1687
1688 /*
1689  * ext4_ext_rm_idx:
1690  * removes index from the index block.
1691  * It's used in the truncate case only; thus all requests are for the
1692  * last index in the block.
1693  */
1694 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1695                         struct ext4_ext_path *path)
1696 {
1697         struct buffer_head *bh;
1698         int err;
1699         ext4_fsblk_t leaf;
1700
1701         /* free index block */
1702         path--;
1703         leaf = idx_pblock(path->p_idx);
1704         BUG_ON(path->p_hdr->eh_entries == 0);
1705         err = ext4_ext_get_access(handle, inode, path);
1706         if (err)
1707                 return err;
1708         le16_add_cpu(&path->p_hdr->eh_entries, -1);
1709         err = ext4_ext_dirty(handle, inode, path);
1710         if (err)
1711                 return err;
1712         ext_debug("index is empty, remove it, free block %llu\n", leaf);
1713         bh = sb_find_get_block(inode->i_sb, leaf);
1714         ext4_forget(handle, 1, inode, bh, leaf);
1715         ext4_free_blocks(handle, inode, leaf, 1, 1);
1716         return err;
1717 }
1718
1719 /*
1720  * ext4_ext_calc_credits_for_insert:
1721  * This routine returns max. credits that the extent tree can consume.
1722  * It should be OK for low-performance paths like ->writepage().
1723  * To allow many writing processes to fit into a single transaction,
1724  * the caller should calculate credits under i_data_sem and
1725  * pass the actual path.
1726  */
1727 int ext4_ext_calc_credits_for_insert(struct inode *inode,
1728                                                 struct ext4_ext_path *path)
1729 {
1730         int depth, needed;
1731
1732         if (path) {
1733                 /* probably there is space in leaf? */
1734                 depth = ext_depth(inode);
1735                 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1736                                 < le16_to_cpu(path[depth].p_hdr->eh_max))
1737                         return 1;
1738         }
1739
1740         /*
1741          * given a 32-bit logical block space (4294967296 blocks), the tree
1742          * can be at most 4 levels deep -- 4 * 340^4 == 53453440000.
1743          * Let's also add one more level for imbalance.
1744          */
1745         depth = 5;
1746
1747         /* allocation of new data block(s) */
1748         needed = 2;
1749
1750         /*
1751          * tree can be full, so it would need to grow in depth:
1752          * we need one credit to modify old root, credits for
1753          * new root will be added in split accounting
1754          */
1755         needed += 1;
1756
1757         /*
1758          * Index split can happen, we would need:
1759          *    allocate intermediate indexes (bitmap + group)
1760          *  + change two blocks at each level, but root (already included)
1761          */
1762         needed += (depth * 2) + (depth * 2);
1763
1764         /* any allocation modifies superblock */
1765         needed += 1;
1766
1767         return needed;
1768 }
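/*
 * Editor's note: plugging the numbers above through the no-path case:
 * needed = 2 (data blocks) + 1 (root growth) + (5 * 2 + 5 * 2) = 20
 * (index splits) + 1 (superblock) = 24 credits in the worst case.
 */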
1769
1770 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1771                                 struct ext4_extent *ex,
1772                                 ext4_lblk_t from, ext4_lblk_t to)
1773 {
1774         struct buffer_head *bh;
1775         unsigned short ee_len =  ext4_ext_get_actual_len(ex);
1776         int i, metadata = 0;
1777
1778         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1779                 metadata = 1;
1780 #ifdef EXTENTS_STATS
1781         {
1782                 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1783                 spin_lock(&sbi->s_ext_stats_lock);
1784                 sbi->s_ext_blocks += ee_len;
1785                 sbi->s_ext_extents++;
1786                 if (ee_len < sbi->s_ext_min)
1787                         sbi->s_ext_min = ee_len;
1788                 if (ee_len > sbi->s_ext_max)
1789                         sbi->s_ext_max = ee_len;
1790                 if (ext_depth(inode) > sbi->s_depth_max)
1791                         sbi->s_depth_max = ext_depth(inode);
1792                 spin_unlock(&sbi->s_ext_stats_lock);
1793         }
1794 #endif
1795         if (from >= le32_to_cpu(ex->ee_block)
1796             && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
1797                 /* tail removal */
1798                 ext4_lblk_t num;
1799                 ext4_fsblk_t start;
1800
1801                 num = le32_to_cpu(ex->ee_block) + ee_len - from;
1802                 start = ext_pblock(ex) + ee_len - num;
1803                 ext_debug("free last %u blocks starting %llu\n", num, start);
1804                 for (i = 0; i < num; i++) {
1805                         bh = sb_find_get_block(inode->i_sb, start + i);
1806                         ext4_forget(handle, 0, inode, bh, start + i);
1807                 }
1808                 ext4_free_blocks(handle, inode, start, num, metadata);
1809         } else if (from == le32_to_cpu(ex->ee_block)
1810                    && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
1811                 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
1812                         from, to, le32_to_cpu(ex->ee_block), ee_len);
1813         } else {
1814                 printk(KERN_INFO "strange request: removal(2) "
1815                                 "%u-%u from %u:%u\n",
1816                                 from, to, le32_to_cpu(ex->ee_block), ee_len);
1817         }
1818         return 0;
1819 }
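/*
 * Editor's note: a worked tail-removal example (hypothetical values).
 * For an extent at logical block 100, length 10, physical block 5000,
 * a call with from = 104, to = 109 hits the first branch:
 * num = 100 + 10 - 104 = 6 and start = 5000 + 10 - 6 = 5004, so the
 * last six physical blocks 5004..5009 are forgotten and freed.
 */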
1820
1821 static int
1822 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1823                 struct ext4_ext_path *path, ext4_lblk_t start)
1824 {
1825         int err = 0, correct_index = 0;
1826         int depth = ext_depth(inode), credits;
1827         struct ext4_extent_header *eh;
1828         ext4_lblk_t a, b, block;
1829         unsigned num;
1830         ext4_lblk_t ex_ee_block;
1831         unsigned short ex_ee_len;
1832         unsigned uninitialized = 0;
1833         struct ext4_extent *ex;
1834
1835         /* the header must be checked already in ext4_ext_remove_space() */
1836         ext_debug("truncate since %u in leaf\n", start);
1837         if (!path[depth].p_hdr)
1838                 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1839         eh = path[depth].p_hdr;
1840         BUG_ON(eh == NULL);
1841
1842         /* find where to start removing */
1843         ex = EXT_LAST_EXTENT(eh);
1844
1845         ex_ee_block = le32_to_cpu(ex->ee_block);
1846         if (ext4_ext_is_uninitialized(ex))
1847                 uninitialized = 1;
1848         ex_ee_len = ext4_ext_get_actual_len(ex);
1849
1850         while (ex >= EXT_FIRST_EXTENT(eh) &&
1851                         ex_ee_block + ex_ee_len > start) {
1852                 ext_debug("remove ext %u:%u\n", ex_ee_block, ex_ee_len);
1853                 path[depth].p_ext = ex;
1854
1855                 a = ex_ee_block > start ? ex_ee_block : start;
1856                 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
1857                         ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
1858
1859                 ext_debug("  border %u:%u\n", a, b);
1860
1861                 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
1862                         block = 0;
1863                         num = 0;
1864                         BUG();
1865                 } else if (a != ex_ee_block) {
1866                         /* remove tail of the extent */
1867                         block = ex_ee_block;
1868                         num = a - block;
1869                 } else if (b != ex_ee_block + ex_ee_len - 1) {
1870                         /* remove head of the extent */
1871                         block = a;
1872                         num = b - a;
1873                         /* there is no "make a hole" API yet */
1874                         BUG();
1875                 } else {
1876                         /* remove whole extent: excellent! */
1877                         block = ex_ee_block;
1878                         num = 0;
1879                         BUG_ON(a != ex_ee_block);
1880                         BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1881                 }
1882
1883                 /* at present, extent can't cross block group: */
1884                 /* leaf + bitmap + group desc + sb + inode */
1885                 credits = 5;
1886                 if (ex == EXT_FIRST_EXTENT(eh)) {
1887                         correct_index = 1;
1888                         credits += (ext_depth(inode)) + 1;
1889                 }
1890 #ifdef CONFIG_QUOTA
1891                 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1892 #endif
1893
1894                 err = ext4_ext_journal_restart(handle, credits);
1895                 if (err)
1896                         goto out;
1897
1898                 err = ext4_ext_get_access(handle, inode, path + depth);
1899                 if (err)
1900                         goto out;
1901
1902                 err = ext4_remove_blocks(handle, inode, ex, a, b);
1903                 if (err)
1904                         goto out;
1905
1906                 if (num == 0) {
1907                         /* this extent is removed; mark slot entirely unused */
1908                         ext4_ext_store_pblock(ex, 0);
1909                         le16_add_cpu(&eh->eh_entries, -1);
1910                 }
1911
1912                 ex->ee_block = cpu_to_le32(block);
1913                 ex->ee_len = cpu_to_le16(num);
1914                 /*
1915                  * Do not mark uninitialized if all the blocks in the
1916                  * extent have been removed.
1917                  */
1918                 if (uninitialized && num)
1919                         ext4_ext_mark_uninitialized(ex);
1920
1921                 err = ext4_ext_dirty(handle, inode, path + depth);
1922                 if (err)
1923                         goto out;
1924
1925                 ext_debug("new extent: %u:%u:%llu\n", block, num,
1926                                 ext_pblock(ex));
1927                 ex--;
1928                 ex_ee_block = le32_to_cpu(ex->ee_block);
1929                 ex_ee_len = ext4_ext_get_actual_len(ex);
1930         }
1931
1932         if (correct_index && eh->eh_entries)
1933                 err = ext4_ext_correct_indexes(handle, inode, path);
1934
1935         /* if this leaf is free, then we should
1936          * remove it from index block above */
1937         if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
1938                 err = ext4_ext_rm_idx(handle, inode, path + depth);
1939
1940 out:
1941         return err;
1942 }
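/*
 * Editor's note: how the a/b border logic above plays out for a truncate
 * at start = 105 against an extent covering [100, 110) (hypothetical
 * values): a = max(105, 100) = 105 and b = 109, the last block of the
 * extent, so the "remove tail" branch keeps block = 100 with
 * num = 105 - 100 = 5 blocks, and ext4_remove_blocks() frees 105..109.
 */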
1943
1944 /*
1945  * ext4_ext_more_to_rm:
1946  * returns 1 if current index has to be freed (even partial)
1947  */
1948 static int
1949 ext4_ext_more_to_rm(struct ext4_ext_path *path)
1950 {
1951         BUG_ON(path->p_idx == NULL);
1952
1953         if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
1954                 return 0;
1955
1956         /*
1957          * if truncate on deeper level happened, it wasn't partial,
1958          * so we have to consider current index for truncation
1959          */
1960         if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
1961                 return 0;
1962         return 1;
1963 }
1964
1965 static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
1966 {
1967         struct super_block *sb = inode->i_sb;
1968         int depth = ext_depth(inode);
1969         struct ext4_ext_path *path;
1970         handle_t *handle;
1971         int i = 0, err = 0;
1972
1973         ext_debug("truncate since %u\n", start);
1974
1975         /* probably first extent we're gonna free will be last in block */
1976         handle = ext4_journal_start(inode, depth + 1);
1977         if (IS_ERR(handle))
1978                 return PTR_ERR(handle);
1979
1980         ext4_ext_invalidate_cache(inode);
1981
1982         /*
1983          * We start scanning from right side, freeing all the blocks
1984          * after i_size and walking into the tree depth-wise.
1985          */
1986         path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
1987         if (path == NULL) {
1988                 ext4_journal_stop(handle);
1989                 return -ENOMEM;
1990         }
1991         path[0].p_hdr = ext_inode_hdr(inode);
1992         if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
1993                 err = -EIO;
1994                 goto out;
1995         }
1996         path[0].p_depth = depth;
1997
1998         while (i >= 0 && err == 0) {
1999                 if (i == depth) {
2000                         /* this is leaf block */
2001                         err = ext4_ext_rm_leaf(handle, inode, path, start);
2002                         /* root level has p_bh == NULL, brelse() eats this */
2003                         brelse(path[i].p_bh);
2004                         path[i].p_bh = NULL;
2005                         i--;
2006                         continue;
2007                 }
2008
2009                 /* this is index block */
2010                 if (!path[i].p_hdr) {
2011                         ext_debug("initialize header\n");
2012                         path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2013                 }
2014
2015                 if (!path[i].p_idx) {
2016                         /* this level hasn't been touched yet */
2017                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2018                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2019                         ext_debug("init index ptr: hdr 0x%p, num %d\n",
2020                                   path[i].p_hdr,
2021                                   le16_to_cpu(path[i].p_hdr->eh_entries));
2022                 } else {
2023                         /* we were already here, see at next index */
2024                         path[i].p_idx--;
2025                 }
2026
2027                 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2028                                 i, EXT_FIRST_INDEX(path[i].p_hdr),
2029                                 path[i].p_idx);
2030                 if (ext4_ext_more_to_rm(path + i)) {
2031                         struct buffer_head *bh;
2032                         /* go to the next level */
2033                         ext_debug("move to level %d (block %llu)\n",
2034                                   i + 1, idx_pblock(path[i].p_idx));
2035                         memset(path + i + 1, 0, sizeof(*path));
2036                         bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2037                         if (!bh) {
2038                                 /* should we reset i_size? */
2039                                 err = -EIO;
2040                                 break;
2041                         }
2042                         if (WARN_ON(i + 1 > depth)) {
2043                                 err = -EIO;
2044                                 break;
2045                         }
2046                         if (ext4_ext_check_header(inode, ext_block_hdr(bh),
2047                                                         depth - i - 1)) {
2048                                 err = -EIO;
2049                                 break;
2050                         }
2051                         path[i + 1].p_bh = bh;
2052
2053                         /* save actual number of indexes since this
2054                          * number is changed at the next iteration */
2055                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2056                         i++;
2057                 } else {
2058                         /* we finished processing this index, go up */
2059                         if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2060                                 /* index is empty, remove it;
2061                                  * handle must already be prepared by the
2062                                  * leaf removal in ext4_ext_rm_leaf() */
2063                                 err = ext4_ext_rm_idx(handle, inode, path + i);
2064                         }
2065                         /* root level has p_bh == NULL, brelse() eats this */
2066                         brelse(path[i].p_bh);
2067                         path[i].p_bh = NULL;
2068                         i--;
2069                         ext_debug("return to level %d\n", i);
2070                 }
2071         }
2072
2073         /* TODO: flexible tree reduction should be here */
2074         if (path->p_hdr->eh_entries == 0) {
2075                 /*
2076                  * truncate to zero freed all the tree,
2077                  * so we need to correct eh_depth
2078                  */
2079                 err = ext4_ext_get_access(handle, inode, path);
2080                 if (err == 0) {
2081                         ext_inode_hdr(inode)->eh_depth = 0;
2082                         ext_inode_hdr(inode)->eh_max =
2083                                 cpu_to_le16(ext4_ext_space_root(inode));
2084                         err = ext4_ext_dirty(handle, inode, path);
2085                 }
2086         }
2087 out:
2088         ext4_ext_tree_changed(inode);
2089         ext4_ext_drop_refs(path);
2090         kfree(path);
2091         ext4_journal_stop(handle);
2092
2093         return err;
2094 }
2095
2096 /*
2097  * called at mount time
2098  */
2099 void ext4_ext_init(struct super_block *sb)
2100 {
2101         /*
2102          * possible initialization would be here
2103          */
2104
2105         if (test_opt(sb, EXTENTS)) {
2106                 printk("EXT4-fs: file extents enabled");
2107 #ifdef AGGRESSIVE_TEST
2108                 printk(", aggressive tests");
2109 #endif
2110 #ifdef CHECK_BINSEARCH
2111                 printk(", check binsearch");
2112 #endif
2113 #ifdef EXTENTS_STATS
2114                 printk(", stats");
2115 #endif
2116                 printk("\n");
2117 #ifdef EXTENTS_STATS
2118                 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2119                 EXT4_SB(sb)->s_ext_min = 1 << 30;
2120                 EXT4_SB(sb)->s_ext_max = 0;
2121 #endif
2122         }
2123 }
2124
2125 /*
2126  * called at umount time
2127  */
2128 void ext4_ext_release(struct super_block *sb)
2129 {
2130         if (!test_opt(sb, EXTENTS))
2131                 return;
2132
2133 #ifdef EXTENTS_STATS
2134         if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2135                 struct ext4_sb_info *sbi = EXT4_SB(sb);
2136                 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2137                         sbi->s_ext_blocks, sbi->s_ext_extents,
2138                         sbi->s_ext_blocks / sbi->s_ext_extents);
2139                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2140                         sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2141         }
2142 #endif
2143 }
2144
2145 static void bi_complete(struct bio *bio, int error)
2146 {
2147         complete((struct completion *)bio->bi_private);
2148 }
2149
2150 /* FIXME!! we need to try to merge to left or right after zero-out  */
2151 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2152 {
2153         int ret = -EIO;
2154         struct bio *bio;
2155         int blkbits, blocksize;
2156         sector_t ee_pblock;
2157         struct completion event;
2158         unsigned int ee_len, len, done, offset;
2159
2160
2161         blkbits   = inode->i_blkbits;
2162         blocksize = inode->i_sb->s_blocksize;
2163         ee_len    = ext4_ext_get_actual_len(ex);
2164         ee_pblock = ext_pblock(ex);
2165
2166         /* convert ee_pblock to 512 byte sectors */
2167         ee_pblock = ee_pblock << (blkbits - 9);
2168
2169         while (ee_len > 0) {
2170
2171                 if (ee_len > BIO_MAX_PAGES)
2172                         len = BIO_MAX_PAGES;
2173                 else
2174                         len = ee_len;
2175
2176                 bio = bio_alloc(GFP_NOIO, len);
2177                 if (!bio)
2178                         return -ENOMEM;
2179                 bio->bi_sector = ee_pblock;
2180                 bio->bi_bdev   = inode->i_sb->s_bdev;
2181
2182                 done = 0;
2183                 offset = 0;
2184                 while (done < len) {
2185                         ret = bio_add_page(bio, ZERO_PAGE(0),
2186                                                         blocksize, offset);
2187                         if (ret != blocksize) {
2188                                 /*
2189                                  * We can't add any more pages because of
2190                                  * hardware limitations.  Start a new bio.
2191                                  */
2192                                 break;
2193                         }
2194                         done++;
2195                         offset += blocksize;
2196                         if (offset >= PAGE_CACHE_SIZE)
2197                                 offset = 0;
2198                 }
2199
2200                 init_completion(&event);
2201                 bio->bi_private = &event;
2202                 bio->bi_end_io = bi_complete;
2203                 submit_bio(WRITE, bio);
2204                 wait_for_completion(&event);
2205
2206                 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
2207                         ret = 0;
2208                 else {
2209                         ret = -EIO;
2210                         break;
2211                 }
2212                 bio_put(bio);
2213                 ee_len    -= done;
2214                 ee_pblock += done  << (blkbits - 9);
2215         }
2216         return ret;
2217 }
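/*
 * Editor's note on the sector arithmetic above: with 4K filesystem
 * blocks (blkbits = 12) and 512-byte bio sectors, blkbits - 9 = 3, so
 * e.g. physical block 1000 becomes sector 1000 << 3 = 8000, and each
 * filesystem block zeroed advances ee_pblock by 8 sectors.
 */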
2218
2219 #define EXT4_EXT_ZERO_LEN 7
2220
2221 /*
2222  * This function is called by ext4_ext_get_blocks() if someone tries to write
2223  * to an uninitialized extent. It may result in splitting the uninitialized
2224  * extent into multiple extents (upto three - one initialized and two
2225  * uninitialized).
2226  * There are three possibilities:
2227  *   a> There is no split required: Entire extent should be initialized
2228  *   b> Splits in two extents: Write is happening at either end of the extent
2229  *   c> Splits in three extents: Someone is writing in the middle of the extent
2230  */
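/*
 * Editor's sketch of the three cases, for an uninitialized extent
 * covering [ee_block, ee_block + ee_len) and a write of max_blocks
 * starting at iblock (U = stays uninitialized, I = becomes initialized):
 *
 *   a> write covers the whole extent:  |IIIIIIIIIIII|
 *   b> write at the start or the end:  |IIII|UUUUUUU|  or  |UUUUUUU|IIII|
 *   c> write in the middle:            |UUU|IIII|UUUU|
 */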
2231 static int ext4_ext_convert_to_initialized(handle_t *handle,
2232                                                 struct inode *inode,
2233                                                 struct ext4_ext_path *path,
2234                                                 ext4_lblk_t iblock,
2235                                                 unsigned long max_blocks)
2236 {
2237         struct ext4_extent *ex, newex, orig_ex;
2238         struct ext4_extent *ex1 = NULL;
2239         struct ext4_extent *ex2 = NULL;
2240         struct ext4_extent *ex3 = NULL;
2241         struct ext4_extent_header *eh;
2242         ext4_lblk_t ee_block;
2243         unsigned int allocated, ee_len, depth;
2244         ext4_fsblk_t newblock;
2245         int err = 0;
2246         int ret = 0;
2247
2248         depth = ext_depth(inode);
2249         eh = path[depth].p_hdr;
2250         ex = path[depth].p_ext;
2251         ee_block = le32_to_cpu(ex->ee_block);
2252         ee_len = ext4_ext_get_actual_len(ex);
2253         allocated = ee_len - (iblock - ee_block);
2254         newblock = iblock - ee_block + ext_pblock(ex);
2255         ex2 = ex;
2256         orig_ex.ee_block = ex->ee_block;
2257         orig_ex.ee_len   = cpu_to_le16(ee_len);
2258         ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2259
2260         err = ext4_ext_get_access(handle, inode, path + depth);
2261         if (err)
2262                 goto out;
2263         /* If extent has less than 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2264         if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
2265                 err =  ext4_ext_zeroout(inode, &orig_ex);
2266                 if (err)
2267                         goto fix_extent_len;
2268                 /* update the extent length and mark as initialized */
2269                 ex->ee_block = orig_ex.ee_block;
2270                 ex->ee_len   = orig_ex.ee_len;
2271                 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2272                 ext4_ext_dirty(handle, inode, path + depth);
2273                 /* zeroed the full extent */
2274                 return allocated;
2275         }
2276
2277         /* ex1: ee_block to iblock - 1 : uninitialized */
2278         if (iblock > ee_block) {
2279                 ex1 = ex;
2280                 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2281                 ext4_ext_mark_uninitialized(ex1);
2282                 ex2 = &newex;
2283         }
2284         /*
2285          * for sanity, update the length of the ex2 extent before
2286          * we insert ex3, if ex1 is NULL. This is to avoid temporary
2287          * overlap of blocks.
2288          */
2289         if (!ex1 && allocated > max_blocks)
2290                 ex2->ee_len = cpu_to_le16(max_blocks);
2291         /* ex3: to ee_block + ee_len : uninitialized */
2292         if (allocated > max_blocks) {
2293                 unsigned int newdepth;
2294                 /* If extent has less than EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2295                 if (allocated <= EXT4_EXT_ZERO_LEN) {
2296                         /* Mark first half uninitialized.
2297                          * Mark second half initialized and zero out the
2298                          * initialized extent
2299                          */
2300                         ex->ee_block = orig_ex.ee_block;
2301                         ex->ee_len   = cpu_to_le16(ee_len - allocated);
2302                         ext4_ext_mark_uninitialized(ex);
2303                         ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2304                         ext4_ext_dirty(handle, inode, path + depth);
2305
2306                         ex3 = &newex;
2307                         ex3->ee_block = cpu_to_le32(iblock);
2308                         ext4_ext_store_pblock(ex3, newblock);
2309                         ex3->ee_len = cpu_to_le16(allocated);
2310                         err = ext4_ext_insert_extent(handle, inode, path, ex3);
2311                         if (err == -ENOSPC) {
2312                                 err =  ext4_ext_zeroout(inode, &orig_ex);
2313                                 if (err)
2314                                         goto fix_extent_len;
2315                                 ex->ee_block = orig_ex.ee_block;
2316                                 ex->ee_len   = orig_ex.ee_len;
2317                                 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2318                                 ext4_ext_dirty(handle, inode, path + depth);
2319                                 /* zeroed the full extent */
2320                                 return allocated;
2321
2322                         } else if (err)
2323                                 goto fix_extent_len;
2324
2325                         /*
2326                          * We need to zero out the second half because
2327                          * a fallocate request can update the file size and
2328                          * converting the second half to an initialized extent
2329                          * implies that we can leak some junk data to user
2330                          * space.
2331                          */
2332                         err =  ext4_ext_zeroout(inode, ex3);
2333                         if (err) {
2334                                 /*
2335                                  * We should actually mark the
2336                                  * second half as uninit and return an error;
2337                                  * the insert would have changed the extent.
2338                                  */
2339                                 depth = ext_depth(inode);
2340                                 ext4_ext_drop_refs(path);
2341                                 path = ext4_ext_find_extent(inode,
2342                                                                 iblock, path);
2343                                 if (IS_ERR(path)) {
2344                                         err = PTR_ERR(path);
2345                                         return err;
2346                                 }
2347                                 ex = path[depth].p_ext;
2348                                 err = ext4_ext_get_access(handle, inode,
2349                                                                 path + depth);
2350                                 if (err)
2351                                         return err;
2352                                 ext4_ext_mark_uninitialized(ex);
2353                                 ext4_ext_dirty(handle, inode, path + depth);
2354                                 return err;
2355                         }
2356
2357                         /* zeroed the second half */
2358                         return allocated;
2359                 }
2360                 ex3 = &newex;
2361                 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2362                 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2363                 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2364                 ext4_ext_mark_uninitialized(ex3);
2365                 err = ext4_ext_insert_extent(handle, inode, path, ex3);
2366                 if (err == -ENOSPC) {
2367                         err =  ext4_ext_zeroout(inode, &orig_ex);
2368                         if (err)
2369                                 goto fix_extent_len;
2370                         /* update the extent length and mark as initialized */
2371                         ex->ee_block = orig_ex.ee_block;
2372                         ex->ee_len   = orig_ex.ee_len;
2373                         ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2374                         ext4_ext_dirty(handle, inode, path + depth);
2375                         /* zeroed the full extent */
2376                         return allocated;
2377
2378                 } else if (err)
2379                         goto fix_extent_len;
2380                 /*
2381                  * The depth, and hence eh & ex, might change
2382                  * as part of the insert above.
2383                  */
2384                 newdepth = ext_depth(inode);
2385                 /*
2386                  * update the extent length after successful insert of the
2387                  * split extent
2388                  */
2389                 orig_ex.ee_len = cpu_to_le16(ee_len -
2390                                                 ext4_ext_get_actual_len(ex3));
2391                 if (newdepth != depth) {
2392                         depth = newdepth;
2393                         ext4_ext_drop_refs(path);
2394                         path = ext4_ext_find_extent(inode, iblock, path);
2395                         if (IS_ERR(path)) {
2396                                 err = PTR_ERR(path);
2397                                 goto out;
2398                         }
2399                         eh = path[depth].p_hdr;
2400                         ex = path[depth].p_ext;
2401                         if (ex2 != &newex)
2402                                 ex2 = ex;
2403
2404                         err = ext4_ext_get_access(handle, inode, path + depth);
2405                         if (err)
2406                                 goto out;
2407                 }
2408                 allocated = max_blocks;
2409
2410                 /* If the extent has less than EXT4_EXT_ZERO_LEN blocks and we
2411                  * are trying to insert an extent in the middle, zero it out
2412                  * directly; otherwise give the extent a chance to merge to the left
2413                  */
2414                 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2415                                                         iblock != ee_block) {
2416                         err =  ext4_ext_zeroout(inode, &orig_ex);
2417                         if (err)
2418                                 goto fix_extent_len;
2419                         /* update the extent length and mark as initialized */
2420                         ex->ee_block = orig_ex.ee_block;
2421                         ex->ee_len   = orig_ex.ee_len;
2422                         ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2423                         ext4_ext_dirty(handle, inode, path + depth);
2424                         /* zero out the first half */
2425                         return allocated;
2426                 }
2427         }
2428         /*
2429          * If there was a change of depth as part of the
2430          * insertion of ex3 above, we need to update the length
2431          * of the ex1 extent again here
2432          */
2433         if (ex1 && ex1 != ex) {
2434                 ex1 = ex;
2435                 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2436                 ext4_ext_mark_uninitialized(ex1);
2437                 ex2 = &newex;
2438         }
2439         /* ex2: iblock to iblock + maxblocks-1 : initialised */
2440         ex2->ee_block = cpu_to_le32(iblock);
2441         ext4_ext_store_pblock(ex2, newblock);
2442         ex2->ee_len = cpu_to_le16(allocated);
2443         if (ex2 != ex)
2444                 goto insert;
2445         /*
2446          * New (initialized) extent starts from the first block
2447          * in the current extent. i.e., ex2 == ex
2448          * We have to see if it can be merged with the extent
2449          * on the left.
2450          */
2451         if (ex2 > EXT_FIRST_EXTENT(eh)) {
2452                 /*
2453                  * To merge left, pass "ex2 - 1" to try_to_merge(),
2454                  * since it merges towards right _only_.
2455                  */
2456                 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2457                 if (ret) {
2458                         err = ext4_ext_correct_indexes(handle, inode, path);
2459                         if (err)
2460                                 goto out;
2461                         depth = ext_depth(inode);
2462                         ex2--;
2463                 }
2464         }
2465         /*
2466          * Try to merge towards the right. This might be required
2467          * only when the whole extent is being written to,
2468          * i.e. ex2 == ex and ex3 == NULL.
2469          */
2470         if (!ex3) {
2471                 ret = ext4_ext_try_to_merge(inode, path, ex2);
2472                 if (ret) {
2473                         err = ext4_ext_correct_indexes(handle, inode, path);
2474                         if (err)
2475                                 goto out;
2476                 }
2477         }
2478         /* Mark modified extent as dirty */
2479         err = ext4_ext_dirty(handle, inode, path + depth);
2480         goto out;
2481 insert:
2482         err = ext4_ext_insert_extent(handle, inode, path, &newex);
2483         if (err == -ENOSPC) {
2484                 err =  ext4_ext_zeroout(inode, &orig_ex);
2485                 if (err)
2486                         goto fix_extent_len;
2487                 /* update the extent length and mark as initialized */
2488                 ex->ee_block = orig_ex.ee_block;
2489                 ex->ee_len   = orig_ex.ee_len;
2490                 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2491                 ext4_ext_dirty(handle, inode, path + depth);
2492                 /* zero out the first half */
2493                 return allocated;
2494         } else if (err)
2495                 goto fix_extent_len;
2496 out:
2497         return err ? err : allocated;
2498
2499 fix_extent_len:
2500         ex->ee_block = orig_ex.ee_block;
2501         ex->ee_len   = orig_ex.ee_len;
2502         ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2503         ext4_ext_mark_uninitialized(ex);
2504         ext4_ext_dirty(handle, inode, path + depth);
2505         return err;
2506 }
2507
2508 /*
2509  * Block allocation/map/preallocation routine for extent-based files
2510  *
2511  *
2512  * Need to be called with
2513  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
2514  * (i.e., create is zero); otherwise, down_write(&EXT4_I(inode)->i_data_sem).
2515  *
2516  * return > 0, number of blocks already mapped/allocated
2517  *          if create == 0 and these are pre-allocated blocks
2518  *              buffer head is unmapped
2519  *          otherwise blocks are mapped
2520  *
2521  * return = 0, if plain look up failed (blocks have not been allocated)
2522  *          buffer head is unmapped
2523  *
2524  * return < 0, error case.
2525  */
2526 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2527                         ext4_lblk_t iblock,
2528                         unsigned long max_blocks, struct buffer_head *bh_result,
2529                         int create, int extend_disksize)
2530 {
2531         struct ext4_ext_path *path = NULL;
2532         struct ext4_extent_header *eh;
2533         struct ext4_extent newex, *ex;
2534         ext4_fsblk_t goal, newblock;
2535         int err = 0, depth, ret;
2536         unsigned long allocated = 0;
2537         struct ext4_allocation_request ar;
2538
2539         __clear_bit(BH_New, &bh_result->b_state);
2540         ext_debug("blocks %u/%lu requested for inode %lu\n",
2541                         iblock, max_blocks, inode->i_ino);
2542
2543         /* check in cache */
2544         goal = ext4_ext_in_cache(inode, iblock, &newex);
2545         if (goal) {
2546                 if (goal == EXT4_EXT_CACHE_GAP) {
2547                         if (!create) {
2548                                 /*
2549                                  * block isn't allocated yet and
2550                                  * user doesn't want to allocate it
2551                                  */
2552                                 goto out2;
2553                         }
2554                         /* we should allocate requested block */
2555                 } else if (goal == EXT4_EXT_CACHE_EXTENT) {
2556                         /* block is already allocated */
2557                         newblock = iblock
2558                                    - le32_to_cpu(newex.ee_block)
2559                                    + ext_pblock(&newex);
2560                         /* number of remaining blocks in the extent */
2561                         allocated = ext4_ext_get_actual_len(&newex) -
2562                                         (iblock - le32_to_cpu(newex.ee_block));
2563                         goto out;
2564                 } else {
2565                         BUG();
2566                 }
2567         }
2568
2569         /* find extent for this block */
2570         path = ext4_ext_find_extent(inode, iblock, NULL);
2571         if (IS_ERR(path)) {
2572                 err = PTR_ERR(path);
2573                 path = NULL;
2574                 goto out2;
2575         }
2576
2577         depth = ext_depth(inode);
2578
2579         /*
2580          * consistent leaf must not be empty;
2581          * this situation is possible, though, _during_ tree modification;
2582          * this is why assert can't be put in ext4_ext_find_extent()
2583          */
2584         BUG_ON(path[depth].p_ext == NULL && depth != 0);
2585         eh = path[depth].p_hdr;
2586
2587         ex = path[depth].p_ext;
2588         if (ex) {
2589                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
2590                 ext4_fsblk_t ee_start = ext_pblock(ex);
2591                 unsigned short ee_len;
2592
2593                 /*
2594                  * Uninitialized extents are treated as holes, except that
2595                  * we split out initialized portions during a write.
2596                  */
2597                 ee_len = ext4_ext_get_actual_len(ex);
2598                 /* if found extent covers block, simply return it */
2599                 if (iblock >= ee_block && iblock < ee_block + ee_len) {
2600                         newblock = iblock - ee_block + ee_start;
2601                         /* number of remaining blocks in the extent */
2602                         allocated = ee_len - (iblock - ee_block);
2603                         ext_debug("%u fit into %u:%d -> %llu\n", iblock,
2604                                         ee_block, ee_len, newblock);
2605
2606                         /* Do not put uninitialized extent in the cache */
2607                         if (!ext4_ext_is_uninitialized(ex)) {
2608                                 ext4_ext_put_in_cache(inode, ee_block,
2609                                                         ee_len, ee_start,
2610                                                         EXT4_EXT_CACHE_EXTENT);
2611                                 goto out;
2612                         }
2613                         if (create == EXT4_CREATE_UNINITIALIZED_EXT)
2614                                 goto out;
2615                         if (!create) {
2616                                 /*
2617                                  * We have blocks reserved already.  We
2618                                  * return allocated blocks so that delalloc
2619                                  * won't do block reservation for us.  But
2620                                  * the buffer head will be unmapped so that
2621                                  * a read from the block returns 0s.
2622                                  */
2623                                 if (allocated > max_blocks)
2624                                         allocated = max_blocks;
2625                                 set_buffer_unwritten(bh_result);
2626                                 goto out2;
2627                         }
2628
2629                         ret = ext4_ext_convert_to_initialized(handle, inode,
2630                                                                 path, iblock,
2631                                                                 max_blocks);
2632                         if (ret <= 0) {
2633                                 err = ret;
2634                                 goto out2;
2635                         } else
2636                                 allocated = ret;
2637                         goto outnew;
2638                 }
2639         }
2640
2641         /*
2642          * requested block isn't allocated yet;
2643          * we can't try to create blocks if the create flag is zero
2644          */
2645         if (!create) {
2646                 /*
2647                  * put just found gap into cache to speed up
2648                  * subsequent requests
2649                  */
2650                 ext4_ext_put_gap_in_cache(inode, path, iblock);
2651                 goto out2;
2652         }
2653         /*
2654          * Okay, we need to do block allocation.  Lazily initialize the block
2655          * allocation info here if necessary.
2656          */
2657         if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2658                 ext4_init_block_alloc_info(inode);
2659
2660         /* find neighbour allocated blocks */
2661         ar.lleft = iblock;
2662         err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2663         if (err)
2664                 goto out2;
2665         ar.lright = iblock;
2666         err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2667         if (err)
2668                 goto out2;
2669
2670         /*
2671          * See if request is beyond maximum number of blocks we can have in
2672          * a single extent. For an initialized extent this limit is
2673          * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
2674          * EXT_UNINIT_MAX_LEN.
2675          */
2676         if (max_blocks > EXT_INIT_MAX_LEN &&
2677             create != EXT4_CREATE_UNINITIALIZED_EXT)
2678                 max_blocks = EXT_INIT_MAX_LEN;
2679         else if (max_blocks > EXT_UNINIT_MAX_LEN &&
2680                  create == EXT4_CREATE_UNINITIALIZED_EXT)
2681                 max_blocks = EXT_UNINIT_MAX_LEN;
2682
2683         /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
2684         newex.ee_block = cpu_to_le32(iblock);
2685         newex.ee_len = cpu_to_le16(max_blocks);
2686         err = ext4_ext_check_overlap(inode, &newex, path);
2687         if (err)
2688                 allocated = ext4_ext_get_actual_len(&newex);
2689         else
2690                 allocated = max_blocks;
2691
2692         /* allocate new block */
2693         ar.inode = inode;
2694         ar.goal = ext4_ext_find_goal(inode, path, iblock);
2695         ar.logical = iblock;
2696         ar.len = allocated;
2697         if (S_ISREG(inode->i_mode))
2698                 ar.flags = EXT4_MB_HINT_DATA;
2699         else
2700                 /* disable in-core preallocation for non-regular files */
2701                 ar.flags = 0;
2702         newblock = ext4_mb_new_blocks(handle, &ar, &err);
2703         if (!newblock)
2704                 goto out2;
2705         ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2706                         goal, newblock, allocated);
2707
2708         /* try to insert new extent into found leaf and return */
2709         ext4_ext_store_pblock(&newex, newblock);
2710         newex.ee_len = cpu_to_le16(ar.len);
2711         if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
2712                 ext4_ext_mark_uninitialized(&newex);
2713         err = ext4_ext_insert_extent(handle, inode, path, &newex);
2714         if (err) {
2715                 /* free data blocks we just allocated */
2716                 /* not a good idea to call discard here directly,
2717                  * but otherwise we'd need to call it every free() */
2718                 ext4_mb_discard_inode_preallocations(inode);
2719                 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2720                                         ext4_ext_get_actual_len(&newex), 0);
2721                 goto out2;
2722         }
2723
2724         /* previous routine could use block we allocated */
2725         newblock = ext_pblock(&newex);
2726         allocated = ext4_ext_get_actual_len(&newex);
2727 outnew:
2728         if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2729                 EXT4_I(inode)->i_disksize = inode->i_size;
2730
2731         set_buffer_new(bh_result);
2732
2733         /* Cache only when it is _not_ an uninitialized extent */
2734         if (create != EXT4_CREATE_UNINITIALIZED_EXT)
2735                 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2736                                                 EXT4_EXT_CACHE_EXTENT);
2737 out:
2738         if (allocated > max_blocks)
2739                 allocated = max_blocks;
2740         ext4_ext_show_leaf(inode, path);
2741         set_buffer_mapped(bh_result);
2742         bh_result->b_bdev = inode->i_sb->s_bdev;
2743         bh_result->b_blocknr = newblock;
2744 out2:
2745         if (path) {
2746                 ext4_ext_drop_refs(path);
2747                 kfree(path);
2748         }
2749         return err ? err : allocated;
2750 }
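/*
 * Editor's sketch (not part of the original source): the calling
 * convention documented above ext4_ext_get_blocks(), reduced to a
 * hypothetical read-side lookup. Per that comment, the caller is assumed
 * to hold down_read(&EXT4_I(inode)->i_data_sem) since create == 0.
 */
#if 0   /* illustrative only */
static int example_lookup(handle_t *handle, struct inode *inode,
                          ext4_lblk_t lblock)
{
        struct buffer_head bh;
        int ret;

        memset(&bh, 0, sizeof(bh));
        /* create == 0, extend_disksize == 0: pure lookup, no allocation */
        ret = ext4_ext_get_blocks(handle, inode, lblock, 1, &bh, 0, 0);
        if (ret > 0 && buffer_mapped(&bh))
                ext_debug("block %u maps to %llu\n", lblock,
                          (unsigned long long) bh.b_blocknr);
        return ret < 0 ? ret : 0;
}
#endif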
2751
2752 void ext4_ext_truncate(struct inode *inode)
2753 {
2754         struct address_space *mapping = inode->i_mapping;
2755         struct super_block *sb = inode->i_sb;
2756         ext4_lblk_t last_block;
2757         handle_t *handle;
2758         int err = 0;
2759
2760         /*
2761          * probably first extent we're gonna free will be last in block
2762          */
2763         err = ext4_writepage_trans_blocks(inode) + 3;
2764         handle = ext4_journal_start(inode, err);
2765         if (IS_ERR(handle))
2766                 return;
2767
2768         if (inode->i_size & (sb->s_blocksize - 1))
2769                 ext4_block_truncate_page(handle, mapping, inode->i_size);
2770
2771         if (ext4_orphan_add(handle, inode))
2772                 goto out_stop;
2773
2774         down_write(&EXT4_I(inode)->i_data_sem);
2775         ext4_ext_invalidate_cache(inode);
2776
2777         ext4_mb_discard_inode_preallocations(inode);
2778
2779         /*
2780          * TODO: optimization is possible here.
2781          * Probably we need not scan at all,
2782          * because page truncation is enough.
2783          */
2784
2785         /* we have to know where to truncate from in crash case */
2786         EXT4_I(inode)->i_disksize = inode->i_size;
2787         ext4_mark_inode_dirty(handle, inode);
2788
2789         last_block = (inode->i_size + sb->s_blocksize - 1)
2790                         >> EXT4_BLOCK_SIZE_BITS(sb);
2791         err = ext4_ext_remove_space(inode, last_block);
2792
2793         /* In a multi-transaction truncate, we only make the final
2794          * transaction synchronous.
2795          */
2796         if (IS_SYNC(inode))
2797                 handle->h_sync = 1;
2798
out_stop:
        up_write(&EXT4_I(inode)->i_data_sem);
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
         * However, if this was a real unlink then we were called by
         * ext4_delete_inode(), and we allow that function to clean up the
         * orphan info for us.
         */
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);

        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
}

/*
 * ext4_ext_writepage_trans_blocks:
 * calculate the maximum number of blocks we could modify
 * in order to allocate a new block for an inode
 */
int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
{
        int needed;

        needed = ext4_ext_calc_credits_for_insert(inode, NULL);

        /* the caller wants to allocate num blocks; each per-insert
         * estimate includes the superblock, which only needs to be
         * journalled once across all the inserts */
        needed = needed * num - (num - 1);
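        /*
         * Worked example (hypothetical numbers): if a single insert needs
         * 8 credits, one of which is the superblock, then num = 4 inserts
         * need 8 * 4 - (4 - 1) = 29 credits, since the superblock credit
         * is shared by all four inserts.
         */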

#ifdef CONFIG_QUOTA
        needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
#endif

        return needed;
}

static void ext4_falloc_update_inode(struct inode *inode,
                                int mode, loff_t new_size, int update_ctime)
{
        struct timespec now;

        if (update_ctime) {
                now = current_fs_time(inode->i_sb);
                if (!timespec_equal(&inode->i_ctime, &now))
                        inode->i_ctime = now;
        }
        /*
         * Update i_size only when preallocation was requested beyond
         * the current file size.
         */
        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                                new_size > i_size_read(inode)) {
                i_size_write(inode, new_size);
                EXT4_I(inode)->i_disksize = new_size;
        }
}

/*
 * preallocate space for a file. This implements ext4's fallocate inode
 * operation, which gets called from the sys_fallocate system call.
 * For block-mapped files, posix_fallocate should fall back to writing
 * zeroes to the required new blocks (the same behavior that is expected
 * of file systems that do not support the fallocate() system call).
 */
long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
{
        handle_t *handle;
        ext4_lblk_t block;
        loff_t new_size;
        unsigned long max_blocks;
        int ret = 0;
        int ret2 = 0;
        int retries = 0;
        struct buffer_head map_bh;
        unsigned int credits, blkbits = inode->i_blkbits;

        /*
         * currently supporting (pre)allocate mode for extent-based
         * files _only_
         */
        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
                return -EOPNOTSUPP;

        /* preallocation to directories is currently not supported */
        if (S_ISDIR(inode->i_mode))
                return -ENODEV;

        block = offset >> blkbits;
        /*
         * We can't simply convert len to max_blocks: the range may start
         * and end mid-block (e.g. blocksize = 4096, offset = 3072,
         * len = 2048), so round the end up to a block boundary.
         */
        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
                                                        - block;
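        /*
         * Continuing the example above: offset = 3072 gives block = 0,
         * and EXT4_BLOCK_ALIGN(2048 + 3072, blkbits) >> blkbits = 2, so
         * max_blocks = 2 even though len itself is only half a block.
         */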
        /*
         * credits to insert 1 extent into extent tree + buffers to be able to
         * modify 1 super block, 1 block bitmap and 1 group descriptor.
         */
        credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
        mutex_lock(&inode->i_mutex);
retry:
        while (ret >= 0 && ret < max_blocks) {
                block = block + ret;
                max_blocks = max_blocks - ret;
                handle = ext4_journal_start(inode, credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        break;
                }
                ret = ext4_get_blocks_wrap(handle, inode, block,
                                          max_blocks, &map_bh,
                                          EXT4_CREATE_UNINITIALIZED_EXT, 0);
                if (ret <= 0) {
#ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);
                        printk(KERN_ERR "%s: ext4_ext_get_blocks "
                                    "returned error inode#%lu, block=%u, "
                                    "max_blocks=%lu\n", __func__,
                                    inode->i_ino, block, max_blocks);
#endif
                        ext4_mark_inode_dirty(handle, inode);
                        ret2 = ext4_journal_stop(handle);
                        break;
                }
                if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
                                                blkbits) >> blkbits))
                        new_size = offset + len;
                else
                        /* cast before shifting so a large logical block
                         * number cannot overflow 32-bit arithmetic */
                        new_size = (loff_t)(block + ret) << blkbits;

                ext4_falloc_update_inode(inode, mode, new_size,
                                                buffer_new(&map_bh));
                ext4_mark_inode_dirty(handle, inode);
                ret2 = ext4_journal_stop(handle);
                if (ret2)
                        break;
        }
        if (ret == -ENOSPC &&
                        ext4_should_retry_alloc(inode->i_sb, &retries)) {
                ret = 0;
                goto retry;
        }
        mutex_unlock(&inode->i_mutex);
        return ret > 0 ? ret2 : ret;
}
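
/*
 * Usage sketch (hypothetical userspace test, not part of this file):
 * preallocating 1 MiB without changing the visible file size would be
 * driven from userspace roughly as follows, assuming an ext4 mount at
 * /mnt/ext4:
 *
 *	int fd = open("/mnt/ext4/file", O_RDWR | O_CREAT, 0644);
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
 *
 * With FALLOC_FL_KEEP_SIZE set, ext4_falloc_update_inode() above leaves
 * i_size untouched, so the preallocated uninitialized extents sit beyond
 * EOF until the file is actually extended.
 */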