Merge branch 'linus' into locking-for-linus
[linux-2.6] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35
36 #include "ocfs2.h"
37
38 #include "alloc.h"
39 #include "aops.h"
40 #include "blockcheck.h"
41 #include "dlmglue.h"
42 #include "extent_map.h"
43 #include "inode.h"
44 #include "journal.h"
45 #include "localalloc.h"
46 #include "suballoc.h"
47 #include "sysfile.h"
48 #include "file.h"
49 #include "super.h"
50 #include "uptodate.h"
51 #include "xattr.h"
52
53 #include "buffer_head_io.h"
54
55
56 /*
57  * Operations for a specific extent tree type.
58  *
59  * To implement an on-disk btree (extent tree) type in ocfs2, add
60  * an ocfs2_extent_tree_operations structure and the matching
61  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
62  * for the allocation portion of the extent tree.
63  */
64 struct ocfs2_extent_tree_operations {
65         /*
66          * last_eb_blk is the block number of the right most leaf extent
67          * block.  Most on-disk structures containing an extent tree store
68          * this value for fast access.  The ->eo_set_last_eb_blk() and
69          * ->eo_get_last_eb_blk() operations access this value.  They are
70          *  both required.
71          */
72         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
73                                    u64 blkno);
74         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
75
76         /*
77          * The on-disk structure usually keeps track of how many total
78          * clusters are stored in this extent tree.  This function updates
79          * that value.  new_clusters is the delta, and must be
80          * added to the total.  Required.
81          */
82         void (*eo_update_clusters)(struct inode *inode,
83                                    struct ocfs2_extent_tree *et,
84                                    u32 new_clusters);
85
86         /*
87          * If ->eo_insert_check() exists, it is called before rec is
88          * inserted into the extent tree.  It is optional.
89          */
90         int (*eo_insert_check)(struct inode *inode,
91                                struct ocfs2_extent_tree *et,
92                                struct ocfs2_extent_rec *rec);
93         int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
94
95         /*
96          * --------------------------------------------------------------
97          * The remaining are internal to ocfs2_extent_tree and don't have
98          * accessor functions
99          */
100
101         /*
102          * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
103          * It is required.
104          */
105         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
106
107         /*
108          * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
109          * it exists.  If it does not, et->et_max_leaf_clusters is set
110          * to 0 (unlimited).  Optional.
111          */
112         void (*eo_fill_max_leaf_clusters)(struct inode *inode,
113                                           struct ocfs2_extent_tree *et);
114 };
115
116
117 /*
118  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
119  * in the methods.
120  */
121 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
122 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
123                                          u64 blkno);
124 static void ocfs2_dinode_update_clusters(struct inode *inode,
125                                          struct ocfs2_extent_tree *et,
126                                          u32 clusters);
127 static int ocfs2_dinode_insert_check(struct inode *inode,
128                                      struct ocfs2_extent_tree *et,
129                                      struct ocfs2_extent_rec *rec);
130 static int ocfs2_dinode_sanity_check(struct inode *inode,
131                                      struct ocfs2_extent_tree *et);
132 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
133 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
134         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
135         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
136         .eo_update_clusters     = ocfs2_dinode_update_clusters,
137         .eo_insert_check        = ocfs2_dinode_insert_check,
138         .eo_sanity_check        = ocfs2_dinode_sanity_check,
139         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
140 };
141
142 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
143                                          u64 blkno)
144 {
145         struct ocfs2_dinode *di = et->et_object;
146
147         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
148         di->i_last_eb_blk = cpu_to_le64(blkno);
149 }
150
151 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
152 {
153         struct ocfs2_dinode *di = et->et_object;
154
155         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
156         return le64_to_cpu(di->i_last_eb_blk);
157 }
158
159 static void ocfs2_dinode_update_clusters(struct inode *inode,
160                                          struct ocfs2_extent_tree *et,
161                                          u32 clusters)
162 {
163         struct ocfs2_dinode *di = et->et_object;
164
165         le32_add_cpu(&di->i_clusters, clusters);
166         spin_lock(&OCFS2_I(inode)->ip_lock);
167         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
168         spin_unlock(&OCFS2_I(inode)->ip_lock);
169 }
170
171 static int ocfs2_dinode_insert_check(struct inode *inode,
172                                      struct ocfs2_extent_tree *et,
173                                      struct ocfs2_extent_rec *rec)
174 {
175         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
176
177         BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179                         (OCFS2_I(inode)->ip_clusters !=
180                          le32_to_cpu(rec->e_cpos)),
181                         "Device %s, asking for sparse allocation: inode %llu, "
182                         "cpos %u, clusters %u\n",
183                         osb->dev_str,
184                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
185                         rec->e_cpos,
186                         OCFS2_I(inode)->ip_clusters);
187
188         return 0;
189 }
190
191 static int ocfs2_dinode_sanity_check(struct inode *inode,
192                                      struct ocfs2_extent_tree *et)
193 {
194         struct ocfs2_dinode *di = et->et_object;
195
196         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
197         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
198
199         return 0;
200 }
201
202 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
203 {
204         struct ocfs2_dinode *di = et->et_object;
205
206         et->et_root_el = &di->id2.i_list;
207 }
208
209
210 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
211 {
212         struct ocfs2_xattr_value_buf *vb = et->et_object;
213
214         et->et_root_el = &vb->vb_xv->xr_list;
215 }
216
217 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
218                                               u64 blkno)
219 {
220         struct ocfs2_xattr_value_buf *vb = et->et_object;
221
222         vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
223 }
224
225 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
226 {
227         struct ocfs2_xattr_value_buf *vb = et->et_object;
228
229         return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
230 }
231
232 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
233                                               struct ocfs2_extent_tree *et,
234                                               u32 clusters)
235 {
236         struct ocfs2_xattr_value_buf *vb = et->et_object;
237
238         le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
239 }
240
241 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
242         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
243         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
244         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
245         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
246 };
247
248 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
249 {
250         struct ocfs2_xattr_block *xb = et->et_object;
251
252         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
253 }
254
255 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
256                                                     struct ocfs2_extent_tree *et)
257 {
258         et->et_max_leaf_clusters =
259                 ocfs2_clusters_for_bytes(inode->i_sb,
260                                          OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
261 }
262
263 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
264                                              u64 blkno)
265 {
266         struct ocfs2_xattr_block *xb = et->et_object;
267         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
268
269         xt->xt_last_eb_blk = cpu_to_le64(blkno);
270 }
271
272 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
273 {
274         struct ocfs2_xattr_block *xb = et->et_object;
275         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
276
277         return le64_to_cpu(xt->xt_last_eb_blk);
278 }
279
280 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
281                                              struct ocfs2_extent_tree *et,
282                                              u32 clusters)
283 {
284         struct ocfs2_xattr_block *xb = et->et_object;
285
286         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
287 }
288
289 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
290         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
291         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
292         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
293         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
294         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295 };
296
297 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
298                                      struct inode *inode,
299                                      struct buffer_head *bh,
300                                      ocfs2_journal_access_func access,
301                                      void *obj,
302                                      struct ocfs2_extent_tree_operations *ops)
303 {
304         et->et_ops = ops;
305         et->et_root_bh = bh;
306         et->et_root_journal_access = access;
307         if (!obj)
308                 obj = (void *)bh->b_data;
309         et->et_object = obj;
310
311         et->et_ops->eo_fill_root_el(et);
312         if (!et->et_ops->eo_fill_max_leaf_clusters)
313                 et->et_max_leaf_clusters = 0;
314         else
315                 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
316 }
317
318 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
319                                    struct inode *inode,
320                                    struct buffer_head *bh)
321 {
322         __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
323                                  NULL, &ocfs2_dinode_et_ops);
324 }
325
326 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
327                                        struct inode *inode,
328                                        struct buffer_head *bh)
329 {
330         __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
331                                  NULL, &ocfs2_xattr_tree_et_ops);
332 }
333
334 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
335                                         struct inode *inode,
336                                         struct ocfs2_xattr_value_buf *vb)
337 {
338         __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
339                                  &ocfs2_xattr_value_et_ops);
340 }
341
342 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
343                                             u64 new_last_eb_blk)
344 {
345         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
346 }
347
348 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
349 {
350         return et->et_ops->eo_get_last_eb_blk(et);
351 }
352
353 static inline void ocfs2_et_update_clusters(struct inode *inode,
354                                             struct ocfs2_extent_tree *et,
355                                             u32 clusters)
356 {
357         et->et_ops->eo_update_clusters(inode, et, clusters);
358 }
359
360 static inline int ocfs2_et_root_journal_access(handle_t *handle,
361                                                struct inode *inode,
362                                                struct ocfs2_extent_tree *et,
363                                                int type)
364 {
365         return et->et_root_journal_access(handle, inode, et->et_root_bh,
366                                           type);
367 }
368
369 static inline int ocfs2_et_insert_check(struct inode *inode,
370                                         struct ocfs2_extent_tree *et,
371                                         struct ocfs2_extent_rec *rec)
372 {
373         int ret = 0;
374
375         if (et->et_ops->eo_insert_check)
376                 ret = et->et_ops->eo_insert_check(inode, et, rec);
377         return ret;
378 }
379
380 static inline int ocfs2_et_sanity_check(struct inode *inode,
381                                         struct ocfs2_extent_tree *et)
382 {
383         int ret = 0;
384
385         if (et->et_ops->eo_sanity_check)
386                 ret = et->et_ops->eo_sanity_check(inode, et);
387         return ret;
388 }
389
390 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
391 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
392                                          struct ocfs2_extent_block *eb);
393
394 /*
395  * Structures which describe a path through a btree, and functions to
396  * manipulate them.
397  *
398  * The idea here is to be as generic as possible with the tree
399  * manipulation code.
400  */
401 struct ocfs2_path_item {
402         struct buffer_head              *bh;
403         struct ocfs2_extent_list        *el;
404 };
405
406 #define OCFS2_MAX_PATH_DEPTH    5
407
408 struct ocfs2_path {
409         int                             p_tree_depth;
410         ocfs2_journal_access_func       p_root_access;
411         struct ocfs2_path_item          p_node[OCFS2_MAX_PATH_DEPTH];
412 };
413
414 #define path_root_bh(_path) ((_path)->p_node[0].bh)
415 #define path_root_el(_path) ((_path)->p_node[0].el)
416 #define path_root_access(_path)((_path)->p_root_access)
417 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
418 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
419 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
420
421 /*
422  * Reset the actual path elements so that we can re-use the structure
423  * to build another path. Generally, this involves freeing the buffer
424  * heads.
425  */
426 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
427 {
428         int i, start = 0, depth = 0;
429         struct ocfs2_path_item *node;
430
431         if (keep_root)
432                 start = 1;
433
434         for(i = start; i < path_num_items(path); i++) {
435                 node = &path->p_node[i];
436
437                 brelse(node->bh);
438                 node->bh = NULL;
439                 node->el = NULL;
440         }
441
442         /*
443          * Tree depth may change during truncate, or insert. If we're
444          * keeping the root extent list, then make sure that our path
445          * structure reflects the proper depth.
446          */
447         if (keep_root)
448                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
449         else
450                 path_root_access(path) = NULL;
451
452         path->p_tree_depth = depth;
453 }
454
455 static void ocfs2_free_path(struct ocfs2_path *path)
456 {
457         if (path) {
458                 ocfs2_reinit_path(path, 0);
459                 kfree(path);
460         }
461 }
462
463 /*
464  * All the elements of src into dest. After this call, src could be freed
465  * without affecting dest.
466  *
467  * Both paths should have the same root. Any non-root elements of dest
468  * will be freed.
469  */
470 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
471 {
472         int i;
473
474         BUG_ON(path_root_bh(dest) != path_root_bh(src));
475         BUG_ON(path_root_el(dest) != path_root_el(src));
476         BUG_ON(path_root_access(dest) != path_root_access(src));
477
478         ocfs2_reinit_path(dest, 1);
479
480         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
481                 dest->p_node[i].bh = src->p_node[i].bh;
482                 dest->p_node[i].el = src->p_node[i].el;
483
484                 if (dest->p_node[i].bh)
485                         get_bh(dest->p_node[i].bh);
486         }
487 }
488
489 /*
490  * Make the *dest path the same as src and re-initialize src path to
491  * have a root only.
492  */
493 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
494 {
495         int i;
496
497         BUG_ON(path_root_bh(dest) != path_root_bh(src));
498         BUG_ON(path_root_access(dest) != path_root_access(src));
499
500         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
501                 brelse(dest->p_node[i].bh);
502
503                 dest->p_node[i].bh = src->p_node[i].bh;
504                 dest->p_node[i].el = src->p_node[i].el;
505
506                 src->p_node[i].bh = NULL;
507                 src->p_node[i].el = NULL;
508         }
509 }
510
511 /*
512  * Insert an extent block at given index.
513  *
514  * This will not take an additional reference on eb_bh.
515  */
516 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
517                                         struct buffer_head *eb_bh)
518 {
519         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
520
521         /*
522          * Right now, no root bh is an extent block, so this helps
523          * catch code errors with dinode trees. The assertion can be
524          * safely removed if we ever need to insert extent block
525          * structures at the root.
526          */
527         BUG_ON(index == 0);
528
529         path->p_node[index].bh = eb_bh;
530         path->p_node[index].el = &eb->h_list;
531 }
532
533 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
534                                          struct ocfs2_extent_list *root_el,
535                                          ocfs2_journal_access_func access)
536 {
537         struct ocfs2_path *path;
538
539         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
540
541         path = kzalloc(sizeof(*path), GFP_NOFS);
542         if (path) {
543                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
544                 get_bh(root_bh);
545                 path_root_bh(path) = root_bh;
546                 path_root_el(path) = root_el;
547                 path_root_access(path) = access;
548         }
549
550         return path;
551 }
552
553 static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
554 {
555         return ocfs2_new_path(path_root_bh(path), path_root_el(path),
556                               path_root_access(path));
557 }
558
559 static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
560 {
561         return ocfs2_new_path(et->et_root_bh, et->et_root_el,
562                               et->et_root_journal_access);
563 }
564
565 /*
566  * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
567  * otherwise it's the root_access function.
568  *
569  * I don't like the way this function's name looks next to
570  * ocfs2_journal_access_path(), but I don't have a better one.
571  */
572 static int ocfs2_path_bh_journal_access(handle_t *handle,
573                                         struct inode *inode,
574                                         struct ocfs2_path *path,
575                                         int idx)
576 {
577         ocfs2_journal_access_func access = path_root_access(path);
578
579         if (!access)
580                 access = ocfs2_journal_access;
581
582         if (idx)
583                 access = ocfs2_journal_access_eb;
584
585         return access(handle, inode, path->p_node[idx].bh,
586                       OCFS2_JOURNAL_ACCESS_WRITE);
587 }
588
589 /*
590  * Convenience function to journal all components in a path.
591  */
592 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
593                                      struct ocfs2_path *path)
594 {
595         int i, ret = 0;
596
597         if (!path)
598                 goto out;
599
600         for(i = 0; i < path_num_items(path); i++) {
601                 ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
602                 if (ret < 0) {
603                         mlog_errno(ret);
604                         goto out;
605                 }
606         }
607
608 out:
609         return ret;
610 }
611
612 /*
613  * Return the index of the extent record which contains cluster #v_cluster.
614  * -1 is returned if it was not found.
615  *
616  * Should work fine on interior and exterior nodes.
617  */
618 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
619 {
620         int ret = -1;
621         int i;
622         struct ocfs2_extent_rec *rec;
623         u32 rec_end, rec_start, clusters;
624
625         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
626                 rec = &el->l_recs[i];
627
628                 rec_start = le32_to_cpu(rec->e_cpos);
629                 clusters = ocfs2_rec_clusters(el, rec);
630
631                 rec_end = rec_start + clusters;
632
633                 if (v_cluster >= rec_start && v_cluster < rec_end) {
634                         ret = i;
635                         break;
636                 }
637         }
638
639         return ret;
640 }
641
642 enum ocfs2_contig_type {
643         CONTIG_NONE = 0,
644         CONTIG_LEFT,
645         CONTIG_RIGHT,
646         CONTIG_LEFTRIGHT,
647 };
648
649
650 /*
651  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
652  * ocfs2_extent_contig only work properly against leaf nodes!
653  */
654 static int ocfs2_block_extent_contig(struct super_block *sb,
655                                      struct ocfs2_extent_rec *ext,
656                                      u64 blkno)
657 {
658         u64 blk_end = le64_to_cpu(ext->e_blkno);
659
660         blk_end += ocfs2_clusters_to_blocks(sb,
661                                     le16_to_cpu(ext->e_leaf_clusters));
662
663         return blkno == blk_end;
664 }
665
666 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
667                                   struct ocfs2_extent_rec *right)
668 {
669         u32 left_range;
670
671         left_range = le32_to_cpu(left->e_cpos) +
672                 le16_to_cpu(left->e_leaf_clusters);
673
674         return (left_range == le32_to_cpu(right->e_cpos));
675 }
676
677 static enum ocfs2_contig_type
678         ocfs2_extent_contig(struct inode *inode,
679                             struct ocfs2_extent_rec *ext,
680                             struct ocfs2_extent_rec *insert_rec)
681 {
682         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
683
684         /*
685          * Refuse to coalesce extent records with different flag
686          * fields - we don't want to mix unwritten extents with user
687          * data.
688          */
689         if (ext->e_flags != insert_rec->e_flags)
690                 return CONTIG_NONE;
691
692         if (ocfs2_extents_adjacent(ext, insert_rec) &&
693             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
694                         return CONTIG_RIGHT;
695
696         blkno = le64_to_cpu(ext->e_blkno);
697         if (ocfs2_extents_adjacent(insert_rec, ext) &&
698             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
699                 return CONTIG_LEFT;
700
701         return CONTIG_NONE;
702 }
703
704 /*
705  * NOTE: We can have pretty much any combination of contiguousness and
706  * appending.
707  *
708  * The usefulness of APPEND_TAIL is more in that it lets us know that
709  * we'll have to update the path to that leaf.
710  */
711 enum ocfs2_append_type {
712         APPEND_NONE = 0,
713         APPEND_TAIL,
714 };
715
716 enum ocfs2_split_type {
717         SPLIT_NONE = 0,
718         SPLIT_LEFT,
719         SPLIT_RIGHT,
720 };
721
722 struct ocfs2_insert_type {
723         enum ocfs2_split_type   ins_split;
724         enum ocfs2_append_type  ins_appending;
725         enum ocfs2_contig_type  ins_contig;
726         int                     ins_contig_index;
727         int                     ins_tree_depth;
728 };
729
730 struct ocfs2_merge_ctxt {
731         enum ocfs2_contig_type  c_contig_type;
732         int                     c_has_empty_extent;
733         int                     c_split_covers_rec;
734 };
735
736 static int ocfs2_validate_extent_block(struct super_block *sb,
737                                        struct buffer_head *bh)
738 {
739         int rc;
740         struct ocfs2_extent_block *eb =
741                 (struct ocfs2_extent_block *)bh->b_data;
742
743         mlog(0, "Validating extent block %llu\n",
744              (unsigned long long)bh->b_blocknr);
745
746         BUG_ON(!buffer_uptodate(bh));
747
748         /*
749          * If the ecc fails, we return the error but otherwise
750          * leave the filesystem running.  We know any error is
751          * local to this block.
752          */
753         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
754         if (rc) {
755                 mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
756                      (unsigned long long)bh->b_blocknr);
757                 return rc;
758         }
759
760         /*
761          * Errors after here are fatal.
762          */
763
764         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
765                 ocfs2_error(sb,
766                             "Extent block #%llu has bad signature %.*s",
767                             (unsigned long long)bh->b_blocknr, 7,
768                             eb->h_signature);
769                 return -EINVAL;
770         }
771
772         if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
773                 ocfs2_error(sb,
774                             "Extent block #%llu has an invalid h_blkno "
775                             "of %llu",
776                             (unsigned long long)bh->b_blocknr,
777                             (unsigned long long)le64_to_cpu(eb->h_blkno));
778                 return -EINVAL;
779         }
780
781         if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
782                 ocfs2_error(sb,
783                             "Extent block #%llu has an invalid "
784                             "h_fs_generation of #%u",
785                             (unsigned long long)bh->b_blocknr,
786                             le32_to_cpu(eb->h_fs_generation));
787                 return -EINVAL;
788         }
789
790         return 0;
791 }
792
793 int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
794                             struct buffer_head **bh)
795 {
796         int rc;
797         struct buffer_head *tmp = *bh;
798
799         rc = ocfs2_read_block(inode, eb_blkno, &tmp,
800                               ocfs2_validate_extent_block);
801
802         /* If ocfs2_read_block() got us a new bh, pass it up. */
803         if (!rc && !*bh)
804                 *bh = tmp;
805
806         return rc;
807 }
808
809
810 /*
811  * How many free extents have we got before we need more meta data?
812  */
813 int ocfs2_num_free_extents(struct ocfs2_super *osb,
814                            struct inode *inode,
815                            struct ocfs2_extent_tree *et)
816 {
817         int retval;
818         struct ocfs2_extent_list *el = NULL;
819         struct ocfs2_extent_block *eb;
820         struct buffer_head *eb_bh = NULL;
821         u64 last_eb_blk = 0;
822
823         mlog_entry_void();
824
825         el = et->et_root_el;
826         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
827
828         if (last_eb_blk) {
829                 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
830                 if (retval < 0) {
831                         mlog_errno(retval);
832                         goto bail;
833                 }
834                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
835                 el = &eb->h_list;
836         }
837
838         BUG_ON(el->l_tree_depth != 0);
839
840         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
841 bail:
842         brelse(eb_bh);
843
844         mlog_exit(retval);
845         return retval;
846 }
847
848 /* expects array to already be allocated
849  *
850  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
851  * l_count for you
852  */
853 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
854                                      handle_t *handle,
855                                      struct inode *inode,
856                                      int wanted,
857                                      struct ocfs2_alloc_context *meta_ac,
858                                      struct buffer_head *bhs[])
859 {
860         int count, status, i;
861         u16 suballoc_bit_start;
862         u32 num_got;
863         u64 first_blkno;
864         struct ocfs2_extent_block *eb;
865
866         mlog_entry_void();
867
868         count = 0;
869         while (count < wanted) {
870                 status = ocfs2_claim_metadata(osb,
871                                               handle,
872                                               meta_ac,
873                                               wanted - count,
874                                               &suballoc_bit_start,
875                                               &num_got,
876                                               &first_blkno);
877                 if (status < 0) {
878                         mlog_errno(status);
879                         goto bail;
880                 }
881
882                 for(i = count;  i < (num_got + count); i++) {
883                         bhs[i] = sb_getblk(osb->sb, first_blkno);
884                         if (bhs[i] == NULL) {
885                                 status = -EIO;
886                                 mlog_errno(status);
887                                 goto bail;
888                         }
889                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
890
891                         status = ocfs2_journal_access_eb(handle, inode, bhs[i],
892                                                          OCFS2_JOURNAL_ACCESS_CREATE);
893                         if (status < 0) {
894                                 mlog_errno(status);
895                                 goto bail;
896                         }
897
898                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
899                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
900                         /* Ok, setup the minimal stuff here. */
901                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
902                         eb->h_blkno = cpu_to_le64(first_blkno);
903                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
904                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
905                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
906                         eb->h_list.l_count =
907                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
908
909                         suballoc_bit_start++;
910                         first_blkno++;
911
912                         /* We'll also be dirtied by the caller, so
913                          * this isn't absolutely necessary. */
914                         status = ocfs2_journal_dirty(handle, bhs[i]);
915                         if (status < 0) {
916                                 mlog_errno(status);
917                                 goto bail;
918                         }
919                 }
920
921                 count += num_got;
922         }
923
924         status = 0;
925 bail:
926         if (status < 0) {
927                 for(i = 0; i < wanted; i++) {
928                         brelse(bhs[i]);
929                         bhs[i] = NULL;
930                 }
931         }
932         mlog_exit(status);
933         return status;
934 }
935
936 /*
937  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
938  *
939  * Returns the sum of the rightmost extent rec logical offset and
940  * cluster count.
941  *
942  * ocfs2_add_branch() uses this to determine what logical cluster
943  * value should be populated into the leftmost new branch records.
944  *
945  * ocfs2_shift_tree_depth() uses this to determine the # clusters
946  * value for the new topmost tree record.
947  */
948 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
949 {
950         int i;
951
952         i = le16_to_cpu(el->l_next_free_rec) - 1;
953
954         return le32_to_cpu(el->l_recs[i].e_cpos) +
955                 ocfs2_rec_clusters(el, &el->l_recs[i]);
956 }
957
958 /*
959  * Add an entire tree branch to our inode. eb_bh is the extent block
960  * to start at, if we don't want to start the branch at the dinode
961  * structure.
962  *
963  * last_eb_bh is required as we have to update it's next_leaf pointer
964  * for the new last extent block.
965  *
966  * the new branch will be 'empty' in the sense that every block will
967  * contain a single record with cluster count == 0.
968  */
969 static int ocfs2_add_branch(struct ocfs2_super *osb,
970                             handle_t *handle,
971                             struct inode *inode,
972                             struct ocfs2_extent_tree *et,
973                             struct buffer_head *eb_bh,
974                             struct buffer_head **last_eb_bh,
975                             struct ocfs2_alloc_context *meta_ac)
976 {
977         int status, new_blocks, i;
978         u64 next_blkno, new_last_eb_blk;
979         struct buffer_head *bh;
980         struct buffer_head **new_eb_bhs = NULL;
981         struct ocfs2_extent_block *eb;
982         struct ocfs2_extent_list  *eb_el;
983         struct ocfs2_extent_list  *el;
984         u32 new_cpos;
985
986         mlog_entry_void();
987
988         BUG_ON(!last_eb_bh || !*last_eb_bh);
989
990         if (eb_bh) {
991                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
992                 el = &eb->h_list;
993         } else
994                 el = et->et_root_el;
995
996         /* we never add a branch to a leaf. */
997         BUG_ON(!el->l_tree_depth);
998
999         new_blocks = le16_to_cpu(el->l_tree_depth);
1000
1001         /* allocate the number of new eb blocks we need */
1002         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
1003                              GFP_KERNEL);
1004         if (!new_eb_bhs) {
1005                 status = -ENOMEM;
1006                 mlog_errno(status);
1007                 goto bail;
1008         }
1009
1010         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
1011                                            meta_ac, new_eb_bhs);
1012         if (status < 0) {
1013                 mlog_errno(status);
1014                 goto bail;
1015         }
1016
1017         eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1018         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1019
1020         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1021          * linked with the rest of the tree.
1022          * conversly, new_eb_bhs[0] is the new bottommost leaf.
1023          *
1024          * when we leave the loop, new_last_eb_blk will point to the
1025          * newest leaf, and next_blkno will point to the topmost extent
1026          * block. */
1027         next_blkno = new_last_eb_blk = 0;
1028         for(i = 0; i < new_blocks; i++) {
1029                 bh = new_eb_bhs[i];
1030                 eb = (struct ocfs2_extent_block *) bh->b_data;
1031                 /* ocfs2_create_new_meta_bhs() should create it right! */
1032                 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1033                 eb_el = &eb->h_list;
1034
1035                 status = ocfs2_journal_access_eb(handle, inode, bh,
1036                                                  OCFS2_JOURNAL_ACCESS_CREATE);
1037                 if (status < 0) {
1038                         mlog_errno(status);
1039                         goto bail;
1040                 }
1041
1042                 eb->h_next_leaf_blk = 0;
1043                 eb_el->l_tree_depth = cpu_to_le16(i);
1044                 eb_el->l_next_free_rec = cpu_to_le16(1);
1045                 /*
1046                  * This actually counts as an empty extent as
1047                  * c_clusters == 0
1048                  */
1049                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1050                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1051                 /*
1052                  * eb_el isn't always an interior node, but even leaf
1053                  * nodes want a zero'd flags and reserved field so
1054                  * this gets the whole 32 bits regardless of use.
1055                  */
1056                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1057                 if (!eb_el->l_tree_depth)
1058                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1059
1060                 status = ocfs2_journal_dirty(handle, bh);
1061                 if (status < 0) {
1062                         mlog_errno(status);
1063                         goto bail;
1064                 }
1065
1066                 next_blkno = le64_to_cpu(eb->h_blkno);
1067         }
1068
1069         /* This is a bit hairy. We want to update up to three blocks
1070          * here without leaving any of them in an inconsistent state
1071          * in case of error. We don't have to worry about
1072          * journal_dirty erroring as it won't unless we've aborted the
1073          * handle (in which case we would never be here) so reserving
1074          * the write with journal_access is all we need to do. */
1075         status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
1076                                          OCFS2_JOURNAL_ACCESS_WRITE);
1077         if (status < 0) {
1078                 mlog_errno(status);
1079                 goto bail;
1080         }
1081         status = ocfs2_et_root_journal_access(handle, inode, et,
1082                                               OCFS2_JOURNAL_ACCESS_WRITE);
1083         if (status < 0) {
1084                 mlog_errno(status);
1085                 goto bail;
1086         }
1087         if (eb_bh) {
1088                 status = ocfs2_journal_access_eb(handle, inode, eb_bh,
1089                                                  OCFS2_JOURNAL_ACCESS_WRITE);
1090                 if (status < 0) {
1091                         mlog_errno(status);
1092                         goto bail;
1093                 }
1094         }
1095
1096         /* Link the new branch into the rest of the tree (el will
1097          * either be on the root_bh, or the extent block passed in. */
1098         i = le16_to_cpu(el->l_next_free_rec);
1099         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1100         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1101         el->l_recs[i].e_int_clusters = 0;
1102         le16_add_cpu(&el->l_next_free_rec, 1);
1103
1104         /* fe needs a new last extent block pointer, as does the
1105          * next_leaf on the previously last-extent-block. */
1106         ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1107
1108         eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1109         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1110
1111         status = ocfs2_journal_dirty(handle, *last_eb_bh);
1112         if (status < 0)
1113                 mlog_errno(status);
1114         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1115         if (status < 0)
1116                 mlog_errno(status);
1117         if (eb_bh) {
1118                 status = ocfs2_journal_dirty(handle, eb_bh);
1119                 if (status < 0)
1120                         mlog_errno(status);
1121         }
1122
1123         /*
1124          * Some callers want to track the rightmost leaf so pass it
1125          * back here.
1126          */
1127         brelse(*last_eb_bh);
1128         get_bh(new_eb_bhs[0]);
1129         *last_eb_bh = new_eb_bhs[0];
1130
1131         status = 0;
1132 bail:
1133         if (new_eb_bhs) {
1134                 for (i = 0; i < new_blocks; i++)
1135                         brelse(new_eb_bhs[i]);
1136                 kfree(new_eb_bhs);
1137         }
1138
1139         mlog_exit(status);
1140         return status;
1141 }
1142
1143 /*
1144  * adds another level to the allocation tree.
1145  * returns back the new extent block so you can add a branch to it
1146  * after this call.
1147  */
1148 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1149                                   handle_t *handle,
1150                                   struct inode *inode,
1151                                   struct ocfs2_extent_tree *et,
1152                                   struct ocfs2_alloc_context *meta_ac,
1153                                   struct buffer_head **ret_new_eb_bh)
1154 {
1155         int status, i;
1156         u32 new_clusters;
1157         struct buffer_head *new_eb_bh = NULL;
1158         struct ocfs2_extent_block *eb;
1159         struct ocfs2_extent_list  *root_el;
1160         struct ocfs2_extent_list  *eb_el;
1161
1162         mlog_entry_void();
1163
1164         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1165                                            &new_eb_bh);
1166         if (status < 0) {
1167                 mlog_errno(status);
1168                 goto bail;
1169         }
1170
1171         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1172         /* ocfs2_create_new_meta_bhs() should create it right! */
1173         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1174
1175         eb_el = &eb->h_list;
1176         root_el = et->et_root_el;
1177
1178         status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
1179                                          OCFS2_JOURNAL_ACCESS_CREATE);
1180         if (status < 0) {
1181                 mlog_errno(status);
1182                 goto bail;
1183         }
1184
1185         /* copy the root extent list data into the new extent block */
1186         eb_el->l_tree_depth = root_el->l_tree_depth;
1187         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1188         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1189                 eb_el->l_recs[i] = root_el->l_recs[i];
1190
1191         status = ocfs2_journal_dirty(handle, new_eb_bh);
1192         if (status < 0) {
1193                 mlog_errno(status);
1194                 goto bail;
1195         }
1196
1197         status = ocfs2_et_root_journal_access(handle, inode, et,
1198                                               OCFS2_JOURNAL_ACCESS_WRITE);
1199         if (status < 0) {
1200                 mlog_errno(status);
1201                 goto bail;
1202         }
1203
1204         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1205
1206         /* update root_bh now */
1207         le16_add_cpu(&root_el->l_tree_depth, 1);
1208         root_el->l_recs[0].e_cpos = 0;
1209         root_el->l_recs[0].e_blkno = eb->h_blkno;
1210         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1211         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1212                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1213         root_el->l_next_free_rec = cpu_to_le16(1);
1214
1215         /* If this is our 1st tree depth shift, then last_eb_blk
1216          * becomes the allocated extent block */
1217         if (root_el->l_tree_depth == cpu_to_le16(1))
1218                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1219
1220         status = ocfs2_journal_dirty(handle, et->et_root_bh);
1221         if (status < 0) {
1222                 mlog_errno(status);
1223                 goto bail;
1224         }
1225
1226         *ret_new_eb_bh = new_eb_bh;
1227         new_eb_bh = NULL;
1228         status = 0;
1229 bail:
1230         brelse(new_eb_bh);
1231
1232         mlog_exit(status);
1233         return status;
1234 }
1235
1236 /*
1237  * Should only be called when there is no space left in any of the
1238  * leaf nodes. What we want to do is find the lowest tree depth
1239  * non-leaf extent block with room for new records. There are three
1240  * valid results of this search:
1241  *
1242  * 1) a lowest extent block is found, then we pass it back in
1243  *    *lowest_eb_bh and return '0'
1244  *
1245  * 2) the search fails to find anything, but the root_el has room. We
1246  *    pass NULL back in *lowest_eb_bh, but still return '0'
1247  *
1248  * 3) the search fails to find anything AND the root_el is full, in
1249  *    which case we return > 0
1250  *
1251  * return status < 0 indicates an error.
1252  */
1253 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1254                                     struct inode *inode,
1255                                     struct ocfs2_extent_tree *et,
1256                                     struct buffer_head **target_bh)
1257 {
1258         int status = 0, i;
1259         u64 blkno;
1260         struct ocfs2_extent_block *eb;
1261         struct ocfs2_extent_list  *el;
1262         struct buffer_head *bh = NULL;
1263         struct buffer_head *lowest_bh = NULL;
1264
1265         mlog_entry_void();
1266
1267         *target_bh = NULL;
1268
1269         el = et->et_root_el;
1270
1271         while(le16_to_cpu(el->l_tree_depth) > 1) {
1272                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1273                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1274                                     "extent list (next_free_rec == 0)",
1275                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1276                         status = -EIO;
1277                         goto bail;
1278                 }
1279                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1280                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1281                 if (!blkno) {
1282                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1283                                     "list where extent # %d has no physical "
1284                                     "block start",
1285                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1286                         status = -EIO;
1287                         goto bail;
1288                 }
1289
1290                 brelse(bh);
1291                 bh = NULL;
1292
1293                 status = ocfs2_read_extent_block(inode, blkno, &bh);
1294                 if (status < 0) {
1295                         mlog_errno(status);
1296                         goto bail;
1297                 }
1298
1299                 eb = (struct ocfs2_extent_block *) bh->b_data;
1300                 el = &eb->h_list;
1301
1302                 if (le16_to_cpu(el->l_next_free_rec) <
1303                     le16_to_cpu(el->l_count)) {
1304                         brelse(lowest_bh);
1305                         lowest_bh = bh;
1306                         get_bh(lowest_bh);
1307                 }
1308         }
1309
1310         /* If we didn't find one and the fe doesn't have any room,
1311          * then return '1' */
1312         el = et->et_root_el;
1313         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1314                 status = 1;
1315
1316         *target_bh = lowest_bh;
1317 bail:
1318         brelse(bh);
1319
1320         mlog_exit(status);
1321         return status;
1322 }
1323
1324 /*
1325  * Grow a b-tree so that it has more records.
1326  *
1327  * We might shift the tree depth in which case existing paths should
1328  * be considered invalid.
1329  *
1330  * Tree depth after the grow is returned via *final_depth.
1331  *
1332  * *last_eb_bh will be updated by ocfs2_add_branch().
1333  */
1334 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1335                            struct ocfs2_extent_tree *et, int *final_depth,
1336                            struct buffer_head **last_eb_bh,
1337                            struct ocfs2_alloc_context *meta_ac)
1338 {
1339         int ret, shift;
1340         struct ocfs2_extent_list *el = et->et_root_el;
1341         int depth = le16_to_cpu(el->l_tree_depth);
1342         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1343         struct buffer_head *bh = NULL;
1344
1345         BUG_ON(meta_ac == NULL);
1346
1347         shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1348         if (shift < 0) {
1349                 ret = shift;
1350                 mlog_errno(ret);
1351                 goto out;
1352         }
1353
1354         /* We traveled all the way to the bottom of the allocation tree
1355          * and didn't find room for any more extents - we need to add
1356          * another tree level */
1357         if (shift) {
1358                 BUG_ON(bh);
1359                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1360
1361                 /* ocfs2_shift_tree_depth will return us a buffer with
1362                  * the new extent block (so we can pass that to
1363                  * ocfs2_add_branch). */
1364                 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1365                                              meta_ac, &bh);
1366                 if (ret < 0) {
1367                         mlog_errno(ret);
1368                         goto out;
1369                 }
1370                 depth++;
1371                 if (depth == 1) {
1372                         /*
1373                          * Special case: we have room now if we shifted from
1374                          * tree_depth 0, so no more work needs to be done.
1375                          *
1376                          * We won't be calling add_branch, so pass
1377                          * back *last_eb_bh as the new leaf. At depth
1378                          * zero, it should always be null so there's
1379                          * no reason to brelse.
1380                          */
1381                         BUG_ON(*last_eb_bh);
1382                         get_bh(bh);
1383                         *last_eb_bh = bh;
1384                         goto out;
1385                 }
1386         }
1387
1388         /* call ocfs2_add_branch to add the final part of the tree with
1389          * the new data. */
1390         mlog(0, "add branch. bh = %p\n", bh);
1391         ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1392                                meta_ac);
1393         if (ret < 0) {
1394                 mlog_errno(ret);
1395                 goto out;
1396         }
1397
1398 out:
1399         if (final_depth)
1400                 *final_depth = depth;
1401         brelse(bh);
1402         return ret;
1403 }
1404
1405 /*
1406  * This function will discard the rightmost extent record.
1407  */
1408 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1409 {
1410         int next_free = le16_to_cpu(el->l_next_free_rec);
1411         int count = le16_to_cpu(el->l_count);
1412         unsigned int num_bytes;
1413
1414         BUG_ON(!next_free);
1415         /* This will cause us to go off the end of our extent list. */
1416         BUG_ON(next_free >= count);
1417
1418         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1419
1420         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1421 }
1422
1423 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1424                               struct ocfs2_extent_rec *insert_rec)
1425 {
1426         int i, insert_index, next_free, has_empty, num_bytes;
1427         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1428         struct ocfs2_extent_rec *rec;
1429
1430         next_free = le16_to_cpu(el->l_next_free_rec);
1431         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1432
1433         BUG_ON(!next_free);
1434
1435         /* The tree code before us didn't allow enough room in the leaf. */
1436         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1437
1438         /*
1439          * The easiest way to approach this is to just remove the
1440          * empty extent and temporarily decrement next_free.
1441          */
1442         if (has_empty) {
1443                 /*
1444                  * If next_free was 1 (only an empty extent), this
1445                  * loop won't execute, which is fine. We still want
1446                  * the decrement above to happen.
1447                  */
1448                 for(i = 0; i < (next_free - 1); i++)
1449                         el->l_recs[i] = el->l_recs[i+1];
1450
1451                 next_free--;
1452         }
1453
1454         /*
1455          * Figure out what the new record index should be.
1456          */
1457         for(i = 0; i < next_free; i++) {
1458                 rec = &el->l_recs[i];
1459
1460                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1461                         break;
1462         }
1463         insert_index = i;
1464
1465         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1466              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1467
1468         BUG_ON(insert_index < 0);
1469         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1470         BUG_ON(insert_index > next_free);
1471
1472         /*
1473          * No need to memmove if we're just adding to the tail.
1474          */
1475         if (insert_index != next_free) {
1476                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1477
1478                 num_bytes = next_free - insert_index;
1479                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1480                 memmove(&el->l_recs[insert_index + 1],
1481                         &el->l_recs[insert_index],
1482                         num_bytes);
1483         }
1484
1485         /*
1486          * Either we had an empty extent, and need to re-increment or
1487          * there was no empty extent on a non full rightmost leaf node,
1488          * in which case we still need to increment.
1489          */
1490         next_free++;
1491         el->l_next_free_rec = cpu_to_le16(next_free);
1492         /*
1493          * Make sure none of the math above just messed up our tree.
1494          */
1495         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1496
1497         el->l_recs[insert_index] = *insert_rec;
1498
1499 }
1500
1501 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1502 {
1503         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1504
1505         BUG_ON(num_recs == 0);
1506
1507         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1508                 num_recs--;
1509                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1510                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1511                 memset(&el->l_recs[num_recs], 0,
1512                        sizeof(struct ocfs2_extent_rec));
1513                 el->l_next_free_rec = cpu_to_le16(num_recs);
1514         }
1515 }
1516
1517 /*
1518  * Create an empty extent record .
1519  *
1520  * l_next_free_rec may be updated.
1521  *
1522  * If an empty extent already exists do nothing.
1523  */
1524 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1525 {
1526         int next_free = le16_to_cpu(el->l_next_free_rec);
1527
1528         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1529
1530         if (next_free == 0)
1531                 goto set_and_inc;
1532
1533         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1534                 return;
1535
1536         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1537                         "Asked to create an empty extent in a full list:\n"
1538                         "count = %u, tree depth = %u",
1539                         le16_to_cpu(el->l_count),
1540                         le16_to_cpu(el->l_tree_depth));
1541
1542         ocfs2_shift_records_right(el);
1543
1544 set_and_inc:
1545         le16_add_cpu(&el->l_next_free_rec, 1);
1546         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1547 }
1548
1549 /*
1550  * For a rotation which involves two leaf nodes, the "root node" is
1551  * the lowest level tree node which contains a path to both leafs. This
1552  * resulting set of information can be used to form a complete "subtree"
1553  *
1554  * This function is passed two full paths from the dinode down to a
1555  * pair of adjacent leaves. It's task is to figure out which path
1556  * index contains the subtree root - this can be the root index itself
1557  * in a worst-case rotation.
1558  *
1559  * The array index of the subtree root is passed back.
1560  */
1561 static int ocfs2_find_subtree_root(struct inode *inode,
1562                                    struct ocfs2_path *left,
1563                                    struct ocfs2_path *right)
1564 {
1565         int i = 0;
1566
1567         /*
1568          * Check that the caller passed in two paths from the same tree.
1569          */
1570         BUG_ON(path_root_bh(left) != path_root_bh(right));
1571
1572         do {
1573                 i++;
1574
1575                 /*
1576                  * The caller didn't pass two adjacent paths.
1577                  */
1578                 mlog_bug_on_msg(i > left->p_tree_depth,
1579                                 "Inode %lu, left depth %u, right depth %u\n"
1580                                 "left leaf blk %llu, right leaf blk %llu\n",
1581                                 inode->i_ino, left->p_tree_depth,
1582                                 right->p_tree_depth,
1583                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1584                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1585         } while (left->p_node[i].bh->b_blocknr ==
1586                  right->p_node[i].bh->b_blocknr);
1587
1588         return i - 1;
1589 }
1590
1591 typedef void (path_insert_t)(void *, struct buffer_head *);
1592
1593 /*
1594  * Traverse a btree path in search of cpos, starting at root_el.
1595  *
1596  * This code can be called with a cpos larger than the tree, in which
1597  * case it will return the rightmost path.
1598  */
1599 static int __ocfs2_find_path(struct inode *inode,
1600                              struct ocfs2_extent_list *root_el, u32 cpos,
1601                              path_insert_t *func, void *data)
1602 {
1603         int i, ret = 0;
1604         u32 range;
1605         u64 blkno;
1606         struct buffer_head *bh = NULL;
1607         struct ocfs2_extent_block *eb;
1608         struct ocfs2_extent_list *el;
1609         struct ocfs2_extent_rec *rec;
1610         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1611
1612         el = root_el;
1613         while (el->l_tree_depth) {
1614                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1615                         ocfs2_error(inode->i_sb,
1616                                     "Inode %llu has empty extent list at "
1617                                     "depth %u\n",
1618                                     (unsigned long long)oi->ip_blkno,
1619                                     le16_to_cpu(el->l_tree_depth));
1620                         ret = -EROFS;
1621                         goto out;
1622
1623                 }
1624
1625                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1626                         rec = &el->l_recs[i];
1627
1628                         /*
1629                          * In the case that cpos is off the allocation
1630                          * tree, this should just wind up returning the
1631                          * rightmost record.
1632                          */
1633                         range = le32_to_cpu(rec->e_cpos) +
1634                                 ocfs2_rec_clusters(el, rec);
1635                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1636                             break;
1637                 }
1638
1639                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1640                 if (blkno == 0) {
1641                         ocfs2_error(inode->i_sb,
1642                                     "Inode %llu has bad blkno in extent list "
1643                                     "at depth %u (index %d)\n",
1644                                     (unsigned long long)oi->ip_blkno,
1645                                     le16_to_cpu(el->l_tree_depth), i);
1646                         ret = -EROFS;
1647                         goto out;
1648                 }
1649
1650                 brelse(bh);
1651                 bh = NULL;
1652                 ret = ocfs2_read_extent_block(inode, blkno, &bh);
1653                 if (ret) {
1654                         mlog_errno(ret);
1655                         goto out;
1656                 }
1657
1658                 eb = (struct ocfs2_extent_block *) bh->b_data;
1659                 el = &eb->h_list;
1660
1661                 if (le16_to_cpu(el->l_next_free_rec) >
1662                     le16_to_cpu(el->l_count)) {
1663                         ocfs2_error(inode->i_sb,
1664                                     "Inode %llu has bad count in extent list "
1665                                     "at block %llu (next free=%u, count=%u)\n",
1666                                     (unsigned long long)oi->ip_blkno,
1667                                     (unsigned long long)bh->b_blocknr,
1668                                     le16_to_cpu(el->l_next_free_rec),
1669                                     le16_to_cpu(el->l_count));
1670                         ret = -EROFS;
1671                         goto out;
1672                 }
1673
1674                 if (func)
1675                         func(data, bh);
1676         }
1677
1678 out:
1679         /*
1680          * Catch any trailing bh that the loop didn't handle.
1681          */
1682         brelse(bh);
1683
1684         return ret;
1685 }
1686
1687 /*
1688  * Given an initialized path (that is, it has a valid root extent
1689  * list), this function will traverse the btree in search of the path
1690  * which would contain cpos.
1691  *
1692  * The path traveled is recorded in the path structure.
1693  *
1694  * Note that this will not do any comparisons on leaf node extent
1695  * records, so it will work fine in the case that we just added a tree
1696  * branch.
1697  */
1698 struct find_path_data {
1699         int index;
1700         struct ocfs2_path *path;
1701 };
1702 static void find_path_ins(void *data, struct buffer_head *bh)
1703 {
1704         struct find_path_data *fp = data;
1705
1706         get_bh(bh);
1707         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1708         fp->index++;
1709 }
1710 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1711                            u32 cpos)
1712 {
1713         struct find_path_data data;
1714
1715         data.index = 1;
1716         data.path = path;
1717         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1718                                  find_path_ins, &data);
1719 }
1720
1721 static void find_leaf_ins(void *data, struct buffer_head *bh)
1722 {
1723         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1724         struct ocfs2_extent_list *el = &eb->h_list;
1725         struct buffer_head **ret = data;
1726
1727         /* We want to retain only the leaf block. */
1728         if (le16_to_cpu(el->l_tree_depth) == 0) {
1729                 get_bh(bh);
1730                 *ret = bh;
1731         }
1732 }
1733 /*
1734  * Find the leaf block in the tree which would contain cpos. No
1735  * checking of the actual leaf is done.
1736  *
1737  * Some paths want to call this instead of allocating a path structure
1738  * and calling ocfs2_find_path().
1739  *
1740  * This function doesn't handle non btree extent lists.
1741  */
1742 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1743                     u32 cpos, struct buffer_head **leaf_bh)
1744 {
1745         int ret;
1746         struct buffer_head *bh = NULL;
1747
1748         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1749         if (ret) {
1750                 mlog_errno(ret);
1751                 goto out;
1752         }
1753
1754         *leaf_bh = bh;
1755 out:
1756         return ret;
1757 }
1758
1759 /*
1760  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1761  *
1762  * Basically, we've moved stuff around at the bottom of the tree and
1763  * we need to fix up the extent records above the changes to reflect
1764  * the new changes.
1765  *
1766  * left_rec: the record on the left.
1767  * left_child_el: is the child list pointed to by left_rec
1768  * right_rec: the record to the right of left_rec
1769  * right_child_el: is the child list pointed to by right_rec
1770  *
1771  * By definition, this only works on interior nodes.
1772  */
1773 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1774                                   struct ocfs2_extent_list *left_child_el,
1775                                   struct ocfs2_extent_rec *right_rec,
1776                                   struct ocfs2_extent_list *right_child_el)
1777 {
1778         u32 left_clusters, right_end;
1779
1780         /*
1781          * Interior nodes never have holes. Their cpos is the cpos of
1782          * the leftmost record in their child list. Their cluster
1783          * count covers the full theoretical range of their child list
1784          * - the range between their cpos and the cpos of the record
1785          * immediately to their right.
1786          */
1787         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1788         if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1789                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1790                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1791         }
1792         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1793         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1794
1795         /*
1796          * Calculate the rightmost cluster count boundary before
1797          * moving cpos - we will need to adjust clusters after
1798          * updating e_cpos to keep the same highest cluster count.
1799          */
1800         right_end = le32_to_cpu(right_rec->e_cpos);
1801         right_end += le32_to_cpu(right_rec->e_int_clusters);
1802
1803         right_rec->e_cpos = left_rec->e_cpos;
1804         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1805
1806         right_end -= le32_to_cpu(right_rec->e_cpos);
1807         right_rec->e_int_clusters = cpu_to_le32(right_end);
1808 }
1809
1810 /*
1811  * Adjust the adjacent root node records involved in a
1812  * rotation. left_el_blkno is passed in as a key so that we can easily
1813  * find it's index in the root list.
1814  */
1815 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1816                                       struct ocfs2_extent_list *left_el,
1817                                       struct ocfs2_extent_list *right_el,
1818                                       u64 left_el_blkno)
1819 {
1820         int i;
1821
1822         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1823                le16_to_cpu(left_el->l_tree_depth));
1824
1825         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1826                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1827                         break;
1828         }
1829
1830         /*
1831          * The path walking code should have never returned a root and
1832          * two paths which are not adjacent.
1833          */
1834         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1835
1836         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1837                                       &root_el->l_recs[i + 1], right_el);
1838 }
1839
1840 /*
1841  * We've changed a leaf block (in right_path) and need to reflect that
1842  * change back up the subtree.
1843  *
1844  * This happens in multiple places:
1845  *   - When we've moved an extent record from the left path leaf to the right
1846  *     path leaf to make room for an empty extent in the left path leaf.
1847  *   - When our insert into the right path leaf is at the leftmost edge
1848  *     and requires an update of the path immediately to it's left. This
1849  *     can occur at the end of some types of rotation and appending inserts.
1850  *   - When we've adjusted the last extent record in the left path leaf and the
1851  *     1st extent record in the right path leaf during cross extent block merge.
1852  */
1853 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1854                                        struct ocfs2_path *left_path,
1855                                        struct ocfs2_path *right_path,
1856                                        int subtree_index)
1857 {
1858         int ret, i, idx;
1859         struct ocfs2_extent_list *el, *left_el, *right_el;
1860         struct ocfs2_extent_rec *left_rec, *right_rec;
1861         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1862
1863         /*
1864          * Update the counts and position values within all the
1865          * interior nodes to reflect the leaf rotation we just did.
1866          *
1867          * The root node is handled below the loop.
1868          *
1869          * We begin the loop with right_el and left_el pointing to the
1870          * leaf lists and work our way up.
1871          *
1872          * NOTE: within this loop, left_el and right_el always refer
1873          * to the *child* lists.
1874          */
1875         left_el = path_leaf_el(left_path);
1876         right_el = path_leaf_el(right_path);
1877         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1878                 mlog(0, "Adjust records at index %u\n", i);
1879
1880                 /*
1881                  * One nice property of knowing that all of these
1882                  * nodes are below the root is that we only deal with
1883                  * the leftmost right node record and the rightmost
1884                  * left node record.
1885                  */
1886                 el = left_path->p_node[i].el;
1887                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1888                 left_rec = &el->l_recs[idx];
1889
1890                 el = right_path->p_node[i].el;
1891                 right_rec = &el->l_recs[0];
1892
1893                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1894                                               right_el);
1895
1896                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1897                 if (ret)
1898                         mlog_errno(ret);
1899
1900                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1901                 if (ret)
1902                         mlog_errno(ret);
1903
1904                 /*
1905                  * Setup our list pointers now so that the current
1906                  * parents become children in the next iteration.
1907                  */
1908                 left_el = left_path->p_node[i].el;
1909                 right_el = right_path->p_node[i].el;
1910         }
1911
1912         /*
1913          * At the root node, adjust the two adjacent records which
1914          * begin our path to the leaves.
1915          */
1916
1917         el = left_path->p_node[subtree_index].el;
1918         left_el = left_path->p_node[subtree_index + 1].el;
1919         right_el = right_path->p_node[subtree_index + 1].el;
1920
1921         ocfs2_adjust_root_records(el, left_el, right_el,
1922                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1923
1924         root_bh = left_path->p_node[subtree_index].bh;
1925
1926         ret = ocfs2_journal_dirty(handle, root_bh);
1927         if (ret)
1928                 mlog_errno(ret);
1929 }
1930
1931 static int ocfs2_rotate_subtree_right(struct inode *inode,
1932                                       handle_t *handle,
1933                                       struct ocfs2_path *left_path,
1934                                       struct ocfs2_path *right_path,
1935                                       int subtree_index)
1936 {
1937         int ret, i;
1938         struct buffer_head *right_leaf_bh;
1939         struct buffer_head *left_leaf_bh = NULL;
1940         struct buffer_head *root_bh;
1941         struct ocfs2_extent_list *right_el, *left_el;
1942         struct ocfs2_extent_rec move_rec;
1943
1944         left_leaf_bh = path_leaf_bh(left_path);
1945         left_el = path_leaf_el(left_path);
1946
1947         if (left_el->l_next_free_rec != left_el->l_count) {
1948                 ocfs2_error(inode->i_sb,
1949                             "Inode %llu has non-full interior leaf node %llu"
1950                             "(next free = %u)",
1951                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1952                             (unsigned long long)left_leaf_bh->b_blocknr,
1953                             le16_to_cpu(left_el->l_next_free_rec));
1954                 return -EROFS;
1955         }
1956
1957         /*
1958          * This extent block may already have an empty record, so we
1959          * return early if so.
1960          */
1961         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1962                 return 0;
1963
1964         root_bh = left_path->p_node[subtree_index].bh;
1965         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1966
1967         ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
1968                                            subtree_index);
1969         if (ret) {
1970                 mlog_errno(ret);
1971                 goto out;
1972         }
1973
1974         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1975                 ret = ocfs2_path_bh_journal_access(handle, inode,
1976                                                    right_path, i);
1977                 if (ret) {
1978                         mlog_errno(ret);
1979                         goto out;
1980                 }
1981
1982                 ret = ocfs2_path_bh_journal_access(handle, inode,
1983                                                    left_path, i);
1984                 if (ret) {
1985                         mlog_errno(ret);
1986                         goto out;
1987                 }
1988         }
1989
1990         right_leaf_bh = path_leaf_bh(right_path);
1991         right_el = path_leaf_el(right_path);
1992
1993         /* This is a code error, not a disk corruption. */
1994         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1995                         "because rightmost leaf block %llu is empty\n",
1996                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1997                         (unsigned long long)right_leaf_bh->b_blocknr);
1998
1999         ocfs2_create_empty_extent(right_el);
2000
2001         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
2002         if (ret) {
2003                 mlog_errno(ret);
2004                 goto out;
2005         }
2006
2007         /* Do the copy now. */
2008         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2009         move_rec = left_el->l_recs[i];
2010         right_el->l_recs[0] = move_rec;
2011
2012         /*
2013          * Clear out the record we just copied and shift everything
2014          * over, leaving an empty extent in the left leaf.
2015          *
2016          * We temporarily subtract from next_free_rec so that the
2017          * shift will lose the tail record (which is now defunct).
2018          */
2019         le16_add_cpu(&left_el->l_next_free_rec, -1);
2020         ocfs2_shift_records_right(left_el);
2021         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2022         le16_add_cpu(&left_el->l_next_free_rec, 1);
2023
2024         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
2025         if (ret) {
2026                 mlog_errno(ret);
2027                 goto out;
2028         }
2029
2030         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2031                                 subtree_index);
2032
2033 out:
2034         return ret;
2035 }
2036
2037 /*
2038  * Given a full path, determine what cpos value would return us a path
2039  * containing the leaf immediately to the left of the current one.
2040  *
2041  * Will return zero if the path passed in is already the leftmost path.
2042  */
2043 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2044                                          struct ocfs2_path *path, u32 *cpos)
2045 {
2046         int i, j, ret = 0;
2047         u64 blkno;
2048         struct ocfs2_extent_list *el;
2049
2050         BUG_ON(path->p_tree_depth == 0);
2051
2052         *cpos = 0;
2053
2054         blkno = path_leaf_bh(path)->b_blocknr;
2055
2056         /* Start at the tree node just above the leaf and work our way up. */
2057         i = path->p_tree_depth - 1;
2058         while (i >= 0) {
2059                 el = path->p_node[i].el;
2060
2061                 /*
2062                  * Find the extent record just before the one in our
2063                  * path.
2064                  */
2065                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2066                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2067                                 if (j == 0) {
2068                                         if (i == 0) {
2069                                                 /*
2070                                                  * We've determined that the
2071                                                  * path specified is already
2072                                                  * the leftmost one - return a
2073                                                  * cpos of zero.
2074                                                  */
2075                                                 goto out;
2076                                         }
2077                                         /*
2078                                          * The leftmost record points to our
2079                                          * leaf - we need to travel up the
2080                                          * tree one level.
2081                                          */
2082                                         goto next_node;
2083                                 }
2084
2085                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2086                                 *cpos = *cpos + ocfs2_rec_clusters(el,
2087                                                            &el->l_recs[j - 1]);
2088                                 *cpos = *cpos - 1;
2089                                 goto out;
2090                         }
2091                 }
2092
2093                 /*
2094                  * If we got here, we never found a valid node where
2095                  * the tree indicated one should be.
2096                  */
2097                 ocfs2_error(sb,
2098                             "Invalid extent tree at extent block %llu\n",
2099                             (unsigned long long)blkno);
2100                 ret = -EROFS;
2101                 goto out;
2102
2103 next_node:
2104                 blkno = path->p_node[i].bh->b_blocknr;
2105                 i--;
2106         }
2107
2108 out:
2109         return ret;
2110 }
2111
2112 /*
2113  * Extend the transaction by enough credits to complete the rotation,
2114  * and still leave at least the original number of credits allocated
2115  * to this transaction.
2116  */
2117 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2118                                            int op_credits,
2119                                            struct ocfs2_path *path)
2120 {
2121         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2122
2123         if (handle->h_buffer_credits < credits)
2124                 return ocfs2_extend_trans(handle, credits);
2125
2126         return 0;
2127 }
2128
2129 /*
2130  * Trap the case where we're inserting into the theoretical range past
2131  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2132  * whose cpos is less than ours into the right leaf.
2133  *
2134  * It's only necessary to look at the rightmost record of the left
2135  * leaf because the logic that calls us should ensure that the
2136  * theoretical ranges in the path components above the leaves are
2137  * correct.
2138  */
2139 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2140                                                  u32 insert_cpos)
2141 {
2142         struct ocfs2_extent_list *left_el;
2143         struct ocfs2_extent_rec *rec;
2144         int next_free;
2145
2146         left_el = path_leaf_el(left_path);
2147         next_free = le16_to_cpu(left_el->l_next_free_rec);
2148         rec = &left_el->l_recs[next_free - 1];
2149
2150         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2151                 return 1;
2152         return 0;
2153 }
2154
2155 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2156 {
2157         int next_free = le16_to_cpu(el->l_next_free_rec);
2158         unsigned int range;
2159         struct ocfs2_extent_rec *rec;
2160
2161         if (next_free == 0)
2162                 return 0;
2163
2164         rec = &el->l_recs[0];
2165         if (ocfs2_is_empty_extent(rec)) {
2166                 /* Empty list. */
2167                 if (next_free == 1)
2168                         return 0;
2169                 rec = &el->l_recs[1];
2170         }
2171
2172         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2173         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2174                 return 1;
2175         return 0;
2176 }
2177
2178 /*
2179  * Rotate all the records in a btree right one record, starting at insert_cpos.
2180  *
2181  * The path to the rightmost leaf should be passed in.
2182  *
2183  * The array is assumed to be large enough to hold an entire path (tree depth).
2184  *
2185  * Upon succesful return from this function:
2186  *
2187  * - The 'right_path' array will contain a path to the leaf block
2188  *   whose range contains e_cpos.
2189  * - That leaf block will have a single empty extent in list index 0.
2190  * - In the case that the rotation requires a post-insert update,
2191  *   *ret_left_path will contain a valid path which can be passed to
2192  *   ocfs2_insert_path().
2193  */
2194 static int ocfs2_rotate_tree_right(struct inode *inode,
2195                                    handle_t *handle,
2196                                    enum ocfs2_split_type split,
2197                                    u32 insert_cpos,
2198                                    struct ocfs2_path *right_path,
2199                                    struct ocfs2_path **ret_left_path)
2200 {
2201         int ret, start, orig_credits = handle->h_buffer_credits;
2202         u32 cpos;
2203         struct ocfs2_path *left_path = NULL;
2204
2205         *ret_left_path = NULL;
2206
2207         left_path = ocfs2_new_path_from_path(right_path);
2208         if (!left_path) {
2209                 ret = -ENOMEM;
2210                 mlog_errno(ret);
2211                 goto out;
2212         }
2213
2214         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2215         if (ret) {
2216                 mlog_errno(ret);
2217                 goto out;
2218         }
2219
2220         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2221
2222         /*
2223          * What we want to do here is:
2224          *
2225          * 1) Start with the rightmost path.
2226          *
2227          * 2) Determine a path to the leaf block directly to the left
2228          *    of that leaf.
2229          *
2230          * 3) Determine the 'subtree root' - the lowest level tree node
2231          *    which contains a path to both leaves.
2232          *
2233          * 4) Rotate the subtree.
2234          *
2235          * 5) Find the next subtree by considering the left path to be
2236          *    the new right path.
2237          *
2238          * The check at the top of this while loop also accepts
2239          * insert_cpos == cpos because cpos is only a _theoretical_
2240          * value to get us the left path - insert_cpos might very well
2241          * be filling that hole.
2242          *
2243          * Stop at a cpos of '0' because we either started at the
2244          * leftmost branch (i.e., a tree with one branch and a
2245          * rotation inside of it), or we've gone as far as we can in
2246          * rotating subtrees.
2247          */
2248         while (cpos && insert_cpos <= cpos) {
2249                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2250                      insert_cpos, cpos);
2251
2252                 ret = ocfs2_find_path(inode, left_path, cpos);
2253                 if (ret) {
2254                         mlog_errno(ret);
2255                         goto out;
2256                 }
2257
2258                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2259                                 path_leaf_bh(right_path),
2260                                 "Inode %lu: error during insert of %u "
2261                                 "(left path cpos %u) results in two identical "
2262                                 "paths ending at %llu\n",
2263                                 inode->i_ino, insert_cpos, cpos,
2264                                 (unsigned long long)
2265                                 path_leaf_bh(left_path)->b_blocknr);
2266
2267                 if (split == SPLIT_NONE &&
2268                     ocfs2_rotate_requires_path_adjustment(left_path,
2269                                                           insert_cpos)) {
2270
2271                         /*
2272                          * We've rotated the tree as much as we
2273                          * should. The rest is up to
2274                          * ocfs2_insert_path() to complete, after the
2275                          * record insertion. We indicate this
2276                          * situation by returning the left path.
2277                          *
2278                          * The reason we don't adjust the records here
2279                          * before the record insert is that an error
2280                          * later might break the rule where a parent
2281                          * record e_cpos will reflect the actual
2282                          * e_cpos of the 1st nonempty record of the
2283                          * child list.
2284                          */
2285                         *ret_left_path = left_path;
2286                         goto out_ret_path;
2287                 }
2288
2289                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
2290
2291                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2292                      start,
2293                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2294                      right_path->p_tree_depth);
2295
2296                 ret = ocfs2_extend_rotate_transaction(handle, start,
2297                                                       orig_credits, right_path);
2298                 if (ret) {
2299                         mlog_errno(ret);
2300                         goto out;
2301                 }
2302
2303                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2304                                                  right_path, start);
2305                 if (ret) {
2306                         mlog_errno(ret);
2307                         goto out;
2308                 }
2309
2310                 if (split != SPLIT_NONE &&
2311                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2312                                                 insert_cpos)) {
2313                         /*
2314                          * A rotate moves the rightmost left leaf
2315                          * record over to the leftmost right leaf
2316                          * slot. If we're doing an extent split
2317                          * instead of a real insert, then we have to
2318                          * check that the extent to be split wasn't
2319                          * just moved over. If it was, then we can
2320                          * exit here, passing left_path back -
2321                          * ocfs2_split_extent() is smart enough to
2322                          * search both leaves.
2323                          */
2324                         *ret_left_path = left_path;
2325                         goto out_ret_path;
2326                 }
2327
2328                 /*
2329                  * There is no need to re-read the next right path
2330                  * as we know that it'll be our current left
2331                  * path. Optimize by copying values instead.
2332                  */
2333                 ocfs2_mv_path(right_path, left_path);
2334
2335                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2336                                                     &cpos);
2337                 if (ret) {
2338                         mlog_errno(ret);
2339                         goto out;
2340                 }
2341         }
2342
2343 out:
2344         ocfs2_free_path(left_path);
2345
2346 out_ret_path:
2347         return ret;
2348 }
2349
2350 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2351                                       struct ocfs2_path *path)
2352 {
2353         int i, idx;
2354         struct ocfs2_extent_rec *rec;
2355         struct ocfs2_extent_list *el;
2356         struct ocfs2_extent_block *eb;
2357         u32 range;
2358
2359         /* Path should always be rightmost. */
2360         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2361         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2362
2363         el = &eb->h_list;
2364         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2365         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2366         rec = &el->l_recs[idx];
2367         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2368
2369         for (i = 0; i < path->p_tree_depth; i++) {
2370                 el = path->p_node[i].el;
2371                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2372                 rec = &el->l_recs[idx];
2373
2374                 rec->e_int_clusters = cpu_to_le32(range);
2375                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2376
2377                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2378         }
2379 }
2380
2381 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2382                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2383                               struct ocfs2_path *path, int unlink_start)
2384 {
2385         int ret, i;
2386         struct ocfs2_extent_block *eb;
2387         struct ocfs2_extent_list *el;
2388         struct buffer_head *bh;
2389
2390         for(i = unlink_start; i < path_num_items(path); i++) {
2391                 bh = path->p_node[i].bh;
2392
2393                 eb = (struct ocfs2_extent_block *)bh->b_data;
2394                 /*
2395                  * Not all nodes might have had their final count
2396                  * decremented by the caller - handle this here.
2397                  */
2398                 el = &eb->h_list;
2399                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2400                         mlog(ML_ERROR,
2401                              "Inode %llu, attempted to remove extent block "
2402                              "%llu with %u records\n",
2403                              (unsigned long long)OCFS2_I(inode)->ip_blkno,
2404                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2405                              le16_to_cpu(el->l_next_free_rec));
2406
2407                         ocfs2_journal_dirty(handle, bh);
2408                         ocfs2_remove_from_cache(inode, bh);
2409                         continue;
2410                 }
2411
2412                 el->l_next_free_rec = 0;
2413                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2414
2415                 ocfs2_journal_dirty(handle, bh);
2416
2417                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2418                 if (ret)
2419                         mlog_errno(ret);
2420
2421                 ocfs2_remove_from_cache(inode, bh);
2422         }
2423 }
2424
2425 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2426                                  struct ocfs2_path *left_path,
2427                                  struct ocfs2_path *right_path,
2428                                  int subtree_index,
2429                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2430 {
2431         int i;
2432         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2433         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2434         struct ocfs2_extent_list *el;
2435         struct ocfs2_extent_block *eb;
2436
2437         el = path_leaf_el(left_path);
2438
2439         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2440
2441         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2442                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2443                         break;
2444
2445         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2446
2447         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2448         le16_add_cpu(&root_el->l_next_free_rec, -1);
2449
2450         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2451         eb->h_next_leaf_blk = 0;
2452
2453         ocfs2_journal_dirty(handle, root_bh);
2454         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2455
2456         ocfs2_unlink_path(inode, handle, dealloc, right_path,
2457                           subtree_index + 1);
2458 }
2459
2460 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2461                                      struct ocfs2_path *left_path,
2462                                      struct ocfs2_path *right_path,
2463                                      int subtree_index,
2464                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2465                                      int *deleted,
2466                                      struct ocfs2_extent_tree *et)
2467 {
2468         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2469         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2470         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2471         struct ocfs2_extent_block *eb;
2472
2473         *deleted = 0;
2474
2475         right_leaf_el = path_leaf_el(right_path);
2476         left_leaf_el = path_leaf_el(left_path);
2477         root_bh = left_path->p_node[subtree_index].bh;
2478         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2479
2480         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2481                 return 0;
2482
2483         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2484         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2485                 /*
2486                  * It's legal for us to proceed if the right leaf is
2487                  * the rightmost one and it has an empty extent. There
2488                  * are two cases to handle - whether the leaf will be
2489                  * empty after removal or not. If the leaf isn't empty
2490                  * then just remove the empty extent up front. The
2491                  * next block will handle empty leaves by flagging
2492                  * them for unlink.
2493                  *
2494                  * Non rightmost leaves will throw -EAGAIN and the
2495                  * caller can manually move the subtree and retry.
2496                  */
2497
2498                 if (eb->h_next_leaf_blk != 0ULL)
2499                         return -EAGAIN;
2500
2501                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2502                         ret = ocfs2_journal_access_eb(handle, inode,
2503                                                       path_leaf_bh(right_path),
2504                                                       OCFS2_JOURNAL_ACCESS_WRITE);
2505                         if (ret) {
2506                                 mlog_errno(ret);
2507                                 goto out;
2508                         }
2509
2510                         ocfs2_remove_empty_extent(right_leaf_el);
2511                 } else
2512                         right_has_empty = 1;
2513         }
2514
2515         if (eb->h_next_leaf_blk == 0ULL &&
2516             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2517                 /*
2518                  * We have to update i_last_eb_blk during the meta
2519                  * data delete.
2520                  */
2521                 ret = ocfs2_et_root_journal_access(handle, inode, et,
2522                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2523                 if (ret) {
2524                         mlog_errno(ret);
2525                         goto out;
2526                 }
2527
2528                 del_right_subtree = 1;
2529         }
2530
2531         /*
2532          * Getting here with an empty extent in the right path implies
2533          * that it's the rightmost path and will be deleted.
2534          */
2535         BUG_ON(right_has_empty && !del_right_subtree);
2536
2537         ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
2538                                            subtree_index);
2539         if (ret) {
2540                 mlog_errno(ret);
2541                 goto out;
2542         }
2543
2544         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2545                 ret = ocfs2_path_bh_journal_access(handle, inode,
2546                                                    right_path, i);
2547                 if (ret) {
2548                         mlog_errno(ret);
2549                         goto out;
2550                 }
2551
2552                 ret = ocfs2_path_bh_journal_access(handle, inode,
2553                                                    left_path, i);
2554                 if (ret) {
2555                         mlog_errno(ret);
2556                         goto out;
2557                 }
2558         }
2559
2560         if (!right_has_empty) {
2561                 /*
2562                  * Only do this if we're moving a real
2563                  * record. Otherwise, the action is delayed until
2564                  * after removal of the right path in which case we
2565                  * can do a simple shift to remove the empty extent.
2566                  */
2567                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2568                 memset(&right_leaf_el->l_recs[0], 0,
2569                        sizeof(struct ocfs2_extent_rec));
2570         }
2571         if (eb->h_next_leaf_blk == 0ULL) {
2572                 /*
2573                  * Move recs over to get rid of empty extent, decrease
2574                  * next_free. This is allowed to remove the last
2575                  * extent in our leaf (setting l_next_free_rec to
2576                  * zero) - the delete code below won't care.
2577                  */
2578                 ocfs2_remove_empty_extent(right_leaf_el);
2579         }
2580
2581         ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2582         if (ret)
2583                 mlog_errno(ret);
2584         ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2585         if (ret)
2586                 mlog_errno(ret);
2587
2588         if (del_right_subtree) {
2589                 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2590                                      subtree_index, dealloc);
2591                 ocfs2_update_edge_lengths(inode, handle, left_path);
2592
2593                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2594                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2595
2596                 /*
2597                  * Removal of the extent in the left leaf was skipped
2598                  * above so we could delete the right path
2599                  * 1st.
2600                  */
2601                 if (right_has_empty)
2602                         ocfs2_remove_empty_extent(left_leaf_el);
2603
2604                 ret = ocfs2_journal_dirty(handle, et_root_bh);
2605                 if (ret)
2606                         mlog_errno(ret);
2607
2608                 *deleted = 1;
2609         } else
2610                 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2611                                            subtree_index);
2612
2613 out:
2614         return ret;
2615 }
2616
2617 /*
2618  * Given a full path, determine what cpos value would return us a path
2619  * containing the leaf immediately to the right of the current one.
2620  *
2621  * Will return zero if the path passed in is already the rightmost path.
2622  *
2623  * This looks similar, but is subtly different to
2624  * ocfs2_find_cpos_for_left_leaf().
2625  */
2626 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2627                                           struct ocfs2_path *path, u32 *cpos)
2628 {
2629         int i, j, ret = 0;
2630         u64 blkno;
2631         struct ocfs2_extent_list *el;
2632
2633         *cpos = 0;
2634
2635         if (path->p_tree_depth == 0)
2636                 return 0;
2637
2638         blkno = path_leaf_bh(path)->b_blocknr;
2639
2640         /* Start at the tree node just above the leaf and work our way up. */
2641         i = path->p_tree_depth - 1;
2642         while (i >= 0) {
2643                 int next_free;
2644
2645                 el = path->p_node[i].el;
2646
2647                 /*
2648                  * Find the extent record just after the one in our
2649                  * path.
2650                  */
2651                 next_free = le16_to_cpu(el->l_next_free_rec);
2652                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2653                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2654                                 if (j == (next_free - 1)) {
2655                                         if (i == 0) {
2656                                                 /*
2657                                                  * We've determined that the
2658                                                  * path specified is already
2659                                                  * the rightmost one - return a
2660                                                  * cpos of zero.
2661                                                  */
2662                                                 goto out;
2663                                         }
2664                                         /*
2665                                          * The rightmost record points to our
2666                                          * leaf - we need to travel up the
2667                                          * tree one level.
2668                                          */
2669                                         goto next_node;
2670                                 }
2671
2672                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2673                                 goto out;
2674                         }
2675                 }
2676
2677                 /*
2678                  * If we got here, we never found a valid node where
2679                  * the tree indicated one should be.
2680                  */
2681                 ocfs2_error(sb,
2682                             "Invalid extent tree at extent block %llu\n",
2683                             (unsigned long long)blkno);
2684                 ret = -EROFS;
2685                 goto out;
2686
2687 next_node:
2688                 blkno = path->p_node[i].bh->b_blocknr;
2689                 i--;
2690         }
2691
2692 out:
2693         return ret;
2694 }
2695
2696 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2697                                             handle_t *handle,
2698                                             struct ocfs2_path *path)
2699 {
2700         int ret;
2701         struct buffer_head *bh = path_leaf_bh(path);
2702         struct ocfs2_extent_list *el = path_leaf_el(path);
2703
2704         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2705                 return 0;
2706
2707         ret = ocfs2_path_bh_journal_access(handle, inode, path,
2708                                            path_num_items(path) - 1);
2709         if (ret) {
2710                 mlog_errno(ret);
2711                 goto out;
2712         }
2713
2714         ocfs2_remove_empty_extent(el);
2715
2716         ret = ocfs2_journal_dirty(handle, bh);
2717         if (ret)
2718                 mlog_errno(ret);
2719
2720 out:
2721         return ret;
2722 }
2723
2724 static int __ocfs2_rotate_tree_left(struct inode *inode,
2725                                     handle_t *handle, int orig_credits,
2726                                     struct ocfs2_path *path,
2727                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2728                                     struct ocfs2_path **empty_extent_path,
2729                                     struct ocfs2_extent_tree *et)
2730 {
2731         int ret, subtree_root, deleted;
2732         u32 right_cpos;
2733         struct ocfs2_path *left_path = NULL;
2734         struct ocfs2_path *right_path = NULL;
2735
2736         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2737
2738         *empty_extent_path = NULL;
2739
2740         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2741                                              &right_cpos);
2742         if (ret) {
2743                 mlog_errno(ret);
2744                 goto out;
2745         }
2746
2747         left_path = ocfs2_new_path_from_path(path);
2748         if (!left_path) {
2749                 ret = -ENOMEM;
2750                 mlog_errno(ret);
2751                 goto out;
2752         }
2753
2754         ocfs2_cp_path(left_path, path);
2755
2756         right_path = ocfs2_new_path_from_path(path);
2757         if (!right_path) {
2758                 ret = -ENOMEM;
2759                 mlog_errno(ret);
2760                 goto out;
2761         }
2762
2763         while (right_cpos) {
2764                 ret = ocfs2_find_path(inode, right_path, right_cpos);
2765                 if (ret) {
2766                         mlog_errno(ret);
2767                         goto out;
2768                 }
2769
2770                 subtree_root = ocfs2_find_subtree_root(inode, left_path,
2771                                                        right_path);
2772
2773                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2774                      subtree_root,
2775                      (unsigned long long)
2776                      right_path->p_node[subtree_root].bh->b_blocknr,
2777                      right_path->p_tree_depth);
2778
2779                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2780                                                       orig_credits, left_path);
2781                 if (ret) {
2782                         mlog_errno(ret);
2783                         goto out;
2784                 }
2785
2786                 /*
2787                  * Caller might still want to make changes to the
2788                  * tree root, so re-add it to the journal here.
2789                  */
2790                 ret = ocfs2_path_bh_journal_access(handle, inode,
2791                                                    left_path, 0);
2792                 if (ret) {
2793                         mlog_errno(ret);
2794                         goto out;
2795                 }
2796
2797                 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2798                                                 right_path, subtree_root,
2799                                                 dealloc, &deleted, et);
2800                 if (ret == -EAGAIN) {
2801                         /*
2802                          * The rotation has to temporarily stop due to
2803                          * the right subtree having an empty
2804                          * extent. Pass it back to the caller for a
2805                          * fixup.
2806                          */
2807                         *empty_extent_path = right_path;
2808                         right_path = NULL;
2809                         goto out;
2810                 }
2811                 if (ret) {
2812                         mlog_errno(ret);
2813                         goto out;
2814                 }
2815
2816                 /*
2817                  * The subtree rotate might have removed records on
2818                  * the rightmost edge. If so, then rotation is
2819                  * complete.
2820                  */
2821                 if (deleted)
2822                         break;
2823
2824                 ocfs2_mv_path(left_path, right_path);
2825
2826                 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2827                                                      &right_cpos);
2828                 if (ret) {
2829                         mlog_errno(ret);
2830                         goto out;
2831                 }
2832         }
2833
2834 out:
2835         ocfs2_free_path(right_path);
2836         ocfs2_free_path(left_path);
2837
2838         return ret;
2839 }
2840
2841 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2842                                 struct ocfs2_path *path,
2843                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
2844                                 struct ocfs2_extent_tree *et)
2845 {
2846         int ret, subtree_index;
2847         u32 cpos;
2848         struct ocfs2_path *left_path = NULL;
2849         struct ocfs2_extent_block *eb;
2850         struct ocfs2_extent_list *el;
2851
2852
2853         ret = ocfs2_et_sanity_check(inode, et);
2854         if (ret)
2855                 goto out;
2856         /*
2857          * There's two ways we handle this depending on
2858          * whether path is the only existing one.
2859          */
2860         ret = ocfs2_extend_rotate_transaction(handle, 0,
2861                                               handle->h_buffer_credits,
2862                                               path);
2863         if (ret) {
2864                 mlog_errno(ret);
2865                 goto out;
2866         }
2867
2868         ret = ocfs2_journal_access_path(inode, handle, path);
2869         if (ret) {
2870                 mlog_errno(ret);
2871                 goto out;
2872         }
2873
2874         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2875         if (ret) {
2876                 mlog_errno(ret);
2877                 goto out;
2878         }
2879
2880         if (cpos) {
2881                 /*
2882                  * We have a path to the left of this one - it needs
2883                  * an update too.
2884                  */
2885                 left_path = ocfs2_new_path_from_path(path);
2886                 if (!left_path) {
2887                         ret = -ENOMEM;
2888                         mlog_errno(ret);
2889                         goto out;
2890                 }
2891
2892                 ret = ocfs2_find_path(inode, left_path, cpos);
2893                 if (ret) {
2894                         mlog_errno(ret);
2895                         goto out;
2896                 }
2897
2898                 ret = ocfs2_journal_access_path(inode, handle, left_path);
2899                 if (ret) {
2900                         mlog_errno(ret);
2901                         goto out;
2902                 }
2903
2904                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2905
2906                 ocfs2_unlink_subtree(inode, handle, left_path, path,
2907                                      subtree_index, dealloc);
2908                 ocfs2_update_edge_lengths(inode, handle, left_path);
2909
2910                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2911                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2912         } else {
2913                 /*
2914                  * 'path' is also the leftmost path which
2915                  * means it must be the only one. This gets
2916                  * handled differently because we want to
2917                  * revert the inode back to having extents
2918                  * in-line.
2919                  */
2920                 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2921
2922                 el = et->et_root_el;
2923                 el->l_tree_depth = 0;
2924                 el->l_next_free_rec = 0;
2925                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2926
2927                 ocfs2_et_set_last_eb_blk(et, 0);
2928         }
2929
2930         ocfs2_journal_dirty(handle, path_root_bh(path));
2931
2932 out:
2933         ocfs2_free_path(left_path);
2934         return ret;
2935 }
2936
2937 /*
2938  * Left rotation of btree records.
2939  *
2940  * In many ways, this is (unsurprisingly) the opposite of right
2941  * rotation. We start at some non-rightmost path containing an empty
2942  * extent in the leaf block. The code works its way to the rightmost
2943  * path by rotating records to the left in every subtree.
2944  *
2945  * This is used by any code which reduces the number of extent records
2946  * in a leaf. After removal, an empty record should be placed in the
2947  * leftmost list position.
2948  *
2949  * This won't handle a length update of the rightmost path records if
2950  * the rightmost tree leaf record is removed so the caller is
2951  * responsible for detecting and correcting that.
2952  */
2953 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2954                                   struct ocfs2_path *path,
2955                                   struct ocfs2_cached_dealloc_ctxt *dealloc,
2956                                   struct ocfs2_extent_tree *et)
2957 {
2958         int ret, orig_credits = handle->h_buffer_credits;
2959         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2960         struct ocfs2_extent_block *eb;
2961         struct ocfs2_extent_list *el;
2962
2963         el = path_leaf_el(path);
2964         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2965                 return 0;
2966
2967         if (path->p_tree_depth == 0) {
2968 rightmost_no_delete:
2969                 /*
2970                  * Inline extents. This is trivially handled, so do
2971                  * it up front.
2972                  */
2973                 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2974                                                        path);
2975                 if (ret)
2976                         mlog_errno(ret);
2977                 goto out;
2978         }
2979
2980         /*
2981          * Handle rightmost branch now. There's several cases:
2982          *  1) simple rotation leaving records in there. That's trivial.
2983          *  2) rotation requiring a branch delete - there's no more
2984          *     records left. Two cases of this:
2985          *     a) There are branches to the left.
2986          *     b) This is also the leftmost (the only) branch.
2987          *
2988          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2989          *  2a) we need the left branch so that we can update it with the unlink
2990          *  2b) we need to bring the inode back to inline extents.
2991          */
2992
2993         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2994         el = &eb->h_list;
2995         if (eb->h_next_leaf_blk == 0) {
2996                 /*
2997                  * This gets a bit tricky if we're going to delete the
2998                  * rightmost path. Get the other cases out of the way
2999                  * 1st.
3000                  */
3001                 if (le16_to_cpu(el->l_next_free_rec) > 1)
3002                         goto rightmost_no_delete;
3003
3004                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3005                         ret = -EIO;
3006                         ocfs2_error(inode->i_sb,
3007                                     "Inode %llu has empty extent block at %llu",
3008                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
3009                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
3010                         goto out;
3011                 }
3012
3013                 /*
3014                  * XXX: The caller can not trust "path" any more after
3015                  * this as it will have been deleted. What do we do?
3016                  *
3017                  * In theory the rotate-for-merge code will never get
3018                  * here because it'll always ask for a rotate in a
3019                  * nonempty list.
3020                  */
3021
3022                 ret = ocfs2_remove_rightmost_path(inode, handle, path,
3023                                                   dealloc, et);
3024                 if (ret)
3025                         mlog_errno(ret);
3026                 goto out;
3027         }
3028
3029         /*
3030          * Now we can loop, remembering the path we get from -EAGAIN
3031          * and restarting from there.
3032          */
3033 try_rotate:
3034         ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
3035                                        dealloc, &restart_path, et);
3036         if (ret && ret != -EAGAIN) {
3037                 mlog_errno(ret);
3038                 goto out;
3039         }
3040
3041         while (ret == -EAGAIN) {
3042                 tmp_path = restart_path;
3043                 restart_path = NULL;
3044
3045                 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
3046                                                tmp_path, dealloc,
3047                                                &restart_path, et);
3048                 if (ret && ret != -EAGAIN) {
3049                         mlog_errno(ret);
3050                         goto out;
3051                 }
3052
3053                 ocfs2_free_path(tmp_path);
3054                 tmp_path = NULL;
3055
3056                 if (ret == 0)
3057                         goto try_rotate;
3058         }
3059
3060 out:
3061         ocfs2_free_path(tmp_path);
3062         ocfs2_free_path(restart_path);
3063         return ret;
3064 }
3065
3066 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3067                                 int index)
3068 {
3069         struct ocfs2_extent_rec *rec = &el->l_recs[index];
3070         unsigned int size;
3071
3072         if (rec->e_leaf_clusters == 0) {
3073                 /*
3074                  * We consumed all of the merged-from record. An empty
3075                  * extent cannot exist anywhere but the 1st array
3076                  * position, so move things over if the merged-from
3077                  * record doesn't occupy that position.
3078                  *
3079                  * This creates a new empty extent so the caller
3080                  * should be smart enough to have removed any existing
3081                  * ones.
3082                  */
3083                 if (index > 0) {
3084                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3085                         size = index * sizeof(struct ocfs2_extent_rec);
3086                         memmove(&el->l_recs[1], &el->l_recs[0], size);
3087                 }
3088
3089                 /*
3090                  * Always memset - the caller doesn't check whether it
3091                  * created an empty extent, so there could be junk in
3092                  * the other fields.
3093                  */
3094                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3095         }
3096 }
3097
3098 static int ocfs2_get_right_path(struct inode *inode,
3099                                 struct ocfs2_path *left_path,
3100                                 struct ocfs2_path **ret_right_path)
3101 {
3102         int ret;
3103         u32 right_cpos;
3104         struct ocfs2_path *right_path = NULL;
3105         struct ocfs2_extent_list *left_el;
3106
3107         *ret_right_path = NULL;
3108
3109         /* This function shouldn't be called for non-trees. */
3110         BUG_ON(left_path->p_tree_depth == 0);
3111
3112         left_el = path_leaf_el(left_path);
3113         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3114
3115         ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3116                                              &right_cpos);
3117         if (ret) {
3118                 mlog_errno(ret);
3119                 goto out;
3120         }
3121
3122         /* This function shouldn't be called for the rightmost leaf. */
3123         BUG_ON(right_cpos == 0);
3124
3125         right_path = ocfs2_new_path_from_path(left_path);
3126         if (!right_path) {
3127                 ret = -ENOMEM;
3128                 mlog_errno(ret);
3129                 goto out;
3130         }
3131
3132         ret = ocfs2_find_path(inode, right_path, right_cpos);
3133         if (ret) {
3134                 mlog_errno(ret);
3135                 goto out;
3136         }
3137
3138         *ret_right_path = right_path;
3139 out:
3140         if (ret)
3141                 ocfs2_free_path(right_path);
3142         return ret;
3143 }
3144
3145 /*
3146  * Remove split_rec clusters from the record at index and merge them
3147  * onto the beginning of the record "next" to it.
3148  * For index < l_count - 1, the next means the extent rec at index + 1.
3149  * For index == l_count - 1, the "next" means the 1st extent rec of the
3150  * next extent block.
3151  */
3152 static int ocfs2_merge_rec_right(struct inode *inode,
3153                                  struct ocfs2_path *left_path,
3154                                  handle_t *handle,
3155                                  struct ocfs2_extent_rec *split_rec,
3156                                  int index)
3157 {
3158         int ret, next_free, i;
3159         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3160         struct ocfs2_extent_rec *left_rec;
3161         struct ocfs2_extent_rec *right_rec;
3162         struct ocfs2_extent_list *right_el;
3163         struct ocfs2_path *right_path = NULL;
3164         int subtree_index = 0;
3165         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3166         struct buffer_head *bh = path_leaf_bh(left_path);
3167         struct buffer_head *root_bh = NULL;
3168
3169         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3170         left_rec = &el->l_recs[index];
3171
3172         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3173             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3174                 /* we meet with a cross extent block merge. */
3175                 ret = ocfs2_get_right_path(inode, left_path, &right_path);
3176                 if (ret) {
3177                         mlog_errno(ret);
3178                         goto out;
3179                 }
3180
3181                 right_el = path_leaf_el(right_path);
3182                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3183                 BUG_ON(next_free <= 0);
3184                 right_rec = &right_el->l_recs[0];
3185                 if (ocfs2_is_empty_extent(right_rec)) {
3186                         BUG_ON(next_free <= 1);
3187                         right_rec = &right_el->l_recs[1];
3188                 }
3189
3190                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3191                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3192                        le32_to_cpu(right_rec->e_cpos));
3193
3194                 subtree_index = ocfs2_find_subtree_root(inode,
3195                                                         left_path, right_path);
3196
3197                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3198                                                       handle->h_buffer_credits,
3199                                                       right_path);
3200                 if (ret) {
3201                         mlog_errno(ret);
3202                         goto out;
3203                 }
3204
3205                 root_bh = left_path->p_node[subtree_index].bh;
3206                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3207
3208                 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3209                                                    subtree_index);
3210                 if (ret) {
3211                         mlog_errno(ret);
3212                         goto out;
3213                 }
3214
3215                 for (i = subtree_index + 1;
3216                      i < path_num_items(right_path); i++) {
3217                         ret = ocfs2_path_bh_journal_access(handle, inode,
3218                                                            right_path, i);
3219                         if (ret) {
3220                                 mlog_errno(ret);
3221                                 goto out;
3222                         }
3223
3224                         ret = ocfs2_path_bh_journal_access(handle, inode,
3225                                                            left_path, i);
3226                         if (ret) {
3227                                 mlog_errno(ret);
3228                                 goto out;
3229                         }
3230                 }
3231
3232         } else {
3233                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3234                 right_rec = &el->l_recs[index + 1];
3235         }
3236
3237         ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
3238                                            path_num_items(left_path) - 1);
3239         if (ret) {
3240                 mlog_errno(ret);
3241                 goto out;
3242         }
3243
3244         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3245
3246         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3247         le64_add_cpu(&right_rec->e_blkno,
3248                      -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3249         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3250
3251         ocfs2_cleanup_merge(el, index);
3252
3253         ret = ocfs2_journal_dirty(handle, bh);
3254         if (ret)
3255                 mlog_errno(ret);
3256
3257         if (right_path) {
3258                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3259                 if (ret)
3260                         mlog_errno(ret);
3261
3262                 ocfs2_complete_edge_insert(inode, handle, left_path,
3263                                            right_path, subtree_index);
3264         }
3265 out:
3266         if (right_path)
3267                 ocfs2_free_path(right_path);
3268         return ret;
3269 }
3270
3271 static int ocfs2_get_left_path(struct inode *inode,
3272                                struct ocfs2_path *right_path,
3273                                struct ocfs2_path **ret_left_path)
3274 {
3275         int ret;
3276         u32 left_cpos;
3277         struct ocfs2_path *left_path = NULL;
3278
3279         *ret_left_path = NULL;
3280
3281         /* This function shouldn't be called for non-trees. */
3282         BUG_ON(right_path->p_tree_depth == 0);
3283
3284         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3285                                             right_path, &left_cpos);
3286         if (ret) {
3287                 mlog_errno(ret);
3288                 goto out;
3289         }
3290
3291         /* This function shouldn't be called for the leftmost leaf. */
3292         BUG_ON(left_cpos == 0);
3293
3294         left_path = ocfs2_new_path_from_path(right_path);
3295         if (!left_path) {
3296                 ret = -ENOMEM;
3297                 mlog_errno(ret);
3298                 goto out;
3299         }
3300
3301         ret = ocfs2_find_path(inode, left_path, left_cpos);
3302         if (ret) {
3303                 mlog_errno(ret);
3304                 goto out;
3305         }
3306
3307         *ret_left_path = left_path;
3308 out:
3309         if (ret)
3310                 ocfs2_free_path(left_path);
3311         return ret;
3312 }
3313
3314 /*
3315  * Remove split_rec clusters from the record at index and merge them
3316  * onto the tail of the record "before" it.
3317  * For index > 0, the "before" means the extent rec at index - 1.
3318  *
3319  * For index == 0, the "before" means the last record of the previous
3320  * extent block. And there is also a situation that we may need to
3321  * remove the rightmost leaf extent block in the right_path and change
3322  * the right path to indicate the new rightmost path.
3323  */
3324 static int ocfs2_merge_rec_left(struct inode *inode,
3325                                 struct ocfs2_path *right_path,
3326                                 handle_t *handle,
3327                                 struct ocfs2_extent_rec *split_rec,
3328                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3329                                 struct ocfs2_extent_tree *et,
3330                                 int index)
3331 {
3332         int ret, i, subtree_index = 0, has_empty_extent = 0;
3333         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3334         struct ocfs2_extent_rec *left_rec;
3335         struct ocfs2_extent_rec *right_rec;
3336         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3337         struct buffer_head *bh = path_leaf_bh(right_path);
3338         struct buffer_head *root_bh = NULL;
3339         struct ocfs2_path *left_path = NULL;
3340         struct ocfs2_extent_list *left_el;
3341
3342         BUG_ON(index < 0);
3343
3344         right_rec = &el->l_recs[index];
3345         if (index == 0) {
3346                 /* we meet with a cross extent block merge. */
3347                 ret = ocfs2_get_left_path(inode, right_path, &left_path);
3348                 if (ret) {
3349                         mlog_errno(ret);
3350                         goto out;
3351                 }
3352
3353                 left_el = path_leaf_el(left_path);
3354                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3355                        le16_to_cpu(left_el->l_count));
3356
3357                 left_rec = &left_el->l_recs[
3358                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3359                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3360                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3361                        le32_to_cpu(split_rec->e_cpos));
3362
3363                 subtree_index = ocfs2_find_subtree_root(inode,
3364                                                         left_path, right_path);
3365
3366                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3367                                                       handle->h_buffer_credits,
3368                                                       left_path);
3369                 if (ret) {
3370                         mlog_errno(ret);
3371                         goto out;
3372                 }
3373
3374                 root_bh = left_path->p_node[subtree_index].bh;
3375                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3376
3377                 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3378                                                    subtree_index);
3379                 if (ret) {
3380                         mlog_errno(ret);
3381                         goto out;
3382                 }
3383
3384                 for (i = subtree_index + 1;
3385                      i < path_num_items(right_path); i++) {
3386                         ret = ocfs2_path_bh_journal_access(handle, inode,
3387                                                            right_path, i);
3388                         if (ret) {
3389                                 mlog_errno(ret);
3390                                 goto out;
3391                         }
3392
3393                         ret = ocfs2_path_bh_journal_access(handle, inode,
3394                                                            left_path, i);
3395                         if (ret) {
3396                                 mlog_errno(ret);
3397                                 goto out;
3398                         }
3399                 }
3400         } else {
3401                 left_rec = &el->l_recs[index - 1];
3402                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3403                         has_empty_extent = 1;
3404         }
3405
3406         ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3407                                            path_num_items(right_path) - 1);
3408         if (ret) {
3409                 mlog_errno(ret);
3410                 goto out;
3411         }
3412
3413         if (has_empty_extent && index == 1) {
3414                 /*
3415                  * The easy case - we can just plop the record right in.
3416                  */
3417                 *left_rec = *split_rec;
3418
3419                 has_empty_extent = 0;
3420         } else
3421                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3422
3423         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3424         le64_add_cpu(&right_rec->e_blkno,
3425                      ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3426         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3427
3428         ocfs2_cleanup_merge(el, index);
3429
3430         ret = ocfs2_journal_dirty(handle, bh);
3431         if (ret)
3432                 mlog_errno(ret);
3433
3434         if (left_path) {
3435                 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3436                 if (ret)
3437                         mlog_errno(ret);
3438
3439                 /*
3440                  * In the situation that the right_rec is empty and the extent
3441                  * block is empty also,  ocfs2_complete_edge_insert can't handle
3442                  * it and we need to delete the right extent block.
3443                  */
3444                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3445                     le16_to_cpu(el->l_next_free_rec) == 1) {
3446
3447                         ret = ocfs2_remove_rightmost_path(inode, handle,
3448                                                           right_path,
3449                                                           dealloc, et);
3450                         if (ret) {
3451                                 mlog_errno(ret);
3452                                 goto out;
3453                         }
3454
3455                         /* Now the rightmost extent block has been deleted.
3456                          * So we use the new rightmost path.
3457                          */
3458                         ocfs2_mv_path(right_path, left_path);
3459                         left_path = NULL;
3460                 } else
3461                         ocfs2_complete_edge_insert(inode, handle, left_path,
3462                                                    right_path, subtree_index);
3463         }
3464 out:
3465         if (left_path)
3466                 ocfs2_free_path(left_path);
3467         return ret;
3468 }
3469
3470 static int ocfs2_try_to_merge_extent(struct inode *inode,
3471                                      handle_t *handle,
3472                                      struct ocfs2_path *path,
3473                                      int split_index,
3474                                      struct ocfs2_extent_rec *split_rec,
3475                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3476                                      struct ocfs2_merge_ctxt *ctxt,
3477                                      struct ocfs2_extent_tree *et)
3478
3479 {
3480         int ret = 0;
3481         struct ocfs2_extent_list *el = path_leaf_el(path);
3482         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3483
3484         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3485
3486         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3487                 /*
3488                  * The merge code will need to create an empty
3489                  * extent to take the place of the newly
3490                  * emptied slot. Remove any pre-existing empty
3491                  * extents - having more than one in a leaf is
3492                  * illegal.
3493                  */
3494                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3495                                              dealloc, et);
3496                 if (ret) {
3497                         mlog_errno(ret);
3498                         goto out;
3499                 }
3500                 split_index--;
3501                 rec = &el->l_recs[split_index];
3502         }
3503
3504         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3505                 /*
3506                  * Left-right contig implies this.
3507                  */
3508                 BUG_ON(!ctxt->c_split_covers_rec);
3509
3510                 /*
3511                  * Since the leftright insert always covers the entire
3512                  * extent, this call will delete the insert record
3513                  * entirely, resulting in an empty extent record added to
3514                  * the extent block.
3515                  *
3516                  * Since the adding of an empty extent shifts
3517                  * everything back to the right, there's no need to
3518                  * update split_index here.
3519                  *
3520                  * When the split_index is zero, we need to merge it to the
3521                  * prevoius extent block. It is more efficient and easier
3522                  * if we do merge_right first and merge_left later.
3523                  */
3524                 ret = ocfs2_merge_rec_right(inode, path,
3525                                             handle, split_rec,
3526                                             split_index);
3527                 if (ret) {
3528                         mlog_errno(ret);
3529                         goto out;
3530                 }
3531
3532                 /*
3533                  * We can only get this from logic error above.
3534                  */
3535                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3536
3537                 /* The merge left us with an empty extent, remove it. */
3538                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3539                                              dealloc, et);
3540                 if (ret) {
3541                         mlog_errno(ret);
3542                         goto out;
3543                 }
3544
3545                 rec = &el->l_recs[split_index];
3546
3547                 /*
3548                  * Note that we don't pass split_rec here on purpose -
3549                  * we've merged it into the rec already.
3550                  */
3551                 ret = ocfs2_merge_rec_left(inode, path,
3552                                            handle, rec,
3553                                            dealloc, et,
3554                                            split_index);
3555
3556                 if (ret) {
3557                         mlog_errno(ret);
3558                         goto out;
3559                 }
3560
3561                 ret = ocfs2_rotate_tree_left(inode, handle, path,
3562                                              dealloc, et);
3563                 /*
3564                  * Error from this last rotate is not critical, so
3565                  * print but don't bubble it up.
3566                  */
3567                 if (ret)
3568                         mlog_errno(ret);
3569                 ret = 0;
3570         } else {
3571                 /*
3572                  * Merge a record to the left or right.
3573                  *
3574                  * 'contig_type' is relative to the existing record,
3575                  * so for example, if we're "right contig", it's to
3576                  * the record on the left (hence the left merge).
3577                  */
3578                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3579                         ret = ocfs2_merge_rec_left(inode,
3580                                                    path,
3581                                                    handle, split_rec,
3582                                                    dealloc, et,
3583                                                    split_index);
3584                         if (ret) {
3585                                 mlog_errno(ret);
3586                                 goto out;
3587                         }
3588                 } else {
3589                         ret = ocfs2_merge_rec_right(inode,
3590                                                     path,
3591                                                     handle, split_rec,
3592                                                     split_index);
3593                         if (ret) {
3594                                 mlog_errno(ret);
3595                                 goto out;
3596                         }
3597                 }
3598
3599                 if (ctxt->c_split_covers_rec) {
3600                         /*
3601                          * The merge may have left an empty extent in
3602                          * our leaf. Try to rotate it away.
3603                          */
3604                         ret = ocfs2_rotate_tree_left(inode, handle, path,
3605                                                      dealloc, et);
3606                         if (ret)
3607                                 mlog_errno(ret);
3608                         ret = 0;
3609                 }
3610         }
3611
3612 out:
3613         return ret;
3614 }
3615
3616 static void ocfs2_subtract_from_rec(struct super_block *sb,
3617                                     enum ocfs2_split_type split,
3618                                     struct ocfs2_extent_rec *rec,
3619                                     struct ocfs2_extent_rec *split_rec)
3620 {
3621         u64 len_blocks;
3622
3623         len_blocks = ocfs2_clusters_to_blocks(sb,
3624                                 le16_to_cpu(split_rec->e_leaf_clusters));
3625
3626         if (split == SPLIT_LEFT) {
3627                 /*
3628                  * Region is on the left edge of the existing
3629                  * record.
3630                  */
3631                 le32_add_cpu(&rec->e_cpos,
3632                              le16_to_cpu(split_rec->e_leaf_clusters));
3633                 le64_add_cpu(&rec->e_blkno, len_blocks);
3634                 le16_add_cpu(&rec->e_leaf_clusters,
3635                              -le16_to_cpu(split_rec->e_leaf_clusters));
3636         } else {
3637                 /*
3638                  * Region is on the right edge of the existing
3639                  * record.
3640                  */
3641                 le16_add_cpu(&rec->e_leaf_clusters,
3642                              -le16_to_cpu(split_rec->e_leaf_clusters));
3643         }
3644 }
3645
3646 /*
3647  * Do the final bits of extent record insertion at the target leaf
3648  * list. If this leaf is part of an allocation tree, it is assumed
3649  * that the tree above has been prepared.
3650  */
3651 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3652                                  struct ocfs2_extent_list *el,
3653                                  struct ocfs2_insert_type *insert,
3654                                  struct inode *inode)
3655 {
3656         int i = insert->ins_contig_index;
3657         unsigned int range;
3658         struct ocfs2_extent_rec *rec;
3659
3660         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3661
3662         if (insert->ins_split != SPLIT_NONE) {
3663                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3664                 BUG_ON(i == -1);
3665                 rec = &el->l_recs[i];
3666                 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3667                                         insert_rec);
3668                 goto rotate;
3669         }
3670
3671         /*
3672          * Contiguous insert - either left or right.
3673          */
3674         if (insert->ins_contig != CONTIG_NONE) {
3675                 rec = &el->l_recs[i];
3676                 if (insert->ins_contig == CONTIG_LEFT) {
3677                         rec->e_blkno = insert_rec->e_blkno;
3678                         rec->e_cpos = insert_rec->e_cpos;
3679                 }
3680                 le16_add_cpu(&rec->e_leaf_clusters,
3681                              le16_to_cpu(insert_rec->e_leaf_clusters));
3682                 return;
3683         }
3684
3685         /*
3686          * Handle insert into an empty leaf.
3687          */
3688         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3689             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3690              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3691                 el->l_recs[0] = *insert_rec;
3692                 el->l_next_free_rec = cpu_to_le16(1);
3693                 return;
3694         }
3695
3696         /*
3697          * Appending insert.
3698          */
3699         if (insert->ins_appending == APPEND_TAIL) {
3700                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3701                 rec = &el->l_recs[i];
3702                 range = le32_to_cpu(rec->e_cpos)
3703                         + le16_to_cpu(rec->e_leaf_clusters);
3704                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3705
3706                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3707                                 le16_to_cpu(el->l_count),
3708                                 "inode %lu, depth %u, count %u, next free %u, "
3709                                 "rec.cpos %u, rec.clusters %u, "
3710                                 "insert.cpos %u, insert.clusters %u\n",
3711                                 inode->i_ino,
3712                                 le16_to_cpu(el->l_tree_depth),
3713                                 le16_to_cpu(el->l_count),
3714                                 le16_to_cpu(el->l_next_free_rec),
3715                                 le32_to_cpu(el->l_recs[i].e_cpos),
3716                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3717                                 le32_to_cpu(insert_rec->e_cpos),
3718                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3719                 i++;
3720                 el->l_recs[i] = *insert_rec;
3721                 le16_add_cpu(&el->l_next_free_rec, 1);
3722                 return;
3723         }
3724
3725 rotate:
3726         /*
3727          * Ok, we have to rotate.
3728          *
3729          * At this point, it is safe to assume that inserting into an
3730          * empty leaf and appending to a leaf have both been handled
3731          * above.
3732          *
3733          * This leaf needs to have space, either by the empty 1st
3734          * extent record, or by virtue of an l_next_rec < l_count.
3735          */
3736         ocfs2_rotate_leaf(el, insert_rec);
3737 }
3738
3739 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3740                                            handle_t *handle,
3741                                            struct ocfs2_path *path,
3742                                            struct ocfs2_extent_rec *insert_rec)
3743 {
3744         int ret, i, next_free;
3745         struct buffer_head *bh;
3746         struct ocfs2_extent_list *el;
3747         struct ocfs2_extent_rec *rec;
3748
3749         /*
3750          * Update everything except the leaf block.
3751          */
3752         for (i = 0; i < path->p_tree_depth; i++) {
3753                 bh = path->p_node[i].bh;
3754                 el = path->p_node[i].el;
3755
3756                 next_free = le16_to_cpu(el->l_next_free_rec);
3757                 if (next_free == 0) {
3758                         ocfs2_error(inode->i_sb,
3759                                     "Dinode %llu has a bad extent list",
3760                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
3761                         ret = -EIO;
3762                         return;
3763                 }
3764
3765                 rec = &el->l_recs[next_free - 1];
3766
3767                 rec->e_int_clusters = insert_rec->e_cpos;
3768                 le32_add_cpu(&rec->e_int_clusters,
3769                              le16_to_cpu(insert_rec->e_leaf_clusters));
3770                 le32_add_cpu(&rec->e_int_clusters,
3771                              -le32_to_cpu(rec->e_cpos));
3772
3773                 ret = ocfs2_journal_dirty(handle, bh);
3774                 if (ret)
3775                         mlog_errno(ret);
3776
3777         }
3778 }
3779
3780 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3781                                     struct ocfs2_extent_rec *insert_rec,
3782                                     struct ocfs2_path *right_path,
3783                                     struct ocfs2_path **ret_left_path)
3784 {
3785         int ret, next_free;
3786         struct ocfs2_extent_list *el;
3787         struct ocfs2_path *left_path = NULL;
3788
3789         *ret_left_path = NULL;
3790
3791         /*
3792          * This shouldn't happen for non-trees. The extent rec cluster
3793          * count manipulation below only works for interior nodes.
3794          */
3795         BUG_ON(right_path->p_tree_depth == 0);
3796
3797         /*
3798          * If our appending insert is at the leftmost edge of a leaf,
3799          * then we might need to update the rightmost records of the
3800          * neighboring path.
3801          */
3802         el = path_leaf_el(right_path);
3803         next_free = le16_to_cpu(el->l_next_free_rec);
3804         if (next_free == 0 ||
3805             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3806                 u32 left_cpos;
3807
3808                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3809                                                     &left_cpos);
3810                 if (ret) {
3811                         mlog_errno(ret);
3812                         goto out;
3813                 }
3814
3815                 mlog(0, "Append may need a left path update. cpos: %u, "
3816                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3817                      left_cpos);
3818
3819                 /*
3820                  * No need to worry if the append is already in the
3821                  * leftmost leaf.
3822                  */
3823                 if (left_cpos) {
3824                         left_path = ocfs2_new_path_from_path(right_path);
3825                         if (!left_path) {
3826                                 ret = -ENOMEM;
3827                                 mlog_errno(ret);
3828                                 goto out;
3829                         }
3830
3831                         ret = ocfs2_find_path(inode, left_path, left_cpos);
3832                         if (ret) {
3833                                 mlog_errno(ret);
3834                                 goto out;
3835                         }
3836
3837                         /*
3838                          * ocfs2_insert_path() will pass the left_path to the
3839                          * journal for us.
3840                          */
3841                 }
3842         }
3843
3844         ret = ocfs2_journal_access_path(inode, handle, right_path);
3845         if (ret) {
3846                 mlog_errno(ret);
3847                 goto out;
3848         }
3849
3850         ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3851
3852         *ret_left_path = left_path;
3853         ret = 0;
3854 out:
3855         if (ret != 0)
3856                 ocfs2_free_path(left_path);
3857
3858         return ret;
3859 }
3860
3861 static void ocfs2_split_record(struct inode *inode,
3862                                struct ocfs2_path *left_path,
3863                                struct ocfs2_path *right_path,
3864                                struct ocfs2_extent_rec *split_rec,
3865                                enum ocfs2_split_type split)
3866 {
3867         int index;
3868         u32 cpos = le32_to_cpu(split_rec->e_cpos);
3869         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3870         struct ocfs2_extent_rec *rec, *tmprec;
3871
3872         right_el = path_leaf_el(right_path);
3873         if (left_path)
3874                 left_el = path_leaf_el(left_path);
3875
3876         el = right_el;
3877         insert_el = right_el;
3878         index = ocfs2_search_extent_list(el, cpos);
3879         if (index != -1) {
3880                 if (index == 0 && left_path) {
3881                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3882
3883                         /*
3884                          * This typically means that the record
3885                          * started in the left path but moved to the
3886                          * right as a result of rotation. We either
3887                          * move the existing record to the left, or we
3888                          * do the later insert there.
3889                          *
3890                          * In this case, the left path should always
3891                          * exist as the rotate code will have passed
3892                          * it back for a post-insert update.
3893                          */
3894
3895                         if (split == SPLIT_LEFT) {
3896                                 /*
3897                                  * It's a left split. Since we know
3898                                  * that the rotate code gave us an
3899                                  * empty extent in the left path, we
3900                                  * can just do the insert there.
3901                                  */
3902                                 insert_el = left_el;
3903                         } else {
3904                                 /*
3905                                  * Right split - we have to move the
3906                                  * existing record over to the left
3907                                  * leaf. The insert will be into the
3908                                  * newly created empty extent in the
3909                                  * right leaf.
3910                                  */
3911                                 tmprec = &right_el->l_recs[index];
3912                                 ocfs2_rotate_leaf(left_el, tmprec);
3913                                 el = left_el;
3914
3915                                 memset(tmprec, 0, sizeof(*tmprec));
3916                                 index = ocfs2_search_extent_list(left_el, cpos);
3917                                 BUG_ON(index == -1);
3918                         }
3919                 }
3920         } else {
3921                 BUG_ON(!left_path);
3922                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3923                 /*
3924                  * Left path is easy - we can just allow the insert to
3925                  * happen.
3926                  */
3927                 el = left_el;
3928                 insert_el = left_el;
3929                 index = ocfs2_search_extent_list(el, cpos);
3930                 BUG_ON(index == -1);
3931         }
3932
3933         rec = &el->l_recs[index];
3934         ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3935         ocfs2_rotate_leaf(insert_el, split_rec);
3936 }
3937
3938 /*
3939  * This function only does inserts on an allocation b-tree. For tree
3940  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3941  *
3942  * right_path is the path we want to do the actual insert
3943  * in. left_path should only be passed in if we need to update that
3944  * portion of the tree after an edge insert.
3945  */
3946 static int ocfs2_insert_path(struct inode *inode,
3947                              handle_t *handle,
3948                              struct ocfs2_path *left_path,
3949                              struct ocfs2_path *right_path,
3950                              struct ocfs2_extent_rec *insert_rec,
3951                              struct ocfs2_insert_type *insert)
3952 {
3953         int ret, subtree_index;
3954         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3955
3956         if (left_path) {
3957                 int credits = handle->h_buffer_credits;
3958
3959                 /*
3960                  * There's a chance that left_path got passed back to
3961                  * us without being accounted for in the
3962                  * journal. Extend our transaction here to be sure we
3963                  * can change those blocks.
3964                  */
3965                 credits += left_path->p_tree_depth;
3966
3967                 ret = ocfs2_extend_trans(handle, credits);
3968                 if (ret < 0) {
3969                         mlog_errno(ret);
3970                         goto out;
3971                 }
3972
3973                 ret = ocfs2_journal_access_path(inode, handle, left_path);
3974                 if (ret < 0) {
3975                         mlog_errno(ret);
3976                         goto out;
3977                 }
3978         }
3979
3980         /*
3981          * Pass both paths to the journal. The majority of inserts
3982          * will be touching all components anyway.
3983          */
3984         ret = ocfs2_journal_access_path(inode, handle, right_path);
3985         if (ret < 0) {
3986                 mlog_errno(ret);
3987                 goto out;
3988         }
3989
3990         if (insert->ins_split != SPLIT_NONE) {
3991                 /*
3992                  * We could call ocfs2_insert_at_leaf() for some types
3993                  * of splits, but it's easier to just let one separate
3994                  * function sort it all out.
3995                  */
3996                 ocfs2_split_record(inode, left_path, right_path,
3997                                    insert_rec, insert->ins_split);
3998
3999                 /*
4000                  * Split might have modified either leaf and we don't
4001                  * have a guarantee that the later edge insert will
4002                  * dirty this for us.
4003                  */
4004                 if (left_path)
4005                         ret = ocfs2_journal_dirty(handle,
4006                                                   path_leaf_bh(left_path));
4007                         if (ret)
4008                                 mlog_errno(ret);
4009         } else
4010                 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
4011                                      insert, inode);
4012
4013         ret = ocfs2_journal_dirty(handle, leaf_bh);
4014         if (ret)
4015                 mlog_errno(ret);
4016
4017         if (left_path) {
4018                 /*
4019                  * The rotate code has indicated that we need to fix
4020                  * up portions of the tree after the insert.
4021                  *
4022                  * XXX: Should we extend the transaction here?
4023                  */
4024                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
4025                                                         right_path);
4026                 ocfs2_complete_edge_insert(inode, handle, left_path,
4027                                            right_path, subtree_index);
4028         }
4029
4030         ret = 0;
4031 out:
4032         return ret;
4033 }
4034
4035 static int ocfs2_do_insert_extent(struct inode *inode,
4036                                   handle_t *handle,
4037                                   struct ocfs2_extent_tree *et,
4038                                   struct ocfs2_extent_rec *insert_rec,
4039                                   struct ocfs2_insert_type *type)
4040 {
4041         int ret, rotate = 0;
4042         u32 cpos;
4043         struct ocfs2_path *right_path = NULL;
4044         struct ocfs2_path *left_path = NULL;
4045         struct ocfs2_extent_list *el;
4046
4047         el = et->et_root_el;
4048
4049         ret = ocfs2_et_root_journal_access(handle, inode, et,
4050                                            OCFS2_JOURNAL_ACCESS_WRITE);
4051         if (ret) {
4052                 mlog_errno(ret);
4053                 goto out;
4054         }
4055
4056         if (le16_to_cpu(el->l_tree_depth) == 0) {
4057                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
4058                 goto out_update_clusters;
4059         }
4060
4061         right_path = ocfs2_new_path_from_et(et);
4062         if (!right_path) {
4063                 ret = -ENOMEM;
4064                 mlog_errno(ret);
4065                 goto out;
4066         }
4067
4068         /*
4069          * Determine the path to start with. Rotations need the
4070          * rightmost path, everything else can go directly to the
4071          * target leaf.
4072          */
4073         cpos = le32_to_cpu(insert_rec->e_cpos);
4074         if (type->ins_appending == APPEND_NONE &&
4075             type->ins_contig == CONTIG_NONE) {
4076                 rotate = 1;
4077                 cpos = UINT_MAX;
4078         }
4079
4080         ret = ocfs2_find_path(inode, right_path, cpos);
4081         if (ret) {
4082                 mlog_errno(ret);
4083                 goto out;
4084         }
4085
4086         /*
4087          * Rotations and appends need special treatment - they modify
4088          * parts of the tree's above them.
4089          *
4090          * Both might pass back a path immediate to the left of the
4091          * one being inserted to. This will be cause
4092          * ocfs2_insert_path() to modify the rightmost records of
4093          * left_path to account for an edge insert.
4094          *
4095          * XXX: When modifying this code, keep in mind that an insert
4096          * can wind up skipping both of these two special cases...
4097          */
4098         if (rotate) {
4099                 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
4100                                               le32_to_cpu(insert_rec->e_cpos),
4101                                               right_path, &left_path);
4102                 if (ret) {
4103                         mlog_errno(ret);
4104                         goto out;
4105                 }
4106
4107                 /*
4108                  * ocfs2_rotate_tree_right() might have extended the
4109                  * transaction without re-journaling our tree root.
4110                  */
4111                 ret = ocfs2_et_root_journal_access(handle, inode, et,
4112                                                    OCFS2_JOURNAL_ACCESS_WRITE);
4113                 if (ret) {
4114                         mlog_errno(ret);
4115                         goto out;
4116                 }
4117         } else if (type->ins_appending == APPEND_TAIL
4118                    && type->ins_contig != CONTIG_LEFT) {
4119                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4120                                                right_path, &left_path);
4121                 if (ret) {
4122                         mlog_errno(ret);
4123                         goto out;
4124                 }
4125         }
4126
4127         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4128                                 insert_rec, type);
4129         if (ret) {
4130                 mlog_errno(ret);
4131                 goto out;
4132         }
4133
4134 out_update_clusters:
4135         if (type->ins_split == SPLIT_NONE)
4136                 ocfs2_et_update_clusters(inode, et,
4137                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4138
4139         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4140         if (ret)
4141                 mlog_errno(ret);
4142
4143 out:
4144         ocfs2_free_path(left_path);
4145         ocfs2_free_path(right_path);
4146
4147         return ret;
4148 }
4149
4150 static enum ocfs2_contig_type
4151 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4152                                struct ocfs2_extent_list *el, int index,
4153                                struct ocfs2_extent_rec *split_rec)
4154 {
4155         int status;
4156         enum ocfs2_contig_type ret = CONTIG_NONE;
4157         u32 left_cpos, right_cpos;
4158         struct ocfs2_extent_rec *rec = NULL;
4159         struct ocfs2_extent_list *new_el;
4160         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4161         struct buffer_head *bh;
4162         struct ocfs2_extent_block *eb;
4163
4164         if (index > 0) {
4165                 rec = &el->l_recs[index - 1];
4166         } else if (path->p_tree_depth > 0) {
4167                 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4168                                                        path, &left_cpos);
4169                 if (status)
4170                         goto out;
4171
4172                 if (left_cpos != 0) {
4173                         left_path = ocfs2_new_path_from_path(path);
4174                         if (!left_path)
4175                                 goto out;
4176
4177                         status = ocfs2_find_path(inode, left_path, left_cpos);
4178                         if (status)
4179                                 goto out;
4180
4181                         new_el = path_leaf_el(left_path);
4182
4183                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4184                             le16_to_cpu(new_el->l_count)) {
4185                                 bh = path_leaf_bh(left_path);
4186                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4187                                 ocfs2_error(inode->i_sb,
4188                                             "Extent block #%llu has an "
4189                                             "invalid l_next_free_rec of "
4190                                             "%d.  It should have "
4191                                             "matched the l_count of %d",
4192                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4193                                             le16_to_cpu(new_el->l_next_free_rec),
4194                                             le16_to_cpu(new_el->l_count));
4195                                 status = -EINVAL;
4196                                 goto out;
4197                         }
4198                         rec = &new_el->l_recs[
4199                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4200                 }
4201         }
4202
4203         /*
4204          * We're careful to check for an empty extent record here -
4205          * the merge code will know what to do if it sees one.
4206          */
4207         if (rec) {
4208                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4209                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4210                                 ret = CONTIG_RIGHT;
4211                 } else {
4212                         ret = ocfs2_extent_contig(inode, rec, split_rec);
4213                 }
4214         }
4215
4216         rec = NULL;
4217         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4218                 rec = &el->l_recs[index + 1];
4219         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4220                  path->p_tree_depth > 0) {
4221                 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4222                                                         path, &right_cpos);
4223                 if (status)
4224                         goto out;
4225
4226                 if (right_cpos == 0)
4227                         goto out;
4228
4229                 right_path = ocfs2_new_path_from_path(path);
4230                 if (!right_path)
4231                         goto out;
4232
4233                 status = ocfs2_find_path(inode, right_path, right_cpos);
4234                 if (status)
4235                         goto out;
4236
4237                 new_el = path_leaf_el(right_path);
4238                 rec = &new_el->l_recs[0];
4239                 if (ocfs2_is_empty_extent(rec)) {
4240                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4241                                 bh = path_leaf_bh(right_path);
4242                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4243                                 ocfs2_error(inode->i_sb,
4244                                             "Extent block #%llu has an "
4245                                             "invalid l_next_free_rec of %d",
4246                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4247                                             le16_to_cpu(new_el->l_next_free_rec));
4248                                 status = -EINVAL;
4249                                 goto out;
4250                         }
4251                         rec = &new_el->l_recs[1];
4252                 }
4253         }
4254
4255         if (rec) {
4256                 enum ocfs2_contig_type contig_type;
4257
4258                 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4259
4260                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4261                         ret = CONTIG_LEFTRIGHT;
4262                 else if (ret == CONTIG_NONE)
4263                         ret = contig_type;
4264         }
4265
4266 out:
4267         if (left_path)
4268                 ocfs2_free_path(left_path);
4269         if (right_path)
4270                 ocfs2_free_path(right_path);
4271
4272         return ret;
4273 }
4274
4275 static void ocfs2_figure_contig_type(struct inode *inode,
4276                                      struct ocfs2_insert_type *insert,
4277                                      struct ocfs2_extent_list *el,
4278                                      struct ocfs2_extent_rec *insert_rec,
4279                                      struct ocfs2_extent_tree *et)
4280 {
4281         int i;
4282         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4283
4284         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4285
4286         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4287                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4288                                                   insert_rec);
4289                 if (contig_type != CONTIG_NONE) {
4290                         insert->ins_contig_index = i;
4291                         break;
4292                 }
4293         }
4294         insert->ins_contig = contig_type;
4295
4296         if (insert->ins_contig != CONTIG_NONE) {
4297                 struct ocfs2_extent_rec *rec =
4298                                 &el->l_recs[insert->ins_contig_index];
4299                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4300                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4301
4302                 /*
4303                  * Caller might want us to limit the size of extents, don't
4304                  * calculate contiguousness if we might exceed that limit.
4305                  */
4306                 if (et->et_max_leaf_clusters &&
4307                     (len > et->et_max_leaf_clusters))
4308                         insert->ins_contig = CONTIG_NONE;
4309         }
4310 }
4311
4312 /*
4313  * This should only be called against the righmost leaf extent list.
4314  *
4315  * ocfs2_figure_appending_type() will figure out whether we'll have to
4316  * insert at the tail of the rightmost leaf.
4317  *
4318  * This should also work against the root extent list for tree's with 0
4319  * depth. If we consider the root extent list to be the rightmost leaf node
4320  * then the logic here makes sense.
4321  */
4322 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4323                                         struct ocfs2_extent_list *el,
4324                                         struct ocfs2_extent_rec *insert_rec)
4325 {
4326         int i;
4327         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4328         struct ocfs2_extent_rec *rec;
4329
4330         insert->ins_appending = APPEND_NONE;
4331
4332         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4333
4334         if (!el->l_next_free_rec)
4335                 goto set_tail_append;
4336
4337         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4338                 /* Were all records empty? */
4339                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4340                         goto set_tail_append;
4341         }
4342
4343         i = le16_to_cpu(el->l_next_free_rec) - 1;
4344         rec = &el->l_recs[i];
4345
4346         if (cpos >=
4347             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4348                 goto set_tail_append;
4349
4350         return;
4351
4352 set_tail_append:
4353         insert->ins_appending = APPEND_TAIL;
4354 }
4355
4356 /*
4357  * Helper function called at the begining of an insert.
4358  *
4359  * This computes a few things that are commonly used in the process of
4360  * inserting into the btree:
4361  *   - Whether the new extent is contiguous with an existing one.
4362  *   - The current tree depth.
4363  *   - Whether the insert is an appending one.
4364  *   - The total # of free records in the tree.
4365  *
4366  * All of the information is stored on the ocfs2_insert_type
4367  * structure.
4368  */
4369 static int ocfs2_figure_insert_type(struct inode *inode,
4370                                     struct ocfs2_extent_tree *et,
4371                                     struct buffer_head **last_eb_bh,
4372                                     struct ocfs2_extent_rec *insert_rec,
4373                                     int *free_records,
4374                                     struct ocfs2_insert_type *insert)
4375 {
4376         int ret;
4377         struct ocfs2_extent_block *eb;
4378         struct ocfs2_extent_list *el;
4379         struct ocfs2_path *path = NULL;
4380         struct buffer_head *bh = NULL;
4381
4382         insert->ins_split = SPLIT_NONE;
4383
4384         el = et->et_root_el;
4385         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4386
4387         if (el->l_tree_depth) {
4388                 /*
4389                  * If we have tree depth, we read in the
4390                  * rightmost extent block ahead of time as
4391                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4392                  * may want it later.
4393                  */
4394                 ret = ocfs2_read_extent_block(inode,
4395                                               ocfs2_et_get_last_eb_blk(et),
4396                                               &bh);
4397                 if (ret) {
4398                         mlog_exit(ret);
4399                         goto out;
4400                 }
4401                 eb = (struct ocfs2_extent_block *) bh->b_data;
4402                 el = &eb->h_list;
4403         }
4404
4405         /*
4406          * Unless we have a contiguous insert, we'll need to know if
4407          * there is room left in our allocation tree for another
4408          * extent record.
4409          *
4410          * XXX: This test is simplistic, we can search for empty
4411          * extent records too.
4412          */
4413         *free_records = le16_to_cpu(el->l_count) -
4414                 le16_to_cpu(el->l_next_free_rec);
4415
4416         if (!insert->ins_tree_depth) {
4417                 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4418                 ocfs2_figure_appending_type(insert, el, insert_rec);
4419                 return 0;
4420         }
4421
4422         path = ocfs2_new_path_from_et(et);
4423         if (!path) {
4424                 ret = -ENOMEM;
4425                 mlog_errno(ret);
4426                 goto out;
4427         }
4428
4429         /*
4430          * In the case that we're inserting past what the tree
4431          * currently accounts for, ocfs2_find_path() will return for
4432          * us the rightmost tree path. This is accounted for below in
4433          * the appending code.
4434          */
4435         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4436         if (ret) {
4437                 mlog_errno(ret);
4438                 goto out;
4439         }
4440
4441         el = path_leaf_el(path);
4442
4443         /*
4444          * Now that we have the path, there's two things we want to determine:
4445          * 1) Contiguousness (also set contig_index if this is so)
4446          *
4447          * 2) Are we doing an append? We can trivially break this up
4448          *     into two types of appends: simple record append, or a
4449          *     rotate inside the tail leaf.
4450          */
4451         ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4452
4453         /*
4454          * The insert code isn't quite ready to deal with all cases of
4455          * left contiguousness. Specifically, if it's an insert into
4456          * the 1st record in a leaf, it will require the adjustment of
4457          * cluster count on the last record of the path directly to it's
4458          * left. For now, just catch that case and fool the layers
4459          * above us. This works just fine for tree_depth == 0, which
4460          * is why we allow that above.
4461          */
4462         if (insert->ins_contig == CONTIG_LEFT &&
4463             insert->ins_contig_index == 0)
4464                 insert->ins_contig = CONTIG_NONE;
4465
4466         /*
4467          * Ok, so we can simply compare against last_eb to figure out
4468          * whether the path doesn't exist. This will only happen in
4469          * the case that we're doing a tail append, so maybe we can
4470          * take advantage of that information somehow.
4471          */
4472         if (ocfs2_et_get_last_eb_blk(et) ==
4473             path_leaf_bh(path)->b_blocknr) {
4474                 /*
4475                  * Ok, ocfs2_find_path() returned us the rightmost
4476                  * tree path. This might be an appending insert. There are
4477                  * two cases:
4478                  *    1) We're doing a true append at the tail:
4479                  *      -This might even be off the end of the leaf
4480                  *    2) We're "appending" by rotating in the tail
4481                  */
4482                 ocfs2_figure_appending_type(insert, el, insert_rec);
4483         }
4484
4485 out:
4486         ocfs2_free_path(path);
4487
4488         if (ret == 0)
4489                 *last_eb_bh = bh;
4490         else
4491                 brelse(bh);
4492         return ret;
4493 }
4494
4495 /*
4496  * Insert an extent into an inode btree.
4497  *
4498  * The caller needs to update fe->i_clusters
4499  */
4500 int ocfs2_insert_extent(struct ocfs2_super *osb,
4501                         handle_t *handle,
4502                         struct inode *inode,
4503                         struct ocfs2_extent_tree *et,
4504                         u32 cpos,
4505                         u64 start_blk,
4506                         u32 new_clusters,
4507                         u8 flags,
4508                         struct ocfs2_alloc_context *meta_ac)
4509 {
4510         int status;
4511         int uninitialized_var(free_records);
4512         struct buffer_head *last_eb_bh = NULL;
4513         struct ocfs2_insert_type insert = {0, };
4514         struct ocfs2_extent_rec rec;
4515
4516         mlog(0, "add %u clusters at position %u to inode %llu\n",
4517              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4518
4519         memset(&rec, 0, sizeof(rec));
4520         rec.e_cpos = cpu_to_le32(cpos);
4521         rec.e_blkno = cpu_to_le64(start_blk);
4522         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4523         rec.e_flags = flags;
4524         status = ocfs2_et_insert_check(inode, et, &rec);
4525         if (status) {
4526                 mlog_errno(status);
4527                 goto bail;
4528         }
4529
4530         status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4531                                           &free_records, &insert);
4532         if (status < 0) {
4533                 mlog_errno(status);
4534                 goto bail;
4535         }
4536
4537         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4538              "Insert.contig_index: %d, Insert.free_records: %d, "
4539              "Insert.tree_depth: %d\n",
4540              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4541              free_records, insert.ins_tree_depth);
4542
4543         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4544                 status = ocfs2_grow_tree(inode, handle, et,
4545                                          &insert.ins_tree_depth, &last_eb_bh,
4546                                          meta_ac);
4547                 if (status) {
4548                         mlog_errno(status);
4549                         goto bail;
4550                 }
4551         }
4552
4553         /* Finally, we can add clusters. This might rotate the tree for us. */
4554         status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4555         if (status < 0)
4556                 mlog_errno(status);
4557         else if (et->et_ops == &ocfs2_dinode_et_ops)
4558                 ocfs2_extent_map_insert_rec(inode, &rec);
4559
4560 bail:
4561         brelse(last_eb_bh);
4562
4563         mlog_exit(status);
4564         return status;
4565 }
4566
4567 /*
4568  * Allcate and add clusters into the extent b-tree.
4569  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4570  * The extent b-tree's root is specified by et, and
4571  * it is not limited to the file storage. Any extent tree can use this
4572  * function if it implements the proper ocfs2_extent_tree.
4573  */
4574 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4575                                 struct inode *inode,
4576                                 u32 *logical_offset,
4577                                 u32 clusters_to_add,
4578                                 int mark_unwritten,
4579                                 struct ocfs2_extent_tree *et,
4580                                 handle_t *handle,
4581                                 struct ocfs2_alloc_context *data_ac,
4582                                 struct ocfs2_alloc_context *meta_ac,
4583                                 enum ocfs2_alloc_restarted *reason_ret)
4584 {
4585         int status = 0;
4586         int free_extents;
4587         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4588         u32 bit_off, num_bits;
4589         u64 block;
4590         u8 flags = 0;
4591
4592         BUG_ON(!clusters_to_add);
4593
4594         if (mark_unwritten)
4595                 flags = OCFS2_EXT_UNWRITTEN;
4596
4597         free_extents = ocfs2_num_free_extents(osb, inode, et);
4598         if (free_extents < 0) {
4599                 status = free_extents;
4600                 mlog_errno(status);
4601                 goto leave;
4602         }
4603
4604         /* there are two cases which could cause us to EAGAIN in the
4605          * we-need-more-metadata case:
4606          * 1) we haven't reserved *any*
4607          * 2) we are so fragmented, we've needed to add metadata too
4608          *    many times. */
4609         if (!free_extents && !meta_ac) {
4610                 mlog(0, "we haven't reserved any metadata!\n");
4611                 status = -EAGAIN;
4612                 reason = RESTART_META;
4613                 goto leave;
4614         } else if ((!free_extents)
4615                    && (ocfs2_alloc_context_bits_left(meta_ac)
4616                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4617                 mlog(0, "filesystem is really fragmented...\n");
4618                 status = -EAGAIN;
4619                 reason = RESTART_META;
4620                 goto leave;
4621         }
4622
4623         status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4624                                         clusters_to_add, &bit_off, &num_bits);
4625         if (status < 0) {
4626                 if (status != -ENOSPC)
4627                         mlog_errno(status);
4628                 goto leave;
4629         }
4630
4631         BUG_ON(num_bits > clusters_to_add);
4632
4633         /* reserve our write early -- insert_extent may update the tree root */
4634         status = ocfs2_et_root_journal_access(handle, inode, et,
4635                                               OCFS2_JOURNAL_ACCESS_WRITE);
4636         if (status < 0) {
4637                 mlog_errno(status);
4638                 goto leave;
4639         }
4640
4641         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4642         mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4643              num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4644         status = ocfs2_insert_extent(osb, handle, inode, et,
4645                                      *logical_offset, block,
4646                                      num_bits, flags, meta_ac);
4647         if (status < 0) {
4648                 mlog_errno(status);
4649                 goto leave;
4650         }
4651
4652         status = ocfs2_journal_dirty(handle, et->et_root_bh);
4653         if (status < 0) {
4654                 mlog_errno(status);
4655                 goto leave;
4656         }
4657
4658         clusters_to_add -= num_bits;
4659         *logical_offset += num_bits;
4660
4661         if (clusters_to_add) {
4662                 mlog(0, "need to alloc once more, wanted = %u\n",
4663                      clusters_to_add);
4664                 status = -EAGAIN;
4665                 reason = RESTART_TRANS;
4666         }
4667
4668 leave:
4669         mlog_exit(status);
4670         if (reason_ret)
4671                 *reason_ret = reason;
4672         return status;
4673 }
4674
4675 static void ocfs2_make_right_split_rec(struct super_block *sb,
4676                                        struct ocfs2_extent_rec *split_rec,
4677                                        u32 cpos,
4678                                        struct ocfs2_extent_rec *rec)
4679 {
4680         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4681         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4682
4683         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4684
4685         split_rec->e_cpos = cpu_to_le32(cpos);
4686         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4687
4688         split_rec->e_blkno = rec->e_blkno;
4689         le64_add_cpu(&split_rec->e_blkno,
4690                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4691
4692         split_rec->e_flags = rec->e_flags;
4693 }
4694
4695 static int ocfs2_split_and_insert(struct inode *inode,
4696                                   handle_t *handle,
4697                                   struct ocfs2_path *path,
4698                                   struct ocfs2_extent_tree *et,
4699                                   struct buffer_head **last_eb_bh,
4700                                   int split_index,
4701                                   struct ocfs2_extent_rec *orig_split_rec,
4702                                   struct ocfs2_alloc_context *meta_ac)
4703 {
4704         int ret = 0, depth;
4705         unsigned int insert_range, rec_range, do_leftright = 0;
4706         struct ocfs2_extent_rec tmprec;
4707         struct ocfs2_extent_list *rightmost_el;
4708         struct ocfs2_extent_rec rec;
4709         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4710         struct ocfs2_insert_type insert;
4711         struct ocfs2_extent_block *eb;
4712
4713 leftright:
4714         /*
4715          * Store a copy of the record on the stack - it might move
4716          * around as the tree is manipulated below.
4717          */
4718         rec = path_leaf_el(path)->l_recs[split_index];
4719
4720         rightmost_el = et->et_root_el;
4721
4722         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4723         if (depth) {
4724                 BUG_ON(!(*last_eb_bh));
4725                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4726                 rightmost_el = &eb->h_list;
4727         }
4728
4729         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4730             le16_to_cpu(rightmost_el->l_count)) {
4731                 ret = ocfs2_grow_tree(inode, handle, et,
4732                                       &depth, last_eb_bh, meta_ac);
4733                 if (ret) {
4734                         mlog_errno(ret);
4735                         goto out;
4736                 }
4737         }
4738
4739         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4740         insert.ins_appending = APPEND_NONE;
4741         insert.ins_contig = CONTIG_NONE;
4742         insert.ins_tree_depth = depth;
4743
4744         insert_range = le32_to_cpu(split_rec.e_cpos) +
4745                 le16_to_cpu(split_rec.e_leaf_clusters);
4746         rec_range = le32_to_cpu(rec.e_cpos) +
4747                 le16_to_cpu(rec.e_leaf_clusters);
4748
4749         if (split_rec.e_cpos == rec.e_cpos) {
4750                 insert.ins_split = SPLIT_LEFT;
4751         } else if (insert_range == rec_range) {
4752                 insert.ins_split = SPLIT_RIGHT;
4753         } else {
4754                 /*
4755                  * Left/right split. We fake this as a right split
4756                  * first and then make a second pass as a left split.
4757                  */
4758                 insert.ins_split = SPLIT_RIGHT;
4759
4760                 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4761                                            &rec);
4762
4763                 split_rec = tmprec;
4764
4765                 BUG_ON(do_leftright);
4766                 do_leftright = 1;
4767         }
4768
4769         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4770         if (ret) {
4771                 mlog_errno(ret);
4772                 goto out;
4773         }
4774
4775         if (do_leftright == 1) {
4776                 u32 cpos;
4777                 struct ocfs2_extent_list *el;
4778
4779                 do_leftright++;
4780                 split_rec = *orig_split_rec;
4781
4782                 ocfs2_reinit_path(path, 1);
4783
4784                 cpos = le32_to_cpu(split_rec.e_cpos);
4785                 ret = ocfs2_find_path(inode, path, cpos);
4786                 if (ret) {
4787                         mlog_errno(ret);
4788                         goto out;
4789                 }
4790
4791                 el = path_leaf_el(path);
4792                 split_index = ocfs2_search_extent_list(el, cpos);
4793                 goto leftright;
4794         }
4795 out:
4796
4797         return ret;
4798 }
4799
4800 static int ocfs2_replace_extent_rec(struct inode *inode,
4801                                     handle_t *handle,
4802                                     struct ocfs2_path *path,
4803                                     struct ocfs2_extent_list *el,
4804                                     int split_index,
4805                                     struct ocfs2_extent_rec *split_rec)
4806 {
4807         int ret;
4808
4809         ret = ocfs2_path_bh_journal_access(handle, inode, path,
4810                                            path_num_items(path) - 1);
4811         if (ret) {
4812                 mlog_errno(ret);
4813                 goto out;
4814         }
4815
4816         el->l_recs[split_index] = *split_rec;
4817
4818         ocfs2_journal_dirty(handle, path_leaf_bh(path));
4819 out:
4820         return ret;
4821 }
4822
4823 /*
4824  * Mark part or all of the extent record at split_index in the leaf
4825  * pointed to by path as written. This removes the unwritten
4826  * extent flag.
4827  *
4828  * Care is taken to handle contiguousness so as to not grow the tree.
4829  *
4830  * meta_ac is not strictly necessary - we only truly need it if growth
4831  * of the tree is required. All other cases will degrade into a less
4832  * optimal tree layout.
4833  *
4834  * last_eb_bh should be the rightmost leaf block for any extent
4835  * btree. Since a split may grow the tree or a merge might shrink it,
4836  * the caller cannot trust the contents of that buffer after this call.
4837  *
4838  * This code is optimized for readability - several passes might be
4839  * made over certain portions of the tree. All of those blocks will
4840  * have been brought into cache (and pinned via the journal), so the
4841  * extra overhead is not expressed in terms of disk reads.
4842  */
4843 static int __ocfs2_mark_extent_written(struct inode *inode,
4844                                        struct ocfs2_extent_tree *et,
4845                                        handle_t *handle,
4846                                        struct ocfs2_path *path,
4847                                        int split_index,
4848                                        struct ocfs2_extent_rec *split_rec,
4849                                        struct ocfs2_alloc_context *meta_ac,
4850                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
4851 {
4852         int ret = 0;
4853         struct ocfs2_extent_list *el = path_leaf_el(path);
4854         struct buffer_head *last_eb_bh = NULL;
4855         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4856         struct ocfs2_merge_ctxt ctxt;
4857         struct ocfs2_extent_list *rightmost_el;
4858
4859         if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4860                 ret = -EIO;
4861                 mlog_errno(ret);
4862                 goto out;
4863         }
4864
4865         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4866             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4867              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4868                 ret = -EIO;
4869                 mlog_errno(ret);
4870                 goto out;
4871         }
4872
4873         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4874                                                             split_index,
4875                                                             split_rec);
4876
4877         /*
4878          * The core merge / split code wants to know how much room is
4879          * left in this inodes allocation tree, so we pass the
4880          * rightmost extent list.
4881          */
4882         if (path->p_tree_depth) {
4883                 struct ocfs2_extent_block *eb;
4884
4885                 ret = ocfs2_read_extent_block(inode,
4886                                               ocfs2_et_get_last_eb_blk(et),
4887                                               &last_eb_bh);
4888                 if (ret) {
4889                         mlog_exit(ret);
4890                         goto out;
4891                 }
4892
4893                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4894                 rightmost_el = &eb->h_list;
4895         } else
4896                 rightmost_el = path_root_el(path);
4897
4898         if (rec->e_cpos == split_rec->e_cpos &&
4899             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4900                 ctxt.c_split_covers_rec = 1;
4901         else
4902                 ctxt.c_split_covers_rec = 0;
4903
4904         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4905
4906         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4907              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4908              ctxt.c_split_covers_rec);
4909
4910         if (ctxt.c_contig_type == CONTIG_NONE) {
4911                 if (ctxt.c_split_covers_rec)
4912                         ret = ocfs2_replace_extent_rec(inode, handle,
4913                                                        path, el,
4914                                                        split_index, split_rec);
4915                 else
4916                         ret = ocfs2_split_and_insert(inode, handle, path, et,
4917                                                      &last_eb_bh, split_index,
4918                                                      split_rec, meta_ac);
4919                 if (ret)
4920                         mlog_errno(ret);
4921         } else {
4922                 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4923                                                 split_index, split_rec,
4924                                                 dealloc, &ctxt, et);
4925                 if (ret)
4926                         mlog_errno(ret);
4927         }
4928
4929 out:
4930         brelse(last_eb_bh);
4931         return ret;
4932 }
4933
4934 /*
4935  * Mark the already-existing extent at cpos as written for len clusters.
4936  *
4937  * If the existing extent is larger than the request, initiate a
4938  * split. An attempt will be made at merging with adjacent extents.
4939  *
4940  * The caller is responsible for passing down meta_ac if we'll need it.
4941  */
4942 int ocfs2_mark_extent_written(struct inode *inode,
4943                               struct ocfs2_extent_tree *et,
4944                               handle_t *handle, u32 cpos, u32 len, u32 phys,
4945                               struct ocfs2_alloc_context *meta_ac,
4946                               struct ocfs2_cached_dealloc_ctxt *dealloc)
4947 {
4948         int ret, index;
4949         u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4950         struct ocfs2_extent_rec split_rec;
4951         struct ocfs2_path *left_path = NULL;
4952         struct ocfs2_extent_list *el;
4953
4954         mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4955              inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4956
4957         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4958                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4959                             "that are being written to, but the feature bit "
4960                             "is not set in the super block.",
4961                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
4962                 ret = -EROFS;
4963                 goto out;
4964         }
4965
4966         /*
4967          * XXX: This should be fixed up so that we just re-insert the
4968          * next extent records.
4969          *
4970          * XXX: This is a hack on the extent tree, maybe it should be
4971          * an op?
4972          */
4973         if (et->et_ops == &ocfs2_dinode_et_ops)
4974                 ocfs2_extent_map_trunc(inode, 0);
4975
4976         left_path = ocfs2_new_path_from_et(et);
4977         if (!left_path) {
4978                 ret = -ENOMEM;
4979                 mlog_errno(ret);
4980                 goto out;
4981         }
4982
4983         ret = ocfs2_find_path(inode, left_path, cpos);
4984         if (ret) {
4985                 mlog_errno(ret);
4986                 goto out;
4987         }
4988         el = path_leaf_el(left_path);
4989
4990         index = ocfs2_search_extent_list(el, cpos);
4991         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4992                 ocfs2_error(inode->i_sb,
4993                             "Inode %llu has an extent at cpos %u which can no "
4994                             "longer be found.\n",
4995                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4996                 ret = -EROFS;
4997                 goto out;
4998         }
4999
5000         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5001         split_rec.e_cpos = cpu_to_le32(cpos);
5002         split_rec.e_leaf_clusters = cpu_to_le16(len);
5003         split_rec.e_blkno = cpu_to_le64(start_blkno);
5004         split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
5005         split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
5006
5007         ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
5008                                           index, &split_rec, meta_ac,
5009                                           dealloc);
5010         if (ret)
5011                 mlog_errno(ret);
5012
5013 out:
5014         ocfs2_free_path(left_path);
5015         return ret;
5016 }
5017
5018 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5019                             handle_t *handle, struct ocfs2_path *path,
5020                             int index, u32 new_range,
5021                             struct ocfs2_alloc_context *meta_ac)
5022 {
5023         int ret, depth, credits = handle->h_buffer_credits;
5024         struct buffer_head *last_eb_bh = NULL;
5025         struct ocfs2_extent_block *eb;
5026         struct ocfs2_extent_list *rightmost_el, *el;
5027         struct ocfs2_extent_rec split_rec;
5028         struct ocfs2_extent_rec *rec;
5029         struct ocfs2_insert_type insert;
5030
5031         /*
5032          * Setup the record to split before we grow the tree.
5033          */
5034         el = path_leaf_el(path);
5035         rec = &el->l_recs[index];
5036         ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
5037
5038         depth = path->p_tree_depth;
5039         if (depth > 0) {
5040                 ret = ocfs2_read_extent_block(inode,
5041                                               ocfs2_et_get_last_eb_blk(et),
5042                                               &last_eb_bh);
5043                 if (ret < 0) {
5044                         mlog_errno(ret);
5045                         goto out;
5046                 }
5047
5048                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5049                 rightmost_el = &eb->h_list;
5050         } else
5051                 rightmost_el = path_leaf_el(path);
5052
5053         credits += path->p_tree_depth +
5054                    ocfs2_extend_meta_needed(et->et_root_el);
5055         ret = ocfs2_extend_trans(handle, credits);
5056         if (ret) {
5057                 mlog_errno(ret);
5058                 goto out;
5059         }
5060
5061         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5062             le16_to_cpu(rightmost_el->l_count)) {
5063                 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
5064                                       meta_ac);
5065                 if (ret) {
5066                         mlog_errno(ret);
5067                         goto out;
5068                 }
5069         }
5070
5071         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5072         insert.ins_appending = APPEND_NONE;
5073         insert.ins_contig = CONTIG_NONE;
5074         insert.ins_split = SPLIT_RIGHT;
5075         insert.ins_tree_depth = depth;
5076
5077         ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
5078         if (ret)
5079                 mlog_errno(ret);
5080
5081 out:
5082         brelse(last_eb_bh);
5083         return ret;
5084 }
5085
5086 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5087                               struct ocfs2_path *path, int index,
5088                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5089                               u32 cpos, u32 len,
5090                               struct ocfs2_extent_tree *et)
5091 {
5092         int ret;
5093         u32 left_cpos, rec_range, trunc_range;
5094         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5095         struct super_block *sb = inode->i_sb;
5096         struct ocfs2_path *left_path = NULL;
5097         struct ocfs2_extent_list *el = path_leaf_el(path);
5098         struct ocfs2_extent_rec *rec;
5099         struct ocfs2_extent_block *eb;
5100
5101         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5102                 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5103                 if (ret) {
5104                         mlog_errno(ret);
5105                         goto out;
5106                 }
5107
5108                 index--;
5109         }
5110
5111         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5112             path->p_tree_depth) {
5113                 /*
5114                  * Check whether this is the rightmost tree record. If
5115                  * we remove all of this record or part of its right
5116                  * edge then an update of the record lengths above it
5117                  * will be required.
5118                  */
5119                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5120                 if (eb->h_next_leaf_blk == 0)
5121                         is_rightmost_tree_rec = 1;
5122         }
5123
5124         rec = &el->l_recs[index];
5125         if (index == 0 && path->p_tree_depth &&
5126             le32_to_cpu(rec->e_cpos) == cpos) {
5127                 /*
5128                  * Changing the leftmost offset (via partial or whole
5129                  * record truncate) of an interior (or rightmost) path
5130                  * means we have to update the subtree that is formed
5131                  * by this leaf and the one to it's left.
5132                  *
5133                  * There are two cases we can skip:
5134                  *   1) Path is the leftmost one in our inode tree.
5135                  *   2) The leaf is rightmost and will be empty after
5136                  *      we remove the extent record - the rotate code
5137                  *      knows how to update the newly formed edge.
5138                  */
5139
5140                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5141                                                     &left_cpos);
5142                 if (ret) {
5143                         mlog_errno(ret);
5144                         goto out;
5145                 }
5146
5147                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5148                         left_path = ocfs2_new_path_from_path(path);
5149                         if (!left_path) {
5150                                 ret = -ENOMEM;
5151                                 mlog_errno(ret);
5152                                 goto out;
5153                         }
5154
5155                         ret = ocfs2_find_path(inode, left_path, left_cpos);
5156                         if (ret) {
5157                                 mlog_errno(ret);
5158                                 goto out;
5159                         }
5160                 }
5161         }
5162
5163         ret = ocfs2_extend_rotate_transaction(handle, 0,
5164                                               handle->h_buffer_credits,
5165                                               path);
5166         if (ret) {
5167                 mlog_errno(ret);
5168                 goto out;
5169         }
5170
5171         ret = ocfs2_journal_access_path(inode, handle, path);
5172         if (ret) {
5173                 mlog_errno(ret);
5174                 goto out;
5175         }
5176
5177         ret = ocfs2_journal_access_path(inode, handle, left_path);
5178         if (ret) {
5179                 mlog_errno(ret);
5180                 goto out;
5181         }
5182
5183         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5184         trunc_range = cpos + len;
5185
5186         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5187                 int next_free;
5188
5189                 memset(rec, 0, sizeof(*rec));
5190                 ocfs2_cleanup_merge(el, index);
5191                 wants_rotate = 1;
5192
5193                 next_free = le16_to_cpu(el->l_next_free_rec);
5194                 if (is_rightmost_tree_rec && next_free > 1) {
5195                         /*
5196                          * We skip the edge update if this path will
5197                          * be deleted by the rotate code.
5198                          */
5199                         rec = &el->l_recs[next_free - 1];
5200                         ocfs2_adjust_rightmost_records(inode, handle, path,
5201                                                        rec);
5202                 }
5203         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5204                 /* Remove leftmost portion of the record. */
5205                 le32_add_cpu(&rec->e_cpos, len);
5206                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5207                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5208         } else if (rec_range == trunc_range) {
5209                 /* Remove rightmost portion of the record */
5210                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5211                 if (is_rightmost_tree_rec)
5212                         ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5213         } else {
5214                 /* Caller should have trapped this. */
5215                 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5216                      "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5217                      le32_to_cpu(rec->e_cpos),
5218                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5219                 BUG();
5220         }
5221
5222         if (left_path) {
5223                 int subtree_index;
5224
5225                 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5226                 ocfs2_complete_edge_insert(inode, handle, left_path, path,
5227                                            subtree_index);
5228         }
5229
5230         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5231
5232         ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5233         if (ret) {
5234                 mlog_errno(ret);
5235                 goto out;
5236         }
5237
5238 out:
5239         ocfs2_free_path(left_path);
5240         return ret;
5241 }
5242
5243 int ocfs2_remove_extent(struct inode *inode,
5244                         struct ocfs2_extent_tree *et,
5245                         u32 cpos, u32 len, handle_t *handle,
5246                         struct ocfs2_alloc_context *meta_ac,
5247                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5248 {
5249         int ret, index;
5250         u32 rec_range, trunc_range;
5251         struct ocfs2_extent_rec *rec;
5252         struct ocfs2_extent_list *el;
5253         struct ocfs2_path *path = NULL;
5254
5255         ocfs2_extent_map_trunc(inode, 0);
5256
5257         path = ocfs2_new_path_from_et(et);
5258         if (!path) {
5259                 ret = -ENOMEM;
5260                 mlog_errno(ret);
5261                 goto out;
5262         }
5263
5264         ret = ocfs2_find_path(inode, path, cpos);
5265         if (ret) {
5266                 mlog_errno(ret);
5267                 goto out;
5268         }
5269
5270         el = path_leaf_el(path);
5271         index = ocfs2_search_extent_list(el, cpos);
5272         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5273                 ocfs2_error(inode->i_sb,
5274                             "Inode %llu has an extent at cpos %u which can no "
5275                             "longer be found.\n",
5276                             (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5277                 ret = -EROFS;
5278                 goto out;
5279         }
5280
5281         /*
5282          * We have 3 cases of extent removal:
5283          *   1) Range covers the entire extent rec
5284          *   2) Range begins or ends on one edge of the extent rec
5285          *   3) Range is in the middle of the extent rec (no shared edges)
5286          *
5287          * For case 1 we remove the extent rec and left rotate to
5288          * fill the hole.
5289          *
5290          * For case 2 we just shrink the existing extent rec, with a
5291          * tree update if the shrinking edge is also the edge of an
5292          * extent block.
5293          *
5294          * For case 3 we do a right split to turn the extent rec into
5295          * something case 2 can handle.
5296          */
5297         rec = &el->l_recs[index];
5298         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5299         trunc_range = cpos + len;
5300
5301         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5302
5303         mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5304              "(cpos %u, len %u)\n",
5305              (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5306              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5307
5308         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5309                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5310                                          cpos, len, et);
5311                 if (ret) {
5312                         mlog_errno(ret);
5313                         goto out;
5314                 }
5315         } else {
5316                 ret = ocfs2_split_tree(inode, et, handle, path, index,
5317                                        trunc_range, meta_ac);
5318                 if (ret) {
5319                         mlog_errno(ret);
5320                         goto out;
5321                 }
5322
5323                 /*
5324                  * The split could have manipulated the tree enough to
5325                  * move the record location, so we have to look for it again.
5326                  */
5327                 ocfs2_reinit_path(path, 1);
5328
5329                 ret = ocfs2_find_path(inode, path, cpos);
5330                 if (ret) {
5331                         mlog_errno(ret);
5332                         goto out;
5333                 }
5334
5335                 el = path_leaf_el(path);
5336                 index = ocfs2_search_extent_list(el, cpos);
5337                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5338                         ocfs2_error(inode->i_sb,
5339                                     "Inode %llu: split at cpos %u lost record.",
5340                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5341                                     cpos);
5342                         ret = -EROFS;
5343                         goto out;
5344                 }
5345
5346                 /*
5347                  * Double check our values here. If anything is fishy,
5348                  * it's easier to catch it at the top level.
5349                  */
5350                 rec = &el->l_recs[index];
5351                 rec_range = le32_to_cpu(rec->e_cpos) +
5352                         ocfs2_rec_clusters(el, rec);
5353                 if (rec_range != trunc_range) {
5354                         ocfs2_error(inode->i_sb,
5355                                     "Inode %llu: error after split at cpos %u"
5356                                     "trunc len %u, existing record is (%u,%u)",
5357                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
5358                                     cpos, len, le32_to_cpu(rec->e_cpos),
5359                                     ocfs2_rec_clusters(el, rec));
5360                         ret = -EROFS;
5361                         goto out;
5362                 }
5363
5364                 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5365                                          cpos, len, et);
5366                 if (ret) {
5367                         mlog_errno(ret);
5368                         goto out;
5369                 }
5370         }
5371
5372 out:
5373         ocfs2_free_path(path);
5374         return ret;
5375 }
5376
5377 int ocfs2_remove_btree_range(struct inode *inode,
5378                              struct ocfs2_extent_tree *et,
5379                              u32 cpos, u32 phys_cpos, u32 len,
5380                              struct ocfs2_cached_dealloc_ctxt *dealloc)
5381 {
5382         int ret;
5383         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5384         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5385         struct inode *tl_inode = osb->osb_tl_inode;
5386         handle_t *handle;
5387         struct ocfs2_alloc_context *meta_ac = NULL;
5388
5389         ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5390         if (ret) {
5391                 mlog_errno(ret);
5392                 return ret;
5393         }
5394
5395         mutex_lock(&tl_inode->i_mutex);
5396
5397         if (ocfs2_truncate_log_needs_flush(osb)) {
5398                 ret = __ocfs2_flush_truncate_log(osb);
5399                 if (ret < 0) {
5400                         mlog_errno(ret);
5401                         goto out;
5402                 }
5403         }
5404
5405         handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5406         if (IS_ERR(handle)) {
5407                 ret = PTR_ERR(handle);
5408                 mlog_errno(ret);
5409                 goto out;
5410         }
5411
5412         ret = ocfs2_et_root_journal_access(handle, inode, et,
5413                                            OCFS2_JOURNAL_ACCESS_WRITE);
5414         if (ret) {
5415                 mlog_errno(ret);
5416                 goto out;
5417         }
5418
5419         vfs_dq_free_space_nodirty(inode,
5420                                   ocfs2_clusters_to_bytes(inode->i_sb, len));
5421
5422         ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5423                                   dealloc);
5424         if (ret) {
5425                 mlog_errno(ret);
5426                 goto out_commit;
5427         }
5428
5429         ocfs2_et_update_clusters(inode, et, -len);
5430
5431         ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5432         if (ret) {
5433                 mlog_errno(ret);
5434                 goto out_commit;
5435         }
5436
5437         ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5438         if (ret)
5439                 mlog_errno(ret);
5440
5441 out_commit:
5442         ocfs2_commit_trans(osb, handle);
5443 out:
5444         mutex_unlock(&tl_inode->i_mutex);
5445
5446         if (meta_ac)
5447                 ocfs2_free_alloc_context(meta_ac);
5448
5449         return ret;
5450 }
5451
5452 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5453 {
5454         struct buffer_head *tl_bh = osb->osb_tl_bh;
5455         struct ocfs2_dinode *di;
5456         struct ocfs2_truncate_log *tl;
5457
5458         di = (struct ocfs2_dinode *) tl_bh->b_data;
5459         tl = &di->id2.i_dealloc;
5460
5461         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5462                         "slot %d, invalid truncate log parameters: used = "
5463                         "%u, count = %u\n", osb->slot_num,
5464                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5465         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5466 }
5467
5468 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5469                                            unsigned int new_start)
5470 {
5471         unsigned int tail_index;
5472         unsigned int current_tail;
5473
5474         /* No records, nothing to coalesce */
5475         if (!le16_to_cpu(tl->tl_used))
5476                 return 0;
5477
5478         tail_index = le16_to_cpu(tl->tl_used) - 1;
5479         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5480         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5481
5482         return current_tail == new_start;
5483 }
5484
5485 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5486                               handle_t *handle,
5487                               u64 start_blk,
5488                               unsigned int num_clusters)
5489 {
5490         int status, index;
5491         unsigned int start_cluster, tl_count;
5492         struct inode *tl_inode = osb->osb_tl_inode;
5493         struct buffer_head *tl_bh = osb->osb_tl_bh;
5494         struct ocfs2_dinode *di;
5495         struct ocfs2_truncate_log *tl;
5496
5497         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5498                    (unsigned long long)start_blk, num_clusters);
5499
5500         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5501
5502         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5503
5504         di = (struct ocfs2_dinode *) tl_bh->b_data;
5505
5506         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5507          * by the underlying call to ocfs2_read_inode_block(), so any
5508          * corruption is a code bug */
5509         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5510
5511         tl = &di->id2.i_dealloc;
5512         tl_count = le16_to_cpu(tl->tl_count);
5513         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5514                         tl_count == 0,
5515                         "Truncate record count on #%llu invalid "
5516                         "wanted %u, actual %u\n",
5517                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5518                         ocfs2_truncate_recs_per_inode(osb->sb),
5519                         le16_to_cpu(tl->tl_count));
5520
5521         /* Caller should have known to flush before calling us. */
5522         index = le16_to_cpu(tl->tl_used);
5523         if (index >= tl_count) {
5524                 status = -ENOSPC;
5525                 mlog_errno(status);
5526                 goto bail;
5527         }
5528
5529         status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5530                                          OCFS2_JOURNAL_ACCESS_WRITE);
5531         if (status < 0) {
5532                 mlog_errno(status);
5533                 goto bail;
5534         }
5535
5536         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5537              "%llu (index = %d)\n", num_clusters, start_cluster,
5538              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5539
5540         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5541                 /*
5542                  * Move index back to the record we are coalescing with.
5543                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5544                  */
5545                 index--;
5546
5547                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5548                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5549                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5550                      num_clusters);
5551         } else {
5552                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5553                 tl->tl_used = cpu_to_le16(index + 1);
5554         }
5555         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5556
5557         status = ocfs2_journal_dirty(handle, tl_bh);
5558         if (status < 0) {
5559                 mlog_errno(status);
5560                 goto bail;
5561         }
5562
5563 bail:
5564         mlog_exit(status);
5565         return status;
5566 }
5567
5568 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5569                                          handle_t *handle,
5570                                          struct inode *data_alloc_inode,
5571                                          struct buffer_head *data_alloc_bh)
5572 {
5573         int status = 0;
5574         int i;
5575         unsigned int num_clusters;
5576         u64 start_blk;
5577         struct ocfs2_truncate_rec rec;
5578         struct ocfs2_dinode *di;
5579         struct ocfs2_truncate_log *tl;
5580         struct inode *tl_inode = osb->osb_tl_inode;
5581         struct buffer_head *tl_bh = osb->osb_tl_bh;
5582
5583         mlog_entry_void();
5584
5585         di = (struct ocfs2_dinode *) tl_bh->b_data;
5586         tl = &di->id2.i_dealloc;
5587         i = le16_to_cpu(tl->tl_used) - 1;
5588         while (i >= 0) {
5589                 /* Caller has given us at least enough credits to
5590                  * update the truncate log dinode */
5591                 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5592                                                  OCFS2_JOURNAL_ACCESS_WRITE);
5593                 if (status < 0) {
5594                         mlog_errno(status);
5595                         goto bail;
5596                 }
5597
5598                 tl->tl_used = cpu_to_le16(i);
5599
5600                 status = ocfs2_journal_dirty(handle, tl_bh);
5601                 if (status < 0) {
5602                         mlog_errno(status);
5603                         goto bail;
5604                 }
5605
5606                 /* TODO: Perhaps we can calculate the bulk of the
5607                  * credits up front rather than extending like
5608                  * this. */
5609                 status = ocfs2_extend_trans(handle,
5610                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5611                 if (status < 0) {
5612                         mlog_errno(status);
5613                         goto bail;
5614                 }
5615
5616                 rec = tl->tl_recs[i];
5617                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5618                                                     le32_to_cpu(rec.t_start));
5619                 num_clusters = le32_to_cpu(rec.t_clusters);
5620
5621                 /* if start_blk is not set, we ignore the record as
5622                  * invalid. */
5623                 if (start_blk) {
5624                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5625                              i, le32_to_cpu(rec.t_start), num_clusters);
5626
5627                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5628                                                      data_alloc_bh, start_blk,
5629                                                      num_clusters);
5630                         if (status < 0) {
5631                                 mlog_errno(status);
5632                                 goto bail;
5633                         }
5634                 }
5635                 i--;
5636         }
5637
5638 bail:
5639         mlog_exit(status);
5640         return status;
5641 }
5642
5643 /* Expects you to already be holding tl_inode->i_mutex */
5644 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5645 {
5646         int status;
5647         unsigned int num_to_flush;
5648         handle_t *handle;
5649         struct inode *tl_inode = osb->osb_tl_inode;
5650         struct inode *data_alloc_inode = NULL;
5651         struct buffer_head *tl_bh = osb->osb_tl_bh;
5652         struct buffer_head *data_alloc_bh = NULL;
5653         struct ocfs2_dinode *di;
5654         struct ocfs2_truncate_log *tl;
5655
5656         mlog_entry_void();
5657
5658         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5659
5660         di = (struct ocfs2_dinode *) tl_bh->b_data;
5661
5662         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5663          * by the underlying call to ocfs2_read_inode_block(), so any
5664          * corruption is a code bug */
5665         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5666
5667         tl = &di->id2.i_dealloc;
5668         num_to_flush = le16_to_cpu(tl->tl_used);
5669         mlog(0, "Flush %u records from truncate log #%llu\n",
5670              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5671         if (!num_to_flush) {
5672                 status = 0;
5673                 goto out;
5674         }
5675
5676         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5677                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5678                                                        OCFS2_INVALID_SLOT);
5679         if (!data_alloc_inode) {
5680                 status = -EINVAL;
5681                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5682                 goto out;
5683         }
5684
5685         mutex_lock(&data_alloc_inode->i_mutex);
5686
5687         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5688         if (status < 0) {
5689                 mlog_errno(status);
5690                 goto out_mutex;
5691         }
5692
5693         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5694         if (IS_ERR(handle)) {
5695                 status = PTR_ERR(handle);
5696                 mlog_errno(status);
5697                 goto out_unlock;
5698         }
5699
5700         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5701                                                data_alloc_bh);
5702         if (status < 0)
5703                 mlog_errno(status);
5704
5705         ocfs2_commit_trans(osb, handle);
5706
5707 out_unlock:
5708         brelse(data_alloc_bh);
5709         ocfs2_inode_unlock(data_alloc_inode, 1);
5710
5711 out_mutex:
5712         mutex_unlock(&data_alloc_inode->i_mutex);
5713         iput(data_alloc_inode);
5714
5715 out:
5716         mlog_exit(status);
5717         return status;
5718 }
5719
5720 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5721 {
5722         int status;
5723         struct inode *tl_inode = osb->osb_tl_inode;
5724
5725         mutex_lock(&tl_inode->i_mutex);
5726         status = __ocfs2_flush_truncate_log(osb);
5727         mutex_unlock(&tl_inode->i_mutex);
5728
5729         return status;
5730 }
5731
5732 static void ocfs2_truncate_log_worker(struct work_struct *work)
5733 {
5734         int status;
5735         struct ocfs2_super *osb =
5736                 container_of(work, struct ocfs2_super,
5737                              osb_truncate_log_wq.work);
5738
5739         mlog_entry_void();
5740
5741         status = ocfs2_flush_truncate_log(osb);
5742         if (status < 0)
5743                 mlog_errno(status);
5744         else
5745                 ocfs2_init_inode_steal_slot(osb);
5746
5747         mlog_exit(status);
5748 }
5749
5750 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5751 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5752                                        int cancel)
5753 {
5754         if (osb->osb_tl_inode) {
5755                 /* We want to push off log flushes while truncates are
5756                  * still running. */
5757                 if (cancel)
5758                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5759
5760                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5761                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5762         }
5763 }
5764
5765 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5766                                        int slot_num,
5767                                        struct inode **tl_inode,
5768                                        struct buffer_head **tl_bh)
5769 {
5770         int status;
5771         struct inode *inode = NULL;
5772         struct buffer_head *bh = NULL;
5773
5774         inode = ocfs2_get_system_file_inode(osb,
5775                                            TRUNCATE_LOG_SYSTEM_INODE,
5776                                            slot_num);
5777         if (!inode) {
5778                 status = -EINVAL;
5779                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5780                 goto bail;
5781         }
5782
5783         status = ocfs2_read_inode_block(inode, &bh);
5784         if (status < 0) {
5785                 iput(inode);
5786                 mlog_errno(status);
5787                 goto bail;
5788         }
5789
5790         *tl_inode = inode;
5791         *tl_bh    = bh;
5792 bail:
5793         mlog_exit(status);
5794         return status;
5795 }
5796
5797 /* called during the 1st stage of node recovery. we stamp a clean
5798  * truncate log and pass back a copy for processing later. if the
5799  * truncate log does not require processing, a *tl_copy is set to
5800  * NULL. */
5801 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5802                                       int slot_num,
5803                                       struct ocfs2_dinode **tl_copy)
5804 {
5805         int status;
5806         struct inode *tl_inode = NULL;
5807         struct buffer_head *tl_bh = NULL;
5808         struct ocfs2_dinode *di;
5809         struct ocfs2_truncate_log *tl;
5810
5811         *tl_copy = NULL;
5812
5813         mlog(0, "recover truncate log from slot %d\n", slot_num);
5814
5815         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5816         if (status < 0) {
5817                 mlog_errno(status);
5818                 goto bail;
5819         }
5820
5821         di = (struct ocfs2_dinode *) tl_bh->b_data;
5822
5823         /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
5824          * validated by the underlying call to ocfs2_read_inode_block(),
5825          * so any corruption is a code bug */
5826         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5827
5828         tl = &di->id2.i_dealloc;
5829         if (le16_to_cpu(tl->tl_used)) {
5830                 mlog(0, "We'll have %u logs to recover\n",
5831                      le16_to_cpu(tl->tl_used));
5832
5833                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5834                 if (!(*tl_copy)) {
5835                         status = -ENOMEM;
5836                         mlog_errno(status);
5837                         goto bail;
5838                 }
5839
5840                 /* Assuming the write-out below goes well, this copy
5841                  * will be passed back to recovery for processing. */
5842                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5843
5844                 /* All we need to do to clear the truncate log is set
5845                  * tl_used. */
5846                 tl->tl_used = 0;
5847
5848                 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
5849                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5850                 if (status < 0) {
5851                         mlog_errno(status);
5852                         goto bail;
5853                 }
5854         }
5855
5856 bail:
5857         if (tl_inode)
5858                 iput(tl_inode);
5859         brelse(tl_bh);
5860
5861         if (status < 0 && (*tl_copy)) {
5862                 kfree(*tl_copy);
5863                 *tl_copy = NULL;
5864         }
5865
5866         mlog_exit(status);
5867         return status;
5868 }
5869
5870 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5871                                          struct ocfs2_dinode *tl_copy)
5872 {
5873         int status = 0;
5874         int i;
5875         unsigned int clusters, num_recs, start_cluster;
5876         u64 start_blk;
5877         handle_t *handle;
5878         struct inode *tl_inode = osb->osb_tl_inode;
5879         struct ocfs2_truncate_log *tl;
5880
5881         mlog_entry_void();
5882
5883         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5884                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5885                 return -EINVAL;
5886         }
5887
5888         tl = &tl_copy->id2.i_dealloc;
5889         num_recs = le16_to_cpu(tl->tl_used);
5890         mlog(0, "cleanup %u records from %llu\n", num_recs,
5891              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5892
5893         mutex_lock(&tl_inode->i_mutex);
5894         for(i = 0; i < num_recs; i++) {
5895                 if (ocfs2_truncate_log_needs_flush(osb)) {
5896                         status = __ocfs2_flush_truncate_log(osb);
5897                         if (status < 0) {
5898                                 mlog_errno(status);
5899                                 goto bail_up;
5900                         }
5901                 }
5902
5903                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5904                 if (IS_ERR(handle)) {
5905                         status = PTR_ERR(handle);
5906                         mlog_errno(status);
5907                         goto bail_up;
5908                 }
5909
5910                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5911                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5912                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5913
5914                 status = ocfs2_truncate_log_append(osb, handle,
5915                                                    start_blk, clusters);
5916                 ocfs2_commit_trans(osb, handle);
5917                 if (status < 0) {
5918                         mlog_errno(status);
5919                         goto bail_up;
5920                 }
5921         }
5922
5923 bail_up:
5924         mutex_unlock(&tl_inode->i_mutex);
5925
5926         mlog_exit(status);
5927         return status;
5928 }
5929
5930 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5931 {
5932         int status;
5933         struct inode *tl_inode = osb->osb_tl_inode;
5934
5935         mlog_entry_void();
5936
5937         if (tl_inode) {
5938                 cancel_delayed_work(&osb->osb_truncate_log_wq);
5939                 flush_workqueue(ocfs2_wq);
5940
5941                 status = ocfs2_flush_truncate_log(osb);
5942                 if (status < 0)
5943                         mlog_errno(status);
5944
5945                 brelse(osb->osb_tl_bh);
5946                 iput(osb->osb_tl_inode);
5947         }
5948
5949         mlog_exit_void();
5950 }
5951
5952 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5953 {
5954         int status;
5955         struct inode *tl_inode = NULL;
5956         struct buffer_head *tl_bh = NULL;
5957
5958         mlog_entry_void();
5959
5960         status = ocfs2_get_truncate_log_info(osb,
5961                                              osb->slot_num,
5962                                              &tl_inode,
5963                                              &tl_bh);
5964         if (status < 0)
5965                 mlog_errno(status);
5966
5967         /* ocfs2_truncate_log_shutdown keys on the existence of
5968          * osb->osb_tl_inode so we don't set any of the osb variables
5969          * until we're sure all is well. */
5970         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5971                           ocfs2_truncate_log_worker);
5972         osb->osb_tl_bh    = tl_bh;
5973         osb->osb_tl_inode = tl_inode;
5974
5975         mlog_exit(status);
5976         return status;
5977 }
5978
5979 /*
5980  * Delayed de-allocation of suballocator blocks.
5981  *
5982  * Some sets of block de-allocations might involve multiple suballocator inodes.
5983  *
5984  * The locking for this can get extremely complicated, especially when
5985  * the suballocator inodes to delete from aren't known until deep
5986  * within an unrelated codepath.
5987  *
5988  * ocfs2_extent_block structures are a good example of this - an inode
5989  * btree could have been grown by any number of nodes each allocating
5990  * out of their own suballoc inode.
5991  *
5992  * These structures allow the delay of block de-allocation until a
5993  * later time, when locking of multiple cluster inodes won't cause
5994  * deadlock.
5995  */
5996
5997 /*
5998  * Describe a single bit freed from a suballocator.  For the block
5999  * suballocators, it represents one block.  For the global cluster
6000  * allocator, it represents some clusters and free_bit indicates
6001  * clusters number.
6002  */
6003 struct ocfs2_cached_block_free {
6004         struct ocfs2_cached_block_free          *free_next;
6005         u64                                     free_blk;
6006         unsigned int                            free_bit;
6007 };
6008
6009 struct ocfs2_per_slot_free_list {
6010         struct ocfs2_per_slot_free_list         *f_next_suballocator;
6011         int                                     f_inode_type;
6012         int                                     f_slot;
6013         struct ocfs2_cached_block_free          *f_first;
6014 };
6015
6016 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6017                                     int sysfile_type,
6018                                     int slot,
6019                                     struct ocfs2_cached_block_free *head)
6020 {
6021         int ret;
6022         u64 bg_blkno;
6023         handle_t *handle;
6024         struct inode *inode;
6025         struct buffer_head *di_bh = NULL;
6026         struct ocfs2_cached_block_free *tmp;
6027
6028         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6029         if (!inode) {
6030                 ret = -EINVAL;
6031                 mlog_errno(ret);
6032                 goto out;
6033         }
6034
6035         mutex_lock(&inode->i_mutex);
6036
6037         ret = ocfs2_inode_lock(inode, &di_bh, 1);
6038         if (ret) {
6039                 mlog_errno(ret);
6040                 goto out_mutex;
6041         }
6042
6043         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6044         if (IS_ERR(handle)) {
6045                 ret = PTR_ERR(handle);
6046                 mlog_errno(ret);
6047                 goto out_unlock;
6048         }
6049
6050         while (head) {
6051                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6052                                                       head->free_bit);
6053                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6054                      head->free_bit, (unsigned long long)head->free_blk);
6055
6056                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6057                                                head->free_bit, bg_blkno, 1);
6058                 if (ret) {
6059                         mlog_errno(ret);
6060                         goto out_journal;
6061                 }
6062
6063                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
6064                 if (ret) {
6065                         mlog_errno(ret);
6066                         goto out_journal;
6067                 }
6068
6069                 tmp = head;
6070                 head = head->free_next;
6071                 kfree(tmp);
6072         }
6073
6074 out_journal:
6075         ocfs2_commit_trans(osb, handle);
6076
6077 out_unlock:
6078         ocfs2_inode_unlock(inode, 1);
6079         brelse(di_bh);
6080 out_mutex:
6081         mutex_unlock(&inode->i_mutex);
6082         iput(inode);
6083 out:
6084         while(head) {
6085                 /* Premature exit may have left some dangling items. */
6086                 tmp = head;
6087                 head = head->free_next;
6088                 kfree(tmp);
6089         }
6090
6091         return ret;
6092 }
6093
6094 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6095                                 u64 blkno, unsigned int bit)
6096 {
6097         int ret = 0;
6098         struct ocfs2_cached_block_free *item;
6099
6100         item = kmalloc(sizeof(*item), GFP_NOFS);
6101         if (item == NULL) {
6102                 ret = -ENOMEM;
6103                 mlog_errno(ret);
6104                 return ret;
6105         }
6106
6107         mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6108              bit, (unsigned long long)blkno);
6109
6110         item->free_blk = blkno;
6111         item->free_bit = bit;
6112         item->free_next = ctxt->c_global_allocator;
6113
6114         ctxt->c_global_allocator = item;
6115         return ret;
6116 }
6117
6118 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6119                                       struct ocfs2_cached_block_free *head)
6120 {
6121         struct ocfs2_cached_block_free *tmp;
6122         struct inode *tl_inode = osb->osb_tl_inode;
6123         handle_t *handle;
6124         int ret = 0;
6125
6126         mutex_lock(&tl_inode->i_mutex);
6127
6128         while (head) {
6129                 if (ocfs2_truncate_log_needs_flush(osb)) {
6130                         ret = __ocfs2_flush_truncate_log(osb);
6131                         if (ret < 0) {
6132                                 mlog_errno(ret);
6133                                 break;
6134                         }
6135                 }
6136
6137                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6138                 if (IS_ERR(handle)) {
6139                         ret = PTR_ERR(handle);
6140                         mlog_errno(ret);
6141                         break;
6142                 }
6143
6144                 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6145                                                 head->free_bit);
6146
6147                 ocfs2_commit_trans(osb, handle);
6148                 tmp = head;
6149                 head = head->free_next;
6150                 kfree(tmp);
6151
6152                 if (ret < 0) {
6153                         mlog_errno(ret);
6154                         break;
6155                 }
6156         }
6157
6158         mutex_unlock(&tl_inode->i_mutex);
6159
6160         while (head) {
6161                 /* Premature exit may have left some dangling items. */
6162                 tmp = head;
6163                 head = head->free_next;
6164                 kfree(tmp);
6165         }
6166
6167         return ret;
6168 }
6169
6170 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6171                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6172 {
6173         int ret = 0, ret2;
6174         struct ocfs2_per_slot_free_list *fl;
6175
6176         if (!ctxt)
6177                 return 0;
6178
6179         while (ctxt->c_first_suballocator) {
6180                 fl = ctxt->c_first_suballocator;
6181
6182                 if (fl->f_first) {
6183                         mlog(0, "Free items: (type %u, slot %d)\n",
6184                              fl->f_inode_type, fl->f_slot);
6185                         ret2 = ocfs2_free_cached_blocks(osb,
6186                                                         fl->f_inode_type,
6187                                                         fl->f_slot,
6188                                                         fl->f_first);
6189                         if (ret2)
6190                                 mlog_errno(ret2);
6191                         if (!ret)
6192                                 ret = ret2;
6193                 }
6194
6195                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6196                 kfree(fl);
6197         }
6198
6199         if (ctxt->c_global_allocator) {
6200                 ret2 = ocfs2_free_cached_clusters(osb,
6201                                                   ctxt->c_global_allocator);
6202                 if (ret2)
6203                         mlog_errno(ret2);
6204                 if (!ret)
6205                         ret = ret2;
6206
6207                 ctxt->c_global_allocator = NULL;
6208         }
6209
6210         return ret;
6211 }
6212
6213 static struct ocfs2_per_slot_free_list *
6214 ocfs2_find_per_slot_free_list(int type,
6215                               int slot,
6216                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6217 {
6218         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6219
6220         while (fl) {
6221                 if (fl->f_inode_type == type && fl->f_slot == slot)
6222                         return fl;
6223
6224                 fl = fl->f_next_suballocator;
6225         }
6226
6227         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6228         if (fl) {
6229                 fl->f_inode_type = type;
6230                 fl->f_slot = slot;
6231                 fl->f_first = NULL;
6232                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6233
6234                 ctxt->c_first_suballocator = fl;
6235         }
6236         return fl;
6237 }
6238
6239 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6240                                      int type, int slot, u64 blkno,
6241                                      unsigned int bit)
6242 {
6243         int ret;
6244         struct ocfs2_per_slot_free_list *fl;
6245         struct ocfs2_cached_block_free *item;
6246
6247         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6248         if (fl == NULL) {
6249                 ret = -ENOMEM;
6250                 mlog_errno(ret);
6251                 goto out;
6252         }
6253
6254         item = kmalloc(sizeof(*item), GFP_NOFS);
6255         if (item == NULL) {
6256                 ret = -ENOMEM;
6257                 mlog_errno(ret);
6258                 goto out;
6259         }
6260
6261         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6262              type, slot, bit, (unsigned long long)blkno);
6263
6264         item->free_blk = blkno;
6265         item->free_bit = bit;
6266         item->free_next = fl->f_first;
6267
6268         fl->f_first = item;
6269
6270         ret = 0;
6271 out:
6272         return ret;
6273 }
6274
6275 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6276                                          struct ocfs2_extent_block *eb)
6277 {
6278         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6279                                          le16_to_cpu(eb->h_suballoc_slot),
6280                                          le64_to_cpu(eb->h_blkno),
6281                                          le16_to_cpu(eb->h_suballoc_bit));
6282 }
6283
6284 /* This function will figure out whether the currently last extent
6285  * block will be deleted, and if it will, what the new last extent
6286  * block will be so we can update his h_next_leaf_blk field, as well
6287  * as the dinodes i_last_eb_blk */
6288 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6289                                        unsigned int clusters_to_del,
6290                                        struct ocfs2_path *path,
6291                                        struct buffer_head **new_last_eb)
6292 {
6293         int next_free, ret = 0;
6294         u32 cpos;
6295         struct ocfs2_extent_rec *rec;
6296         struct ocfs2_extent_block *eb;
6297         struct ocfs2_extent_list *el;
6298         struct buffer_head *bh = NULL;
6299
6300         *new_last_eb = NULL;
6301
6302         /* we have no tree, so of course, no last_eb. */
6303         if (!path->p_tree_depth)
6304                 goto out;
6305
6306         /* trunc to zero special case - this makes tree_depth = 0
6307          * regardless of what it is.  */
6308         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6309                 goto out;
6310
6311         el = path_leaf_el(path);
6312         BUG_ON(!el->l_next_free_rec);
6313
6314         /*
6315          * Make sure that this extent list will actually be empty
6316          * after we clear away the data. We can shortcut out if
6317          * there's more than one non-empty extent in the
6318          * list. Otherwise, a check of the remaining extent is
6319          * necessary.
6320          */
6321         next_free = le16_to_cpu(el->l_next_free_rec);
6322         rec = NULL;
6323         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6324                 if (next_free > 2)
6325                         goto out;
6326
6327                 /* We may have a valid extent in index 1, check it. */
6328                 if (next_free == 2)
6329                         rec = &el->l_recs[1];
6330
6331                 /*
6332                  * Fall through - no more nonempty extents, so we want
6333                  * to delete this leaf.
6334                  */
6335         } else {
6336                 if (next_free > 1)
6337                         goto out;
6338
6339                 rec = &el->l_recs[0];
6340         }
6341
6342         if (rec) {
6343                 /*
6344                  * Check it we'll only be trimming off the end of this
6345                  * cluster.
6346                  */
6347                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6348                         goto out;
6349         }
6350
6351         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6352         if (ret) {
6353                 mlog_errno(ret);
6354                 goto out;
6355         }
6356
6357         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6358         if (ret) {
6359                 mlog_errno(ret);
6360                 goto out;
6361         }
6362
6363         eb = (struct ocfs2_extent_block *) bh->b_data;
6364         el = &eb->h_list;
6365
6366         /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6367          * Any corruption is a code bug. */
6368         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6369
6370         *new_last_eb = bh;
6371         get_bh(*new_last_eb);
6372         mlog(0, "returning block %llu, (cpos: %u)\n",
6373              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6374 out:
6375         brelse(bh);
6376
6377         return ret;
6378 }
6379
6380 /*
6381  * Trim some clusters off the rightmost edge of a tree. Only called
6382  * during truncate.
6383  *
6384  * The caller needs to:
6385  *   - start journaling of each path component.
6386  *   - compute and fully set up any new last ext block
6387  */
6388 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6389                            handle_t *handle, struct ocfs2_truncate_context *tc,
6390                            u32 clusters_to_del, u64 *delete_start)
6391 {
6392         int ret, i, index = path->p_tree_depth;
6393         u32 new_edge = 0;
6394         u64 deleted_eb = 0;
6395         struct buffer_head *bh;
6396         struct ocfs2_extent_list *el;
6397         struct ocfs2_extent_rec *rec;
6398
6399         *delete_start = 0;
6400
6401         while (index >= 0) {
6402                 bh = path->p_node[index].bh;
6403                 el = path->p_node[index].el;
6404
6405                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6406                      index,  (unsigned long long)bh->b_blocknr);
6407
6408                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6409
6410                 if (index !=
6411                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6412                         ocfs2_error(inode->i_sb,
6413                                     "Inode %lu has invalid ext. block %llu",
6414                                     inode->i_ino,
6415                                     (unsigned long long)bh->b_blocknr);
6416                         ret = -EROFS;
6417                         goto out;
6418                 }
6419
6420 find_tail_record:
6421                 i = le16_to_cpu(el->l_next_free_rec) - 1;
6422                 rec = &el->l_recs[i];
6423
6424                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6425                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6426                      ocfs2_rec_clusters(el, rec),
6427                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6428                      le16_to_cpu(el->l_next_free_rec));
6429
6430                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6431
6432                 if (le16_to_cpu(el->l_tree_depth) == 0) {
6433                         /*
6434                          * If the leaf block contains a single empty
6435                          * extent and no records, we can just remove
6436                          * the block.
6437                          */
6438                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
6439                                 memset(rec, 0,
6440                                        sizeof(struct ocfs2_extent_rec));
6441                                 el->l_next_free_rec = cpu_to_le16(0);
6442
6443                                 goto delete;
6444                         }
6445
6446                         /*
6447                          * Remove any empty extents by shifting things
6448                          * left. That should make life much easier on
6449                          * the code below. This condition is rare
6450                          * enough that we shouldn't see a performance
6451                          * hit.
6452                          */
6453                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6454                                 le16_add_cpu(&el->l_next_free_rec, -1);
6455
6456                                 for(i = 0;
6457                                     i < le16_to_cpu(el->l_next_free_rec); i++)
6458                                         el->l_recs[i] = el->l_recs[i + 1];
6459
6460                                 memset(&el->l_recs[i], 0,
6461                                        sizeof(struct ocfs2_extent_rec));
6462
6463                                 /*
6464                                  * We've modified our extent list. The
6465                                  * simplest way to handle this change
6466                                  * is to being the search from the
6467                                  * start again.
6468                                  */
6469                                 goto find_tail_record;
6470                         }
6471
6472                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6473
6474                         /*
6475                          * We'll use "new_edge" on our way back up the
6476                          * tree to know what our rightmost cpos is.
6477                          */
6478                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
6479                         new_edge += le32_to_cpu(rec->e_cpos);
6480
6481                         /*
6482                          * The caller will use this to delete data blocks.
6483                          */
6484                         *delete_start = le64_to_cpu(rec->e_blkno)
6485                                 + ocfs2_clusters_to_blocks(inode->i_sb,
6486                                         le16_to_cpu(rec->e_leaf_clusters));
6487
6488                         /*
6489                          * If it's now empty, remove this record.
6490                          */
6491                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6492                                 memset(rec, 0,
6493                                        sizeof(struct ocfs2_extent_rec));
6494                                 le16_add_cpu(&el->l_next_free_rec, -1);
6495                         }
6496                 } else {
6497                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6498                                 memset(rec, 0,
6499                                        sizeof(struct ocfs2_extent_rec));
6500                                 le16_add_cpu(&el->l_next_free_rec, -1);
6501
6502                                 goto delete;
6503                         }
6504
6505                         /* Can this actually happen? */
6506                         if (le16_to_cpu(el->l_next_free_rec) == 0)
6507                                 goto delete;
6508
6509                         /*
6510                          * We never actually deleted any clusters
6511                          * because our leaf was empty. There's no
6512                          * reason to adjust the rightmost edge then.
6513                          */
6514                         if (new_edge == 0)
6515                                 goto delete;
6516
6517                         rec->e_int_clusters = cpu_to_le32(new_edge);
6518                         le32_add_cpu(&rec->e_int_clusters,
6519                                      -le32_to_cpu(rec->e_cpos));
6520
6521                          /*
6522                           * A deleted child record should have been
6523                           * caught above.
6524                           */
6525                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6526                 }
6527
6528 delete:
6529                 ret = ocfs2_journal_dirty(handle, bh);
6530                 if (ret) {
6531                         mlog_errno(ret);
6532                         goto out;
6533                 }
6534
6535                 mlog(0, "extent list container %llu, after: record %d: "
6536                      "(%u, %u, %llu), next = %u.\n",
6537                      (unsigned long long)bh->b_blocknr, i,
6538                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6539                      (unsigned long long)le64_to_cpu(rec->e_blkno),
6540                      le16_to_cpu(el->l_next_free_rec));
6541
6542                 /*
6543                  * We must be careful to only attempt delete of an
6544                  * extent block (and not the root inode block).
6545                  */
6546                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6547                         struct ocfs2_extent_block *eb =
6548                                 (struct ocfs2_extent_block *)bh->b_data;
6549
6550                         /*
6551                          * Save this for use when processing the
6552                          * parent block.
6553                          */
6554                         deleted_eb = le64_to_cpu(eb->h_blkno);
6555
6556                         mlog(0, "deleting this extent block.\n");
6557
6558                         ocfs2_remove_from_cache(inode, bh);
6559
6560                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6561                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6562                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6563
6564                         ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6565                         /* An error here is not fatal. */
6566                         if (ret < 0)
6567                                 mlog_errno(ret);
6568                 } else {
6569                         deleted_eb = 0;
6570                 }
6571
6572                 index--;
6573         }
6574
6575         ret = 0;
6576 out:
6577         return ret;
6578 }
6579
6580 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6581                              unsigned int clusters_to_del,
6582                              struct inode *inode,
6583                              struct buffer_head *fe_bh,
6584                              handle_t *handle,
6585                              struct ocfs2_truncate_context *tc,
6586                              struct ocfs2_path *path)
6587 {
6588         int status;
6589         struct ocfs2_dinode *fe;
6590         struct ocfs2_extent_block *last_eb = NULL;
6591         struct ocfs2_extent_list *el;
6592         struct buffer_head *last_eb_bh = NULL;
6593         u64 delete_blk = 0;
6594
6595         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6596
6597         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6598                                              path, &last_eb_bh);
6599         if (status < 0) {
6600                 mlog_errno(status);
6601                 goto bail;
6602         }
6603
6604         /*
6605          * Each component will be touched, so we might as well journal
6606          * here to avoid having to handle errors later.
6607          */
6608         status = ocfs2_journal_access_path(inode, handle, path);
6609         if (status < 0) {
6610                 mlog_errno(status);
6611                 goto bail;
6612         }
6613
6614         if (last_eb_bh) {
6615                 status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
6616                                                  OCFS2_JOURNAL_ACCESS_WRITE);
6617                 if (status < 0) {
6618                         mlog_errno(status);
6619                         goto bail;
6620                 }
6621
6622                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6623         }
6624
6625         el = &(fe->id2.i_list);
6626
6627         /*
6628          * Lower levels depend on this never happening, but it's best
6629          * to check it up here before changing the tree.
6630          */
6631         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6632                 ocfs2_error(inode->i_sb,
6633                             "Inode %lu has an empty extent record, depth %u\n",
6634                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6635                 status = -EROFS;
6636                 goto bail;
6637         }
6638
6639         vfs_dq_free_space_nodirty(inode,
6640                         ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6641         spin_lock(&OCFS2_I(inode)->ip_lock);
6642         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6643                                       clusters_to_del;
6644         spin_unlock(&OCFS2_I(inode)->ip_lock);
6645         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6646         inode->i_blocks = ocfs2_inode_sector_count(inode);
6647
6648         status = ocfs2_trim_tree(inode, path, handle, tc,
6649                                  clusters_to_del, &delete_blk);
6650         if (status) {
6651                 mlog_errno(status);
6652                 goto bail;
6653         }
6654
6655         if (le32_to_cpu(fe->i_clusters) == 0) {
6656                 /* trunc to zero is a special case. */
6657                 el->l_tree_depth = 0;
6658                 fe->i_last_eb_blk = 0;
6659         } else if (last_eb)
6660                 fe->i_last_eb_blk = last_eb->h_blkno;
6661
6662         status = ocfs2_journal_dirty(handle, fe_bh);
6663         if (status < 0) {
6664                 mlog_errno(status);
6665                 goto bail;
6666         }
6667
6668         if (last_eb) {
6669                 /* If there will be a new last extent block, then by
6670                  * definition, there cannot be any leaves to the right of
6671                  * him. */
6672                 last_eb->h_next_leaf_blk = 0;
6673                 status = ocfs2_journal_dirty(handle, last_eb_bh);
6674                 if (status < 0) {
6675                         mlog_errno(status);
6676                         goto bail;
6677                 }
6678         }
6679
6680         if (delete_blk) {
6681                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6682                                                    clusters_to_del);
6683                 if (status < 0) {
6684                         mlog_errno(status);
6685                         goto bail;
6686                 }
6687         }
6688         status = 0;
6689 bail:
6690
6691         mlog_exit(status);
6692         return status;
6693 }
6694
6695 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6696 {
6697         set_buffer_uptodate(bh);
6698         mark_buffer_dirty(bh);
6699         return 0;
6700 }
6701
6702 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6703                                      unsigned int from, unsigned int to,
6704                                      struct page *page, int zero, u64 *phys)
6705 {
6706         int ret, partial = 0;
6707
6708         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6709         if (ret)
6710                 mlog_errno(ret);
6711
6712         if (zero)
6713                 zero_user_segment(page, from, to);
6714
6715         /*
6716          * Need to set the buffers we zero'd into uptodate
6717          * here if they aren't - ocfs2_map_page_blocks()
6718          * might've skipped some
6719          */
6720         ret = walk_page_buffers(handle, page_buffers(page),
6721                                 from, to, &partial,
6722                                 ocfs2_zero_func);
6723         if (ret < 0)
6724                 mlog_errno(ret);
6725         else if (ocfs2_should_order_data(inode)) {
6726                 ret = ocfs2_jbd2_file_inode(handle, inode);
6727                 if (ret < 0)
6728                         mlog_errno(ret);
6729         }
6730
6731         if (!partial)
6732                 SetPageUptodate(page);
6733
6734         flush_dcache_page(page);
6735 }
6736
6737 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6738                                      loff_t end, struct page **pages,
6739                                      int numpages, u64 phys, handle_t *handle)
6740 {
6741         int i;
6742         struct page *page;
6743         unsigned int from, to = PAGE_CACHE_SIZE;
6744         struct super_block *sb = inode->i_sb;
6745
6746         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6747
6748         if (numpages == 0)
6749                 goto out;
6750
6751         to = PAGE_CACHE_SIZE;
6752         for(i = 0; i < numpages; i++) {
6753                 page = pages[i];
6754
6755                 from = start & (PAGE_CACHE_SIZE - 1);
6756                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6757                         to = end & (PAGE_CACHE_SIZE - 1);
6758
6759                 BUG_ON(from > PAGE_CACHE_SIZE);
6760                 BUG_ON(to > PAGE_CACHE_SIZE);
6761
6762                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6763                                          &phys);
6764
6765                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6766         }
6767 out:
6768         if (pages)
6769                 ocfs2_unlock_and_free_pages(pages, numpages);
6770 }
6771
6772 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6773                                 struct page **pages, int *num)
6774 {
6775         int numpages, ret = 0;
6776         struct super_block *sb = inode->i_sb;
6777         struct address_space *mapping = inode->i_mapping;
6778         unsigned long index;
6779         loff_t last_page_bytes;
6780
6781         BUG_ON(start > end);
6782
6783         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6784                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6785
6786         numpages = 0;
6787         last_page_bytes = PAGE_ALIGN(end);
6788         index = start >> PAGE_CACHE_SHIFT;
6789         do {
6790                 pages[numpages] = grab_cache_page(mapping, index);
6791                 if (!pages[numpages]) {
6792                         ret = -ENOMEM;
6793                         mlog_errno(ret);
6794                         goto out;
6795                 }
6796
6797                 numpages++;
6798                 index++;
6799         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6800
6801 out:
6802         if (ret != 0) {
6803                 if (pages)
6804                         ocfs2_unlock_and_free_pages(pages, numpages);
6805                 numpages = 0;
6806         }
6807
6808         *num = numpages;
6809
6810         return ret;
6811 }
6812
6813 /*
6814  * Zero the area past i_size but still within an allocated
6815  * cluster. This avoids exposing nonzero data on subsequent file
6816  * extends.
6817  *
6818  * We need to call this before i_size is updated on the inode because
6819  * otherwise block_write_full_page() will skip writeout of pages past
6820  * i_size. The new_i_size parameter is passed for this reason.
6821  */
6822 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6823                                   u64 range_start, u64 range_end)
6824 {
6825         int ret = 0, numpages;
6826         struct page **pages = NULL;
6827         u64 phys;
6828         unsigned int ext_flags;
6829         struct super_block *sb = inode->i_sb;
6830
6831         /*
6832          * File systems which don't support sparse files zero on every
6833          * extend.
6834          */
6835         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6836                 return 0;
6837
6838         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6839                         sizeof(struct page *), GFP_NOFS);
6840         if (pages == NULL) {
6841                 ret = -ENOMEM;
6842                 mlog_errno(ret);
6843                 goto out;
6844         }
6845
6846         if (range_start == range_end)
6847                 goto out;
6848
6849         ret = ocfs2_extent_map_get_blocks(inode,
6850                                           range_start >> sb->s_blocksize_bits,
6851                                           &phys, NULL, &ext_flags);
6852         if (ret) {
6853                 mlog_errno(ret);
6854                 goto out;
6855         }
6856
6857         /*
6858          * Tail is a hole, or is marked unwritten. In either case, we
6859          * can count on read and write to return/push zero's.
6860          */
6861         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6862                 goto out;
6863
6864         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6865                                    &numpages);
6866         if (ret) {
6867                 mlog_errno(ret);
6868                 goto out;
6869         }
6870
6871         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6872                                  numpages, phys, handle);
6873
6874         /*
6875          * Initiate writeout of the pages we zero'd here. We don't
6876          * wait on them - the truncate_inode_pages() call later will
6877          * do that for us.
6878          */
6879         ret = do_sync_mapping_range(inode->i_mapping, range_start,
6880                                     range_end - 1, SYNC_FILE_RANGE_WRITE);
6881         if (ret)
6882                 mlog_errno(ret);
6883
6884 out:
6885         if (pages)
6886                 kfree(pages);
6887
6888         return ret;
6889 }
6890
6891 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6892                                              struct ocfs2_dinode *di)
6893 {
6894         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6895         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6896
6897         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6898                 memset(&di->id2, 0, blocksize -
6899                                     offsetof(struct ocfs2_dinode, id2) -
6900                                     xattrsize);
6901         else
6902                 memset(&di->id2, 0, blocksize -
6903                                     offsetof(struct ocfs2_dinode, id2));
6904 }
6905
6906 void ocfs2_dinode_new_extent_list(struct inode *inode,
6907                                   struct ocfs2_dinode *di)
6908 {
6909         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6910         di->id2.i_list.l_tree_depth = 0;
6911         di->id2.i_list.l_next_free_rec = 0;
6912         di->id2.i_list.l_count = cpu_to_le16(
6913                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6914 }
6915
6916 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6917 {
6918         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6919         struct ocfs2_inline_data *idata = &di->id2.i_data;
6920
6921         spin_lock(&oi->ip_lock);
6922         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6923         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6924         spin_unlock(&oi->ip_lock);
6925
6926         /*
6927          * We clear the entire i_data structure here so that all
6928          * fields can be properly initialized.
6929          */
6930         ocfs2_zero_dinode_id2_with_xattr(inode, di);
6931
6932         idata->id_count = cpu_to_le16(
6933                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6934 }
6935
6936 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6937                                          struct buffer_head *di_bh)
6938 {
6939         int ret, i, has_data, num_pages = 0;
6940         handle_t *handle;
6941         u64 uninitialized_var(block);
6942         struct ocfs2_inode_info *oi = OCFS2_I(inode);
6943         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6944         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6945         struct ocfs2_alloc_context *data_ac = NULL;
6946         struct page **pages = NULL;
6947         loff_t end = osb->s_clustersize;
6948         struct ocfs2_extent_tree et;
6949         int did_quota = 0;
6950
6951         has_data = i_size_read(inode) ? 1 : 0;
6952
6953         if (has_data) {
6954                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
6955                                 sizeof(struct page *), GFP_NOFS);
6956                 if (pages == NULL) {
6957                         ret = -ENOMEM;
6958                         mlog_errno(ret);
6959                         goto out;
6960                 }
6961
6962                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
6963                 if (ret) {
6964                         mlog_errno(ret);
6965                         goto out;
6966                 }
6967         }
6968
6969         handle = ocfs2_start_trans(osb,
6970                                    ocfs2_inline_to_extents_credits(osb->sb));
6971         if (IS_ERR(handle)) {
6972                 ret = PTR_ERR(handle);
6973                 mlog_errno(ret);
6974                 goto out_unlock;
6975         }
6976
6977         ret = ocfs2_journal_access_di(handle, inode, di_bh,
6978                                       OCFS2_JOURNAL_ACCESS_WRITE);
6979         if (ret) {
6980                 mlog_errno(ret);
6981                 goto out_commit;
6982         }
6983
6984         if (has_data) {
6985                 u32 bit_off, num;
6986                 unsigned int page_end;
6987                 u64 phys;
6988
6989                 if (vfs_dq_alloc_space_nodirty(inode,
6990                                        ocfs2_clusters_to_bytes(osb->sb, 1))) {
6991                         ret = -EDQUOT;
6992                         goto out_commit;
6993                 }
6994                 did_quota = 1;
6995
6996                 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6997                                            &num);
6998                 if (ret) {
6999                         mlog_errno(ret);
7000                         goto out_commit;
7001                 }
7002
7003                 /*
7004                  * Save two copies, one for insert, and one that can
7005                  * be changed by ocfs2_map_and_dirty_page() below.
7006                  */
7007                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
7008
7009                 /*
7010                  * Non sparse file systems zero on extend, so no need
7011                  * to do that now.
7012                  */
7013                 if (!ocfs2_sparse_alloc(osb) &&
7014                     PAGE_CACHE_SIZE < osb->s_clustersize)
7015                         end = PAGE_CACHE_SIZE;
7016
7017                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
7018                 if (ret) {
7019                         mlog_errno(ret);
7020                         goto out_commit;
7021                 }
7022
7023                 /*
7024                  * This should populate the 1st page for us and mark
7025                  * it up to date.
7026                  */
7027                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
7028                 if (ret) {
7029                         mlog_errno(ret);
7030                         goto out_commit;
7031                 }
7032
7033                 page_end = PAGE_CACHE_SIZE;
7034                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
7035                         page_end = osb->s_clustersize;
7036
7037                 for (i = 0; i < num_pages; i++)
7038                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
7039                                                  pages[i], i > 0, &phys);
7040         }
7041
7042         spin_lock(&oi->ip_lock);
7043         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
7044         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7045         spin_unlock(&oi->ip_lock);
7046
7047         ocfs2_dinode_new_extent_list(inode, di);
7048
7049         ocfs2_journal_dirty(handle, di_bh);
7050
7051         if (has_data) {
7052                 /*
7053                  * An error at this point should be extremely rare. If
7054                  * this proves to be false, we could always re-build
7055                  * the in-inode data from our pages.
7056                  */
7057                 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
7058                 ret = ocfs2_insert_extent(osb, handle, inode, &et,
7059                                           0, block, 1, 0, NULL);
7060                 if (ret) {
7061                         mlog_errno(ret);
7062                         goto out_commit;
7063                 }
7064
7065                 inode->i_blocks = ocfs2_inode_sector_count(inode);
7066         }
7067
7068 out_commit:
7069         if (ret < 0 && did_quota)
7070                 vfs_dq_free_space_nodirty(inode,
7071                                           ocfs2_clusters_to_bytes(osb->sb, 1));
7072
7073         ocfs2_commit_trans(osb, handle);
7074
7075 out_unlock:
7076         if (data_ac)
7077                 ocfs2_free_alloc_context(data_ac);
7078
7079 out:
7080         if (pages) {
7081                 ocfs2_unlock_and_free_pages(pages, num_pages);
7082                 kfree(pages);
7083         }
7084
7085         return ret;
7086 }
7087
7088 /*
7089  * It is expected, that by the time you call this function,
7090  * inode->i_size and fe->i_size have been adjusted.
7091  *
7092  * WARNING: This will kfree the truncate context
7093  */
7094 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7095                           struct inode *inode,
7096                           struct buffer_head *fe_bh,
7097                           struct ocfs2_truncate_context *tc)
7098 {
7099         int status, i, credits, tl_sem = 0;
7100         u32 clusters_to_del, new_highest_cpos, range;
7101         struct ocfs2_extent_list *el;
7102         handle_t *handle = NULL;
7103         struct inode *tl_inode = osb->osb_tl_inode;
7104         struct ocfs2_path *path = NULL;
7105         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7106
7107         mlog_entry_void();
7108
7109         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7110                                                      i_size_read(inode));
7111
7112         path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7113                               ocfs2_journal_access_di);
7114         if (!path) {
7115                 status = -ENOMEM;
7116                 mlog_errno(status);
7117                 goto bail;
7118         }
7119
7120         ocfs2_extent_map_trunc(inode, new_highest_cpos);
7121
7122 start:
7123         /*
7124          * Check that we still have allocation to delete.
7125          */
7126         if (OCFS2_I(inode)->ip_clusters == 0) {
7127                 status = 0;
7128                 goto bail;
7129         }
7130
7131         /*
7132          * Truncate always works against the rightmost tree branch.
7133          */
7134         status = ocfs2_find_path(inode, path, UINT_MAX);
7135         if (status) {
7136                 mlog_errno(status);
7137                 goto bail;
7138         }
7139
7140         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7141              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7142
7143         /*
7144          * By now, el will point to the extent list on the bottom most
7145          * portion of this tree. Only the tail record is considered in
7146          * each pass.
7147          *
7148          * We handle the following cases, in order:
7149          * - empty extent: delete the remaining branch
7150          * - remove the entire record
7151          * - remove a partial record
7152          * - no record needs to be removed (truncate has completed)
7153          */
7154         el = path_leaf_el(path);
7155         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7156                 ocfs2_error(inode->i_sb,
7157                             "Inode %llu has empty extent block at %llu\n",
7158                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7159                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7160                 status = -EROFS;
7161                 goto bail;
7162         }
7163
7164         i = le16_to_cpu(el->l_next_free_rec) - 1;
7165         range = le32_to_cpu(el->l_recs[i].e_cpos) +
7166                 ocfs2_rec_clusters(el, &el->l_recs[i]);
7167         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7168                 clusters_to_del = 0;
7169         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7170                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7171         } else if (range > new_highest_cpos) {
7172                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7173                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
7174                                   new_highest_cpos;
7175         } else {
7176                 status = 0;
7177                 goto bail;
7178         }
7179
7180         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7181              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7182
7183         mutex_lock(&tl_inode->i_mutex);
7184         tl_sem = 1;
7185         /* ocfs2_truncate_log_needs_flush guarantees us at least one
7186          * record is free for use. If there isn't any, we flush to get
7187          * an empty truncate log.  */
7188         if (ocfs2_truncate_log_needs_flush(osb)) {
7189                 status = __ocfs2_flush_truncate_log(osb);
7190                 if (status < 0) {
7191                         mlog_errno(status);
7192                         goto bail;
7193                 }
7194         }
7195
7196         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7197                                                 (struct ocfs2_dinode *)fe_bh->b_data,
7198                                                 el);
7199         handle = ocfs2_start_trans(osb, credits);
7200         if (IS_ERR(handle)) {
7201                 status = PTR_ERR(handle);
7202                 handle = NULL;
7203                 mlog_errno(status);
7204                 goto bail;
7205         }
7206
7207         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7208                                    tc, path);
7209         if (status < 0) {
7210                 mlog_errno(status);
7211                 goto bail;
7212         }
7213
7214         mutex_unlock(&tl_inode->i_mutex);
7215         tl_sem = 0;
7216
7217         ocfs2_commit_trans(osb, handle);
7218         handle = NULL;
7219
7220         ocfs2_reinit_path(path, 1);
7221
7222         /*
7223          * The check above will catch the case where we've truncated
7224          * away all allocation.
7225          */
7226         goto start;
7227
7228 bail:
7229
7230         ocfs2_schedule_truncate_log_flush(osb, 1);
7231
7232         if (tl_sem)
7233                 mutex_unlock(&tl_inode->i_mutex);
7234
7235         if (handle)
7236                 ocfs2_commit_trans(osb, handle);
7237
7238         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7239
7240         ocfs2_free_path(path);
7241
7242         /* This will drop the ext_alloc cluster lock for us */
7243         ocfs2_free_truncate_context(tc);
7244
7245         mlog_exit(status);
7246         return status;
7247 }
7248
7249 /*
7250  * Expects the inode to already be locked.
7251  */
7252 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7253                            struct inode *inode,
7254                            struct buffer_head *fe_bh,
7255                            struct ocfs2_truncate_context **tc)
7256 {
7257         int status;
7258         unsigned int new_i_clusters;
7259         struct ocfs2_dinode *fe;
7260         struct ocfs2_extent_block *eb;
7261         struct buffer_head *last_eb_bh = NULL;
7262
7263         mlog_entry_void();
7264
7265         *tc = NULL;
7266
7267         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7268                                                   i_size_read(inode));
7269         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7270
7271         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7272              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7273              (unsigned long long)le64_to_cpu(fe->i_size));
7274
7275         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7276         if (!(*tc)) {
7277                 status = -ENOMEM;
7278                 mlog_errno(status);
7279                 goto bail;
7280         }
7281         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7282
7283         if (fe->id2.i_list.l_tree_depth) {
7284                 status = ocfs2_read_extent_block(inode,
7285                                                  le64_to_cpu(fe->i_last_eb_blk),
7286                                                  &last_eb_bh);
7287                 if (status < 0) {
7288                         mlog_errno(status);
7289                         goto bail;
7290                 }
7291                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7292         }
7293
7294         (*tc)->tc_last_eb_bh = last_eb_bh;
7295
7296         status = 0;
7297 bail:
7298         if (status < 0) {
7299                 if (*tc)
7300                         ocfs2_free_truncate_context(*tc);
7301                 *tc = NULL;
7302         }
7303         mlog_exit_void();
7304         return status;
7305 }
7306
7307 /*
7308  * 'start' is inclusive, 'end' is not.
7309  */
7310 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7311                           unsigned int start, unsigned int end, int trunc)
7312 {
7313         int ret;
7314         unsigned int numbytes;
7315         handle_t *handle;
7316         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7317         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7318         struct ocfs2_inline_data *idata = &di->id2.i_data;
7319
7320         if (end > i_size_read(inode))
7321                 end = i_size_read(inode);
7322
7323         BUG_ON(start >= end);
7324
7325         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7326             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7327             !ocfs2_supports_inline_data(osb)) {
7328                 ocfs2_error(inode->i_sb,
7329                             "Inline data flags for inode %llu don't agree! "
7330                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7331                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7332                             le16_to_cpu(di->i_dyn_features),
7333                             OCFS2_I(inode)->ip_dyn_features,
7334                             osb->s_feature_incompat);
7335                 ret = -EROFS;
7336                 goto out;
7337         }
7338
7339         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7340         if (IS_ERR(handle)) {
7341                 ret = PTR_ERR(handle);
7342                 mlog_errno(ret);
7343                 goto out;
7344         }
7345
7346         ret = ocfs2_journal_access_di(handle, inode, di_bh,
7347                                       OCFS2_JOURNAL_ACCESS_WRITE);
7348         if (ret) {
7349                 mlog_errno(ret);
7350                 goto out_commit;
7351         }
7352
7353         numbytes = end - start;
7354         memset(idata->id_data + start, 0, numbytes);
7355
7356         /*
7357          * No need to worry about the data page here - it's been
7358          * truncated already and inline data doesn't need it for
7359          * pushing zero's to disk, so we'll let readpage pick it up
7360          * later.
7361          */
7362         if (trunc) {
7363                 i_size_write(inode, start);
7364                 di->i_size = cpu_to_le64(start);
7365         }
7366
7367         inode->i_blocks = ocfs2_inode_sector_count(inode);
7368         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7369
7370         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7371         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7372
7373         ocfs2_journal_dirty(handle, di_bh);
7374
7375 out_commit:
7376         ocfs2_commit_trans(osb, handle);
7377
7378 out:
7379         return ret;
7380 }
7381
7382 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7383 {
7384         /*
7385          * The caller is responsible for completing deallocation
7386          * before freeing the context.
7387          */
7388         if (tc->tc_dealloc.c_first_suballocator != NULL)
7389                 mlog(ML_NOTICE,
7390                      "Truncate completion has non-empty dealloc context\n");
7391
7392         brelse(tc->tc_last_eb_bh);
7393
7394         kfree(tc);
7395 }