sh: Update the alignment when 4K stacks are used.
[linux-2.6] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53
54 /*
55  * Structures which describe a path through a btree, and functions to
56  * manipulate them.
57  *
58  * The idea here is to be as generic as possible with the tree
59  * manipulation code.
60  */
61 struct ocfs2_path_item {
62         struct buffer_head              *bh;
63         struct ocfs2_extent_list        *el;
64 };
65
66 #define OCFS2_MAX_PATH_DEPTH    5
67
68 struct ocfs2_path {
69         int                     p_tree_depth;
70         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
71 };
72
73 #define path_root_bh(_path) ((_path)->p_node[0].bh)
74 #define path_root_el(_path) ((_path)->p_node[0].el)
75 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
76 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
77 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
78
79 /*
80  * Reset the actual path elements so that we can re-use the structure
81  * to build another path. Generally, this involves freeing the buffer
82  * heads.
83  */
84 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
85 {
86         int i, start = 0, depth = 0;
87         struct ocfs2_path_item *node;
88
89         if (keep_root)
90                 start = 1;
91
92         for(i = start; i < path_num_items(path); i++) {
93                 node = &path->p_node[i];
94
95                 brelse(node->bh);
96                 node->bh = NULL;
97                 node->el = NULL;
98         }
99
100         /*
101          * Tree depth may change during truncate, or insert. If we're
102          * keeping the root extent list, then make sure that our path
103          * structure reflects the proper depth.
104          */
105         if (keep_root)
106                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
107
108         path->p_tree_depth = depth;
109 }
110
111 static void ocfs2_free_path(struct ocfs2_path *path)
112 {
113         if (path) {
114                 ocfs2_reinit_path(path, 0);
115                 kfree(path);
116         }
117 }
118
119 /*
120  * Make the *dest path the same as src and re-initialize src path to
121  * have a root only.
122  */
123 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
124 {
125         int i;
126
127         BUG_ON(path_root_bh(dest) != path_root_bh(src));
128
129         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
130                 brelse(dest->p_node[i].bh);
131
132                 dest->p_node[i].bh = src->p_node[i].bh;
133                 dest->p_node[i].el = src->p_node[i].el;
134
135                 src->p_node[i].bh = NULL;
136                 src->p_node[i].el = NULL;
137         }
138 }
139
140 /*
141  * Insert an extent block at given index.
142  *
143  * This will not take an additional reference on eb_bh.
144  */
145 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
146                                         struct buffer_head *eb_bh)
147 {
148         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
149
150         /*
151          * Right now, no root bh is an extent block, so this helps
152          * catch code errors with dinode trees. The assertion can be
153          * safely removed if we ever need to insert extent block
154          * structures at the root.
155          */
156         BUG_ON(index == 0);
157
158         path->p_node[index].bh = eb_bh;
159         path->p_node[index].el = &eb->h_list;
160 }
161
162 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
163                                          struct ocfs2_extent_list *root_el)
164 {
165         struct ocfs2_path *path;
166
167         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
168
169         path = kzalloc(sizeof(*path), GFP_NOFS);
170         if (path) {
171                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
172                 get_bh(root_bh);
173                 path_root_bh(path) = root_bh;
174                 path_root_el(path) = root_el;
175         }
176
177         return path;
178 }
179
180 /*
181  * Allocate and initialize a new path based on a disk inode tree.
182  */
183 static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
184 {
185         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
186         struct ocfs2_extent_list *el = &di->id2.i_list;
187
188         return ocfs2_new_path(di_bh, el);
189 }
190
191 /*
192  * Convenience function to journal all components in a path.
193  */
194 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
195                                      struct ocfs2_path *path)
196 {
197         int i, ret = 0;
198
199         if (!path)
200                 goto out;
201
202         for(i = 0; i < path_num_items(path); i++) {
203                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
204                                            OCFS2_JOURNAL_ACCESS_WRITE);
205                 if (ret < 0) {
206                         mlog_errno(ret);
207                         goto out;
208                 }
209         }
210
211 out:
212         return ret;
213 }
214
215 enum ocfs2_contig_type {
216         CONTIG_NONE = 0,
217         CONTIG_LEFT,
218         CONTIG_RIGHT
219 };
220
221
222 /*
223  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
224  * ocfs2_extent_contig only work properly against leaf nodes!
225  */
226 static int ocfs2_block_extent_contig(struct super_block *sb,
227                                      struct ocfs2_extent_rec *ext,
228                                      u64 blkno)
229 {
230         u64 blk_end = le64_to_cpu(ext->e_blkno);
231
232         blk_end += ocfs2_clusters_to_blocks(sb,
233                                     le16_to_cpu(ext->e_leaf_clusters));
234
235         return blkno == blk_end;
236 }
237
238 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
239                                   struct ocfs2_extent_rec *right)
240 {
241         u32 left_range;
242
243         left_range = le32_to_cpu(left->e_cpos) +
244                 le16_to_cpu(left->e_leaf_clusters);
245
246         return (left_range == le32_to_cpu(right->e_cpos));
247 }
248
249 static enum ocfs2_contig_type
250         ocfs2_extent_contig(struct inode *inode,
251                             struct ocfs2_extent_rec *ext,
252                             struct ocfs2_extent_rec *insert_rec)
253 {
254         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
255
256         if (ocfs2_extents_adjacent(ext, insert_rec) &&
257             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258                         return CONTIG_RIGHT;
259
260         blkno = le64_to_cpu(ext->e_blkno);
261         if (ocfs2_extents_adjacent(insert_rec, ext) &&
262             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
263                 return CONTIG_LEFT;
264
265         return CONTIG_NONE;
266 }
267
268 /*
269  * NOTE: We can have pretty much any combination of contiguousness and
270  * appending.
271  *
272  * The usefulness of APPEND_TAIL is more in that it lets us know that
273  * we'll have to update the path to that leaf.
274  */
275 enum ocfs2_append_type {
276         APPEND_NONE = 0,
277         APPEND_TAIL,
278 };
279
280 struct ocfs2_insert_type {
281         enum ocfs2_append_type  ins_appending;
282         enum ocfs2_contig_type  ins_contig;
283         int                     ins_contig_index;
284         int                     ins_free_records;
285         int                     ins_tree_depth;
286 };
287
288 /*
289  * How many free extents have we got before we need more meta data?
290  */
291 int ocfs2_num_free_extents(struct ocfs2_super *osb,
292                            struct inode *inode,
293                            struct ocfs2_dinode *fe)
294 {
295         int retval;
296         struct ocfs2_extent_list *el;
297         struct ocfs2_extent_block *eb;
298         struct buffer_head *eb_bh = NULL;
299
300         mlog_entry_void();
301
302         if (!OCFS2_IS_VALID_DINODE(fe)) {
303                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
304                 retval = -EIO;
305                 goto bail;
306         }
307
308         if (fe->i_last_eb_blk) {
309                 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
310                                           &eb_bh, OCFS2_BH_CACHED, inode);
311                 if (retval < 0) {
312                         mlog_errno(retval);
313                         goto bail;
314                 }
315                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
316                 el = &eb->h_list;
317         } else
318                 el = &fe->id2.i_list;
319
320         BUG_ON(el->l_tree_depth != 0);
321
322         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
323 bail:
324         if (eb_bh)
325                 brelse(eb_bh);
326
327         mlog_exit(retval);
328         return retval;
329 }
330
331 /* expects array to already be allocated
332  *
333  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
334  * l_count for you
335  */
336 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
337                                      handle_t *handle,
338                                      struct inode *inode,
339                                      int wanted,
340                                      struct ocfs2_alloc_context *meta_ac,
341                                      struct buffer_head *bhs[])
342 {
343         int count, status, i;
344         u16 suballoc_bit_start;
345         u32 num_got;
346         u64 first_blkno;
347         struct ocfs2_extent_block *eb;
348
349         mlog_entry_void();
350
351         count = 0;
352         while (count < wanted) {
353                 status = ocfs2_claim_metadata(osb,
354                                               handle,
355                                               meta_ac,
356                                               wanted - count,
357                                               &suballoc_bit_start,
358                                               &num_got,
359                                               &first_blkno);
360                 if (status < 0) {
361                         mlog_errno(status);
362                         goto bail;
363                 }
364
365                 for(i = count;  i < (num_got + count); i++) {
366                         bhs[i] = sb_getblk(osb->sb, first_blkno);
367                         if (bhs[i] == NULL) {
368                                 status = -EIO;
369                                 mlog_errno(status);
370                                 goto bail;
371                         }
372                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
373
374                         status = ocfs2_journal_access(handle, inode, bhs[i],
375                                                       OCFS2_JOURNAL_ACCESS_CREATE);
376                         if (status < 0) {
377                                 mlog_errno(status);
378                                 goto bail;
379                         }
380
381                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
382                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
383                         /* Ok, setup the minimal stuff here. */
384                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
385                         eb->h_blkno = cpu_to_le64(first_blkno);
386                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
387
388 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
389                         /* we always use slot zero's suballocator */
390                         eb->h_suballoc_slot = 0;
391 #else
392                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
393 #endif
394                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
395                         eb->h_list.l_count =
396                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
397
398                         suballoc_bit_start++;
399                         first_blkno++;
400
401                         /* We'll also be dirtied by the caller, so
402                          * this isn't absolutely necessary. */
403                         status = ocfs2_journal_dirty(handle, bhs[i]);
404                         if (status < 0) {
405                                 mlog_errno(status);
406                                 goto bail;
407                         }
408                 }
409
410                 count += num_got;
411         }
412
413         status = 0;
414 bail:
415         if (status < 0) {
416                 for(i = 0; i < wanted; i++) {
417                         if (bhs[i])
418                                 brelse(bhs[i]);
419                         bhs[i] = NULL;
420                 }
421         }
422         mlog_exit(status);
423         return status;
424 }
425
426 /*
427  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
428  *
429  * Returns the sum of the rightmost extent rec logical offset and
430  * cluster count.
431  *
432  * ocfs2_add_branch() uses this to determine what logical cluster
433  * value should be populated into the leftmost new branch records.
434  *
435  * ocfs2_shift_tree_depth() uses this to determine the # clusters
436  * value for the new topmost tree record.
437  */
438 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
439 {
440         int i;
441
442         i = le16_to_cpu(el->l_next_free_rec) - 1;
443
444         return le32_to_cpu(el->l_recs[i].e_cpos) +
445                 ocfs2_rec_clusters(el, &el->l_recs[i]);
446 }
447
448 /*
449  * Add an entire tree branch to our inode. eb_bh is the extent block
450  * to start at, if we don't want to start the branch at the dinode
451  * structure.
452  *
453  * last_eb_bh is required as we have to update it's next_leaf pointer
454  * for the new last extent block.
455  *
456  * the new branch will be 'empty' in the sense that every block will
457  * contain a single record with cluster count == 0.
458  */
459 static int ocfs2_add_branch(struct ocfs2_super *osb,
460                             handle_t *handle,
461                             struct inode *inode,
462                             struct buffer_head *fe_bh,
463                             struct buffer_head *eb_bh,
464                             struct buffer_head *last_eb_bh,
465                             struct ocfs2_alloc_context *meta_ac)
466 {
467         int status, new_blocks, i;
468         u64 next_blkno, new_last_eb_blk;
469         struct buffer_head *bh;
470         struct buffer_head **new_eb_bhs = NULL;
471         struct ocfs2_dinode *fe;
472         struct ocfs2_extent_block *eb;
473         struct ocfs2_extent_list  *eb_el;
474         struct ocfs2_extent_list  *el;
475         u32 new_cpos;
476
477         mlog_entry_void();
478
479         BUG_ON(!last_eb_bh);
480
481         fe = (struct ocfs2_dinode *) fe_bh->b_data;
482
483         if (eb_bh) {
484                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
485                 el = &eb->h_list;
486         } else
487                 el = &fe->id2.i_list;
488
489         /* we never add a branch to a leaf. */
490         BUG_ON(!el->l_tree_depth);
491
492         new_blocks = le16_to_cpu(el->l_tree_depth);
493
494         /* allocate the number of new eb blocks we need */
495         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
496                              GFP_KERNEL);
497         if (!new_eb_bhs) {
498                 status = -ENOMEM;
499                 mlog_errno(status);
500                 goto bail;
501         }
502
503         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
504                                            meta_ac, new_eb_bhs);
505         if (status < 0) {
506                 mlog_errno(status);
507                 goto bail;
508         }
509
510         eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
511         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512
513         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
514          * linked with the rest of the tree.
515          * conversly, new_eb_bhs[0] is the new bottommost leaf.
516          *
517          * when we leave the loop, new_last_eb_blk will point to the
518          * newest leaf, and next_blkno will point to the topmost extent
519          * block. */
520         next_blkno = new_last_eb_blk = 0;
521         for(i = 0; i < new_blocks; i++) {
522                 bh = new_eb_bhs[i];
523                 eb = (struct ocfs2_extent_block *) bh->b_data;
524                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
525                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
526                         status = -EIO;
527                         goto bail;
528                 }
529                 eb_el = &eb->h_list;
530
531                 status = ocfs2_journal_access(handle, inode, bh,
532                                               OCFS2_JOURNAL_ACCESS_CREATE);
533                 if (status < 0) {
534                         mlog_errno(status);
535                         goto bail;
536                 }
537
538                 eb->h_next_leaf_blk = 0;
539                 eb_el->l_tree_depth = cpu_to_le16(i);
540                 eb_el->l_next_free_rec = cpu_to_le16(1);
541                 /*
542                  * This actually counts as an empty extent as
543                  * c_clusters == 0
544                  */
545                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
546                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
547                 /*
548                  * eb_el isn't always an interior node, but even leaf
549                  * nodes want a zero'd flags and reserved field so
550                  * this gets the whole 32 bits regardless of use.
551                  */
552                 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
553                 if (!eb_el->l_tree_depth)
554                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
555
556                 status = ocfs2_journal_dirty(handle, bh);
557                 if (status < 0) {
558                         mlog_errno(status);
559                         goto bail;
560                 }
561
562                 next_blkno = le64_to_cpu(eb->h_blkno);
563         }
564
565         /* This is a bit hairy. We want to update up to three blocks
566          * here without leaving any of them in an inconsistent state
567          * in case of error. We don't have to worry about
568          * journal_dirty erroring as it won't unless we've aborted the
569          * handle (in which case we would never be here) so reserving
570          * the write with journal_access is all we need to do. */
571         status = ocfs2_journal_access(handle, inode, last_eb_bh,
572                                       OCFS2_JOURNAL_ACCESS_WRITE);
573         if (status < 0) {
574                 mlog_errno(status);
575                 goto bail;
576         }
577         status = ocfs2_journal_access(handle, inode, fe_bh,
578                                       OCFS2_JOURNAL_ACCESS_WRITE);
579         if (status < 0) {
580                 mlog_errno(status);
581                 goto bail;
582         }
583         if (eb_bh) {
584                 status = ocfs2_journal_access(handle, inode, eb_bh,
585                                               OCFS2_JOURNAL_ACCESS_WRITE);
586                 if (status < 0) {
587                         mlog_errno(status);
588                         goto bail;
589                 }
590         }
591
592         /* Link the new branch into the rest of the tree (el will
593          * either be on the fe, or the extent block passed in. */
594         i = le16_to_cpu(el->l_next_free_rec);
595         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
596         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
597         el->l_recs[i].e_int_clusters = 0;
598         le16_add_cpu(&el->l_next_free_rec, 1);
599
600         /* fe needs a new last extent block pointer, as does the
601          * next_leaf on the previously last-extent-block. */
602         fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
603
604         eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
605         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
606
607         status = ocfs2_journal_dirty(handle, last_eb_bh);
608         if (status < 0)
609                 mlog_errno(status);
610         status = ocfs2_journal_dirty(handle, fe_bh);
611         if (status < 0)
612                 mlog_errno(status);
613         if (eb_bh) {
614                 status = ocfs2_journal_dirty(handle, eb_bh);
615                 if (status < 0)
616                         mlog_errno(status);
617         }
618
619         status = 0;
620 bail:
621         if (new_eb_bhs) {
622                 for (i = 0; i < new_blocks; i++)
623                         if (new_eb_bhs[i])
624                                 brelse(new_eb_bhs[i]);
625                 kfree(new_eb_bhs);
626         }
627
628         mlog_exit(status);
629         return status;
630 }
631
632 /*
633  * adds another level to the allocation tree.
634  * returns back the new extent block so you can add a branch to it
635  * after this call.
636  */
637 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
638                                   handle_t *handle,
639                                   struct inode *inode,
640                                   struct buffer_head *fe_bh,
641                                   struct ocfs2_alloc_context *meta_ac,
642                                   struct buffer_head **ret_new_eb_bh)
643 {
644         int status, i;
645         u32 new_clusters;
646         struct buffer_head *new_eb_bh = NULL;
647         struct ocfs2_dinode *fe;
648         struct ocfs2_extent_block *eb;
649         struct ocfs2_extent_list  *fe_el;
650         struct ocfs2_extent_list  *eb_el;
651
652         mlog_entry_void();
653
654         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
655                                            &new_eb_bh);
656         if (status < 0) {
657                 mlog_errno(status);
658                 goto bail;
659         }
660
661         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
662         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
663                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
664                 status = -EIO;
665                 goto bail;
666         }
667
668         eb_el = &eb->h_list;
669         fe = (struct ocfs2_dinode *) fe_bh->b_data;
670         fe_el = &fe->id2.i_list;
671
672         status = ocfs2_journal_access(handle, inode, new_eb_bh,
673                                       OCFS2_JOURNAL_ACCESS_CREATE);
674         if (status < 0) {
675                 mlog_errno(status);
676                 goto bail;
677         }
678
679         /* copy the fe data into the new extent block */
680         eb_el->l_tree_depth = fe_el->l_tree_depth;
681         eb_el->l_next_free_rec = fe_el->l_next_free_rec;
682         for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
683                 eb_el->l_recs[i] = fe_el->l_recs[i];
684
685         status = ocfs2_journal_dirty(handle, new_eb_bh);
686         if (status < 0) {
687                 mlog_errno(status);
688                 goto bail;
689         }
690
691         status = ocfs2_journal_access(handle, inode, fe_bh,
692                                       OCFS2_JOURNAL_ACCESS_WRITE);
693         if (status < 0) {
694                 mlog_errno(status);
695                 goto bail;
696         }
697
698         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
699
700         /* update fe now */
701         le16_add_cpu(&fe_el->l_tree_depth, 1);
702         fe_el->l_recs[0].e_cpos = 0;
703         fe_el->l_recs[0].e_blkno = eb->h_blkno;
704         fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
705         for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
706                 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
707         fe_el->l_next_free_rec = cpu_to_le16(1);
708
709         /* If this is our 1st tree depth shift, then last_eb_blk
710          * becomes the allocated extent block */
711         if (fe_el->l_tree_depth == cpu_to_le16(1))
712                 fe->i_last_eb_blk = eb->h_blkno;
713
714         status = ocfs2_journal_dirty(handle, fe_bh);
715         if (status < 0) {
716                 mlog_errno(status);
717                 goto bail;
718         }
719
720         *ret_new_eb_bh = new_eb_bh;
721         new_eb_bh = NULL;
722         status = 0;
723 bail:
724         if (new_eb_bh)
725                 brelse(new_eb_bh);
726
727         mlog_exit(status);
728         return status;
729 }
730
731 /*
732  * Should only be called when there is no space left in any of the
733  * leaf nodes. What we want to do is find the lowest tree depth
734  * non-leaf extent block with room for new records. There are three
735  * valid results of this search:
736  *
737  * 1) a lowest extent block is found, then we pass it back in
738  *    *lowest_eb_bh and return '0'
739  *
740  * 2) the search fails to find anything, but the dinode has room. We
741  *    pass NULL back in *lowest_eb_bh, but still return '0'
742  *
743  * 3) the search fails to find anything AND the dinode is full, in
744  *    which case we return > 0
745  *
746  * return status < 0 indicates an error.
747  */
748 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
749                                     struct inode *inode,
750                                     struct buffer_head *fe_bh,
751                                     struct buffer_head **target_bh)
752 {
753         int status = 0, i;
754         u64 blkno;
755         struct ocfs2_dinode *fe;
756         struct ocfs2_extent_block *eb;
757         struct ocfs2_extent_list  *el;
758         struct buffer_head *bh = NULL;
759         struct buffer_head *lowest_bh = NULL;
760
761         mlog_entry_void();
762
763         *target_bh = NULL;
764
765         fe = (struct ocfs2_dinode *) fe_bh->b_data;
766         el = &fe->id2.i_list;
767
768         while(le16_to_cpu(el->l_tree_depth) > 1) {
769                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
770                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
771                                     "extent list (next_free_rec == 0)",
772                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
773                         status = -EIO;
774                         goto bail;
775                 }
776                 i = le16_to_cpu(el->l_next_free_rec) - 1;
777                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
778                 if (!blkno) {
779                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
780                                     "list where extent # %d has no physical "
781                                     "block start",
782                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
783                         status = -EIO;
784                         goto bail;
785                 }
786
787                 if (bh) {
788                         brelse(bh);
789                         bh = NULL;
790                 }
791
792                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
793                                           inode);
794                 if (status < 0) {
795                         mlog_errno(status);
796                         goto bail;
797                 }
798
799                 eb = (struct ocfs2_extent_block *) bh->b_data;
800                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
801                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
802                         status = -EIO;
803                         goto bail;
804                 }
805                 el = &eb->h_list;
806
807                 if (le16_to_cpu(el->l_next_free_rec) <
808                     le16_to_cpu(el->l_count)) {
809                         if (lowest_bh)
810                                 brelse(lowest_bh);
811                         lowest_bh = bh;
812                         get_bh(lowest_bh);
813                 }
814         }
815
816         /* If we didn't find one and the fe doesn't have any room,
817          * then return '1' */
818         if (!lowest_bh
819             && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
820                 status = 1;
821
822         *target_bh = lowest_bh;
823 bail:
824         if (bh)
825                 brelse(bh);
826
827         mlog_exit(status);
828         return status;
829 }
830
831 /*
832  * This is only valid for leaf nodes, which are the only ones that can
833  * have empty extents anyway.
834  */
835 static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
836 {
837         return !rec->e_leaf_clusters;
838 }
839
840 /*
841  * This function will discard the rightmost extent record.
842  */
843 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
844 {
845         int next_free = le16_to_cpu(el->l_next_free_rec);
846         int count = le16_to_cpu(el->l_count);
847         unsigned int num_bytes;
848
849         BUG_ON(!next_free);
850         /* This will cause us to go off the end of our extent list. */
851         BUG_ON(next_free >= count);
852
853         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
854
855         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
856 }
857
858 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
859                               struct ocfs2_extent_rec *insert_rec)
860 {
861         int i, insert_index, next_free, has_empty, num_bytes;
862         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
863         struct ocfs2_extent_rec *rec;
864
865         next_free = le16_to_cpu(el->l_next_free_rec);
866         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
867
868         BUG_ON(!next_free);
869
870         /* The tree code before us didn't allow enough room in the leaf. */
871         if (el->l_next_free_rec == el->l_count && !has_empty)
872                 BUG();
873
874         /*
875          * The easiest way to approach this is to just remove the
876          * empty extent and temporarily decrement next_free.
877          */
878         if (has_empty) {
879                 /*
880                  * If next_free was 1 (only an empty extent), this
881                  * loop won't execute, which is fine. We still want
882                  * the decrement above to happen.
883                  */
884                 for(i = 0; i < (next_free - 1); i++)
885                         el->l_recs[i] = el->l_recs[i+1];
886
887                 next_free--;
888         }
889
890         /*
891          * Figure out what the new record index should be.
892          */
893         for(i = 0; i < next_free; i++) {
894                 rec = &el->l_recs[i];
895
896                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
897                         break;
898         }
899         insert_index = i;
900
901         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
902              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
903
904         BUG_ON(insert_index < 0);
905         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
906         BUG_ON(insert_index > next_free);
907
908         /*
909          * No need to memmove if we're just adding to the tail.
910          */
911         if (insert_index != next_free) {
912                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
913
914                 num_bytes = next_free - insert_index;
915                 num_bytes *= sizeof(struct ocfs2_extent_rec);
916                 memmove(&el->l_recs[insert_index + 1],
917                         &el->l_recs[insert_index],
918                         num_bytes);
919         }
920
921         /*
922          * Either we had an empty extent, and need to re-increment or
923          * there was no empty extent on a non full rightmost leaf node,
924          * in which case we still need to increment.
925          */
926         next_free++;
927         el->l_next_free_rec = cpu_to_le16(next_free);
928         /*
929          * Make sure none of the math above just messed up our tree.
930          */
931         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
932
933         el->l_recs[insert_index] = *insert_rec;
934
935 }
936
937 /*
938  * Create an empty extent record .
939  *
940  * l_next_free_rec may be updated.
941  *
942  * If an empty extent already exists do nothing.
943  */
944 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
945 {
946         int next_free = le16_to_cpu(el->l_next_free_rec);
947
948         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
949
950         if (next_free == 0)
951                 goto set_and_inc;
952
953         if (ocfs2_is_empty_extent(&el->l_recs[0]))
954                 return;
955
956         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
957                         "Asked to create an empty extent in a full list:\n"
958                         "count = %u, tree depth = %u",
959                         le16_to_cpu(el->l_count),
960                         le16_to_cpu(el->l_tree_depth));
961
962         ocfs2_shift_records_right(el);
963
964 set_and_inc:
965         le16_add_cpu(&el->l_next_free_rec, 1);
966         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
967 }
968
969 /*
970  * For a rotation which involves two leaf nodes, the "root node" is
971  * the lowest level tree node which contains a path to both leafs. This
972  * resulting set of information can be used to form a complete "subtree"
973  *
974  * This function is passed two full paths from the dinode down to a
975  * pair of adjacent leaves. It's task is to figure out which path
976  * index contains the subtree root - this can be the root index itself
977  * in a worst-case rotation.
978  *
979  * The array index of the subtree root is passed back.
980  */
981 static int ocfs2_find_subtree_root(struct inode *inode,
982                                    struct ocfs2_path *left,
983                                    struct ocfs2_path *right)
984 {
985         int i = 0;
986
987         /*
988          * Check that the caller passed in two paths from the same tree.
989          */
990         BUG_ON(path_root_bh(left) != path_root_bh(right));
991
992         do {
993                 i++;
994
995                 /*
996                  * The caller didn't pass two adjacent paths.
997                  */
998                 mlog_bug_on_msg(i > left->p_tree_depth,
999                                 "Inode %lu, left depth %u, right depth %u\n"
1000                                 "left leaf blk %llu, right leaf blk %llu\n",
1001                                 inode->i_ino, left->p_tree_depth,
1002                                 right->p_tree_depth,
1003                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1004                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1005         } while (left->p_node[i].bh->b_blocknr ==
1006                  right->p_node[i].bh->b_blocknr);
1007
1008         return i - 1;
1009 }
1010
1011 typedef void (path_insert_t)(void *, struct buffer_head *);
1012
1013 /*
1014  * Traverse a btree path in search of cpos, starting at root_el.
1015  *
1016  * This code can be called with a cpos larger than the tree, in which
1017  * case it will return the rightmost path.
1018  */
1019 static int __ocfs2_find_path(struct inode *inode,
1020                              struct ocfs2_extent_list *root_el, u32 cpos,
1021                              path_insert_t *func, void *data)
1022 {
1023         int i, ret = 0;
1024         u32 range;
1025         u64 blkno;
1026         struct buffer_head *bh = NULL;
1027         struct ocfs2_extent_block *eb;
1028         struct ocfs2_extent_list *el;
1029         struct ocfs2_extent_rec *rec;
1030         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1031
1032         el = root_el;
1033         while (el->l_tree_depth) {
1034                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1035                         ocfs2_error(inode->i_sb,
1036                                     "Inode %llu has empty extent list at "
1037                                     "depth %u\n",
1038                                     (unsigned long long)oi->ip_blkno,
1039                                     le16_to_cpu(el->l_tree_depth));
1040                         ret = -EROFS;
1041                         goto out;
1042
1043                 }
1044
1045                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1046                         rec = &el->l_recs[i];
1047
1048                         /*
1049                          * In the case that cpos is off the allocation
1050                          * tree, this should just wind up returning the
1051                          * rightmost record.
1052                          */
1053                         range = le32_to_cpu(rec->e_cpos) +
1054                                 ocfs2_rec_clusters(el, rec);
1055                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1056                             break;
1057                 }
1058
1059                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1060                 if (blkno == 0) {
1061                         ocfs2_error(inode->i_sb,
1062                                     "Inode %llu has bad blkno in extent list "
1063                                     "at depth %u (index %d)\n",
1064                                     (unsigned long long)oi->ip_blkno,
1065                                     le16_to_cpu(el->l_tree_depth), i);
1066                         ret = -EROFS;
1067                         goto out;
1068                 }
1069
1070                 brelse(bh);
1071                 bh = NULL;
1072                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1073                                        &bh, OCFS2_BH_CACHED, inode);
1074                 if (ret) {
1075                         mlog_errno(ret);
1076                         goto out;
1077                 }
1078
1079                 eb = (struct ocfs2_extent_block *) bh->b_data;
1080                 el = &eb->h_list;
1081                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1082                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1083                         ret = -EIO;
1084                         goto out;
1085                 }
1086
1087                 if (le16_to_cpu(el->l_next_free_rec) >
1088                     le16_to_cpu(el->l_count)) {
1089                         ocfs2_error(inode->i_sb,
1090                                     "Inode %llu has bad count in extent list "
1091                                     "at block %llu (next free=%u, count=%u)\n",
1092                                     (unsigned long long)oi->ip_blkno,
1093                                     (unsigned long long)bh->b_blocknr,
1094                                     le16_to_cpu(el->l_next_free_rec),
1095                                     le16_to_cpu(el->l_count));
1096                         ret = -EROFS;
1097                         goto out;
1098                 }
1099
1100                 if (func)
1101                         func(data, bh);
1102         }
1103
1104 out:
1105         /*
1106          * Catch any trailing bh that the loop didn't handle.
1107          */
1108         brelse(bh);
1109
1110         return ret;
1111 }
1112
1113 /*
1114  * Given an initialized path (that is, it has a valid root extent
1115  * list), this function will traverse the btree in search of the path
1116  * which would contain cpos.
1117  *
1118  * The path traveled is recorded in the path structure.
1119  *
1120  * Note that this will not do any comparisons on leaf node extent
1121  * records, so it will work fine in the case that we just added a tree
1122  * branch.
1123  */
1124 struct find_path_data {
1125         int index;
1126         struct ocfs2_path *path;
1127 };
1128 static void find_path_ins(void *data, struct buffer_head *bh)
1129 {
1130         struct find_path_data *fp = data;
1131
1132         get_bh(bh);
1133         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1134         fp->index++;
1135 }
1136 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1137                            u32 cpos)
1138 {
1139         struct find_path_data data;
1140
1141         data.index = 1;
1142         data.path = path;
1143         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1144                                  find_path_ins, &data);
1145 }
1146
1147 static void find_leaf_ins(void *data, struct buffer_head *bh)
1148 {
1149         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1150         struct ocfs2_extent_list *el = &eb->h_list;
1151         struct buffer_head **ret = data;
1152
1153         /* We want to retain only the leaf block. */
1154         if (le16_to_cpu(el->l_tree_depth) == 0) {
1155                 get_bh(bh);
1156                 *ret = bh;
1157         }
1158 }
1159 /*
1160  * Find the leaf block in the tree which would contain cpos. No
1161  * checking of the actual leaf is done.
1162  *
1163  * Some paths want to call this instead of allocating a path structure
1164  * and calling ocfs2_find_path().
1165  *
1166  * This function doesn't handle non btree extent lists.
1167  */
1168 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1169                     u32 cpos, struct buffer_head **leaf_bh)
1170 {
1171         int ret;
1172         struct buffer_head *bh = NULL;
1173
1174         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1175         if (ret) {
1176                 mlog_errno(ret);
1177                 goto out;
1178         }
1179
1180         *leaf_bh = bh;
1181 out:
1182         return ret;
1183 }
1184
1185 /*
1186  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1187  *
1188  * Basically, we've moved stuff around at the bottom of the tree and
1189  * we need to fix up the extent records above the changes to reflect
1190  * the new changes.
1191  *
1192  * left_rec: the record on the left.
1193  * left_child_el: is the child list pointed to by left_rec
1194  * right_rec: the record to the right of left_rec
1195  * right_child_el: is the child list pointed to by right_rec
1196  *
1197  * By definition, this only works on interior nodes.
1198  */
1199 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1200                                   struct ocfs2_extent_list *left_child_el,
1201                                   struct ocfs2_extent_rec *right_rec,
1202                                   struct ocfs2_extent_list *right_child_el)
1203 {
1204         u32 left_clusters, right_end;
1205
1206         /*
1207          * Interior nodes never have holes. Their cpos is the cpos of
1208          * the leftmost record in their child list. Their cluster
1209          * count covers the full theoretical range of their child list
1210          * - the range between their cpos and the cpos of the record
1211          * immediately to their right.
1212          */
1213         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1214         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216
1217         /*
1218          * Calculate the rightmost cluster count boundary before
1219          * moving cpos - we will need to adjust clusters after
1220          * updating e_cpos to keep the same highest cluster count.
1221          */
1222         right_end = le32_to_cpu(right_rec->e_cpos);
1223         right_end += le32_to_cpu(right_rec->e_int_clusters);
1224
1225         right_rec->e_cpos = left_rec->e_cpos;
1226         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1227
1228         right_end -= le32_to_cpu(right_rec->e_cpos);
1229         right_rec->e_int_clusters = cpu_to_le32(right_end);
1230 }
1231
1232 /*
1233  * Adjust the adjacent root node records involved in a
1234  * rotation. left_el_blkno is passed in as a key so that we can easily
1235  * find it's index in the root list.
1236  */
1237 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1238                                       struct ocfs2_extent_list *left_el,
1239                                       struct ocfs2_extent_list *right_el,
1240                                       u64 left_el_blkno)
1241 {
1242         int i;
1243
1244         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1245                le16_to_cpu(left_el->l_tree_depth));
1246
1247         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1248                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1249                         break;
1250         }
1251
1252         /*
1253          * The path walking code should have never returned a root and
1254          * two paths which are not adjacent.
1255          */
1256         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1257
1258         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1259                                       &root_el->l_recs[i + 1], right_el);
1260 }
1261
1262 /*
1263  * We've changed a leaf block (in right_path) and need to reflect that
1264  * change back up the subtree.
1265  *
1266  * This happens in multiple places:
1267  *   - When we've moved an extent record from the left path leaf to the right
1268  *     path leaf to make room for an empty extent in the left path leaf.
1269  *   - When our insert into the right path leaf is at the leftmost edge
1270  *     and requires an update of the path immediately to it's left. This
1271  *     can occur at the end of some types of rotation and appending inserts.
1272  */
1273 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1274                                        struct ocfs2_path *left_path,
1275                                        struct ocfs2_path *right_path,
1276                                        int subtree_index)
1277 {
1278         int ret, i, idx;
1279         struct ocfs2_extent_list *el, *left_el, *right_el;
1280         struct ocfs2_extent_rec *left_rec, *right_rec;
1281         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1282
1283         /*
1284          * Update the counts and position values within all the
1285          * interior nodes to reflect the leaf rotation we just did.
1286          *
1287          * The root node is handled below the loop.
1288          *
1289          * We begin the loop with right_el and left_el pointing to the
1290          * leaf lists and work our way up.
1291          *
1292          * NOTE: within this loop, left_el and right_el always refer
1293          * to the *child* lists.
1294          */
1295         left_el = path_leaf_el(left_path);
1296         right_el = path_leaf_el(right_path);
1297         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1298                 mlog(0, "Adjust records at index %u\n", i);
1299
1300                 /*
1301                  * One nice property of knowing that all of these
1302                  * nodes are below the root is that we only deal with
1303                  * the leftmost right node record and the rightmost
1304                  * left node record.
1305                  */
1306                 el = left_path->p_node[i].el;
1307                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1308                 left_rec = &el->l_recs[idx];
1309
1310                 el = right_path->p_node[i].el;
1311                 right_rec = &el->l_recs[0];
1312
1313                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1314                                               right_el);
1315
1316                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1317                 if (ret)
1318                         mlog_errno(ret);
1319
1320                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1321                 if (ret)
1322                         mlog_errno(ret);
1323
1324                 /*
1325                  * Setup our list pointers now so that the current
1326                  * parents become children in the next iteration.
1327                  */
1328                 left_el = left_path->p_node[i].el;
1329                 right_el = right_path->p_node[i].el;
1330         }
1331
1332         /*
1333          * At the root node, adjust the two adjacent records which
1334          * begin our path to the leaves.
1335          */
1336
1337         el = left_path->p_node[subtree_index].el;
1338         left_el = left_path->p_node[subtree_index + 1].el;
1339         right_el = right_path->p_node[subtree_index + 1].el;
1340
1341         ocfs2_adjust_root_records(el, left_el, right_el,
1342                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1343
1344         root_bh = left_path->p_node[subtree_index].bh;
1345
1346         ret = ocfs2_journal_dirty(handle, root_bh);
1347         if (ret)
1348                 mlog_errno(ret);
1349 }
1350
1351 static int ocfs2_rotate_subtree_right(struct inode *inode,
1352                                       handle_t *handle,
1353                                       struct ocfs2_path *left_path,
1354                                       struct ocfs2_path *right_path,
1355                                       int subtree_index)
1356 {
1357         int ret, i;
1358         struct buffer_head *right_leaf_bh;
1359         struct buffer_head *left_leaf_bh = NULL;
1360         struct buffer_head *root_bh;
1361         struct ocfs2_extent_list *right_el, *left_el;
1362         struct ocfs2_extent_rec move_rec;
1363
1364         left_leaf_bh = path_leaf_bh(left_path);
1365         left_el = path_leaf_el(left_path);
1366
1367         if (left_el->l_next_free_rec != left_el->l_count) {
1368                 ocfs2_error(inode->i_sb,
1369                             "Inode %llu has non-full interior leaf node %llu"
1370                             "(next free = %u)",
1371                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1372                             (unsigned long long)left_leaf_bh->b_blocknr,
1373                             le16_to_cpu(left_el->l_next_free_rec));
1374                 return -EROFS;
1375         }
1376
1377         /*
1378          * This extent block may already have an empty record, so we
1379          * return early if so.
1380          */
1381         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1382                 return 0;
1383
1384         root_bh = left_path->p_node[subtree_index].bh;
1385         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1386
1387         ret = ocfs2_journal_access(handle, inode, root_bh,
1388                                    OCFS2_JOURNAL_ACCESS_WRITE);
1389         if (ret) {
1390                 mlog_errno(ret);
1391                 goto out;
1392         }
1393
1394         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1395                 ret = ocfs2_journal_access(handle, inode,
1396                                            right_path->p_node[i].bh,
1397                                            OCFS2_JOURNAL_ACCESS_WRITE);
1398                 if (ret) {
1399                         mlog_errno(ret);
1400                         goto out;
1401                 }
1402
1403                 ret = ocfs2_journal_access(handle, inode,
1404                                            left_path->p_node[i].bh,
1405                                            OCFS2_JOURNAL_ACCESS_WRITE);
1406                 if (ret) {
1407                         mlog_errno(ret);
1408                         goto out;
1409                 }
1410         }
1411
1412         right_leaf_bh = path_leaf_bh(right_path);
1413         right_el = path_leaf_el(right_path);
1414
1415         /* This is a code error, not a disk corruption. */
1416         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1417                         "because rightmost leaf block %llu is empty\n",
1418                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1419                         (unsigned long long)right_leaf_bh->b_blocknr);
1420
1421         ocfs2_create_empty_extent(right_el);
1422
1423         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1424         if (ret) {
1425                 mlog_errno(ret);
1426                 goto out;
1427         }
1428
1429         /* Do the copy now. */
1430         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1431         move_rec = left_el->l_recs[i];
1432         right_el->l_recs[0] = move_rec;
1433
1434         /*
1435          * Clear out the record we just copied and shift everything
1436          * over, leaving an empty extent in the left leaf.
1437          *
1438          * We temporarily subtract from next_free_rec so that the
1439          * shift will lose the tail record (which is now defunct).
1440          */
1441         le16_add_cpu(&left_el->l_next_free_rec, -1);
1442         ocfs2_shift_records_right(left_el);
1443         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1444         le16_add_cpu(&left_el->l_next_free_rec, 1);
1445
1446         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1447         if (ret) {
1448                 mlog_errno(ret);
1449                 goto out;
1450         }
1451
1452         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1453                                 subtree_index);
1454
1455 out:
1456         return ret;
1457 }
1458
1459 /*
1460  * Given a full path, determine what cpos value would return us a path
1461  * containing the leaf immediately to the left of the current one.
1462  *
1463  * Will return zero if the path passed in is already the leftmost path.
1464  */
1465 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1466                                          struct ocfs2_path *path, u32 *cpos)
1467 {
1468         int i, j, ret = 0;
1469         u64 blkno;
1470         struct ocfs2_extent_list *el;
1471
1472         BUG_ON(path->p_tree_depth == 0);
1473
1474         *cpos = 0;
1475
1476         blkno = path_leaf_bh(path)->b_blocknr;
1477
1478         /* Start at the tree node just above the leaf and work our way up. */
1479         i = path->p_tree_depth - 1;
1480         while (i >= 0) {
1481                 el = path->p_node[i].el;
1482
1483                 /*
1484                  * Find the extent record just before the one in our
1485                  * path.
1486                  */
1487                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1488                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1489                                 if (j == 0) {
1490                                         if (i == 0) {
1491                                                 /*
1492                                                  * We've determined that the
1493                                                  * path specified is already
1494                                                  * the leftmost one - return a
1495                                                  * cpos of zero.
1496                                                  */
1497                                                 goto out;
1498                                         }
1499                                         /*
1500                                          * The leftmost record points to our
1501                                          * leaf - we need to travel up the
1502                                          * tree one level.
1503                                          */
1504                                         goto next_node;
1505                                 }
1506
1507                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1508                                 *cpos = *cpos + ocfs2_rec_clusters(el,
1509                                                            &el->l_recs[j - 1]);
1510                                 *cpos = *cpos - 1;
1511                                 goto out;
1512                         }
1513                 }
1514
1515                 /*
1516                  * If we got here, we never found a valid node where
1517                  * the tree indicated one should be.
1518                  */
1519                 ocfs2_error(sb,
1520                             "Invalid extent tree at extent block %llu\n",
1521                             (unsigned long long)blkno);
1522                 ret = -EROFS;
1523                 goto out;
1524
1525 next_node:
1526                 blkno = path->p_node[i].bh->b_blocknr;
1527                 i--;
1528         }
1529
1530 out:
1531         return ret;
1532 }
1533
1534 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1535                                            struct ocfs2_path *path)
1536 {
1537         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1538
1539         if (handle->h_buffer_credits < credits)
1540                 return ocfs2_extend_trans(handle, credits);
1541
1542         return 0;
1543 }
1544
1545 /*
1546  * Trap the case where we're inserting into the theoretical range past
1547  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1548  * whose cpos is less than ours into the right leaf.
1549  *
1550  * It's only necessary to look at the rightmost record of the left
1551  * leaf because the logic that calls us should ensure that the
1552  * theoretical ranges in the path components above the leaves are
1553  * correct.
1554  */
1555 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1556                                                  u32 insert_cpos)
1557 {
1558         struct ocfs2_extent_list *left_el;
1559         struct ocfs2_extent_rec *rec;
1560         int next_free;
1561
1562         left_el = path_leaf_el(left_path);
1563         next_free = le16_to_cpu(left_el->l_next_free_rec);
1564         rec = &left_el->l_recs[next_free - 1];
1565
1566         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1567                 return 1;
1568         return 0;
1569 }
1570
1571 /*
1572  * Rotate all the records in a btree right one record, starting at insert_cpos.
1573  *
1574  * The path to the rightmost leaf should be passed in.
1575  *
1576  * The array is assumed to be large enough to hold an entire path (tree depth).
1577  *
1578  * Upon succesful return from this function:
1579  *
1580  * - The 'right_path' array will contain a path to the leaf block
1581  *   whose range contains e_cpos.
1582  * - That leaf block will have a single empty extent in list index 0.
1583  * - In the case that the rotation requires a post-insert update,
1584  *   *ret_left_path will contain a valid path which can be passed to
1585  *   ocfs2_insert_path().
1586  */
1587 static int ocfs2_rotate_tree_right(struct inode *inode,
1588                                    handle_t *handle,
1589                                    u32 insert_cpos,
1590                                    struct ocfs2_path *right_path,
1591                                    struct ocfs2_path **ret_left_path)
1592 {
1593         int ret, start;
1594         u32 cpos;
1595         struct ocfs2_path *left_path = NULL;
1596
1597         *ret_left_path = NULL;
1598
1599         left_path = ocfs2_new_path(path_root_bh(right_path),
1600                                    path_root_el(right_path));
1601         if (!left_path) {
1602                 ret = -ENOMEM;
1603                 mlog_errno(ret);
1604                 goto out;
1605         }
1606
1607         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1608         if (ret) {
1609                 mlog_errno(ret);
1610                 goto out;
1611         }
1612
1613         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1614
1615         /*
1616          * What we want to do here is:
1617          *
1618          * 1) Start with the rightmost path.
1619          *
1620          * 2) Determine a path to the leaf block directly to the left
1621          *    of that leaf.
1622          *
1623          * 3) Determine the 'subtree root' - the lowest level tree node
1624          *    which contains a path to both leaves.
1625          *
1626          * 4) Rotate the subtree.
1627          *
1628          * 5) Find the next subtree by considering the left path to be
1629          *    the new right path.
1630          *
1631          * The check at the top of this while loop also accepts
1632          * insert_cpos == cpos because cpos is only a _theoretical_
1633          * value to get us the left path - insert_cpos might very well
1634          * be filling that hole.
1635          *
1636          * Stop at a cpos of '0' because we either started at the
1637          * leftmost branch (i.e., a tree with one branch and a
1638          * rotation inside of it), or we've gone as far as we can in
1639          * rotating subtrees.
1640          */
1641         while (cpos && insert_cpos <= cpos) {
1642                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1643                      insert_cpos, cpos);
1644
1645                 ret = ocfs2_find_path(inode, left_path, cpos);
1646                 if (ret) {
1647                         mlog_errno(ret);
1648                         goto out;
1649                 }
1650
1651                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1652                                 path_leaf_bh(right_path),
1653                                 "Inode %lu: error during insert of %u "
1654                                 "(left path cpos %u) results in two identical "
1655                                 "paths ending at %llu\n",
1656                                 inode->i_ino, insert_cpos, cpos,
1657                                 (unsigned long long)
1658                                 path_leaf_bh(left_path)->b_blocknr);
1659
1660                 if (ocfs2_rotate_requires_path_adjustment(left_path,
1661                                                           insert_cpos)) {
1662                         mlog(0, "Path adjustment required\n");
1663
1664                         /*
1665                          * We've rotated the tree as much as we
1666                          * should. The rest is up to
1667                          * ocfs2_insert_path() to complete, after the
1668                          * record insertion. We indicate this
1669                          * situation by returning the left path.
1670                          *
1671                          * The reason we don't adjust the records here
1672                          * before the record insert is that an error
1673                          * later might break the rule where a parent
1674                          * record e_cpos will reflect the actual
1675                          * e_cpos of the 1st nonempty record of the
1676                          * child list.
1677                          */
1678                         *ret_left_path = left_path;
1679                         goto out_ret_path;
1680                 }
1681
1682                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1683
1684                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1685                      start,
1686                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1687                      right_path->p_tree_depth);
1688
1689                 ret = ocfs2_extend_rotate_transaction(handle, start,
1690                                                       right_path);
1691                 if (ret) {
1692                         mlog_errno(ret);
1693                         goto out;
1694                 }
1695
1696                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1697                                                  right_path, start);
1698                 if (ret) {
1699                         mlog_errno(ret);
1700                         goto out;
1701                 }
1702
1703                 /*
1704                  * There is no need to re-read the next right path
1705                  * as we know that it'll be our current left
1706                  * path. Optimize by copying values instead.
1707                  */
1708                 ocfs2_mv_path(right_path, left_path);
1709
1710                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1711                                                     &cpos);
1712                 if (ret) {
1713                         mlog_errno(ret);
1714                         goto out;
1715                 }
1716         }
1717
1718 out:
1719         ocfs2_free_path(left_path);
1720
1721 out_ret_path:
1722         return ret;
1723 }
1724
1725 /*
1726  * Do the final bits of extent record insertion at the target leaf
1727  * list. If this leaf is part of an allocation tree, it is assumed
1728  * that the tree above has been prepared.
1729  */
1730 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1731                                  struct ocfs2_extent_list *el,
1732                                  struct ocfs2_insert_type *insert,
1733                                  struct inode *inode)
1734 {
1735         int i = insert->ins_contig_index;
1736         unsigned int range;
1737         struct ocfs2_extent_rec *rec;
1738
1739         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740
1741         /*
1742          * Contiguous insert - either left or right.
1743          */
1744         if (insert->ins_contig != CONTIG_NONE) {
1745                 rec = &el->l_recs[i];
1746                 if (insert->ins_contig == CONTIG_LEFT) {
1747                         rec->e_blkno = insert_rec->e_blkno;
1748                         rec->e_cpos = insert_rec->e_cpos;
1749                 }
1750                 le16_add_cpu(&rec->e_leaf_clusters,
1751                              le16_to_cpu(insert_rec->e_leaf_clusters));
1752                 return;
1753         }
1754
1755         /*
1756          * Handle insert into an empty leaf.
1757          */
1758         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1759             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1760              ocfs2_is_empty_extent(&el->l_recs[0]))) {
1761                 el->l_recs[0] = *insert_rec;
1762                 el->l_next_free_rec = cpu_to_le16(1);
1763                 return;
1764         }
1765
1766         /*
1767          * Appending insert.
1768          */
1769         if (insert->ins_appending == APPEND_TAIL) {
1770                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1771                 rec = &el->l_recs[i];
1772                 range = le32_to_cpu(rec->e_cpos)
1773                         + le16_to_cpu(rec->e_leaf_clusters);
1774                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1775
1776                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1777                                 le16_to_cpu(el->l_count),
1778                                 "inode %lu, depth %u, count %u, next free %u, "
1779                                 "rec.cpos %u, rec.clusters %u, "
1780                                 "insert.cpos %u, insert.clusters %u\n",
1781                                 inode->i_ino,
1782                                 le16_to_cpu(el->l_tree_depth),
1783                                 le16_to_cpu(el->l_count),
1784                                 le16_to_cpu(el->l_next_free_rec),
1785                                 le32_to_cpu(el->l_recs[i].e_cpos),
1786                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
1787                                 le32_to_cpu(insert_rec->e_cpos),
1788                                 le16_to_cpu(insert_rec->e_leaf_clusters));
1789                 i++;
1790                 el->l_recs[i] = *insert_rec;
1791                 le16_add_cpu(&el->l_next_free_rec, 1);
1792                 return;
1793         }
1794
1795         /*
1796          * Ok, we have to rotate.
1797          *
1798          * At this point, it is safe to assume that inserting into an
1799          * empty leaf and appending to a leaf have both been handled
1800          * above.
1801          *
1802          * This leaf needs to have space, either by the empty 1st
1803          * extent record, or by virtue of an l_next_rec < l_count.
1804          */
1805         ocfs2_rotate_leaf(el, insert_rec);
1806 }
1807
1808 static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1809                                                 struct ocfs2_dinode *di,
1810                                                 u32 clusters)
1811 {
1812         le32_add_cpu(&di->i_clusters, clusters);
1813         spin_lock(&OCFS2_I(inode)->ip_lock);
1814         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1815         spin_unlock(&OCFS2_I(inode)->ip_lock);
1816 }
1817
1818 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819                                     struct ocfs2_extent_rec *insert_rec,
1820                                     struct ocfs2_path *right_path,
1821                                     struct ocfs2_path **ret_left_path)
1822 {
1823         int ret, i, next_free;
1824         struct buffer_head *bh;
1825         struct ocfs2_extent_list *el;
1826         struct ocfs2_path *left_path = NULL;
1827
1828         *ret_left_path = NULL;
1829
1830         /*
1831          * This shouldn't happen for non-trees. The extent rec cluster
1832          * count manipulation below only works for interior nodes.
1833          */
1834         BUG_ON(right_path->p_tree_depth == 0);
1835
1836         /*
1837          * If our appending insert is at the leftmost edge of a leaf,
1838          * then we might need to update the rightmost records of the
1839          * neighboring path.
1840          */
1841         el = path_leaf_el(right_path);
1842         next_free = le16_to_cpu(el->l_next_free_rec);
1843         if (next_free == 0 ||
1844             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1845                 u32 left_cpos;
1846
1847                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1848                                                     &left_cpos);
1849                 if (ret) {
1850                         mlog_errno(ret);
1851                         goto out;
1852                 }
1853
1854                 mlog(0, "Append may need a left path update. cpos: %u, "
1855                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1856                      left_cpos);
1857
1858                 /*
1859                  * No need to worry if the append is already in the
1860                  * leftmost leaf.
1861                  */
1862                 if (left_cpos) {
1863                         left_path = ocfs2_new_path(path_root_bh(right_path),
1864                                                    path_root_el(right_path));
1865                         if (!left_path) {
1866                                 ret = -ENOMEM;
1867                                 mlog_errno(ret);
1868                                 goto out;
1869                         }
1870
1871                         ret = ocfs2_find_path(inode, left_path, left_cpos);
1872                         if (ret) {
1873                                 mlog_errno(ret);
1874                                 goto out;
1875                         }
1876
1877                         /*
1878                          * ocfs2_insert_path() will pass the left_path to the
1879                          * journal for us.
1880                          */
1881                 }
1882         }
1883
1884         ret = ocfs2_journal_access_path(inode, handle, right_path);
1885         if (ret) {
1886                 mlog_errno(ret);
1887                 goto out;
1888         }
1889
1890         el = path_root_el(right_path);
1891         bh = path_root_bh(right_path);
1892         i = 0;
1893         while (1) {
1894                 struct ocfs2_extent_rec *rec;
1895
1896                 next_free = le16_to_cpu(el->l_next_free_rec);
1897                 if (next_free == 0) {
1898                         ocfs2_error(inode->i_sb,
1899                                     "Dinode %llu has a bad extent list",
1900                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901                         ret = -EIO;
1902                         goto out;
1903                 }
1904
1905                 rec = &el->l_recs[next_free - 1];
1906
1907                 rec->e_int_clusters = insert_rec->e_cpos;
1908                 le32_add_cpu(&rec->e_int_clusters,
1909                              le16_to_cpu(insert_rec->e_leaf_clusters));
1910                 le32_add_cpu(&rec->e_int_clusters,
1911                              -le32_to_cpu(rec->e_cpos));
1912
1913                 ret = ocfs2_journal_dirty(handle, bh);
1914                 if (ret)
1915                         mlog_errno(ret);
1916
1917                 /* Don't touch the leaf node */
1918                 if (++i >= right_path->p_tree_depth)
1919                         break;
1920
1921                 bh = right_path->p_node[i].bh;
1922                 el = right_path->p_node[i].el;
1923         }
1924
1925         *ret_left_path = left_path;
1926         ret = 0;
1927 out:
1928         if (ret != 0)
1929                 ocfs2_free_path(left_path);
1930
1931         return ret;
1932 }
1933
1934 /*
1935  * This function only does inserts on an allocation b-tree. For dinode
1936  * lists, ocfs2_insert_at_leaf() is called directly.
1937  *
1938  * right_path is the path we want to do the actual insert
1939  * in. left_path should only be passed in if we need to update that
1940  * portion of the tree after an edge insert.
1941  */
1942 static int ocfs2_insert_path(struct inode *inode,
1943                              handle_t *handle,
1944                              struct ocfs2_path *left_path,
1945                              struct ocfs2_path *right_path,
1946                              struct ocfs2_extent_rec *insert_rec,
1947                              struct ocfs2_insert_type *insert)
1948 {
1949         int ret, subtree_index;
1950         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951         struct ocfs2_extent_list *el;
1952
1953         /*
1954          * Pass both paths to the journal. The majority of inserts
1955          * will be touching all components anyway.
1956          */
1957         ret = ocfs2_journal_access_path(inode, handle, right_path);
1958         if (ret < 0) {
1959                 mlog_errno(ret);
1960                 goto out;
1961         }
1962
1963         if (left_path) {
1964                 int credits = handle->h_buffer_credits;
1965
1966                 /*
1967                  * There's a chance that left_path got passed back to
1968                  * us without being accounted for in the
1969                  * journal. Extend our transaction here to be sure we
1970                  * can change those blocks.
1971                  */
1972                 credits += left_path->p_tree_depth;
1973
1974                 ret = ocfs2_extend_trans(handle, credits);
1975                 if (ret < 0) {
1976                         mlog_errno(ret);
1977                         goto out;
1978                 }
1979
1980                 ret = ocfs2_journal_access_path(inode, handle, left_path);
1981                 if (ret < 0) {
1982                         mlog_errno(ret);
1983                         goto out;
1984                 }
1985         }
1986
1987         el = path_leaf_el(right_path);
1988
1989         ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990         ret = ocfs2_journal_dirty(handle, leaf_bh);
1991         if (ret)
1992                 mlog_errno(ret);
1993
1994         if (left_path) {
1995                 /*
1996                  * The rotate code has indicated that we need to fix
1997                  * up portions of the tree after the insert.
1998                  *
1999                  * XXX: Should we extend the transaction here?
2000                  */
2001                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
2002                                                         right_path);
2003                 ocfs2_complete_edge_insert(inode, handle, left_path,
2004                                            right_path, subtree_index);
2005         }
2006
2007         ret = 0;
2008 out:
2009         return ret;
2010 }
2011
2012 static int ocfs2_do_insert_extent(struct inode *inode,
2013                                   handle_t *handle,
2014                                   struct buffer_head *di_bh,
2015                                   struct ocfs2_extent_rec *insert_rec,
2016                                   struct ocfs2_insert_type *type)
2017 {
2018         int ret, rotate = 0;
2019         u32 cpos;
2020         struct ocfs2_path *right_path = NULL;
2021         struct ocfs2_path *left_path = NULL;
2022         struct ocfs2_dinode *di;
2023         struct ocfs2_extent_list *el;
2024
2025         di = (struct ocfs2_dinode *) di_bh->b_data;
2026         el = &di->id2.i_list;
2027
2028         ret = ocfs2_journal_access(handle, inode, di_bh,
2029                                    OCFS2_JOURNAL_ACCESS_WRITE);
2030         if (ret) {
2031                 mlog_errno(ret);
2032                 goto out;
2033         }
2034
2035         if (le16_to_cpu(el->l_tree_depth) == 0) {
2036                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2037                 goto out_update_clusters;
2038         }
2039
2040         right_path = ocfs2_new_inode_path(di_bh);
2041         if (!right_path) {
2042                 ret = -ENOMEM;
2043                 mlog_errno(ret);
2044                 goto out;
2045         }
2046
2047         /*
2048          * Determine the path to start with. Rotations need the
2049          * rightmost path, everything else can go directly to the
2050          * target leaf.
2051          */
2052         cpos = le32_to_cpu(insert_rec->e_cpos);
2053         if (type->ins_appending == APPEND_NONE &&
2054             type->ins_contig == CONTIG_NONE) {
2055                 rotate = 1;
2056                 cpos = UINT_MAX;
2057         }
2058
2059         ret = ocfs2_find_path(inode, right_path, cpos);
2060         if (ret) {
2061                 mlog_errno(ret);
2062                 goto out;
2063         }
2064
2065         /*
2066          * Rotations and appends need special treatment - they modify
2067          * parts of the tree's above them.
2068          *
2069          * Both might pass back a path immediate to the left of the
2070          * one being inserted to. This will be cause
2071          * ocfs2_insert_path() to modify the rightmost records of
2072          * left_path to account for an edge insert.
2073          *
2074          * XXX: When modifying this code, keep in mind that an insert
2075          * can wind up skipping both of these two special cases...
2076          */
2077         if (rotate) {
2078                 ret = ocfs2_rotate_tree_right(inode, handle,
2079                                               le32_to_cpu(insert_rec->e_cpos),
2080                                               right_path, &left_path);
2081                 if (ret) {
2082                         mlog_errno(ret);
2083                         goto out;
2084                 }
2085         } else if (type->ins_appending == APPEND_TAIL
2086                    && type->ins_contig != CONTIG_LEFT) {
2087                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2088                                                right_path, &left_path);
2089                 if (ret) {
2090                         mlog_errno(ret);
2091                         goto out;
2092                 }
2093         }
2094
2095         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2096                                 insert_rec, type);
2097         if (ret) {
2098                 mlog_errno(ret);
2099                 goto out;
2100         }
2101
2102 out_update_clusters:
2103         ocfs2_update_dinode_clusters(inode, di,
2104                                      le16_to_cpu(insert_rec->e_leaf_clusters));
2105
2106         ret = ocfs2_journal_dirty(handle, di_bh);
2107         if (ret)
2108                 mlog_errno(ret);
2109
2110 out:
2111         ocfs2_free_path(left_path);
2112         ocfs2_free_path(right_path);
2113
2114         return ret;
2115 }
2116
2117 static void ocfs2_figure_contig_type(struct inode *inode,
2118                                      struct ocfs2_insert_type *insert,
2119                                      struct ocfs2_extent_list *el,
2120                                      struct ocfs2_extent_rec *insert_rec)
2121 {
2122         int i;
2123         enum ocfs2_contig_type contig_type = CONTIG_NONE;
2124
2125         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2126
2127         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2128                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2129                                                   insert_rec);
2130                 if (contig_type != CONTIG_NONE) {
2131                         insert->ins_contig_index = i;
2132                         break;
2133                 }
2134         }
2135         insert->ins_contig = contig_type;
2136 }
2137
2138 /*
2139  * This should only be called against the righmost leaf extent list.
2140  *
2141  * ocfs2_figure_appending_type() will figure out whether we'll have to
2142  * insert at the tail of the rightmost leaf.
2143  *
2144  * This should also work against the dinode list for tree's with 0
2145  * depth. If we consider the dinode list to be the rightmost leaf node
2146  * then the logic here makes sense.
2147  */
2148 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2149                                         struct ocfs2_extent_list *el,
2150                                         struct ocfs2_extent_rec *insert_rec)
2151 {
2152         int i;
2153         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2154         struct ocfs2_extent_rec *rec;
2155
2156         insert->ins_appending = APPEND_NONE;
2157
2158         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2159
2160         if (!el->l_next_free_rec)
2161                 goto set_tail_append;
2162
2163         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2164                 /* Were all records empty? */
2165                 if (le16_to_cpu(el->l_next_free_rec) == 1)
2166                         goto set_tail_append;
2167         }
2168
2169         i = le16_to_cpu(el->l_next_free_rec) - 1;
2170         rec = &el->l_recs[i];
2171
2172         if (cpos >=
2173             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
2174                 goto set_tail_append;
2175
2176         return;
2177
2178 set_tail_append:
2179         insert->ins_appending = APPEND_TAIL;
2180 }
2181
2182 /*
2183  * Helper function called at the begining of an insert.
2184  *
2185  * This computes a few things that are commonly used in the process of
2186  * inserting into the btree:
2187  *   - Whether the new extent is contiguous with an existing one.
2188  *   - The current tree depth.
2189  *   - Whether the insert is an appending one.
2190  *   - The total # of free records in the tree.
2191  *
2192  * All of the information is stored on the ocfs2_insert_type
2193  * structure.
2194  */
2195 static int ocfs2_figure_insert_type(struct inode *inode,
2196                                     struct buffer_head *di_bh,
2197                                     struct buffer_head **last_eb_bh,
2198                                     struct ocfs2_extent_rec *insert_rec,
2199                                     struct ocfs2_insert_type *insert)
2200 {
2201         int ret;
2202         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2203         struct ocfs2_extent_block *eb;
2204         struct ocfs2_extent_list *el;
2205         struct ocfs2_path *path = NULL;
2206         struct buffer_head *bh = NULL;
2207
2208         el = &di->id2.i_list;
2209         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210
2211         if (el->l_tree_depth) {
2212                 /*
2213                  * If we have tree depth, we read in the
2214                  * rightmost extent block ahead of time as
2215                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
2216                  * may want it later.
2217                  */
2218                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2219                                        le64_to_cpu(di->i_last_eb_blk), &bh,
2220                                        OCFS2_BH_CACHED, inode);
2221                 if (ret) {
2222                         mlog_exit(ret);
2223                         goto out;
2224                 }
2225                 eb = (struct ocfs2_extent_block *) bh->b_data;
2226                 el = &eb->h_list;
2227         }
2228
2229         /*
2230          * Unless we have a contiguous insert, we'll need to know if
2231          * there is room left in our allocation tree for another
2232          * extent record.
2233          *
2234          * XXX: This test is simplistic, we can search for empty
2235          * extent records too.
2236          */
2237         insert->ins_free_records = le16_to_cpu(el->l_count) -
2238                 le16_to_cpu(el->l_next_free_rec);
2239
2240         if (!insert->ins_tree_depth) {
2241                 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2242                 ocfs2_figure_appending_type(insert, el, insert_rec);
2243                 return 0;
2244         }
2245
2246         path = ocfs2_new_inode_path(di_bh);
2247         if (!path) {
2248                 ret = -ENOMEM;
2249                 mlog_errno(ret);
2250                 goto out;
2251         }
2252
2253         /*
2254          * In the case that we're inserting past what the tree
2255          * currently accounts for, ocfs2_find_path() will return for
2256          * us the rightmost tree path. This is accounted for below in
2257          * the appending code.
2258          */
2259         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2260         if (ret) {
2261                 mlog_errno(ret);
2262                 goto out;
2263         }
2264
2265         el = path_leaf_el(path);
2266
2267         /*
2268          * Now that we have the path, there's two things we want to determine:
2269          * 1) Contiguousness (also set contig_index if this is so)
2270          *
2271          * 2) Are we doing an append? We can trivially break this up
2272          *     into two types of appends: simple record append, or a
2273          *     rotate inside the tail leaf.
2274          */
2275         ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2276
2277         /*
2278          * The insert code isn't quite ready to deal with all cases of
2279          * left contiguousness. Specifically, if it's an insert into
2280          * the 1st record in a leaf, it will require the adjustment of
2281          * cluster count on the last record of the path directly to it's
2282          * left. For now, just catch that case and fool the layers
2283          * above us. This works just fine for tree_depth == 0, which
2284          * is why we allow that above.
2285          */
2286         if (insert->ins_contig == CONTIG_LEFT &&
2287             insert->ins_contig_index == 0)
2288                 insert->ins_contig = CONTIG_NONE;
2289
2290         /*
2291          * Ok, so we can simply compare against last_eb to figure out
2292          * whether the path doesn't exist. This will only happen in
2293          * the case that we're doing a tail append, so maybe we can
2294          * take advantage of that information somehow.
2295          */
2296         if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2297                 /*
2298                  * Ok, ocfs2_find_path() returned us the rightmost
2299                  * tree path. This might be an appending insert. There are
2300                  * two cases:
2301                  *    1) We're doing a true append at the tail:
2302                  *      -This might even be off the end of the leaf
2303                  *    2) We're "appending" by rotating in the tail
2304                  */
2305                 ocfs2_figure_appending_type(insert, el, insert_rec);
2306         }
2307
2308 out:
2309         ocfs2_free_path(path);
2310
2311         if (ret == 0)
2312                 *last_eb_bh = bh;
2313         else
2314                 brelse(bh);
2315         return ret;
2316 }
2317
2318 /*
2319  * Insert an extent into an inode btree.
2320  *
2321  * The caller needs to update fe->i_clusters
2322  */
2323 int ocfs2_insert_extent(struct ocfs2_super *osb,
2324                         handle_t *handle,
2325                         struct inode *inode,
2326                         struct buffer_head *fe_bh,
2327                         u32 cpos,
2328                         u64 start_blk,
2329                         u32 new_clusters,
2330                         struct ocfs2_alloc_context *meta_ac)
2331 {
2332         int status, shift;
2333         struct buffer_head *last_eb_bh = NULL;
2334         struct buffer_head *bh = NULL;
2335         struct ocfs2_insert_type insert = {0, };
2336         struct ocfs2_extent_rec rec;
2337
2338         mlog(0, "add %u clusters at position %u to inode %llu\n",
2339              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2340
2341         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2342                         (OCFS2_I(inode)->ip_clusters != cpos),
2343                         "Device %s, asking for sparse allocation: inode %llu, "
2344                         "cpos %u, clusters %u\n",
2345                         osb->dev_str,
2346                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2347                         OCFS2_I(inode)->ip_clusters);
2348
2349         memset(&rec, 0, sizeof(rec));
2350         rec.e_cpos = cpu_to_le32(cpos);
2351         rec.e_blkno = cpu_to_le64(start_blk);
2352         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
2353
2354         status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355                                           &insert);
2356         if (status < 0) {
2357                 mlog_errno(status);
2358                 goto bail;
2359         }
2360
2361         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
2362              "Insert.contig_index: %d, Insert.free_records: %d, "
2363              "Insert.tree_depth: %d\n",
2364              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365              insert.ins_free_records, insert.ins_tree_depth);
2366
2367         /*
2368          * Avoid growing the tree unless we're out of records and the
2369          * insert type requres one.
2370          */
2371         if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
2372                 goto out_add;
2373
2374         shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2375         if (shift < 0) {
2376                 status = shift;
2377                 mlog_errno(status);
2378                 goto bail;
2379         }
2380
2381         /* We traveled all the way to the bottom of the allocation tree
2382          * and didn't find room for any more extents - we need to add
2383          * another tree level */
2384         if (shift) {
2385                 BUG_ON(bh);
2386                 mlog(0, "need to shift tree depth "
2387                      "(current = %d)\n", insert.ins_tree_depth);
2388
2389                 /* ocfs2_shift_tree_depth will return us a buffer with
2390                  * the new extent block (so we can pass that to
2391                  * ocfs2_add_branch). */
2392                 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2393                                                 meta_ac, &bh);
2394                 if (status < 0) {
2395                         mlog_errno(status);
2396                         goto bail;
2397                 }
2398                 insert.ins_tree_depth++;
2399                 /* Special case: we have room now if we shifted from
2400                  * tree_depth 0 */
2401                 if (insert.ins_tree_depth == 1)
2402                         goto out_add;
2403         }
2404
2405         /* call ocfs2_add_branch to add the final part of the tree with
2406          * the new data. */
2407         mlog(0, "add branch. bh = %p\n", bh);
2408         status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2409                                   meta_ac);
2410         if (status < 0) {
2411                 mlog_errno(status);
2412                 goto bail;
2413         }
2414
2415 out_add:
2416         /* Finally, we can add clusters. This might rotate the tree for us. */
2417         status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2418         if (status < 0)
2419                 mlog_errno(status);
2420         else
2421                 ocfs2_extent_map_insert_rec(inode, &rec);
2422
2423 bail:
2424         if (bh)
2425                 brelse(bh);
2426
2427         if (last_eb_bh)
2428                 brelse(last_eb_bh);
2429
2430         mlog_exit(status);
2431         return status;
2432 }
2433
2434 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2435 {
2436         struct buffer_head *tl_bh = osb->osb_tl_bh;
2437         struct ocfs2_dinode *di;
2438         struct ocfs2_truncate_log *tl;
2439
2440         di = (struct ocfs2_dinode *) tl_bh->b_data;
2441         tl = &di->id2.i_dealloc;
2442
2443         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
2444                         "slot %d, invalid truncate log parameters: used = "
2445                         "%u, count = %u\n", osb->slot_num,
2446                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
2447         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
2448 }
2449
2450 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2451                                            unsigned int new_start)
2452 {
2453         unsigned int tail_index;
2454         unsigned int current_tail;
2455
2456         /* No records, nothing to coalesce */
2457         if (!le16_to_cpu(tl->tl_used))
2458                 return 0;
2459
2460         tail_index = le16_to_cpu(tl->tl_used) - 1;
2461         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
2462         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
2463
2464         return current_tail == new_start;
2465 }
2466
2467 static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2468                                      handle_t *handle,
2469                                      u64 start_blk,
2470                                      unsigned int num_clusters)
2471 {
2472         int status, index;
2473         unsigned int start_cluster, tl_count;
2474         struct inode *tl_inode = osb->osb_tl_inode;
2475         struct buffer_head *tl_bh = osb->osb_tl_bh;
2476         struct ocfs2_dinode *di;
2477         struct ocfs2_truncate_log *tl;
2478
2479         mlog_entry("start_blk = %llu, num_clusters = %u\n",
2480                    (unsigned long long)start_blk, num_clusters);
2481
2482         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2483
2484         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
2485
2486         di = (struct ocfs2_dinode *) tl_bh->b_data;
2487         tl = &di->id2.i_dealloc;
2488         if (!OCFS2_IS_VALID_DINODE(di)) {
2489                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2490                 status = -EIO;
2491                 goto bail;
2492         }
2493
2494         tl_count = le16_to_cpu(tl->tl_count);
2495         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
2496                         tl_count == 0,
2497                         "Truncate record count on #%llu invalid "
2498                         "wanted %u, actual %u\n",
2499                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
2500                         ocfs2_truncate_recs_per_inode(osb->sb),
2501                         le16_to_cpu(tl->tl_count));
2502
2503         /* Caller should have known to flush before calling us. */
2504         index = le16_to_cpu(tl->tl_used);
2505         if (index >= tl_count) {
2506                 status = -ENOSPC;
2507                 mlog_errno(status);
2508                 goto bail;
2509         }
2510
2511         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2512                                       OCFS2_JOURNAL_ACCESS_WRITE);
2513         if (status < 0) {
2514                 mlog_errno(status);
2515                 goto bail;
2516         }
2517
2518         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
2519              "%llu (index = %d)\n", num_clusters, start_cluster,
2520              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
2521
2522         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
2523                 /*
2524                  * Move index back to the record we are coalescing with.
2525                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
2526                  */
2527                 index--;
2528
2529                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
2530                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
2531                      index, le32_to_cpu(tl->tl_recs[index].t_start),
2532                      num_clusters);
2533         } else {
2534                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
2535                 tl->tl_used = cpu_to_le16(index + 1);
2536         }
2537         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
2538
2539         status = ocfs2_journal_dirty(handle, tl_bh);
2540         if (status < 0) {
2541                 mlog_errno(status);
2542                 goto bail;
2543         }
2544
2545 bail:
2546         mlog_exit(status);
2547         return status;
2548 }
2549
2550 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
2551                                          handle_t *handle,
2552                                          struct inode *data_alloc_inode,
2553                                          struct buffer_head *data_alloc_bh)
2554 {
2555         int status = 0;
2556         int i;
2557         unsigned int num_clusters;
2558         u64 start_blk;
2559         struct ocfs2_truncate_rec rec;
2560         struct ocfs2_dinode *di;
2561         struct ocfs2_truncate_log *tl;
2562         struct inode *tl_inode = osb->osb_tl_inode;
2563         struct buffer_head *tl_bh = osb->osb_tl_bh;
2564
2565         mlog_entry_void();
2566
2567         di = (struct ocfs2_dinode *) tl_bh->b_data;
2568         tl = &di->id2.i_dealloc;
2569         i = le16_to_cpu(tl->tl_used) - 1;
2570         while (i >= 0) {
2571                 /* Caller has given us at least enough credits to
2572                  * update the truncate log dinode */
2573                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2574                                               OCFS2_JOURNAL_ACCESS_WRITE);
2575                 if (status < 0) {
2576                         mlog_errno(status);
2577                         goto bail;
2578                 }
2579
2580                 tl->tl_used = cpu_to_le16(i);
2581
2582                 status = ocfs2_journal_dirty(handle, tl_bh);
2583                 if (status < 0) {
2584                         mlog_errno(status);
2585                         goto bail;
2586                 }
2587
2588                 /* TODO: Perhaps we can calculate the bulk of the
2589                  * credits up front rather than extending like
2590                  * this. */
2591                 status = ocfs2_extend_trans(handle,
2592                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
2593                 if (status < 0) {
2594                         mlog_errno(status);
2595                         goto bail;
2596                 }
2597
2598                 rec = tl->tl_recs[i];
2599                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
2600                                                     le32_to_cpu(rec.t_start));
2601                 num_clusters = le32_to_cpu(rec.t_clusters);
2602
2603                 /* if start_blk is not set, we ignore the record as
2604                  * invalid. */
2605                 if (start_blk) {
2606                         mlog(0, "free record %d, start = %u, clusters = %u\n",
2607                              i, le32_to_cpu(rec.t_start), num_clusters);
2608
2609                         status = ocfs2_free_clusters(handle, data_alloc_inode,
2610                                                      data_alloc_bh, start_blk,
2611                                                      num_clusters);
2612                         if (status < 0) {
2613                                 mlog_errno(status);
2614                                 goto bail;
2615                         }
2616                 }
2617                 i--;
2618         }
2619
2620 bail:
2621         mlog_exit(status);
2622         return status;
2623 }
2624
2625 /* Expects you to already be holding tl_inode->i_mutex */
2626 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2627 {
2628         int status;
2629         unsigned int num_to_flush;
2630         handle_t *handle;
2631         struct inode *tl_inode = osb->osb_tl_inode;
2632         struct inode *data_alloc_inode = NULL;
2633         struct buffer_head *tl_bh = osb->osb_tl_bh;
2634         struct buffer_head *data_alloc_bh = NULL;
2635         struct ocfs2_dinode *di;
2636         struct ocfs2_truncate_log *tl;
2637
2638         mlog_entry_void();
2639
2640         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2641
2642         di = (struct ocfs2_dinode *) tl_bh->b_data;
2643         tl = &di->id2.i_dealloc;
2644         if (!OCFS2_IS_VALID_DINODE(di)) {
2645                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2646                 status = -EIO;
2647                 goto out;
2648         }
2649
2650         num_to_flush = le16_to_cpu(tl->tl_used);
2651         mlog(0, "Flush %u records from truncate log #%llu\n",
2652              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
2653         if (!num_to_flush) {
2654                 status = 0;
2655                 goto out;
2656         }
2657
2658         data_alloc_inode = ocfs2_get_system_file_inode(osb,
2659                                                        GLOBAL_BITMAP_SYSTEM_INODE,
2660                                                        OCFS2_INVALID_SLOT);
2661         if (!data_alloc_inode) {
2662                 status = -EINVAL;
2663                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
2664                 goto out;
2665         }
2666
2667         mutex_lock(&data_alloc_inode->i_mutex);
2668
2669         status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
2670         if (status < 0) {
2671                 mlog_errno(status);
2672                 goto out_mutex;
2673         }
2674
2675         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2676         if (IS_ERR(handle)) {
2677                 status = PTR_ERR(handle);
2678                 mlog_errno(status);
2679                 goto out_unlock;
2680         }
2681
2682         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
2683                                                data_alloc_bh);
2684         if (status < 0)
2685                 mlog_errno(status);
2686
2687         ocfs2_commit_trans(osb, handle);
2688
2689 out_unlock:
2690         brelse(data_alloc_bh);
2691         ocfs2_meta_unlock(data_alloc_inode, 1);
2692
2693 out_mutex:
2694         mutex_unlock(&data_alloc_inode->i_mutex);
2695         iput(data_alloc_inode);
2696
2697 out:
2698         mlog_exit(status);
2699         return status;
2700 }
2701
2702 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2703 {
2704         int status;
2705         struct inode *tl_inode = osb->osb_tl_inode;
2706
2707         mutex_lock(&tl_inode->i_mutex);
2708         status = __ocfs2_flush_truncate_log(osb);
2709         mutex_unlock(&tl_inode->i_mutex);
2710
2711         return status;
2712 }
2713
2714 static void ocfs2_truncate_log_worker(struct work_struct *work)
2715 {
2716         int status;
2717         struct ocfs2_super *osb =
2718                 container_of(work, struct ocfs2_super,
2719                              osb_truncate_log_wq.work);
2720
2721         mlog_entry_void();
2722
2723         status = ocfs2_flush_truncate_log(osb);
2724         if (status < 0)
2725                 mlog_errno(status);
2726
2727         mlog_exit(status);
2728 }
2729
2730 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
2731 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
2732                                        int cancel)
2733 {
2734         if (osb->osb_tl_inode) {
2735                 /* We want to push off log flushes while truncates are
2736                  * still running. */
2737                 if (cancel)
2738                         cancel_delayed_work(&osb->osb_truncate_log_wq);
2739
2740                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
2741                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
2742         }
2743 }
2744
2745 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
2746                                        int slot_num,
2747                                        struct inode **tl_inode,
2748                                        struct buffer_head **tl_bh)
2749 {
2750         int status;
2751         struct inode *inode = NULL;
2752         struct buffer_head *bh = NULL;
2753
2754         inode = ocfs2_get_system_file_inode(osb,
2755                                            TRUNCATE_LOG_SYSTEM_INODE,
2756                                            slot_num);
2757         if (!inode) {
2758                 status = -EINVAL;
2759                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
2760                 goto bail;
2761         }
2762
2763         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
2764                                   OCFS2_BH_CACHED, inode);
2765         if (status < 0) {
2766                 iput(inode);
2767                 mlog_errno(status);
2768                 goto bail;
2769         }
2770
2771         *tl_inode = inode;
2772         *tl_bh    = bh;
2773 bail:
2774         mlog_exit(status);
2775         return status;
2776 }
2777
2778 /* called during the 1st stage of node recovery. we stamp a clean
2779  * truncate log and pass back a copy for processing later. if the
2780  * truncate log does not require processing, a *tl_copy is set to
2781  * NULL. */
2782 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
2783                                       int slot_num,
2784                                       struct ocfs2_dinode **tl_copy)
2785 {
2786         int status;
2787         struct inode *tl_inode = NULL;
2788         struct buffer_head *tl_bh = NULL;
2789         struct ocfs2_dinode *di;
2790         struct ocfs2_truncate_log *tl;
2791
2792         *tl_copy = NULL;
2793
2794         mlog(0, "recover truncate log from slot %d\n", slot_num);
2795
2796         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
2797         if (status < 0) {
2798                 mlog_errno(status);
2799                 goto bail;
2800         }
2801
2802         di = (struct ocfs2_dinode *) tl_bh->b_data;
2803         tl = &di->id2.i_dealloc;
2804         if (!OCFS2_IS_VALID_DINODE(di)) {
2805                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
2806                 status = -EIO;
2807                 goto bail;
2808         }
2809
2810         if (le16_to_cpu(tl->tl_used)) {
2811                 mlog(0, "We'll have %u logs to recover\n",
2812                      le16_to_cpu(tl->tl_used));
2813
2814                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
2815                 if (!(*tl_copy)) {
2816                         status = -ENOMEM;
2817                         mlog_errno(status);
2818                         goto bail;
2819                 }
2820
2821                 /* Assuming the write-out below goes well, this copy
2822                  * will be passed back to recovery for processing. */
2823                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
2824
2825                 /* All we need to do to clear the truncate log is set
2826                  * tl_used. */
2827                 tl->tl_used = 0;
2828
2829                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
2830                 if (status < 0) {
2831                         mlog_errno(status);
2832                         goto bail;
2833                 }
2834         }
2835
2836 bail:
2837         if (tl_inode)
2838                 iput(tl_inode);
2839         if (tl_bh)
2840                 brelse(tl_bh);
2841
2842         if (status < 0 && (*tl_copy)) {
2843                 kfree(*tl_copy);
2844                 *tl_copy = NULL;
2845         }
2846
2847         mlog_exit(status);
2848         return status;
2849 }
2850
2851 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
2852                                          struct ocfs2_dinode *tl_copy)
2853 {
2854         int status = 0;
2855         int i;
2856         unsigned int clusters, num_recs, start_cluster;
2857         u64 start_blk;
2858         handle_t *handle;
2859         struct inode *tl_inode = osb->osb_tl_inode;
2860         struct ocfs2_truncate_log *tl;
2861
2862         mlog_entry_void();
2863
2864         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
2865                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
2866                 return -EINVAL;
2867         }
2868
2869         tl = &tl_copy->id2.i_dealloc;
2870         num_recs = le16_to_cpu(tl->tl_used);
2871         mlog(0, "cleanup %u records from %llu\n", num_recs,
2872              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
2873
2874         mutex_lock(&tl_inode->i_mutex);
2875         for(i = 0; i < num_recs; i++) {
2876                 if (ocfs2_truncate_log_needs_flush(osb)) {
2877                         status = __ocfs2_flush_truncate_log(osb);
2878                         if (status < 0) {
2879                                 mlog_errno(status);
2880                                 goto bail_up;
2881                         }
2882                 }
2883
2884                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2885                 if (IS_ERR(handle)) {
2886                         status = PTR_ERR(handle);
2887                         mlog_errno(status);
2888                         goto bail_up;
2889                 }
2890
2891                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
2892                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
2893                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
2894
2895                 status = ocfs2_truncate_log_append(osb, handle,
2896                                                    start_blk, clusters);
2897                 ocfs2_commit_trans(osb, handle);
2898                 if (status < 0) {
2899                         mlog_errno(status);
2900                         goto bail_up;
2901                 }
2902         }
2903
2904 bail_up:
2905         mutex_unlock(&tl_inode->i_mutex);
2906
2907         mlog_exit(status);
2908         return status;
2909 }
2910
2911 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
2912 {
2913         int status;
2914         struct inode *tl_inode = osb->osb_tl_inode;
2915
2916         mlog_entry_void();
2917
2918         if (tl_inode) {
2919                 cancel_delayed_work(&osb->osb_truncate_log_wq);
2920                 flush_workqueue(ocfs2_wq);
2921
2922                 status = ocfs2_flush_truncate_log(osb);
2923                 if (status < 0)
2924                         mlog_errno(status);
2925
2926                 brelse(osb->osb_tl_bh);
2927                 iput(osb->osb_tl_inode);
2928         }
2929
2930         mlog_exit_void();
2931 }
2932
2933 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2934 {
2935         int status;
2936         struct inode *tl_inode = NULL;
2937         struct buffer_head *tl_bh = NULL;
2938
2939         mlog_entry_void();
2940
2941         status = ocfs2_get_truncate_log_info(osb,
2942                                              osb->slot_num,
2943                                              &tl_inode,
2944                                              &tl_bh);
2945         if (status < 0)
2946                 mlog_errno(status);
2947
2948         /* ocfs2_truncate_log_shutdown keys on the existence of
2949          * osb->osb_tl_inode so we don't set any of the osb variables
2950          * until we're sure all is well. */
2951         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
2952                           ocfs2_truncate_log_worker);
2953         osb->osb_tl_bh    = tl_bh;
2954         osb->osb_tl_inode = tl_inode;
2955
2956         mlog_exit(status);
2957         return status;
2958 }
2959
2960 /* This function will figure out whether the currently last extent
2961  * block will be deleted, and if it will, what the new last extent
2962  * block will be so we can update his h_next_leaf_blk field, as well
2963  * as the dinodes i_last_eb_blk */
2964 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
2965                                        unsigned int clusters_to_del,
2966                                        struct ocfs2_path *path,
2967                                        struct buffer_head **new_last_eb)
2968 {
2969         int next_free, ret = 0;
2970         u32 cpos;
2971         struct ocfs2_extent_rec *rec;
2972         struct ocfs2_extent_block *eb;
2973         struct ocfs2_extent_list *el;
2974         struct buffer_head *bh = NULL;
2975
2976         *new_last_eb = NULL;
2977
2978         /* we have no tree, so of course, no last_eb. */
2979         if (!path->p_tree_depth)
2980                 goto out;
2981
2982         /* trunc to zero special case - this makes tree_depth = 0
2983          * regardless of what it is.  */
2984         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
2985                 goto out;
2986
2987         el = path_leaf_el(path);
2988         BUG_ON(!el->l_next_free_rec);
2989
2990         /*
2991          * Make sure that this extent list will actually be empty
2992          * after we clear away the data. We can shortcut out if
2993          * there's more than one non-empty extent in the
2994          * list. Otherwise, a check of the remaining extent is
2995          * necessary.
2996          */
2997         next_free = le16_to_cpu(el->l_next_free_rec);
2998         rec = NULL;
2999         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3000                 if (next_free > 2)
3001                         goto out;
3002
3003                 /* We may have a valid extent in index 1, check it. */
3004                 if (next_free == 2)
3005                         rec = &el->l_recs[1];
3006
3007                 /*
3008                  * Fall through - no more nonempty extents, so we want
3009                  * to delete this leaf.
3010                  */
3011         } else {
3012                 if (next_free > 1)
3013                         goto out;
3014
3015                 rec = &el->l_recs[0];
3016         }
3017
3018         if (rec) {
3019                 /*
3020                  * Check it we'll only be trimming off the end of this
3021                  * cluster.
3022                  */
3023                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
3024                         goto out;
3025         }
3026
3027         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
3028         if (ret) {
3029                 mlog_errno(ret);
3030                 goto out;
3031         }
3032
3033         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
3034         if (ret) {
3035                 mlog_errno(ret);
3036                 goto out;
3037         }
3038
3039         eb = (struct ocfs2_extent_block *) bh->b_data;
3040         el = &eb->h_list;
3041         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3042                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3043                 ret = -EROFS;
3044                 goto out;
3045         }
3046
3047         *new_last_eb = bh;
3048         get_bh(*new_last_eb);
3049         mlog(0, "returning block %llu, (cpos: %u)\n",
3050              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
3051 out:
3052         brelse(bh);
3053
3054         return ret;
3055 }
3056
3057 /*
3058  * Trim some clusters off the rightmost edge of a tree. Only called
3059  * during truncate.
3060  *
3061  * The caller needs to:
3062  *   - start journaling of each path component.
3063  *   - compute and fully set up any new last ext block
3064  */
3065 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
3066                            handle_t *handle, struct ocfs2_truncate_context *tc,
3067                            u32 clusters_to_del, u64 *delete_start)
3068 {
3069         int ret, i, index = path->p_tree_depth;
3070         u32 new_edge = 0;
3071         u64 deleted_eb = 0;
3072         struct buffer_head *bh;
3073         struct ocfs2_extent_list *el;
3074         struct ocfs2_extent_rec *rec;
3075
3076         *delete_start = 0;
3077
3078         while (index >= 0) {
3079                 bh = path->p_node[index].bh;
3080                 el = path->p_node[index].el;
3081
3082                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
3083                      index,  (unsigned long long)bh->b_blocknr);
3084
3085                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
3086
3087                 if (index !=
3088                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
3089                         ocfs2_error(inode->i_sb,
3090                                     "Inode %lu has invalid ext. block %llu",
3091                                     inode->i_ino,
3092                                     (unsigned long long)bh->b_blocknr);
3093                         ret = -EROFS;
3094                         goto out;
3095                 }
3096
3097 find_tail_record:
3098                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3099                 rec = &el->l_recs[i];
3100
3101                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
3102                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
3103                      ocfs2_rec_clusters(el, rec),
3104                      (unsigned long long)le64_to_cpu(rec->e_blkno),
3105                      le16_to_cpu(el->l_next_free_rec));
3106
3107                 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
3108
3109                 if (le16_to_cpu(el->l_tree_depth) == 0) {
3110                         /*
3111                          * If the leaf block contains a single empty
3112                          * extent and no records, we can just remove
3113                          * the block.
3114                          */
3115                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
3116                                 memset(rec, 0,
3117                                        sizeof(struct ocfs2_extent_rec));
3118                                 el->l_next_free_rec = cpu_to_le16(0);
3119
3120                                 goto delete;
3121                         }
3122
3123                         /*
3124                          * Remove any empty extents by shifting things
3125                          * left. That should make life much easier on
3126                          * the code below. This condition is rare
3127                          * enough that we shouldn't see a performance
3128                          * hit.
3129                          */
3130                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3131                                 le16_add_cpu(&el->l_next_free_rec, -1);
3132
3133                                 for(i = 0;
3134                                     i < le16_to_cpu(el->l_next_free_rec); i++)
3135                                         el->l_recs[i] = el->l_recs[i + 1];
3136
3137                                 memset(&el->l_recs[i], 0,
3138                                        sizeof(struct ocfs2_extent_rec));
3139
3140                                 /*
3141                                  * We've modified our extent list. The
3142                                  * simplest way to handle this change
3143                                  * is to being the search from the
3144                                  * start again.
3145                                  */
3146                                 goto find_tail_record;
3147                         }
3148
3149                         le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
3150
3151                         /*
3152                          * We'll use "new_edge" on our way back up the
3153                          * tree to know what our rightmost cpos is.
3154                          */
3155                         new_edge = le16_to_cpu(rec->e_leaf_clusters);
3156                         new_edge += le32_to_cpu(rec->e_cpos);
3157
3158                         /*
3159                          * The caller will use this to delete data blocks.
3160                          */
3161                         *delete_start = le64_to_cpu(rec->e_blkno)
3162                                 + ocfs2_clusters_to_blocks(inode->i_sb,
3163                                         le16_to_cpu(rec->e_leaf_clusters));
3164
3165                         /*
3166                          * If it's now empty, remove this record.
3167                          */
3168                         if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
3169                                 memset(rec, 0,
3170                                        sizeof(struct ocfs2_extent_rec));
3171                                 le16_add_cpu(&el->l_next_free_rec, -1);
3172                         }
3173                 } else {
3174                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
3175                                 memset(rec, 0,
3176                                        sizeof(struct ocfs2_extent_rec));
3177                                 le16_add_cpu(&el->l_next_free_rec, -1);
3178
3179                                 goto delete;
3180                         }
3181
3182                         /* Can this actually happen? */
3183                         if (le16_to_cpu(el->l_next_free_rec) == 0)
3184                                 goto delete;
3185
3186                         /*
3187                          * We never actually deleted any clusters
3188                          * because our leaf was empty. There's no
3189                          * reason to adjust the rightmost edge then.
3190                          */
3191                         if (new_edge == 0)
3192                                 goto delete;
3193
3194                         rec->e_int_clusters = cpu_to_le32(new_edge);
3195                         le32_add_cpu(&rec->e_int_clusters,
3196                                      -le32_to_cpu(rec->e_cpos));
3197
3198                          /*
3199                           * A deleted child record should have been
3200                           * caught above.
3201                           */
3202                          BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
3203                 }
3204
3205 delete:
3206                 ret = ocfs2_journal_dirty(handle, bh);
3207                 if (ret) {
3208                         mlog_errno(ret);
3209                         goto out;
3210                 }
3211
3212                 mlog(0, "extent list container %llu, after: record %d: "
3213                      "(%u, %u, %llu), next = %u.\n",
3214                      (unsigned long long)bh->b_blocknr, i,
3215                      le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
3216                      (unsigned long long)le64_to_cpu(rec->e_blkno),
3217                      le16_to_cpu(el->l_next_free_rec));
3218
3219                 /*
3220                  * We must be careful to only attempt delete of an
3221                  * extent block (and not the root inode block).
3222                  */
3223                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
3224                         struct ocfs2_extent_block *eb =
3225                                 (struct ocfs2_extent_block *)bh->b_data;
3226
3227                         /*
3228                          * Save this for use when processing the
3229                          * parent block.
3230                          */
3231                         deleted_eb = le64_to_cpu(eb->h_blkno);
3232
3233                         mlog(0, "deleting this extent block.\n");
3234
3235                         ocfs2_remove_from_cache(inode, bh);
3236
3237                         BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
3238                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240
3241                         if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
3242                                 /*
3243                                  * This code only understands how to
3244                                  * lock the suballocator in slot 0,
3245                                  * which is fine because allocation is
3246                                  * only ever done out of that
3247                                  * suballocator too. A future version
3248                                  * might change that however, so avoid
3249                                  * a free if we don't know how to
3250                                  * handle it. This way an fs incompat
3251                                  * bit will not be necessary.
3252                                  */
3253                                 ret = ocfs2_free_extent_block(handle,
3254                                                               tc->tc_ext_alloc_inode,
3255                                                               tc->tc_ext_alloc_bh,
3256                                                               eb);
3257
3258                                 /* An error here is not fatal. */
3259                                 if (ret < 0)
3260                                         mlog_errno(ret);
3261                         }
3262                 } else {
3263                         deleted_eb = 0;
3264                 }
3265
3266                 index--;
3267         }
3268
3269         ret = 0;
3270 out:
3271         return ret;
3272 }
3273
3274 static int ocfs2_do_truncate(struct ocfs2_super *osb,
3275                              unsigned int clusters_to_del,
3276                              struct inode *inode,
3277                              struct buffer_head *fe_bh,
3278                              handle_t *handle,
3279                              struct ocfs2_truncate_context *tc,
3280                              struct ocfs2_path *path)
3281 {
3282         int status;
3283         struct ocfs2_dinode *fe;
3284         struct ocfs2_extent_block *last_eb = NULL;
3285         struct ocfs2_extent_list *el;
3286         struct buffer_head *last_eb_bh = NULL;
3287         u64 delete_blk = 0;
3288
3289         fe = (struct ocfs2_dinode *) fe_bh->b_data;
3290
3291         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
3292                                              path, &last_eb_bh);
3293         if (status < 0) {
3294                 mlog_errno(status);
3295                 goto bail;
3296         }
3297
3298         /*
3299          * Each component will be touched, so we might as well journal
3300          * here to avoid having to handle errors later.
3301          */
3302         status = ocfs2_journal_access_path(inode, handle, path);
3303         if (status < 0) {
3304                 mlog_errno(status);
3305                 goto bail;
3306         }
3307
3308         if (last_eb_bh) {
3309                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
3310                                               OCFS2_JOURNAL_ACCESS_WRITE);
3311                 if (status < 0) {
3312                         mlog_errno(status);
3313                         goto bail;
3314                 }
3315
3316                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3317         }
3318
3319         el = &(fe->id2.i_list);
3320
3321         /*
3322          * Lower levels depend on this never happening, but it's best
3323          * to check it up here before changing the tree.
3324          */
3325         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
3326                 ocfs2_error(inode->i_sb,
3327                             "Inode %lu has an empty extent record, depth %u\n",
3328                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
3329                 status = -EROFS;
3330                 goto bail;
3331         }
3332
3333         spin_lock(&OCFS2_I(inode)->ip_lock);
3334         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
3335                                       clusters_to_del;
3336         spin_unlock(&OCFS2_I(inode)->ip_lock);
3337         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
3338
3339         status = ocfs2_trim_tree(inode, path, handle, tc,
3340                                  clusters_to_del, &delete_blk);
3341         if (status) {
3342                 mlog_errno(status);
3343                 goto bail;
3344         }
3345
3346         if (le32_to_cpu(fe->i_clusters) == 0) {
3347                 /* trunc to zero is a special case. */
3348                 el->l_tree_depth = 0;
3349                 fe->i_last_eb_blk = 0;
3350         } else if (last_eb)
3351                 fe->i_last_eb_blk = last_eb->h_blkno;
3352
3353         status = ocfs2_journal_dirty(handle, fe_bh);
3354         if (status < 0) {
3355                 mlog_errno(status);
3356                 goto bail;
3357         }
3358
3359         if (last_eb) {
3360                 /* If there will be a new last extent block, then by
3361                  * definition, there cannot be any leaves to the right of
3362                  * him. */
3363                 last_eb->h_next_leaf_blk = 0;
3364                 status = ocfs2_journal_dirty(handle, last_eb_bh);
3365                 if (status < 0) {
3366                         mlog_errno(status);
3367                         goto bail;
3368                 }
3369         }
3370
3371         if (delete_blk) {
3372                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
3373                                                    clusters_to_del);
3374                 if (status < 0) {
3375                         mlog_errno(status);
3376                         goto bail;
3377                 }
3378         }
3379         status = 0;
3380 bail:
3381
3382         mlog_exit(status);
3383         return status;
3384 }
3385
3386 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
3387 {
3388         set_buffer_uptodate(bh);
3389         mark_buffer_dirty(bh);
3390         return 0;
3391 }
3392
3393 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3394 {
3395         set_buffer_uptodate(bh);
3396         mark_buffer_dirty(bh);
3397         return ocfs2_journal_dirty_data(handle, bh);
3398 }
3399
3400 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3401                                      struct page **pages, int numpages,
3402                                      u64 phys, handle_t *handle)
3403 {
3404         int i, ret, partial = 0;
3405         void *kaddr;
3406         struct page *page;
3407         unsigned int from, to = PAGE_CACHE_SIZE;
3408         struct super_block *sb = inode->i_sb;
3409
3410         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3411
3412         if (numpages == 0)
3413                 goto out;
3414
3415         from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
3416         if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417                 /*
3418                  * Since 'from' has been capped to a value below page
3419                  * size, this calculation won't be able to overflow
3420                  * 'to'
3421                  */
3422                 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424                 /*
3425                  * The truncate tail in this case should never contain
3426                  * more than one page at maximum. The loop below also
3427                  * assumes this.
3428                  */
3429                 BUG_ON(numpages != 1);
3430         }
3431
3432         for(i = 0; i < numpages; i++) {
3433                 page = pages[i];
3434
3435                 BUG_ON(from > PAGE_CACHE_SIZE);
3436                 BUG_ON(to > PAGE_CACHE_SIZE);
3437
3438                 ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
3439                 if (ret)
3440                         mlog_errno(ret);
3441
3442                 kaddr = kmap_atomic(page, KM_USER0);
3443                 memset(kaddr + from, 0, to - from);
3444                 kunmap_atomic(kaddr, KM_USER0);
3445
3446                 /*
3447                  * Need to set the buffers we zero'd into uptodate
3448                  * here if they aren't - ocfs2_map_page_blocks()
3449                  * might've skipped some
3450                  */
3451                 if (ocfs2_should_order_data(inode)) {
3452                         ret = walk_page_buffers(handle,
3453                                                 page_buffers(page),
3454                                                 from, to, &partial,
3455                                                 ocfs2_ordered_zero_func);
3456                         if (ret < 0)
3457                                 mlog_errno(ret);
3458                 } else {
3459                         ret = walk_page_buffers(handle, page_buffers(page),
3460                                                 from, to, &partial,
3461                                                 ocfs2_writeback_zero_func);
3462                         if (ret < 0)
3463                                 mlog_errno(ret);
3464                 }
3465
3466                 if (!partial)
3467                         SetPageUptodate(page);
3468
3469                 flush_dcache_page(page);
3470
3471                 /*
3472                  * Every page after the 1st one should be completely zero'd.
3473                  */
3474                 from = 0;
3475         }
3476 out:
3477         if (pages) {
3478                 for (i = 0; i < numpages; i++) {
3479                         page = pages[i];
3480                         unlock_page(page);
3481                         mark_page_accessed(page);
3482                         page_cache_release(page);
3483                 }
3484         }
3485 }
3486
3487 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
3488                                 int *num, u64 *phys)
3489 {
3490         int i, numpages = 0, ret = 0;
3491         unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492         unsigned int ext_flags;
3493         struct super_block *sb = inode->i_sb;
3494         struct address_space *mapping = inode->i_mapping;
3495         unsigned long index;
3496         u64 next_cluster_bytes;
3497
3498         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3499
3500         /* Cluster boundary, so we don't need to grab any pages. */
3501         if ((isize & (csize - 1)) == 0)
3502                 goto out;
3503
3504         ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
3505                                           phys, NULL, &ext_flags);
3506         if (ret) {
3507                 mlog_errno(ret);
3508                 goto out;
3509         }
3510
3511         /* Tail is a hole. */
3512         if (*phys == 0)
3513                 goto out;
3514
3515         /* Tail is marked as unwritten, we can count on write to zero
3516          * in that case. */
3517         if (ext_flags & OCFS2_EXT_UNWRITTEN)
3518                 goto out;
3519
3520         next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
3521         index = isize >> PAGE_CACHE_SHIFT;
3522         do {
3523                 pages[numpages] = grab_cache_page(mapping, index);
3524                 if (!pages[numpages]) {
3525                         ret = -ENOMEM;
3526                         mlog_errno(ret);
3527                         goto out;
3528                 }
3529
3530                 numpages++;
3531                 index++;
3532         } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
3533
3534 out:
3535         if (ret != 0) {
3536                 if (pages) {
3537                         for (i = 0; i < numpages; i++) {
3538                                 if (pages[i]) {
3539                                         unlock_page(pages[i]);
3540                                         page_cache_release(pages[i]);
3541                                 }
3542                         }
3543                 }
3544                 numpages = 0;
3545         }
3546
3547         *num = numpages;
3548
3549         return ret;
3550 }
3551
3552 /*
3553  * Zero the area past i_size but still within an allocated
3554  * cluster. This avoids exposing nonzero data on subsequent file
3555  * extends.
3556  *
3557  * We need to call this before i_size is updated on the inode because
3558  * otherwise block_write_full_page() will skip writeout of pages past
3559  * i_size. The new_i_size parameter is passed for this reason.
3560  */
3561 int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3562                                  u64 new_i_size)
3563 {
3564         int ret, numpages;
3565         loff_t endbyte;
3566         struct page **pages = NULL;
3567         u64 phys;
3568
3569         /*
3570          * File systems which don't support sparse files zero on every
3571          * extend.
3572          */
3573         if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
3574                 return 0;
3575
3576         pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
3577                         sizeof(struct page *), GFP_NOFS);
3578         if (pages == NULL) {
3579                 ret = -ENOMEM;
3580                 mlog_errno(ret);
3581                 goto out;
3582         }
3583
3584         ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
3585         if (ret) {
3586                 mlog_errno(ret);
3587                 goto out;
3588         }
3589
3590         if (numpages == 0)
3591                 goto out;
3592
3593         ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
3594                                  handle);
3595
3596         /*
3597          * Initiate writeout of the pages we zero'd here. We don't
3598          * wait on them - the truncate_inode_pages() call later will
3599          * do that for us.
3600          */
3601         endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
3602         ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
3603                                     endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604         if (ret)
3605                 mlog_errno(ret);
3606
3607 out:
3608         if (pages)
3609                 kfree(pages);
3610
3611         return ret;
3612 }
3613
3614 /*
3615  * It is expected, that by the time you call this function,
3616  * inode->i_size and fe->i_size have been adjusted.
3617  *
3618  * WARNING: This will kfree the truncate context
3619  */
3620 int ocfs2_commit_truncate(struct ocfs2_super *osb,
3621                           struct inode *inode,
3622                           struct buffer_head *fe_bh,
3623                           struct ocfs2_truncate_context *tc)
3624 {
3625         int status, i, credits, tl_sem = 0;
3626         u32 clusters_to_del, new_highest_cpos, range;
3627         struct ocfs2_extent_list *el;
3628         handle_t *handle = NULL;
3629         struct inode *tl_inode = osb->osb_tl_inode;
3630         struct ocfs2_path *path = NULL;
3631
3632         mlog_entry_void();
3633
3634         down_write(&OCFS2_I(inode)->ip_alloc_sem);
3635
3636         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3637                                                      i_size_read(inode));
3638
3639         path = ocfs2_new_inode_path(fe_bh);
3640         if (!path) {
3641                 status = -ENOMEM;
3642                 mlog_errno(status);
3643                 goto bail;
3644         }
3645
3646         ocfs2_extent_map_trunc(inode, new_highest_cpos);
3647
3648 start:
3649         /*
3650          * Check that we still have allocation to delete.
3651          */
3652         if (OCFS2_I(inode)->ip_clusters == 0) {
3653                 status = 0;
3654                 goto bail;
3655         }
3656
3657         /*
3658          * Truncate always works against the rightmost tree branch.
3659          */
3660         status = ocfs2_find_path(inode, path, UINT_MAX);
3661         if (status) {
3662                 mlog_errno(status);
3663                 goto bail;
3664         }
3665
3666         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
3667              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
3668
3669         /*
3670          * By now, el will point to the extent list on the bottom most
3671          * portion of this tree. Only the tail record is considered in
3672          * each pass.
3673          *
3674          * We handle the following cases, in order:
3675          * - empty extent: delete the remaining branch
3676          * - remove the entire record
3677          * - remove a partial record
3678          * - no record needs to be removed (truncate has completed)
3679          */
3680         el = path_leaf_el(path);
3681         if (le16_to_cpu(el->l_next_free_rec) == 0) {
3682                 ocfs2_error(inode->i_sb,
3683                             "Inode %llu has empty extent block at %llu\n",
3684                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
3685                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
3686                 status = -EROFS;
3687                 goto bail;
3688         }
3689
3690         i = le16_to_cpu(el->l_next_free_rec) - 1;
3691         range = le32_to_cpu(el->l_recs[i].e_cpos) +
3692                 ocfs2_rec_clusters(el, &el->l_recs[i]);
3693         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
3694                 clusters_to_del = 0;
3695         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3696                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
3697         } else if (range > new_highest_cpos) {
3698                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
3699                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
3700                                   new_highest_cpos;
3701         } else {
3702                 status = 0;
3703                 goto bail;
3704         }
3705
3706         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3707              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
3708
3709         BUG_ON(clusters_to_del == 0);
3710
3711         mutex_lock(&tl_inode->i_mutex);
3712         tl_sem = 1;
3713         /* ocfs2_truncate_log_needs_flush guarantees us at least one
3714          * record is free for use. If there isn't any, we flush to get
3715          * an empty truncate log.  */
3716         if (ocfs2_truncate_log_needs_flush(osb)) {
3717                 status = __ocfs2_flush_truncate_log(osb);
3718                 if (status < 0) {
3719                         mlog_errno(status);
3720                         goto bail;
3721                 }
3722         }
3723
3724         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
3725                                                 (struct ocfs2_dinode *)fe_bh->b_data,
3726                                                 el);
3727         handle = ocfs2_start_trans(osb, credits);
3728         if (IS_ERR(handle)) {
3729                 status = PTR_ERR(handle);
3730                 handle = NULL;
3731                 mlog_errno(status);
3732                 goto bail;
3733         }
3734
3735         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
3736                                    tc, path);
3737         if (status < 0) {
3738                 mlog_errno(status);
3739                 goto bail;
3740         }
3741
3742         mutex_unlock(&tl_inode->i_mutex);
3743         tl_sem = 0;
3744
3745         ocfs2_commit_trans(osb, handle);
3746         handle = NULL;
3747
3748         ocfs2_reinit_path(path, 1);
3749
3750         /*
3751          * The check above will catch the case where we've truncated
3752          * away all allocation.
3753          */
3754         goto start;
3755
3756 bail:
3757         up_write(&OCFS2_I(inode)->ip_alloc_sem);
3758
3759         ocfs2_schedule_truncate_log_flush(osb, 1);
3760
3761         if (tl_sem)
3762                 mutex_unlock(&tl_inode->i_mutex);
3763
3764         if (handle)
3765                 ocfs2_commit_trans(osb, handle);
3766
3767         ocfs2_free_path(path);
3768
3769         /* This will drop the ext_alloc cluster lock for us */
3770         ocfs2_free_truncate_context(tc);
3771
3772         mlog_exit(status);
3773         return status;
3774 }
3775
3776 /*
3777  * Expects the inode to already be locked. This will figure out which
3778  * inodes need to be locked and will put them on the returned truncate
3779  * context.
3780  */
3781 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3782                            struct inode *inode,
3783                            struct buffer_head *fe_bh,
3784                            struct ocfs2_truncate_context **tc)
3785 {
3786         int status, metadata_delete, i;
3787         unsigned int new_i_clusters;
3788         struct ocfs2_dinode *fe;
3789         struct ocfs2_extent_block *eb;
3790         struct ocfs2_extent_list *el;
3791         struct buffer_head *last_eb_bh = NULL;
3792         struct inode *ext_alloc_inode = NULL;
3793         struct buffer_head *ext_alloc_bh = NULL;
3794
3795         mlog_entry_void();
3796
3797         *tc = NULL;
3798
3799         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
3800                                                   i_size_read(inode));
3801         fe = (struct ocfs2_dinode *) fe_bh->b_data;
3802
3803         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
3804              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
3805              (unsigned long long)le64_to_cpu(fe->i_size));
3806
3807         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
3808         if (!(*tc)) {
3809                 status = -ENOMEM;
3810                 mlog_errno(status);
3811                 goto bail;
3812         }
3813
3814         metadata_delete = 0;
3815         if (fe->id2.i_list.l_tree_depth) {
3816                 /* If we have a tree, then the truncate may result in
3817                  * metadata deletes. Figure this out from the
3818                  * rightmost leaf block.*/
3819                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3820                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
3821                 if (status < 0) {
3822                         mlog_errno(status);
3823                         goto bail;
3824                 }
3825                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3826                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3827                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3828
3829                         brelse(last_eb_bh);
3830                         status = -EIO;
3831                         goto bail;
3832                 }
3833                 el = &(eb->h_list);
3834
3835                 i = 0;
3836                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837                         i = 1;
3838                 /*
3839                  * XXX: Should we check that next_free_rec contains
3840                  * the extent?
3841                  */
3842                 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3843                         metadata_delete = 1;
3844         }
3845
3846         (*tc)->tc_last_eb_bh = last_eb_bh;
3847
3848         if (metadata_delete) {
3849                 mlog(0, "Will have to delete metadata for this trunc. "
3850                      "locking allocator.\n");
3851                 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3852                 if (!ext_alloc_inode) {
3853                         status = -ENOMEM;
3854                         mlog_errno(status);
3855                         goto bail;
3856                 }
3857
3858                 mutex_lock(&ext_alloc_inode->i_mutex);
3859                 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3860
3861                 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3862                 if (status < 0) {
3863                         mlog_errno(status);
3864                         goto bail;
3865                 }
3866                 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3867                 (*tc)->tc_ext_alloc_locked = 1;
3868         }
3869
3870         status = 0;
3871 bail:
3872         if (status < 0) {
3873                 if (*tc)
3874                         ocfs2_free_truncate_context(*tc);
3875                 *tc = NULL;
3876         }
3877         mlog_exit_void();
3878         return status;
3879 }
3880
3881 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3882 {
3883         if (tc->tc_ext_alloc_inode) {
3884                 if (tc->tc_ext_alloc_locked)
3885                         ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
3886
3887                 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
3888                 iput(tc->tc_ext_alloc_inode);
3889         }
3890
3891         if (tc->tc_ext_alloc_bh)
3892                 brelse(tc->tc_ext_alloc_bh);
3893
3894         if (tc->tc_last_eb_bh)
3895                 brelse(tc->tc_last_eb_bh);
3896
3897         kfree(tc);
3898 }