ocfs2: zero tail of sparse files on truncate
[linux-2.6] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49
50 #include "buffer_head_io.h"
51
52 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53
54 /*
55  * Structures which describe a path through a btree, and functions to
56  * manipulate them.
57  *
58  * The idea here is to be as generic as possible with the tree
59  * manipulation code.
60  */
61 struct ocfs2_path_item {
62         struct buffer_head              *bh;
63         struct ocfs2_extent_list        *el;
64 };
65
66 #define OCFS2_MAX_PATH_DEPTH    5
67
68 struct ocfs2_path {
69         int                     p_tree_depth;
70         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
71 };
72
73 #define path_root_bh(_path) ((_path)->p_node[0].bh)
74 #define path_root_el(_path) ((_path)->p_node[0].el)
75 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
76 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
77 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
78
79 /*
80  * Reset the actual path elements so that we can re-use the structure
81  * to build another path. Generally, this involves freeing the buffer
82  * heads.
83  */
84 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
85 {
86         int i, start = 0, depth = 0;
87         struct ocfs2_path_item *node;
88
89         if (keep_root)
90                 start = 1;
91
92         for(i = start; i < path_num_items(path); i++) {
93                 node = &path->p_node[i];
94
95                 brelse(node->bh);
96                 node->bh = NULL;
97                 node->el = NULL;
98         }
99
100         /*
101          * Tree depth may change during truncate, or insert. If we're
102          * keeping the root extent list, then make sure that our path
103          * structure reflects the proper depth.
104          */
105         if (keep_root)
106                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
107
108         path->p_tree_depth = depth;
109 }
110
111 static void ocfs2_free_path(struct ocfs2_path *path)
112 {
113         if (path) {
114                 ocfs2_reinit_path(path, 0);
115                 kfree(path);
116         }
117 }
118
119 /*
120  * Make the *dest path the same as src and re-initialize src path to
121  * have a root only.
122  */
123 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
124 {
125         int i;
126
127         BUG_ON(path_root_bh(dest) != path_root_bh(src));
128
129         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
130                 brelse(dest->p_node[i].bh);
131
132                 dest->p_node[i].bh = src->p_node[i].bh;
133                 dest->p_node[i].el = src->p_node[i].el;
134
135                 src->p_node[i].bh = NULL;
136                 src->p_node[i].el = NULL;
137         }
138 }
139
140 /*
141  * Insert an extent block at given index.
142  *
143  * This will not take an additional reference on eb_bh.
144  */
145 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
146                                         struct buffer_head *eb_bh)
147 {
148         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
149
150         /*
151          * Right now, no root bh is an extent block, so this helps
152          * catch code errors with dinode trees. The assertion can be
153          * safely removed if we ever need to insert extent block
154          * structures at the root.
155          */
156         BUG_ON(index == 0);
157
158         path->p_node[index].bh = eb_bh;
159         path->p_node[index].el = &eb->h_list;
160 }
161
162 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
163                                          struct ocfs2_extent_list *root_el)
164 {
165         struct ocfs2_path *path;
166
167         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
168
169         path = kzalloc(sizeof(*path), GFP_NOFS);
170         if (path) {
171                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
172                 get_bh(root_bh);
173                 path_root_bh(path) = root_bh;
174                 path_root_el(path) = root_el;
175         }
176
177         return path;
178 }
179
180 /*
181  * Allocate and initialize a new path based on a disk inode tree.
182  */
183 static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
184 {
185         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
186         struct ocfs2_extent_list *el = &di->id2.i_list;
187
188         return ocfs2_new_path(di_bh, el);
189 }
190
191 /*
192  * Convenience function to journal all components in a path.
193  */
194 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
195                                      struct ocfs2_path *path)
196 {
197         int i, ret = 0;
198
199         if (!path)
200                 goto out;
201
202         for(i = 0; i < path_num_items(path); i++) {
203                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
204                                            OCFS2_JOURNAL_ACCESS_WRITE);
205                 if (ret < 0) {
206                         mlog_errno(ret);
207                         goto out;
208                 }
209         }
210
211 out:
212         return ret;
213 }
214
215 enum ocfs2_contig_type {
216         CONTIG_NONE = 0,
217         CONTIG_LEFT,
218         CONTIG_RIGHT
219 };
220
221 static int ocfs2_block_extent_contig(struct super_block *sb,
222                                      struct ocfs2_extent_rec *ext,
223                                      u64 blkno)
224 {
225         return blkno == (le64_to_cpu(ext->e_blkno) +
226                          ocfs2_clusters_to_blocks(sb,
227                                                   le32_to_cpu(ext->e_clusters)));
228 }
229
230 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
231                                   struct ocfs2_extent_rec *right)
232 {
233         return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
234                 le32_to_cpu(right->e_cpos));
235 }
236
237 static enum ocfs2_contig_type
238         ocfs2_extent_contig(struct inode *inode,
239                             struct ocfs2_extent_rec *ext,
240                             struct ocfs2_extent_rec *insert_rec)
241 {
242         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
243
244         if (ocfs2_extents_adjacent(ext, insert_rec) &&
245             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
246                         return CONTIG_RIGHT;
247
248         blkno = le64_to_cpu(ext->e_blkno);
249         if (ocfs2_extents_adjacent(insert_rec, ext) &&
250             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
251                 return CONTIG_LEFT;
252
253         return CONTIG_NONE;
254 }
255
256 /*
257  * NOTE: We can have pretty much any combination of contiguousness and
258  * appending.
259  *
260  * The usefulness of APPEND_TAIL is more in that it lets us know that
261  * we'll have to update the path to that leaf.
262  */
263 enum ocfs2_append_type {
264         APPEND_NONE = 0,
265         APPEND_TAIL,
266 };
267
268 struct ocfs2_insert_type {
269         enum ocfs2_append_type  ins_appending;
270         enum ocfs2_contig_type  ins_contig;
271         int                     ins_contig_index;
272         int                     ins_free_records;
273         int                     ins_tree_depth;
274 };
275
276 /*
277  * How many free extents have we got before we need more meta data?
278  */
279 int ocfs2_num_free_extents(struct ocfs2_super *osb,
280                            struct inode *inode,
281                            struct ocfs2_dinode *fe)
282 {
283         int retval;
284         struct ocfs2_extent_list *el;
285         struct ocfs2_extent_block *eb;
286         struct buffer_head *eb_bh = NULL;
287
288         mlog_entry_void();
289
290         if (!OCFS2_IS_VALID_DINODE(fe)) {
291                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
292                 retval = -EIO;
293                 goto bail;
294         }
295
296         if (fe->i_last_eb_blk) {
297                 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
298                                           &eb_bh, OCFS2_BH_CACHED, inode);
299                 if (retval < 0) {
300                         mlog_errno(retval);
301                         goto bail;
302                 }
303                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
304                 el = &eb->h_list;
305         } else
306                 el = &fe->id2.i_list;
307
308         BUG_ON(el->l_tree_depth != 0);
309
310         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
311 bail:
312         if (eb_bh)
313                 brelse(eb_bh);
314
315         mlog_exit(retval);
316         return retval;
317 }
318
319 /* expects array to already be allocated
320  *
321  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
322  * l_count for you
323  */
324 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
325                                      handle_t *handle,
326                                      struct inode *inode,
327                                      int wanted,
328                                      struct ocfs2_alloc_context *meta_ac,
329                                      struct buffer_head *bhs[])
330 {
331         int count, status, i;
332         u16 suballoc_bit_start;
333         u32 num_got;
334         u64 first_blkno;
335         struct ocfs2_extent_block *eb;
336
337         mlog_entry_void();
338
339         count = 0;
340         while (count < wanted) {
341                 status = ocfs2_claim_metadata(osb,
342                                               handle,
343                                               meta_ac,
344                                               wanted - count,
345                                               &suballoc_bit_start,
346                                               &num_got,
347                                               &first_blkno);
348                 if (status < 0) {
349                         mlog_errno(status);
350                         goto bail;
351                 }
352
353                 for(i = count;  i < (num_got + count); i++) {
354                         bhs[i] = sb_getblk(osb->sb, first_blkno);
355                         if (bhs[i] == NULL) {
356                                 status = -EIO;
357                                 mlog_errno(status);
358                                 goto bail;
359                         }
360                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
361
362                         status = ocfs2_journal_access(handle, inode, bhs[i],
363                                                       OCFS2_JOURNAL_ACCESS_CREATE);
364                         if (status < 0) {
365                                 mlog_errno(status);
366                                 goto bail;
367                         }
368
369                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
370                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
371                         /* Ok, setup the minimal stuff here. */
372                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
373                         eb->h_blkno = cpu_to_le64(first_blkno);
374                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
375
376 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
377                         /* we always use slot zero's suballocator */
378                         eb->h_suballoc_slot = 0;
379 #else
380                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
381 #endif
382                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
383                         eb->h_list.l_count =
384                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
385
386                         suballoc_bit_start++;
387                         first_blkno++;
388
389                         /* We'll also be dirtied by the caller, so
390                          * this isn't absolutely necessary. */
391                         status = ocfs2_journal_dirty(handle, bhs[i]);
392                         if (status < 0) {
393                                 mlog_errno(status);
394                                 goto bail;
395                         }
396                 }
397
398                 count += num_got;
399         }
400
401         status = 0;
402 bail:
403         if (status < 0) {
404                 for(i = 0; i < wanted; i++) {
405                         if (bhs[i])
406                                 brelse(bhs[i]);
407                         bhs[i] = NULL;
408                 }
409         }
410         mlog_exit(status);
411         return status;
412 }
413
414 /*
415  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
416  *
417  * Returns the sum of the rightmost extent rec logical offset and
418  * cluster count.
419  *
420  * ocfs2_add_branch() uses this to determine what logical cluster
421  * value should be populated into the leftmost new branch records.
422  *
423  * ocfs2_shift_tree_depth() uses this to determine the # clusters
424  * value for the new topmost tree record.
425  */
426 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
427 {
428         int i;
429
430         i = le16_to_cpu(el->l_next_free_rec) - 1;
431
432         return le32_to_cpu(el->l_recs[i].e_cpos) +
433                 le32_to_cpu(el->l_recs[i].e_clusters);
434 }
435
436 /*
437  * Add an entire tree branch to our inode. eb_bh is the extent block
438  * to start at, if we don't want to start the branch at the dinode
439  * structure.
440  *
441  * last_eb_bh is required as we have to update it's next_leaf pointer
442  * for the new last extent block.
443  *
444  * the new branch will be 'empty' in the sense that every block will
445  * contain a single record with e_clusters == 0.
446  */
447 static int ocfs2_add_branch(struct ocfs2_super *osb,
448                             handle_t *handle,
449                             struct inode *inode,
450                             struct buffer_head *fe_bh,
451                             struct buffer_head *eb_bh,
452                             struct buffer_head *last_eb_bh,
453                             struct ocfs2_alloc_context *meta_ac)
454 {
455         int status, new_blocks, i;
456         u64 next_blkno, new_last_eb_blk;
457         struct buffer_head *bh;
458         struct buffer_head **new_eb_bhs = NULL;
459         struct ocfs2_dinode *fe;
460         struct ocfs2_extent_block *eb;
461         struct ocfs2_extent_list  *eb_el;
462         struct ocfs2_extent_list  *el;
463         u32 new_cpos;
464
465         mlog_entry_void();
466
467         BUG_ON(!last_eb_bh);
468
469         fe = (struct ocfs2_dinode *) fe_bh->b_data;
470
471         if (eb_bh) {
472                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
473                 el = &eb->h_list;
474         } else
475                 el = &fe->id2.i_list;
476
477         /* we never add a branch to a leaf. */
478         BUG_ON(!el->l_tree_depth);
479
480         new_blocks = le16_to_cpu(el->l_tree_depth);
481
482         /* allocate the number of new eb blocks we need */
483         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
484                              GFP_KERNEL);
485         if (!new_eb_bhs) {
486                 status = -ENOMEM;
487                 mlog_errno(status);
488                 goto bail;
489         }
490
491         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
492                                            meta_ac, new_eb_bhs);
493         if (status < 0) {
494                 mlog_errno(status);
495                 goto bail;
496         }
497
498         eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
499         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
500
501         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
502          * linked with the rest of the tree.
503          * conversly, new_eb_bhs[0] is the new bottommost leaf.
504          *
505          * when we leave the loop, new_last_eb_blk will point to the
506          * newest leaf, and next_blkno will point to the topmost extent
507          * block. */
508         next_blkno = new_last_eb_blk = 0;
509         for(i = 0; i < new_blocks; i++) {
510                 bh = new_eb_bhs[i];
511                 eb = (struct ocfs2_extent_block *) bh->b_data;
512                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
513                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
514                         status = -EIO;
515                         goto bail;
516                 }
517                 eb_el = &eb->h_list;
518
519                 status = ocfs2_journal_access(handle, inode, bh,
520                                               OCFS2_JOURNAL_ACCESS_CREATE);
521                 if (status < 0) {
522                         mlog_errno(status);
523                         goto bail;
524                 }
525
526                 eb->h_next_leaf_blk = 0;
527                 eb_el->l_tree_depth = cpu_to_le16(i);
528                 eb_el->l_next_free_rec = cpu_to_le16(1);
529                 /*
530                  * This actually counts as an empty extent as
531                  * c_clusters == 0
532                  */
533                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
534                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
535                 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
536                 if (!eb_el->l_tree_depth)
537                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
538
539                 status = ocfs2_journal_dirty(handle, bh);
540                 if (status < 0) {
541                         mlog_errno(status);
542                         goto bail;
543                 }
544
545                 next_blkno = le64_to_cpu(eb->h_blkno);
546         }
547
548         /* This is a bit hairy. We want to update up to three blocks
549          * here without leaving any of them in an inconsistent state
550          * in case of error. We don't have to worry about
551          * journal_dirty erroring as it won't unless we've aborted the
552          * handle (in which case we would never be here) so reserving
553          * the write with journal_access is all we need to do. */
554         status = ocfs2_journal_access(handle, inode, last_eb_bh,
555                                       OCFS2_JOURNAL_ACCESS_WRITE);
556         if (status < 0) {
557                 mlog_errno(status);
558                 goto bail;
559         }
560         status = ocfs2_journal_access(handle, inode, fe_bh,
561                                       OCFS2_JOURNAL_ACCESS_WRITE);
562         if (status < 0) {
563                 mlog_errno(status);
564                 goto bail;
565         }
566         if (eb_bh) {
567                 status = ocfs2_journal_access(handle, inode, eb_bh,
568                                               OCFS2_JOURNAL_ACCESS_WRITE);
569                 if (status < 0) {
570                         mlog_errno(status);
571                         goto bail;
572                 }
573         }
574
575         /* Link the new branch into the rest of the tree (el will
576          * either be on the fe, or the extent block passed in. */
577         i = le16_to_cpu(el->l_next_free_rec);
578         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
579         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
580         el->l_recs[i].e_clusters = 0;
581         le16_add_cpu(&el->l_next_free_rec, 1);
582
583         /* fe needs a new last extent block pointer, as does the
584          * next_leaf on the previously last-extent-block. */
585         fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
586
587         eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
588         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
589
590         status = ocfs2_journal_dirty(handle, last_eb_bh);
591         if (status < 0)
592                 mlog_errno(status);
593         status = ocfs2_journal_dirty(handle, fe_bh);
594         if (status < 0)
595                 mlog_errno(status);
596         if (eb_bh) {
597                 status = ocfs2_journal_dirty(handle, eb_bh);
598                 if (status < 0)
599                         mlog_errno(status);
600         }
601
602         status = 0;
603 bail:
604         if (new_eb_bhs) {
605                 for (i = 0; i < new_blocks; i++)
606                         if (new_eb_bhs[i])
607                                 brelse(new_eb_bhs[i]);
608                 kfree(new_eb_bhs);
609         }
610
611         mlog_exit(status);
612         return status;
613 }
614
615 /*
616  * adds another level to the allocation tree.
617  * returns back the new extent block so you can add a branch to it
618  * after this call.
619  */
620 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
621                                   handle_t *handle,
622                                   struct inode *inode,
623                                   struct buffer_head *fe_bh,
624                                   struct ocfs2_alloc_context *meta_ac,
625                                   struct buffer_head **ret_new_eb_bh)
626 {
627         int status, i;
628         u32 new_clusters;
629         struct buffer_head *new_eb_bh = NULL;
630         struct ocfs2_dinode *fe;
631         struct ocfs2_extent_block *eb;
632         struct ocfs2_extent_list  *fe_el;
633         struct ocfs2_extent_list  *eb_el;
634
635         mlog_entry_void();
636
637         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
638                                            &new_eb_bh);
639         if (status < 0) {
640                 mlog_errno(status);
641                 goto bail;
642         }
643
644         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
645         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
646                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
647                 status = -EIO;
648                 goto bail;
649         }
650
651         eb_el = &eb->h_list;
652         fe = (struct ocfs2_dinode *) fe_bh->b_data;
653         fe_el = &fe->id2.i_list;
654
655         status = ocfs2_journal_access(handle, inode, new_eb_bh,
656                                       OCFS2_JOURNAL_ACCESS_CREATE);
657         if (status < 0) {
658                 mlog_errno(status);
659                 goto bail;
660         }
661
662         /* copy the fe data into the new extent block */
663         eb_el->l_tree_depth = fe_el->l_tree_depth;
664         eb_el->l_next_free_rec = fe_el->l_next_free_rec;
665         for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
666                 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
667                 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
668                 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
669         }
670
671         status = ocfs2_journal_dirty(handle, new_eb_bh);
672         if (status < 0) {
673                 mlog_errno(status);
674                 goto bail;
675         }
676
677         status = ocfs2_journal_access(handle, inode, fe_bh,
678                                       OCFS2_JOURNAL_ACCESS_WRITE);
679         if (status < 0) {
680                 mlog_errno(status);
681                 goto bail;
682         }
683
684         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
685
686         /* update fe now */
687         le16_add_cpu(&fe_el->l_tree_depth, 1);
688         fe_el->l_recs[0].e_cpos = 0;
689         fe_el->l_recs[0].e_blkno = eb->h_blkno;
690         fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
691         for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
692                 fe_el->l_recs[i].e_cpos = 0;
693                 fe_el->l_recs[i].e_clusters = 0;
694                 fe_el->l_recs[i].e_blkno = 0;
695         }
696         fe_el->l_next_free_rec = cpu_to_le16(1);
697
698         /* If this is our 1st tree depth shift, then last_eb_blk
699          * becomes the allocated extent block */
700         if (fe_el->l_tree_depth == cpu_to_le16(1))
701                 fe->i_last_eb_blk = eb->h_blkno;
702
703         status = ocfs2_journal_dirty(handle, fe_bh);
704         if (status < 0) {
705                 mlog_errno(status);
706                 goto bail;
707         }
708
709         *ret_new_eb_bh = new_eb_bh;
710         new_eb_bh = NULL;
711         status = 0;
712 bail:
713         if (new_eb_bh)
714                 brelse(new_eb_bh);
715
716         mlog_exit(status);
717         return status;
718 }
719
720 /*
721  * Should only be called when there is no space left in any of the
722  * leaf nodes. What we want to do is find the lowest tree depth
723  * non-leaf extent block with room for new records. There are three
724  * valid results of this search:
725  *
726  * 1) a lowest extent block is found, then we pass it back in
727  *    *lowest_eb_bh and return '0'
728  *
729  * 2) the search fails to find anything, but the dinode has room. We
730  *    pass NULL back in *lowest_eb_bh, but still return '0'
731  *
732  * 3) the search fails to find anything AND the dinode is full, in
733  *    which case we return > 0
734  *
735  * return status < 0 indicates an error.
736  */
737 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
738                                     struct inode *inode,
739                                     struct buffer_head *fe_bh,
740                                     struct buffer_head **target_bh)
741 {
742         int status = 0, i;
743         u64 blkno;
744         struct ocfs2_dinode *fe;
745         struct ocfs2_extent_block *eb;
746         struct ocfs2_extent_list  *el;
747         struct buffer_head *bh = NULL;
748         struct buffer_head *lowest_bh = NULL;
749
750         mlog_entry_void();
751
752         *target_bh = NULL;
753
754         fe = (struct ocfs2_dinode *) fe_bh->b_data;
755         el = &fe->id2.i_list;
756
757         while(le16_to_cpu(el->l_tree_depth) > 1) {
758                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
759                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
760                                     "extent list (next_free_rec == 0)",
761                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
762                         status = -EIO;
763                         goto bail;
764                 }
765                 i = le16_to_cpu(el->l_next_free_rec) - 1;
766                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
767                 if (!blkno) {
768                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
769                                     "list where extent # %d has no physical "
770                                     "block start",
771                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
772                         status = -EIO;
773                         goto bail;
774                 }
775
776                 if (bh) {
777                         brelse(bh);
778                         bh = NULL;
779                 }
780
781                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
782                                           inode);
783                 if (status < 0) {
784                         mlog_errno(status);
785                         goto bail;
786                 }
787
788                 eb = (struct ocfs2_extent_block *) bh->b_data;
789                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
790                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
791                         status = -EIO;
792                         goto bail;
793                 }
794                 el = &eb->h_list;
795
796                 if (le16_to_cpu(el->l_next_free_rec) <
797                     le16_to_cpu(el->l_count)) {
798                         if (lowest_bh)
799                                 brelse(lowest_bh);
800                         lowest_bh = bh;
801                         get_bh(lowest_bh);
802                 }
803         }
804
805         /* If we didn't find one and the fe doesn't have any room,
806          * then return '1' */
807         if (!lowest_bh
808             && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
809                 status = 1;
810
811         *target_bh = lowest_bh;
812 bail:
813         if (bh)
814                 brelse(bh);
815
816         mlog_exit(status);
817         return status;
818 }
819
820 static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
821 {
822         return !rec->e_clusters;
823 }
824
825 /*
826  * This function will discard the rightmost extent record.
827  */
828 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
829 {
830         int next_free = le16_to_cpu(el->l_next_free_rec);
831         int count = le16_to_cpu(el->l_count);
832         unsigned int num_bytes;
833
834         BUG_ON(!next_free);
835         /* This will cause us to go off the end of our extent list. */
836         BUG_ON(next_free >= count);
837
838         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
839
840         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
841 }
842
843 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
844                               struct ocfs2_extent_rec *insert_rec)
845 {
846         int i, insert_index, next_free, has_empty, num_bytes;
847         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
848         struct ocfs2_extent_rec *rec;
849
850         next_free = le16_to_cpu(el->l_next_free_rec);
851         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
852
853         BUG_ON(!next_free);
854
855         /* The tree code before us didn't allow enough room in the leaf. */
856         if (el->l_next_free_rec == el->l_count && !has_empty)
857                 BUG();
858
859         /*
860          * The easiest way to approach this is to just remove the
861          * empty extent and temporarily decrement next_free.
862          */
863         if (has_empty) {
864                 /*
865                  * If next_free was 1 (only an empty extent), this
866                  * loop won't execute, which is fine. We still want
867                  * the decrement above to happen.
868                  */
869                 for(i = 0; i < (next_free - 1); i++)
870                         el->l_recs[i] = el->l_recs[i+1];
871
872                 next_free--;
873         }
874
875         /*
876          * Figure out what the new record index should be.
877          */
878         for(i = 0; i < next_free; i++) {
879                 rec = &el->l_recs[i];
880
881                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
882                         break;
883         }
884         insert_index = i;
885
886         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
887              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
888
889         BUG_ON(insert_index < 0);
890         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
891         BUG_ON(insert_index > next_free);
892
893         /*
894          * No need to memmove if we're just adding to the tail.
895          */
896         if (insert_index != next_free) {
897                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
898
899                 num_bytes = next_free - insert_index;
900                 num_bytes *= sizeof(struct ocfs2_extent_rec);
901                 memmove(&el->l_recs[insert_index + 1],
902                         &el->l_recs[insert_index],
903                         num_bytes);
904         }
905
906         /*
907          * Either we had an empty extent, and need to re-increment or
908          * there was no empty extent on a non full rightmost leaf node,
909          * in which case we still need to increment.
910          */
911         next_free++;
912         el->l_next_free_rec = cpu_to_le16(next_free);
913         /*
914          * Make sure none of the math above just messed up our tree.
915          */
916         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
917
918         el->l_recs[insert_index] = *insert_rec;
919
920 }
921
922 /*
923  * Create an empty extent record .
924  *
925  * l_next_free_rec may be updated.
926  *
927  * If an empty extent already exists do nothing.
928  */
929 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
930 {
931         int next_free = le16_to_cpu(el->l_next_free_rec);
932
933         if (next_free == 0)
934                 goto set_and_inc;
935
936         if (ocfs2_is_empty_extent(&el->l_recs[0]))
937                 return;
938
939         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
940                         "Asked to create an empty extent in a full list:\n"
941                         "count = %u, tree depth = %u",
942                         le16_to_cpu(el->l_count),
943                         le16_to_cpu(el->l_tree_depth));
944
945         ocfs2_shift_records_right(el);
946
947 set_and_inc:
948         le16_add_cpu(&el->l_next_free_rec, 1);
949         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
950 }
951
952 /*
953  * For a rotation which involves two leaf nodes, the "root node" is
954  * the lowest level tree node which contains a path to both leafs. This
955  * resulting set of information can be used to form a complete "subtree"
956  *
957  * This function is passed two full paths from the dinode down to a
958  * pair of adjacent leaves. It's task is to figure out which path
959  * index contains the subtree root - this can be the root index itself
960  * in a worst-case rotation.
961  *
962  * The array index of the subtree root is passed back.
963  */
964 static int ocfs2_find_subtree_root(struct inode *inode,
965                                    struct ocfs2_path *left,
966                                    struct ocfs2_path *right)
967 {
968         int i = 0;
969
970         /*
971          * Check that the caller passed in two paths from the same tree.
972          */
973         BUG_ON(path_root_bh(left) != path_root_bh(right));
974
975         do {
976                 i++;
977
978                 /*
979                  * The caller didn't pass two adjacent paths.
980                  */
981                 mlog_bug_on_msg(i > left->p_tree_depth,
982                                 "Inode %lu, left depth %u, right depth %u\n"
983                                 "left leaf blk %llu, right leaf blk %llu\n",
984                                 inode->i_ino, left->p_tree_depth,
985                                 right->p_tree_depth,
986                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
987                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
988         } while (left->p_node[i].bh->b_blocknr ==
989                  right->p_node[i].bh->b_blocknr);
990
991         return i - 1;
992 }
993
994 typedef void (path_insert_t)(void *, struct buffer_head *);
995
996 /*
997  * Traverse a btree path in search of cpos, starting at root_el.
998  *
999  * This code can be called with a cpos larger than the tree, in which
1000  * case it will return the rightmost path.
1001  */
1002 static int __ocfs2_find_path(struct inode *inode,
1003                              struct ocfs2_extent_list *root_el, u32 cpos,
1004                              path_insert_t *func, void *data)
1005 {
1006         int i, ret = 0;
1007         u32 range;
1008         u64 blkno;
1009         struct buffer_head *bh = NULL;
1010         struct ocfs2_extent_block *eb;
1011         struct ocfs2_extent_list *el;
1012         struct ocfs2_extent_rec *rec;
1013         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1014
1015         el = root_el;
1016         while (el->l_tree_depth) {
1017                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1018                         ocfs2_error(inode->i_sb,
1019                                     "Inode %llu has empty extent list at "
1020                                     "depth %u\n",
1021                                     (unsigned long long)oi->ip_blkno,
1022                                     le16_to_cpu(el->l_tree_depth));
1023                         ret = -EROFS;
1024                         goto out;
1025
1026                 }
1027
1028                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1029                         rec = &el->l_recs[i];
1030
1031                         /*
1032                          * In the case that cpos is off the allocation
1033                          * tree, this should just wind up returning the
1034                          * rightmost record.
1035                          */
1036                         range = le32_to_cpu(rec->e_cpos) +
1037                                 le32_to_cpu(rec->e_clusters);
1038                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1039                             break;
1040                 }
1041
1042                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1043                 if (blkno == 0) {
1044                         ocfs2_error(inode->i_sb,
1045                                     "Inode %llu has bad blkno in extent list "
1046                                     "at depth %u (index %d)\n",
1047                                     (unsigned long long)oi->ip_blkno,
1048                                     le16_to_cpu(el->l_tree_depth), i);
1049                         ret = -EROFS;
1050                         goto out;
1051                 }
1052
1053                 brelse(bh);
1054                 bh = NULL;
1055                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1056                                        &bh, OCFS2_BH_CACHED, inode);
1057                 if (ret) {
1058                         mlog_errno(ret);
1059                         goto out;
1060                 }
1061
1062                 eb = (struct ocfs2_extent_block *) bh->b_data;
1063                 el = &eb->h_list;
1064                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1065                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1066                         ret = -EIO;
1067                         goto out;
1068                 }
1069
1070                 if (le16_to_cpu(el->l_next_free_rec) >
1071                     le16_to_cpu(el->l_count)) {
1072                         ocfs2_error(inode->i_sb,
1073                                     "Inode %llu has bad count in extent list "
1074                                     "at block %llu (next free=%u, count=%u)\n",
1075                                     (unsigned long long)oi->ip_blkno,
1076                                     (unsigned long long)bh->b_blocknr,
1077                                     le16_to_cpu(el->l_next_free_rec),
1078                                     le16_to_cpu(el->l_count));
1079                         ret = -EROFS;
1080                         goto out;
1081                 }
1082
1083                 if (func)
1084                         func(data, bh);
1085         }
1086
1087 out:
1088         /*
1089          * Catch any trailing bh that the loop didn't handle.
1090          */
1091         brelse(bh);
1092
1093         return ret;
1094 }
1095
1096 /*
1097  * Given an initialized path (that is, it has a valid root extent
1098  * list), this function will traverse the btree in search of the path
1099  * which would contain cpos.
1100  *
1101  * The path traveled is recorded in the path structure.
1102  *
1103  * Note that this will not do any comparisons on leaf node extent
1104  * records, so it will work fine in the case that we just added a tree
1105  * branch.
1106  */
1107 struct find_path_data {
1108         int index;
1109         struct ocfs2_path *path;
1110 };
1111 static void find_path_ins(void *data, struct buffer_head *bh)
1112 {
1113         struct find_path_data *fp = data;
1114
1115         get_bh(bh);
1116         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1117         fp->index++;
1118 }
1119 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1120                            u32 cpos)
1121 {
1122         struct find_path_data data;
1123
1124         data.index = 1;
1125         data.path = path;
1126         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1127                                  find_path_ins, &data);
1128 }
1129
1130 static void find_leaf_ins(void *data, struct buffer_head *bh)
1131 {
1132         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1133         struct ocfs2_extent_list *el = &eb->h_list;
1134         struct buffer_head **ret = data;
1135
1136         /* We want to retain only the leaf block. */
1137         if (le16_to_cpu(el->l_tree_depth) == 0) {
1138                 get_bh(bh);
1139                 *ret = bh;
1140         }
1141 }
1142 /*
1143  * Find the leaf block in the tree which would contain cpos. No
1144  * checking of the actual leaf is done.
1145  *
1146  * Some paths want to call this instead of allocating a path structure
1147  * and calling ocfs2_find_path().
1148  *
1149  * This function doesn't handle non btree extent lists.
1150  */
1151 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1152                     u32 cpos, struct buffer_head **leaf_bh)
1153 {
1154         int ret;
1155         struct buffer_head *bh = NULL;
1156
1157         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1158         if (ret) {
1159                 mlog_errno(ret);
1160                 goto out;
1161         }
1162
1163         *leaf_bh = bh;
1164 out:
1165         return ret;
1166 }
1167
1168 /*
1169  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1170  *
1171  * Basically, we've moved stuff around at the bottom of the tree and
1172  * we need to fix up the extent records above the changes to reflect
1173  * the new changes.
1174  *
1175  * left_rec: the record on the left.
1176  * left_child_el: is the child list pointed to by left_rec
1177  * right_rec: the record to the right of left_rec
1178  * right_child_el: is the child list pointed to by right_rec
1179  *
1180  * By definition, this only works on interior nodes.
1181  */
1182 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1183                                   struct ocfs2_extent_list *left_child_el,
1184                                   struct ocfs2_extent_rec *right_rec,
1185                                   struct ocfs2_extent_list *right_child_el)
1186 {
1187         u32 left_clusters, right_end;
1188
1189         /*
1190          * Interior nodes never have holes. Their cpos is the cpos of
1191          * the leftmost record in their child list. Their cluster
1192          * count covers the full theoretical range of their child list
1193          * - the range between their cpos and the cpos of the record
1194          * immediately to their right.
1195          */
1196         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1197         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1198         left_rec->e_clusters = cpu_to_le32(left_clusters);
1199
1200         /*
1201          * Calculate the rightmost cluster count boundary before
1202          * moving cpos - we will need to adjust e_clusters after
1203          * updating e_cpos to keep the same highest cluster count.
1204          */
1205         right_end = le32_to_cpu(right_rec->e_cpos);
1206         right_end += le32_to_cpu(right_rec->e_clusters);
1207
1208         right_rec->e_cpos = left_rec->e_cpos;
1209         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1210
1211         right_end -= le32_to_cpu(right_rec->e_cpos);
1212         right_rec->e_clusters = cpu_to_le32(right_end);
1213 }
1214
1215 /*
1216  * Adjust the adjacent root node records involved in a
1217  * rotation. left_el_blkno is passed in as a key so that we can easily
1218  * find it's index in the root list.
1219  */
1220 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1221                                       struct ocfs2_extent_list *left_el,
1222                                       struct ocfs2_extent_list *right_el,
1223                                       u64 left_el_blkno)
1224 {
1225         int i;
1226
1227         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1228                le16_to_cpu(left_el->l_tree_depth));
1229
1230         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1231                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1232                         break;
1233         }
1234
1235         /*
1236          * The path walking code should have never returned a root and
1237          * two paths which are not adjacent.
1238          */
1239         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1240
1241         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1242                                       &root_el->l_recs[i + 1], right_el);
1243 }
1244
1245 /*
1246  * We've changed a leaf block (in right_path) and need to reflect that
1247  * change back up the subtree.
1248  *
1249  * This happens in multiple places:
1250  *   - When we've moved an extent record from the left path leaf to the right
1251  *     path leaf to make room for an empty extent in the left path leaf.
1252  *   - When our insert into the right path leaf is at the leftmost edge
1253  *     and requires an update of the path immediately to it's left. This
1254  *     can occur at the end of some types of rotation and appending inserts.
1255  */
1256 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1257                                        struct ocfs2_path *left_path,
1258                                        struct ocfs2_path *right_path,
1259                                        int subtree_index)
1260 {
1261         int ret, i, idx;
1262         struct ocfs2_extent_list *el, *left_el, *right_el;
1263         struct ocfs2_extent_rec *left_rec, *right_rec;
1264         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1265
1266         /*
1267          * Update the counts and position values within all the
1268          * interior nodes to reflect the leaf rotation we just did.
1269          *
1270          * The root node is handled below the loop.
1271          *
1272          * We begin the loop with right_el and left_el pointing to the
1273          * leaf lists and work our way up.
1274          *
1275          * NOTE: within this loop, left_el and right_el always refer
1276          * to the *child* lists.
1277          */
1278         left_el = path_leaf_el(left_path);
1279         right_el = path_leaf_el(right_path);
1280         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1281                 mlog(0, "Adjust records at index %u\n", i);
1282
1283                 /*
1284                  * One nice property of knowing that all of these
1285                  * nodes are below the root is that we only deal with
1286                  * the leftmost right node record and the rightmost
1287                  * left node record.
1288                  */
1289                 el = left_path->p_node[i].el;
1290                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1291                 left_rec = &el->l_recs[idx];
1292
1293                 el = right_path->p_node[i].el;
1294                 right_rec = &el->l_recs[0];
1295
1296                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1297                                               right_el);
1298
1299                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1300                 if (ret)
1301                         mlog_errno(ret);
1302
1303                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1304                 if (ret)
1305                         mlog_errno(ret);
1306
1307                 /*
1308                  * Setup our list pointers now so that the current
1309                  * parents become children in the next iteration.
1310                  */
1311                 left_el = left_path->p_node[i].el;
1312                 right_el = right_path->p_node[i].el;
1313         }
1314
1315         /*
1316          * At the root node, adjust the two adjacent records which
1317          * begin our path to the leaves.
1318          */
1319
1320         el = left_path->p_node[subtree_index].el;
1321         left_el = left_path->p_node[subtree_index + 1].el;
1322         right_el = right_path->p_node[subtree_index + 1].el;
1323
1324         ocfs2_adjust_root_records(el, left_el, right_el,
1325                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1326
1327         root_bh = left_path->p_node[subtree_index].bh;
1328
1329         ret = ocfs2_journal_dirty(handle, root_bh);
1330         if (ret)
1331                 mlog_errno(ret);
1332 }
1333
1334 static int ocfs2_rotate_subtree_right(struct inode *inode,
1335                                       handle_t *handle,
1336                                       struct ocfs2_path *left_path,
1337                                       struct ocfs2_path *right_path,
1338                                       int subtree_index)
1339 {
1340         int ret, i;
1341         struct buffer_head *right_leaf_bh;
1342         struct buffer_head *left_leaf_bh = NULL;
1343         struct buffer_head *root_bh;
1344         struct ocfs2_extent_list *right_el, *left_el;
1345         struct ocfs2_extent_rec move_rec;
1346
1347         left_leaf_bh = path_leaf_bh(left_path);
1348         left_el = path_leaf_el(left_path);
1349
1350         if (left_el->l_next_free_rec != left_el->l_count) {
1351                 ocfs2_error(inode->i_sb,
1352                             "Inode %llu has non-full interior leaf node %llu"
1353                             "(next free = %u)",
1354                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1355                             (unsigned long long)left_leaf_bh->b_blocknr,
1356                             le16_to_cpu(left_el->l_next_free_rec));
1357                 return -EROFS;
1358         }
1359
1360         /*
1361          * This extent block may already have an empty record, so we
1362          * return early if so.
1363          */
1364         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1365                 return 0;
1366
1367         root_bh = left_path->p_node[subtree_index].bh;
1368         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1369
1370         ret = ocfs2_journal_access(handle, inode, root_bh,
1371                                    OCFS2_JOURNAL_ACCESS_WRITE);
1372         if (ret) {
1373                 mlog_errno(ret);
1374                 goto out;
1375         }
1376
1377         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1378                 ret = ocfs2_journal_access(handle, inode,
1379                                            right_path->p_node[i].bh,
1380                                            OCFS2_JOURNAL_ACCESS_WRITE);
1381                 if (ret) {
1382                         mlog_errno(ret);
1383                         goto out;
1384                 }
1385
1386                 ret = ocfs2_journal_access(handle, inode,
1387                                            left_path->p_node[i].bh,
1388                                            OCFS2_JOURNAL_ACCESS_WRITE);
1389                 if (ret) {
1390                         mlog_errno(ret);
1391                         goto out;
1392                 }
1393         }
1394
1395         right_leaf_bh = path_leaf_bh(right_path);
1396         right_el = path_leaf_el(right_path);
1397
1398         /* This is a code error, not a disk corruption. */
1399         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1400                         "because rightmost leaf block %llu is empty\n",
1401                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1402                         (unsigned long long)right_leaf_bh->b_blocknr);
1403
1404         ocfs2_create_empty_extent(right_el);
1405
1406         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1407         if (ret) {
1408                 mlog_errno(ret);
1409                 goto out;
1410         }
1411
1412         /* Do the copy now. */
1413         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1414         move_rec = left_el->l_recs[i];
1415         right_el->l_recs[0] = move_rec;
1416
1417         /*
1418          * Clear out the record we just copied and shift everything
1419          * over, leaving an empty extent in the left leaf.
1420          *
1421          * We temporarily subtract from next_free_rec so that the
1422          * shift will lose the tail record (which is now defunct).
1423          */
1424         le16_add_cpu(&left_el->l_next_free_rec, -1);
1425         ocfs2_shift_records_right(left_el);
1426         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1427         le16_add_cpu(&left_el->l_next_free_rec, 1);
1428
1429         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1430         if (ret) {
1431                 mlog_errno(ret);
1432                 goto out;
1433         }
1434
1435         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1436                                 subtree_index);
1437
1438 out:
1439         return ret;
1440 }
1441
1442 /*
1443  * Given a full path, determine what cpos value would return us a path
1444  * containing the leaf immediately to the left of the current one.
1445  *
1446  * Will return zero if the path passed in is already the leftmost path.
1447  */
1448 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1449                                          struct ocfs2_path *path, u32 *cpos)
1450 {
1451         int i, j, ret = 0;
1452         u64 blkno;
1453         struct ocfs2_extent_list *el;
1454
1455         *cpos = 0;
1456
1457         blkno = path_leaf_bh(path)->b_blocknr;
1458
1459         /* Start at the tree node just above the leaf and work our way up. */
1460         i = path->p_tree_depth - 1;
1461         while (i >= 0) {
1462                 el = path->p_node[i].el;
1463
1464                 /*
1465                  * Find the extent record just before the one in our
1466                  * path.
1467                  */
1468                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1469                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1470                                 if (j == 0) {
1471                                         if (i == 0) {
1472                                                 /*
1473                                                  * We've determined that the
1474                                                  * path specified is already
1475                                                  * the leftmost one - return a
1476                                                  * cpos of zero.
1477                                                  */
1478                                                 goto out;
1479                                         }
1480                                         /*
1481                                          * The leftmost record points to our
1482                                          * leaf - we need to travel up the
1483                                          * tree one level.
1484                                          */
1485                                         goto next_node;
1486                                 }
1487
1488                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1489                                 *cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
1490                                 goto out;
1491                         }
1492                 }
1493
1494                 /*
1495                  * If we got here, we never found a valid node where
1496                  * the tree indicated one should be.
1497                  */
1498                 ocfs2_error(sb,
1499                             "Invalid extent tree at extent block %llu\n",
1500                             (unsigned long long)blkno);
1501                 ret = -EROFS;
1502                 goto out;
1503
1504 next_node:
1505                 blkno = path->p_node[i].bh->b_blocknr;
1506                 i--;
1507         }
1508
1509 out:
1510         return ret;
1511 }
1512
1513 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1514                                            struct ocfs2_path *path)
1515 {
1516         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1517
1518         if (handle->h_buffer_credits < credits)
1519                 return ocfs2_extend_trans(handle, credits);
1520
1521         return 0;
1522 }
1523
1524 /*
1525  * Trap the case where we're inserting into the theoretical range past
1526  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1527  * whose cpos is less than ours into the right leaf.
1528  *
1529  * It's only necessary to look at the rightmost record of the left
1530  * leaf because the logic that calls us should ensure that the
1531  * theoretical ranges in the path components above the leaves are
1532  * correct.
1533  */
1534 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1535                                                  u32 insert_cpos)
1536 {
1537         struct ocfs2_extent_list *left_el;
1538         struct ocfs2_extent_rec *rec;
1539         int next_free;
1540
1541         left_el = path_leaf_el(left_path);
1542         next_free = le16_to_cpu(left_el->l_next_free_rec);
1543         rec = &left_el->l_recs[next_free - 1];
1544
1545         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1546                 return 1;
1547         return 0;
1548 }
1549
1550 /*
1551  * Rotate all the records in a btree right one record, starting at insert_cpos.
1552  *
1553  * The path to the rightmost leaf should be passed in.
1554  *
1555  * The array is assumed to be large enough to hold an entire path (tree depth).
1556  *
1557  * Upon succesful return from this function:
1558  *
1559  * - The 'right_path' array will contain a path to the leaf block
1560  *   whose range contains e_cpos.
1561  * - That leaf block will have a single empty extent in list index 0.
1562  * - In the case that the rotation requires a post-insert update,
1563  *   *ret_left_path will contain a valid path which can be passed to
1564  *   ocfs2_insert_path().
1565  */
1566 static int ocfs2_rotate_tree_right(struct inode *inode,
1567                                    handle_t *handle,
1568                                    u32 insert_cpos,
1569                                    struct ocfs2_path *right_path,
1570                                    struct ocfs2_path **ret_left_path)
1571 {
1572         int ret, start;
1573         u32 cpos;
1574         struct ocfs2_path *left_path = NULL;
1575
1576         *ret_left_path = NULL;
1577
1578         left_path = ocfs2_new_path(path_root_bh(right_path),
1579                                    path_root_el(right_path));
1580         if (!left_path) {
1581                 ret = -ENOMEM;
1582                 mlog_errno(ret);
1583                 goto out;
1584         }
1585
1586         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1587         if (ret) {
1588                 mlog_errno(ret);
1589                 goto out;
1590         }
1591
1592         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1593
1594         /*
1595          * What we want to do here is:
1596          *
1597          * 1) Start with the rightmost path.
1598          *
1599          * 2) Determine a path to the leaf block directly to the left
1600          *    of that leaf.
1601          *
1602          * 3) Determine the 'subtree root' - the lowest level tree node
1603          *    which contains a path to both leaves.
1604          *
1605          * 4) Rotate the subtree.
1606          *
1607          * 5) Find the next subtree by considering the left path to be
1608          *    the new right path.
1609          *
1610          * The check at the top of this while loop also accepts
1611          * insert_cpos == cpos because cpos is only a _theoretical_
1612          * value to get us the left path - insert_cpos might very well
1613          * be filling that hole.
1614          *
1615          * Stop at a cpos of '0' because we either started at the
1616          * leftmost branch (i.e., a tree with one branch and a
1617          * rotation inside of it), or we've gone as far as we can in
1618          * rotating subtrees.
1619          */
1620         while (cpos && insert_cpos <= cpos) {
1621                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1622                      insert_cpos, cpos);
1623
1624                 ret = ocfs2_find_path(inode, left_path, cpos);
1625                 if (ret) {
1626                         mlog_errno(ret);
1627                         goto out;
1628                 }
1629
1630                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1631                                 path_leaf_bh(right_path),
1632                                 "Inode %lu: error during insert of %u "
1633                                 "(left path cpos %u) results in two identical "
1634                                 "paths ending at %llu\n",
1635                                 inode->i_ino, insert_cpos, cpos,
1636                                 (unsigned long long)
1637                                 path_leaf_bh(left_path)->b_blocknr);
1638
1639                 if (ocfs2_rotate_requires_path_adjustment(left_path,
1640                                                           insert_cpos)) {
1641                         mlog(0, "Path adjustment required\n");
1642
1643                         /*
1644                          * We've rotated the tree as much as we
1645                          * should. The rest is up to
1646                          * ocfs2_insert_path() to complete, after the
1647                          * record insertion. We indicate this
1648                          * situation by returning the left path.
1649                          *
1650                          * The reason we don't adjust the records here
1651                          * before the record insert is that an error
1652                          * later might break the rule where a parent
1653                          * record e_cpos will reflect the actual
1654                          * e_cpos of the 1st nonempty record of the
1655                          * child list.
1656                          */
1657                         *ret_left_path = left_path;
1658                         goto out_ret_path;
1659                 }
1660
1661                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1662
1663                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1664                      start,
1665                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1666                      right_path->p_tree_depth);
1667
1668                 ret = ocfs2_extend_rotate_transaction(handle, start,
1669                                                       right_path);
1670                 if (ret) {
1671                         mlog_errno(ret);
1672                         goto out;
1673                 }
1674
1675                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1676                                                  right_path, start);
1677                 if (ret) {
1678                         mlog_errno(ret);
1679                         goto out;
1680                 }
1681
1682                 /*
1683                  * There is no need to re-read the next right path
1684                  * as we know that it'll be our current left
1685                  * path. Optimize by copying values instead.
1686                  */
1687                 ocfs2_mv_path(right_path, left_path);
1688
1689                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1690                                                     &cpos);
1691                 if (ret) {
1692                         mlog_errno(ret);
1693                         goto out;
1694                 }
1695         }
1696
1697 out:
1698         ocfs2_free_path(left_path);
1699
1700 out_ret_path:
1701         return ret;
1702 }
1703
1704 /*
1705  * Do the final bits of extent record insertion at the target leaf
1706  * list. If this leaf is part of an allocation tree, it is assumed
1707  * that the tree above has been prepared.
1708  */
1709 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1710                                  struct ocfs2_extent_list *el,
1711                                  struct ocfs2_insert_type *insert,
1712                                  struct inode *inode)
1713 {
1714         int i = insert->ins_contig_index;
1715         unsigned int range;
1716         struct ocfs2_extent_rec *rec;
1717
1718         BUG_ON(el->l_tree_depth);
1719
1720         /*
1721          * Contiguous insert - either left or right.
1722          */
1723         if (insert->ins_contig != CONTIG_NONE) {
1724                 rec = &el->l_recs[i];
1725                 if (insert->ins_contig == CONTIG_LEFT) {
1726                         rec->e_blkno = insert_rec->e_blkno;
1727                         rec->e_cpos = insert_rec->e_cpos;
1728                 }
1729                 le32_add_cpu(&rec->e_clusters,
1730                              le32_to_cpu(insert_rec->e_clusters));
1731                 return;
1732         }
1733
1734         /*
1735          * Handle insert into an empty leaf.
1736          */
1737         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1738             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1739              ocfs2_is_empty_extent(&el->l_recs[0]))) {
1740                 el->l_recs[0] = *insert_rec;
1741                 el->l_next_free_rec = cpu_to_le16(1);
1742                 return;
1743         }
1744
1745         /*
1746          * Appending insert.
1747          */
1748         if (insert->ins_appending == APPEND_TAIL) {
1749                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1750                 rec = &el->l_recs[i];
1751                 range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
1752                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1753
1754                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1755                                 le16_to_cpu(el->l_count),
1756                                 "inode %lu, depth %u, count %u, next free %u, "
1757                                 "rec.cpos %u, rec.clusters %u, "
1758                                 "insert.cpos %u, insert.clusters %u\n",
1759                                 inode->i_ino,
1760                                 le16_to_cpu(el->l_tree_depth),
1761                                 le16_to_cpu(el->l_count),
1762                                 le16_to_cpu(el->l_next_free_rec),
1763                                 le32_to_cpu(el->l_recs[i].e_cpos),
1764                                 le32_to_cpu(el->l_recs[i].e_clusters),
1765                                 le32_to_cpu(insert_rec->e_cpos),
1766                                 le32_to_cpu(insert_rec->e_clusters));
1767                 i++;
1768                 el->l_recs[i] = *insert_rec;
1769                 le16_add_cpu(&el->l_next_free_rec, 1);
1770                 return;
1771         }
1772
1773         /*
1774          * Ok, we have to rotate.
1775          *
1776          * At this point, it is safe to assume that inserting into an
1777          * empty leaf and appending to a leaf have both been handled
1778          * above.
1779          *
1780          * This leaf needs to have space, either by the empty 1st
1781          * extent record, or by virtue of an l_next_rec < l_count.
1782          */
1783         ocfs2_rotate_leaf(el, insert_rec);
1784 }
1785
1786 static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1787                                                 struct ocfs2_dinode *di,
1788                                                 u32 clusters)
1789 {
1790         le32_add_cpu(&di->i_clusters, clusters);
1791         spin_lock(&OCFS2_I(inode)->ip_lock);
1792         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1793         spin_unlock(&OCFS2_I(inode)->ip_lock);
1794 }
1795
1796 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1797                                     struct ocfs2_extent_rec *insert_rec,
1798                                     struct ocfs2_path *right_path,
1799                                     struct ocfs2_path **ret_left_path)
1800 {
1801         int ret, i, next_free;
1802         struct buffer_head *bh;
1803         struct ocfs2_extent_list *el;
1804         struct ocfs2_path *left_path = NULL;
1805
1806         *ret_left_path = NULL;
1807
1808         /*
1809          * If our appending insert is at the leftmost edge of a leaf,
1810          * then we might need to update the rightmost records of the
1811          * neighboring path.
1812          */
1813         el = path_leaf_el(right_path);
1814         next_free = le16_to_cpu(el->l_next_free_rec);
1815         if (next_free == 0 ||
1816             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1817                 u32 left_cpos;
1818
1819                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1820                                                     &left_cpos);
1821                 if (ret) {
1822                         mlog_errno(ret);
1823                         goto out;
1824                 }
1825
1826                 mlog(0, "Append may need a left path update. cpos: %u, "
1827                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1828                      left_cpos);
1829
1830                 /*
1831                  * No need to worry if the append is already in the
1832                  * leftmost leaf.
1833                  */
1834                 if (left_cpos) {
1835                         left_path = ocfs2_new_path(path_root_bh(right_path),
1836                                                    path_root_el(right_path));
1837                         if (!left_path) {
1838                                 ret = -ENOMEM;
1839                                 mlog_errno(ret);
1840                                 goto out;
1841                         }
1842
1843                         ret = ocfs2_find_path(inode, left_path, left_cpos);
1844                         if (ret) {
1845                                 mlog_errno(ret);
1846                                 goto out;
1847                         }
1848
1849                         /*
1850                          * ocfs2_insert_path() will pass the left_path to the
1851                          * journal for us.
1852                          */
1853                 }
1854         }
1855
1856         ret = ocfs2_journal_access_path(inode, handle, right_path);
1857         if (ret) {
1858                 mlog_errno(ret);
1859                 goto out;
1860         }
1861
1862         el = path_root_el(right_path);
1863         bh = path_root_bh(right_path);
1864         i = 0;
1865         while (1) {
1866                 next_free = le16_to_cpu(el->l_next_free_rec);
1867                 if (next_free == 0) {
1868                         ocfs2_error(inode->i_sb,
1869                                     "Dinode %llu has a bad extent list",
1870                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1871                         ret = -EIO;
1872                         goto out;
1873                 }
1874
1875                 el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
1876                 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
1877                              le32_to_cpu(insert_rec->e_clusters));
1878                 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
1879                             -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));
1880
1881                 ret = ocfs2_journal_dirty(handle, bh);
1882                 if (ret)
1883                         mlog_errno(ret);
1884
1885                 if (++i >= right_path->p_tree_depth)
1886                         break;
1887
1888                 bh = right_path->p_node[i].bh;
1889                 el = right_path->p_node[i].el;
1890         }
1891
1892         *ret_left_path = left_path;
1893         ret = 0;
1894 out:
1895         if (ret != 0)
1896                 ocfs2_free_path(left_path);
1897
1898         return ret;
1899 }
1900
1901 /*
1902  * This function only does inserts on an allocation b-tree. For dinode
1903  * lists, ocfs2_insert_at_leaf() is called directly.
1904  *
1905  * right_path is the path we want to do the actual insert
1906  * in. left_path should only be passed in if we need to update that
1907  * portion of the tree after an edge insert.
1908  */
1909 static int ocfs2_insert_path(struct inode *inode,
1910                              handle_t *handle,
1911                              struct ocfs2_path *left_path,
1912                              struct ocfs2_path *right_path,
1913                              struct ocfs2_extent_rec *insert_rec,
1914                              struct ocfs2_insert_type *insert)
1915 {
1916         int ret, subtree_index;
1917         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1918         struct ocfs2_extent_list *el;
1919
1920         /*
1921          * Pass both paths to the journal. The majority of inserts
1922          * will be touching all components anyway.
1923          */
1924         ret = ocfs2_journal_access_path(inode, handle, right_path);
1925         if (ret < 0) {
1926                 mlog_errno(ret);
1927                 goto out;
1928         }
1929
1930         if (left_path) {
1931                 int credits = handle->h_buffer_credits;
1932
1933                 /*
1934                  * There's a chance that left_path got passed back to
1935                  * us without being accounted for in the
1936                  * journal. Extend our transaction here to be sure we
1937                  * can change those blocks.
1938                  */
1939                 credits += left_path->p_tree_depth;
1940
1941                 ret = ocfs2_extend_trans(handle, credits);
1942                 if (ret < 0) {
1943                         mlog_errno(ret);
1944                         goto out;
1945                 }
1946
1947                 ret = ocfs2_journal_access_path(inode, handle, left_path);
1948                 if (ret < 0) {
1949                         mlog_errno(ret);
1950                         goto out;
1951                 }
1952         }
1953
1954         el = path_leaf_el(right_path);
1955
1956         ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1957         ret = ocfs2_journal_dirty(handle, leaf_bh);
1958         if (ret)
1959                 mlog_errno(ret);
1960
1961         if (left_path) {
1962                 /*
1963                  * The rotate code has indicated that we need to fix
1964                  * up portions of the tree after the insert.
1965                  *
1966                  * XXX: Should we extend the transaction here?
1967                  */
1968                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
1969                                                         right_path);
1970                 ocfs2_complete_edge_insert(inode, handle, left_path,
1971                                            right_path, subtree_index);
1972         }
1973
1974         ret = 0;
1975 out:
1976         return ret;
1977 }
1978
1979 static int ocfs2_do_insert_extent(struct inode *inode,
1980                                   handle_t *handle,
1981                                   struct buffer_head *di_bh,
1982                                   struct ocfs2_extent_rec *insert_rec,
1983                                   struct ocfs2_insert_type *type)
1984 {
1985         int ret, rotate = 0;
1986         u32 cpos;
1987         struct ocfs2_path *right_path = NULL;
1988         struct ocfs2_path *left_path = NULL;
1989         struct ocfs2_dinode *di;
1990         struct ocfs2_extent_list *el;
1991
1992         di = (struct ocfs2_dinode *) di_bh->b_data;
1993         el = &di->id2.i_list;
1994
1995         ret = ocfs2_journal_access(handle, inode, di_bh,
1996                                    OCFS2_JOURNAL_ACCESS_WRITE);
1997         if (ret) {
1998                 mlog_errno(ret);
1999                 goto out;
2000         }
2001
2002         if (le16_to_cpu(el->l_tree_depth) == 0) {
2003                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2004                 goto out_update_clusters;
2005         }
2006
2007         right_path = ocfs2_new_inode_path(di_bh);
2008         if (!right_path) {
2009                 ret = -ENOMEM;
2010                 mlog_errno(ret);
2011                 goto out;
2012         }
2013
2014         /*
2015          * Determine the path to start with. Rotations need the
2016          * rightmost path, everything else can go directly to the
2017          * target leaf.
2018          */
2019         cpos = le32_to_cpu(insert_rec->e_cpos);
2020         if (type->ins_appending == APPEND_NONE &&
2021             type->ins_contig == CONTIG_NONE) {
2022                 rotate = 1;
2023                 cpos = UINT_MAX;
2024         }
2025
2026         ret = ocfs2_find_path(inode, right_path, cpos);
2027         if (ret) {
2028                 mlog_errno(ret);
2029                 goto out;
2030         }
2031
2032         /*
2033          * Rotations and appends need special treatment - they modify
2034          * parts of the tree's above them.
2035          *
2036          * Both might pass back a path immediate to the left of the
2037          * one being inserted to. This will be cause
2038          * ocfs2_insert_path() to modify the rightmost records of
2039          * left_path to account for an edge insert.
2040          *
2041          * XXX: When modifying this code, keep in mind that an insert
2042          * can wind up skipping both of these two special cases...
2043          */
2044         if (rotate) {
2045                 ret = ocfs2_rotate_tree_right(inode, handle,
2046                                               le32_to_cpu(insert_rec->e_cpos),
2047                                               right_path, &left_path);
2048                 if (ret) {
2049                         mlog_errno(ret);
2050                         goto out;
2051                 }
2052         } else if (type->ins_appending == APPEND_TAIL
2053                    && type->ins_contig != CONTIG_LEFT) {
2054                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2055                                                right_path, &left_path);
2056                 if (ret) {
2057                         mlog_errno(ret);
2058                         goto out;
2059                 }
2060         }
2061
2062         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2063                                 insert_rec, type);
2064         if (ret) {
2065                 mlog_errno(ret);
2066                 goto out;
2067         }
2068
2069 out_update_clusters:
2070         ocfs2_update_dinode_clusters(inode, di,
2071                                      le32_to_cpu(insert_rec->e_clusters));
2072
2073         ret = ocfs2_journal_dirty(handle, di_bh);
2074         if (ret)
2075                 mlog_errno(ret);
2076
2077 out:
2078         ocfs2_free_path(left_path);
2079         ocfs2_free_path(right_path);
2080
2081         return ret;
2082 }
2083
2084 static void ocfs2_figure_contig_type(struct inode *inode,
2085                                      struct ocfs2_insert_type *insert,
2086                                      struct ocfs2_extent_list *el,
2087                                      struct ocfs2_extent_rec *insert_rec)
2088 {
2089         int i;
2090         enum ocfs2_contig_type contig_type = CONTIG_NONE;
2091
2092         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2093                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2094                                                   insert_rec);
2095                 if (contig_type != CONTIG_NONE) {
2096                         insert->ins_contig_index = i;
2097                         break;
2098                 }
2099         }
2100         insert->ins_contig = contig_type;
2101 }
2102
2103 /*
2104  * This should only be called against the righmost leaf extent list.
2105  *
2106  * ocfs2_figure_appending_type() will figure out whether we'll have to
2107  * insert at the tail of the rightmost leaf.
2108  *
2109  * This should also work against the dinode list for tree's with 0
2110  * depth. If we consider the dinode list to be the rightmost leaf node
2111  * then the logic here makes sense.
2112  */
2113 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2114                                         struct ocfs2_extent_list *el,
2115                                         struct ocfs2_extent_rec *insert_rec)
2116 {
2117         int i;
2118         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2119         struct ocfs2_extent_rec *rec;
2120
2121         insert->ins_appending = APPEND_NONE;
2122
2123         BUG_ON(el->l_tree_depth);
2124
2125         if (!el->l_next_free_rec)
2126                 goto set_tail_append;
2127
2128         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2129                 /* Were all records empty? */
2130                 if (le16_to_cpu(el->l_next_free_rec) == 1)
2131                         goto set_tail_append;
2132         }
2133
2134         i = le16_to_cpu(el->l_next_free_rec) - 1;
2135         rec = &el->l_recs[i];
2136
2137         if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
2138                 goto set_tail_append;
2139
2140         return;
2141
2142 set_tail_append:
2143         insert->ins_appending = APPEND_TAIL;
2144 }
2145
2146 /*
2147  * Helper function called at the begining of an insert.
2148  *
2149  * This computes a few things that are commonly used in the process of
2150  * inserting into the btree:
2151  *   - Whether the new extent is contiguous with an existing one.
2152  *   - The current tree depth.
2153  *   - Whether the insert is an appending one.
2154  *   - The total # of free records in the tree.
2155  *
2156  * All of the information is stored on the ocfs2_insert_type
2157  * structure.
2158  */
2159 static int ocfs2_figure_insert_type(struct inode *inode,
2160                                     struct buffer_head *di_bh,
2161                                     struct buffer_head **last_eb_bh,
2162                                     struct ocfs2_extent_rec *insert_rec,
2163                                     struct ocfs2_insert_type *insert)
2164 {
2165         int ret;
2166         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2167         struct ocfs2_extent_block *eb;
2168         struct ocfs2_extent_list *el;
2169         struct ocfs2_path *path = NULL;
2170         struct buffer_head *bh = NULL;
2171
2172         el = &di->id2.i_list;
2173         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2174
2175         if (el->l_tree_depth) {
2176                 /*
2177                  * If we have tree depth, we read in the
2178                  * rightmost extent block ahead of time as
2179                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
2180                  * may want it later.
2181                  */
2182                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2183                                        le64_to_cpu(di->i_last_eb_blk), &bh,
2184                                        OCFS2_BH_CACHED, inode);
2185                 if (ret) {
2186                         mlog_exit(ret);
2187                         goto out;
2188                 }
2189                 eb = (struct ocfs2_extent_block *) bh->b_data;
2190                 el = &eb->h_list;
2191         }
2192
2193         /*
2194          * Unless we have a contiguous insert, we'll need to know if
2195          * there is room left in our allocation tree for another
2196          * extent record.
2197          *
2198          * XXX: This test is simplistic, we can search for empty
2199          * extent records too.
2200          */
2201         insert->ins_free_records = le16_to_cpu(el->l_count) -
2202                 le16_to_cpu(el->l_next_free_rec);
2203
2204         if (!insert->ins_tree_depth) {
2205                 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2206                 ocfs2_figure_appending_type(insert, el, insert_rec);
2207                 return 0;
2208         }
2209
2210         path = ocfs2_new_inode_path(di_bh);
2211         if (!path) {
2212                 ret = -ENOMEM;
2213                 mlog_errno(ret);
2214                 goto out;
2215         }
2216
2217         /*
2218          * In the case that we're inserting past what the tree
2219          * currently accounts for, ocfs2_find_path() will return for
2220          * us the rightmost tree path. This is accounted for below in
2221          * the appending code.
2222          */
2223         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2224         if (ret) {
2225                 mlog_errno(ret);
2226                 goto out;
2227         }
2228
2229         el = path_leaf_el(path);
2230
2231         /*
2232          * Now that we have the path, there's two things we want to determine:
2233          * 1) Contiguousness (also set contig_index if this is so)
2234          *
2235          * 2) Are we doing an append? We can trivially break this up
2236          *     into two types of appends: simple record append, or a
2237          *     rotate inside the tail leaf.
2238          */
2239         ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2240
2241         /*
2242          * The insert code isn't quite ready to deal with all cases of
2243          * left contiguousness. Specifically, if it's an insert into
2244          * the 1st record in a leaf, it will require the adjustment of
2245          * e_clusters on the last record of the path directly to it's
2246          * left. For now, just catch that case and fool the layers
2247          * above us. This works just fine for tree_depth == 0, which
2248          * is why we allow that above.
2249          */
2250         if (insert->ins_contig == CONTIG_LEFT &&
2251             insert->ins_contig_index == 0)
2252                 insert->ins_contig = CONTIG_NONE;
2253
2254         /*
2255          * Ok, so we can simply compare against last_eb to figure out
2256          * whether the path doesn't exist. This will only happen in
2257          * the case that we're doing a tail append, so maybe we can
2258          * take advantage of that information somehow.
2259          */
2260         if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2261                 /*
2262                  * Ok, ocfs2_find_path() returned us the rightmost
2263                  * tree path. This might be an appending insert. There are
2264                  * two cases:
2265                  *    1) We're doing a true append at the tail:
2266                  *      -This might even be off the end of the leaf
2267                  *    2) We're "appending" by rotating in the tail
2268                  */
2269                 ocfs2_figure_appending_type(insert, el, insert_rec);
2270         }
2271
2272 out:
2273         ocfs2_free_path(path);
2274
2275         if (ret == 0)
2276                 *last_eb_bh = bh;
2277         else
2278                 brelse(bh);
2279         return ret;
2280 }
2281
2282 /*
2283  * Insert an extent into an inode btree.
2284  *
2285  * The caller needs to update fe->i_clusters
2286  */
2287 int ocfs2_insert_extent(struct ocfs2_super *osb,
2288                         handle_t *handle,
2289                         struct inode *inode,
2290                         struct buffer_head *fe_bh,
2291                         u32 cpos,
2292                         u64 start_blk,
2293                         u32 new_clusters,
2294                         struct ocfs2_alloc_context *meta_ac)
2295 {
2296         int status, shift;
2297         struct buffer_head *last_eb_bh = NULL;
2298         struct buffer_head *bh = NULL;
2299         struct ocfs2_insert_type insert = {0, };
2300         struct ocfs2_extent_rec rec;
2301
2302         mlog(0, "add %u clusters at position %u to inode %llu\n",
2303              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2304
2305         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2306                         (OCFS2_I(inode)->ip_clusters != cpos),
2307                         "Device %s, asking for sparse allocation: inode %llu, "
2308                         "cpos %u, clusters %u\n",
2309                         osb->dev_str,
2310                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2311                         OCFS2_I(inode)->ip_clusters);
2312
2313         rec.e_cpos = cpu_to_le32(cpos);
2314         rec.e_blkno = cpu_to_le64(start_blk);
2315         rec.e_clusters = cpu_to_le32(new_clusters);
2316
2317         status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2318                                           &insert);
2319         if (status < 0) {
2320                 mlog_errno(status);
2321                 goto bail;
2322         }
2323
2324         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
2325              "Insert.contig_index: %d, Insert.free_records: %d, "
2326              "Insert.tree_depth: %d\n",
2327              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2328              insert.ins_free_records, insert.ins_tree_depth);
2329
2330         /*
2331          * Avoid growing the tree unless we're out of records and the
2332          * insert type requres one.
2333          */
2334         if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
2335                 goto out_add;
2336
2337         shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2338         if (shift < 0) {
2339                 status = shift;
2340                 mlog_errno(status);
2341                 goto bail;
2342         }
2343
2344         /* We traveled all the way to the bottom of the allocation tree
2345          * and didn't find room for any more extents - we need to add
2346          * another tree level */
2347         if (shift) {
2348                 BUG_ON(bh);
2349                 mlog(0, "need to shift tree depth "
2350                      "(current = %d)\n", insert.ins_tree_depth);
2351
2352                 /* ocfs2_shift_tree_depth will return us a buffer with
2353                  * the new extent block (so we can pass that to
2354                  * ocfs2_add_branch). */
2355                 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2356                                                 meta_ac, &bh);
2357                 if (status < 0) {
2358                         mlog_errno(status);
2359                         goto bail;
2360                 }
2361                 insert.ins_tree_depth++;
2362                 /* Special case: we have room now if we shifted from
2363                  * tree_depth 0 */
2364                 if (insert.ins_tree_depth == 1)
2365                         goto out_add;
2366         }
2367
2368         /* call ocfs2_add_branch to add the final part of the tree with
2369          * the new data. */
2370         mlog(0, "add branch. bh = %p\n", bh);
2371         status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2372                                   meta_ac);
2373         if (status < 0) {
2374                 mlog_errno(status);
2375                 goto bail;
2376         }
2377
2378 out_add:
2379         /* Finally, we can add clusters. This might rotate the tree for us. */
2380         status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2381         if (status < 0)
2382                 mlog_errno(status);
2383
2384 bail:
2385         if (bh)
2386                 brelse(bh);
2387
2388         if (last_eb_bh)
2389                 brelse(last_eb_bh);
2390
2391         mlog_exit(status);
2392         return status;
2393 }
2394
2395 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2396 {
2397         struct buffer_head *tl_bh = osb->osb_tl_bh;
2398         struct ocfs2_dinode *di;
2399         struct ocfs2_truncate_log *tl;
2400
2401         di = (struct ocfs2_dinode *) tl_bh->b_data;
2402         tl = &di->id2.i_dealloc;
2403
2404         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
2405                         "slot %d, invalid truncate log parameters: used = "
2406                         "%u, count = %u\n", osb->slot_num,
2407                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
2408         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
2409 }
2410
2411 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2412                                            unsigned int new_start)
2413 {
2414         unsigned int tail_index;
2415         unsigned int current_tail;
2416
2417         /* No records, nothing to coalesce */
2418         if (!le16_to_cpu(tl->tl_used))
2419                 return 0;
2420
2421         tail_index = le16_to_cpu(tl->tl_used) - 1;
2422         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
2423         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
2424
2425         return current_tail == new_start;
2426 }
2427
2428 static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2429                                      handle_t *handle,
2430                                      u64 start_blk,
2431                                      unsigned int num_clusters)
2432 {
2433         int status, index;
2434         unsigned int start_cluster, tl_count;
2435         struct inode *tl_inode = osb->osb_tl_inode;
2436         struct buffer_head *tl_bh = osb->osb_tl_bh;
2437         struct ocfs2_dinode *di;
2438         struct ocfs2_truncate_log *tl;
2439
2440         mlog_entry("start_blk = %llu, num_clusters = %u\n",
2441                    (unsigned long long)start_blk, num_clusters);
2442
2443         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2444
2445         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
2446
2447         di = (struct ocfs2_dinode *) tl_bh->b_data;
2448         tl = &di->id2.i_dealloc;
2449         if (!OCFS2_IS_VALID_DINODE(di)) {
2450                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2451                 status = -EIO;
2452                 goto bail;
2453         }
2454
2455         tl_count = le16_to_cpu(tl->tl_count);
2456         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
2457                         tl_count == 0,
2458                         "Truncate record count on #%llu invalid "
2459                         "wanted %u, actual %u\n",
2460                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
2461                         ocfs2_truncate_recs_per_inode(osb->sb),
2462                         le16_to_cpu(tl->tl_count));
2463
2464         /* Caller should have known to flush before calling us. */
2465         index = le16_to_cpu(tl->tl_used);
2466         if (index >= tl_count) {
2467                 status = -ENOSPC;
2468                 mlog_errno(status);
2469                 goto bail;
2470         }
2471
2472         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2473                                       OCFS2_JOURNAL_ACCESS_WRITE);
2474         if (status < 0) {
2475                 mlog_errno(status);
2476                 goto bail;
2477         }
2478
2479         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
2480              "%llu (index = %d)\n", num_clusters, start_cluster,
2481              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
2482
2483         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
2484                 /*
2485                  * Move index back to the record we are coalescing with.
2486                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
2487                  */
2488                 index--;
2489
2490                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
2491                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
2492                      index, le32_to_cpu(tl->tl_recs[index].t_start),
2493                      num_clusters);
2494         } else {
2495                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
2496                 tl->tl_used = cpu_to_le16(index + 1);
2497         }
2498         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
2499
2500         status = ocfs2_journal_dirty(handle, tl_bh);
2501         if (status < 0) {
2502                 mlog_errno(status);
2503                 goto bail;
2504         }
2505
2506 bail:
2507         mlog_exit(status);
2508         return status;
2509 }
2510
2511 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
2512                                          handle_t *handle,
2513                                          struct inode *data_alloc_inode,
2514                                          struct buffer_head *data_alloc_bh)
2515 {
2516         int status = 0;
2517         int i;
2518         unsigned int num_clusters;
2519         u64 start_blk;
2520         struct ocfs2_truncate_rec rec;
2521         struct ocfs2_dinode *di;
2522         struct ocfs2_truncate_log *tl;
2523         struct inode *tl_inode = osb->osb_tl_inode;
2524         struct buffer_head *tl_bh = osb->osb_tl_bh;
2525
2526         mlog_entry_void();
2527
2528         di = (struct ocfs2_dinode *) tl_bh->b_data;
2529         tl = &di->id2.i_dealloc;
2530         i = le16_to_cpu(tl->tl_used) - 1;
2531         while (i >= 0) {
2532                 /* Caller has given us at least enough credits to
2533                  * update the truncate log dinode */
2534                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2535                                               OCFS2_JOURNAL_ACCESS_WRITE);
2536                 if (status < 0) {
2537                         mlog_errno(status);
2538                         goto bail;
2539                 }
2540
2541                 tl->tl_used = cpu_to_le16(i);
2542
2543                 status = ocfs2_journal_dirty(handle, tl_bh);
2544                 if (status < 0) {
2545                         mlog_errno(status);
2546                         goto bail;
2547                 }
2548
2549                 /* TODO: Perhaps we can calculate the bulk of the
2550                  * credits up front rather than extending like
2551                  * this. */
2552                 status = ocfs2_extend_trans(handle,
2553                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
2554                 if (status < 0) {
2555                         mlog_errno(status);
2556                         goto bail;
2557                 }
2558
2559                 rec = tl->tl_recs[i];
2560                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
2561                                                     le32_to_cpu(rec.t_start));
2562                 num_clusters = le32_to_cpu(rec.t_clusters);
2563
2564                 /* if start_blk is not set, we ignore the record as
2565                  * invalid. */
2566                 if (start_blk) {
2567                         mlog(0, "free record %d, start = %u, clusters = %u\n",
2568                              i, le32_to_cpu(rec.t_start), num_clusters);
2569
2570                         status = ocfs2_free_clusters(handle, data_alloc_inode,
2571                                                      data_alloc_bh, start_blk,
2572                                                      num_clusters);
2573                         if (status < 0) {
2574                                 mlog_errno(status);
2575                                 goto bail;
2576                         }
2577                 }
2578                 i--;
2579         }
2580
2581 bail:
2582         mlog_exit(status);
2583         return status;
2584 }
2585
2586 /* Expects you to already be holding tl_inode->i_mutex */
2587 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2588 {
2589         int status;
2590         unsigned int num_to_flush;
2591         handle_t *handle;
2592         struct inode *tl_inode = osb->osb_tl_inode;
2593         struct inode *data_alloc_inode = NULL;
2594         struct buffer_head *tl_bh = osb->osb_tl_bh;
2595         struct buffer_head *data_alloc_bh = NULL;
2596         struct ocfs2_dinode *di;
2597         struct ocfs2_truncate_log *tl;
2598
2599         mlog_entry_void();
2600
2601         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2602
2603         di = (struct ocfs2_dinode *) tl_bh->b_data;
2604         tl = &di->id2.i_dealloc;
2605         if (!OCFS2_IS_VALID_DINODE(di)) {
2606                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2607                 status = -EIO;
2608                 goto out;
2609         }
2610
2611         num_to_flush = le16_to_cpu(tl->tl_used);
2612         mlog(0, "Flush %u records from truncate log #%llu\n",
2613              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
2614         if (!num_to_flush) {
2615                 status = 0;
2616                 goto out;
2617         }
2618
2619         data_alloc_inode = ocfs2_get_system_file_inode(osb,
2620                                                        GLOBAL_BITMAP_SYSTEM_INODE,
2621                                                        OCFS2_INVALID_SLOT);
2622         if (!data_alloc_inode) {
2623                 status = -EINVAL;
2624                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
2625                 goto out;
2626         }
2627
2628         mutex_lock(&data_alloc_inode->i_mutex);
2629
2630         status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
2631         if (status < 0) {
2632                 mlog_errno(status);
2633                 goto out_mutex;
2634         }
2635
2636         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2637         if (IS_ERR(handle)) {
2638                 status = PTR_ERR(handle);
2639                 mlog_errno(status);
2640                 goto out_unlock;
2641         }
2642
2643         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
2644                                                data_alloc_bh);
2645         if (status < 0)
2646                 mlog_errno(status);
2647
2648         ocfs2_commit_trans(osb, handle);
2649
2650 out_unlock:
2651         brelse(data_alloc_bh);
2652         ocfs2_meta_unlock(data_alloc_inode, 1);
2653
2654 out_mutex:
2655         mutex_unlock(&data_alloc_inode->i_mutex);
2656         iput(data_alloc_inode);
2657
2658 out:
2659         mlog_exit(status);
2660         return status;
2661 }
2662
2663 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2664 {
2665         int status;
2666         struct inode *tl_inode = osb->osb_tl_inode;
2667
2668         mutex_lock(&tl_inode->i_mutex);
2669         status = __ocfs2_flush_truncate_log(osb);
2670         mutex_unlock(&tl_inode->i_mutex);
2671
2672         return status;
2673 }
2674
2675 static void ocfs2_truncate_log_worker(struct work_struct *work)
2676 {
2677         int status;
2678         struct ocfs2_super *osb =
2679                 container_of(work, struct ocfs2_super,
2680                              osb_truncate_log_wq.work);
2681
2682         mlog_entry_void();
2683
2684         status = ocfs2_flush_truncate_log(osb);
2685         if (status < 0)
2686                 mlog_errno(status);
2687
2688         mlog_exit(status);
2689 }
2690
2691 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
2692 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
2693                                        int cancel)
2694 {
2695         if (osb->osb_tl_inode) {
2696                 /* We want to push off log flushes while truncates are
2697                  * still running. */
2698                 if (cancel)
2699                         cancel_delayed_work(&osb->osb_truncate_log_wq);
2700
2701                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
2702                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
2703         }
2704 }
2705
2706 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
2707                                        int slot_num,
2708                                        struct inode **tl_inode,
2709                                        struct buffer_head **tl_bh)
2710 {
2711         int status;
2712         struct inode *inode = NULL;
2713         struct buffer_head *bh = NULL;
2714
2715         inode = ocfs2_get_system_file_inode(osb,
2716                                            TRUNCATE_LOG_SYSTEM_INODE,
2717                                            slot_num);
2718         if (!inode) {
2719                 status = -EINVAL;
2720                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
2721                 goto bail;
2722         }
2723
2724         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
2725                                   OCFS2_BH_CACHED, inode);
2726         if (status < 0) {
2727                 iput(inode);
2728                 mlog_errno(status);
2729                 goto bail;
2730         }
2731
2732         *tl_inode = inode;
2733         *tl_bh    = bh;
2734 bail:
2735         mlog_exit(status);
2736         return status;
2737 }
2738
2739 /* called during the 1st stage of node recovery. we stamp a clean
2740  * truncate log and pass back a copy for processing later. if the
2741  * truncate log does not require processing, a *tl_copy is set to
2742  * NULL. */
2743 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
2744                                       int slot_num,
2745                                       struct ocfs2_dinode **tl_copy)
2746 {
2747         int status;
2748         struct inode *tl_inode = NULL;
2749         struct buffer_head *tl_bh = NULL;
2750         struct ocfs2_dinode *di;
2751         struct ocfs2_truncate_log *tl;
2752
2753         *tl_copy = NULL;
2754
2755         mlog(0, "recover truncate log from slot %d\n", slot_num);
2756
2757         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
2758         if (status < 0) {
2759                 mlog_errno(status);
2760                 goto bail;
2761         }
2762
2763         di = (struct ocfs2_dinode *) tl_bh->b_data;
2764         tl = &di->id2.i_dealloc;
2765         if (!OCFS2_IS_VALID_DINODE(di)) {
2766                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
2767                 status = -EIO;
2768                 goto bail;
2769         }
2770
2771         if (le16_to_cpu(tl->tl_used)) {
2772                 mlog(0, "We'll have %u logs to recover\n",
2773                      le16_to_cpu(tl->tl_used));
2774
2775                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
2776                 if (!(*tl_copy)) {
2777                         status = -ENOMEM;
2778                         mlog_errno(status);
2779                         goto bail;
2780                 }
2781
2782                 /* Assuming the write-out below goes well, this copy
2783                  * will be passed back to recovery for processing. */
2784                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
2785
2786                 /* All we need to do to clear the truncate log is set
2787                  * tl_used. */
2788                 tl->tl_used = 0;
2789
2790                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
2791                 if (status < 0) {
2792                         mlog_errno(status);
2793                         goto bail;
2794                 }
2795         }
2796
2797 bail:
2798         if (tl_inode)
2799                 iput(tl_inode);
2800         if (tl_bh)
2801                 brelse(tl_bh);
2802
2803         if (status < 0 && (*tl_copy)) {
2804                 kfree(*tl_copy);
2805                 *tl_copy = NULL;
2806         }
2807
2808         mlog_exit(status);
2809         return status;
2810 }
2811
2812 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
2813                                          struct ocfs2_dinode *tl_copy)
2814 {
2815         int status = 0;
2816         int i;
2817         unsigned int clusters, num_recs, start_cluster;
2818         u64 start_blk;
2819         handle_t *handle;
2820         struct inode *tl_inode = osb->osb_tl_inode;
2821         struct ocfs2_truncate_log *tl;
2822
2823         mlog_entry_void();
2824
2825         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
2826                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
2827                 return -EINVAL;
2828         }
2829
2830         tl = &tl_copy->id2.i_dealloc;
2831         num_recs = le16_to_cpu(tl->tl_used);
2832         mlog(0, "cleanup %u records from %llu\n", num_recs,
2833              (unsigned long long)tl_copy->i_blkno);
2834
2835         mutex_lock(&tl_inode->i_mutex);
2836         for(i = 0; i < num_recs; i++) {
2837                 if (ocfs2_truncate_log_needs_flush(osb)) {
2838                         status = __ocfs2_flush_truncate_log(osb);
2839                         if (status < 0) {
2840                                 mlog_errno(status);
2841                                 goto bail_up;
2842                         }
2843                 }
2844
2845                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2846                 if (IS_ERR(handle)) {
2847                         status = PTR_ERR(handle);
2848                         mlog_errno(status);
2849                         goto bail_up;
2850                 }
2851
2852                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
2853                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
2854                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
2855
2856                 status = ocfs2_truncate_log_append(osb, handle,
2857                                                    start_blk, clusters);
2858                 ocfs2_commit_trans(osb, handle);
2859                 if (status < 0) {
2860                         mlog_errno(status);
2861                         goto bail_up;
2862                 }
2863         }
2864
2865 bail_up:
2866         mutex_unlock(&tl_inode->i_mutex);
2867
2868         mlog_exit(status);
2869         return status;
2870 }
2871
2872 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
2873 {
2874         int status;
2875         struct inode *tl_inode = osb->osb_tl_inode;
2876
2877         mlog_entry_void();
2878
2879         if (tl_inode) {
2880                 cancel_delayed_work(&osb->osb_truncate_log_wq);
2881                 flush_workqueue(ocfs2_wq);
2882
2883                 status = ocfs2_flush_truncate_log(osb);
2884                 if (status < 0)
2885                         mlog_errno(status);
2886
2887                 brelse(osb->osb_tl_bh);
2888                 iput(osb->osb_tl_inode);
2889         }
2890
2891         mlog_exit_void();
2892 }
2893
2894 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2895 {
2896         int status;
2897         struct inode *tl_inode = NULL;
2898         struct buffer_head *tl_bh = NULL;
2899
2900         mlog_entry_void();
2901
2902         status = ocfs2_get_truncate_log_info(osb,
2903                                              osb->slot_num,
2904                                              &tl_inode,
2905                                              &tl_bh);
2906         if (status < 0)
2907                 mlog_errno(status);
2908
2909         /* ocfs2_truncate_log_shutdown keys on the existence of
2910          * osb->osb_tl_inode so we don't set any of the osb variables
2911          * until we're sure all is well. */
2912         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
2913                           ocfs2_truncate_log_worker);
2914         osb->osb_tl_bh    = tl_bh;
2915         osb->osb_tl_inode = tl_inode;
2916
2917         mlog_exit(status);
2918         return status;
2919 }
2920
2921 /* This function will figure out whether the currently last extent
2922  * block will be deleted, and if it will, what the new last extent
2923  * block will be so we can update his h_next_leaf_blk field, as well
2924  * as the dinodes i_last_eb_blk */
2925 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
2926                                        unsigned int clusters_to_del,
2927                                        struct ocfs2_path *path,
2928                                        struct buffer_head **new_last_eb)
2929 {
2930         int next_free, ret = 0;
2931         u32 cpos;
2932         struct ocfs2_extent_rec *rec;
2933         struct ocfs2_extent_block *eb;
2934         struct ocfs2_extent_list *el;
2935         struct buffer_head *bh = NULL;
2936
2937         *new_last_eb = NULL;
2938
2939         /* we have no tree, so of course, no last_eb. */
2940         if (!path->p_tree_depth)
2941                 goto out;
2942
2943         /* trunc to zero special case - this makes tree_depth = 0
2944          * regardless of what it is.  */
2945         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
2946                 goto out;
2947
2948         el = path_leaf_el(path);
2949         BUG_ON(!el->l_next_free_rec);
2950
2951         /*
2952          * Make sure that this extent list will actually be empty
2953          * after we clear away the data. We can shortcut out if
2954          * there's more than one non-empty extent in the
2955          * list. Otherwise, a check of the remaining extent is
2956          * necessary.
2957          */
2958         next_free = le16_to_cpu(el->l_next_free_rec);
2959         rec = NULL;
2960         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2961                 if (next_free > 2)
2962                         goto out;
2963
2964                 /* We may have a valid extent in index 1, check it. */
2965                 if (next_free == 2)
2966                         rec = &el->l_recs[1];
2967
2968                 /*
2969                  * Fall through - no more nonempty extents, so we want
2970                  * to delete this leaf.
2971                  */
2972         } else {
2973                 if (next_free > 1)
2974                         goto out;
2975
2976                 rec = &el->l_recs[0];
2977         }
2978
2979         if (rec) {
2980                 /*
2981                  * Check it we'll only be trimming off the end of this
2982                  * cluster.
2983                  */
2984                 if (le16_to_cpu(rec->e_clusters) > clusters_to_del)
2985                         goto out;
2986         }
2987
2988         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2989         if (ret) {
2990                 mlog_errno(ret);
2991                 goto out;
2992         }
2993
2994         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
2995         if (ret) {
2996                 mlog_errno(ret);
2997                 goto out;
2998         }
2999
3000         eb = (struct ocfs2_extent_block *) bh->b_data;
3001         el = &eb->h_list;
3002         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3003                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3004                 ret = -EROFS;
3005                 goto out;
3006         }
3007
3008         *new_last_eb = bh;
3009         get_bh(*new_last_eb);
3010         mlog(0, "returning block %llu, (cpos: %u)\n",
3011              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
3012 out:
3013         brelse(bh);
3014
3015         return ret;
3016 }
3017
3018 /*
3019  * Trim some clusters off the rightmost edge of a tree. Only called
3020  * during truncate.
3021  *
3022  * The caller needs to:
3023  *   - start journaling of each path component.
3024  *   - compute and fully set up any new last ext block
3025  */
3026 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
3027                            handle_t *handle, struct ocfs2_truncate_context *tc,
3028                            u32 clusters_to_del, u64 *delete_start)
3029 {
3030         int ret, i, index = path->p_tree_depth;
3031         u32 new_edge = 0;
3032         u64 deleted_eb = 0;
3033         struct buffer_head *bh;
3034         struct ocfs2_extent_list *el;
3035         struct ocfs2_extent_rec *rec;
3036
3037         *delete_start = 0;
3038
3039         while (index >= 0) {
3040                 bh = path->p_node[index].bh;
3041                 el = path->p_node[index].el;
3042
3043                 mlog(0, "traveling tree (index = %d, block = %llu)\n",
3044                      index,  (unsigned long long)bh->b_blocknr);
3045
3046                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
3047
3048                 if (index !=
3049                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
3050                         ocfs2_error(inode->i_sb,
3051                                     "Inode %lu has invalid ext. block %llu",
3052                                     inode->i_ino,
3053                                     (unsigned long long)bh->b_blocknr);
3054                         ret = -EROFS;
3055                         goto out;
3056                 }
3057
3058 find_tail_record:
3059                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3060                 rec = &el->l_recs[i];
3061
3062                 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
3063                      "next = %u\n", i, le32_to_cpu(rec->e_cpos),
3064                      le32_to_cpu(rec->e_clusters),
3065                      (unsigned long long)le64_to_cpu(rec->e_blkno),
3066                      le16_to_cpu(el->l_next_free_rec));
3067
3068                 BUG_ON(le32_to_cpu(rec->e_clusters) < clusters_to_del);
3069
3070                 if (le16_to_cpu(el->l_tree_depth) == 0) {
3071                         /*
3072                          * If the leaf block contains a single empty
3073                          * extent and no records, we can just remove
3074                          * the block.
3075                          */
3076                         if (i == 0 && ocfs2_is_empty_extent(rec)) {
3077                                 memset(rec, 0,
3078                                        sizeof(struct ocfs2_extent_rec));
3079                                 el->l_next_free_rec = cpu_to_le16(0);
3080
3081                                 goto delete;
3082                         }
3083
3084                         /*
3085                          * Remove any empty extents by shifting things
3086                          * left. That should make life much easier on
3087                          * the code below. This condition is rare
3088                          * enough that we shouldn't see a performance
3089                          * hit.
3090                          */
3091                         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3092                                 le16_add_cpu(&el->l_next_free_rec, -1);
3093
3094                                 for(i = 0;
3095                                     i < le16_to_cpu(el->l_next_free_rec); i++)
3096                                         el->l_recs[i] = el->l_recs[i + 1];
3097
3098                                 memset(&el->l_recs[i], 0,
3099                                        sizeof(struct ocfs2_extent_rec));
3100
3101                                 /*
3102                                  * We've modified our extent list. The
3103                                  * simplest way to handle this change
3104                                  * is to being the search from the
3105                                  * start again.
3106                                  */
3107                                 goto find_tail_record;
3108                         }
3109
3110                         le32_add_cpu(&rec->e_clusters, -clusters_to_del);
3111
3112                         /*
3113                          * We'll use "new_edge" on our way back up the
3114                          * tree to know what our rightmost cpos is.
3115                          */
3116                         new_edge = le32_to_cpu(rec->e_clusters);
3117                         new_edge += le32_to_cpu(rec->e_cpos);
3118
3119                         /*
3120                          * The caller will use this to delete data blocks.
3121                          */
3122                         *delete_start = le64_to_cpu(rec->e_blkno)
3123                                 + ocfs2_clusters_to_blocks(inode->i_sb,
3124                                         le32_to_cpu(rec->e_clusters));
3125
3126                         /*
3127                          * If it's now empty, remove this record.
3128                          */
3129                         if (le32_to_cpu(rec->e_clusters) == 0) {
3130                                 memset(rec, 0,
3131                                        sizeof(struct ocfs2_extent_rec));
3132                                 le16_add_cpu(&el->l_next_free_rec, -1);
3133                         }
3134                 } else {
3135                         if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
3136                                 memset(rec, 0,
3137                                        sizeof(struct ocfs2_extent_rec));
3138                                 le16_add_cpu(&el->l_next_free_rec, -1);
3139
3140                                 goto delete;
3141                         }
3142
3143                         /* Can this actually happen? */
3144                         if (le16_to_cpu(el->l_next_free_rec) == 0)
3145                                 goto delete;
3146
3147                         /*
3148                          * We never actually deleted any clusters
3149                          * because our leaf was empty. There's no
3150                          * reason to adjust the rightmost edge then.
3151                          */
3152                         if (new_edge == 0)
3153                                 goto delete;
3154
3155                         rec->e_clusters = cpu_to_le32(new_edge);
3156                         le32_add_cpu(&rec->e_clusters,
3157                                      -le32_to_cpu(rec->e_cpos));
3158
3159                          /*
3160                           * A deleted child record should have been
3161                           * caught above.
3162                           */
3163                          BUG_ON(le32_to_cpu(rec->e_clusters) == 0);
3164                 }
3165
3166 delete:
3167                 ret = ocfs2_journal_dirty(handle, bh);
3168                 if (ret) {
3169                         mlog_errno(ret);
3170                         goto out;
3171                 }
3172
3173                 mlog(0, "extent list container %llu, after: record %d: "
3174                      "(%u, %u, %llu), next = %u.\n",
3175                      (unsigned long long)bh->b_blocknr, i,
3176                      le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
3177                      (unsigned long long)le64_to_cpu(rec->e_blkno),
3178                      le16_to_cpu(el->l_next_free_rec));
3179
3180                 /*
3181                  * We must be careful to only attempt delete of an
3182                  * extent block (and not the root inode block).
3183                  */
3184                 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
3185                         struct ocfs2_extent_block *eb =
3186                                 (struct ocfs2_extent_block *)bh->b_data;
3187
3188                         /*
3189                          * Save this for use when processing the
3190                          * parent block.
3191                          */
3192                         deleted_eb = le64_to_cpu(eb->h_blkno);
3193
3194                         mlog(0, "deleting this extent block.\n");
3195
3196                         ocfs2_remove_from_cache(inode, bh);
3197
3198                         BUG_ON(le32_to_cpu(el->l_recs[0].e_clusters));
3199                         BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3200                         BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3201
3202                         if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
3203                                 /*
3204                                  * This code only understands how to
3205                                  * lock the suballocator in slot 0,
3206                                  * which is fine because allocation is
3207                                  * only ever done out of that
3208                                  * suballocator too. A future version
3209                                  * might change that however, so avoid
3210                                  * a free if we don't know how to
3211                                  * handle it. This way an fs incompat
3212                                  * bit will not be necessary.
3213                                  */
3214                                 ret = ocfs2_free_extent_block(handle,
3215                                                               tc->tc_ext_alloc_inode,
3216                                                               tc->tc_ext_alloc_bh,
3217                                                               eb);
3218
3219                                 /* An error here is not fatal. */
3220                                 if (ret < 0)
3221                                         mlog_errno(ret);
3222                         }
3223                 } else {
3224                         deleted_eb = 0;
3225                 }
3226
3227                 index--;
3228         }
3229
3230         ret = 0;
3231 out:
3232         return ret;
3233 }
3234
3235 static int ocfs2_do_truncate(struct ocfs2_super *osb,
3236                              unsigned int clusters_to_del,
3237                              struct inode *inode,
3238                              struct buffer_head *fe_bh,
3239                              handle_t *handle,
3240                              struct ocfs2_truncate_context *tc,
3241                              struct ocfs2_path *path)
3242 {
3243         int status;
3244         struct ocfs2_dinode *fe;
3245         struct ocfs2_extent_block *last_eb = NULL;
3246         struct ocfs2_extent_list *el;
3247         struct buffer_head *last_eb_bh = NULL;
3248         u64 delete_blk = 0;
3249
3250         fe = (struct ocfs2_dinode *) fe_bh->b_data;
3251
3252         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
3253                                              path, &last_eb_bh);
3254         if (status < 0) {
3255                 mlog_errno(status);
3256                 goto bail;
3257         }
3258
3259         /*
3260          * Each component will be touched, so we might as well journal
3261          * here to avoid having to handle errors later.
3262          */
3263         status = ocfs2_journal_access_path(inode, handle, path);
3264         if (status < 0) {
3265                 mlog_errno(status);
3266                 goto bail;
3267         }
3268
3269         if (last_eb_bh) {
3270                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
3271                                               OCFS2_JOURNAL_ACCESS_WRITE);
3272                 if (status < 0) {
3273                         mlog_errno(status);
3274                         goto bail;
3275                 }
3276
3277                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3278         }
3279
3280         el = &(fe->id2.i_list);
3281
3282         /*
3283          * Lower levels depend on this never happening, but it's best
3284          * to check it up here before changing the tree.
3285          */
3286         if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
3287                 ocfs2_error(inode->i_sb,
3288                             "Inode %lu has an empty extent record, depth %u\n",
3289                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
3290                 status = -EROFS;
3291                 goto bail;
3292         }
3293
3294         spin_lock(&OCFS2_I(inode)->ip_lock);
3295         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
3296                                       clusters_to_del;
3297         spin_unlock(&OCFS2_I(inode)->ip_lock);
3298         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
3299
3300         status = ocfs2_trim_tree(inode, path, handle, tc,
3301                                  clusters_to_del, &delete_blk);
3302         if (status) {
3303                 mlog_errno(status);
3304                 goto bail;
3305         }
3306
3307         if (le32_to_cpu(fe->i_clusters) == 0) {
3308                 /* trunc to zero is a special case. */
3309                 el->l_tree_depth = 0;
3310                 fe->i_last_eb_blk = 0;
3311         } else if (last_eb)
3312                 fe->i_last_eb_blk = last_eb->h_blkno;
3313
3314         status = ocfs2_journal_dirty(handle, fe_bh);
3315         if (status < 0) {
3316                 mlog_errno(status);
3317                 goto bail;
3318         }
3319
3320         if (last_eb) {
3321                 /* If there will be a new last extent block, then by
3322                  * definition, there cannot be any leaves to the right of
3323                  * him. */
3324                 last_eb->h_next_leaf_blk = 0;
3325                 status = ocfs2_journal_dirty(handle, last_eb_bh);
3326                 if (status < 0) {
3327                         mlog_errno(status);
3328                         goto bail;
3329                 }
3330         }
3331
3332         if (delete_blk) {
3333                 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
3334                                                    clusters_to_del);
3335                 if (status < 0) {
3336                         mlog_errno(status);
3337                         goto bail;
3338                 }
3339         }
3340         status = 0;
3341 bail:
3342
3343         mlog_exit(status);
3344         return status;
3345 }
3346
3347 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
3348 {
3349         set_buffer_uptodate(bh);
3350         mark_buffer_dirty(bh);
3351         return 0;
3352 }
3353
3354 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3355 {
3356         set_buffer_uptodate(bh);
3357         mark_buffer_dirty(bh);
3358         return ocfs2_journal_dirty_data(handle, bh);
3359 }
3360
3361 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3362                                      struct page **pages, int numpages,
3363                                      u64 phys, handle_t *handle)
3364 {
3365         int i, ret, partial = 0;
3366         void *kaddr;
3367         struct page *page;
3368         unsigned int from, to = PAGE_CACHE_SIZE;
3369         struct super_block *sb = inode->i_sb;
3370
3371         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3372
3373         if (numpages == 0)
3374                 goto out;
3375
3376         from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
3377         if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3378                 /*
3379                  * Since 'from' has been capped to a value below page
3380                  * size, this calculation won't be able to overflow
3381                  * 'to'
3382                  */
3383                 to = ocfs2_align_bytes_to_clusters(sb, from);
3384
3385                 /*
3386                  * The truncate tail in this case should never contain
3387                  * more than one page at maximum. The loop below also
3388                  * assumes this.
3389                  */
3390                 BUG_ON(numpages != 1);
3391         }
3392
3393         for(i = 0; i < numpages; i++) {
3394                 page = pages[i];
3395
3396                 BUG_ON(from > PAGE_CACHE_SIZE);
3397                 BUG_ON(to > PAGE_CACHE_SIZE);
3398
3399                 ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
3400                 if (ret)
3401                         mlog_errno(ret);
3402
3403                 kaddr = kmap_atomic(page, KM_USER0);
3404                 memset(kaddr + from, 0, to - from);
3405                 kunmap_atomic(kaddr, KM_USER0);
3406
3407                 /*
3408                  * Need to set the buffers we zero'd into uptodate
3409                  * here if they aren't - ocfs2_map_page_blocks()
3410                  * might've skipped some
3411                  */
3412                 if (ocfs2_should_order_data(inode)) {
3413                         ret = walk_page_buffers(handle,
3414                                                 page_buffers(page),
3415                                                 from, to, &partial,
3416                                                 ocfs2_ordered_zero_func);
3417                         if (ret < 0)
3418                                 mlog_errno(ret);
3419                 } else {
3420                         ret = walk_page_buffers(handle, page_buffers(page),
3421                                                 from, to, &partial,
3422                                                 ocfs2_writeback_zero_func);
3423                         if (ret < 0)
3424                                 mlog_errno(ret);
3425                 }
3426
3427                 if (!partial)
3428                         SetPageUptodate(page);
3429
3430                 flush_dcache_page(page);
3431
3432                 /*
3433                  * Every page after the 1st one should be completely zero'd.
3434                  */
3435                 from = 0;
3436         }
3437 out:
3438         if (pages) {
3439                 for (i = 0; i < numpages; i++) {
3440                         page = pages[i];
3441                         unlock_page(page);
3442                         mark_page_accessed(page);
3443                         page_cache_release(page);
3444                 }
3445         }
3446 }
3447
3448 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
3449                                 int *num, u64 *phys)
3450 {
3451         int i, numpages = 0, ret = 0;
3452         unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3453         struct super_block *sb = inode->i_sb;
3454         struct address_space *mapping = inode->i_mapping;
3455         unsigned long index;
3456         u64 next_cluster_bytes;
3457
3458         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3459
3460         /* Cluster boundary, so we don't need to grab any pages. */
3461         if ((isize & (csize - 1)) == 0)
3462                 goto out;
3463
3464         ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
3465                                           phys, NULL);
3466         if (ret) {
3467                 mlog_errno(ret);
3468                 goto out;
3469         }
3470
3471         /* Tail is a hole. */
3472         if (*phys == 0)
3473                 goto out;
3474
3475         next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
3476         index = isize >> PAGE_CACHE_SHIFT;
3477         do {
3478                 pages[numpages] = grab_cache_page(mapping, index);
3479                 if (!pages[numpages]) {
3480                         ret = -ENOMEM;
3481                         mlog_errno(ret);
3482                         goto out;
3483                 }
3484
3485                 numpages++;
3486                 index++;
3487         } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
3488
3489 out:
3490         if (ret != 0) {
3491                 if (pages) {
3492                         for (i = 0; i < numpages; i++) {
3493                                 if (pages[i]) {
3494                                         unlock_page(pages[i]);
3495                                         page_cache_release(pages[i]);
3496                                 }
3497                         }
3498                 }
3499                 numpages = 0;
3500         }
3501
3502         *num = numpages;
3503
3504         return ret;
3505 }
3506
3507 /*
3508  * Zero the area past i_size but still within an allocated
3509  * cluster. This avoids exposing nonzero data on subsequent file
3510  * extends.
3511  *
3512  * We need to call this before i_size is updated on the inode because
3513  * otherwise block_write_full_page() will skip writeout of pages past
3514  * i_size. The new_i_size parameter is passed for this reason.
3515  */
3516 int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3517                                  u64 new_i_size)
3518 {
3519         int ret, numpages;
3520         struct page **pages = NULL;
3521         u64 phys;
3522
3523         /*
3524          * File systems which don't support sparse files zero on every
3525          * extend.
3526          */
3527         if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
3528                 return 0;
3529
3530         pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
3531                         sizeof(struct page *), GFP_NOFS);
3532         if (pages == NULL) {
3533                 ret = -ENOMEM;
3534                 mlog_errno(ret);
3535                 goto out;
3536         }
3537
3538         ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
3539         if (ret) {
3540                 mlog_errno(ret);
3541                 goto out;
3542         }
3543
3544         /*
3545          * Truncate on an i_size boundary - nothing more to do.
3546          */
3547         if (numpages == 0)
3548                 goto out;
3549
3550         ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
3551                                  handle);
3552
3553         /*
3554          * Initiate writeout of the pages we zero'd here. We don't
3555          * wait on them - the truncate_inode_pages() call later will
3556          * do that for us.
3557          */
3558         ret = filemap_fdatawrite(inode->i_mapping);
3559         if (ret)
3560                 mlog_errno(ret);
3561
3562 out:
3563         if (pages)
3564                 kfree(pages);
3565
3566         return ret;
3567 }
3568
3569 /*
3570  * It is expected, that by the time you call this function,
3571  * inode->i_size and fe->i_size have been adjusted.
3572  *
3573  * WARNING: This will kfree the truncate context
3574  */
3575 int ocfs2_commit_truncate(struct ocfs2_super *osb,
3576                           struct inode *inode,
3577                           struct buffer_head *fe_bh,
3578                           struct ocfs2_truncate_context *tc)
3579 {
3580         int status, i, credits, tl_sem = 0;
3581         u32 clusters_to_del, new_highest_cpos, range;
3582         struct ocfs2_extent_list *el;
3583         handle_t *handle = NULL;
3584         struct inode *tl_inode = osb->osb_tl_inode;
3585         struct ocfs2_path *path = NULL;
3586
3587         mlog_entry_void();
3588
3589         down_write(&OCFS2_I(inode)->ip_alloc_sem);
3590
3591         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3592                                                      i_size_read(inode));
3593
3594         path = ocfs2_new_inode_path(fe_bh);
3595         if (!path) {
3596                 status = -ENOMEM;
3597                 mlog_errno(status);
3598                 goto bail;
3599         }
3600 start:
3601         /*
3602          * Check that we still have allocation to delete.
3603          */
3604         if (OCFS2_I(inode)->ip_clusters == 0) {
3605                 status = 0;
3606                 goto bail;
3607         }
3608
3609         /*
3610          * Truncate always works against the rightmost tree branch.
3611          */
3612         status = ocfs2_find_path(inode, path, UINT_MAX);
3613         if (status) {
3614                 mlog_errno(status);
3615                 goto bail;
3616         }
3617
3618         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
3619              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
3620
3621         /*
3622          * By now, el will point to the extent list on the bottom most
3623          * portion of this tree. Only the tail record is considered in
3624          * each pass.
3625          *
3626          * We handle the following cases, in order:
3627          * - empty extent: delete the remaining branch
3628          * - remove the entire record
3629          * - remove a partial record
3630          * - no record needs to be removed (truncate has completed)
3631          */
3632         el = path_leaf_el(path);
3633         if (le16_to_cpu(el->l_next_free_rec) == 0) {
3634                 ocfs2_error(inode->i_sb,
3635                             "Inode %llu has empty extent block at %llu\n",
3636                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
3637                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
3638                 status = -EROFS;
3639                 goto bail;
3640         }
3641
3642         i = le16_to_cpu(el->l_next_free_rec) - 1;
3643         range = le32_to_cpu(el->l_recs[i].e_cpos) +
3644                 le32_to_cpu(el->l_recs[i].e_clusters);
3645         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
3646                 clusters_to_del = 0;
3647         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3648                 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
3649         } else if (range > new_highest_cpos) {
3650                 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
3651                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
3652                                   new_highest_cpos;
3653         } else {
3654                 status = 0;
3655                 goto bail;
3656         }
3657
3658         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3659              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
3660
3661         BUG_ON(clusters_to_del == 0);
3662
3663         mutex_lock(&tl_inode->i_mutex);
3664         tl_sem = 1;
3665         /* ocfs2_truncate_log_needs_flush guarantees us at least one
3666          * record is free for use. If there isn't any, we flush to get
3667          * an empty truncate log.  */
3668         if (ocfs2_truncate_log_needs_flush(osb)) {
3669                 status = __ocfs2_flush_truncate_log(osb);
3670                 if (status < 0) {
3671                         mlog_errno(status);
3672                         goto bail;
3673                 }
3674         }
3675
3676         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
3677                                                 (struct ocfs2_dinode *)fe_bh->b_data,
3678                                                 el);
3679         handle = ocfs2_start_trans(osb, credits);
3680         if (IS_ERR(handle)) {
3681                 status = PTR_ERR(handle);
3682                 handle = NULL;
3683                 mlog_errno(status);
3684                 goto bail;
3685         }
3686
3687         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
3688                                    tc, path);
3689         if (status < 0) {
3690                 mlog_errno(status);
3691                 goto bail;
3692         }
3693
3694         mutex_unlock(&tl_inode->i_mutex);
3695         tl_sem = 0;
3696
3697         ocfs2_commit_trans(osb, handle);
3698         handle = NULL;
3699
3700         ocfs2_reinit_path(path, 1);
3701
3702         /*
3703          * The check above will catch the case where we've truncated
3704          * away all allocation.
3705          */
3706         goto start;
3707
3708 bail:
3709         up_write(&OCFS2_I(inode)->ip_alloc_sem);
3710
3711         ocfs2_schedule_truncate_log_flush(osb, 1);
3712
3713         if (tl_sem)
3714                 mutex_unlock(&tl_inode->i_mutex);
3715
3716         if (handle)
3717                 ocfs2_commit_trans(osb, handle);
3718
3719         ocfs2_free_path(path);
3720
3721         /* This will drop the ext_alloc cluster lock for us */
3722         ocfs2_free_truncate_context(tc);
3723
3724         mlog_exit(status);
3725         return status;
3726 }
3727
3728 /*
3729  * Expects the inode to already be locked. This will figure out which
3730  * inodes need to be locked and will put them on the returned truncate
3731  * context.
3732  */
3733 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3734                            struct inode *inode,
3735                            struct buffer_head *fe_bh,
3736                            struct ocfs2_truncate_context **tc)
3737 {
3738         int status, metadata_delete, i;
3739         unsigned int new_i_clusters;
3740         struct ocfs2_dinode *fe;
3741         struct ocfs2_extent_block *eb;
3742         struct ocfs2_extent_list *el;
3743         struct buffer_head *last_eb_bh = NULL;
3744         struct inode *ext_alloc_inode = NULL;
3745         struct buffer_head *ext_alloc_bh = NULL;
3746
3747         mlog_entry_void();
3748
3749         *tc = NULL;
3750
3751         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
3752                                                   i_size_read(inode));
3753         fe = (struct ocfs2_dinode *) fe_bh->b_data;
3754
3755         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
3756              "%llu\n", fe->i_clusters, new_i_clusters,
3757              (unsigned long long)fe->i_size);
3758
3759         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
3760         if (!(*tc)) {
3761                 status = -ENOMEM;
3762                 mlog_errno(status);
3763                 goto bail;
3764         }
3765
3766         metadata_delete = 0;
3767         if (fe->id2.i_list.l_tree_depth) {
3768                 /* If we have a tree, then the truncate may result in
3769                  * metadata deletes. Figure this out from the
3770                  * rightmost leaf block.*/
3771                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3772                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
3773                 if (status < 0) {
3774                         mlog_errno(status);
3775                         goto bail;
3776                 }
3777                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3778                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3779                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3780
3781                         brelse(last_eb_bh);
3782                         status = -EIO;
3783                         goto bail;
3784                 }
3785                 el = &(eb->h_list);
3786
3787                 i = 0;
3788                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3789                         i = 1;
3790                 /*
3791                  * XXX: Should we check that next_free_rec contains
3792                  * the extent?
3793                  */
3794                 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3795                         metadata_delete = 1;
3796         }
3797
3798         (*tc)->tc_last_eb_bh = last_eb_bh;
3799
3800         if (metadata_delete) {
3801                 mlog(0, "Will have to delete metadata for this trunc. "
3802                      "locking allocator.\n");
3803                 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3804                 if (!ext_alloc_inode) {
3805                         status = -ENOMEM;
3806                         mlog_errno(status);
3807                         goto bail;
3808                 }
3809
3810                 mutex_lock(&ext_alloc_inode->i_mutex);
3811                 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3812
3813                 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3814                 if (status < 0) {
3815                         mlog_errno(status);
3816                         goto bail;
3817                 }
3818                 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3819                 (*tc)->tc_ext_alloc_locked = 1;
3820         }
3821
3822         status = 0;
3823 bail:
3824         if (status < 0) {
3825                 if (*tc)
3826                         ocfs2_free_truncate_context(*tc);
3827                 *tc = NULL;
3828         }
3829         mlog_exit_void();
3830         return status;
3831 }
3832
3833 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3834 {
3835         if (tc->tc_ext_alloc_inode) {
3836                 if (tc->tc_ext_alloc_locked)
3837                         ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
3838
3839                 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
3840                 iput(tc->tc_ext_alloc_inode);
3841         }
3842
3843         if (tc->tc_ext_alloc_bh)
3844                 brelse(tc->tc_ext_alloc_bh);
3845
3846         if (tc->tc_last_eb_bh)
3847                 brelse(tc->tc_last_eb_bh);
3848
3849         kfree(tc);
3850 }