1 /* -*- mode: c; c-basic-offset: 8; -*-
 
   2  * vim: noexpandtab sw=8 ts=8 sts=0:
 
   6  * Extent allocs and frees
 
   8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 
  10  * This program is free software; you can redistribute it and/or
 
  11  * modify it under the terms of the GNU General Public
 
  12  * License as published by the Free Software Foundation; either
 
  13  * version 2 of the License, or (at your option) any later version.
 
  15  * This program is distributed in the hope that it will be useful,
 
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 
  18  * General Public License for more details.
 
  20  * You should have received a copy of the GNU General Public
 
  21  * License along with this program; if not, write to the
 
  22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 
  23  * Boston, MA 02111-1307, USA.
 
  27 #include <linux/types.h>
 
  28 #include <linux/slab.h>
 
  29 #include <linux/highmem.h>
 
  31 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 
  32 #include <cluster/masklog.h>
 
  38 #include "extent_map.h"
 
  41 #include "localalloc.h"
 
  48 #include "buffer_head_io.h"
 
  50 static int ocfs2_extent_contig(struct inode *inode,
 
  51                                struct ocfs2_extent_rec *ext,
 
  54 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 
  58                                      struct ocfs2_alloc_context *meta_ac,
 
  59                                      struct buffer_head *bhs[]);
 
  61 static int ocfs2_add_branch(struct ocfs2_super *osb,
 
  64                             struct buffer_head *fe_bh,
 
  65                             struct buffer_head *eb_bh,
 
  66                             struct buffer_head *last_eb_bh,
 
  67                             struct ocfs2_alloc_context *meta_ac);
 
  69 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 
  72                                   struct buffer_head *fe_bh,
 
  73                                   struct ocfs2_alloc_context *meta_ac,
 
  74                                   struct buffer_head **ret_new_eb_bh);
 
  76 static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
 
  79                                   struct buffer_head *fe_bh,
 
  83 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
  85                                     struct buffer_head *fe_bh,
 
  86                                     struct buffer_head **target_bh);
 
  88 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
 
  90                                        struct ocfs2_dinode *fe,
 
  91                                        unsigned int new_i_clusters,
 
  92                                        struct buffer_head *old_last_eb,
 
  93                                        struct buffer_head **new_last_eb);
 
  95 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 
  97 static int ocfs2_extent_contig(struct inode *inode,
 
  98                                struct ocfs2_extent_rec *ext,
 
 101         return blkno == (le64_to_cpu(ext->e_blkno) +
 
 102                          ocfs2_clusters_to_blocks(inode->i_sb,
 
 103                                                   le32_to_cpu(ext->e_clusters)));
 
 107  * How many free extents have we got before we need more meta data?
 
 109 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 
 111                            struct ocfs2_dinode *fe)
 
 114         struct ocfs2_extent_list *el;
 
 115         struct ocfs2_extent_block *eb;
 
 116         struct buffer_head *eb_bh = NULL;
 
 120         if (!OCFS2_IS_VALID_DINODE(fe)) {
 
 121                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
 
 126         if (fe->i_last_eb_blk) {
 
 127                 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
 
 128                                           &eb_bh, OCFS2_BH_CACHED, inode);
 
 133                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 
 136                 el = &fe->id2.i_list;
 
 138         BUG_ON(el->l_tree_depth != 0);
 
 140         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
 
 149 /* expects array to already be allocated
 
 151  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 
 154 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 
 158                                      struct ocfs2_alloc_context *meta_ac,
 
 159                                      struct buffer_head *bhs[])
 
 161         int count, status, i;
 
 162         u16 suballoc_bit_start;
 
 165         struct ocfs2_extent_block *eb;
 
 170         while (count < wanted) {
 
 171                 status = ocfs2_claim_metadata(osb,
 
 183                 for(i = count;  i < (num_got + count); i++) {
 
 184                         bhs[i] = sb_getblk(osb->sb, first_blkno);
 
 185                         if (bhs[i] == NULL) {
 
 190                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
 
 192                         status = ocfs2_journal_access(handle, inode, bhs[i],
 
 193                                                       OCFS2_JOURNAL_ACCESS_CREATE);
 
 199                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
 
 200                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
 
 201                         /* Ok, setup the minimal stuff here. */
 
 202                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
 
 203                         eb->h_blkno = cpu_to_le64(first_blkno);
 
 204                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
 
 206 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
 
 207                         /* we always use slot zero's suballocator */
 
 208                         eb->h_suballoc_slot = 0;
 
 210                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
 
 212                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 
 214                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
 
 216                         suballoc_bit_start++;
 
 219                         /* We'll also be dirtied by the caller, so
 
 220                          * this isn't absolutely necessary. */
 
 221                         status = ocfs2_journal_dirty(handle, bhs[i]);
 
 234                 for(i = 0; i < wanted; i++) {
 
 245  * Add an entire tree branch to our inode. eb_bh is the extent block
 
 246  * to start at, if we don't want to start the branch at the dinode
 
 249  * last_eb_bh is required as we have to update its next_leaf pointer
 
 250  * for the new last extent block.
 
 252  * the new branch will be 'empty' in the sense that every block will
 
 253  * contain a single record with e_clusters == 0.
 
 255 static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 258                             struct buffer_head *fe_bh,
 
 259                             struct buffer_head *eb_bh,
 
 260                             struct buffer_head *last_eb_bh,
 
 261                             struct ocfs2_alloc_context *meta_ac)
 
 263         int status, new_blocks, i;
 
 264         u64 next_blkno, new_last_eb_blk;
 
 265         struct buffer_head *bh;
 
 266         struct buffer_head **new_eb_bhs = NULL;
 
 267         struct ocfs2_dinode *fe;
 
 268         struct ocfs2_extent_block *eb;
 
 269         struct ocfs2_extent_list  *eb_el;
 
 270         struct ocfs2_extent_list  *el;
 
 276         fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 279                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 
 282                 el = &fe->id2.i_list;
 
 284         /* we never add a branch to a leaf. */
 
 285         BUG_ON(!el->l_tree_depth);
 
 287         new_blocks = le16_to_cpu(el->l_tree_depth);
 
 289         /* allocate the number of new eb blocks we need */
 
 290         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
 
 298         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
 
 299                                            meta_ac, new_eb_bhs);
 
 305         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
 
 306          * linked with the rest of the tree.
 
 307          * conversely, new_eb_bhs[0] is the new bottommost leaf.
 
 309          * when we leave the loop, new_last_eb_blk will point to the
 
 310          * newest leaf, and next_blkno will point to the topmost extent
 
 312         next_blkno = new_last_eb_blk = 0;
 
 313         for(i = 0; i < new_blocks; i++) {
 
 315                 eb = (struct ocfs2_extent_block *) bh->b_data;
 
 316                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 
 317                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
 
 323                 status = ocfs2_journal_access(handle, inode, bh,
 
 324                                               OCFS2_JOURNAL_ACCESS_CREATE);
 
 330                 eb->h_next_leaf_blk = 0;
 
 331                 eb_el->l_tree_depth = cpu_to_le16(i);
 
 332                 eb_el->l_next_free_rec = cpu_to_le16(1);
 
 333                 eb_el->l_recs[0].e_cpos = fe->i_clusters;
 
 334                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
 
 335                 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
 
 336                 if (!eb_el->l_tree_depth)
 
 337                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 
 339                 status = ocfs2_journal_dirty(handle, bh);
 
 345                 next_blkno = le64_to_cpu(eb->h_blkno);
 
 348         /* This is a bit hairy. We want to update up to three blocks
 
 349          * here without leaving any of them in an inconsistent state
 
 350          * in case of error. We don't have to worry about
 
 351          * journal_dirty erroring as it won't unless we've aborted the
 
 352          * handle (in which case we would never be here) so reserving
 
 353          * the write with journal_access is all we need to do. */
 
 354         status = ocfs2_journal_access(handle, inode, last_eb_bh,
 
 355                                       OCFS2_JOURNAL_ACCESS_WRITE);
 
 360         status = ocfs2_journal_access(handle, inode, fe_bh,
 
 361                                       OCFS2_JOURNAL_ACCESS_WRITE);
 
 367                 status = ocfs2_journal_access(handle, inode, eb_bh,
 
 368                                               OCFS2_JOURNAL_ACCESS_WRITE);
 
 375         /* Link the new branch into the rest of the tree (el will
 
 376          * either be on the fe, or the extent block passed in. */
 
 377         i = le16_to_cpu(el->l_next_free_rec);
 
 378         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
 
 379         el->l_recs[i].e_cpos = fe->i_clusters;
 
 380         el->l_recs[i].e_clusters = 0;
 
 381         le16_add_cpu(&el->l_next_free_rec, 1);
 
 383         /* fe needs a new last extent block pointer, as does the
 
 384          * next_leaf on the previously last-extent-block. */
 
 385         fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
 
 387         eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
 
 388         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
 
 390         status = ocfs2_journal_dirty(handle, last_eb_bh);
 
 393         status = ocfs2_journal_dirty(handle, fe_bh);
 
 397                 status = ocfs2_journal_dirty(handle, eb_bh);
 
 405                 for (i = 0; i < new_blocks; i++)
 
 407                                 brelse(new_eb_bhs[i]);
 
 416  * adds another level to the allocation tree.
 
 417  * returns back the new extent block so you can add a branch to it
 
 420 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 
 423                                   struct buffer_head *fe_bh,
 
 424                                   struct ocfs2_alloc_context *meta_ac,
 
 425                                   struct buffer_head **ret_new_eb_bh)
 
 428         struct buffer_head *new_eb_bh = NULL;
 
 429         struct ocfs2_dinode *fe;
 
 430         struct ocfs2_extent_block *eb;
 
 431         struct ocfs2_extent_list  *fe_el;
 
 432         struct ocfs2_extent_list  *eb_el;
 
 436         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
 
 443         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
 
 444         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 
 445                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
 
 451         fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 452         fe_el = &fe->id2.i_list;
 
 454         status = ocfs2_journal_access(handle, inode, new_eb_bh,
 
 455                                       OCFS2_JOURNAL_ACCESS_CREATE);
 
 461         /* copy the fe data into the new extent block */
 
 462         eb_el->l_tree_depth = fe_el->l_tree_depth;
 
 463         eb_el->l_next_free_rec = fe_el->l_next_free_rec;
 
 464         for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
 
 465                 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
 
 466                 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
 
 467                 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
 
 470         status = ocfs2_journal_dirty(handle, new_eb_bh);
 
 476         status = ocfs2_journal_access(handle, inode, fe_bh,
 
 477                                       OCFS2_JOURNAL_ACCESS_WRITE);
 
 484         le16_add_cpu(&fe_el->l_tree_depth, 1);
 
 485         fe_el->l_recs[0].e_cpos = 0;
 
 486         fe_el->l_recs[0].e_blkno = eb->h_blkno;
 
 487         fe_el->l_recs[0].e_clusters = fe->i_clusters;
 
 488         for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
 
 489                 fe_el->l_recs[i].e_cpos = 0;
 
 490                 fe_el->l_recs[i].e_clusters = 0;
 
 491                 fe_el->l_recs[i].e_blkno = 0;
 
 493         fe_el->l_next_free_rec = cpu_to_le16(1);
 
 495         /* If this is our 1st tree depth shift, then last_eb_blk
 
 496          * becomes the allocated extent block */
 
 497         if (fe_el->l_tree_depth == cpu_to_le16(1))
 
 498                 fe->i_last_eb_blk = eb->h_blkno;
 
 500         status = ocfs2_journal_dirty(handle, fe_bh);
 
 506         *ret_new_eb_bh = new_eb_bh;
 
 518  * Expects the tree to already have room in the rightmost leaf for the
 
 519  * extent.  Updates all the extent blocks (and the dinode) on the way
 
 522 static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
 
 525                                   struct buffer_head *fe_bh,
 
 529         int status, i, num_bhs = 0;
 
 532         struct buffer_head **eb_bhs = NULL;
 
 533         struct ocfs2_dinode *fe;
 
 534         struct ocfs2_extent_block *eb;
 
 535         struct ocfs2_extent_list  *el;
 
 539         status = ocfs2_journal_access(handle, inode, fe_bh,
 
 540                                       OCFS2_JOURNAL_ACCESS_WRITE);
 
 546         fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 547         el = &fe->id2.i_list;
 
 548         if (el->l_tree_depth) {
 
 549                 /* This is another operation where we want to be
 
 550                  * careful about our tree updates. An error here means
 
 551                  * none of the previous changes we made should roll
 
 552                  * forward. As a result, we have to record the buffers
 
 553                  * for this part of the tree in an array and reserve a
 
 554                  * journal write to them before making any changes. */
 
 555                 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
 
 556                 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
 
 565                 while(el->l_tree_depth) {
 
 566                         next_free = le16_to_cpu(el->l_next_free_rec);
 
 567                         if (next_free == 0) {
 
 568                                 ocfs2_error(inode->i_sb,
 
 569                                             "Dinode %llu has a bad extent list",
 
 570                                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 574                         next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
 
 576                         BUG_ON(i >= num_bhs);
 
 577                         status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
 
 578                                                   OCFS2_BH_CACHED, inode);
 
 583                         eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
 
 584                         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 
 585                                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
 
 591                         status = ocfs2_journal_access(handle, inode, eb_bhs[i],
 
 592                                                       OCFS2_JOURNAL_ACCESS_WRITE);
 
 600                         /* When we leave this loop, eb_bhs[num_bhs - 1] will
 
 601                          * hold the bottom-most leaf extent block. */
 
 603                 BUG_ON(el->l_tree_depth);
 
 605                 el = &fe->id2.i_list;
 
 606                 /* If we have tree depth, then the fe update is
 
 607                  * trivial, and we want to switch el out for the
 
 608                  * bottom-most leaf in order to update it with the
 
 609                  * actual extent data below. */
 
 610                 next_free = le16_to_cpu(el->l_next_free_rec);
 
 611                 if (next_free == 0) {
 
 612                         ocfs2_error(inode->i_sb,
 
 613                                     "Dinode %llu has a bad extent list",
 
 614                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 618                 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
 
 620                 /* (num_bhs - 1) to avoid the leaf */
 
 621                 for(i = 0; i < (num_bhs - 1); i++) {
 
 622                         eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
 
 625                         /* finally, make our actual change to the
 
 626                          * intermediate extent blocks. */
 
 627                         next_free = le16_to_cpu(el->l_next_free_rec);
 
 628                         le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
 
 631                         status = ocfs2_journal_dirty(handle, eb_bhs[i]);
 
 635                 BUG_ON(i != (num_bhs - 1));
 
 636                 /* note that the leaf block wasn't touched in
 
 638                 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
 
 640                 BUG_ON(el->l_tree_depth);
 
 643         /* yay, we can finally add the actual extent now! */
 
 644         i = le16_to_cpu(el->l_next_free_rec) - 1;
 
 645         if (le16_to_cpu(el->l_next_free_rec) &&
 
 646             ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
 
 647                 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
 
 648         } else if (le16_to_cpu(el->l_next_free_rec) &&
 
 649                    (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
 
 650                 /* having an empty extent at eof is legal. */
 
 651                 if (el->l_recs[i].e_cpos != fe->i_clusters) {
 
 652                         ocfs2_error(inode->i_sb,
 
 653                                     "Dinode %llu trailing extent is bad: "
 
 654                                     "cpos (%u) != number of clusters (%u)",
 
 655                                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 
 656                                     le32_to_cpu(el->l_recs[i].e_cpos),
 
 657                                     le32_to_cpu(fe->i_clusters));
 
 661                 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
 
 662                 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
 
 664                 /* No contiguous record, or no empty record at eof, so
 
 665                  * we add a new one. */
 
 667                 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
 
 668                        le16_to_cpu(el->l_count));
 
 669                 i = le16_to_cpu(el->l_next_free_rec);
 
 671                 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
 
 672                 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
 
 673                 el->l_recs[i].e_cpos = fe->i_clusters;
 
 674                 le16_add_cpu(&el->l_next_free_rec, 1);
 
 678          * extent_map errors are not fatal, so they are ignored outside
 
 679          * of flushing the thing.
 
 681         status = ocfs2_extent_map_append(inode, &el->l_recs[i],
 
 685                 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
 
 688         status = ocfs2_journal_dirty(handle, fe_bh);
 
 691         if (fe->id2.i_list.l_tree_depth) {
 
 692                 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
 
 700                 for (i = 0; i < num_bhs; i++)
 
 711  * Should only be called when there is no space left in any of the
 
 712  * leaf nodes. What we want to do is find the lowest tree depth
 
 713  * non-leaf extent block with room for new records. There are three
 
 714  * valid results of this search:
 
 716  * 1) a lowest extent block is found, then we pass it back in
 
 717  *    *lowest_eb_bh and return '0'
 
 719  * 2) the search fails to find anything, but the dinode has room. We
 
 720  *    pass NULL back in *lowest_eb_bh, but still return '0'
 
 722  * 3) the search fails to find anything AND the dinode is full, in
 
 723  *    which case we return > 0
 
 725  * return status < 0 indicates an error.
 
 727 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 729                                     struct buffer_head *fe_bh,
 
 730                                     struct buffer_head **target_bh)
 
 734         struct ocfs2_dinode *fe;
 
 735         struct ocfs2_extent_block *eb;
 
 736         struct ocfs2_extent_list  *el;
 
 737         struct buffer_head *bh = NULL;
 
 738         struct buffer_head *lowest_bh = NULL;
 
 744         fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 745         el = &fe->id2.i_list;
 
 747         while(le16_to_cpu(el->l_tree_depth) > 1) {
 
 748                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
 
 749                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
 
 750                                     "extent list (next_free_rec == 0)",
 
 751                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 755                 i = le16_to_cpu(el->l_next_free_rec) - 1;
 
 756                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
 
 758                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
 
 759                                     "list where extent # %d has no physical "
 
 761                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
 
 771                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
 
 778                 eb = (struct ocfs2_extent_block *) bh->b_data;
 
 779                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 
 780                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
 
 786                 if (le16_to_cpu(el->l_next_free_rec) <
 
 787                     le16_to_cpu(el->l_count)) {
 
 795         /* If we didn't find one and the fe doesn't have any room,
 
 798             && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
 
 801         *target_bh = lowest_bh;
 
 810 /* the caller needs to update fe->i_clusters */
 
 811 int ocfs2_insert_extent(struct ocfs2_super *osb,
 
 814                         struct buffer_head *fe_bh,
 
 817                         struct ocfs2_alloc_context *meta_ac)
 
 819         int status, i, shift;
 
 820         struct buffer_head *last_eb_bh = NULL;
 
 821         struct buffer_head *bh = NULL;
 
 822         struct ocfs2_dinode *fe;
 
 823         struct ocfs2_extent_block *eb;
 
 824         struct ocfs2_extent_list  *el;
 
 828         mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
 
 829              new_clusters, (unsigned long long)start_blk,
 
 830              (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 832         fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 833         el = &fe->id2.i_list;
 
 835         if (el->l_tree_depth) {
 
 836                 /* jump to end of tree */
 
 837                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
 
 838                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
 
 843                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
 
 847         /* Can we allocate without adding/shifting tree bits? */
 
 848         i = le16_to_cpu(el->l_next_free_rec) - 1;
 
 849         if (le16_to_cpu(el->l_next_free_rec) == 0
 
 850             || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
 
 851             || le32_to_cpu(el->l_recs[i].e_clusters) == 0
 
 852             || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
 
 855         mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
 
 858         shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
 
 865         /* We traveled all the way to the bottom of the allocation tree
 
 866          * and didn't find room for any more extents - we need to add
 
 867          * another tree level */
 
 869                 /* if we hit a leaf, it had better be full (no free records left) :) */
 
 870                 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
 
 871                        le16_to_cpu(el->l_count));
 
 873                 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
 
 875                      le16_to_cpu(fe->id2.i_list.l_tree_depth));
 
 877                 /* ocfs2_shift_tree_depth will return us a buffer with
 
 878                  * the new extent block (so we can pass that to
 
 879                  * ocfs2_add_branch). */
 
 880                 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
 
 886                 /* Special case: we have room now if we shifted from
 
 888                 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
 
 892         /* call ocfs2_add_branch to add the final part of the tree with
 
 894         mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
 
 895         status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
 
 903         /* Finally, we can add clusters. */
 
 904         status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
 
 905                                         start_blk, new_clusters);
 
 920 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 
 922         struct buffer_head *tl_bh = osb->osb_tl_bh;
 
 923         struct ocfs2_dinode *di;
 
 924         struct ocfs2_truncate_log *tl;
 
 926         di = (struct ocfs2_dinode *) tl_bh->b_data;
 
 927         tl = &di->id2.i_dealloc;
 
 929         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
 
 930                         "slot %d, invalid truncate log parameters: used = "
 
 931                         "%u, count = %u\n", osb->slot_num,
 
 932                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
 
 933         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
 
 936 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
 
 937                                            unsigned int new_start)
 
 939         unsigned int tail_index;
 
 940         unsigned int current_tail;
 
 942         /* No records, nothing to coalesce */
 
 943         if (!le16_to_cpu(tl->tl_used))
 
 946         tail_index = le16_to_cpu(tl->tl_used) - 1;
 
 947         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
 
 948         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
 
 950         return current_tail == new_start;
 
 953 static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 
 956                                      unsigned int num_clusters)
 
 959         unsigned int start_cluster, tl_count;
 
 960         struct inode *tl_inode = osb->osb_tl_inode;
 
 961         struct buffer_head *tl_bh = osb->osb_tl_bh;
 
 962         struct ocfs2_dinode *di;
 
 963         struct ocfs2_truncate_log *tl;
 
 965         mlog_entry("start_blk = %llu, num_clusters = %u\n",
 
 966                    (unsigned long long)start_blk, num_clusters);
 
 968         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 970         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
 
 972         di = (struct ocfs2_dinode *) tl_bh->b_data;
 
 973         tl = &di->id2.i_dealloc;
 
 974         if (!OCFS2_IS_VALID_DINODE(di)) {
 
 975                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
 
 980         tl_count = le16_to_cpu(tl->tl_count);
 
 981         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
 
 983                         "Truncate record count on #%llu invalid "
 
 984                         "wanted %u, actual %u\n",
 
 985                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
 
 986                         ocfs2_truncate_recs_per_inode(osb->sb),
 
 987                         le16_to_cpu(tl->tl_count));
 
 989         /* Caller should have known to flush before calling us. */
 
 990         index = le16_to_cpu(tl->tl_used);
 
 991         if (index >= tl_count) {
 
 997         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
 
 998                                       OCFS2_JOURNAL_ACCESS_WRITE);
 
1004         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
 
1005              "%llu (index = %d)\n", num_clusters, start_cluster,
 
1006              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
 
1008         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
 
1010                  * Move index back to the record we are coalescing with.
 
1011                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
 
1015                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
 
1016                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
 
1017                      index, le32_to_cpu(tl->tl_recs[index].t_start),
 
1020                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
 
1021                 tl->tl_used = cpu_to_le16(index + 1);
 
1023         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
 
1025         status = ocfs2_journal_dirty(handle, tl_bh);
 
/*
 * Replay records of the local truncate log (osb->osb_tl_bh) into the
 * cluster allocator described by data_alloc_inode / data_alloc_bh.
 *
 * Work starts at the last used record (index tl_used - 1).  For the record
 * being handled, the visible code:
 *   - takes journal write access to the truncate log dinode,
 *   - stores the record's index back into tl_used and dirties the buffer,
 *   - extends the running transaction by OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC,
 *   - converts the record's t_start to a block number and passes that block
 *     range, with t_clusters, to ocfs2_free_clusters().
 *
 * The transaction handle used throughout the body is supplied by the
 * caller.
 *
 * NOTE(review): the loop construct, the start_blk test and the error paths
 * are not visible here; presumably the walk continues down to index 0 --
 * confirm against the full function.
 */
1036 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 
1038                                          struct inode *data_alloc_inode,
 
1039                                          struct buffer_head *data_alloc_bh)
 
1043         unsigned int num_clusters;
 
1045         struct ocfs2_truncate_rec rec;
 
1046         struct ocfs2_dinode *di;
 
1047         struct ocfs2_truncate_log *tl;
 
1048         struct inode *tl_inode = osb->osb_tl_inode;
 
1049         struct buffer_head *tl_bh = osb->osb_tl_bh;
 
1053         di = (struct ocfs2_dinode *) tl_bh->b_data;
 
1054         tl = &di->id2.i_dealloc;
 
1055         i = le16_to_cpu(tl->tl_used) - 1;
 
1057                 /* Caller has given us at least enough credits to
 
1058                  * update the truncate log dinode */
 
1059                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
 
1060                                               OCFS2_JOURNAL_ACCESS_WRITE);
 
1066                 tl->tl_used = cpu_to_le16(i);
 
1068                 status = ocfs2_journal_dirty(handle, tl_bh);
 
1074                 /* TODO: Perhaps we can calculate the bulk of the
 
1075                  * credits up front rather than extending like
 
1077                 status = ocfs2_extend_trans(handle,
 
1078                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
 
1084                 rec = tl->tl_recs[i];
 
1085                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
 
1086                                                     le32_to_cpu(rec.t_start));
 
1087                 num_clusters = le32_to_cpu(rec.t_clusters);
 
1089                 /* if start_blk is not set, we ignore the record as
 
1092                         mlog(0, "free record %d, start = %u, clusters = %u\n",
 
1093                              i, le32_to_cpu(rec.t_start), num_clusters);
 
1095                         status = ocfs2_free_clusters(handle, data_alloc_inode,
 
1096                                                      data_alloc_bh, start_blk,
 
/*
 * Flush the truncate log referenced by osb->osb_tl_inode / osb->osb_tl_bh.
 *
 * Sequence visible in this function:
 *   - assert that tl_inode->i_mutex is already held;
 *   - validate the log dinode with OCFS2_IS_VALID_DINODE, reporting through
 *     OCFS2_RO_ON_INVALID_DINODE when the check fails;
 *   - read tl_used as the number of records to flush, with zero handled as
 *     a separate case;
 *   - look up the global bitmap system inode, take its i_mutex and then
 *     ocfs2_meta_lock() on it (last argument 1), which also yields
 *     data_alloc_bh;
 *   - start a transaction sized OCFS2_TRUNCATE_LOG_UPDATE and run
 *     ocfs2_replay_truncate_records() inside it;
 *   - commit, then release in reverse order: brelse, ocfs2_meta_unlock,
 *     mutex_unlock, iput.
 *
 * NOTE(review): the bail-out labels and the return statement are not
 * visible here.
 */
1111 /* Expects you to already be holding tl_inode->i_mutex */
 
1112 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 
1115         unsigned int num_to_flush;
 
1117         struct inode *tl_inode = osb->osb_tl_inode;
 
1118         struct inode *data_alloc_inode = NULL;
 
1119         struct buffer_head *tl_bh = osb->osb_tl_bh;
 
1120         struct buffer_head *data_alloc_bh = NULL;
 
1121         struct ocfs2_dinode *di;
 
1122         struct ocfs2_truncate_log *tl;
 
/* A successful trylock would mean the caller did not hold i_mutex. */
1126         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
1128         di = (struct ocfs2_dinode *) tl_bh->b_data;
 
1129         tl = &di->id2.i_dealloc;
 
1130         if (!OCFS2_IS_VALID_DINODE(di)) {
 
1131                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
 
1136         num_to_flush = le16_to_cpu(tl->tl_used);
 
1137         mlog(0, "Flush %u records from truncate log #%llu\n",
 
1138              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
 
1139         if (!num_to_flush) {
 
1144         data_alloc_inode = ocfs2_get_system_file_inode(osb,
 
1145                                                        GLOBAL_BITMAP_SYSTEM_INODE,
 
1146                                                        OCFS2_INVALID_SLOT);
 
1147         if (!data_alloc_inode) {
 
1149                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
 
1153         mutex_lock(&data_alloc_inode->i_mutex);
 
1155         status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
 
1161         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
 
1162         if (IS_ERR(handle)) {
 
1163                 status = PTR_ERR(handle);
 
1168         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
 
1173         ocfs2_commit_trans(osb, handle);
 
1176         brelse(data_alloc_bh);
 
1177         ocfs2_meta_unlock(data_alloc_inode, 1);
 
1180         mutex_unlock(&data_alloc_inode->i_mutex);
 
1181         iput(data_alloc_inode);
 
/*
 * Locked entry point for flushing the truncate log: takes
 * osb->osb_tl_inode->i_mutex, calls __ocfs2_flush_truncate_log(), and
 * drops the mutex again.  The inner call's result is kept in status
 * (the return statement itself is not visible here).
 */
1188 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 
1191         struct inode *tl_inode = osb->osb_tl_inode;
 
1193         mutex_lock(&tl_inode->i_mutex);
 
1194         status = __ocfs2_flush_truncate_log(osb);
 
1195         mutex_unlock(&tl_inode->i_mutex);
 
/*
 * Delayed-work callback for the truncate log.  Recovers the owning
 * ocfs2_super from the work item embedded in osb_truncate_log_wq and
 * flushes that super block's truncate log via ocfs2_flush_truncate_log().
 */
1200 static void ocfs2_truncate_log_worker(struct work_struct *work)
 
1203         struct ocfs2_super *osb =
 
1204                 container_of(work, struct ocfs2_super,
 
1205                              osb_truncate_log_wq.work);
 
1209         status = ocfs2_flush_truncate_log(osb);
 
/*
 * Queue the truncate-log flush work (osb_truncate_log_wq) on ocfs2_wq so
 * that it runs after OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL, i.e. 2 * HZ
 * jiffies.  Nothing is done when osb->osb_tl_inode is not set.
 *
 * cancel_delayed_work() may be called on the work before it is re-queued.
 * NOTE(review): the condition guarding that cancel and the second
 * parameter's declaration are not visible here; presumably the second
 * argument selects it -- confirm.
 */
1216 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
 
1217 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
 
1220         if (osb->osb_tl_inode) {
 
1221                 /* We want to push off log flushes while truncates are
 
1224                         cancel_delayed_work(&osb->osb_truncate_log_wq);
 
1226                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
 
1227                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
 
/*
 * Look up a truncate log system inode and read its dinode block.
 *
 * The inode is obtained with ocfs2_get_system_file_inode() for
 * TRUNCATE_LOG_SYSTEM_INODE; an ML_ERROR message is logged on the failure
 * path.  Its on-disk block (OCFS2_I(inode)->ip_blkno) is then read with
 * ocfs2_read_block() using OCFS2_BH_CACHED.
 *
 * NOTE(review): the slot argument and the stores into *tl_inode / *tl_bh
 * are not visible here; presumably both out-parameters are filled only on
 * success -- confirm.
 */
1231 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
 
1233                                        struct inode **tl_inode,
 
1234                                        struct buffer_head **tl_bh)
 
1237         struct inode *inode = NULL;
 
1238         struct buffer_head *bh = NULL;
 
1240         inode = ocfs2_get_system_file_inode(osb,
 
1241                                            TRUNCATE_LOG_SYSTEM_INODE,
 
1245                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
 
1249         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 
1250                                   OCFS2_BH_CACHED, inode);
 
1264 /* called during the 1st stage of node recovery. we stamp a clean
 
1265  * truncate log and pass back a copy for processing later. if the
 
1266  * truncate log does not require processing, *tl_copy is set to
 
/*
 * First stage of recovering the truncate log that belongs to slot_num.
 *
 * The log's inode and dinode buffer are fetched with
 * ocfs2_get_truncate_log_info() and the dinode is validated.  When tl_used
 * is non-zero, a buffer of tl_bh->b_size bytes is allocated with
 * GFP_KERNEL, the whole dinode block is copied into it and handed back
 * through *tl_copy, and the on-disk block is rewritten with
 * ocfs2_write_block().
 *
 * The final visible test (status < 0 with *tl_copy set) handles a failure
 * after the copy was made.
 * NOTE(review): the statement that clears the on-disk log and the body of
 * that failure branch are not visible here; presumably the copy is freed
 * and *tl_copy reset -- confirm.
 */
1268 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 
1270                                       struct ocfs2_dinode **tl_copy)
 
1273         struct inode *tl_inode = NULL;
 
1274         struct buffer_head *tl_bh = NULL;
 
1275         struct ocfs2_dinode *di;
 
1276         struct ocfs2_truncate_log *tl;
 
1280         mlog(0, "recover truncate log from slot %d\n", slot_num);
 
1282         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
 
1288         di = (struct ocfs2_dinode *) tl_bh->b_data;
 
1289         tl = &di->id2.i_dealloc;
 
1290         if (!OCFS2_IS_VALID_DINODE(di)) {
 
1291                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
 
1296         if (le16_to_cpu(tl->tl_used)) {
 
1297                 mlog(0, "We'll have %u logs to recover\n",
 
1298                      le16_to_cpu(tl->tl_used));
 
1300                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
 
1307                 /* Assuming the write-out below goes well, this copy
 
1308                  * will be passed back to recovery for processing. */
 
1309                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
 
1311                 /* All we need to do to clear the truncate log is set
 
1315                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
 
1328         if (status < 0 && (*tl_copy)) {
 
/*
 * Second stage of truncate log recovery: move every record held in
 * tl_copy (a copy of another truncate log dinode) into this node's own
 * truncate log.
 *
 * A copy whose i_blkno equals this node's own truncate log block is
 * reported with an ML_ERROR message.  Otherwise, with tl_inode->i_mutex
 * held, each of the tl_used records is processed in its own
 * OCFS2_TRUNCATE_LOG_UPDATE transaction: the local log is flushed first
 * when ocfs2_truncate_log_needs_flush() says so, then the record's start
 * cluster (converted to a block number) and cluster count are passed to
 * ocfs2_truncate_log_append().
 *
 * NOTE(review): the "cleanup %u records" message prints tl_copy->i_blkno
 * without le64_to_cpu(), unlike the comparison just above it -- confirm
 * whether that is intended.
 */
1337 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 
1338                                          struct ocfs2_dinode *tl_copy)
 
1342         unsigned int clusters, num_recs, start_cluster;
 
1345         struct inode *tl_inode = osb->osb_tl_inode;
 
1346         struct ocfs2_truncate_log *tl;
 
1350         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
 
1351                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
 
1355         tl = &tl_copy->id2.i_dealloc;
 
1356         num_recs = le16_to_cpu(tl->tl_used);
 
1357         mlog(0, "cleanup %u records from %llu\n", num_recs,
 
1358              (unsigned long long)tl_copy->i_blkno);
 
1360         mutex_lock(&tl_inode->i_mutex);
 
1361         for(i = 0; i < num_recs; i++) {
 
1362                 if (ocfs2_truncate_log_needs_flush(osb)) {
 
1363                         status = __ocfs2_flush_truncate_log(osb);
 
1370                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
 
1371                 if (IS_ERR(handle)) {
 
1372                         status = PTR_ERR(handle);
 
1377                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
 
1378                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
 
1379                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
 
1381                 status = ocfs2_truncate_log_append(osb, handle,
 
1382                                                    start_blk, clusters);
 
1383                 ocfs2_commit_trans(osb, handle);
 
1391         mutex_unlock(&tl_inode->i_mutex);
 
/*
 * Tear down the truncate log state set up by ocfs2_truncate_log_init():
 * cancel the pending delayed flush, flush ocfs2_wq, flush the truncate log
 * one last time, then drop the references held in osb->osb_tl_bh and
 * osb->osb_tl_inode.
 *
 * NOTE(review): the guard around this sequence is not visible here;
 * presumably it only runs when osb->osb_tl_inode is set -- confirm.
 */
1397 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
 
1400         struct inode *tl_inode = osb->osb_tl_inode;
 
/* Stop the delayed work and flush ocfs2_wq before the final flush below. */
1405                 cancel_delayed_work(&osb->osb_truncate_log_wq);
 
1406                 flush_workqueue(ocfs2_wq);
 
1408                 status = ocfs2_flush_truncate_log(osb);
 
1412                 brelse(osb->osb_tl_bh);
 
1413                 iput(osb->osb_tl_inode);
 
/*
 * Set up this super block's truncate log: fetch the log inode and its
 * dinode buffer with ocfs2_get_truncate_log_info(), initialise the delayed
 * work item with ocfs2_truncate_log_worker as its handler, and only then
 * publish the buffer and inode in osb->osb_tl_bh / osb->osb_tl_inode.
 *
 * NOTE(review): the remaining arguments of the lookup call and the error
 * handling after it are not visible here.
 */
1419 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
 
1422         struct inode *tl_inode = NULL;
 
1423         struct buffer_head *tl_bh = NULL;
 
1427         status = ocfs2_get_truncate_log_info(osb,
 
1434         /* ocfs2_truncate_log_shutdown keys on the existence of
 
1435          * osb->osb_tl_inode so we don't set any of the osb variables
 
1436          * until we're sure all is well. */
 
1437         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
 
1438                           ocfs2_truncate_log_worker);
 
1439         osb->osb_tl_bh    = tl_bh;
 
1440         osb->osb_tl_inode = tl_inode;
 
1446 /* This function will figure out whether the currently last extent
 
1447  * block will be deleted, and if it will, what the new last extent
 
1448  * block will be so we can update its h_next_leaf_blk field, as well
 
1449  * as the dinode's i_last_eb_blk */
 
/*
 * Work out which extent block becomes the rightmost leaf once the file is
 * cut down to new_i_clusters, returning it through *new_last_eb.
 *
 * *new_last_eb starts out NULL.  After validating the dinode, three cases
 * are tested before any tree walk: the dinode has no tree
 * (l_tree_depth == 0), the truncate is to zero clusters, or the first
 * record of the current last extent block starts below new_i_clusters, so
 * that block keeps some data.
 *
 * Otherwise the tree is descended from the dinode's extent list.  At each
 * level the records are scanned from last to first for one whose e_cpos is
 * compared against the new size, and its e_blkno is read with
 * ocfs2_read_block() and validated as an extent block.  The descent stops
 * at tree depth zero; the chosen buffer then gets an extra reference via
 * get_bh().
 *
 * NOTE(review): the statements taken in the three early cases, the
 * right-hand side of the e_cpos comparison and the assignment to
 * *new_last_eb are not visible here.
 */
1450 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
 
1451                                        struct inode *inode,
 
1452                                        struct ocfs2_dinode *fe,
 
1454                                        struct buffer_head *old_last_eb,
 
1455                                        struct buffer_head **new_last_eb)
 
1459         struct ocfs2_extent_block *eb;
 
1460         struct ocfs2_extent_list *el;
 
1461         struct buffer_head *bh = NULL;
 
1463         *new_last_eb = NULL;
 
1465         if (!OCFS2_IS_VALID_DINODE(fe)) {
 
1466                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
 
1471         /* we have no tree, so of course, no last_eb. */
 
1472         if (!fe->id2.i_list.l_tree_depth)
 
1475         /* trunc to zero special case - this makes tree_depth = 0
 
1476          * regardless of what it is.  */
 
1477         if (!new_i_clusters)
 
1480         eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
 
1482         BUG_ON(!el->l_next_free_rec);
 
1484         /* Make sure that this guy will actually be empty after we
 
1485          * clear away the data. */
 
1486         if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
 
1489         /* Ok, at this point, we know that last_eb will definitely
 
1490          * change, so lets traverse the tree and find the second to
 
1491          * last extent block. */
 
1492         el = &(fe->id2.i_list);
 
1493         /* go down the tree, */
 
1495                 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
 
1496                         if (le32_to_cpu(el->l_recs[i].e_cpos) <
 
1498                                 block = le64_to_cpu(el->l_recs[i].e_blkno);
 
1509                 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
 
1515                 eb = (struct ocfs2_extent_block *) bh->b_data;
 
1517                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 
1518                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
 
1522         } while (el->l_tree_depth);
 
1525         get_bh(*new_last_eb);
 
1526         mlog(0, "returning block %llu\n",
 
1527              (unsigned long long)le64_to_cpu(eb->h_blkno));
 
/*
 * Remove clusters_to_del clusters from the tail of the inode's extent
 * tree, inside the transaction the caller has already started.
 *
 * Sequence visible in this function:
 *  1. ocfs2_find_new_last_ext_blk() reports which extent block, if any,
 *     becomes the new rightmost leaf (last_eb_bh / last_eb).
 *  2. The dinode buffer is journaled.  OCFS2_I(inode)->ip_clusters is
 *     updated under ip_lock, fe->i_clusters is reduced by clusters_to_del
 *     and fe->i_mtime / i_mtime_nsec are stamped with CURRENT_TIME.
 *  3. The last record of the dinode's extent list is shrunk by
 *     clusters_to_del.  At tree depth zero the first block to free
 *     (delete_blk) is e_blkno plus the blocks still covered by the record;
 *     otherwise e_blkno is remembered in next_eb.  A record that reaches
 *     zero clusters has e_cpos and e_blkno cleared and l_next_free_rec
 *     decremented.
 *  4. When fe->i_clusters is zero, l_tree_depth and i_last_eb_blk are
 *     reset to zero.  The other visible assignment takes i_last_eb_blk
 *     from last_eb->h_blkno.
 *  5. The new last extent block is journaled and its h_next_leaf_blk is
 *     set to zero.
 *  6. For trees deeper than zero, the extent block at next_eb is read,
 *     validated, journaled and trimmed the same way as in step 3.  A block
 *     left with no records is removed from the inode's cache and, only if
 *     its h_suballoc_slot is 0, passed to ocfs2_free_extent_block() with
 *     the allocator inode / bh carried in tc.
 *  7. delete_blk must be set by then (BUG_ON).  It is passed to
 *     ocfs2_truncate_log_append(), and the in-memory extent map is trimmed
 *     with ocfs2_extent_map_trunc() or dropped with
 *     ocfs2_extent_map_drop().
 *
 * NOTE(review): the loop around step 6, the conditions selecting the
 * branches in steps 4, 5 and 7, the remaining arguments of several calls
 * and all error paths are not visible here -- confirm against the full
 * function.
 */
1535 static int ocfs2_do_truncate(struct ocfs2_super *osb,
 
1536                              unsigned int clusters_to_del,
 
1537                              struct inode *inode,
 
1538                              struct buffer_head *fe_bh,
 
1539                              struct buffer_head *old_last_eb_bh,
 
1541                              struct ocfs2_truncate_context *tc)
 
1543         int status, i, depth;
 
1544         struct ocfs2_dinode *fe;
 
1545         struct ocfs2_extent_block *eb;
 
1546         struct ocfs2_extent_block *last_eb = NULL;
 
1547         struct ocfs2_extent_list *el;
 
1548         struct buffer_head *eb_bh = NULL;
 
1549         struct buffer_head *last_eb_bh = NULL;
 
1553         fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
1555         status = ocfs2_find_new_last_ext_blk(osb,
 
1558                                              le32_to_cpu(fe->i_clusters) -
 
1567                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
 
1569         status = ocfs2_journal_access(handle, inode, fe_bh,
 
1570                                       OCFS2_JOURNAL_ACCESS_WRITE);
 
1575         el = &(fe->id2.i_list);
 
1577         spin_lock(&OCFS2_I(inode)->ip_lock);
 
1578         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
 
1580         spin_unlock(&OCFS2_I(inode)->ip_lock);
 
1581         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
 
1582         fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
 
1583         fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
 
1585         i = le16_to_cpu(el->l_next_free_rec) - 1;
 
1587         BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
 
1588         le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
 
1589         /* tree depth zero, we can just delete the clusters, otherwise
 
1590          * we need to record the offset of the next level extent block
 
1591          * as we may overwrite it. */
 
1592         if (!el->l_tree_depth)
 
1593                 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
 
1594                         + ocfs2_clusters_to_blocks(osb->sb,
 
1595                                         le32_to_cpu(el->l_recs[i].e_clusters));
 
1597                 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
 
1599         if (!el->l_recs[i].e_clusters) {
 
1600                 /* if we deleted the whole extent record, then clear
 
1601                  * out the other fields and update the extent
 
1602                  * list. For depth > 0 trees, we've already recorded
 
1603                  * the extent block in 'next_eb' */
 
1604                 el->l_recs[i].e_cpos = 0;
 
1605                 el->l_recs[i].e_blkno = 0;
 
1606                 BUG_ON(!el->l_next_free_rec);
 
1607                 le16_add_cpu(&el->l_next_free_rec, -1);
 
1610         depth = le16_to_cpu(el->l_tree_depth);
 
1611         if (!fe->i_clusters) {
 
1612                 /* trunc to zero is a special case. */
 
1613                 el->l_tree_depth = 0;
 
1614                 fe->i_last_eb_blk = 0;
 
1616                 fe->i_last_eb_blk = last_eb->h_blkno;
 
1618         status = ocfs2_journal_dirty(handle, fe_bh);
 
1625                 /* If there will be a new last extent block, then by
 
1626                  * definition, there cannot be any leaves to the right of
 
1628                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
 
1629                                               OCFS2_JOURNAL_ACCESS_WRITE);
 
1634                 last_eb->h_next_leaf_blk = 0;
 
1635                 status = ocfs2_journal_dirty(handle, last_eb_bh);
 
1642         /* if our tree depth > 0, update all the tree blocks below us. */
 
1644                 mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
 
1645                      depth,  (unsigned long long)next_eb);
 
1646                 status = ocfs2_read_block(osb, next_eb, &eb_bh,
 
1647                                           OCFS2_BH_CACHED, inode);
 
1652                 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
 
1653                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 
1654                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
 
1660                 status = ocfs2_journal_access(handle, inode, eb_bh,
 
1661                                               OCFS2_JOURNAL_ACCESS_WRITE);
 
1667                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
 
1668                 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
 
1670                 i = le16_to_cpu(el->l_next_free_rec) - 1;
 
1672                 mlog(0, "extent block %llu, before: record %d: "
 
1673                      "(%u, %u, %llu), next = %u\n",
 
1674                      (unsigned long long)le64_to_cpu(eb->h_blkno), i,
 
1675                      le32_to_cpu(el->l_recs[i].e_cpos),
 
1676                      le32_to_cpu(el->l_recs[i].e_clusters),
 
1677                      (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
 
1678                      le16_to_cpu(el->l_next_free_rec));
 
1680                 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
 
1681                 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
 
1683                 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
 
1684                 /* bottom-most block requires us to delete data.*/
 
1685                 if (!el->l_tree_depth)
 
1686                         delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
 
1687                                 + ocfs2_clusters_to_blocks(osb->sb,
 
1688                                         le32_to_cpu(el->l_recs[i].e_clusters));
 
1689                 if (!el->l_recs[i].e_clusters) {
 
1690                         el->l_recs[i].e_cpos = 0;
 
1691                         el->l_recs[i].e_blkno = 0;
 
1692                         BUG_ON(!el->l_next_free_rec);
 
1693                         le16_add_cpu(&el->l_next_free_rec, -1);
 
1695                 mlog(0, "extent block %llu, after: record %d: "
 
1696                      "(%u, %u, %llu), next = %u\n",
 
1697                      (unsigned long long)le64_to_cpu(eb->h_blkno), i,
 
1698                      le32_to_cpu(el->l_recs[i].e_cpos),
 
1699                      le32_to_cpu(el->l_recs[i].e_clusters),
 
1700                      (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
 
1701                      le16_to_cpu(el->l_next_free_rec));
 
1703                 status = ocfs2_journal_dirty(handle, eb_bh);
 
1709                 if (!el->l_next_free_rec) {
 
1710                         mlog(0, "deleting this extent block.\n");
 
1712                         ocfs2_remove_from_cache(inode, eb_bh);
 
1714                         BUG_ON(el->l_recs[0].e_clusters);
 
1715                         BUG_ON(el->l_recs[0].e_cpos);
 
1716                         BUG_ON(el->l_recs[0].e_blkno);
 
1717                         if (eb->h_suballoc_slot == 0) {
 
1719                                  * This code only understands how to
 
1720                                  * lock the suballocator in slot 0,
 
1721                                  * which is fine because allocation is
 
1722                                  * only ever done out of that
 
1723                                  * suballocator too. A future version
 
1724                                  * might change that however, so avoid
 
1725                                  * a free if we don't know how to
 
1726                                  * handle it. This way an fs incompat
 
1727                                  * bit will not be necessary.
 
1729                                 status = ocfs2_free_extent_block(handle,
 
1730                                                                  tc->tc_ext_alloc_inode,
 
1731                                                                  tc->tc_ext_alloc_bh,
 
1744         BUG_ON(!delete_blk);
 
1745         status = ocfs2_truncate_log_append(osb, handle, delete_blk,
 
1754                 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
 
1756                 ocfs2_extent_map_drop(inode, 0);
 
1762  * It is expected that by the time you call this function,
 
1763  * inode->i_size and fe->i_size have been adjusted.
 
1765  * WARNING: This will kfree the truncate context
 
/*
 * Shrink the inode's on-disk allocation to target_i_clusters, the cluster
 * count derived from i_size_read(inode).
 *
 * The whole operation runs with OCFS2_I(inode)->ip_alloc_sem held for
 * write.  The buffer in tc->tc_last_eb_bh is moved into a local and the
 * context field is cleared.
 *
 * Work visible for one pass:
 *   - pick the bottom-most extent list: taken from the last extent block
 *     when the dinode has a tree, otherwise the dinode's own list;
 *   - when fe->i_last_eb_blk no longer matches the cached last_eb, re-read
 *     and validate that extent block;
 *   - compute clusters_to_del from the list's last record: the whole
 *     record when its e_cpos is at or beyond the target, otherwise only
 *     part of it;
 *   - under tl_inode->i_mutex, flush the truncate log first if
 *     ocfs2_truncate_log_needs_flush() reports it is needed;
 *   - start a transaction sized by ocfs2_calc_tree_trunc_credits(), stamp
 *     i_ctime / i_mtime, mark the inode dirty and call ocfs2_do_truncate();
 *   - drop the mutex and commit.
 *
 * BUG_ON fires if fe->i_clusters ends up below the target.  Afterwards the
 * semaphore is released, a truncate log flush is scheduled with a second
 * argument of 1, and the context is released through
 * ocfs2_free_truncate_context().
 *
 * NOTE(review): the statement taken when fe->i_clusters is still above
 * the target (presumably a jump back for another pass), the subtrahend in
 * the partial-record computation and the error paths are not visible
 * here -- confirm against the full function.
 */
1767 int ocfs2_commit_truncate(struct ocfs2_super *osb,
 
1768                           struct inode *inode,
 
1769                           struct buffer_head *fe_bh,
 
1770                           struct ocfs2_truncate_context *tc)
 
1772         int status, i, credits, tl_sem = 0;
 
1773         u32 clusters_to_del, target_i_clusters;
 
1775         struct ocfs2_dinode *fe;
 
1776         struct ocfs2_extent_block *eb;
 
1777         struct ocfs2_extent_list *el;
 
1778         struct buffer_head *last_eb_bh;
 
1779         handle_t *handle = NULL;
 
1780         struct inode *tl_inode = osb->osb_tl_inode;
 
1784         down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
1786         target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
 
1787                                                      i_size_read(inode));
 
1789         last_eb_bh = tc->tc_last_eb_bh;
 
1790         tc->tc_last_eb_bh = NULL;
 
1792         fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
1794         if (fe->id2.i_list.l_tree_depth) {
 
1795                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
 
1798                 el = &fe->id2.i_list;
 
1799         last_eb = le64_to_cpu(fe->i_last_eb_blk);
 
1801         mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
 
1802              "last_eb = %llu, fe->i_last_eb_blk = %llu, "
 
1803              "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
 
1804              le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
 
1805              (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
 
1806              le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
 
1808         if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
 
1809                 mlog(0, "last_eb changed!\n");
 
1810                 BUG_ON(!fe->id2.i_list.l_tree_depth);
 
1811                 last_eb = le64_to_cpu(fe->i_last_eb_blk);
 
1812                 /* i_last_eb_blk may have changed, read it if
 
1813                  * necessary. We don't have to worry about the
 
1814                  * truncate to zero case here (where there becomes no
 
1815                  * last_eb) because we never loop back after our work
 
1822                 status = ocfs2_read_block(osb, last_eb,
 
1823                                           &last_eb_bh, OCFS2_BH_CACHED,
 
1829                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
 
1830                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 
1831                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
 
1838         /* by now, el will point to the extent list on the bottom most
 
1839          * portion of this tree. */
 
1840         i = le16_to_cpu(el->l_next_free_rec) - 1;
 
1841         if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
 
1842                 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
 
1844                 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
 
1845                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
 
1848         mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
 
1850         mutex_lock(&tl_inode->i_mutex);
 
1852         /* ocfs2_truncate_log_needs_flush guarantees us at least one
 
1853          * record is free for use. If there isn't any, we flush to get
 
1854          * an empty truncate log.  */
 
1855         if (ocfs2_truncate_log_needs_flush(osb)) {
 
1856                 status = __ocfs2_flush_truncate_log(osb);
 
1863         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
 
1865         handle = ocfs2_start_trans(osb, credits);
 
1866         if (IS_ERR(handle)) {
 
1867                 status = PTR_ERR(handle);
 
1873         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 
1874         status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 
1878         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
 
1879                                    last_eb_bh, handle, tc);
 
1885         mutex_unlock(&tl_inode->i_mutex);
 
1888         ocfs2_commit_trans(osb, handle);
 
1891         BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
 
1892         if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
 
1895         up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
1897         ocfs2_schedule_truncate_log_flush(osb, 1);
 
1900                 mutex_unlock(&tl_inode->i_mutex);
 
1903                 ocfs2_commit_trans(osb, handle);
 
1908         /* This will drop the ext_alloc cluster lock for us */
 
1909         ocfs2_free_truncate_context(tc);
 
1917  * Expects the inode to already be locked. This will figure out which
 
1918  * inodes need to be locked and will put them on the returned truncate
 
/*
 * Build the truncate context (*tc) for a truncate of this inode.
 *
 * new_i_clusters is derived from i_size_read(inode).  A dinode whose
 * i_clusters is not larger than new_i_clusters is reported through
 * ocfs2_error() and mlog_meta_lvb().  Otherwise a zeroed
 * ocfs2_truncate_context is allocated with GFP_KERNEL.
 *
 * When the dinode has an extent tree, the last extent block
 * (fe->i_last_eb_blk) is read and validated.  If the first record of the
 * list checked there starts at or beyond new_i_clusters, metadata will
 * have to be deleted.  The last extent block buffer is stored in
 * tc_last_eb_bh.
 *
 * For the metadata-delete case the extent allocator system inode (slot 0)
 * is looked up, its i_mutex is taken and ocfs2_meta_lock() is acquired on
 * it.  Inode, buffer and a "locked" flag are recorded in the context so
 * that ocfs2_free_truncate_context() can undo them.
 *
 * NOTE(review): the first mlog() prints fe->i_clusters and fe->i_size
 * without endian conversion, unlike the rest of the function -- confirm
 * whether that is intended.  The error labels, the assignment of el and
 * the condition around the final ocfs2_free_truncate_context(*tc) call
 * are not visible here.
 */
1921 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 
1922                            struct inode *inode,
 
1923                            struct buffer_head *fe_bh,
 
1924                            struct ocfs2_truncate_context **tc)
 
1926         int status, metadata_delete;
 
1927         unsigned int new_i_clusters;
 
1928         struct ocfs2_dinode *fe;
 
1929         struct ocfs2_extent_block *eb;
 
1930         struct ocfs2_extent_list *el;
 
1931         struct buffer_head *last_eb_bh = NULL;
 
1932         struct inode *ext_alloc_inode = NULL;
 
1933         struct buffer_head *ext_alloc_bh = NULL;
 
1939         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
 
1940                                                   i_size_read(inode));
 
1941         fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
1943         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
 
1944              "%llu\n", fe->i_clusters, new_i_clusters,
 
1945              (unsigned long long)fe->i_size);
 
1947         if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
 
1948                 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
 
1949                             "%u and size %llu whereas struct inode has "
 
1950                             "cluster count %u and size %llu which caused an "
 
1951                             "invalid truncate to %u clusters.",
 
1952                             (unsigned long long)le64_to_cpu(fe->i_blkno),
 
1953                             le32_to_cpu(fe->i_clusters),
 
1954                             (unsigned long long)le64_to_cpu(fe->i_size),
 
1955                             OCFS2_I(inode)->ip_clusters, i_size_read(inode),
 
1957                 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
 
1962         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
 
1969         metadata_delete = 0;
 
1970         if (fe->id2.i_list.l_tree_depth) {
 
1971                 /* If we have a tree, then the truncate may result in
 
1972                  * metadata deletes. Figure this out from the
 
1973                  * rightmost leaf block.*/
 
1974                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
 
1975                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
 
1980                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
 
1981                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 
1982                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
 
1989                 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
 
1990                         metadata_delete = 1;
 
1993         (*tc)->tc_last_eb_bh = last_eb_bh;
 
1995         if (metadata_delete) {
 
1996                 mlog(0, "Will have to delete metadata for this trunc. "
 
1997                      "locking allocator.\n");
 
1998                 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
 
1999                 if (!ext_alloc_inode) {
 
2005                 mutex_lock(&ext_alloc_inode->i_mutex);
 
2006                 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
 
2008                 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
 
2013                 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
 
2014                 (*tc)->tc_ext_alloc_locked = 1;
 
2021                         ocfs2_free_truncate_context(*tc);
 
/*
 * Release everything a truncate context holds.
 *
 * If an extent allocator inode was recorded, its meta lock is dropped
 * (only when tc_ext_alloc_locked is set), its i_mutex is unlocked and the
 * inode reference is put.  The allocator buffer and the last extent block
 * buffer are each released with brelse() when present.
 *
 * NOTE(review): the end of the function is not visible here; presumably
 * the context itself is freed afterwards -- confirm.
 */
2028 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 
2030         if (tc->tc_ext_alloc_inode) {
 
2031                 if (tc->tc_ext_alloc_locked)
 
2032                         ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
 
2034                 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
 
2035                 iput(tc->tc_ext_alloc_inode);
 
2038         if (tc->tc_ext_alloc_bh)
 
2039                 brelse(tc->tc_ext_alloc_bh);
 
2041         if (tc->tc_last_eb_bh)
 
2042                 brelse(tc->tc_last_eb_bh);