ocfs2: block read meta ecc.
[linux-2.6] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
50 #define NOT_ALLOC_NEW_GROUP             0
51 #define ALLOC_NEW_GROUP                 1
52
53 #define OCFS2_MAX_INODES_TO_STEAL       1024
54
55 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
56 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
57 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_fill(handle_t *handle,
59                                   struct inode *alloc_inode,
60                                   struct buffer_head *bg_bh,
61                                   u64 group_blkno,
62                                   u16 my_chain,
63                                   struct ocfs2_chain_list *cl);
64 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65                                    struct inode *alloc_inode,
66                                    struct buffer_head *bh,
67                                    u64 max_block);
68
69 static int ocfs2_cluster_group_search(struct inode *inode,
70                                       struct buffer_head *group_bh,
71                                       u32 bits_wanted, u32 min_bits,
72                                       u64 max_block,
73                                       u16 *bit_off, u16 *bits_found);
74 static int ocfs2_block_group_search(struct inode *inode,
75                                     struct buffer_head *group_bh,
76                                     u32 bits_wanted, u32 min_bits,
77                                     u64 max_block,
78                                     u16 *bit_off, u16 *bits_found);
79 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80                                      struct ocfs2_alloc_context *ac,
81                                      handle_t *handle,
82                                      u32 bits_wanted,
83                                      u32 min_bits,
84                                      u16 *bit_off,
85                                      unsigned int *num_bits,
86                                      u64 *bg_blkno);
87 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
88                                          int nr);
89 static inline int ocfs2_block_group_set_bits(handle_t *handle,
90                                              struct inode *alloc_inode,
91                                              struct ocfs2_group_desc *bg,
92                                              struct buffer_head *group_bh,
93                                              unsigned int bit_off,
94                                              unsigned int num_bits);
95 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
96                                                struct inode *alloc_inode,
97                                                struct ocfs2_group_desc *bg,
98                                                struct buffer_head *group_bh,
99                                                unsigned int bit_off,
100                                                unsigned int num_bits);
101
102 static int ocfs2_relink_block_group(handle_t *handle,
103                                     struct inode *alloc_inode,
104                                     struct buffer_head *fe_bh,
105                                     struct buffer_head *bg_bh,
106                                     struct buffer_head *prev_bg_bh,
107                                     u16 chain);
108 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
109                                                      u32 wanted);
110 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
111                                                    u64 bg_blkno,
112                                                    u16 bg_bit_off);
113 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
114                                                 u64 data_blkno,
115                                                 u64 *bg_blkno,
116                                                 u16 *bg_bit_off);
117 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118                                              u32 bits_wanted, u64 max_block,
119                                              struct ocfs2_alloc_context **ac);
120
121 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
122 {
123         struct inode *inode = ac->ac_inode;
124
125         if (inode) {
126                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
127                         ocfs2_inode_unlock(inode, 1);
128
129                 mutex_unlock(&inode->i_mutex);
130
131                 iput(inode);
132                 ac->ac_inode = NULL;
133         }
134         brelse(ac->ac_bh);
135         ac->ac_bh = NULL;
136 }
137
/*
 * Tear down an allocation context completely: drop the resources it
 * holds via ocfs2_free_ac_resource(), then free the structure.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);

	kfree(ac);
}
143
144 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
145 {
146         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
147 }
148
/*
 * Report a group descriptor validation failure.  The expansion site
 * must have 'clean_error' and 'sb' in scope: with clean_error set the
 * message only goes to mlog(ML_ERROR), otherwise ocfs2_error() is
 * invoked on the superblock.
 */
#define do_error(msg, ...)						\
	do {								\
		if (!clean_error)					\
			ocfs2_error(sb, msg, ##__VA_ARGS__);		\
		else							\
			mlog(ML_ERROR, msg "\n", ##__VA_ARGS__);	\
	} while (0)
156
157 static int ocfs2_validate_gd_self(struct super_block *sb,
158                                   struct buffer_head *bh,
159                                   int clean_error)
160 {
161         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
162
163         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
164                 do_error("Group descriptor #%llu has bad signature %.*s",
165                          (unsigned long long)bh->b_blocknr, 7,
166                          gd->bg_signature);
167                 return -EINVAL;
168         }
169
170         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
171                 do_error("Group descriptor #%llu has an invalid bg_blkno "
172                          "of %llu",
173                          (unsigned long long)bh->b_blocknr,
174                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
175                 return -EINVAL;
176         }
177
178         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
179                 do_error("Group descriptor #%llu has an invalid "
180                          "fs_generation of #%u",
181                          (unsigned long long)bh->b_blocknr,
182                          le32_to_cpu(gd->bg_generation));
183                 return -EINVAL;
184         }
185
186         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
187                 do_error("Group descriptor #%llu has bit count %u but "
188                          "claims that %u are free",
189                          (unsigned long long)bh->b_blocknr,
190                          le16_to_cpu(gd->bg_bits),
191                          le16_to_cpu(gd->bg_free_bits_count));
192                 return -EINVAL;
193         }
194
195         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
196                 do_error("Group descriptor #%llu has bit count %u but "
197                          "max bitmap bits of %u",
198                          (unsigned long long)bh->b_blocknr,
199                          le16_to_cpu(gd->bg_bits),
200                          8 * le16_to_cpu(gd->bg_size));
201                 return -EINVAL;
202         }
203
204         return 0;
205 }
206
207 static int ocfs2_validate_gd_parent(struct super_block *sb,
208                                     struct ocfs2_dinode *di,
209                                     struct buffer_head *bh,
210                                     int clean_error)
211 {
212         unsigned int max_bits;
213         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
214
215         if (di->i_blkno != gd->bg_parent_dinode) {
216                 do_error("Group descriptor #%llu has bad parent "
217                          "pointer (%llu, expected %llu)",
218                          (unsigned long long)bh->b_blocknr,
219                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
220                          (unsigned long long)le64_to_cpu(di->i_blkno));
221                 return -EINVAL;
222         }
223
224         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
225         if (le16_to_cpu(gd->bg_bits) > max_bits) {
226                 do_error("Group descriptor #%llu has bit count of %u",
227                          (unsigned long long)bh->b_blocknr,
228                          le16_to_cpu(gd->bg_bits));
229                 return -EINVAL;
230         }
231
232         if (le16_to_cpu(gd->bg_chain) >=
233             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
234                 do_error("Group descriptor #%llu has bad chain %u",
235                          (unsigned long long)bh->b_blocknr,
236                          le16_to_cpu(gd->bg_chain));
237                 return -EINVAL;
238         }
239
240         return 0;
241 }
242
243 #undef do_error
244
245 /*
246  * This version only prints errors.  It does not fail the filesystem, and
247  * exists only for resize.
248  */
249 int ocfs2_check_group_descriptor(struct super_block *sb,
250                                  struct ocfs2_dinode *di,
251                                  struct buffer_head *bh)
252 {
253         int rc;
254         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
255
256         BUG_ON(!buffer_uptodate(bh));
257
258         /*
259          * If the ecc fails, we return the error but otherwise
260          * leave the filesystem running.  We know any error is
261          * local to this block.
262          */
263         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
264         if (!rc)
265                 rc = ocfs2_validate_gd_self(sb, bh, 1);
266         if (!rc)
267                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
268
269         return rc;
270 }
271
272 static int ocfs2_validate_group_descriptor(struct super_block *sb,
273                                            struct buffer_head *bh)
274 {
275         int rc;
276         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
277
278         mlog(0, "Validating group descriptor %llu\n",
279              (unsigned long long)bh->b_blocknr);
280
281         BUG_ON(!buffer_uptodate(bh));
282
283         /*
284          * If the ecc fails, we return the error but otherwise
285          * leave the filesystem running.  We know any error is
286          * local to this block.
287          */
288         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
289         if (rc)
290                 return rc;
291
292         /*
293          * Errors after here are fatal.
294          */
295
296         return ocfs2_validate_gd_self(sb, bh, 0);
297 }
298
299 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
300                                 u64 gd_blkno, struct buffer_head **bh)
301 {
302         int rc;
303         struct buffer_head *tmp = *bh;
304
305         rc = ocfs2_read_block(inode, gd_blkno, &tmp,
306                               ocfs2_validate_group_descriptor);
307         if (rc)
308                 goto out;
309
310         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
311         if (rc) {
312                 brelse(tmp);
313                 goto out;
314         }
315
316         /* If ocfs2_read_block() got us a new bh, pass it up. */
317         if (!*bh)
318                 *bh = tmp;
319
320 out:
321         return rc;
322 }
323
324 static int ocfs2_block_group_fill(handle_t *handle,
325                                   struct inode *alloc_inode,
326                                   struct buffer_head *bg_bh,
327                                   u64 group_blkno,
328                                   u16 my_chain,
329                                   struct ocfs2_chain_list *cl)
330 {
331         int status = 0;
332         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
333         struct super_block * sb = alloc_inode->i_sb;
334
335         mlog_entry_void();
336
337         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
338                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
339                             "b_blocknr (%llu)",
340                             (unsigned long long)group_blkno,
341                             (unsigned long long) bg_bh->b_blocknr);
342                 status = -EIO;
343                 goto bail;
344         }
345
346         status = ocfs2_journal_access(handle,
347                                       alloc_inode,
348                                       bg_bh,
349                                       OCFS2_JOURNAL_ACCESS_CREATE);
350         if (status < 0) {
351                 mlog_errno(status);
352                 goto bail;
353         }
354
355         memset(bg, 0, sb->s_blocksize);
356         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
357         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
358         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
359         bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
360         bg->bg_chain = cpu_to_le16(my_chain);
361         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
362         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
363         bg->bg_blkno = cpu_to_le64(group_blkno);
364         /* set the 1st bit in the bitmap to account for the descriptor block */
365         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
366         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
367
368         status = ocfs2_journal_dirty(handle, bg_bh);
369         if (status < 0)
370                 mlog_errno(status);
371
372         /* There is no need to zero out or otherwise initialize the
373          * other blocks in a group - All valid FS metadata in a block
374          * group stores the superblock fs_generation value at
375          * allocation time. */
376
377 bail:
378         mlog_exit(status);
379         return status;
380 }
381
382 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
383 {
384         u16 curr, best;
385
386         best = curr = 0;
387         while (curr < le16_to_cpu(cl->cl_count)) {
388                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
389                     le32_to_cpu(cl->cl_recs[curr].c_total))
390                         best = curr;
391                 curr++;
392         }
393         return best;
394 }
395
396 /*
397  * We expect the block group allocator to already be locked.
398  */
399 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
400                                    struct inode *alloc_inode,
401                                    struct buffer_head *bh,
402                                    u64 max_block)
403 {
404         int status, credits;
405         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
406         struct ocfs2_chain_list *cl;
407         struct ocfs2_alloc_context *ac = NULL;
408         handle_t *handle = NULL;
409         u32 bit_off, num_bits;
410         u16 alloc_rec;
411         u64 bg_blkno;
412         struct buffer_head *bg_bh = NULL;
413         struct ocfs2_group_desc *bg;
414
415         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
416
417         mlog_entry_void();
418
419         cl = &fe->id2.i_chain;
420         status = ocfs2_reserve_clusters_with_limit(osb,
421                                                    le16_to_cpu(cl->cl_cpg),
422                                                    max_block, &ac);
423         if (status < 0) {
424                 if (status != -ENOSPC)
425                         mlog_errno(status);
426                 goto bail;
427         }
428
429         credits = ocfs2_calc_group_alloc_credits(osb->sb,
430                                                  le16_to_cpu(cl->cl_cpg));
431         handle = ocfs2_start_trans(osb, credits);
432         if (IS_ERR(handle)) {
433                 status = PTR_ERR(handle);
434                 handle = NULL;
435                 mlog_errno(status);
436                 goto bail;
437         }
438
439         status = ocfs2_claim_clusters(osb,
440                                       handle,
441                                       ac,
442                                       le16_to_cpu(cl->cl_cpg),
443                                       &bit_off,
444                                       &num_bits);
445         if (status < 0) {
446                 if (status != -ENOSPC)
447                         mlog_errno(status);
448                 goto bail;
449         }
450
451         alloc_rec = ocfs2_find_smallest_chain(cl);
452
453         /* setup the group */
454         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
455         mlog(0, "new descriptor, record %u, at block %llu\n",
456              alloc_rec, (unsigned long long)bg_blkno);
457
458         bg_bh = sb_getblk(osb->sb, bg_blkno);
459         if (!bg_bh) {
460                 status = -EIO;
461                 mlog_errno(status);
462                 goto bail;
463         }
464         ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
465
466         status = ocfs2_block_group_fill(handle,
467                                         alloc_inode,
468                                         bg_bh,
469                                         bg_blkno,
470                                         alloc_rec,
471                                         cl);
472         if (status < 0) {
473                 mlog_errno(status);
474                 goto bail;
475         }
476
477         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
478
479         status = ocfs2_journal_access(handle, alloc_inode,
480                                       bh, OCFS2_JOURNAL_ACCESS_WRITE);
481         if (status < 0) {
482                 mlog_errno(status);
483                 goto bail;
484         }
485
486         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
487                      le16_to_cpu(bg->bg_free_bits_count));
488         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
489         cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
490         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
491                 le16_add_cpu(&cl->cl_next_free_rec, 1);
492
493         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
494                                         le16_to_cpu(bg->bg_free_bits_count));
495         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
496         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
497
498         status = ocfs2_journal_dirty(handle, bh);
499         if (status < 0) {
500                 mlog_errno(status);
501                 goto bail;
502         }
503
504         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
505         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
506         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
507                                              le32_to_cpu(fe->i_clusters)));
508         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
509         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
510         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
511
512         status = 0;
513 bail:
514         if (handle)
515                 ocfs2_commit_trans(osb, handle);
516
517         if (ac)
518                 ocfs2_free_alloc_context(ac);
519
520         brelse(bg_bh);
521
522         mlog_exit(status);
523         return status;
524 }
525
526 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
527                                        struct ocfs2_alloc_context *ac,
528                                        int type,
529                                        u32 slot,
530                                        int alloc_new_group)
531 {
532         int status;
533         u32 bits_wanted = ac->ac_bits_wanted;
534         struct inode *alloc_inode;
535         struct buffer_head *bh = NULL;
536         struct ocfs2_dinode *fe;
537         u32 free_bits;
538
539         mlog_entry_void();
540
541         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
542         if (!alloc_inode) {
543                 mlog_errno(-EINVAL);
544                 return -EINVAL;
545         }
546
547         mutex_lock(&alloc_inode->i_mutex);
548
549         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
550         if (status < 0) {
551                 mutex_unlock(&alloc_inode->i_mutex);
552                 iput(alloc_inode);
553
554                 mlog_errno(status);
555                 return status;
556         }
557
558         ac->ac_inode = alloc_inode;
559         ac->ac_alloc_slot = slot;
560
561         fe = (struct ocfs2_dinode *) bh->b_data;
562
563         /* The bh was validated by the inode read inside
564          * ocfs2_inode_lock().  Any corruption is a code bug. */
565         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
566
567         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
568                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
569                             (unsigned long long)le64_to_cpu(fe->i_blkno));
570                 status = -EIO;
571                 goto bail;
572         }
573
574         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
575                 le32_to_cpu(fe->id1.bitmap1.i_used);
576
577         if (bits_wanted > free_bits) {
578                 /* cluster bitmap never grows */
579                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
580                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
581                              bits_wanted, free_bits);
582                         status = -ENOSPC;
583                         goto bail;
584                 }
585
586                 if (alloc_new_group != ALLOC_NEW_GROUP) {
587                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
588                              "and we don't alloc a new group for it.\n",
589                              slot, bits_wanted, free_bits);
590                         status = -ENOSPC;
591                         goto bail;
592                 }
593
594                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
595                                                  ac->ac_max_block);
596                 if (status < 0) {
597                         if (status != -ENOSPC)
598                                 mlog_errno(status);
599                         goto bail;
600                 }
601                 atomic_inc(&osb->alloc_stats.bg_extends);
602
603                 /* You should never ask for this much metadata */
604                 BUG_ON(bits_wanted >
605                        (le32_to_cpu(fe->id1.bitmap1.i_total)
606                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
607         }
608
609         get_bh(bh);
610         ac->ac_bh = bh;
611 bail:
612         brelse(bh);
613
614         mlog_exit(status);
615         return status;
616 }
617
618 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
619                                       int blocks,
620                                       struct ocfs2_alloc_context **ac)
621 {
622         int status;
623         u32 slot;
624
625         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
626         if (!(*ac)) {
627                 status = -ENOMEM;
628                 mlog_errno(status);
629                 goto bail;
630         }
631
632         (*ac)->ac_bits_wanted = blocks;
633         (*ac)->ac_which = OCFS2_AC_USE_META;
634         slot = osb->slot_num;
635         (*ac)->ac_group_search = ocfs2_block_group_search;
636
637         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
638                                              EXTENT_ALLOC_SYSTEM_INODE,
639                                              slot, ALLOC_NEW_GROUP);
640         if (status < 0) {
641                 if (status != -ENOSPC)
642                         mlog_errno(status);
643                 goto bail;
644         }
645
646         status = 0;
647 bail:
648         if ((status < 0) && *ac) {
649                 ocfs2_free_alloc_context(*ac);
650                 *ac = NULL;
651         }
652
653         mlog_exit(status);
654         return status;
655 }
656
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed()
 * reports for @root_el.  See ocfs2_reserve_new_metadata_blocks() for
 * the meaning of @ac and the return value.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	int blocks_needed = ocfs2_extend_meta_needed(root_el);

	return ocfs2_reserve_new_metadata_blocks(osb, blocks_needed, ac);
}
665
666 static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
667                                               struct ocfs2_alloc_context *ac)
668 {
669         int i, status = -ENOSPC;
670         s16 slot = ocfs2_get_inode_steal_slot(osb);
671
672         /* Start to steal inodes from the first slot after ours. */
673         if (slot == OCFS2_INVALID_SLOT)
674                 slot = osb->slot_num + 1;
675
676         for (i = 0; i < osb->max_slots; i++, slot++) {
677                 if (slot == osb->max_slots)
678                         slot = 0;
679
680                 if (slot == osb->slot_num)
681                         continue;
682
683                 status = ocfs2_reserve_suballoc_bits(osb, ac,
684                                                      INODE_ALLOC_SYSTEM_INODE,
685                                                      slot, NOT_ALLOC_NEW_GROUP);
686                 if (status >= 0) {
687                         ocfs2_set_inode_steal_slot(osb, slot);
688                         break;
689                 }
690
691                 ocfs2_free_ac_resource(ac);
692         }
693
694         return status;
695 }
696
697 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
698                             struct ocfs2_alloc_context **ac)
699 {
700         int status;
701         s16 slot = ocfs2_get_inode_steal_slot(osb);
702
703         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
704         if (!(*ac)) {
705                 status = -ENOMEM;
706                 mlog_errno(status);
707                 goto bail;
708         }
709
710         (*ac)->ac_bits_wanted = 1;
711         (*ac)->ac_which = OCFS2_AC_USE_INODE;
712
713         (*ac)->ac_group_search = ocfs2_block_group_search;
714
715         /*
716          * stat(2) can't handle i_ino > 32bits, so we tell the
717          * lower levels not to allocate us a block group past that
718          * limit.  The 'inode64' mount option avoids this behavior.
719          */
720         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
721                 (*ac)->ac_max_block = (u32)~0U;
722
723         /*
724          * slot is set when we successfully steal inode from other nodes.
725          * It is reset in 3 places:
726          * 1. when we flush the truncate log
727          * 2. when we complete local alloc recovery.
728          * 3. when we successfully allocate from our own slot.
729          * After it is set, we will go on stealing inodes until we find the
730          * need to check our slots to see whether there is some space for us.
731          */
732         if (slot != OCFS2_INVALID_SLOT &&
733             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
734                 goto inode_steal;
735
736         atomic_set(&osb->s_num_inodes_stolen, 0);
737         status = ocfs2_reserve_suballoc_bits(osb, *ac,
738                                              INODE_ALLOC_SYSTEM_INODE,
739                                              osb->slot_num, ALLOC_NEW_GROUP);
740         if (status >= 0) {
741                 status = 0;
742
743                 /*
744                  * Some inodes must be freed by us, so try to allocate
745                  * from our own next time.
746                  */
747                 if (slot != OCFS2_INVALID_SLOT)
748                         ocfs2_init_inode_steal_slot(osb);
749                 goto bail;
750         } else if (status < 0 && status != -ENOSPC) {
751                 mlog_errno(status);
752                 goto bail;
753         }
754
755         ocfs2_free_ac_resource(*ac);
756
757 inode_steal:
758         status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
759         atomic_inc(&osb->s_num_inodes_stolen);
760         if (status < 0) {
761                 if (status != -ENOSPC)
762                         mlog_errno(status);
763                 goto bail;
764         }
765
766         status = 0;
767 bail:
768         if ((status < 0) && *ac) {
769                 ocfs2_free_alloc_context(*ac);
770                 *ac = NULL;
771         }
772
773         mlog_exit(status);
774         return status;
775 }
776
777 /* local alloc code has to do the same thing, so rather than do this
778  * twice.. */
779 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
780                                       struct ocfs2_alloc_context *ac)
781 {
782         int status;
783
784         ac->ac_which = OCFS2_AC_USE_MAIN;
785         ac->ac_group_search = ocfs2_cluster_group_search;
786
787         status = ocfs2_reserve_suballoc_bits(osb, ac,
788                                              GLOBAL_BITMAP_SYSTEM_INODE,
789                                              OCFS2_INVALID_SLOT,
790                                              ALLOC_NEW_GROUP);
791         if (status < 0 && status != -ENOSPC) {
792                 mlog_errno(status);
793                 goto bail;
794         }
795
796 bail:
797         return status;
798 }
799
800 /* Callers don't need to care which bitmap (local alloc or main) to
801  * use so we figure it out for them, but unfortunately this clutters
802  * things a bit. */
803 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
804                                              u32 bits_wanted, u64 max_block,
805                                              struct ocfs2_alloc_context **ac)
806 {
807         int status;
808
809         mlog_entry_void();
810
811         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
812         if (!(*ac)) {
813                 status = -ENOMEM;
814                 mlog_errno(status);
815                 goto bail;
816         }
817
818         (*ac)->ac_bits_wanted = bits_wanted;
819         (*ac)->ac_max_block = max_block;
820
821         status = -ENOSPC;
822         if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
823                 status = ocfs2_reserve_local_alloc_bits(osb,
824                                                         bits_wanted,
825                                                         *ac);
826                 if (status == -EFBIG) {
827                         /* The local alloc window is outside ac_max_block.
828                          * use the main bitmap. */
829                         status = -ENOSPC;
830                 } else if ((status < 0) && (status != -ENOSPC)) {
831                         mlog_errno(status);
832                         goto bail;
833                 }
834         }
835
836         if (status == -ENOSPC) {
837                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
838                 if (status < 0) {
839                         if (status != -ENOSPC)
840                                 mlog_errno(status);
841                         goto bail;
842                 }
843         }
844
845         status = 0;
846 bail:
847         if ((status < 0) && *ac) {
848                 ocfs2_free_alloc_context(*ac);
849                 *ac = NULL;
850         }
851
852         mlog_exit(status);
853         return status;
854 }
855
856 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
857                            u32 bits_wanted,
858                            struct ocfs2_alloc_context **ac)
859 {
860         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
861 }
862
863 /*
864  * More or less lifted from ext3. I'll leave their description below:
865  *
866  * "For ext3 allocations, we must not reuse any blocks which are
867  * allocated in the bitmap buffer's "last committed data" copy.  This
868  * prevents deletes from freeing up the page for reuse until we have
869  * committed the delete transaction.
870  *
871  * If we didn't do this, then deleting something and reallocating it as
872  * data would allow the old block to be overwritten before the
873  * transaction committed (because we force data to disk before commit).
874  * This would lead to corruption if we crashed between overwriting the
875  * data and committing the delete.
876  *
877  * @@@ We may want to make this allocation behaviour conditional on
878  * data-writes at some point, and disable it for metadata allocations or
879  * sync-data inodes."
880  *
881  * Note: OCFS2 already does this differently for metadata vs data
882  * allocations, as those bitmaps are separate and undo access is never
883  * called on a metadata group descriptor.
884  */
885 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
886                                          int nr)
887 {
888         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
889
890         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
891                 return 0;
892         if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
893                 return 1;
894
895         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
896         return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
897 }
898
899 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
900                                              struct buffer_head *bg_bh,
901                                              unsigned int bits_wanted,
902                                              unsigned int total_bits,
903                                              u16 *bit_off,
904                                              u16 *bits_found)
905 {
906         void *bitmap;
907         u16 best_offset, best_size;
908         int offset, start, found, status = 0;
909         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
910
911         /* Callers got this descriptor from
912          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
913         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
914
915         found = start = best_offset = best_size = 0;
916         bitmap = bg->bg_bitmap;
917
918         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
919                 if (offset == total_bits)
920                         break;
921
922                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
923                         /* We found a zero, but we can't use it as it
924                          * hasn't been put to disk yet! */
925                         found = 0;
926                         start = offset + 1;
927                 } else if (offset == start) {
928                         /* we found a zero */
929                         found++;
930                         /* move start to the next bit to test */
931                         start++;
932                 } else {
933                         /* got a zero after some ones */
934                         found = 1;
935                         start = offset + 1;
936                 }
937                 if (found > best_size) {
938                         best_size = found;
939                         best_offset = start - found;
940                 }
941                 /* we got everything we needed */
942                 if (found == bits_wanted) {
943                         /* mlog(0, "Found it all!\n"); */
944                         break;
945                 }
946         }
947
948         /* XXX: I think the first clause is equivalent to the second
949          *      - jlbec */
950         if (found == bits_wanted) {
951                 *bit_off = start - found;
952                 *bits_found = found;
953         } else if (best_size) {
954                 *bit_off = best_offset;
955                 *bits_found = best_size;
956         } else {
957                 status = -ENOSPC;
958                 /* No error log here -- see the comment above
959                  * ocfs2_test_bg_bit_allocatable */
960         }
961
962         return status;
963 }
964
965 static inline int ocfs2_block_group_set_bits(handle_t *handle,
966                                              struct inode *alloc_inode,
967                                              struct ocfs2_group_desc *bg,
968                                              struct buffer_head *group_bh,
969                                              unsigned int bit_off,
970                                              unsigned int num_bits)
971 {
972         int status;
973         void *bitmap = bg->bg_bitmap;
974         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
975
976         mlog_entry_void();
977
978         /* All callers get the descriptor via
979          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
980         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
981         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
982
983         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
984              num_bits);
985
986         if (ocfs2_is_cluster_bitmap(alloc_inode))
987                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
988
989         status = ocfs2_journal_access(handle,
990                                       alloc_inode,
991                                       group_bh,
992                                       journal_type);
993         if (status < 0) {
994                 mlog_errno(status);
995                 goto bail;
996         }
997
998         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
999
1000         while(num_bits--)
1001                 ocfs2_set_bit(bit_off++, bitmap);
1002
1003         status = ocfs2_journal_dirty(handle,
1004                                      group_bh);
1005         if (status < 0) {
1006                 mlog_errno(status);
1007                 goto bail;
1008         }
1009
1010 bail:
1011         mlog_exit(status);
1012         return status;
1013 }
1014
1015 /* find the one with the most empty bits */
1016 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1017 {
1018         u16 curr, best;
1019
1020         BUG_ON(!cl->cl_next_free_rec);
1021
1022         best = curr = 0;
1023         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1024                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1025                     le32_to_cpu(cl->cl_recs[best].c_free))
1026                         best = curr;
1027                 curr++;
1028         }
1029
1030         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1031         return best;
1032 }
1033
1034 static int ocfs2_relink_block_group(handle_t *handle,
1035                                     struct inode *alloc_inode,
1036                                     struct buffer_head *fe_bh,
1037                                     struct buffer_head *bg_bh,
1038                                     struct buffer_head *prev_bg_bh,
1039                                     u16 chain)
1040 {
1041         int status;
1042         /* there is a really tiny chance the journal calls could fail,
1043          * but we wouldn't want inconsistent blocks in *any* case. */
1044         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1045         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1046         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1047         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1048
1049         /* The caller got these descriptors from
1050          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1051         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1052         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1053
1054         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1055              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1056              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1057              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1058
1059         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1060         bg_ptr = le64_to_cpu(bg->bg_next_group);
1061         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1062
1063         status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
1064                                       OCFS2_JOURNAL_ACCESS_WRITE);
1065         if (status < 0) {
1066                 mlog_errno(status);
1067                 goto out_rollback;
1068         }
1069
1070         prev_bg->bg_next_group = bg->bg_next_group;
1071
1072         status = ocfs2_journal_dirty(handle, prev_bg_bh);
1073         if (status < 0) {
1074                 mlog_errno(status);
1075                 goto out_rollback;
1076         }
1077
1078         status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
1079                                       OCFS2_JOURNAL_ACCESS_WRITE);
1080         if (status < 0) {
1081                 mlog_errno(status);
1082                 goto out_rollback;
1083         }
1084
1085         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1086
1087         status = ocfs2_journal_dirty(handle, bg_bh);
1088         if (status < 0) {
1089                 mlog_errno(status);
1090                 goto out_rollback;
1091         }
1092
1093         status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
1094                                       OCFS2_JOURNAL_ACCESS_WRITE);
1095         if (status < 0) {
1096                 mlog_errno(status);
1097                 goto out_rollback;
1098         }
1099
1100         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1101
1102         status = ocfs2_journal_dirty(handle, fe_bh);
1103         if (status < 0) {
1104                 mlog_errno(status);
1105                 goto out_rollback;
1106         }
1107
1108         status = 0;
1109 out_rollback:
1110         if (status < 0) {
1111                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1112                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1113                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1114         }
1115
1116         mlog_exit(status);
1117         return status;
1118 }
1119
1120 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1121                                                      u32 wanted)
1122 {
1123         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1124 }
1125
1126 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1127  * value on error. */
1128 static int ocfs2_cluster_group_search(struct inode *inode,
1129                                       struct buffer_head *group_bh,
1130                                       u32 bits_wanted, u32 min_bits,
1131                                       u64 max_block,
1132                                       u16 *bit_off, u16 *bits_found)
1133 {
1134         int search = -ENOSPC;
1135         int ret;
1136         u64 blkoff;
1137         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1138         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1139         u16 tmp_off, tmp_found;
1140         unsigned int max_bits, gd_cluster_off;
1141
1142         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1143
1144         if (gd->bg_free_bits_count) {
1145                 max_bits = le16_to_cpu(gd->bg_bits);
1146
1147                 /* Tail groups in cluster bitmaps which aren't cpg
1148                  * aligned are prone to partial extention by a failed
1149                  * fs resize. If the file system resize never got to
1150                  * update the dinode cluster count, then we don't want
1151                  * to trust any clusters past it, regardless of what
1152                  * the group descriptor says. */
1153                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1154                                                           le64_to_cpu(gd->bg_blkno));
1155                 if ((gd_cluster_off + max_bits) >
1156                     OCFS2_I(inode)->ip_clusters) {
1157                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1158                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1159                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1160                              le16_to_cpu(gd->bg_bits),
1161                              OCFS2_I(inode)->ip_clusters, max_bits);
1162                 }
1163
1164                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1165                                                         group_bh, bits_wanted,
1166                                                         max_bits,
1167                                                         &tmp_off, &tmp_found);
1168                 if (ret)
1169                         return ret;
1170
1171                 if (max_block) {
1172                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1173                                                           gd_cluster_off +
1174                                                           tmp_off + tmp_found);
1175                         mlog(0, "Checking %llu against %llu\n",
1176                              (unsigned long long)blkoff,
1177                              (unsigned long long)max_block);
1178                         if (blkoff > max_block)
1179                                 return -ENOSPC;
1180                 }
1181
1182                 /* ocfs2_block_group_find_clear_bits() might
1183                  * return success, but we still want to return
1184                  * -ENOSPC unless it found the minimum number
1185                  * of bits. */
1186                 if (min_bits <= tmp_found) {
1187                         *bit_off = tmp_off;
1188                         *bits_found = tmp_found;
1189                         search = 0; /* success */
1190                 } else if (tmp_found) {
1191                         /*
1192                          * Don't show bits which we'll be returning
1193                          * for allocation to the local alloc bitmap.
1194                          */
1195                         ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1196                 }
1197         }
1198
1199         return search;
1200 }
1201
1202 static int ocfs2_block_group_search(struct inode *inode,
1203                                     struct buffer_head *group_bh,
1204                                     u32 bits_wanted, u32 min_bits,
1205                                     u64 max_block,
1206                                     u16 *bit_off, u16 *bits_found)
1207 {
1208         int ret = -ENOSPC;
1209         u64 blkoff;
1210         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1211
1212         BUG_ON(min_bits != 1);
1213         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1214
1215         if (bg->bg_free_bits_count) {
1216                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1217                                                         group_bh, bits_wanted,
1218                                                         le16_to_cpu(bg->bg_bits),
1219                                                         bit_off, bits_found);
1220                 if (!ret && max_block) {
1221                         blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1222                                 *bits_found;
1223                         mlog(0, "Checking %llu against %llu\n",
1224                              (unsigned long long)blkoff,
1225                              (unsigned long long)max_block);
1226                         if (blkoff > max_block)
1227                                 ret = -ENOSPC;
1228                 }
1229         }
1230
1231         return ret;
1232 }
1233
1234 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1235                                        handle_t *handle,
1236                                        struct buffer_head *di_bh,
1237                                        u32 num_bits,
1238                                        u16 chain)
1239 {
1240         int ret;
1241         u32 tmp_used;
1242         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1243         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1244
1245         ret = ocfs2_journal_access(handle, inode, di_bh,
1246                                    OCFS2_JOURNAL_ACCESS_WRITE);
1247         if (ret < 0) {
1248                 mlog_errno(ret);
1249                 goto out;
1250         }
1251
1252         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1253         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1254         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1255
1256         ret = ocfs2_journal_dirty(handle, di_bh);
1257         if (ret < 0)
1258                 mlog_errno(ret);
1259
1260 out:
1261         return ret;
1262 }
1263
1264 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1265                                   handle_t *handle,
1266                                   u32 bits_wanted,
1267                                   u32 min_bits,
1268                                   u16 *bit_off,
1269                                   unsigned int *num_bits,
1270                                   u64 gd_blkno,
1271                                   u16 *bits_left)
1272 {
1273         int ret;
1274         u16 found;
1275         struct buffer_head *group_bh = NULL;
1276         struct ocfs2_group_desc *gd;
1277         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1278         struct inode *alloc_inode = ac->ac_inode;
1279
1280         ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1281                                           &group_bh);
1282         if (ret < 0) {
1283                 mlog_errno(ret);
1284                 return ret;
1285         }
1286
1287         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1288         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1289                                   ac->ac_max_block, bit_off, &found);
1290         if (ret < 0) {
1291                 if (ret != -ENOSPC)
1292                         mlog_errno(ret);
1293                 goto out;
1294         }
1295
1296         *num_bits = found;
1297
1298         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1299                                                *num_bits,
1300                                                le16_to_cpu(gd->bg_chain));
1301         if (ret < 0) {
1302                 mlog_errno(ret);
1303                 goto out;
1304         }
1305
1306         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1307                                          *bit_off, *num_bits);
1308         if (ret < 0)
1309                 mlog_errno(ret);
1310
1311         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1312
1313 out:
1314         brelse(group_bh);
1315
1316         return ret;
1317 }
1318
1319 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1320                               handle_t *handle,
1321                               u32 bits_wanted,
1322                               u32 min_bits,
1323                               u16 *bit_off,
1324                               unsigned int *num_bits,
1325                               u64 *bg_blkno,
1326                               u16 *bits_left)
1327 {
1328         int status;
1329         u16 chain, tmp_bits;
1330         u32 tmp_used;
1331         u64 next_group;
1332         struct inode *alloc_inode = ac->ac_inode;
1333         struct buffer_head *group_bh = NULL;
1334         struct buffer_head *prev_group_bh = NULL;
1335         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1336         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1337         struct ocfs2_group_desc *bg;
1338
1339         chain = ac->ac_chain;
1340         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1341              bits_wanted, chain,
1342              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1343
1344         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1345                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1346                                              &group_bh);
1347         if (status < 0) {
1348                 mlog_errno(status);
1349                 goto bail;
1350         }
1351         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1352
1353         status = -ENOSPC;
1354         /* for now, the chain search is a bit simplistic. We just use
1355          * the 1st group with any empty bits. */
1356         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1357                                              bits_wanted, min_bits,
1358                                              ac->ac_max_block, bit_off,
1359                                              &tmp_bits)) == -ENOSPC) {
1360                 if (!bg->bg_next_group)
1361                         break;
1362
1363                 brelse(prev_group_bh);
1364                 prev_group_bh = NULL;
1365
1366                 next_group = le64_to_cpu(bg->bg_next_group);
1367                 prev_group_bh = group_bh;
1368                 group_bh = NULL;
1369                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1370                                                      next_group, &group_bh);
1371                 if (status < 0) {
1372                         mlog_errno(status);
1373                         goto bail;
1374                 }
1375                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1376         }
1377         if (status < 0) {
1378                 if (status != -ENOSPC)
1379                         mlog_errno(status);
1380                 goto bail;
1381         }
1382
1383         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1384              tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1385
1386         *num_bits = tmp_bits;
1387
1388         BUG_ON(*num_bits == 0);
1389
1390         /*
1391          * Keep track of previous block descriptor read. When
1392          * we find a target, if we have read more than X
1393          * number of descriptors, and the target is reasonably
1394          * empty, relink him to top of his chain.
1395          *
1396          * We've read 0 extra blocks and only send one more to
1397          * the transaction, yet the next guy to search has a
1398          * much easier time.
1399          *
1400          * Do this *after* figuring out how many bits we're taking out
1401          * of our target group.
1402          */
1403         if (ac->ac_allow_chain_relink &&
1404             (prev_group_bh) &&
1405             (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1406                 status = ocfs2_relink_block_group(handle, alloc_inode,
1407                                                   ac->ac_bh, group_bh,
1408                                                   prev_group_bh, chain);
1409                 if (status < 0) {
1410                         mlog_errno(status);
1411                         goto bail;
1412                 }
1413         }
1414
1415         /* Ok, claim our bits now: set the info on dinode, chainlist
1416          * and then the group */
1417         status = ocfs2_journal_access(handle,
1418                                       alloc_inode,
1419                                       ac->ac_bh,
1420                                       OCFS2_JOURNAL_ACCESS_WRITE);
1421         if (status < 0) {
1422                 mlog_errno(status);
1423                 goto bail;
1424         }
1425
1426         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1427         fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1428         le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1429
1430         status = ocfs2_journal_dirty(handle,
1431                                      ac->ac_bh);
1432         if (status < 0) {
1433                 mlog_errno(status);
1434                 goto bail;
1435         }
1436
1437         status = ocfs2_block_group_set_bits(handle,
1438                                             alloc_inode,
1439                                             bg,
1440                                             group_bh,
1441                                             *bit_off,
1442                                             *num_bits);
1443         if (status < 0) {
1444                 mlog_errno(status);
1445                 goto bail;
1446         }
1447
1448         mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1449              (unsigned long long)le64_to_cpu(fe->i_blkno));
1450
1451         *bg_blkno = le64_to_cpu(bg->bg_blkno);
1452         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1453 bail:
1454         brelse(group_bh);
1455         brelse(prev_group_bh);
1456
1457         mlog_exit(status);
1458         return status;
1459 }
1460
1461 /* will give out up to bits_wanted contiguous bits. */
1462 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1463                                      struct ocfs2_alloc_context *ac,
1464                                      handle_t *handle,
1465                                      u32 bits_wanted,
1466                                      u32 min_bits,
1467                                      u16 *bit_off,
1468                                      unsigned int *num_bits,
1469                                      u64 *bg_blkno)
1470 {
1471         int status;
1472         u16 victim, i;
1473         u16 bits_left = 0;
1474         u64 hint_blkno = ac->ac_last_group;
1475         struct ocfs2_chain_list *cl;
1476         struct ocfs2_dinode *fe;
1477
1478         mlog_entry_void();
1479
1480         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1481         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1482         BUG_ON(!ac->ac_bh);
1483
1484         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1485
1486         /* The bh was validated by the inode read during
1487          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1488         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1489
1490         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1491             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1492                 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1493                             "bits but only %u total.",
1494                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1495                             le32_to_cpu(fe->id1.bitmap1.i_used),
1496                             le32_to_cpu(fe->id1.bitmap1.i_total));
1497                 status = -EIO;
1498                 goto bail;
1499         }
1500
1501         if (hint_blkno) {
1502                 /* Attempt to short-circuit the usual search mechanism
1503                  * by jumping straight to the most recently used
1504                  * allocation group. This helps us mantain some
1505                  * contiguousness across allocations. */
1506                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1507                                                 min_bits, bit_off, num_bits,
1508                                                 hint_blkno, &bits_left);
1509                 if (!status) {
1510                         /* Be careful to update *bg_blkno here as the
1511                          * caller is expecting it to be filled in, and
1512                          * ocfs2_search_one_group() won't do that for
1513                          * us. */
1514                         *bg_blkno = hint_blkno;
1515                         goto set_hint;
1516                 }
1517                 if (status < 0 && status != -ENOSPC) {
1518                         mlog_errno(status);
1519                         goto bail;
1520                 }
1521         }
1522
1523         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1524
1525         victim = ocfs2_find_victim_chain(cl);
1526         ac->ac_chain = victim;
1527         ac->ac_allow_chain_relink = 1;
1528
1529         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1530                                     num_bits, bg_blkno, &bits_left);
1531         if (!status)
1532                 goto set_hint;
1533         if (status < 0 && status != -ENOSPC) {
1534                 mlog_errno(status);
1535                 goto bail;
1536         }
1537
1538         mlog(0, "Search of victim chain %u came up with nothing, "
1539              "trying all chains now.\n", victim);
1540
1541         /* If we didn't pick a good victim, then just default to
1542          * searching each chain in order. Don't allow chain relinking
1543          * because we only calculate enough journal credits for one
1544          * relink per alloc. */
1545         ac->ac_allow_chain_relink = 0;
1546         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1547                 if (i == victim)
1548                         continue;
1549                 if (!cl->cl_recs[i].c_free)
1550                         continue;
1551
1552                 ac->ac_chain = i;
1553                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1554                                             bit_off, num_bits, bg_blkno,
1555                                             &bits_left);
1556                 if (!status)
1557                         break;
1558                 if (status < 0 && status != -ENOSPC) {
1559                         mlog_errno(status);
1560                         goto bail;
1561                 }
1562         }
1563
1564 set_hint:
1565         if (status != -ENOSPC) {
1566                 /* If the next search of this group is not likely to
1567                  * yield a suitable extent, then we reset the last
1568                  * group hint so as to not waste a disk read */
1569                 if (bits_left < min_bits)
1570                         ac->ac_last_group = 0;
1571                 else
1572                         ac->ac_last_group = *bg_blkno;
1573         }
1574
1575 bail:
1576         mlog_exit(status);
1577         return status;
1578 }
1579
1580 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1581                          handle_t *handle,
1582                          struct ocfs2_alloc_context *ac,
1583                          u32 bits_wanted,
1584                          u16 *suballoc_bit_start,
1585                          unsigned int *num_bits,
1586                          u64 *blkno_start)
1587 {
1588         int status;
1589         u64 bg_blkno;
1590
1591         BUG_ON(!ac);
1592         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1593         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1594
1595         status = ocfs2_claim_suballoc_bits(osb,
1596                                            ac,
1597                                            handle,
1598                                            bits_wanted,
1599                                            1,
1600                                            suballoc_bit_start,
1601                                            num_bits,
1602                                            &bg_blkno);
1603         if (status < 0) {
1604                 mlog_errno(status);
1605                 goto bail;
1606         }
1607         atomic_inc(&osb->alloc_stats.bg_allocs);
1608
1609         *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1610         ac->ac_bits_given += (*num_bits);
1611         status = 0;
1612 bail:
1613         mlog_exit(status);
1614         return status;
1615 }
1616
1617 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1618                           handle_t *handle,
1619                           struct ocfs2_alloc_context *ac,
1620                           u16 *suballoc_bit,
1621                           u64 *fe_blkno)
1622 {
1623         int status;
1624         unsigned int num_bits;
1625         u64 bg_blkno;
1626
1627         mlog_entry_void();
1628
1629         BUG_ON(!ac);
1630         BUG_ON(ac->ac_bits_given != 0);
1631         BUG_ON(ac->ac_bits_wanted != 1);
1632         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1633
1634         status = ocfs2_claim_suballoc_bits(osb,
1635                                            ac,
1636                                            handle,
1637                                            1,
1638                                            1,
1639                                            suballoc_bit,
1640                                            &num_bits,
1641                                            &bg_blkno);
1642         if (status < 0) {
1643                 mlog_errno(status);
1644                 goto bail;
1645         }
1646         atomic_inc(&osb->alloc_stats.bg_allocs);
1647
1648         BUG_ON(num_bits != 1);
1649
1650         *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1651         ac->ac_bits_given++;
1652         status = 0;
1653 bail:
1654         mlog_exit(status);
1655         return status;
1656 }
1657
1658 /* translate a group desc. blkno and it's bitmap offset into
1659  * disk cluster offset. */
1660 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1661                                                    u64 bg_blkno,
1662                                                    u16 bg_bit_off)
1663 {
1664         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1665         u32 cluster = 0;
1666
1667         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1668
1669         if (bg_blkno != osb->first_cluster_group_blkno)
1670                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1671         cluster += (u32) bg_bit_off;
1672         return cluster;
1673 }
1674
1675 /* given a cluster offset, calculate which block group it belongs to
1676  * and return that block offset. */
1677 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1678 {
1679         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1680         u32 group_no;
1681
1682         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1683
1684         group_no = cluster / osb->bitmap_cpg;
1685         if (!group_no)
1686                 return osb->first_cluster_group_blkno;
1687         return ocfs2_clusters_to_blocks(inode->i_sb,
1688                                         group_no * osb->bitmap_cpg);
1689 }
1690
1691 /* given the block number of a cluster start, calculate which cluster
1692  * group and descriptor bitmap offset that corresponds to. */
1693 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1694                                                 u64 data_blkno,
1695                                                 u64 *bg_blkno,
1696                                                 u16 *bg_bit_off)
1697 {
1698         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1699         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1700
1701         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1702
1703         *bg_blkno = ocfs2_which_cluster_group(inode,
1704                                               data_cluster);
1705
1706         if (*bg_blkno == osb->first_cluster_group_blkno)
1707                 *bg_bit_off = (u16) data_cluster;
1708         else
1709                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1710                                                              data_blkno - *bg_blkno);
1711 }
1712
1713 /*
1714  * min_bits - minimum contiguous chunk from this total allocation we
1715  * can handle. set to what we asked for originally for a full
1716  * contig. allocation, set to '1' to indicate we can deal with extents
1717  * of any size.
1718  */
1719 int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1720                            handle_t *handle,
1721                            struct ocfs2_alloc_context *ac,
1722                            u32 min_clusters,
1723                            u32 max_clusters,
1724                            u32 *cluster_start,
1725                            u32 *num_clusters)
1726 {
1727         int status;
1728         unsigned int bits_wanted = max_clusters;
1729         u64 bg_blkno = 0;
1730         u16 bg_bit_off;
1731
1732         mlog_entry_void();
1733
1734         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1735
1736         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1737                && ac->ac_which != OCFS2_AC_USE_MAIN);
1738
1739         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1740                 status = ocfs2_claim_local_alloc_bits(osb,
1741                                                       handle,
1742                                                       ac,
1743                                                       bits_wanted,
1744                                                       cluster_start,
1745                                                       num_clusters);
1746                 if (!status)
1747                         atomic_inc(&osb->alloc_stats.local_data);
1748         } else {
1749                 if (min_clusters > (osb->bitmap_cpg - 1)) {
1750                         /* The only paths asking for contiguousness
1751                          * should know about this already. */
1752                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1753                              "group bitmap size %u!\n", min_clusters,
1754                              osb->bitmap_cpg);
1755                         status = -ENOSPC;
1756                         goto bail;
1757                 }
1758                 /* clamp the current request down to a realistic size. */
1759                 if (bits_wanted > (osb->bitmap_cpg - 1))
1760                         bits_wanted = osb->bitmap_cpg - 1;
1761
1762                 status = ocfs2_claim_suballoc_bits(osb,
1763                                                    ac,
1764                                                    handle,
1765                                                    bits_wanted,
1766                                                    min_clusters,
1767                                                    &bg_bit_off,
1768                                                    num_clusters,
1769                                                    &bg_blkno);
1770                 if (!status) {
1771                         *cluster_start =
1772                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1773                                                                  bg_blkno,
1774                                                                  bg_bit_off);
1775                         atomic_inc(&osb->alloc_stats.bitmap_data);
1776                 }
1777         }
1778         if (status < 0) {
1779                 if (status != -ENOSPC)
1780                         mlog_errno(status);
1781                 goto bail;
1782         }
1783
1784         ac->ac_bits_given += *num_clusters;
1785
1786 bail:
1787         mlog_exit(status);
1788         return status;
1789 }
1790
1791 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1792                          handle_t *handle,
1793                          struct ocfs2_alloc_context *ac,
1794                          u32 min_clusters,
1795                          u32 *cluster_start,
1796                          u32 *num_clusters)
1797 {
1798         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1799
1800         return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1801                                       bits_wanted, cluster_start, num_clusters);
1802 }
1803
1804 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1805                                                struct inode *alloc_inode,
1806                                                struct ocfs2_group_desc *bg,
1807                                                struct buffer_head *group_bh,
1808                                                unsigned int bit_off,
1809                                                unsigned int num_bits)
1810 {
1811         int status;
1812         unsigned int tmp;
1813         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1814         struct ocfs2_group_desc *undo_bg = NULL;
1815
1816         mlog_entry_void();
1817
1818         /* The caller got this descriptor from
1819          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1820         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1821
1822         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1823
1824         if (ocfs2_is_cluster_bitmap(alloc_inode))
1825                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1826
1827         status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1828                                       journal_type);
1829         if (status < 0) {
1830                 mlog_errno(status);
1831                 goto bail;
1832         }
1833
1834         if (ocfs2_is_cluster_bitmap(alloc_inode))
1835                 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1836
1837         tmp = num_bits;
1838         while(tmp--) {
1839                 ocfs2_clear_bit((bit_off + tmp),
1840                                 (unsigned long *) bg->bg_bitmap);
1841                 if (ocfs2_is_cluster_bitmap(alloc_inode))
1842                         ocfs2_set_bit(bit_off + tmp,
1843                                       (unsigned long *) undo_bg->bg_bitmap);
1844         }
1845         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1846
1847         status = ocfs2_journal_dirty(handle, group_bh);
1848         if (status < 0)
1849                 mlog_errno(status);
1850 bail:
1851         return status;
1852 }
1853
1854 /*
1855  * expects the suballoc inode to already be locked.
1856  */
1857 int ocfs2_free_suballoc_bits(handle_t *handle,
1858                              struct inode *alloc_inode,
1859                              struct buffer_head *alloc_bh,
1860                              unsigned int start_bit,
1861                              u64 bg_blkno,
1862                              unsigned int count)
1863 {
1864         int status = 0;
1865         u32 tmp_used;
1866         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1867         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1868         struct buffer_head *group_bh = NULL;
1869         struct ocfs2_group_desc *group;
1870
1871         mlog_entry_void();
1872
1873         /* The alloc_bh comes from ocfs2_free_dinode() or
1874          * ocfs2_free_clusters().  The callers have all locked the
1875          * allocator and gotten alloc_bh from the lock call.  This
1876          * validates the dinode buffer.  Any corruption that has happended
1877          * is a code bug. */
1878         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1879         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1880
1881         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1882              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1883              (unsigned long long)bg_blkno, start_bit);
1884
1885         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
1886                                              &group_bh);
1887         if (status < 0) {
1888                 mlog_errno(status);
1889                 goto bail;
1890         }
1891         group = (struct ocfs2_group_desc *) group_bh->b_data;
1892
1893         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1894
1895         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1896                                               group, group_bh,
1897                                               start_bit, count);
1898         if (status < 0) {
1899                 mlog_errno(status);
1900                 goto bail;
1901         }
1902
1903         status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1904                                       OCFS2_JOURNAL_ACCESS_WRITE);
1905         if (status < 0) {
1906                 mlog_errno(status);
1907                 goto bail;
1908         }
1909
1910         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1911                      count);
1912         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1913         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1914
1915         status = ocfs2_journal_dirty(handle, alloc_bh);
1916         if (status < 0) {
1917                 mlog_errno(status);
1918                 goto bail;
1919         }
1920
1921 bail:
1922         brelse(group_bh);
1923
1924         mlog_exit(status);
1925         return status;
1926 }
1927
1928 int ocfs2_free_dinode(handle_t *handle,
1929                       struct inode *inode_alloc_inode,
1930                       struct buffer_head *inode_alloc_bh,
1931                       struct ocfs2_dinode *di)
1932 {
1933         u64 blk = le64_to_cpu(di->i_blkno);
1934         u16 bit = le16_to_cpu(di->i_suballoc_bit);
1935         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1936
1937         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1938                                         inode_alloc_bh, bit, bg_blkno, 1);
1939 }
1940
1941 int ocfs2_free_clusters(handle_t *handle,
1942                        struct inode *bitmap_inode,
1943                        struct buffer_head *bitmap_bh,
1944                        u64 start_blk,
1945                        unsigned int num_clusters)
1946 {
1947         int status;
1948         u16 bg_start_bit;
1949         u64 bg_blkno;
1950         struct ocfs2_dinode *fe;
1951
1952         /* You can't ever have a contiguous set of clusters
1953          * bigger than a block group bitmap so we never have to worry
1954          * about looping on them. */
1955
1956         mlog_entry_void();
1957
1958         /* This is expensive. We can safely remove once this stuff has
1959          * gotten tested really well. */
1960         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1961
1962         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1963
1964         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1965                                      &bg_start_bit);
1966
1967         mlog(0, "want to free %u clusters starting at block %llu\n",
1968              num_clusters, (unsigned long long)start_blk);
1969         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1970              (unsigned long long)bg_blkno, bg_start_bit);
1971
1972         status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1973                                           bg_start_bit, bg_blkno,
1974                                           num_clusters);
1975         if (status < 0) {
1976                 mlog_errno(status);
1977                 goto out;
1978         }
1979
1980         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
1981                                          num_clusters);
1982
1983 out:
1984         mlog_exit(status);
1985         return status;
1986 }
1987
1988 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1989 {
1990         printk("Block Group:\n");
1991         printk("bg_signature:       %s\n", bg->bg_signature);
1992         printk("bg_size:            %u\n", bg->bg_size);
1993         printk("bg_bits:            %u\n", bg->bg_bits);
1994         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1995         printk("bg_chain:           %u\n", bg->bg_chain);
1996         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
1997         printk("bg_next_group:      %llu\n",
1998                (unsigned long long)bg->bg_next_group);
1999         printk("bg_parent_dinode:   %llu\n",
2000                (unsigned long long)bg->bg_parent_dinode);
2001         printk("bg_blkno:           %llu\n",
2002                (unsigned long long)bg->bg_blkno);
2003 }
2004
2005 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2006 {
2007         int i;
2008
2009         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2010         printk("i_signature:                  %s\n", fe->i_signature);
2011         printk("i_size:                       %llu\n",
2012                (unsigned long long)fe->i_size);
2013         printk("i_clusters:                   %u\n", fe->i_clusters);
2014         printk("i_generation:                 %u\n",
2015                le32_to_cpu(fe->i_generation));
2016         printk("id1.bitmap1.i_used:           %u\n",
2017                le32_to_cpu(fe->id1.bitmap1.i_used));
2018         printk("id1.bitmap1.i_total:          %u\n",
2019                le32_to_cpu(fe->id1.bitmap1.i_total));
2020         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2021         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2022         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2023         printk("id2.i_chain.cl_next_free_rec: %u\n",
2024                fe->id2.i_chain.cl_next_free_rec);
2025         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2026                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2027                        fe->id2.i_chain.cl_recs[i].c_free);
2028                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2029                        fe->id2.i_chain.cl_recs[i].c_total);
2030                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2031                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2032         }
2033 }
2034
2035 /*
2036  * For a given allocation, determine which allocators will need to be
2037  * accessed, and lock them, reserving the appropriate number of bits.
2038  *
2039  * Sparse file systems call this from ocfs2_write_begin_nolock()
2040  * and ocfs2_allocate_unwritten_extents().
2041  *
2042  * File systems which don't support holes call this from
2043  * ocfs2_extend_allocation().
2044  */
2045 int ocfs2_lock_allocators(struct inode *inode,
2046                           struct ocfs2_extent_tree *et,
2047                           u32 clusters_to_add, u32 extents_to_split,
2048                           struct ocfs2_alloc_context **data_ac,
2049                           struct ocfs2_alloc_context **meta_ac)
2050 {
2051         int ret = 0, num_free_extents;
2052         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2053         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2054
2055         *meta_ac = NULL;
2056         if (data_ac)
2057                 *data_ac = NULL;
2058
2059         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2060
2061         num_free_extents = ocfs2_num_free_extents(osb, inode, et);
2062         if (num_free_extents < 0) {
2063                 ret = num_free_extents;
2064                 mlog_errno(ret);
2065                 goto out;
2066         }
2067
2068         /*
2069          * Sparse allocation file systems need to be more conservative
2070          * with reserving room for expansion - the actual allocation
2071          * happens while we've got a journal handle open so re-taking
2072          * a cluster lock (because we ran out of room for another
2073          * extent) will violate ordering rules.
2074          *
2075          * Most of the time we'll only be seeing this 1 cluster at a time
2076          * anyway.
2077          *
2078          * Always lock for any unwritten extents - we might want to
2079          * add blocks during a split.
2080          */
2081         if (!num_free_extents ||
2082             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2083                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2084                 if (ret < 0) {
2085                         if (ret != -ENOSPC)
2086                                 mlog_errno(ret);
2087                         goto out;
2088                 }
2089         }
2090
2091         if (clusters_to_add == 0)
2092                 goto out;
2093
2094         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2095         if (ret < 0) {
2096                 if (ret != -ENOSPC)
2097                         mlog_errno(ret);
2098                 goto out;
2099         }
2100
2101 out:
2102         if (ret) {
2103                 if (*meta_ac) {
2104                         ocfs2_free_alloc_context(*meta_ac);
2105                         *meta_ac = NULL;
2106                 }
2107
2108                 /*
2109                  * We cannot have an error and a non null *data_ac.
2110                  */
2111         }
2112
2113         return ret;
2114 }