Merge branches 'x86/urgent', 'x86/amd-iommu', 'x86/apic', 'x86/cleanups', 'x86/core...
[linux-2.6] / fs / ext4 / balloc.c
1 /*
2  *  linux/fs/ext4/balloc.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10  *  Big-endian to little-endian byte-swapping/bitmaps by
11  *        David S. Miller (davem@caip.rutgers.edu), 1995
12  */
13
14 #include <linux/time.h>
15 #include <linux/capability.h>
16 #include <linux/fs.h>
17 #include <linux/jbd2.h>
18 #include <linux/quotaops.h>
19 #include <linux/buffer_head.h>
20 #include "ext4.h"
21 #include "ext4_jbd2.h"
22 #include "group.h"
23
24 /*
25  * balloc.c contains the blocks allocation and deallocation routines
26  */
27
28 /*
29  * Calculate the block group number and offset, given a block number
30  */
31 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
32                 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
33 {
34         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
35         ext4_grpblk_t offset;
36
37         blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
38         offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
39         if (offsetp)
40                 *offsetp = offset;
41         if (blockgrpp)
42                 *blockgrpp = blocknr;
43
44 }
45
46 static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
47                         ext4_group_t block_group)
48 {
49         ext4_group_t actual_group;
50         ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
51         if (actual_group == block_group)
52                 return 1;
53         return 0;
54 }
55
56 static int ext4_group_used_meta_blocks(struct super_block *sb,
57                                 ext4_group_t block_group)
58 {
59         ext4_fsblk_t tmp;
60         struct ext4_sb_info *sbi = EXT4_SB(sb);
61         /* block bitmap, inode bitmap, and inode table blocks */
62         int used_blocks = sbi->s_itb_per_group + 2;
63
64         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
65                 struct ext4_group_desc *gdp;
66                 struct buffer_head *bh;
67
68                 gdp = ext4_get_group_desc(sb, block_group, &bh);
69                 if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
70                                         block_group))
71                         used_blocks--;
72
73                 if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
74                                         block_group))
75                         used_blocks--;
76
77                 tmp = ext4_inode_table(sb, gdp);
78                 for (; tmp < ext4_inode_table(sb, gdp) +
79                                 sbi->s_itb_per_group; tmp++) {
80                         if (!ext4_block_in_group(sb, tmp, block_group))
81                                 used_blocks -= 1;
82                 }
83         }
84         return used_blocks;
85 }
86 /* Initializes an uninitialized block bitmap if given, and returns the
87  * number of blocks free in the group. */
88 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
89                  ext4_group_t block_group, struct ext4_group_desc *gdp)
90 {
91         int bit, bit_max;
92         unsigned free_blocks, group_blocks;
93         struct ext4_sb_info *sbi = EXT4_SB(sb);
94
95         if (bh) {
96                 J_ASSERT_BH(bh, buffer_locked(bh));
97
98                 /* If checksum is bad mark all blocks used to prevent allocation
99                  * essentially implementing a per-group read-only flag. */
100                 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
101                         ext4_error(sb, __func__,
102                                   "Checksum bad for group %lu\n", block_group);
103                         gdp->bg_free_blocks_count = 0;
104                         gdp->bg_free_inodes_count = 0;
105                         gdp->bg_itable_unused = 0;
106                         memset(bh->b_data, 0xff, sb->s_blocksize);
107                         return 0;
108                 }
109                 memset(bh->b_data, 0, sb->s_blocksize);
110         }
111
112         /* Check for superblock and gdt backups in this group */
113         bit_max = ext4_bg_has_super(sb, block_group);
114
115         if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
116             block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
117                           sbi->s_desc_per_block) {
118                 if (bit_max) {
119                         bit_max += ext4_bg_num_gdb(sb, block_group);
120                         bit_max +=
121                                 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
122                 }
123         } else { /* For META_BG_BLOCK_GROUPS */
124                 bit_max += ext4_bg_num_gdb(sb, block_group);
125         }
126
127         if (block_group == sbi->s_groups_count - 1) {
128                 /*
129                  * Even though mke2fs always initialize first and last group
130                  * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
131                  * to make sure we calculate the right free blocks
132                  */
133                 group_blocks = ext4_blocks_count(sbi->s_es) -
134                         le32_to_cpu(sbi->s_es->s_first_data_block) -
135                         (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
136         } else {
137                 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
138         }
139
140         free_blocks = group_blocks - bit_max;
141
142         if (bh) {
143                 ext4_fsblk_t start, tmp;
144                 int flex_bg = 0;
145
146                 for (bit = 0; bit < bit_max; bit++)
147                         ext4_set_bit(bit, bh->b_data);
148
149                 start = ext4_group_first_block_no(sb, block_group);
150
151                 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
152                                               EXT4_FEATURE_INCOMPAT_FLEX_BG))
153                         flex_bg = 1;
154
155                 /* Set bits for block and inode bitmaps, and inode table */
156                 tmp = ext4_block_bitmap(sb, gdp);
157                 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
158                         ext4_set_bit(tmp - start, bh->b_data);
159
160                 tmp = ext4_inode_bitmap(sb, gdp);
161                 if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
162                         ext4_set_bit(tmp - start, bh->b_data);
163
164                 tmp = ext4_inode_table(sb, gdp);
165                 for (; tmp < ext4_inode_table(sb, gdp) +
166                                 sbi->s_itb_per_group; tmp++) {
167                         if (!flex_bg ||
168                                 ext4_block_in_group(sb, tmp, block_group))
169                                 ext4_set_bit(tmp - start, bh->b_data);
170                 }
171                 /*
172                  * Also if the number of blocks within the group is
173                  * less than the blocksize * 8 ( which is the size
174                  * of bitmap ), set rest of the block bitmap to 1
175                  */
176                 mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
177         }
178         return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
179 }
180
181
/*
 * The free blocks are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the group.  The descriptors are loaded in memory
 * when a file system is mounted (see ext4_fill_super).
 */
192
193
/*
 * True when b lies in the closed interval [first, first + len - 1].
 * The arguments are expanded more than once, so they must not have side
 * effects.
 */
#define in_range(b, first, len) \
        ((first) <= (b) && (first) + (len) - 1 >= (b))
195
196 /**
197  * ext4_get_group_desc() -- load group descriptor from disk
198  * @sb:                 super block
199  * @block_group:        given block group
200  * @bh:                 pointer to the buffer head to store the block
201  *                      group descriptor
202  */
203 struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
204                                              ext4_group_t block_group,
205                                              struct buffer_head ** bh)
206 {
207         unsigned long group_desc;
208         unsigned long offset;
209         struct ext4_group_desc * desc;
210         struct ext4_sb_info *sbi = EXT4_SB(sb);
211
212         if (block_group >= sbi->s_groups_count) {
213                 ext4_error (sb, "ext4_get_group_desc",
214                             "block_group >= groups_count - "
215                             "block_group = %lu, groups_count = %lu",
216                             block_group, sbi->s_groups_count);
217
218                 return NULL;
219         }
220         smp_rmb();
221
222         group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
223         offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
224         if (!sbi->s_group_desc[group_desc]) {
225                 ext4_error (sb, "ext4_get_group_desc",
226                             "Group descriptor not loaded - "
227                             "block_group = %lu, group_desc = %lu, desc = %lu",
228                              block_group, group_desc, offset);
229                 return NULL;
230         }
231
232         desc = (struct ext4_group_desc *)(
233                 (__u8 *)sbi->s_group_desc[group_desc]->b_data +
234                 offset * EXT4_DESC_SIZE(sb));
235         if (bh)
236                 *bh = sbi->s_group_desc[group_desc];
237         return desc;
238 }
239
240 static int ext4_valid_block_bitmap(struct super_block *sb,
241                                         struct ext4_group_desc *desc,
242                                         unsigned int block_group,
243                                         struct buffer_head *bh)
244 {
245         ext4_grpblk_t offset;
246         ext4_grpblk_t next_zero_bit;
247         ext4_fsblk_t bitmap_blk;
248         ext4_fsblk_t group_first_block;
249
250         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
251                 /* with FLEX_BG, the inode/block bitmaps and itable
252                  * blocks may not be in the group at all
253                  * so the bitmap validation will be skipped for those groups
254                  * or it has to also read the block group where the bitmaps
255                  * are located to verify they are set.
256                  */
257                 return 1;
258         }
259         group_first_block = ext4_group_first_block_no(sb, block_group);
260
261         /* check whether block bitmap block number is set */
262         bitmap_blk = ext4_block_bitmap(sb, desc);
263         offset = bitmap_blk - group_first_block;
264         if (!ext4_test_bit(offset, bh->b_data))
265                 /* bad block bitmap */
266                 goto err_out;
267
268         /* check whether the inode bitmap block number is set */
269         bitmap_blk = ext4_inode_bitmap(sb, desc);
270         offset = bitmap_blk - group_first_block;
271         if (!ext4_test_bit(offset, bh->b_data))
272                 /* bad block bitmap */
273                 goto err_out;
274
275         /* check whether the inode table block number is set */
276         bitmap_blk = ext4_inode_table(sb, desc);
277         offset = bitmap_blk - group_first_block;
278         next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
279                                 offset + EXT4_SB(sb)->s_itb_per_group,
280                                 offset);
281         if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
282                 /* good bitmap for inode tables */
283                 return 1;
284
285 err_out:
286         ext4_error(sb, __func__,
287                         "Invalid block bitmap - "
288                         "block_group = %d, block = %llu",
289                         block_group, bitmap_blk);
290         return 0;
291 }
292 /**
293  * ext4_read_block_bitmap()
294  * @sb:                 super block
295  * @block_group:        given block group
296  *
297  * Read the bitmap for a given block_group,and validate the
298  * bits for block/inode/inode tables are set in the bitmaps
299  *
300  * Return buffer_head on success or NULL in case of failure.
301  */
302 struct buffer_head *
303 ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
304 {
305         struct ext4_group_desc * desc;
306         struct buffer_head * bh = NULL;
307         ext4_fsblk_t bitmap_blk;
308
309         desc = ext4_get_group_desc(sb, block_group, NULL);
310         if (!desc)
311                 return NULL;
312         bitmap_blk = ext4_block_bitmap(sb, desc);
313         bh = sb_getblk(sb, bitmap_blk);
314         if (unlikely(!bh)) {
315                 ext4_error(sb, __func__,
316                             "Cannot read block bitmap - "
317                             "block_group = %d, block_bitmap = %llu",
318                             (int)block_group, (unsigned long long)bitmap_blk);
319                 return NULL;
320         }
321         if (bh_uptodate_or_lock(bh))
322                 return bh;
323
324         if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
325                 ext4_init_block_bitmap(sb, bh, block_group, desc);
326                 set_buffer_uptodate(bh);
327                 unlock_buffer(bh);
328                 return bh;
329         }
330         if (bh_submit_read(bh) < 0) {
331                 put_bh(bh);
332                 ext4_error(sb, __func__,
333                             "Cannot read block bitmap - "
334                             "block_group = %d, block_bitmap = %llu",
335                             (int)block_group, (unsigned long long)bitmap_blk);
336                 return NULL;
337         }
338         ext4_valid_block_bitmap(sb, desc, block_group, bh);
339         /*
340          * file system mounted not to panic on error,
341          * continue with corrupt bitmap
342          */
343         return bh;
344 }
345 /*
346  * The reservation window structure operations
347  * --------------------------------------------
348  * Operations include:
349  * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
350  *
351  * We use a red-black tree to represent per-filesystem reservation
352  * windows.
353  *
354  */
355
356 /**
357  * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
358  * @rb_root:            root of per-filesystem reservation rb tree
359  * @verbose:            verbose mode
360  * @fn:                 function which wishes to dump the reservation map
361  *
362  * If verbose is turned on, it will print the whole block reservation
363  * windows(start, end). Otherwise, it will only print out the "bad" windows,
364  * those windows that overlap with their immediate neighbors.
365  */
366 #if 1
367 static void __rsv_window_dump(struct rb_root *root, int verbose,
368                               const char *fn)
369 {
370         struct rb_node *n;
371         struct ext4_reserve_window_node *rsv, *prev;
372         int bad;
373
374 restart:
375         n = rb_first(root);
376         bad = 0;
377         prev = NULL;
378
379         printk("Block Allocation Reservation Windows Map (%s):\n", fn);
380         while (n) {
381                 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
382                 if (verbose)
383                         printk("reservation window 0x%p "
384                                "start:  %llu, end:  %llu\n",
385                                rsv, rsv->rsv_start, rsv->rsv_end);
386                 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
387                         printk("Bad reservation %p (start >= end)\n",
388                                rsv);
389                         bad = 1;
390                 }
391                 if (prev && prev->rsv_end >= rsv->rsv_start) {
392                         printk("Bad reservation %p (prev->end >= start)\n",
393                                rsv);
394                         bad = 1;
395                 }
396                 if (bad) {
397                         if (!verbose) {
398                                 printk("Restarting reservation walk in verbose mode\n");
399                                 verbose = 1;
400                                 goto restart;
401                         }
402                 }
403                 n = rb_next(n);
404                 prev = rsv;
405         }
406         printk("Window map complete.\n");
407         BUG_ON(bad);
408 }
409 #define rsv_window_dump(root, verbose) \
410         __rsv_window_dump((root), (verbose), __func__)
411 #else
412 #define rsv_window_dump(root, verbose) do {} while (0)
413 #endif
414
415 /**
416  * goal_in_my_reservation()
417  * @rsv:                inode's reservation window
418  * @grp_goal:           given goal block relative to the allocation block group
419  * @group:              the current allocation block group
420  * @sb:                 filesystem super block
421  *
422  * Test if the given goal block (group relative) is within the file's
423  * own block reservation window range.
424  *
425  * If the reservation window is outside the goal allocation group, return 0;
426  * grp_goal (given goal block) could be -1, which means no specific
427  * goal block. In this case, always return 1.
428  * If the goal block is within the reservation window, return 1;
429  * otherwise, return 0;
430  */
431 static int
432 goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
433                         ext4_group_t group, struct super_block *sb)
434 {
435         ext4_fsblk_t group_first_block, group_last_block;
436
437         group_first_block = ext4_group_first_block_no(sb, group);
438         group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
439
440         if ((rsv->_rsv_start > group_last_block) ||
441             (rsv->_rsv_end < group_first_block))
442                 return 0;
443         if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
444                 || (grp_goal + group_first_block > rsv->_rsv_end)))
445                 return 0;
446         return 1;
447 }
448
449 /**
450  * search_reserve_window()
451  * @rb_root:            root of reservation tree
452  * @goal:               target allocation block
453  *
454  * Find the reserved window which includes the goal, or the previous one
455  * if the goal is not in any window.
456  * Returns NULL if there are no windows or if all windows start after the goal.
457  */
458 static struct ext4_reserve_window_node *
459 search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
460 {
461         struct rb_node *n = root->rb_node;
462         struct ext4_reserve_window_node *rsv;
463
464         if (!n)
465                 return NULL;
466
467         do {
468                 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
469
470                 if (goal < rsv->rsv_start)
471                         n = n->rb_left;
472                 else if (goal > rsv->rsv_end)
473                         n = n->rb_right;
474                 else
475                         return rsv;
476         } while (n);
477         /*
478          * We've fallen off the end of the tree: the goal wasn't inside
479          * any particular node.  OK, the previous node must be to one
480          * side of the interval containing the goal.  If it's the RHS,
481          * we need to back up one.
482          */
483         if (rsv->rsv_start > goal) {
484                 n = rb_prev(&rsv->rsv_node);
485                 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
486         }
487         return rsv;
488 }
489
490 /**
491  * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
492  * @sb:                 super block
493  * @rsv:                reservation window to add
494  *
495  * Must be called with rsv_lock hold.
496  */
497 void ext4_rsv_window_add(struct super_block *sb,
498                     struct ext4_reserve_window_node *rsv)
499 {
500         struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
501         struct rb_node *node = &rsv->rsv_node;
502         ext4_fsblk_t start = rsv->rsv_start;
503
504         struct rb_node ** p = &root->rb_node;
505         struct rb_node * parent = NULL;
506         struct ext4_reserve_window_node *this;
507
508         while (*p)
509         {
510                 parent = *p;
511                 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
512
513                 if (start < this->rsv_start)
514                         p = &(*p)->rb_left;
515                 else if (start > this->rsv_end)
516                         p = &(*p)->rb_right;
517                 else {
518                         rsv_window_dump(root, 1);
519                         BUG();
520                 }
521         }
522
523         rb_link_node(node, parent, p);
524         rb_insert_color(node, root);
525 }
526
527 /**
528  * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
529  * @sb:                 super block
530  * @rsv:                reservation window to remove
531  *
532  * Mark the block reservation window as not allocated, and unlink it
533  * from the filesystem reservation window rb tree. Must be called with
534  * rsv_lock hold.
535  */
536 static void rsv_window_remove(struct super_block *sb,
537                               struct ext4_reserve_window_node *rsv)
538 {
539         rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
540         rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
541         rsv->rsv_alloc_hit = 0;
542         rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
543 }
544
545 /*
546  * rsv_is_empty() -- Check if the reservation window is allocated.
547  * @rsv:                given reservation window to check
548  *
549  * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
550  */
551 static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
552 {
553         /* a valid reservation end block could not be 0 */
554         return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
555 }
556
557 /**
558  * ext4_init_block_alloc_info()
559  * @inode:              file inode structure
560  *
561  * Allocate and initialize the  reservation window structure, and
562  * link the window to the ext4 inode structure at last
563  *
564  * The reservation window structure is only dynamically allocated
565  * and linked to ext4 inode the first time the open file
566  * needs a new block. So, before every ext4_new_block(s) call, for
567  * regular files, we should check whether the reservation window
568  * structure exists or not. In the latter case, this function is called.
569  * Fail to do so will result in block reservation being turned off for that
570  * open file.
571  *
572  * This function is called from ext4_get_blocks_handle(), also called
573  * when setting the reservation window size through ioctl before the file
574  * is open for write (needs block allocation).
575  *
576  * Needs down_write(i_data_sem) protection prior to call this function.
577  */
578 void ext4_init_block_alloc_info(struct inode *inode)
579 {
580         struct ext4_inode_info *ei = EXT4_I(inode);
581         struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
582         struct super_block *sb = inode->i_sb;
583
584         block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
585         if (block_i) {
586                 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
587
588                 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
589                 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
590
591                 /*
592                  * if filesystem is mounted with NORESERVATION, the goal
593                  * reservation window size is set to zero to indicate
594                  * block reservation is off
595                  */
596                 if (!test_opt(sb, RESERVATION))
597                         rsv->rsv_goal_size = 0;
598                 else
599                         rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
600                 rsv->rsv_alloc_hit = 0;
601                 block_i->last_alloc_logical_block = 0;
602                 block_i->last_alloc_physical_block = 0;
603         }
604         ei->i_block_alloc_info = block_i;
605 }
606
607 /**
608  * ext4_discard_reservation()
609  * @inode:              inode
610  *
611  * Discard(free) block reservation window on last file close, or truncate
612  * or at last iput().
613  *
614  * It is being called in three cases:
615  *      ext4_release_file(): last writer close the file
616  *      ext4_clear_inode(): last iput(), when nobody link to this file.
617  *      ext4_truncate(): when the block indirect map is about to change.
618  *
619  */
620 void ext4_discard_reservation(struct inode *inode)
621 {
622         struct ext4_inode_info *ei = EXT4_I(inode);
623         struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
624         struct ext4_reserve_window_node *rsv;
625         spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
626
627         ext4_mb_discard_inode_preallocations(inode);
628
629         if (!block_i)
630                 return;
631
632         rsv = &block_i->rsv_window_node;
633         if (!rsv_is_empty(&rsv->rsv_window)) {
634                 spin_lock(rsv_lock);
635                 if (!rsv_is_empty(&rsv->rsv_window))
636                         rsv_window_remove(inode->i_sb, rsv);
637                 spin_unlock(rsv_lock);
638         }
639 }
640
641 /**
642  * ext4_free_blocks_sb() -- Free given blocks and update quota
643  * @handle:                     handle to this transaction
644  * @sb:                         super block
645  * @block:                      start physcial block to free
646  * @count:                      number of blocks to free
647  * @pdquot_freed_blocks:        pointer to quota
648  */
649 void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
650                          ext4_fsblk_t block, unsigned long count,
651                          unsigned long *pdquot_freed_blocks)
652 {
653         struct buffer_head *bitmap_bh = NULL;
654         struct buffer_head *gd_bh;
655         ext4_group_t block_group;
656         ext4_grpblk_t bit;
657         unsigned long i;
658         unsigned long overflow;
659         struct ext4_group_desc * desc;
660         struct ext4_super_block * es;
661         struct ext4_sb_info *sbi;
662         int err = 0, ret;
663         ext4_grpblk_t group_freed;
664
665         *pdquot_freed_blocks = 0;
666         sbi = EXT4_SB(sb);
667         es = sbi->s_es;
668         if (block < le32_to_cpu(es->s_first_data_block) ||
669             block + count < block ||
670             block + count > ext4_blocks_count(es)) {
671                 ext4_error (sb, "ext4_free_blocks",
672                             "Freeing blocks not in datazone - "
673                             "block = %llu, count = %lu", block, count);
674                 goto error_return;
675         }
676
677         ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
678
679 do_more:
680         overflow = 0;
681         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
682         /*
683          * Check to see if we are freeing blocks across a group
684          * boundary.
685          */
686         if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
687                 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
688                 count -= overflow;
689         }
690         brelse(bitmap_bh);
691         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
692         if (!bitmap_bh)
693                 goto error_return;
694         desc = ext4_get_group_desc (sb, block_group, &gd_bh);
695         if (!desc)
696                 goto error_return;
697
698         if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
699             in_range(ext4_inode_bitmap(sb, desc), block, count) ||
700             in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
701             in_range(block + count - 1, ext4_inode_table(sb, desc),
702                      sbi->s_itb_per_group)) {
703                 ext4_error (sb, "ext4_free_blocks",
704                             "Freeing blocks in system zones - "
705                             "Block = %llu, count = %lu",
706                             block, count);
707                 goto error_return;
708         }
709
710         /*
711          * We are about to start releasing blocks in the bitmap,
712          * so we need undo access.
713          */
714         /* @@@ check errors */
715         BUFFER_TRACE(bitmap_bh, "getting undo access");
716         err = ext4_journal_get_undo_access(handle, bitmap_bh);
717         if (err)
718                 goto error_return;
719
720         /*
721          * We are about to modify some metadata.  Call the journal APIs
722          * to unshare ->b_data if a currently-committing transaction is
723          * using it
724          */
725         BUFFER_TRACE(gd_bh, "get_write_access");
726         err = ext4_journal_get_write_access(handle, gd_bh);
727         if (err)
728                 goto error_return;
729
730         jbd_lock_bh_state(bitmap_bh);
731
732         for (i = 0, group_freed = 0; i < count; i++) {
733                 /*
734                  * An HJ special.  This is expensive...
735                  */
736 #ifdef CONFIG_JBD2_DEBUG
737                 jbd_unlock_bh_state(bitmap_bh);
738                 {
739                         struct buffer_head *debug_bh;
740                         debug_bh = sb_find_get_block(sb, block + i);
741                         if (debug_bh) {
742                                 BUFFER_TRACE(debug_bh, "Deleted!");
743                                 if (!bh2jh(bitmap_bh)->b_committed_data)
744                                         BUFFER_TRACE(debug_bh,
745                                                 "No commited data in bitmap");
746                                 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
747                                 __brelse(debug_bh);
748                         }
749                 }
750                 jbd_lock_bh_state(bitmap_bh);
751 #endif
752                 if (need_resched()) {
753                         jbd_unlock_bh_state(bitmap_bh);
754                         cond_resched();
755                         jbd_lock_bh_state(bitmap_bh);
756                 }
757                 /* @@@ This prevents newly-allocated data from being
758                  * freed and then reallocated within the same
759                  * transaction.
760                  *
761                  * Ideally we would want to allow that to happen, but to
762                  * do so requires making jbd2_journal_forget() capable of
763                  * revoking the queued write of a data block, which
764                  * implies blocking on the journal lock.  *forget()
765                  * cannot block due to truncate races.
766                  *
767                  * Eventually we can fix this by making jbd2_journal_forget()
768                  * return a status indicating whether or not it was able
769                  * to revoke the buffer.  On successful revoke, it is
770                  * safe not to set the allocation bit in the committed
771                  * bitmap, because we know that there is no outstanding
772                  * activity on the buffer any more and so it is safe to
773                  * reallocate it.
774                  */
775                 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
776                 J_ASSERT_BH(bitmap_bh,
777                                 bh2jh(bitmap_bh)->b_committed_data != NULL);
778                 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
779                                 bh2jh(bitmap_bh)->b_committed_data);
780
781                 /*
782                  * We clear the bit in the bitmap after setting the committed
783                  * data bit, because this is the reverse order to that which
784                  * the allocator uses.
785                  */
786                 BUFFER_TRACE(bitmap_bh, "clear bit");
787                 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
788                                                 bit + i, bitmap_bh->b_data)) {
789                         jbd_unlock_bh_state(bitmap_bh);
790                         ext4_error(sb, __func__,
791                                    "bit already cleared for block %llu",
792                                    (ext4_fsblk_t)(block + i));
793                         jbd_lock_bh_state(bitmap_bh);
794                         BUFFER_TRACE(bitmap_bh, "bit already cleared");
795                 } else {
796                         group_freed++;
797                 }
798         }
799         jbd_unlock_bh_state(bitmap_bh);
800
801         spin_lock(sb_bgl_lock(sbi, block_group));
802         le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
803         desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
804         spin_unlock(sb_bgl_lock(sbi, block_group));
805         percpu_counter_add(&sbi->s_freeblocks_counter, count);
806
807         if (sbi->s_log_groups_per_flex) {
808                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
809                 spin_lock(sb_bgl_lock(sbi, flex_group));
810                 sbi->s_flex_groups[flex_group].free_blocks += count;
811                 spin_unlock(sb_bgl_lock(sbi, flex_group));
812         }
813
814         /* We dirtied the bitmap block */
815         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
816         err = ext4_journal_dirty_metadata(handle, bitmap_bh);
817
818         /* And the group descriptor block */
819         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
820         ret = ext4_journal_dirty_metadata(handle, gd_bh);
821         if (!err) err = ret;
822         *pdquot_freed_blocks += group_freed;
823
824         if (overflow && !err) {
825                 block += count;
826                 count = overflow;
827                 goto do_more;
828         }
829         sb->s_dirt = 1;
830 error_return:
831         brelse(bitmap_bh);
832         ext4_std_error(sb, err);
833         return;
834 }
835
836 /**
837  * ext4_free_blocks() -- Free given blocks and update quota
838  * @handle:             handle for this transaction
839  * @inode:              inode
840  * @block:              start physical block to free
841  * @count:              number of blocks to count
842  * @metadata:           Are these metadata blocks
843  */
844 void ext4_free_blocks(handle_t *handle, struct inode *inode,
845                         ext4_fsblk_t block, unsigned long count,
846                         int metadata)
847 {
848         struct super_block * sb;
849         unsigned long dquot_freed_blocks;
850
851         /* this isn't the right place to decide whether block is metadata
852          * inode.c/extents.c knows better, but for safety ... */
853         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
854                         ext4_should_journal_data(inode))
855                 metadata = 1;
856
857         sb = inode->i_sb;
858
859         if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
860                 ext4_free_blocks_sb(handle, sb, block, count,
861                                                 &dquot_freed_blocks);
862         else
863                 ext4_mb_free_blocks(handle, inode, block, count,
864                                                 metadata, &dquot_freed_blocks);
865         if (dquot_freed_blocks)
866                 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
867         return;
868 }
869
870 /**
871  * ext4_test_allocatable()
872  * @nr:                 given allocation block group
873  * @bh:                 bufferhead contains the bitmap of the given block group
874  *
875  * For ext4 allocations, we must not reuse any blocks which are
876  * allocated in the bitmap buffer's "last committed data" copy.  This
877  * prevents deletes from freeing up the page for reuse until we have
878  * committed the delete transaction.
879  *
880  * If we didn't do this, then deleting something and reallocating it as
881  * data would allow the old block to be overwritten before the
882  * transaction committed (because we force data to disk before commit).
883  * This would lead to corruption if we crashed between overwriting the
884  * data and committing the delete.
885  *
886  * @@@ We may want to make this allocation behaviour conditional on
887  * data-writes at some point, and disable it for metadata allocations or
888  * sync-data inodes.
889  */
890 static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
891 {
892         int ret;
893         struct journal_head *jh = bh2jh(bh);
894
895         if (ext4_test_bit(nr, bh->b_data))
896                 return 0;
897
898         jbd_lock_bh_state(bh);
899         if (!jh->b_committed_data)
900                 ret = 1;
901         else
902                 ret = !ext4_test_bit(nr, jh->b_committed_data);
903         jbd_unlock_bh_state(bh);
904         return ret;
905 }
906
907 /**
908  * bitmap_search_next_usable_block()
909  * @start:              the starting block (group relative) of the search
910  * @bh:                 bufferhead contains the block group bitmap
911  * @maxblocks:          the ending block (group relative) of the reservation
912  *
913  * The bitmap search --- search forward alternately through the actual
914  * bitmap on disk and the last-committed copy in journal, until we find a
915  * bit free in both bitmaps.
916  */
917 static ext4_grpblk_t
918 bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
919                                         ext4_grpblk_t maxblocks)
920 {
921         ext4_grpblk_t next;
922         struct journal_head *jh = bh2jh(bh);
923
924         while (start < maxblocks) {
925                 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
926                 if (next >= maxblocks)
927                         return -1;
928                 if (ext4_test_allocatable(next, bh))
929                         return next;
930                 jbd_lock_bh_state(bh);
931                 if (jh->b_committed_data)
932                         start = ext4_find_next_zero_bit(jh->b_committed_data,
933                                                         maxblocks, next);
934                 jbd_unlock_bh_state(bh);
935         }
936         return -1;
937 }
938
939 /**
940  * find_next_usable_block()
941  * @start:              the starting block (group relative) to find next
942  *                      allocatable block in bitmap.
943  * @bh:                 bufferhead contains the block group bitmap
944  * @maxblocks:          the ending block (group relative) for the search
945  *
946  * Find an allocatable block in a bitmap.  We honor both the bitmap and
947  * its last-committed copy (if that exists), and perform the "most
948  * appropriate allocation" algorithm of looking for a free block near
949  * the initial goal; then for a free byte somewhere in the bitmap; then
950  * for any free bit in the bitmap.
951  */
952 static ext4_grpblk_t
953 find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
954                         ext4_grpblk_t maxblocks)
955 {
956         ext4_grpblk_t here, next;
957         char *p, *r;
958
959         if (start > 0) {
960                 /*
961                  * The goal was occupied; search forward for a free
962                  * block within the next XX blocks.
963                  *
964                  * end_goal is more or less random, but it has to be
965                  * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
966                  * next 64-bit boundary is simple..
967                  */
968                 ext4_grpblk_t end_goal = (start + 63) & ~63;
969                 if (end_goal > maxblocks)
970                         end_goal = maxblocks;
971                 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
972                 if (here < end_goal && ext4_test_allocatable(here, bh))
973                         return here;
974                 ext4_debug("Bit not found near goal\n");
975         }
976
977         here = start;
978         if (here < 0)
979                 here = 0;
980
981         p = ((char *)bh->b_data) + (here >> 3);
982         r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
983         next = (r - ((char *)bh->b_data)) << 3;
984
985         if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
986                 return next;
987
988         /*
989          * The bitmap search --- search forward alternately through the actual
990          * bitmap and the last-committed copy until we find a bit free in
991          * both
992          */
993         here = bitmap_search_next_usable_block(here, bh, maxblocks);
994         return here;
995 }
996
997 /**
998  * claim_block()
999  * @block:              the free block (group relative) to allocate
1000  * @bh:                 the bufferhead containts the block group bitmap
1001  *
1002  * We think we can allocate this block in this bitmap.  Try to set the bit.
1003  * If that succeeds then check that nobody has allocated and then freed the
1004  * block since we saw that is was not marked in b_committed_data.  If it _was_
1005  * allocated and freed then clear the bit in the bitmap again and return
1006  * zero (failure).
1007  */
1008 static inline int
1009 claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
1010 {
1011         struct journal_head *jh = bh2jh(bh);
1012         int ret;
1013
1014         if (ext4_set_bit_atomic(lock, block, bh->b_data))
1015                 return 0;
1016         jbd_lock_bh_state(bh);
1017         if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
1018                 ext4_clear_bit_atomic(lock, block, bh->b_data);
1019                 ret = 0;
1020         } else {
1021                 ret = 1;
1022         }
1023         jbd_unlock_bh_state(bh);
1024         return ret;
1025 }
1026
1027 /**
1028  * ext4_try_to_allocate()
1029  * @sb:                 superblock
1030  * @handle:             handle to this transaction
1031  * @group:              given allocation block group
1032  * @bitmap_bh:          bufferhead holds the block bitmap
1033  * @grp_goal:           given target block within the group
1034  * @count:              target number of blocks to allocate
1035  * @my_rsv:             reservation window
1036  *
1037  * Attempt to allocate blocks within a give range. Set the range of allocation
1038  * first, then find the first free bit(s) from the bitmap (within the range),
1039  * and at last, allocate the blocks by claiming the found free bit as allocated.
1040  *
1041  * To set the range of this allocation:
1042  *      if there is a reservation window, only try to allocate block(s) from the
1043  *      file's own reservation window;
1044  *      Otherwise, the allocation range starts from the give goal block, ends at
1045  *      the block group's last block.
1046  *
1047  * If we failed to allocate the desired block then we may end up crossing to a
1048  * new bitmap.  In that case we must release write access to the old one via
1049  * ext4_journal_release_buffer(), else we'll run out of credits.
1050  */
1051 static ext4_grpblk_t
1052 ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
1053                         ext4_group_t group, struct buffer_head *bitmap_bh,
1054                         ext4_grpblk_t grp_goal, unsigned long *count,
1055                         struct ext4_reserve_window *my_rsv)
1056 {
1057         ext4_fsblk_t group_first_block;
1058         ext4_grpblk_t start, end;
1059         unsigned long num = 0;
1060
1061         /* we do allocation within the reservation window if we have a window */
1062         if (my_rsv) {
1063                 group_first_block = ext4_group_first_block_no(sb, group);
1064                 if (my_rsv->_rsv_start >= group_first_block)
1065                         start = my_rsv->_rsv_start - group_first_block;
1066                 else
1067                         /* reservation window cross group boundary */
1068                         start = 0;
1069                 end = my_rsv->_rsv_end - group_first_block + 1;
1070                 if (end > EXT4_BLOCKS_PER_GROUP(sb))
1071                         /* reservation window crosses group boundary */
1072                         end = EXT4_BLOCKS_PER_GROUP(sb);
1073                 if ((start <= grp_goal) && (grp_goal < end))
1074                         start = grp_goal;
1075                 else
1076                         grp_goal = -1;
1077         } else {
1078                 if (grp_goal > 0)
1079                         start = grp_goal;
1080                 else
1081                         start = 0;
1082                 end = EXT4_BLOCKS_PER_GROUP(sb);
1083         }
1084
1085         BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
1086
1087 repeat:
1088         if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
1089                 grp_goal = find_next_usable_block(start, bitmap_bh, end);
1090                 if (grp_goal < 0)
1091                         goto fail_access;
1092                 if (!my_rsv) {
1093                         int i;
1094
1095                         for (i = 0; i < 7 && grp_goal > start &&
1096                                         ext4_test_allocatable(grp_goal - 1,
1097                                                                 bitmap_bh);
1098                                         i++, grp_goal--)
1099                                 ;
1100                 }
1101         }
1102         start = grp_goal;
1103
1104         if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1105                 grp_goal, bitmap_bh)) {
1106                 /*
1107                  * The block was allocated by another thread, or it was
1108                  * allocated and then freed by another thread
1109                  */
1110                 start++;
1111                 grp_goal++;
1112                 if (start >= end)
1113                         goto fail_access;
1114                 goto repeat;
1115         }
1116         num++;
1117         grp_goal++;
1118         while (num < *count && grp_goal < end
1119                 && ext4_test_allocatable(grp_goal, bitmap_bh)
1120                 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1121                                 grp_goal, bitmap_bh)) {
1122                 num++;
1123                 grp_goal++;
1124         }
1125         *count = num;
1126         return grp_goal - num;
1127 fail_access:
1128         *count = num;
1129         return -1;
1130 }
1131
1132 /**
1133  *      find_next_reservable_window():
1134  *              find a reservable space within the given range.
1135  *              It does not allocate the reservation window for now:
1136  *              alloc_new_reservation() will do the work later.
1137  *
1138  *      @search_head: the head of the searching list;
1139  *              This is not necessarily the list head of the whole filesystem
1140  *
1141  *              We have both head and start_block to assist the search
1142  *              for the reservable space. The list starts from head,
1143  *              but we will shift to the place where start_block is,
1144  *              then start from there, when looking for a reservable space.
1145  *
1146  *      @size: the target new reservation window size
1147  *
1148  *      @group_first_block: the first block we consider to start
1149  *                      the real search from
1150  *
1151  *      @last_block:
1152  *              the maximum block number that our goal reservable space
1153  *              could start from. This is normally the last block in this
1154  *              group. The search will end when we found the start of next
1155  *              possible reservable space is out of this boundary.
1156  *              This could handle the cross boundary reservation window
1157  *              request.
1158  *
1159  *      basically we search from the given range, rather than the whole
1160  *      reservation double linked list, (start_block, last_block)
1161  *      to find a free region that is of my size and has not
1162  *      been reserved.
1163  *
1164  */
1165 static int find_next_reservable_window(
1166                                 struct ext4_reserve_window_node *search_head,
1167                                 struct ext4_reserve_window_node *my_rsv,
1168                                 struct super_block * sb,
1169                                 ext4_fsblk_t start_block,
1170                                 ext4_fsblk_t last_block)
1171 {
1172         struct rb_node *next;
1173         struct ext4_reserve_window_node *rsv, *prev;
1174         ext4_fsblk_t cur;
1175         int size = my_rsv->rsv_goal_size;
1176
1177         /* TODO: make the start of the reservation window byte-aligned */
1178         /* cur = *start_block & ~7;*/
1179         cur = start_block;
1180         rsv = search_head;
1181         if (!rsv)
1182                 return -1;
1183
1184         while (1) {
1185                 if (cur <= rsv->rsv_end)
1186                         cur = rsv->rsv_end + 1;
1187
1188                 /* TODO?
1189                  * in the case we could not find a reservable space
1190                  * that is what is expected, during the re-search, we could
1191                  * remember what's the largest reservable space we could have
1192                  * and return that one.
1193                  *
1194                  * For now it will fail if we could not find the reservable
1195                  * space with expected-size (or more)...
1196                  */
1197                 if (cur > last_block)
1198                         return -1;              /* fail */
1199
1200                 prev = rsv;
1201                 next = rb_next(&rsv->rsv_node);
1202                 rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
1203
1204                 /*
1205                  * Reached the last reservation, we can just append to the
1206                  * previous one.
1207                  */
1208                 if (!next)
1209                         break;
1210
1211                 if (cur + size <= rsv->rsv_start) {
1212                         /*
1213                          * Found a reserveable space big enough.  We could
1214                          * have a reservation across the group boundary here
1215                          */
1216                         break;
1217                 }
1218         }
1219         /*
1220          * we come here either :
1221          * when we reach the end of the whole list,
1222          * and there is empty reservable space after last entry in the list.
1223          * append it to the end of the list.
1224          *
1225          * or we found one reservable space in the middle of the list,
1226          * return the reservation window that we could append to.
1227          * succeed.
1228          */
1229
1230         if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
1231                 rsv_window_remove(sb, my_rsv);
1232
1233         /*
1234          * Let's book the whole avaliable window for now.  We will check the
1235          * disk bitmap later and then, if there are free blocks then we adjust
1236          * the window size if it's larger than requested.
1237          * Otherwise, we will remove this node from the tree next time
1238          * call find_next_reservable_window.
1239          */
1240         my_rsv->rsv_start = cur;
1241         my_rsv->rsv_end = cur + size - 1;
1242         my_rsv->rsv_alloc_hit = 0;
1243
1244         if (prev != my_rsv)
1245                 ext4_rsv_window_add(sb, my_rsv);
1246
1247         return 0;
1248 }
1249
1250 /**
1251  *      alloc_new_reservation()--allocate a new reservation window
1252  *
1253  *              To make a new reservation, we search part of the filesystem
1254  *              reservation list (the list that inside the group). We try to
1255  *              allocate a new reservation window near the allocation goal,
1256  *              or the beginning of the group, if there is no goal.
1257  *
1258  *              We first find a reservable space after the goal, then from
1259  *              there, we check the bitmap for the first free block after
1260  *              it. If there is no free block until the end of group, then the
1261  *              whole group is full, we failed. Otherwise, check if the free
1262  *              block is inside the expected reservable space, if so, we
1263  *              succeed.
1264  *              If the first free block is outside the reservable space, then
1265  *              start from the first free block, we search for next available
1266  *              space, and go on.
1267  *
1268  *      on succeed, a new reservation will be found and inserted into the list
1269  *      It contains at least one free block, and it does not overlap with other
1270  *      reservation windows.
1271  *
1272  *      failed: we failed to find a reservation window in this group
1273  *
1274  *      @rsv: the reservation
1275  *
1276  *      @grp_goal: The goal (group-relative).  It is where the search for a
1277  *              free reservable space should start from.
1278  *              if we have a grp_goal(grp_goal >0 ), then start from there,
1279  *              no grp_goal(grp_goal = -1), we start from the first block
1280  *              of the group.
1281  *
1282  *      @sb: the super block
1283  *      @group: the group we are trying to allocate in
1284  *      @bitmap_bh: the block group block bitmap
1285  *
1286  */
1287 static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1288                 ext4_grpblk_t grp_goal, struct super_block *sb,
1289                 ext4_group_t group, struct buffer_head *bitmap_bh)
1290 {
1291         struct ext4_reserve_window_node *search_head;
1292         ext4_fsblk_t group_first_block, group_end_block, start_block;
1293         ext4_grpblk_t first_free_block;
1294         struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1295         unsigned long size;
1296         int ret;
1297         spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1298
1299         group_first_block = ext4_group_first_block_no(sb, group);
1300         group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1301
1302         if (grp_goal < 0)
1303                 start_block = group_first_block;
1304         else
1305                 start_block = grp_goal + group_first_block;
1306
1307         size = my_rsv->rsv_goal_size;
1308
1309         if (!rsv_is_empty(&my_rsv->rsv_window)) {
1310                 /*
1311                  * if the old reservation is cross group boundary
1312                  * and if the goal is inside the old reservation window,
1313                  * we will come here when we just failed to allocate from
1314                  * the first part of the window. We still have another part
1315                  * that belongs to the next group. In this case, there is no
1316                  * point to discard our window and try to allocate a new one
1317                  * in this group(which will fail). we should
1318                  * keep the reservation window, just simply move on.
1319                  *
1320                  * Maybe we could shift the start block of the reservation
1321                  * window to the first block of next group.
1322                  */
1323
1324                 if ((my_rsv->rsv_start <= group_end_block) &&
1325                                 (my_rsv->rsv_end > group_end_block) &&
1326                                 (start_block >= my_rsv->rsv_start))
1327                         return -1;
1328
1329                 if ((my_rsv->rsv_alloc_hit >
1330                      (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1331                         /*
1332                          * if the previously allocation hit ratio is
1333                          * greater than 1/2, then we double the size of
1334                          * the reservation window the next time,
1335                          * otherwise we keep the same size window
1336                          */
1337                         size = size * 2;
1338                         if (size > EXT4_MAX_RESERVE_BLOCKS)
1339                                 size = EXT4_MAX_RESERVE_BLOCKS;
1340                         my_rsv->rsv_goal_size= size;
1341                 }
1342         }
1343
1344         spin_lock(rsv_lock);
1345         /*
1346          * shift the search start to the window near the goal block
1347          */
1348         search_head = search_reserve_window(fs_rsv_root, start_block);
1349
1350         /*
1351          * find_next_reservable_window() simply finds a reservable window
1352          * inside the given range(start_block, group_end_block).
1353          *
1354          * To make sure the reservation window has a free bit inside it, we
1355          * need to check the bitmap after we found a reservable window.
1356          */
1357 retry:
1358         ret = find_next_reservable_window(search_head, my_rsv, sb,
1359                                                 start_block, group_end_block);
1360
1361         if (ret == -1) {
1362                 if (!rsv_is_empty(&my_rsv->rsv_window))
1363                         rsv_window_remove(sb, my_rsv);
1364                 spin_unlock(rsv_lock);
1365                 return -1;
1366         }
1367
1368         /*
1369          * On success, find_next_reservable_window() returns the
1370          * reservation window where there is a reservable space after it.
1371          * Before we reserve this reservable space, we need
1372          * to make sure there is at least a free block inside this region.
1373          *
1374          * searching the first free bit on the block bitmap and copy of
1375          * last committed bitmap alternatively, until we found a allocatable
1376          * block. Search start from the start block of the reservable space
1377          * we just found.
1378          */
1379         spin_unlock(rsv_lock);
1380         first_free_block = bitmap_search_next_usable_block(
1381                         my_rsv->rsv_start - group_first_block,
1382                         bitmap_bh, group_end_block - group_first_block + 1);
1383
1384         if (first_free_block < 0) {
1385                 /*
1386                  * no free block left on the bitmap, no point
1387                  * to reserve the space. return failed.
1388                  */
1389                 spin_lock(rsv_lock);
1390                 if (!rsv_is_empty(&my_rsv->rsv_window))
1391                         rsv_window_remove(sb, my_rsv);
1392                 spin_unlock(rsv_lock);
1393                 return -1;              /* failed */
1394         }
1395
1396         start_block = first_free_block + group_first_block;
1397         /*
1398          * check if the first free block is within the
1399          * free space we just reserved
1400          */
1401         if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
1402                 return 0;               /* success */
1403         /*
1404          * if the first free bit we found is out of the reservable space
1405          * continue search for next reservable space,
1406          * start from where the free block is,
1407          * we also shift the list head to where we stopped last time
1408          */
1409         search_head = my_rsv;
1410         spin_lock(rsv_lock);
1411         goto retry;
1412 }
1413
1414 /**
1415  * try_to_extend_reservation()
1416  * @my_rsv:             given reservation window
1417  * @sb:                 super block
1418  * @size:               the delta to extend
1419  *
1420  * Attempt to expand the reservation window large enough to have
1421  * required number of free blocks
1422  *
1423  * Since ext4_try_to_allocate() will always allocate blocks within
1424  * the reservation window range, if the window size is too small,
1425  * multiple blocks allocation has to stop at the end of the reservation
1426  * window. To make this more efficient, given the total number of
1427  * blocks needed and the current size of the window, we try to
1428  * expand the reservation window size if necessary on a best-effort
1429  * basis before ext4_new_blocks() tries to allocate blocks,
1430  */
1431 static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1432                         struct super_block *sb, int size)
1433 {
1434         struct ext4_reserve_window_node *next_rsv;
1435         struct rb_node *next;
1436         spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1437
1438         if (!spin_trylock(rsv_lock))
1439                 return;
1440
1441         next = rb_next(&my_rsv->rsv_node);
1442
1443         if (!next)
1444                 my_rsv->rsv_end += size;
1445         else {
1446                 next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
1447
1448                 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1449                         my_rsv->rsv_end += size;
1450                 else
1451                         my_rsv->rsv_end = next_rsv->rsv_start - 1;
1452         }
1453         spin_unlock(rsv_lock);
1454 }
1455
1456 /**
1457  * ext4_try_to_allocate_with_rsv()
1458  * @sb:                 superblock
1459  * @handle:             handle to this transaction
1460  * @group:              given allocation block group
1461  * @bitmap_bh:          bufferhead holds the block bitmap
1462  * @grp_goal:           given target block within the group
1463  * @count:              target number of blocks to allocate
1464  * @my_rsv:             reservation window
1465  * @errp:               pointer to store the error code
1466  *
1467  * This is the main function used to allocate a new block and its reservation
1468  * window.
1469  *
1470  * Each time when a new block allocation is need, first try to allocate from
1471  * its own reservation.  If it does not have a reservation window, instead of
1472  * looking for a free bit on bitmap first, then look up the reservation list to
1473  * see if it is inside somebody else's reservation window, we try to allocate a
1474  * reservation window for it starting from the goal first. Then do the block
1475  * allocation within the reservation window.
1476  *
1477  * This will avoid keeping on searching the reservation list again and
1478  * again when somebody is looking for a free block (without
1479  * reservation), and there are lots of free blocks, but they are all
1480  * being reserved.
1481  *
1482  * We use a red-black tree for the per-filesystem reservation list.
1483  *
1484  */
1485 static ext4_grpblk_t
1486 ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1487                         ext4_group_t group, struct buffer_head *bitmap_bh,
1488                         ext4_grpblk_t grp_goal,
1489                         struct ext4_reserve_window_node * my_rsv,
1490                         unsigned long *count, int *errp)
1491 {
1492         ext4_fsblk_t group_first_block, group_last_block;
1493         ext4_grpblk_t ret = 0;
1494         int fatal;
1495         unsigned long num = *count;
1496
1497         *errp = 0;
1498
1499         /*
1500          * Make sure we use undo access for the bitmap, because it is critical
1501          * that we do the frozen_data COW on bitmap buffers in all cases even
1502          * if the buffer is in BJ_Forget state in the committing transaction.
1503          */
1504         BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1505         fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1506         if (fatal) {
1507                 *errp = fatal;
1508                 return -1;
1509         }
1510
1511         /*
1512          * we don't deal with reservation when
1513          * filesystem is mounted without reservation
1514          * or the file is not a regular file
1515          * or last attempt to allocate a block with reservation turned on failed
1516          */
1517         if (my_rsv == NULL ) {
1518                 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1519                                                 grp_goal, count, NULL);
1520                 goto out;
1521         }
1522         /*
1523          * grp_goal is a group relative block number (if there is a goal)
1524          * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
1525          * first block is a filesystem wide block number
1526          * first block is the block number of the first block in this group
1527          */
1528         group_first_block = ext4_group_first_block_no(sb, group);
1529         group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1530
1531         /*
1532          * Basically we will allocate a new block from inode's reservation
1533          * window.
1534          *
1535          * We need to allocate a new reservation window, if:
1536          * a) inode does not have a reservation window; or
1537          * b) last attempt to allocate a block from existing reservation
1538          *    failed; or
1539          * c) we come here with a goal and with a reservation window
1540          *
1541          * We do not need to allocate a new reservation window if we come here
1542          * at the beginning with a goal and the goal is inside the window, or
1543          * we don't have a goal but already have a reservation window.
1544          * then we could go to allocate from the reservation window directly.
1545          */
1546         while (1) {
1547                 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1548                         !goal_in_my_reservation(&my_rsv->rsv_window,
1549                                                 grp_goal, group, sb)) {
1550                         if (my_rsv->rsv_goal_size < *count)
1551                                 my_rsv->rsv_goal_size = *count;
1552                         ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1553                                                         group, bitmap_bh);
1554                         if (ret < 0)
1555                                 break;                  /* failed */
1556
1557                         if (!goal_in_my_reservation(&my_rsv->rsv_window,
1558                                                         grp_goal, group, sb))
1559                                 grp_goal = -1;
1560                 } else if (grp_goal >= 0) {
1561                         int curr = my_rsv->rsv_end -
1562                                         (grp_goal + group_first_block) + 1;
1563
1564                         if (curr < *count)
1565                                 try_to_extend_reservation(my_rsv, sb,
1566                                                         *count - curr);
1567                 }
1568
1569                 if ((my_rsv->rsv_start > group_last_block) ||
1570                                 (my_rsv->rsv_end < group_first_block)) {
1571                         rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1572                         BUG();
1573                 }
1574                 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1575                                            grp_goal, &num, &my_rsv->rsv_window);
1576                 if (ret >= 0) {
1577                         my_rsv->rsv_alloc_hit += num;
1578                         *count = num;
1579                         break;                          /* succeed */
1580                 }
1581                 num = *count;
1582         }
1583 out:
1584         if (ret >= 0) {
1585                 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1586                                         "bitmap block");
1587                 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1588                 if (fatal) {
1589                         *errp = fatal;
1590                         return -1;
1591                 }
1592                 return ret;
1593         }
1594
1595         BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1596         ext4_journal_release_buffer(handle, bitmap_bh);
1597         return ret;
1598 }
1599
1600 /**
1601  * ext4_has_free_blocks()
1602  * @sbi:        in-core super block structure.
1603  * @nblocks:    number of neeed blocks
1604  *
1605  * Check if filesystem has free blocks available for allocation.
1606  * Return the number of blocks avaible for allocation for this request
1607  * On success, return nblocks
1608  */
1609 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1610                                                 ext4_fsblk_t nblocks)
1611 {
1612         ext4_fsblk_t free_blocks;
1613         ext4_fsblk_t root_blocks = 0;
1614
1615         free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1616
1617         if (!capable(CAP_SYS_RESOURCE) &&
1618                 sbi->s_resuid != current->fsuid &&
1619                 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1620                 root_blocks = ext4_r_blocks_count(sbi->s_es);
1621 #ifdef CONFIG_SMP
1622         if (free_blocks - root_blocks < FBC_BATCH)
1623                 free_blocks =
1624                         percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
1625 #endif
1626         if (free_blocks - root_blocks < nblocks)
1627                 return free_blocks - root_blocks;
1628         return nblocks;
1629  }
1630
1631
1632 /**
1633  * ext4_should_retry_alloc()
1634  * @sb:                 super block
1635  * @retries             number of attemps has been made
1636  *
1637  * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
1638  * it is profitable to retry the operation, this function will wait
1639  * for the current or commiting transaction to complete, and then
1640  * return TRUE.
1641  *
1642  * if the total number of retries exceed three times, return FALSE.
1643  */
1644 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1645 {
1646         if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
1647                 return 0;
1648
1649         jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1650
1651         return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1652 }
1653
1654 /**
1655  * ext4_old_new_blocks() -- core block bitmap based block allocation function
1656  *
1657  * @handle:             handle to this transaction
1658  * @inode:              file inode
1659  * @goal:               given target block(filesystem wide)
1660  * @count:              target number of blocks to allocate
1661  * @errp:               error code
1662  *
1663  * ext4_old_new_blocks uses a goal block to assist allocation and look up
1664  * the block bitmap directly to do block allocation.  It tries to
1665  * allocate block(s) from the block group contains the goal block first. If
1666  * that fails, it will try to allocate block(s) from other block groups
1667  * without any specific goal block.
1668  *
1669  * This function is called when -o nomballoc mount option is enabled
1670  *
1671  */
1672 ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1673                         ext4_fsblk_t goal, unsigned long *count, int *errp)
1674 {
1675         struct buffer_head *bitmap_bh = NULL;
1676         struct buffer_head *gdp_bh;
1677         ext4_group_t group_no;
1678         ext4_group_t goal_group;
1679         ext4_grpblk_t grp_target_blk;   /* blockgroup relative goal block */
1680         ext4_grpblk_t grp_alloc_blk;    /* blockgroup-relative allocated block*/
1681         ext4_fsblk_t ret_block;         /* filesyetem-wide allocated block */
1682         ext4_group_t bgi;                       /* blockgroup iteration index */
1683         int fatal = 0, err;
1684         int performed_allocation = 0;
1685         ext4_grpblk_t free_blocks;      /* number of free blocks in a group */
1686         struct super_block *sb;
1687         struct ext4_group_desc *gdp;
1688         struct ext4_super_block *es;
1689         struct ext4_sb_info *sbi;
1690         struct ext4_reserve_window_node *my_rsv = NULL;
1691         struct ext4_block_alloc_info *block_i;
1692         unsigned short windowsz = 0;
1693         ext4_group_t ngroups;
1694         unsigned long num = *count;
1695
1696         sb = inode->i_sb;
1697         if (!sb) {
1698                 *errp = -ENODEV;
1699                 printk("ext4_new_block: nonexistent device");
1700                 return 0;
1701         }
1702
1703         sbi = EXT4_SB(sb);
1704         if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1705                 /*
1706                  * With delalloc we already reserved the blocks
1707                  */
1708                 *count = ext4_has_free_blocks(sbi, *count);
1709         }
1710         if (*count == 0) {
1711                 *errp = -ENOSPC;
1712                 return 0;       /*return with ENOSPC error */
1713         }
1714         num = *count;
1715
1716         /*
1717          * Check quota for allocation of this block.
1718          */
1719         if (DQUOT_ALLOC_BLOCK(inode, num)) {
1720                 *errp = -EDQUOT;
1721                 return 0;
1722         }
1723
1724         sbi = EXT4_SB(sb);
1725         es = EXT4_SB(sb)->s_es;
1726         ext4_debug("goal=%llu.\n", goal);
1727         /*
1728          * Allocate a block from reservation only when
1729          * filesystem is mounted with reservation(default,-o reservation), and
1730          * it's a regular file, and
1731          * the desired window size is greater than 0 (One could use ioctl
1732          * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1733          * reservation on that particular file)
1734          */
1735         block_i = EXT4_I(inode)->i_block_alloc_info;
1736         if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1737                 my_rsv = &block_i->rsv_window_node;
1738
1739         /*
1740          * First, test whether the goal block is free.
1741          */
1742         if (goal < le32_to_cpu(es->s_first_data_block) ||
1743             goal >= ext4_blocks_count(es))
1744                 goal = le32_to_cpu(es->s_first_data_block);
1745         ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1746         goal_group = group_no;
1747 retry_alloc:
1748         gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1749         if (!gdp)
1750                 goto io_error;
1751
1752         free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1753         /*
1754          * if there is not enough free blocks to make a new resevation
1755          * turn off reservation for this allocation
1756          */
1757         if (my_rsv && (free_blocks < windowsz)
1758                 && (rsv_is_empty(&my_rsv->rsv_window)))
1759                 my_rsv = NULL;
1760
1761         if (free_blocks > 0) {
1762                 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1763                 if (!bitmap_bh)
1764                         goto io_error;
1765                 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1766                                         group_no, bitmap_bh, grp_target_blk,
1767                                         my_rsv, &num, &fatal);
1768                 if (fatal)
1769                         goto out;
1770                 if (grp_alloc_blk >= 0)
1771                         goto allocated;
1772         }
1773
1774         ngroups = EXT4_SB(sb)->s_groups_count;
1775         smp_rmb();
1776
1777         /*
1778          * Now search the rest of the groups.  We assume that
1779          * group_no and gdp correctly point to the last group visited.
1780          */
1781         for (bgi = 0; bgi < ngroups; bgi++) {
1782                 group_no++;
1783                 if (group_no >= ngroups)
1784                         group_no = 0;
1785                 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1786                 if (!gdp)
1787                         goto io_error;
1788                 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1789                 /*
1790                  * skip this group if the number of
1791                  * free blocks is less than half of the reservation
1792                  * window size.
1793                  */
1794                 if (free_blocks <= (windowsz/2))
1795                         continue;
1796
1797                 brelse(bitmap_bh);
1798                 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1799                 if (!bitmap_bh)
1800                         goto io_error;
1801                 /*
1802                  * try to allocate block(s) from this group, without a goal(-1).
1803                  */
1804                 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1805                                         group_no, bitmap_bh, -1, my_rsv,
1806                                         &num, &fatal);
1807                 if (fatal)
1808                         goto out;
1809                 if (grp_alloc_blk >= 0)
1810                         goto allocated;
1811         }
1812         /*
1813          * We may end up a bogus ealier ENOSPC error due to
1814          * filesystem is "full" of reservations, but
1815          * there maybe indeed free blocks avaliable on disk
1816          * In this case, we just forget about the reservations
1817          * just do block allocation as without reservations.
1818          */
1819         if (my_rsv) {
1820                 my_rsv = NULL;
1821                 windowsz = 0;
1822                 group_no = goal_group;
1823                 goto retry_alloc;
1824         }
1825         /* No space left on the device */
1826         *errp = -ENOSPC;
1827         goto out;
1828
1829 allocated:
1830
1831         ext4_debug("using block group %lu(%d)\n",
1832                         group_no, gdp->bg_free_blocks_count);
1833
1834         BUFFER_TRACE(gdp_bh, "get_write_access");
1835         fatal = ext4_journal_get_write_access(handle, gdp_bh);
1836         if (fatal)
1837                 goto out;
1838
1839         ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1840
1841         if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1842             in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1843             in_range(ret_block, ext4_inode_table(sb, gdp),
1844                      EXT4_SB(sb)->s_itb_per_group) ||
1845             in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1846                      EXT4_SB(sb)->s_itb_per_group)) {
1847                 ext4_error(sb, "ext4_new_block",
1848                             "Allocating block in system zone - "
1849                             "blocks from %llu, length %lu",
1850                              ret_block, num);
1851                 /*
1852                  * claim_block marked the blocks we allocated
1853                  * as in use. So we may want to selectively
1854                  * mark some of the blocks as free
1855                  */
1856                 goto retry_alloc;
1857         }
1858
1859         performed_allocation = 1;
1860
1861 #ifdef CONFIG_JBD2_DEBUG
1862         {
1863                 struct buffer_head *debug_bh;
1864
1865                 /* Record bitmap buffer state in the newly allocated block */
1866                 debug_bh = sb_find_get_block(sb, ret_block);
1867                 if (debug_bh) {
1868                         BUFFER_TRACE(debug_bh, "state when allocated");
1869                         BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1870                         brelse(debug_bh);
1871                 }
1872         }
1873         jbd_lock_bh_state(bitmap_bh);
1874         spin_lock(sb_bgl_lock(sbi, group_no));
1875         if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1876                 int i;
1877
1878                 for (i = 0; i < num; i++) {
1879                         if (ext4_test_bit(grp_alloc_blk+i,
1880                                         bh2jh(bitmap_bh)->b_committed_data)) {
1881                                 printk("%s: block was unexpectedly set in "
1882                                         "b_committed_data\n", __func__);
1883                         }
1884                 }
1885         }
1886         ext4_debug("found bit %d\n", grp_alloc_blk);
1887         spin_unlock(sb_bgl_lock(sbi, group_no));
1888         jbd_unlock_bh_state(bitmap_bh);
1889 #endif
1890
1891         if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1892                 ext4_error(sb, "ext4_new_block",
1893                             "block(%llu) >= blocks count(%llu) - "
1894                             "block_group = %lu, es == %p ", ret_block,
1895                         ext4_blocks_count(es), group_no, es);
1896                 goto out;
1897         }
1898
1899         /*
1900          * It is up to the caller to add the new buffer to a journal
1901          * list of some description.  We don't know in advance whether
1902          * the caller wants to use it as metadata or data.
1903          */
1904         spin_lock(sb_bgl_lock(sbi, group_no));
1905         if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1906                 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1907         le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1908         gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1909         spin_unlock(sb_bgl_lock(sbi, group_no));
1910         if (!EXT4_I(inode)->i_delalloc_reserved_flag)
1911                 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1912
1913         if (sbi->s_log_groups_per_flex) {
1914                 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1915                 spin_lock(sb_bgl_lock(sbi, flex_group));
1916                 sbi->s_flex_groups[flex_group].free_blocks -= num;
1917                 spin_unlock(sb_bgl_lock(sbi, flex_group));
1918         }
1919
1920         BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1921         err = ext4_journal_dirty_metadata(handle, gdp_bh);
1922         if (!fatal)
1923                 fatal = err;
1924
1925         sb->s_dirt = 1;
1926         if (fatal)
1927                 goto out;
1928
1929         *errp = 0;
1930         brelse(bitmap_bh);
1931         DQUOT_FREE_BLOCK(inode, *count-num);
1932         *count = num;
1933         return ret_block;
1934
1935 io_error:
1936         *errp = -EIO;
1937 out:
1938         if (fatal) {
1939                 *errp = fatal;
1940                 ext4_std_error(sb, fatal);
1941         }
1942         /*
1943          * Undo the block allocation
1944          */
1945         if (!performed_allocation)
1946                 DQUOT_FREE_BLOCK(inode, *count);
1947         brelse(bitmap_bh);
1948         return 0;
1949 }
1950
1951 #define EXT4_META_BLOCK 0x1
1952
1953 static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
1954                                 ext4_lblk_t iblock, ext4_fsblk_t goal,
1955                                 unsigned long *count, int *errp, int flags)
1956 {
1957         struct ext4_allocation_request ar;
1958         ext4_fsblk_t ret;
1959
1960         if (!test_opt(inode->i_sb, MBALLOC)) {
1961                 return ext4_old_new_blocks(handle, inode, goal, count, errp);
1962         }
1963
1964         memset(&ar, 0, sizeof(ar));
1965         /* Fill with neighbour allocated blocks */
1966
1967         ar.inode = inode;
1968         ar.goal = goal;
1969         ar.len = *count;
1970         ar.logical = iblock;
1971
1972         if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
1973                 /* enable in-core preallocation for data block allocation */
1974                 ar.flags = EXT4_MB_HINT_DATA;
1975         else
1976                 /* disable in-core preallocation for non-regular files */
1977                 ar.flags = 0;
1978
1979         ret = ext4_mb_new_blocks(handle, &ar, errp);
1980         *count = ar.len;
1981         return ret;
1982 }
1983
1984 /*
1985  * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
1986  *
1987  * @handle:             handle to this transaction
1988  * @inode:              file inode
1989  * @goal:               given target block(filesystem wide)
1990  * @count:              total number of blocks need
1991  * @errp:               error code
1992  *
1993  * Return 1st allocated block numberon success, *count stores total account
1994  * error stores in errp pointer
1995  */
1996 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1997                 ext4_fsblk_t goal, unsigned long *count, int *errp)
1998 {
1999         ext4_fsblk_t ret;
2000         ret = do_blk_alloc(handle, inode, 0, goal,
2001                                 count, errp, EXT4_META_BLOCK);
2002         /*
2003          * Account for the allocated meta blocks
2004          */
2005         if (!(*errp)) {
2006                 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2007                 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2008                 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2009         }
2010         return ret;
2011 }
2012
2013 /*
2014  * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
2015  *
2016  * @handle:             handle to this transaction
2017  * @inode:              file inode
2018  * @goal:               given target block(filesystem wide)
2019  * @errp:               error code
2020  *
2021  * Return allocated block number on success
2022  */
2023 ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
2024                 ext4_fsblk_t goal, int *errp)
2025 {
2026         unsigned long count = 1;
2027         return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
2028 }
2029
2030 /*
2031  * ext4_new_blocks() -- allocate data blocks
2032  *
2033  * @handle:             handle to this transaction
2034  * @inode:              file inode
2035  * @goal:               given target block(filesystem wide)
2036  * @count:              total number of blocks need
2037  * @errp:               error code
2038  *
2039  * Return 1st allocated block numberon success, *count stores total account
2040  * error stores in errp pointer
2041  */
2042
2043 ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
2044                                 ext4_lblk_t iblock, ext4_fsblk_t goal,
2045                                 unsigned long *count, int *errp)
2046 {
2047         return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
2048 }
2049
2050 /**
2051  * ext4_count_free_blocks() -- count filesystem free blocks
2052  * @sb:         superblock
2053  *
2054  * Adds up the number of free blocks from each block group.
2055  */
2056 ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
2057 {
2058         ext4_fsblk_t desc_count;
2059         struct ext4_group_desc *gdp;
2060         ext4_group_t i;
2061         ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2062 #ifdef EXT4FS_DEBUG
2063         struct ext4_super_block *es;
2064         ext4_fsblk_t bitmap_count;
2065         unsigned long x;
2066         struct buffer_head *bitmap_bh = NULL;
2067
2068         es = EXT4_SB(sb)->s_es;
2069         desc_count = 0;
2070         bitmap_count = 0;
2071         gdp = NULL;
2072
2073         smp_rmb();
2074         for (i = 0; i < ngroups; i++) {
2075                 gdp = ext4_get_group_desc(sb, i, NULL);
2076                 if (!gdp)
2077                         continue;
2078                 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
2079                 brelse(bitmap_bh);
2080                 bitmap_bh = ext4_read_block_bitmap(sb, i);
2081                 if (bitmap_bh == NULL)
2082                         continue;
2083
2084                 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
2085                 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
2086                         i, le16_to_cpu(gdp->bg_free_blocks_count), x);
2087                 bitmap_count += x;
2088         }
2089         brelse(bitmap_bh);
2090         printk("ext4_count_free_blocks: stored = %llu"
2091                 ", computed = %llu, %llu\n",
2092                 ext4_free_blocks_count(es),
2093                 desc_count, bitmap_count);
2094         return bitmap_count;
2095 #else
2096         desc_count = 0;
2097         smp_rmb();
2098         for (i = 0; i < ngroups; i++) {
2099                 gdp = ext4_get_group_desc(sb, i, NULL);
2100                 if (!gdp)
2101                         continue;
2102                 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
2103         }
2104
2105         return desc_count;
2106 #endif
2107 }
2108
2109 static inline int test_root(ext4_group_t a, int b)
2110 {
2111         int num = b;
2112
2113         while (a > num)
2114                 num *= b;
2115         return num == a;
2116 }
2117
2118 static int ext4_group_sparse(ext4_group_t group)
2119 {
2120         if (group <= 1)
2121                 return 1;
2122         if (!(group & 1))
2123                 return 0;
2124         return (test_root(group, 7) || test_root(group, 5) ||
2125                 test_root(group, 3));
2126 }
2127
2128 /**
2129  *      ext4_bg_has_super - number of blocks used by the superblock in group
2130  *      @sb: superblock for filesystem
2131  *      @group: group number to check
2132  *
2133  *      Return the number of blocks used by the superblock (primary or backup)
2134  *      in this group.  Currently this will be only 0 or 1.
2135  */
2136 int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
2137 {
2138         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
2139                                 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
2140                         !ext4_group_sparse(group))
2141                 return 0;
2142         return 1;
2143 }
2144
2145 static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
2146                                         ext4_group_t group)
2147 {
2148         unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
2149         ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
2150         ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
2151
2152         if (group == first || group == first + 1 || group == last)
2153                 return 1;
2154         return 0;
2155 }
2156
2157 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
2158                                         ext4_group_t group)
2159 {
2160         return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
2161 }
2162
2163 /**
2164  *      ext4_bg_num_gdb - number of blocks used by the group table in group
2165  *      @sb: superblock for filesystem
2166  *      @group: group number to check
2167  *
2168  *      Return the number of blocks used by the group descriptor table
2169  *      (primary or backup) in this group.  In the future there may be a
2170  *      different number of descriptor blocks in each group.
2171  */
2172 unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
2173 {
2174         unsigned long first_meta_bg =
2175                         le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
2176         unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
2177
2178         if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
2179                         metagroup < first_meta_bg)
2180                 return ext4_bg_num_gdb_nometa(sb,group);
2181
2182         return ext4_bg_num_gdb_meta(sb,group);
2183
2184 }