Btrfs: try to keep a healthy ratio of metadata vs data block groups
fs/btrfs/extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include "compat.h"
25 #include "hash.h"
26 #include "crc32c.h"
27 #include "ctree.h"
28 #include "disk-io.h"
29 #include "print-tree.h"
30 #include "transaction.h"
31 #include "volumes.h"
32 #include "locking.h"
33 #include "ref-cache.h"
34 #include "free-space-cache.h"
35
36 #define PENDING_EXTENT_INSERT 0
37 #define PENDING_EXTENT_DELETE 1
38 #define PENDING_BACKREF_UPDATE 2
39
40 struct pending_extent_op {
41         int type;
42         u64 bytenr;
43         u64 num_bytes;
44         u64 parent;
45         u64 orig_parent;
46         u64 generation;
47         u64 orig_generation;
48         int level;
49         struct list_head list;
50         int del;
51 };
52
53 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
54                                          struct btrfs_root *root, u64 parent,
55                                          u64 root_objectid, u64 ref_generation,
56                                          u64 owner, struct btrfs_key *ins,
57                                          int ref_mod);
58 static int update_reserved_extents(struct btrfs_root *root,
59                                    u64 bytenr, u64 num, int reserve);
60 static int update_block_group(struct btrfs_trans_handle *trans,
61                               struct btrfs_root *root,
62                               u64 bytenr, u64 num_bytes, int alloc,
63                               int mark_free);
64 static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
65                                         struct btrfs_root *root,
66                                         u64 bytenr, u64 num_bytes, u64 parent,
67                                         u64 root_objectid, u64 ref_generation,
68                                         u64 owner_objectid, int pin,
69                                         int ref_to_drop);
70
71 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
72                           struct btrfs_root *extent_root, u64 alloc_bytes,
73                           u64 flags, int force);
74
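/*
 * helper to check that a block group's flags include all of the given
 * allocation bits (data/metadata/system and any raid profile)
 */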
75 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
76 {
77         return (cache->flags & bits) == bits;
78 }
79
80 /*
81  * this adds the block group to the fs_info rb tree for the block group
82  * cache
83  */
84 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
85                                 struct btrfs_block_group_cache *block_group)
86 {
87         struct rb_node **p;
88         struct rb_node *parent = NULL;
89         struct btrfs_block_group_cache *cache;
90
91         spin_lock(&info->block_group_cache_lock);
92         p = &info->block_group_cache_tree.rb_node;
93
94         while (*p) {
95                 parent = *p;
96                 cache = rb_entry(parent, struct btrfs_block_group_cache,
97                                  cache_node);
98                 if (block_group->key.objectid < cache->key.objectid) {
99                         p = &(*p)->rb_left;
100                 } else if (block_group->key.objectid > cache->key.objectid) {
101                         p = &(*p)->rb_right;
102                 } else {
103                         spin_unlock(&info->block_group_cache_lock);
104                         return -EEXIST;
105                 }
106         }
107
108         rb_link_node(&block_group->cache_node, parent, p);
109         rb_insert_color(&block_group->cache_node,
110                         &info->block_group_cache_tree);
111         spin_unlock(&info->block_group_cache_lock);
112
113         return 0;
114 }
115
116 /*
117  * This will return the block group at or after bytenr if contains is 0, else
118  * it will return the block group that contains the bytenr
119  */
120 static struct btrfs_block_group_cache *
121 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
122                               int contains)
123 {
124         struct btrfs_block_group_cache *cache, *ret = NULL;
125         struct rb_node *n;
126         u64 end, start;
127
128         spin_lock(&info->block_group_cache_lock);
129         n = info->block_group_cache_tree.rb_node;
130
131         while (n) {
132                 cache = rb_entry(n, struct btrfs_block_group_cache,
133                                  cache_node);
134                 end = cache->key.objectid + cache->key.offset - 1;
135                 start = cache->key.objectid;
136
137                 if (bytenr < start) {
138                         if (!contains && (!ret || start < ret->key.objectid))
139                                 ret = cache;
140                         n = n->rb_left;
141                 } else if (bytenr > start) {
142                         if (contains && bytenr <= end) {
143                                 ret = cache;
144                                 break;
145                         }
146                         n = n->rb_right;
147                 } else {
148                         ret = cache;
149                         break;
150                 }
151         }
152         if (ret)
153                 atomic_inc(&ret->count);
154         spin_unlock(&info->block_group_cache_lock);
155
156         return ret;
157 }
158
159 /*
160  * this is only called by cache_block_group.  Since we could have freed
161  * extents, we need to check pinned_extents for anything that can't be used
162  * yet; that free space isn't released until the transaction commits.
163  */
164 static int add_new_free_space(struct btrfs_block_group_cache *block_group,
165                               struct btrfs_fs_info *info, u64 start, u64 end)
166 {
167         u64 extent_start, extent_end, size;
168         int ret;
169
170         while (start < end) {
171                 ret = find_first_extent_bit(&info->pinned_extents, start,
172                                             &extent_start, &extent_end,
173                                             EXTENT_DIRTY);
174                 if (ret)
175                         break;
176
177                 if (extent_start == start) {
178                         start = extent_end + 1;
179                 } else if (extent_start > start && extent_start < end) {
180                         size = extent_start - start;
181                         ret = btrfs_add_free_space(block_group, start,
182                                                    size);
183                         BUG_ON(ret);
184                         start = extent_end + 1;
185                 } else {
186                         break;
187                 }
188         }
189
190         if (start < end) {
191                 size = end - start;
192                 ret = btrfs_add_free_space(block_group, start, size);
193                 BUG_ON(ret);
194         }
195
196         return 0;
197 }
198
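/*
 * the super block mirrors live inside block groups but are never tracked
 * in the extent tree, so carve their stripes back out of the free space
 * cache to make sure the allocator never hands them out
 */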
199 static int remove_sb_from_cache(struct btrfs_root *root,
200                                 struct btrfs_block_group_cache *cache)
201 {
202         u64 bytenr;
203         u64 *logical;
204         int stripe_len;
205         int i, nr, ret;
206
207         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
208                 bytenr = btrfs_sb_offset(i);
209                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
210                                        cache->key.objectid, bytenr, 0,
211                                        &logical, &nr, &stripe_len);
212                 BUG_ON(ret);
213                 while (nr--) {
214                         btrfs_remove_free_space(cache, logical[nr],
215                                                 stripe_len);
216                 }
217                 kfree(logical);
218         }
219         return 0;
220 }
221
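/*
 * build the free space cache for a block group by walking the extent
 * tree: every gap between allocated extents (and past the last one) is
 * recorded as free space, minus anything still pinned or used by the
 * super block mirrors
 */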
222 static int cache_block_group(struct btrfs_root *root,
223                              struct btrfs_block_group_cache *block_group)
224 {
225         struct btrfs_path *path;
226         int ret = 0;
227         struct btrfs_key key;
228         struct extent_buffer *leaf;
229         int slot;
230         u64 last;
231
232         if (!block_group)
233                 return 0;
234
235         root = root->fs_info->extent_root;
236
237         if (block_group->cached)
238                 return 0;
239
240         path = btrfs_alloc_path();
241         if (!path)
242                 return -ENOMEM;
243
244         path->reada = 2;
245         /*
246          * we get into deadlocks with paths held by callers of this function.
247          * since the alloc_mutex is protecting things right now, just
248          * skip the locking here
249          */
250         path->skip_locking = 1;
251         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
252         key.objectid = last;
253         key.offset = 0;
254         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
255         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
256         if (ret < 0)
257                 goto err;
258
259         while (1) {
260                 leaf = path->nodes[0];
261                 slot = path->slots[0];
262                 if (slot >= btrfs_header_nritems(leaf)) {
263                         ret = btrfs_next_leaf(root, path);
264                         if (ret < 0)
265                                 goto err;
266                         if (ret == 0)
267                                 continue;
268                         else
269                                 break;
270                 }
271                 btrfs_item_key_to_cpu(leaf, &key, slot);
272                 if (key.objectid < block_group->key.objectid)
273                         goto next;
274
275                 if (key.objectid >= block_group->key.objectid +
276                     block_group->key.offset)
277                         break;
278
279                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
280                         add_new_free_space(block_group, root->fs_info, last,
281                                            key.objectid);
282
283                         last = key.objectid + key.offset;
284                 }
285 next:
286                 path->slots[0]++;
287         }
288
289         add_new_free_space(block_group, root->fs_info, last,
290                            block_group->key.objectid +
291                            block_group->key.offset);
292
293         block_group->cached = 1;
294         remove_sb_from_cache(root, block_group);
295         ret = 0;
296 err:
297         btrfs_free_path(path);
298         return ret;
299 }
300
301 /*
302  * return the block group that starts at or after bytenr
303  */
304 static struct btrfs_block_group_cache *
305 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
306 {
307         struct btrfs_block_group_cache *cache;
308
309         cache = block_group_cache_tree_search(info, bytenr, 0);
310
311         return cache;
312 }
313
314 /*
315  * return the block group that contains the given bytenr
316  */
317 struct btrfs_block_group_cache *btrfs_lookup_block_group(
318                                                  struct btrfs_fs_info *info,
319                                                  u64 bytenr)
320 {
321         struct btrfs_block_group_cache *cache;
322
323         cache = block_group_cache_tree_search(info, bytenr, 1);
324
325         return cache;
326 }
327
328 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
329 {
330         if (atomic_dec_and_test(&cache->count))
331                 kfree(cache);
332 }
333
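/* find the space_info struct tracking the given allocation flags, if any */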
334 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
335                                                   u64 flags)
336 {
337         struct list_head *head = &info->space_info;
338         struct btrfs_space_info *found;
339
340         rcu_read_lock();
341         list_for_each_entry_rcu(found, head, list) {
342                 if (found->flags == flags) {
343                         rcu_read_unlock();
344                         return found;
345                 }
346         }
347         rcu_read_unlock();
348         return NULL;
349 }
350
351 /*
352  * after adding space to the filesystem, we need to clear the full flags
353  * on all the space infos.
354  */
355 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
356 {
357         struct list_head *head = &info->space_info;
358         struct btrfs_space_info *found;
359
360         rcu_read_lock();
361         list_for_each_entry_rcu(found, head, list)
362                 found->full = 0;
363         rcu_read_unlock();
364 }
365
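/*
 * scale num by factor tenths: div_factor(100, 9) == 90.  A factor of 10
 * returns num unchanged.
 */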
366 static u64 div_factor(u64 num, int factor)
367 {
368         if (factor == 10)
369                 return num;
370         num *= factor;
371         do_div(num, 10);
372         return num;
373 }
374
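/*
 * scan for a metadata block group that is less than factor/10 full,
 * starting from the search hint.  If nothing matches, retry from
 * search_start and finally fall back to a full search of all groups.
 * Returns the start of a suitable group, or zero.
 */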
375 u64 btrfs_find_block_group(struct btrfs_root *root,
376                            u64 search_start, u64 search_hint, int owner)
377 {
378         struct btrfs_block_group_cache *cache;
379         u64 used;
380         u64 last = max(search_hint, search_start);
381         u64 group_start = 0;
382         int full_search = 0;
383         int factor = 9;
384         int wrapped = 0;
385 again:
386         while (1) {
387                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
388                 if (!cache)
389                         break;
390
391                 spin_lock(&cache->lock);
392                 last = cache->key.objectid + cache->key.offset;
393                 used = btrfs_block_group_used(&cache->item);
394
395                 if ((full_search || !cache->ro) &&
396                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
397                         if (used + cache->pinned + cache->reserved <
398                             div_factor(cache->key.offset, factor)) {
399                                 group_start = cache->key.objectid;
400                                 spin_unlock(&cache->lock);
401                                 btrfs_put_block_group(cache);
402                                 goto found;
403                         }
404                 }
405                 spin_unlock(&cache->lock);
406                 btrfs_put_block_group(cache);
407                 cond_resched();
408         }
409         if (!wrapped) {
410                 last = search_start;
411                 wrapped = 1;
412                 goto again;
413         }
414         if (!full_search && factor < 10) {
415                 last = search_start;
416                 full_search = 1;
417                 factor = 10;
418                 goto again;
419         }
420 found:
421         return group_start;
422 }
423
424 /* simple helper to search for an existing extent at a given offset */
425 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
426 {
427         int ret;
428         struct btrfs_key key;
429         struct btrfs_path *path;
430
431         path = btrfs_alloc_path();
432         BUG_ON(!path);
433         key.objectid = start;
434         key.offset = len;
435         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
436         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
437                                 0, 0);
438         btrfs_free_path(path);
439         return ret;
440 }
441
442 /*
443  * Back reference rules.  Back refs have three main goals:
444  *
445  * 1) differentiate between all holders of references to an extent so that
446  *    when a reference is dropped we can make sure it was a valid reference
447  *    before freeing the extent.
448  *
449  * 2) Provide enough information to quickly find the holders of an extent
450  *    if we notice a given block is corrupted or bad.
451  *
452  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
453  *    maintenance.  This is actually the same as #2, but with a slightly
454  *    different use case.
455  *
456  * File extents can be referenced by:
457  *
458  * - multiple snapshots, subvolumes, or different generations in one subvol
459  * - different files inside a single subvolume
460  * - different offsets inside a file (bookend extents in file.c)
461  *
462  * The extent ref structure has fields for:
463  *
464  * - Objectid of the subvolume root
465  * - Generation number of the tree holding the reference
466  * - objectid of the file holding the reference
467  * - number of references held by the parent node (always 1 for tree blocks)
468  *
469  * A btree leaf may hold multiple references to a file extent. In most
470  * cases these references are from the same file, and the corresponding
471  * offsets inside the file are close together.
472  *
473  * When a file extent is allocated the fields are filled in:
474  *     (root_key.objectid, trans->transid, inode objectid, 1)
475  *
476  * When a leaf is cow'd new references are added for every file extent found
477  * in the leaf.  It looks similar to the create case, but trans->transid will
478  * be different when the block is cow'd.
479  *
480  *     (root_key.objectid, trans->transid, inode objectid,
481  *      number of references in the leaf)
482  *
483  * When a file extent is removed either during snapshot deletion or
484  * file truncation, we find the corresponding back reference and check
485  * the following fields:
486  *
487  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
488  *      inode objectid)
489  *
490  * Btree extents can be referenced by:
491  *
492  * - Different subvolumes
493  * - Different generations of the same subvolume
494  *
495  * When a tree block is created, back references are inserted:
496  *
497  * (root->root_key.objectid, trans->transid, level, 1)
498  *
499  * When a tree block is cow'd, new back references are added for all the
500  * blocks it points to. If the tree block isn't in a reference counted root,
501  * the old back references are removed. These new back references are of
502  * the form (trans->transid will have increased since creation):
503  *
504  * (root->root_key.objectid, trans->transid, level, 1)
505  *
506  * When a backref is being deleted, the following fields are checked:
507  *
508  * if backref was for a tree root:
509  *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
510  * else
511  *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
512  *
513  * Back Reference Key composition:
514  *
515  * The key objectid corresponds to the first byte in the extent, the key
516  * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
517  * byte of the parent extent. If an extent is a tree root, the key offset
518  * is set to the key objectid.
519  */
520
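/*
 * find the backref item for an extent given its parent, and verify that
 * the root, generation and owner recorded there match what the caller
 * expects.  When del is set, the search is done with ins_len == -1 so
 * the path comes back ready for a delete.
 */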
521 static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
522                                           struct btrfs_root *root,
523                                           struct btrfs_path *path,
524                                           u64 bytenr, u64 parent,
525                                           u64 ref_root, u64 ref_generation,
526                                           u64 owner_objectid, int del)
527 {
528         struct btrfs_key key;
529         struct btrfs_extent_ref *ref;
530         struct extent_buffer *leaf;
531         u64 ref_objectid;
532         int ret;
533
534         key.objectid = bytenr;
535         key.type = BTRFS_EXTENT_REF_KEY;
536         key.offset = parent;
537
538         ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
539         if (ret < 0)
540                 goto out;
541         if (ret > 0) {
542                 ret = -ENOENT;
543                 goto out;
544         }
545
546         leaf = path->nodes[0];
547         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
548         ref_objectid = btrfs_ref_objectid(leaf, ref);
549         if (btrfs_ref_root(leaf, ref) != ref_root ||
550             btrfs_ref_generation(leaf, ref) != ref_generation ||
551             (ref_objectid != owner_objectid &&
552              ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
553                 ret = -EIO;
554                 WARN_ON(1);
555                 goto out;
556         }
557         ret = 0;
558 out:
559         return ret;
560 }
561
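/*
 * insert a new backref item for this extent/parent pair, or, if one
 * already exists (-EEXIST from the insert), fold refs_to_add into its
 * ref count and mark the owner as BTRFS_MULTIPLE_OBJECTIDS when the
 * owners differ
 */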
562 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
563                                           struct btrfs_root *root,
564                                           struct btrfs_path *path,
565                                           u64 bytenr, u64 parent,
566                                           u64 ref_root, u64 ref_generation,
567                                           u64 owner_objectid,
568                                           int refs_to_add)
569 {
570         struct btrfs_key key;
571         struct extent_buffer *leaf;
572         struct btrfs_extent_ref *ref;
573         u32 num_refs;
574         int ret;
575
576         key.objectid = bytenr;
577         key.type = BTRFS_EXTENT_REF_KEY;
578         key.offset = parent;
579
580         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
581         if (ret == 0) {
582                 leaf = path->nodes[0];
583                 ref = btrfs_item_ptr(leaf, path->slots[0],
584                                      struct btrfs_extent_ref);
585                 btrfs_set_ref_root(leaf, ref, ref_root);
586                 btrfs_set_ref_generation(leaf, ref, ref_generation);
587                 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
588                 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
589         } else if (ret == -EEXIST) {
590                 u64 existing_owner;
591
592                 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
593                 leaf = path->nodes[0];
594                 ref = btrfs_item_ptr(leaf, path->slots[0],
595                                      struct btrfs_extent_ref);
596                 if (btrfs_ref_root(leaf, ref) != ref_root ||
597                     btrfs_ref_generation(leaf, ref) != ref_generation) {
598                         ret = -EIO;
599                         WARN_ON(1);
600                         goto out;
601                 }
602
603                 num_refs = btrfs_ref_num_refs(leaf, ref);
604                 BUG_ON(num_refs == 0);
605                 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
606
607                 existing_owner = btrfs_ref_objectid(leaf, ref);
608                 if (existing_owner != owner_objectid &&
609                     existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
610                         btrfs_set_ref_objectid(leaf, ref,
611                                         BTRFS_MULTIPLE_OBJECTIDS);
612                 }
613                 ret = 0;
614         } else {
615                 goto out;
616         }
617         btrfs_unlock_up_safe(path, 1);
618         btrfs_mark_buffer_dirty(path->nodes[0]);
619 out:
620         btrfs_release_path(root, path);
621         return ret;
622 }
623
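/*
 * drop refs_to_drop references from the backref item the path points at,
 * removing the item entirely once its count reaches zero
 */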
624 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
625                                           struct btrfs_root *root,
626                                           struct btrfs_path *path,
627                                           int refs_to_drop)
628 {
629         struct extent_buffer *leaf;
630         struct btrfs_extent_ref *ref;
631         u32 num_refs;
632         int ret = 0;
633
634         leaf = path->nodes[0];
635         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
636         num_refs = btrfs_ref_num_refs(leaf, ref);
637         BUG_ON(num_refs < refs_to_drop);
638         num_refs -= refs_to_drop;
639         if (num_refs == 0) {
640                 ret = btrfs_del_item(trans, root, path);
641         } else {
642                 btrfs_set_ref_num_refs(leaf, ref, num_refs);
643                 btrfs_mark_buffer_dirty(leaf);
644         }
645         btrfs_release_path(root, path);
646         return ret;
647 }
648
649 #ifdef BIO_RW_DISCARD
650 static void btrfs_issue_discard(struct block_device *bdev,
651                                 u64 start, u64 len)
652 {
653         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
654 }
655 #endif
656
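/*
 * map a logical extent back to its physical stripes and issue a discard
 * for each one, letting the block device reclaim the space (no-op when
 * the kernel has no BIO_RW_DISCARD support)
 */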
657 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
658                                 u64 num_bytes)
659 {
660 #ifdef BIO_RW_DISCARD
661         int ret;
662         u64 map_length = num_bytes;
663         struct btrfs_multi_bio *multi = NULL;
664
665         /* Tell the block device(s) that the sectors can be discarded */
666         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
667                               bytenr, &map_length, &multi, 0);
668         if (!ret) {
669                 struct btrfs_bio_stripe *stripe = multi->stripes;
670                 int i;
671
672                 if (map_length > num_bytes)
673                         map_length = num_bytes;
674
675                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
676                         btrfs_issue_discard(stripe->dev->bdev,
677                                             stripe->physical,
678                                             map_length);
679                 }
680                 kfree(multi);
681         }
682
683         return ret;
684 #else
685         return 0;
686 #endif
687 }
688
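/*
 * queue a delayed ref that moves an existing reference from the original
 * parent/root/generation to the new ones.  Tree block extents (owner
 * below BTRFS_FIRST_FREE_OBJECTID) are flagged for pinning when the ref
 * is finally dropped.
 */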
689 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
690                                      struct btrfs_root *root, u64 bytenr,
691                                      u64 num_bytes,
692                                      u64 orig_parent, u64 parent,
693                                      u64 orig_root, u64 ref_root,
694                                      u64 orig_generation, u64 ref_generation,
695                                      u64 owner_objectid)
696 {
697         int ret;
698         int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
699
700         ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
701                                        orig_parent, parent, orig_root,
702                                        ref_root, orig_generation,
703                                        ref_generation, owner_objectid, pin);
704         BUG_ON(ret);
705         return ret;
706 }
707
708 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
709                             struct btrfs_root *root, u64 bytenr,
710                             u64 num_bytes, u64 orig_parent, u64 parent,
711                             u64 ref_root, u64 ref_generation,
712                             u64 owner_objectid)
713 {
714         int ret;
715         if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
716             owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
717                 return 0;
718
719         ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
720                                         orig_parent, parent, ref_root,
721                                         ref_root, ref_generation,
722                                         ref_generation, owner_objectid);
723         return ret;
724 }
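
/*
 * queue a delayed ref that adds references to an extent; the extent tree
 * itself is only updated later, when the delayed refs are run
 */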
725 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
726                                   struct btrfs_root *root, u64 bytenr,
727                                   u64 num_bytes,
728                                   u64 orig_parent, u64 parent,
729                                   u64 orig_root, u64 ref_root,
730                                   u64 orig_generation, u64 ref_generation,
731                                   u64 owner_objectid)
732 {
733         int ret;
734
735         ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
736                                     ref_generation, owner_objectid,
737                                     BTRFS_ADD_DELAYED_REF, 0);
738         BUG_ON(ret);
739         return ret;
740 }
741
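/*
 * synchronous half of adding references, called when delayed refs are
 * run: bump the ref count in the extent item for bytenr and then insert
 * the matching backref item
 */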
742 static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
743                           struct btrfs_root *root, u64 bytenr,
744                           u64 num_bytes, u64 parent, u64 ref_root,
745                           u64 ref_generation, u64 owner_objectid,
746                           int refs_to_add)
747 {
748         struct btrfs_path *path;
749         int ret;
750         struct btrfs_key key;
751         struct extent_buffer *l;
752         struct btrfs_extent_item *item;
753         u32 refs;
754
755         path = btrfs_alloc_path();
756         if (!path)
757                 return -ENOMEM;
758
759         path->reada = 1;
760         path->leave_spinning = 1;
761         key.objectid = bytenr;
762         key.type = BTRFS_EXTENT_ITEM_KEY;
763         key.offset = num_bytes;
764
765         /* first find the extent item and update its reference count */
766         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
767                                 path, 0, 1);
768         if (ret < 0) {
769                 btrfs_free_path(path);
770                 return ret;
771         }
772
773         if (ret > 0) {
774                 WARN_ON(1);
775                 btrfs_free_path(path);
776                 return -EIO;
777         }
778         l = path->nodes[0];
779
780         btrfs_item_key_to_cpu(l, &key, path->slots[0]);
781         if (key.objectid != bytenr) {
782                 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
783                 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
784                        (unsigned long long)bytenr,
785                        (unsigned long long)key.objectid);
786                 BUG();
787         }
788         BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
789
790         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791
792         refs = btrfs_extent_refs(l, item);
793         btrfs_set_extent_refs(l, item, refs + refs_to_add);
794         btrfs_unlock_up_safe(path, 1);
795
796         btrfs_mark_buffer_dirty(path->nodes[0]);
797
798         btrfs_release_path(root->fs_info->extent_root, path);
799
800         path->reada = 1;
801         path->leave_spinning = 1;
802
803         /* now insert the actual backref */
804         ret = insert_extent_backref(trans, root->fs_info->extent_root,
805                                     path, bytenr, parent,
806                                     ref_root, ref_generation,
807                                     owner_objectid, refs_to_add);
808         BUG_ON(ret);
809         btrfs_free_path(path);
810         return 0;
811 }
812
813 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
814                          struct btrfs_root *root,
815                          u64 bytenr, u64 num_bytes, u64 parent,
816                          u64 ref_root, u64 ref_generation,
817                          u64 owner_objectid)
818 {
819         int ret;
820         if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
821             owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
822                 return 0;
823
824         ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
825                                      0, ref_root, 0, ref_generation,
826                                      owner_objectid);
827         return ret;
828 }
829
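/* helper to process a queued drop: free the refs recorded in the node */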
830 static int drop_delayed_ref(struct btrfs_trans_handle *trans,
831                                         struct btrfs_root *root,
832                                         struct btrfs_delayed_ref_node *node)
833 {
834         int ret = 0;
835         struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
836
837         BUG_ON(node->ref_mod == 0);
838         ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
839                                   node->parent, ref->root, ref->generation,
840                                   ref->owner_objectid, ref->pin, node->ref_mod);
841
842         return ret;
843 }
844
845 /* helper function to actually process a single delayed ref entry */
846 static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
847                                         struct btrfs_root *root,
848                                         struct btrfs_delayed_ref_node *node,
849                                         int insert_reserved)
850 {
851         int ret;
852         struct btrfs_delayed_ref *ref;
853
854         if (node->parent == (u64)-1) {
855                 struct btrfs_delayed_ref_head *head;
856                 /*
857                  * we've hit the end of the chain and we were supposed
858                  * to insert this extent into the tree.  But it got
859                  * deleted before we ever needed to insert it, so all
860                  * we have to do is clean up the accounting
861                  */
862                 if (insert_reserved) {
863                         update_reserved_extents(root, node->bytenr,
864                                                 node->num_bytes, 0);
865                 }
866                 head = btrfs_delayed_node_to_head(node);
867                 mutex_unlock(&head->mutex);
868                 return 0;
869         }
870
871         ref = btrfs_delayed_node_to_ref(node);
872         if (ref->action == BTRFS_ADD_DELAYED_REF) {
873                 if (insert_reserved) {
874                         struct btrfs_key ins;
875
876                         ins.objectid = node->bytenr;
877                         ins.offset = node->num_bytes;
878                         ins.type = BTRFS_EXTENT_ITEM_KEY;
879
880                         /* record the full extent allocation */
881                         ret = __btrfs_alloc_reserved_extent(trans, root,
882                                         node->parent, ref->root,
883                                         ref->generation, ref->owner_objectid,
884                                         &ins, node->ref_mod);
885                         update_reserved_extents(root, node->bytenr,
886                                                 node->num_bytes, 0);
887                 } else {
888                         /* just add one backref */
889                         ret = add_extent_ref(trans, root, node->bytenr,
890                                      node->num_bytes,
891                                      node->parent, ref->root, ref->generation,
892                                      ref->owner_objectid, node->ref_mod);
893                 }
894                 BUG_ON(ret);
895         } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
896                 WARN_ON(insert_reserved);
897                 ret = drop_delayed_ref(trans, root, node);
898         }
899         return 0;
900 }
901
902 static noinline struct btrfs_delayed_ref_node *
903 select_delayed_ref(struct btrfs_delayed_ref_head *head)
904 {
905         struct rb_node *node;
906         struct btrfs_delayed_ref_node *ref;
907         int action = BTRFS_ADD_DELAYED_REF;
908 again:
909         /*
910          * select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
911          * this prevents the ref count from going down to zero while
912          * there are still pending delayed refs.
913          */
914         node = rb_prev(&head->node.rb_node);
915         while (1) {
916                 if (!node)
917                         break;
918                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
919                                 rb_node);
920                 if (ref->bytenr != head->node.bytenr)
921                         break;
922                 if (btrfs_delayed_node_to_ref(ref)->action == action)
923                         return ref;
924                 node = rb_prev(node);
925         }
926         if (action == BTRFS_ADD_DELAYED_REF) {
927                 action = BTRFS_DROP_DELAYED_REF;
928                 goto again;
929         }
930         return NULL;
931 }
932
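/*
 * run the delayed refs for all of the heads on the cluster list.  Called
 * with delayed_refs->lock held; the lock is dropped while each ref is
 * processed and retaken afterwards.  Returns the number of refs handled.
 */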
933 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
934                                        struct btrfs_root *root,
935                                        struct list_head *cluster)
936 {
937         struct btrfs_delayed_ref_root *delayed_refs;
938         struct btrfs_delayed_ref_node *ref;
939         struct btrfs_delayed_ref_head *locked_ref = NULL;
940         int ret;
941         int count = 0;
942         int must_insert_reserved = 0;
943
944         delayed_refs = &trans->transaction->delayed_refs;
945         while (1) {
946                 if (!locked_ref) {
947                         /* pick a new head ref from the cluster list */
948                         if (list_empty(cluster))
949                                 break;
950
951                         locked_ref = list_entry(cluster->next,
952                                      struct btrfs_delayed_ref_head, cluster);
953
954                         /* grab the lock that says we are going to process
955                          * all the refs for this head */
956                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
957
958                         /*
959                          * we may have dropped the spin lock to get the head
960                          * mutex lock, and that might have given someone else
961                          * time to free the head.  If that's true, it has been
962                          * removed from our list and we can move on.
963                          */
964                         if (ret == -EAGAIN) {
965                                 locked_ref = NULL;
966                                 count++;
967                                 continue;
968                         }
969                 }
970
971                 /*
972                  * record the must insert reserved flag before we
973                  * drop the spin lock.
974                  */
975                 must_insert_reserved = locked_ref->must_insert_reserved;
976                 locked_ref->must_insert_reserved = 0;
977
978                 /*
979                  * locked_ref is the head node, so we have to go one
980                  * node back for any delayed ref updates
981                  */
982                 ref = select_delayed_ref(locked_ref);
983                 if (!ref) {
984                         /* All delayed refs have been processed; go ahead
985                          * and send the head node to run_one_delayed_ref,
986                          * so that any accounting fixes can happen
987                          */
988                         ref = &locked_ref->node;
989                         list_del_init(&locked_ref->cluster);
990                         locked_ref = NULL;
991                 }
992
993                 ref->in_tree = 0;
994                 rb_erase(&ref->rb_node, &delayed_refs->root);
995                 delayed_refs->num_entries--;
996                 spin_unlock(&delayed_refs->lock);
997
998                 ret = run_one_delayed_ref(trans, root, ref,
999                                           must_insert_reserved);
1000                 BUG_ON(ret);
1001                 btrfs_put_delayed_ref(ref);
1002
1003                 count++;
1004                 cond_resched();
1005                 spin_lock(&delayed_refs->lock);
1006         }
1007         return count;
1008 }
1009
1010 /*
1011  * this starts processing the delayed reference count updates and
1012  * extent insertions we have queued up so far.  count can be
1013  * 0, which means to process everything in the tree at the start
1014  * of the run (but not newly added entries), or it can be some target
1015  * number you'd like to process.
1016  */
1017 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1018                            struct btrfs_root *root, unsigned long count)
1019 {
1020         struct rb_node *node;
1021         struct btrfs_delayed_ref_root *delayed_refs;
1022         struct btrfs_delayed_ref_node *ref;
1023         struct list_head cluster;
1024         int ret;
1025         int run_all = count == (unsigned long)-1;
1026         int run_most = 0;
1027
1028         if (root == root->fs_info->extent_root)
1029                 root = root->fs_info->tree_root;
1030
1031         delayed_refs = &trans->transaction->delayed_refs;
1032         INIT_LIST_HEAD(&cluster);
1033 again:
1034         spin_lock(&delayed_refs->lock);
1035         if (count == 0) {
1036                 count = delayed_refs->num_entries * 2;
1037                 run_most = 1;
1038         }
1039         while (1) {
1040                 if (!(run_all || run_most) &&
1041                     delayed_refs->num_heads_ready < 64)
1042                         break;
1043
1044                 /*
1045                  * go find something we can process in the rbtree.  We start at
1046                  * the beginning of the tree, and then build a cluster
1047                  * of refs to process starting at the first one we are able to
1048                  * lock
1049                  */
1050                 ret = btrfs_find_ref_cluster(trans, &cluster,
1051                                              delayed_refs->run_delayed_start);
1052                 if (ret)
1053                         break;
1054
1055                 ret = run_clustered_refs(trans, root, &cluster);
1056                 BUG_ON(ret < 0);
1057
1058                 count -= min_t(unsigned long, ret, count);
1059
1060                 if (count == 0)
1061                         break;
1062         }
1063
1064         if (run_all) {
1065                 node = rb_first(&delayed_refs->root);
1066                 if (!node)
1067                         goto out;
1068                 count = (unsigned long)-1;
1069
1070                 while (node) {
1071                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
1072                                        rb_node);
1073                         if (btrfs_delayed_ref_is_head(ref)) {
1074                                 struct btrfs_delayed_ref_head *head;
1075
1076                                 head = btrfs_delayed_node_to_head(ref);
1077                                 atomic_inc(&ref->refs);
1078
1079                                 spin_unlock(&delayed_refs->lock);
1080                                 mutex_lock(&head->mutex);
1081                                 mutex_unlock(&head->mutex);
1082
1083                                 btrfs_put_delayed_ref(ref);
1084                                 cond_resched();
1085                                 goto again;
1086                         }
1087                         node = rb_next(node);
1088                 }
1089                 spin_unlock(&delayed_refs->lock);
1090                 schedule_timeout(1);
1091                 goto again;
1092         }
1093 out:
1094         spin_unlock(&delayed_refs->lock);
1095         return 0;
1096 }
1097
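/*
 * look for references to this extent from other roots or from before our
 * last snapshot.  Returns 1 if the extent is (or may be) cross
 * referenced, 0 if we are provably the only holder.
 */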
1098 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1099                           struct btrfs_root *root, u64 objectid, u64 bytenr)
1100 {
1101         struct btrfs_root *extent_root = root->fs_info->extent_root;
1102         struct btrfs_path *path;
1103         struct extent_buffer *leaf;
1104         struct btrfs_extent_ref *ref_item;
1105         struct btrfs_key key;
1106         struct btrfs_key found_key;
1107         u64 ref_root;
1108         u64 last_snapshot;
1109         u32 nritems;
1110         int ret;
1111
1112         key.objectid = bytenr;
1113         key.offset = (u64)-1;
1114         key.type = BTRFS_EXTENT_ITEM_KEY;
1115
1116         path = btrfs_alloc_path();
1117         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1118         if (ret < 0)
1119                 goto out;
1120         BUG_ON(ret == 0);
1121
1122         ret = -ENOENT;
1123         if (path->slots[0] == 0)
1124                 goto out;
1125
1126         path->slots[0]--;
1127         leaf = path->nodes[0];
1128         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1129
1130         if (found_key.objectid != bytenr ||
1131             found_key.type != BTRFS_EXTENT_ITEM_KEY)
1132                 goto out;
1133
1134         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1135         while (1) {
1136                 leaf = path->nodes[0];
1137                 nritems = btrfs_header_nritems(leaf);
1138                 if (path->slots[0] >= nritems) {
1139                         ret = btrfs_next_leaf(extent_root, path);
1140                         if (ret < 0)
1141                                 goto out;
1142                         if (ret == 0)
1143                                 continue;
1144                         break;
1145                 }
1146                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1147                 if (found_key.objectid != bytenr)
1148                         break;
1149
1150                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1151                         path->slots[0]++;
1152                         continue;
1153                 }
1154
1155                 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1156                                           struct btrfs_extent_ref);
1157                 ref_root = btrfs_ref_root(leaf, ref_item);
1158                 if ((ref_root != root->root_key.objectid &&
1159                      ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1160                      objectid != btrfs_ref_objectid(leaf, ref_item)) {
1161                         ret = 1;
1162                         goto out;
1163                 }
1164                 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1165                         ret = 1;
1166                         goto out;
1167                 }
1168
1169                 path->slots[0]++;
1170         }
1171         ret = 0;
1172 out:
1173         btrfs_free_path(path);
1174         return ret;
1175 }
1176
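/*
 * record the file extents a leaf points to in the leaf ref cache, so
 * later drops of this leaf can update extent refs without reading the
 * leaf back off disk
 */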
1177 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1178                     struct extent_buffer *buf, u32 nr_extents)
1179 {
1180         struct btrfs_key key;
1181         struct btrfs_file_extent_item *fi;
1182         u64 root_gen;
1183         u32 nritems;
1184         int i;
1185         int level;
1186         int ret = 0;
1187         int shared = 0;
1188
1189         if (!root->ref_cows)
1190                 return 0;
1191
1192         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1193                 shared = 0;
1194                 root_gen = root->root_key.offset;
1195         } else {
1196                 shared = 1;
1197                 root_gen = trans->transid - 1;
1198         }
1199
1200         level = btrfs_header_level(buf);
1201         nritems = btrfs_header_nritems(buf);
1202
1203         if (level == 0) {
1204                 struct btrfs_leaf_ref *ref;
1205                 struct btrfs_extent_info *info;
1206
1207                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1208                 if (!ref) {
1209                         ret = -ENOMEM;
1210                         goto out;
1211                 }
1212
1213                 ref->root_gen = root_gen;
1214                 ref->bytenr = buf->start;
1215                 ref->owner = btrfs_header_owner(buf);
1216                 ref->generation = btrfs_header_generation(buf);
1217                 ref->nritems = nr_extents;
1218                 info = ref->extents;
1219
1220                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1221                         u64 disk_bytenr;
1222                         btrfs_item_key_to_cpu(buf, &key, i);
1223                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1224                                 continue;
1225                         fi = btrfs_item_ptr(buf, i,
1226                                             struct btrfs_file_extent_item);
1227                         if (btrfs_file_extent_type(buf, fi) ==
1228                             BTRFS_FILE_EXTENT_INLINE)
1229                                 continue;
1230                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1231                         if (disk_bytenr == 0)
1232                                 continue;
1233
1234                         info->bytenr = disk_bytenr;
1235                         info->num_bytes =
1236                                 btrfs_file_extent_disk_num_bytes(buf, fi);
1237                         info->objectid = key.objectid;
1238                         info->offset = key.offset;
1239                         info++;
1240                 }
1241
1242                 ret = btrfs_add_leaf_ref(root, ref, shared);
1243                 if (ret == -EEXIST && shared) {
1244                         struct btrfs_leaf_ref *old;
1245                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1246                         BUG_ON(!old);
1247                         btrfs_remove_leaf_ref(root, old);
1248                         btrfs_free_leaf_ref(root, old);
1249                         ret = btrfs_add_leaf_ref(root, ref, shared);
1250                 }
1251                 WARN_ON(ret);
1252                 btrfs_free_leaf_ref(root, ref);
1253         }
1254 out:
1255         return ret;
1256 }
1257
1258 /* when a block goes through cow, we update the reference counts of
1259  * everything that block points to.  The internal pointers of the block
1260  * can be in just about any order, and it is likely to have clusters of
1261  * things that are close together and clusters of things that are not.
1262  *
1263  * To help reduce the seeks that come with updating all of these reference
1264  * counts, sort them by byte number before actual updates are done.
1265  *
1266  * struct refsort is used to match byte number to slot in the btree block.
1267  * we sort based on the byte number and then use the slot to actually
1268  * find the item.
1269  *
1270  * struct refsort is smaller than struct btrfs_item and smaller than
1271  * struct btrfs_key_ptr.  Since we're currently limited to the page size
1272  * for a btree block, there's no way for a kmalloc of refsorts for a
1273  * single node to be bigger than a page.
1274  */
1275 struct refsort {
1276         u64 bytenr;
1277         u32 slot;
1278 };
1279
1280 /*
1281  * for passing into sort()
1282  */
1283 static int refsort_cmp(const void *a_void, const void *b_void)
1284 {
1285         const struct refsort *a = a_void;
1286         const struct refsort *b = b_void;
1287
1288         if (a->bytenr < b->bytenr)
1289                 return -1;
1290         if (a->bytenr > b->bytenr)
1291                 return 1;
1292         return 0;
1293 }
1294
1295
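/*
 * after a block is cow'd, add (or, for non-reference-counted roots,
 * update) references for everything the new block points to.  The refs
 * are gathered first, sorted by bytenr via refsort, and then processed
 * in order to cut down on extent tree seeks.
 */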
1296 noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1297                            struct btrfs_root *root,
1298                            struct extent_buffer *orig_buf,
1299                            struct extent_buffer *buf, u32 *nr_extents)
1300 {
1301         u64 bytenr;
1302         u64 ref_root;
1303         u64 orig_root;
1304         u64 ref_generation;
1305         u64 orig_generation;
1306         struct refsort *sorted;
1307         u32 nritems;
1308         u32 nr_file_extents = 0;
1309         struct btrfs_key key;
1310         struct btrfs_file_extent_item *fi;
1311         int i;
1312         int level;
1313         int ret = 0;
1314         int faili = 0;
1315         int refi = 0;
1316         int slot;
1317         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1318                             u64, u64, u64, u64, u64, u64, u64, u64, u64);
1319
1320         ref_root = btrfs_header_owner(buf);
1321         ref_generation = btrfs_header_generation(buf);
1322         orig_root = btrfs_header_owner(orig_buf);
1323         orig_generation = btrfs_header_generation(orig_buf);
1324
1325         nritems = btrfs_header_nritems(buf);
1326         level = btrfs_header_level(buf);
1327
1328         sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1329         BUG_ON(!sorted);
1330
1331         if (root->ref_cows) {
1332                 process_func = __btrfs_inc_extent_ref;
1333         } else {
1334                 if (level == 0 &&
1335                     root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1336                         goto out;
1337                 if (level != 0 &&
1338                     root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1339                         goto out;
1340                 process_func = __btrfs_update_extent_ref;
1341         }
1342
1343         /*
1344          * we make two passes through the items.  In the first pass we
1345          * only record the byte number and slot.  Then we sort based on
1346          * byte number and do the actual work based on the sorted results
1347          */
1348         for (i = 0; i < nritems; i++) {
1349                 cond_resched();
1350                 if (level == 0) {
1351                         btrfs_item_key_to_cpu(buf, &key, i);
1352                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1353                                 continue;
1354                         fi = btrfs_item_ptr(buf, i,
1355                                             struct btrfs_file_extent_item);
1356                         if (btrfs_file_extent_type(buf, fi) ==
1357                             BTRFS_FILE_EXTENT_INLINE)
1358                                 continue;
1359                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1360                         if (bytenr == 0)
1361                                 continue;
1362
1363                         nr_file_extents++;
1364                         sorted[refi].bytenr = bytenr;
1365                         sorted[refi].slot = i;
1366                         refi++;
1367                 } else {
1368                         bytenr = btrfs_node_blockptr(buf, i);
1369                         sorted[refi].bytenr = bytenr;
1370                         sorted[refi].slot = i;
1371                         refi++;
1372                 }
1373         }
1374         /*
1375          * if refi == 0, we didn't actually put anything into the sorted
1376          * array and we're done
1377          */
1378         if (refi == 0)
1379                 goto out;
1380
1381         sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1382
1383         for (i = 0; i < refi; i++) {
1384                 cond_resched();
1385                 slot = sorted[i].slot;
1386                 bytenr = sorted[i].bytenr;
1387
1388                 if (level == 0) {
1389                         btrfs_item_key_to_cpu(buf, &key, slot);
1390                         fi = btrfs_item_ptr(buf, slot,
1391                                             struct btrfs_file_extent_item);
1392
1393                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1394                         if (bytenr == 0)
1395                                 continue;
1396
1397                         ret = process_func(trans, root, bytenr,
1398                                    btrfs_file_extent_disk_num_bytes(buf, fi),
1399                                    orig_buf->start, buf->start,
1400                                    orig_root, ref_root,
1401                                    orig_generation, ref_generation,
1402                                    key.objectid);
1403
1404                         if (ret) {
1405                                 faili = slot;
1406                                 WARN_ON(1);
1407                                 goto fail;
1408                         }
1409                 } else {
1410                         ret = process_func(trans, root, bytenr, buf->len,
1411                                            orig_buf->start, buf->start,
1412                                            orig_root, ref_root,
1413                                            orig_generation, ref_generation,
1414                                            level - 1);
1415                         if (ret) {
1416                                 faili = slot;
1417                                 WARN_ON(1);
1418                                 goto fail;
1419                         }
1420                 }
1421         }
1422 out:
1423         kfree(sorted);
1424         if (nr_extents) {
1425                 if (level == 0)
1426                         *nr_extents = nr_file_extents;
1427                 else
1428                         *nr_extents = nritems;
1429         }
1430         return 0;
1431 fail:
1432         kfree(sorted);
1434         return ret;
1435 }
1436
1437 int btrfs_update_ref(struct btrfs_trans_handle *trans,
1438                      struct btrfs_root *root, struct extent_buffer *orig_buf,
1439                      struct extent_buffer *buf, int start_slot, int nr)
1441 {
1442         u64 bytenr;
1443         u64 ref_root;
1444         u64 orig_root;
1445         u64 ref_generation;
1446         u64 orig_generation;
1447         struct btrfs_key key;
1448         struct btrfs_file_extent_item *fi;
1449         int i;
1450         int ret;
1451         int slot;
1452         int level;
1453
1454         BUG_ON(start_slot < 0);
1455         BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1456
1457         ref_root = btrfs_header_owner(buf);
1458         ref_generation = btrfs_header_generation(buf);
1459         orig_root = btrfs_header_owner(orig_buf);
1460         orig_generation = btrfs_header_generation(orig_buf);
1461         level = btrfs_header_level(buf);
1462
1463         if (!root->ref_cows) {
1464                 if (level == 0 &&
1465                     root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1466                         return 0;
1467                 if (level != 0 &&
1468                     root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1469                         return 0;
1470         }
1471
1472         for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1473                 cond_resched();
1474                 if (level == 0) {
1475                         btrfs_item_key_to_cpu(buf, &key, slot);
1476                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1477                                 continue;
1478                         fi = btrfs_item_ptr(buf, slot,
1479                                             struct btrfs_file_extent_item);
1480                         if (btrfs_file_extent_type(buf, fi) ==
1481                             BTRFS_FILE_EXTENT_INLINE)
1482                                 continue;
1483                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1484                         if (bytenr == 0)
1485                                 continue;
1486                         ret = __btrfs_update_extent_ref(trans, root, bytenr,
1487                                     btrfs_file_extent_disk_num_bytes(buf, fi),
1488                                     orig_buf->start, buf->start,
1489                                     orig_root, ref_root, orig_generation,
1490                                     ref_generation, key.objectid);
1491                         if (ret)
1492                                 goto fail;
1493                 } else {
1494                         bytenr = btrfs_node_blockptr(buf, slot);
1495                         ret = __btrfs_update_extent_ref(trans, root, bytenr,
1496                                             buf->len, orig_buf->start,
1497                                             buf->start, orig_root, ref_root,
1498                                             orig_generation, ref_generation,
1499                                             level - 1);
1500                         if (ret)
1501                                 goto fail;
1502                 }
1503         }
1504         return 0;
1505 fail:
1506         WARN_ON(1);
1507         return ret;
1508 }
1509
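/*
 * write the block group item for a single cache entry back into the
 * extent tree.  the item must already exist on disk; we just look it
 * up and overwrite it in place with the in-memory copy.
 */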
1510 static int write_one_cache_group(struct btrfs_trans_handle *trans,
1511                                  struct btrfs_root *root,
1512                                  struct btrfs_path *path,
1513                                  struct btrfs_block_group_cache *cache)
1514 {
1515         int ret;
1516         struct btrfs_root *extent_root = root->fs_info->extent_root;
1517         unsigned long bi;
1518         struct extent_buffer *leaf;
1519
1520         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1521         if (ret < 0)
1522                 goto fail;
1523         BUG_ON(ret);
1524
1525         leaf = path->nodes[0];
1526         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1527         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1528         btrfs_mark_buffer_dirty(leaf);
1529         btrfs_release_path(extent_root, path);
1530 fail:
1531         return ret;
1532 }
1536
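/*
 * loop over the block group cache and write out every group that is
 * marked dirty.  any error is remembered in werr and returned after
 * the remaining dirty groups have had their chance to hit disk.
 */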
1537 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1538                                    struct btrfs_root *root)
1539 {
1540         struct btrfs_block_group_cache *cache, *entry;
1541         struct rb_node *n;
1542         int err = 0;
1543         int werr = 0;
1544         struct btrfs_path *path;
1546
1547         path = btrfs_alloc_path();
1548         if (!path)
1549                 return -ENOMEM;
1550
1551         while (1) {
1552                 cache = NULL;
1553                 spin_lock(&root->fs_info->block_group_cache_lock);
1554                 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1555                      n; n = rb_next(n)) {
1556                         entry = rb_entry(n, struct btrfs_block_group_cache,
1557                                          cache_node);
1558                         if (entry->dirty) {
1559                                 cache = entry;
1560                                 break;
1561                         }
1562                 }
1563                 spin_unlock(&root->fs_info->block_group_cache_lock);
1564
1565                 if (!cache)
1566                         break;
1567
1568                 cache->dirty = 0;
1570
1571                 err = write_one_cache_group(trans, root,
1572                                             path, cache);
1573                 /*
1574                  * if we fail to write the cache group, remember the
1575                  * error and keep going so the rest of the dirty
1576                  * block groups still get written out
1577                  */
1578                 if (err) {
1579                         werr = err;
1580                         continue;
1581                 }
1582         }
1583         btrfs_free_path(path);
1584         return werr;
1585 }
1586
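/*
 * returns 1 if the extent at bytenr lives in a read-only block group,
 * or if no block group can be found for it at all.
 */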
1587 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1588 {
1589         struct btrfs_block_group_cache *block_group;
1590         int readonly = 0;
1591
1592         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1593         if (!block_group || block_group->ro)
1594                 readonly = 1;
1595         if (block_group)
1596                 btrfs_put_block_group(block_group);
1597         return readonly;
1598 }
1599
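/*
 * find the space_info for the given allocation flags and add the new
 * bytes into its totals, or allocate and register a fresh space_info
 * if this is the first block group of that type.
 */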
1600 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1601                              u64 total_bytes, u64 bytes_used,
1602                              struct btrfs_space_info **space_info)
1603 {
1604         struct btrfs_space_info *found;
1605
1606         found = __find_space_info(info, flags);
1607         if (found) {
1608                 spin_lock(&found->lock);
1609                 found->total_bytes += total_bytes;
1610                 found->bytes_used += bytes_used;
1611                 found->full = 0;
1612                 spin_unlock(&found->lock);
1613                 *space_info = found;
1614                 return 0;
1615         }
1616         found = kzalloc(sizeof(*found), GFP_NOFS);
1617         if (!found)
1618                 return -ENOMEM;
1619
1620         INIT_LIST_HEAD(&found->block_groups);
1621         init_rwsem(&found->groups_sem);
1622         spin_lock_init(&found->lock);
1623         found->flags = flags;
1624         found->total_bytes = total_bytes;
1625         found->bytes_used = bytes_used;
1626         found->bytes_pinned = 0;
1627         found->bytes_reserved = 0;
1628         found->bytes_readonly = 0;
1629         found->bytes_delalloc = 0;
1630         found->full = 0;
1631         found->force_alloc = 0;
1632         *space_info = found;
1633         list_add_rcu(&found->list, &info->space_info);
1634         return 0;
1635 }
1636
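/*
 * record which raid profiles (raid0/raid1/raid10/dup) are in use for
 * data, metadata and system chunks, so later chunk allocations can
 * stick to profiles the filesystem already contains.
 */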
1637 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1638 {
1639         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1640                                    BTRFS_BLOCK_GROUP_RAID1 |
1641                                    BTRFS_BLOCK_GROUP_RAID10 |
1642                                    BTRFS_BLOCK_GROUP_DUP);
1643         if (extra_flags) {
1644                 if (flags & BTRFS_BLOCK_GROUP_DATA)
1645                         fs_info->avail_data_alloc_bits |= extra_flags;
1646                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1647                         fs_info->avail_metadata_alloc_bits |= extra_flags;
1648                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1649                         fs_info->avail_system_alloc_bits |= extra_flags;
1650         }
1651 }
1652
1653 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1654 {
1655         spin_lock(&cache->space_info->lock);
1656         spin_lock(&cache->lock);
1657         if (!cache->ro) {
1658                 cache->space_info->bytes_readonly += cache->key.offset -
1659                                         btrfs_block_group_used(&cache->item);
1660                 cache->ro = 1;
1661         }
1662         spin_unlock(&cache->lock);
1663         spin_unlock(&cache->space_info->lock);
1664 }
1665
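/*
 * trim an allocation profile down to what the current devices can
 * actually support: mirroring and striping need more than one
 * writable device, raid10 needs at least four, and when redundant
 * profiles are set together only the stronger one is kept (e.g.
 * DATA|RAID1|DUP on a single device reduces to DATA|DUP).
 */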
1666 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1667 {
1668         u64 num_devices = root->fs_info->fs_devices->rw_devices;
1669
1670         if (num_devices == 1)
1671                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1672         if (num_devices < 4)
1673                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1674
1675         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1676             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1677                       BTRFS_BLOCK_GROUP_RAID10))) {
1678                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1679         }
1680
1681         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1682             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1683                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1684         }
1685
1686         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1687             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1688                       BTRFS_BLOCK_GROUP_RAID10 |
1689                       BTRFS_BLOCK_GROUP_DUP)))
1690                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1691         return flags;
1692 }
1693
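/*
 * pick the base block group type (data, system or metadata) for an
 * allocation from this root and combine it with the raid profiles
 * currently available for that type.
 */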
1694 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
1695 {
1696         struct btrfs_fs_info *info = root->fs_info;
1697         u64 alloc_profile;
1698
1699         if (data) {
1700                 alloc_profile = info->avail_data_alloc_bits &
1701                         info->data_alloc_profile;
1702                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1703         } else if (root == root->fs_info->chunk_root) {
1704                 alloc_profile = info->avail_system_alloc_bits &
1705                         info->system_alloc_profile;
1706                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1707         } else {
1708                 alloc_profile = info->avail_metadata_alloc_bits &
1709                         info->metadata_alloc_profile;
1710                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1711         }
1712
1713         return btrfs_reduce_alloc_profile(root, data);
1714 }
1715
1716 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
1717 {
1718         u64 alloc_target;
1719
1720         alloc_target = btrfs_get_alloc_profile(root, 1);
1721         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
1722                                                        alloc_target);
1723 }
1724
1725 /*
1726  * for now this just makes sure we have at least 5% of our metadata space free.
1727  * while the space info isn't full we allocate new chunks once usage passes 80%.
1728  */
1729 int btrfs_check_metadata_free_space(struct btrfs_root *root)
1730 {
1731         struct btrfs_fs_info *info = root->fs_info;
1732         struct btrfs_space_info *meta_sinfo;
1733         u64 alloc_target, thresh;
1734         int committed = 0, ret;
1735
1736         /* get the space info for where the metadata will live */
1737         alloc_target = btrfs_get_alloc_profile(root, 0);
1738         meta_sinfo = __find_space_info(info, alloc_target);
1739
1740 again:
1741         spin_lock(&meta_sinfo->lock);
1742         if (!meta_sinfo->full)
1743                 thresh = meta_sinfo->total_bytes * 80;
1744         else
1745                 thresh = meta_sinfo->total_bytes * 95;
1746
1747         do_div(thresh, 100);
1748
1749         if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
1750             meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
1751                 struct btrfs_trans_handle *trans;
1752                 if (!meta_sinfo->full) {
1753                         meta_sinfo->force_alloc = 1;
1754                         spin_unlock(&meta_sinfo->lock);
1755
1756                         trans = btrfs_start_transaction(root, 1);
1757                         if (!trans)
1758                                 return -ENOMEM;
1759
1760                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1761                                              2 * 1024 * 1024, alloc_target, 0);
1762                         btrfs_end_transaction(trans, root);
1763                         goto again;
1764                 }
1765                 spin_unlock(&meta_sinfo->lock);
1766
1767                 if (!committed) {
1768                         committed = 1;
1769                         trans = btrfs_join_transaction(root, 1);
1770                         if (!trans)
1771                                 return -ENOMEM;
1772                         ret = btrfs_commit_transaction(trans, root);
1773                         if (ret)
1774                                 return ret;
1775                         goto again;
1776                 }
1777                 return -ENOSPC;
1778         }
1779         spin_unlock(&meta_sinfo->lock);
1780
1781         return 0;
1782 }
1783
1784 /*
1785  * This will check the space that the inode allocates from to make sure we have
1786  * enough space for bytes.
1787  */
1788 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
1789                                 u64 bytes)
1790 {
1791         struct btrfs_space_info *data_sinfo;
1792         int ret = 0, committed = 0;
1793
1794         /* make sure bytes are sectorsize aligned */
1795         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
1796
1797         data_sinfo = BTRFS_I(inode)->space_info;
1798 again:
1799         /* make sure we have enough space to handle the data first */
1800         spin_lock(&data_sinfo->lock);
1801         if (data_sinfo->total_bytes - data_sinfo->bytes_used -
1802             data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
1803             data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
1804             data_sinfo->bytes_may_use < bytes) {
1805                 struct btrfs_trans_handle *trans;
1806
1807                 /*
1808                  * if we don't have enough free bytes in this space then we need
1809                  * to alloc a new chunk.
1810                  */
1811                 if (!data_sinfo->full) {
1812                         u64 alloc_target;
1813
1814                         data_sinfo->force_alloc = 1;
1815                         spin_unlock(&data_sinfo->lock);
1816
1817                         alloc_target = btrfs_get_alloc_profile(root, 1);
1818                         trans = btrfs_start_transaction(root, 1);
1819                         if (!trans)
1820                                 return -ENOMEM;
1821
1822                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1823                                              bytes + 2 * 1024 * 1024,
1824                                              alloc_target, 0);
1825                         btrfs_end_transaction(trans, root);
1826                         if (ret)
1827                                 return ret;
1828                         goto again;
1829                 }
1830                 spin_unlock(&data_sinfo->lock);
1831
1832                 /* commit the current transaction and try again */
1833                 if (!committed) {
1834                         committed = 1;
1835                         trans = btrfs_join_transaction(root, 1);
1836                         if (!trans)
1837                                 return -ENOMEM;
1838                         ret = btrfs_commit_transaction(trans, root);
1839                         if (ret)
1840                                 return ret;
1841                         goto again;
1842                 }
1843
1844                 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
1845                        ", %llu bytes_used, %llu bytes_reserved, "
1846                        "%llu bytes_pinned, %llu bytes_readonly, %llu may use, "
1847                        "%llu total\n", bytes, data_sinfo->bytes_delalloc,
1848                        data_sinfo->bytes_used, data_sinfo->bytes_reserved,
1849                        data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
1850                        data_sinfo->bytes_may_use, data_sinfo->total_bytes);
1851                 return -ENOSPC;
1852         }
1853         data_sinfo->bytes_may_use += bytes;
1854         BTRFS_I(inode)->reserved_bytes += bytes;
1855         spin_unlock(&data_sinfo->lock);
1856
1857         return btrfs_check_metadata_free_space(root);
1858 }
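
/*
 * a rough sketch of how a write path is expected to pair the check
 * above with btrfs_free_reserved_data_space() below; do_the_write()
 * is a hypothetical stand-in for the caller's real work:
 *
 *	ret = btrfs_check_data_free_space(root, inode, count);
 *	if (ret)
 *		return ret;
 *	ret = do_the_write(inode, count);
 *	if (ret)
 *		btrfs_free_reserved_data_space(root, inode, count);
 */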
1859
1860 /*
1861  * if there was an error for whatever reason after calling
1862  * btrfs_check_data_free_space, call this so we can cleanup the counters.
1863  */
1864 void btrfs_free_reserved_data_space(struct btrfs_root *root,
1865                                     struct inode *inode, u64 bytes)
1866 {
1867         struct btrfs_space_info *data_sinfo;
1868
1869         /* make sure bytes are sectorsize aligned */
1870         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
1871
1872         data_sinfo = BTRFS_I(inode)->space_info;
1873         spin_lock(&data_sinfo->lock);
1874         data_sinfo->bytes_may_use -= bytes;
1875         BTRFS_I(inode)->reserved_bytes -= bytes;
1876         spin_unlock(&data_sinfo->lock);
1877 }
1878
1879 /* called when we are adding a delalloc extent to the inode's io_tree */
1880 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1881                                   u64 bytes)
1882 {
1883         struct btrfs_space_info *data_sinfo;
1884
1885         /* get the space info for where this inode will be storing its data */
1886         data_sinfo = BTRFS_I(inode)->space_info;
1887
1888         /* make sure we have enough space to handle the data first */
1889         spin_lock(&data_sinfo->lock);
1890         data_sinfo->bytes_delalloc += bytes;
1891
1892         /*
1893          * we are adding a delalloc extent without calling
1894          * btrfs_check_data_free_space first.  This happens on a weird
1895          * writepage condition, but shouldn't hurt our accounting
1896          */
1897         if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
1898                 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
1899                 BTRFS_I(inode)->reserved_bytes = 0;
1900         } else {
1901                 data_sinfo->bytes_may_use -= bytes;
1902                 BTRFS_I(inode)->reserved_bytes -= bytes;
1903         }
1904
1905         spin_unlock(&data_sinfo->lock);
1906 }
1907
1908 /* called when we are clearing a delalloc extent from the inode's io_tree */
1909 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1910                               u64 bytes)
1911 {
1912         struct btrfs_space_info *info;
1913
1914         info = BTRFS_I(inode)->space_info;
1915
1916         spin_lock(&info->lock);
1917         info->bytes_delalloc -= bytes;
1918         spin_unlock(&info->lock);
1919 }
1920
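/*
 * flag every metadata space_info so the next pass through
 * do_chunk_alloc is forced, even if the usual fullness checks would
 * skip it.  this is what keeps metadata block groups from being
 * starved while a long stream of data allocations is going on.
 */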
1921 static void force_metadata_allocation(struct btrfs_fs_info *info)
1922 {
1923         struct list_head *head = &info->space_info;
1924         struct btrfs_space_info *found;
1925
1926         rcu_read_lock();
1927         list_for_each_entry_rcu(found, head, list) {
1928                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
1929                         found->force_alloc = 1;
1930         }
1931         rcu_read_unlock();
1932 }
1933
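/*
 * allocate a new chunk of the given type.  unless force is set (or a
 * pending force_alloc was recorded), nothing happens while the space
 * info is under roughly 60% used.  every metadata_ratio-th data chunk
 * also forces a metadata chunk so the two stay in a healthy ratio.
 */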
1934 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1935                           struct btrfs_root *extent_root, u64 alloc_bytes,
1936                           u64 flags, int force)
1937 {
1938         struct btrfs_space_info *space_info;
1939         struct btrfs_fs_info *fs_info = extent_root->fs_info;
1940         u64 thresh;
1941         int ret = 0;
1942
1943         mutex_lock(&fs_info->chunk_mutex);
1944
1945         flags = btrfs_reduce_alloc_profile(extent_root, flags);
1946
1947         space_info = __find_space_info(extent_root->fs_info, flags);
1948         if (!space_info) {
1949                 ret = update_space_info(extent_root->fs_info, flags,
1950                                         0, 0, &space_info);
1951                 BUG_ON(ret);
1952         }
1953         BUG_ON(!space_info);
1954
1955         spin_lock(&space_info->lock);
1956         if (space_info->force_alloc) {
1957                 force = 1;
1958                 space_info->force_alloc = 0;
1959         }
1960         if (space_info->full) {
1961                 spin_unlock(&space_info->lock);
1962                 goto out;
1963         }
1964
1965         thresh = space_info->total_bytes - space_info->bytes_readonly;
1966         thresh = div_factor(thresh, 6);
1967         if (!force &&
1968            (space_info->bytes_used + space_info->bytes_pinned +
1969             space_info->bytes_reserved + alloc_bytes) < thresh) {
1970                 spin_unlock(&space_info->lock);
1971                 goto out;
1972         }
1973         spin_unlock(&space_info->lock);
1974
1975         /*
1976          * if we're doing a data chunk, go ahead and make sure that
1977          * we keep a reasonable number of metadata chunks allocated in the
1978          * FS as well.
1979          */
1980         if (flags & BTRFS_BLOCK_GROUP_DATA) {
1981                 fs_info->data_chunk_allocations++;
1982                 if (!(fs_info->data_chunk_allocations %
1983                       fs_info->metadata_ratio))
1984                         force_metadata_allocation(fs_info);
1985         }
1986
1987         ret = btrfs_alloc_chunk(trans, extent_root, flags);
1988         if (ret)
1989                 space_info->full = 1;
1990 out:
1991         mutex_unlock(&extent_root->fs_info->chunk_mutex);
1992         return ret;
1993 }
1994
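/*
 * walk the block groups covering [bytenr, bytenr + num_bytes) and
 * adjust their used byte counts up or down.  when freeing with
 * mark_free set, the space is also discarded and returned to the
 * free space cache.
 */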
1995 static int update_block_group(struct btrfs_trans_handle *trans,
1996                               struct btrfs_root *root,
1997                               u64 bytenr, u64 num_bytes, int alloc,
1998                               int mark_free)
1999 {
2000         struct btrfs_block_group_cache *cache;
2001         struct btrfs_fs_info *info = root->fs_info;
2002         u64 total = num_bytes;
2003         u64 old_val;
2004         u64 byte_in_group;
2005
2006         while (total) {
2007                 cache = btrfs_lookup_block_group(info, bytenr);
2008                 if (!cache)
2009                         return -1;
2010                 byte_in_group = bytenr - cache->key.objectid;
2011                 WARN_ON(byte_in_group > cache->key.offset);
2012
2013                 spin_lock(&cache->space_info->lock);
2014                 spin_lock(&cache->lock);
2015                 cache->dirty = 1;
2016                 old_val = btrfs_block_group_used(&cache->item);
2017                 num_bytes = min(total, cache->key.offset - byte_in_group);
2018                 if (alloc) {
2019                         old_val += num_bytes;
2020                         cache->space_info->bytes_used += num_bytes;
2021                         if (cache->ro)
2022                                 cache->space_info->bytes_readonly -= num_bytes;
2023                         btrfs_set_block_group_used(&cache->item, old_val);
2024                         spin_unlock(&cache->lock);
2025                         spin_unlock(&cache->space_info->lock);
2026                 } else {
2027                         old_val -= num_bytes;
2028                         cache->space_info->bytes_used -= num_bytes;
2029                         if (cache->ro)
2030                                 cache->space_info->bytes_readonly += num_bytes;
2031                         btrfs_set_block_group_used(&cache->item, old_val);
2032                         spin_unlock(&cache->lock);
2033                         spin_unlock(&cache->space_info->lock);
2034                         if (mark_free) {
2035                                 int ret;
2036
2037                                 ret = btrfs_discard_extent(root, bytenr,
2038                                                            num_bytes);
2039                                 WARN_ON(ret);
2040
2041                                 ret = btrfs_add_free_space(cache, bytenr,
2042                                                            num_bytes);
2043                                 WARN_ON(ret);
2044                         }
2045                 }
2046                 btrfs_put_block_group(cache);
2047                 total -= num_bytes;
2048                 bytenr += num_bytes;
2049         }
2050         return 0;
2051 }
2052
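/* return the logical start of the first block group at or after search_start */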
2053 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2054 {
2055         struct btrfs_block_group_cache *cache;
2056         u64 bytenr;
2057
2058         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
2059         if (!cache)
2060                 return 0;
2061
2062         bytenr = cache->key.objectid;
2063         btrfs_put_block_group(cache);
2064
2065         return bytenr;
2066 }
2067
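/*
 * pin or unpin [bytenr, bytenr + num) in the pinned_extents tree and
 * keep the per block group and per space_info pinned counters in
 * sync.  unpinned space goes back into the free space cache once the
 * block group has been cached.
 */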
2068 int btrfs_update_pinned_extents(struct btrfs_root *root,
2069                                 u64 bytenr, u64 num, int pin)
2070 {
2071         u64 len;
2072         struct btrfs_block_group_cache *cache;
2073         struct btrfs_fs_info *fs_info = root->fs_info;
2074
2075         if (pin) {
2076                 set_extent_dirty(&fs_info->pinned_extents,
2077                                 bytenr, bytenr + num - 1, GFP_NOFS);
2078         } else {
2079                 clear_extent_dirty(&fs_info->pinned_extents,
2080                                 bytenr, bytenr + num - 1, GFP_NOFS);
2081         }
2082
2083         while (num > 0) {
2084                 cache = btrfs_lookup_block_group(fs_info, bytenr);
2085                 BUG_ON(!cache);
2086                 len = min(num, cache->key.offset -
2087                           (bytenr - cache->key.objectid));
2088                 if (pin) {
2089                         spin_lock(&cache->space_info->lock);
2090                         spin_lock(&cache->lock);
2091                         cache->pinned += len;
2092                         cache->space_info->bytes_pinned += len;
2093                         spin_unlock(&cache->lock);
2094                         spin_unlock(&cache->space_info->lock);
2095                         fs_info->total_pinned += len;
2096                 } else {
2097                         spin_lock(&cache->space_info->lock);
2098                         spin_lock(&cache->lock);
2099                         cache->pinned -= len;
2100                         cache->space_info->bytes_pinned -= len;
2101                         spin_unlock(&cache->lock);
2102                         spin_unlock(&cache->space_info->lock);
2103                         fs_info->total_pinned -= len;
2104                         if (cache->cached)
2105                                 btrfs_add_free_space(cache, bytenr, len);
2106                 }
2107                 btrfs_put_block_group(cache);
2108                 bytenr += len;
2109                 num -= len;
2110         }
2111         return 0;
2112 }
2113
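/*
 * bump or drop the reserved byte counts on every block group that
 * [bytenr, bytenr + num) touches.
 */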
2114 static int update_reserved_extents(struct btrfs_root *root,
2115                                    u64 bytenr, u64 num, int reserve)
2116 {
2117         u64 len;
2118         struct btrfs_block_group_cache *cache;
2119         struct btrfs_fs_info *fs_info = root->fs_info;
2120
2121         while (num > 0) {
2122                 cache = btrfs_lookup_block_group(fs_info, bytenr);
2123                 BUG_ON(!cache);
2124                 len = min(num, cache->key.offset -
2125                           (bytenr - cache->key.objectid));
2126
2127                 spin_lock(&cache->space_info->lock);
2128                 spin_lock(&cache->lock);
2129                 if (reserve) {
2130                         cache->reserved += len;
2131                         cache->space_info->bytes_reserved += len;
2132                 } else {
2133                         cache->reserved -= len;
2134                         cache->space_info->bytes_reserved -= len;
2135                 }
2136                 spin_unlock(&cache->lock);
2137                 spin_unlock(&cache->space_info->lock);
2138                 btrfs_put_block_group(cache);
2139                 bytenr += len;
2140                 num -= len;
2141         }
2142         return 0;
2143 }
2144
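/*
 * copy everything currently pinned into another io_tree, so the
 * commit code can unpin from a stable copy while new extents keep
 * getting pinned in the live tree.
 */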
2145 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2146 {
2147         u64 last = 0;
2148         u64 start;
2149         u64 end;
2150         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2151         int ret;
2152
2153         while (1) {
2154                 ret = find_first_extent_bit(pinned_extents, last,
2155                                             &start, &end, EXTENT_DIRTY);
2156                 if (ret)
2157                         break;
2158                 set_extent_dirty(copy, start, end, GFP_NOFS);
2159                 last = end + 1;
2160         }
2161         return 0;
2162 }
2163
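/*
 * called near the end of a transaction commit: discard and unpin
 * every range in the unpin tree, returning the space to the free
 * space cache block group by block group.
 */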
2164 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2165                                struct btrfs_root *root,
2166                                struct extent_io_tree *unpin)
2167 {
2168         u64 start;
2169         u64 end;
2170         int ret;
2171
2172         while (1) {
2173                 ret = find_first_extent_bit(unpin, 0, &start, &end,
2174                                             EXTENT_DIRTY);
2175                 if (ret)
2176                         break;
2177
2178                 ret = btrfs_discard_extent(root, start, end + 1 - start);
2179
2180                 /* unlocks the pinned mutex */
2181                 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2182                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2183
2184                 cond_resched();
2185         }
2186         return 0;
2187 }
2188
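/*
 * pin down an extent that is being freed.  if it is a tree block from
 * the running transaction that was never written (and isn't from the
 * log or relocation trees), we skip the pin and hand the buffer back
 * through must_clean so the caller can clean and free it for
 * immediate reuse; in that case 1 is returned instead of 0.
 */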
2189 static int pin_down_bytes(struct btrfs_trans_handle *trans,
2190                           struct btrfs_root *root,
2191                           struct btrfs_path *path,
2192                           u64 bytenr, u64 num_bytes, int is_data,
2193                           struct extent_buffer **must_clean)
2194 {
2196         struct extent_buffer *buf;
2197
2198         if (is_data)
2199                 goto pinit;
2200
2201         buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2202         if (!buf)
2203                 goto pinit;
2204
2205         /* we can reuse a block if it hasn't been written
2206          * and it is from this transaction.  We can't
2207          * reuse anything from the tree log root because
2208          * it has tiny sub-transactions.
2209          */
2210         if (btrfs_buffer_uptodate(buf, 0) &&
2211             btrfs_try_tree_lock(buf)) {
2212                 u64 header_owner = btrfs_header_owner(buf);
2213                 u64 header_transid = btrfs_header_generation(buf);
2214                 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2215                     header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2216                     header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2217                     header_transid == trans->transid &&
2218                     !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2219                         *must_clean = buf;
2220                         return 1;
2221                 }
2222                 btrfs_tree_unlock(buf);
2223         }
2224         free_extent_buffer(buf);
2225 pinit:
2226         btrfs_set_path_blocking(path);
2227         /* unlocks the pinned mutex */
2228         btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2231         return 0;
2232 }
2233
2234 /*
2235  * remove an extent from the root, returns 0 on success
2236  */
2237 static int __free_extent(struct btrfs_trans_handle *trans,
2238                          struct btrfs_root *root,
2239                          u64 bytenr, u64 num_bytes, u64 parent,
2240                          u64 root_objectid, u64 ref_generation,
2241                          u64 owner_objectid, int pin, int mark_free,
2242                          int refs_to_drop)
2243 {
2244         struct btrfs_path *path;
2245         struct btrfs_key key;
2246         struct btrfs_fs_info *info = root->fs_info;
2247         struct btrfs_root *extent_root = info->extent_root;
2248         struct extent_buffer *leaf;
2249         int ret;
2250         int extent_slot = 0;
2251         int found_extent = 0;
2252         int num_to_del = 1;
2253         struct btrfs_extent_item *ei;
2254         u32 refs;
2255
2256         key.objectid = bytenr;
2257         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2258         key.offset = num_bytes;
2259         path = btrfs_alloc_path();
2260         if (!path)
2261                 return -ENOMEM;
2262
2263         path->reada = 1;
2264         path->leave_spinning = 1;
2265         ret = lookup_extent_backref(trans, extent_root, path,
2266                                     bytenr, parent, root_objectid,
2267                                     ref_generation, owner_objectid, 1);
2268         if (ret == 0) {
2269                 struct btrfs_key found_key;
2270                 extent_slot = path->slots[0];
2271                 while (extent_slot > 0) {
2272                         extent_slot--;
2273                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2274                                               extent_slot);
2275                         if (found_key.objectid != bytenr)
2276                                 break;
2277                         if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2278                             found_key.offset == num_bytes) {
2279                                 found_extent = 1;
2280                                 break;
2281                         }
2282                         if (path->slots[0] - extent_slot > 5)
2283                                 break;
2284                 }
2285                 if (!found_extent) {
2286                         ret = remove_extent_backref(trans, extent_root, path,
2287                                                     refs_to_drop);
2288                         BUG_ON(ret);
2289                         btrfs_release_path(extent_root, path);
2290                         path->leave_spinning = 1;
2291                         ret = btrfs_search_slot(trans, extent_root,
2292                                                 &key, path, -1, 1);
2293                         if (ret) {
2294                                 printk(KERN_ERR "umm, got %d back from search"
2295                                        ", was looking for %llu\n", ret,
2296                                        (unsigned long long)bytenr);
2297                                 btrfs_print_leaf(extent_root, path->nodes[0]);
2298                         }
2299                         BUG_ON(ret);
2300                         extent_slot = path->slots[0];
2301                 }
2302         } else {
2303                 btrfs_print_leaf(extent_root, path->nodes[0]);
2304                 WARN_ON(1);
2305                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2306                        "parent %llu root %llu gen %llu owner %llu\n",
2307                        (unsigned long long)bytenr,
2308                        (unsigned long long)parent,
2309                        (unsigned long long)root_objectid,
2310                        (unsigned long long)ref_generation,
2311                        (unsigned long long)owner_objectid);
2312         }
2313
2314         leaf = path->nodes[0];
2315         ei = btrfs_item_ptr(leaf, extent_slot,
2316                             struct btrfs_extent_item);
2317         refs = btrfs_extent_refs(leaf, ei);
2318
2319         /*
2320          * we're not allowed to delete the extent item if there
2321          * are other delayed ref updates pending
2322          */
2323
2324         BUG_ON(refs < refs_to_drop);
2325         refs -= refs_to_drop;
2326         btrfs_set_extent_refs(leaf, ei, refs);
2327         btrfs_mark_buffer_dirty(leaf);
2328
2329         if (refs == 0 && found_extent &&
2330             path->slots[0] == extent_slot + 1) {
2331                 struct btrfs_extent_ref *ref;
2332                 ref = btrfs_item_ptr(leaf, path->slots[0],
2333                                      struct btrfs_extent_ref);
2334                 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2335                 /* if the back ref and the extent are next to each other
2336                  * they get deleted below in one shot
2337                  */
2338                 path->slots[0] = extent_slot;
2339                 num_to_del = 2;
2340         } else if (found_extent) {
2341                 /* otherwise delete the extent back ref */
2342                 ret = remove_extent_backref(trans, extent_root, path,
2343                                             refs_to_drop);
2344                 BUG_ON(ret);
2345                 /* if refs are 0, we need to setup the path for deletion */
2346                 if (refs == 0) {
2347                         btrfs_release_path(extent_root, path);
2348                         path->leave_spinning = 1;
2349                         ret = btrfs_search_slot(trans, extent_root, &key, path,
2350                                                 -1, 1);
2351                         BUG_ON(ret);
2352                 }
2353         }
2354
2355         if (refs == 0) {
2356                 u64 super_used;
2357                 u64 root_used;
2358                 struct extent_buffer *must_clean = NULL;
2359
2360                 if (pin) {
2361                         ret = pin_down_bytes(trans, root, path,
2362                                 bytenr, num_bytes,
2363                                 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
2364                                 &must_clean);
2365                         if (ret > 0)
2366                                 mark_free = 1;
2367                         BUG_ON(ret < 0);
2368                 }
2369
2370                 /* block accounting for super block */
2371                 spin_lock(&info->delalloc_lock);
2372                 super_used = btrfs_super_bytes_used(&info->super_copy);
2373                 btrfs_set_super_bytes_used(&info->super_copy,
2374                                            super_used - num_bytes);
2375
2376                 /* block accounting for root item */
2377                 root_used = btrfs_root_used(&root->root_item);
2378                 btrfs_set_root_used(&root->root_item,
2379                                            root_used - num_bytes);
2380                 spin_unlock(&info->delalloc_lock);
2381
2382                 /*
2383                  * it is going to be very rare for someone to be waiting
2384                  * on the block we're freeing.  del_items might need to
2385                  * schedule, so rather than get fancy, just force it
2386                  * to blocking here
2387                  */
2388                 if (must_clean)
2389                         btrfs_set_lock_blocking(must_clean);
2390
2391                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2392                                       num_to_del);
2393                 BUG_ON(ret);
2394                 btrfs_release_path(extent_root, path);
2395
2396                 if (must_clean) {
2397                         clean_tree_block(NULL, root, must_clean);
2398                         btrfs_tree_unlock(must_clean);
2399                         free_extent_buffer(must_clean);
2400                 }
2401
2402                 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2403                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2404                         BUG_ON(ret);
2405                 } else {
2406                         invalidate_mapping_pages(info->btree_inode->i_mapping,
2407                              bytenr >> PAGE_CACHE_SHIFT,
2408                              (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
2409                 }
2410
2411                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2412                                          mark_free);
2413                 BUG_ON(ret);
2414         }
2415         btrfs_free_path(path);
2416         return ret;
2417 }
2418
2419 /*
2420  * remove an extent from the root, returns 0 on success
2421  */
2422 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2423                                         struct btrfs_root *root,
2424                                         u64 bytenr, u64 num_bytes, u64 parent,
2425                                         u64 root_objectid, u64 ref_generation,
2426                                         u64 owner_objectid, int pin,
2427                                         int refs_to_drop)
2428 {
2429         WARN_ON(num_bytes < root->sectorsize);
2430
2431         /*
2432          * metadata is always pinned.  data is pinned if the ref was
2433          * created in an earlier, already committed transaction
2434          */
2435         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2436             ref_generation != trans->transid)
2437                 pin = 1;
2438
2442         return __free_extent(trans, root, bytenr, num_bytes, parent,
2443                             root_objectid, ref_generation,
2444                             owner_objectid, pin, pin == 0, refs_to_drop);
2445 }
2446
2447 /*
2448  * when we free an extent, it is possible (and likely) that we free the last
2449  * delayed ref for that extent as well.  This searches the delayed ref tree for
2450  * a given extent, and if there are no other delayed refs to be processed, it
2451  * removes it from the tree.
2452  */
2453 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2454                                       struct btrfs_root *root, u64 bytenr)
2455 {
2456         struct btrfs_delayed_ref_head *head;
2457         struct btrfs_delayed_ref_root *delayed_refs;
2458         struct btrfs_delayed_ref_node *ref;
2459         struct rb_node *node;
2460         int ret;
2461
2462         delayed_refs = &trans->transaction->delayed_refs;
2463         spin_lock(&delayed_refs->lock);
2464         head = btrfs_find_delayed_ref_head(trans, bytenr);
2465         if (!head)
2466                 goto out;
2467
2468         node = rb_prev(&head->node.rb_node);
2469         if (!node)
2470                 goto out;
2471
2472         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2473
2474         /* there are still entries for this ref, we can't drop it */
2475         if (ref->bytenr == bytenr)
2476                 goto out;
2477
2478         /*
2479          * waiting for the lock here would deadlock.  If someone else has it
2480          * locked they are already in the process of dropping it anyway
2481          */
2482         if (!mutex_trylock(&head->mutex))
2483                 goto out;
2484
2485         /*
2486          * at this point we have a head with no other entries.  Go
2487          * ahead and process it.
2488          */
2489         head->node.in_tree = 0;
2490         rb_erase(&head->node.rb_node, &delayed_refs->root);
2491
2492         delayed_refs->num_entries--;
2493
2494         /*
2495          * we don't take a ref on the node because we're removing it from the
2496          * tree, so we just steal the ref the tree was holding.
2497          */
2498         delayed_refs->num_heads--;
2499         if (list_empty(&head->cluster))
2500                 delayed_refs->num_heads_ready--;
2501
2502         list_del_init(&head->cluster);
2503         spin_unlock(&delayed_refs->lock);
2504
2505         ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
2506                                   &head->node, head->must_insert_reserved);
2507         BUG_ON(ret);
2508         btrfs_put_delayed_ref(&head->node);
2509         return 0;
2510 out:
2511         spin_unlock(&delayed_refs->lock);
2512         return 0;
2513 }
2514
2515 int btrfs_free_extent(struct btrfs_trans_handle *trans,
2516                       struct btrfs_root *root,
2517                       u64 bytenr, u64 num_bytes, u64 parent,
2518                       u64 root_objectid, u64 ref_generation,
2519                       u64 owner_objectid, int pin)
2520 {
2521         int ret;
2522
2523         /*
2524          * tree log blocks never actually go into the extent allocation
2525          * tree, just update pinning info and exit early.
2526          *
2527          * data extents referenced by the tree log do need to have
2528          * their reference counts bumped.
2529          */
2530         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2531             owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2532                 /* unlocks the pinned mutex */
2533                 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2534                 update_reserved_extents(root, bytenr, num_bytes, 0);
2535                 ret = 0;
2536         } else {
2537                 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2538                                        root_objectid, ref_generation,
2539                                        owner_objectid,
2540                                        BTRFS_DROP_DELAYED_REF, 1);
2541                 BUG_ON(ret);
2542                 ret = check_ref_cleanup(trans, root, bytenr);
2543                 BUG_ON(ret);
2544         }
2545         return ret;
2546 }
2547
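/* round val up to the next stripesize boundary */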
2548 static u64 stripe_align(struct btrfs_root *root, u64 val)
2549 {
2550         u64 mask = ((u64)root->stripesize - 1);
2551         u64 ret = (val + mask) & ~mask;
2552         return ret;
2553 }
2554
2555 /*
2556  * walks the btree of allocated extents and finds a hole of a given size.
2557  * The key ins is changed to record the hole:
2558  * ins->objectid == block start
2559  * ins->flags = BTRFS_EXTENT_ITEM_KEY
2560  * ins->offset == number of blocks
2561  * Any available blocks before search_start are skipped.
2562  */
2563 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2564                                      struct btrfs_root *orig_root,
2565                                      u64 num_bytes, u64 empty_size,
2566                                      u64 search_start, u64 search_end,
2567                                      u64 hint_byte, struct btrfs_key *ins,
2568                                      u64 exclude_start, u64 exclude_nr,
2569                                      int data)
2570 {
2571         int ret = 0;
2572         struct btrfs_root *root = orig_root->fs_info->extent_root;
2573         struct btrfs_free_cluster *last_ptr = NULL;
2574         struct btrfs_block_group_cache *block_group = NULL;
2575         int empty_cluster = 2 * 1024 * 1024;
2576         int allowed_chunk_alloc = 0;
2577         struct btrfs_space_info *space_info;
2578         int last_ptr_loop = 0;
2579         int loop = 0;
2580
2581         WARN_ON(num_bytes < root->sectorsize);
2582         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2583         ins->objectid = 0;
2584         ins->offset = 0;
2585
2586         space_info = __find_space_info(root->fs_info, data);
2587
2588         if (orig_root->ref_cows || empty_size)
2589                 allowed_chunk_alloc = 1;
2590
2591         if (data & BTRFS_BLOCK_GROUP_METADATA) {
2592                 last_ptr = &root->fs_info->meta_alloc_cluster;
2593                 if (!btrfs_test_opt(root, SSD))
2594                         empty_cluster = 64 * 1024;
2595         }
2596
2597         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2598                 last_ptr = &root->fs_info->data_alloc_cluster;
2600
2601         if (last_ptr) {
2602                 spin_lock(&last_ptr->lock);
2603                 if (last_ptr->block_group)
2604                         hint_byte = last_ptr->window_start;
2605                 spin_unlock(&last_ptr->lock);
2606         }
2607
2608         search_start = max(search_start, first_logical_byte(root, 0));
2609         search_start = max(search_start, hint_byte);
2610
2611         if (!last_ptr) {
2612                 empty_cluster = 0;
2613                 loop = 1;
2614         }
2615
2616         if (search_start == hint_byte) {
2617                 block_group = btrfs_lookup_block_group(root->fs_info,
2618                                                        search_start);
2619                 if (block_group && block_group_bits(block_group, data)) {
2620                         down_read(&space_info->groups_sem);
2621                         goto have_block_group;
2622                 } else if (block_group) {
2623                         btrfs_put_block_group(block_group);
2624                 }
2625         }
2626
2627 search:
2628         down_read(&space_info->groups_sem);
2629         list_for_each_entry(block_group, &space_info->block_groups, list) {
2630                 u64 offset;
2631
2632                 atomic_inc(&block_group->count);
2633                 search_start = block_group->key.objectid;
2634
2635 have_block_group:
2636                 if (unlikely(!block_group->cached)) {
2637                         mutex_lock(&block_group->cache_mutex);
2638                         ret = cache_block_group(root, block_group);
2639                         mutex_unlock(&block_group->cache_mutex);
2640                         if (ret) {
2641                                 btrfs_put_block_group(block_group);
2642                                 break;
2643                         }
2644                 }
2645
2646                 if (unlikely(block_group->ro))
2647                         goto loop;
2648
2649                 if (last_ptr) {
2650                         /*
2651                          * the refill lock keeps out other
2652                          * people trying to start a new cluster
2653                          */
2654                         spin_lock(&last_ptr->refill_lock);
2655                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2656                                                  num_bytes, search_start);
2657                         if (offset) {
2658                                 /* we have a block, we're done */
2659                                 spin_unlock(&last_ptr->refill_lock);
2660                                 goto checks;
2661                         }
2662
2663                         spin_lock(&last_ptr->lock);
2664                         /*
2665                          * whoops, this cluster doesn't actually point to
2666                          * this block group.  Get a ref on the block
2667                          * group is does point to and try again
2668                          * group it does point to and try again
2669                         if (!last_ptr_loop && last_ptr->block_group &&
2670                             last_ptr->block_group != block_group) {
2671
2672                                 btrfs_put_block_group(block_group);
2673                                 block_group = last_ptr->block_group;
2674                                 atomic_inc(&block_group->count);
2675                                 spin_unlock(&last_ptr->lock);
2676                                 spin_unlock(&last_ptr->refill_lock);
2677
2678                                 last_ptr_loop = 1;
2679                                 search_start = block_group->key.objectid;
2680                                 goto have_block_group;
2681                         }
2682                         spin_unlock(&last_ptr->lock);
2683
2684                         /*
2685                          * this cluster didn't work out, free it and
2686                          * start over
2687                          */
2688                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
2689
2690                         last_ptr_loop = 0;
2691
2692                         /* allocate a cluster in this block group */
2693                         ret = btrfs_find_space_cluster(trans,
2694                                                block_group, last_ptr,
2695                                                offset, num_bytes,
2696                                                empty_cluster + empty_size);
2697                         if (ret == 0) {
2698                                 /*
2699                                  * now pull our allocation out of this
2700                                  * cluster
2701                                  */
2702                                 offset = btrfs_alloc_from_cluster(block_group,
2703                                                   last_ptr, num_bytes,
2704                                                   search_start);
2705                                 if (offset) {
2706                                         /* we found one, proceed */
2707                                         spin_unlock(&last_ptr->refill_lock);
2708                                         goto checks;
2709                                 }
2710                         }
2711                         /*
2712                          * at this point we either didn't find a cluster
2713                          * or we weren't able to allocate a block from our
2714                          * cluster.  Free the cluster we've been trying
2715                          * to use, and go to the next block group
2716                          */
2717                         if (loop < 2) {
2718                                 btrfs_return_cluster_to_free_space(NULL,
2719                                                                    last_ptr);
2720                                 spin_unlock(&last_ptr->refill_lock);
2721                                 goto loop;
2722                         }
2723                         spin_unlock(&last_ptr->refill_lock);
2724                 }
2725
2726                 offset = btrfs_find_space_for_alloc(block_group, search_start,
2727                                                     num_bytes, empty_size);
2728                 if (!offset)
2729                         goto loop;
2730 checks:
2731                 search_start = stripe_align(root, offset);
2732
2733                 /* the allocation would run past search_end, move on */
2734                 if (search_start + num_bytes >= search_end) {
2735                         btrfs_add_free_space(block_group, offset, num_bytes);
2736                         goto loop;
2737                 }
2738
2739                 /* allocation would spill past the end of this block group, move on */
2740                 if (search_start + num_bytes >
2741                     block_group->key.objectid + block_group->key.offset) {
2742                         btrfs_add_free_space(block_group, offset, num_bytes);
2743                         goto loop;
2744                 }
2745
2746                 if (exclude_nr > 0 &&
2747                     (search_start + num_bytes > exclude_start &&
2748                      search_start < exclude_start + exclude_nr)) {
2749                         search_start = exclude_start + exclude_nr;
2750
2751                         btrfs_add_free_space(block_group, offset, num_bytes);
2752                         /*
2753                          * if search_start is still in this block group
2754                          * then we just re-search this block group
2755                          */
2756                         if (search_start >= block_group->key.objectid &&
2757                             search_start < (block_group->key.objectid +
2758                                             block_group->key.offset))
2759                                 goto have_block_group;
2760                         goto loop;
2761                 }
2762
2763                 ins->objectid = search_start;
2764                 ins->offset = num_bytes;
2765
2766                 if (offset < search_start)
2767                         btrfs_add_free_space(block_group, offset,
2768                                              search_start - offset);
2769                 BUG_ON(offset > search_start);
2770
2771                 /* we are all good, let's return */
2772                 break;
2773 loop:
2774                 btrfs_put_block_group(block_group);
2775         }
2776         up_read(&space_info->groups_sem);
2777
2778         /* loop == 0, try to find a clustered alloc in every block group
2779          * loop == 1, try again after forcing a chunk allocation
2780          * loop == 2, set empty_size and empty_cluster to 0 and try again
2781          */
2782         if (!ins->objectid && loop < 3 &&
2783             (empty_size || empty_cluster || allowed_chunk_alloc)) {
2784                 if (loop >= 2) {
2785                         empty_size = 0;
2786                         empty_cluster = 0;
2787                 }
2788
2789                 if (allowed_chunk_alloc) {
2790                         ret = do_chunk_alloc(trans, root, num_bytes +
2791                                              2 * 1024 * 1024, data, 1);
2792                         allowed_chunk_alloc = 0;
2793                 } else {
2794                         space_info->force_alloc = 1;
2795                 }
2796
2797                 if (loop < 3) {
2798                         loop++;
2799                         goto search;
2800                 }
2801                 ret = -ENOSPC;
2802         } else if (!ins->objectid) {
2803                 ret = -ENOSPC;
2804         }
2805
2806         /* we found what we needed */
2807         if (ins->objectid) {
2808                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2809                         trans->block_group = block_group->key.objectid;
2810
2811                 btrfs_put_block_group(block_group);
2812                 ret = 0;
2813         }
2814
2815         return ret;
2816 }
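/*
 * Illustrative sketch of the retry ladder implemented by the function
 * above, kept out of the build with #if 0 in the style of the debugging
 * block later in this file.  try_alloc_pass() is an invented name that
 * stands in for one trip through space_info->block_groups.
 */
#if 0
static int alloc_with_escalation(void)
{
        int loop;

        for (loop = 0; loop < 3; loop++) {
                /* loop == 0: clustered allocation in every block group */
                /* loop == 1: the same, after forcing a chunk allocation */
                /* loop == 2: retry with empty_size and empty_cluster 0 */
                if (try_alloc_pass(loop) == 0)
                        return 0;
        }
        return -ENOSPC;
}
#endif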
2817
2818 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
2819 {
2820         struct btrfs_block_group_cache *cache;
2821
2822         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
2823                (unsigned long long)(info->total_bytes - info->bytes_used -
2824                                     info->bytes_pinned - info->bytes_reserved),
2825                (info->full) ? "" : "not ");
2826         printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
2827                " may_use=%llu, used=%llu\n", (unsigned long long)info->total_bytes,
2828                (unsigned long long)info->bytes_pinned, (unsigned long long)info->bytes_delalloc,
2829                (unsigned long long)info->bytes_may_use, (unsigned long long)info->bytes_used);
2830
2831         down_read(&info->groups_sem);
2832         list_for_each_entry(cache, &info->block_groups, list) {
2833                 spin_lock(&cache->lock);
2834                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
2835                        "%llu pinned %llu reserved\n",
2836                        (unsigned long long)cache->key.objectid,
2837                        (unsigned long long)cache->key.offset,
2838                        (unsigned long long)btrfs_block_group_used(&cache->item),
2839                        (unsigned long long)cache->pinned,
2840                        (unsigned long long)cache->reserved);
2841                 btrfs_dump_free_space(cache, bytes);
2842                 spin_unlock(&cache->lock);
2843         }
2844         up_read(&info->groups_sem);
2845 }
2846
2847 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2848                                   struct btrfs_root *root,
2849                                   u64 num_bytes, u64 min_alloc_size,
2850                                   u64 empty_size, u64 hint_byte,
2851                                   u64 search_end, struct btrfs_key *ins,
2852                                   u64 data)
2853 {
2854         int ret;
2855         u64 search_start = 0;
2856         struct btrfs_fs_info *info = root->fs_info;
2857
2858         data = btrfs_get_alloc_profile(root, data);
2859 again:
2860         /*
2861          * the only place that sets empty_size is btrfs_realloc_node, which
2862          * is not called recursively on allocations
2863          */
2864         if (empty_size || root->ref_cows) {
2865                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
2866                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2867                                      2 * 1024 * 1024,
2868                                      BTRFS_BLOCK_GROUP_METADATA |
2869                                      (info->metadata_alloc_profile &
2870                                       info->avail_metadata_alloc_bits), 0);
2871                 }
2872                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2873                                      num_bytes + 2 * 1024 * 1024, data, 0);
2874         }
2875
2876         WARN_ON(num_bytes < root->sectorsize);
2877         ret = find_free_extent(trans, root, num_bytes, empty_size,
2878                                search_start, search_end, hint_byte, ins,
2879                                trans->alloc_exclude_start,
2880                                trans->alloc_exclude_nr, data);
2881
2882         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
2883                 num_bytes = num_bytes >> 1;
2884                 num_bytes = num_bytes & ~(root->sectorsize - 1);
2885                 num_bytes = max(num_bytes, min_alloc_size);
2886                 do_chunk_alloc(trans, root->fs_info->extent_root,
2887                                num_bytes, data, 1);
2888                 goto again;
2889         }
2890         if (ret) {
2891                 struct btrfs_space_info *sinfo;
2892
2893                 sinfo = __find_space_info(root->fs_info, data);
2894                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
2895                        "wanted %llu\n", (unsigned long long)data,
2896                        (unsigned long long)num_bytes);
2897                 dump_space_info(sinfo, num_bytes);
2898                 BUG();
2899         }
2900
2901         return ret;
2902 }
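/*
 * Sketch of the -ENOSPC size fallback above, illustration only: halve
 * the request, round down to a sector boundary, never drop below
 * min_alloc_size.  The helper name is invented for the example.
 */
#if 0
static u64 next_alloc_size(u64 num_bytes, u64 min_alloc_size, u64 sectorsize)
{
        num_bytes >>= 1;                        /* ask for half as much  */
        num_bytes &= ~(sectorsize - 1);         /* keep sector alignment */
        if (num_bytes < min_alloc_size)         /* clamp to the floor    */
                num_bytes = min_alloc_size;
        return num_bytes;
}
#endif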
2903
2904 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2905 {
2906         struct btrfs_block_group_cache *cache;
2907         int ret = 0;
2908
2909         cache = btrfs_lookup_block_group(root->fs_info, start);
2910         if (!cache) {
2911                 printk(KERN_ERR "Unable to find block group for %llu\n",
2912                        (unsigned long long)start);
2913                 return -ENOSPC;
2914         }
2915
2916         ret = btrfs_discard_extent(root, start, len);
2917
2918         btrfs_add_free_space(cache, start, len);
2919         btrfs_put_block_group(cache);
2920         update_reserved_extents(root, start, len, 0);
2921
2922         return ret;
2923 }
2924
2925 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2926                                   struct btrfs_root *root,
2927                                   u64 num_bytes, u64 min_alloc_size,
2928                                   u64 empty_size, u64 hint_byte,
2929                                   u64 search_end, struct btrfs_key *ins,
2930                                   u64 data)
2931 {
2932         int ret;
2933         ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2934                                      empty_size, hint_byte, search_end, ins,
2935                                      data);
2936         update_reserved_extents(root, ins->objectid, ins->offset, 1);
2937         return ret;
2938 }
2939
2940 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2941                                          struct btrfs_root *root, u64 parent,
2942                                          u64 root_objectid, u64 ref_generation,
2943                                          u64 owner, struct btrfs_key *ins,
2944                                          int ref_mod)
2945 {
2946         int ret;
2947         u64 super_used;
2948         u64 root_used;
2949         u64 num_bytes = ins->offset;
2950         u32 sizes[2];
2951         struct btrfs_fs_info *info = root->fs_info;
2952         struct btrfs_root *extent_root = info->extent_root;
2953         struct btrfs_extent_item *extent_item;
2954         struct btrfs_extent_ref *ref;
2955         struct btrfs_path *path;
2956         struct btrfs_key keys[2];
2957
2958         if (parent == 0)
2959                 parent = ins->objectid;
2960
2961         /* block accounting for super block */
2962         spin_lock(&info->delalloc_lock);
2963         super_used = btrfs_super_bytes_used(&info->super_copy);
2964         btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2965
2966         /* block accounting for root item */
2967         root_used = btrfs_root_used(&root->root_item);
2968         btrfs_set_root_used(&root->root_item, root_used + num_bytes);
2969         spin_unlock(&info->delalloc_lock);
2970
2971         memcpy(&keys[0], ins, sizeof(*ins));
2972         keys[1].objectid = ins->objectid;
2973         keys[1].type = BTRFS_EXTENT_REF_KEY;
2974         keys[1].offset = parent;
2975         sizes[0] = sizeof(*extent_item);
2976         sizes[1] = sizeof(*ref);
2977
2978         path = btrfs_alloc_path();
2979         BUG_ON(!path);
2980
2981         path->leave_spinning = 1;
2982         ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
2983                                        sizes, 2);
2984         BUG_ON(ret);
2985
2986         extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2987                                      struct btrfs_extent_item);
2988         btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
2989         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
2990                              struct btrfs_extent_ref);
2991
2992         btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
2993         btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
2994         btrfs_set_ref_objectid(path->nodes[0], ref, owner);
2995         btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
2996
2997         btrfs_mark_buffer_dirty(path->nodes[0]);
2998
2999         trans->alloc_exclude_start = 0;
3000         trans->alloc_exclude_nr = 0;
3001         btrfs_free_path(path);
3002
3003         if (ret)
3004                 goto out;
3005
3006         ret = update_block_group(trans, root, ins->objectid,
3007                                  ins->offset, 1, 0);
3008         if (ret) {
3009                 printk(KERN_ERR "btrfs update block group failed for %llu "
3010                        "%llu\n", (unsigned long long)ins->objectid,
3011                        (unsigned long long)ins->offset);
3012                 BUG();
3013         }
3014 out:
3015         return ret;
3016 }
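/*
 * The insert above creates two adjacent items in one btree operation:
 * the extent item (keyed by *ins, whose type callers set to
 * BTRFS_EXTENT_ITEM_KEY) and the backref item keyed by parent.  Sketch
 * of the resulting key pair, illustration only:
 */
#if 0
        keys[0].objectid = ins->objectid;       /* extent bytenr        */
        keys[0].type = BTRFS_EXTENT_ITEM_KEY;
        keys[0].offset = ins->offset;           /* extent length        */

        keys[1].objectid = ins->objectid;       /* same bytenr          */
        keys[1].type = BTRFS_EXTENT_REF_KEY;
        keys[1].offset = parent;                /* who references it    */
#endif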
3017
3018 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3019                                 struct btrfs_root *root, u64 parent,
3020                                 u64 root_objectid, u64 ref_generation,
3021                                 u64 owner, struct btrfs_key *ins)
3022 {
3023         int ret;
3024
3025         if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3026                 return 0;
3027
3028         ret = btrfs_add_delayed_ref(trans, ins->objectid,
3029                                     ins->offset, parent, root_objectid,
3030                                     ref_generation, owner,
3031                                     BTRFS_ADD_DELAYED_EXTENT, 0);
3032         BUG_ON(ret);
3033         return ret;
3034 }
3035
3036 /*
3037  * this is used by the tree logging recovery code.  It records that
3038  * an extent has been allocated and makes sure to clear the free
3039  * space cache bits as well
3040  */
3041 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3042                                 struct btrfs_root *root, u64 parent,
3043                                 u64 root_objectid, u64 ref_generation,
3044                                 u64 owner, struct btrfs_key *ins)
3045 {
3046         int ret;
3047         struct btrfs_block_group_cache *block_group;
3048
3049         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
             BUG_ON(!block_group);
3050         mutex_lock(&block_group->cache_mutex);
3051         cache_block_group(root, block_group);
3052         mutex_unlock(&block_group->cache_mutex);
3053
3054         ret = btrfs_remove_free_space(block_group, ins->objectid,
3055                                       ins->offset);
3056         BUG_ON(ret);
3057         btrfs_put_block_group(block_group);
3058         ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3059                                             ref_generation, owner, ins, 1);
3060         return ret;
3061 }
3062
3063 /*
3064  * finds a free extent and does all the dirty work required for
3065  * allocation.  The key for the extent is returned through ins; callers
3066  * that need a tree buffer for the new block use btrfs_alloc_free_block().
3067  *
3068  * returns 0 if everything worked, non-zero otherwise.
3069  */
3070 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3071                        struct btrfs_root *root,
3072                        u64 num_bytes, u64 parent, u64 min_alloc_size,
3073                        u64 root_objectid, u64 ref_generation,
3074                        u64 owner_objectid, u64 empty_size, u64 hint_byte,
3075                        u64 search_end, struct btrfs_key *ins, u64 data)
3076 {
3077         int ret;
3078         ret = __btrfs_reserve_extent(trans, root, num_bytes,
3079                                      min_alloc_size, empty_size, hint_byte,
3080                                      search_end, ins, data);
3081         BUG_ON(ret);
3082         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3083                 ret = btrfs_add_delayed_ref(trans, ins->objectid,
3084                                             ins->offset, parent, root_objectid,
3085                                             ref_generation, owner_objectid,
3086                                             BTRFS_ADD_DELAYED_EXTENT, 0);
3087                 BUG_ON(ret);
3088         }
3089         update_reserved_extents(root, ins->objectid, ins->offset, 1);
3090         return ret;
3091 }
3092
3093 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3094                                             struct btrfs_root *root,
3095                                             u64 bytenr, u32 blocksize,
3096                                             int level)
3097 {
3098         struct extent_buffer *buf;
3099
3100         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3101         if (!buf)
3102                 return ERR_PTR(-ENOMEM);
3103         btrfs_set_header_generation(buf, trans->transid);
3104         btrfs_set_buffer_lockdep_class(buf, level);
3105         btrfs_tree_lock(buf);
3106         clean_tree_block(trans, root, buf);
3107
3108         btrfs_set_lock_blocking(buf);
3109         btrfs_set_buffer_uptodate(buf);
3110
3111         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3112                 set_extent_dirty(&root->dirty_log_pages, buf->start,
3113                          buf->start + buf->len - 1, GFP_NOFS);
3114         } else {
3115                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3116                          buf->start + buf->len - 1, GFP_NOFS);
3117         }
3118         trans->blocks_used++;
3119         /* this returns a buffer locked for blocking */
3120         return buf;
3121 }
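/*
 * Usage sketch, illustration only: as the comment above says, the
 * buffer comes back locked for blocking, so a caller pairs it with
 * btrfs_tree_unlock() and free_extent_buffer() when done.
 */
#if 0
        buf = btrfs_init_new_buffer(trans, root, bytenr, blocksize, level);
        if (!IS_ERR(buf)) {
                /* ... fill in the new block ... */
                btrfs_tree_unlock(buf);
                free_extent_buffer(buf);
        }
#endif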
3122
3123 /*
3124  * helper function to allocate a block for a given tree.
3125  * returns the tree buffer or an ERR_PTR on failure.
3126  */
3127 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3128                                              struct btrfs_root *root,
3129                                              u32 blocksize, u64 parent,
3130                                              u64 root_objectid,
3131                                              u64 ref_generation,
3132                                              int level,
3133                                              u64 hint,
3134                                              u64 empty_size)
3135 {
3136         struct btrfs_key ins;
3137         int ret;
3138         struct extent_buffer *buf;
3139
3140         ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3141                                  root_objectid, ref_generation, level,
3142                                  empty_size, hint, (u64)-1, &ins, 0);
3143         if (ret) {
3144                 BUG_ON(ret > 0);
3145                 return ERR_PTR(ret);
3146         }
3147
3148         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3149                                     blocksize, level);
3150         return buf;
3151 }
3152
3153 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3154                         struct btrfs_root *root, struct extent_buffer *leaf)
3155 {
3156         u64 leaf_owner;
3157         u64 leaf_generation;
3158         struct refsort *sorted;
3159         struct btrfs_key key;
3160         struct btrfs_file_extent_item *fi;
3161         int i;
3162         int nritems;
3163         int ret;
3164         int refi = 0;
3165         int slot;
3166
3167         BUG_ON(!btrfs_is_leaf(leaf));
3168         nritems = btrfs_header_nritems(leaf);
3169         leaf_owner = btrfs_header_owner(leaf);
3170         leaf_generation = btrfs_header_generation(leaf);
3171
3172         sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
             BUG_ON(!sorted);
3173         /* we do this loop twice.  The first time we build a list
3174          * of the extents we have a reference on, then we sort the list
3175          * by bytenr.  The second time around we actually do the
3176          * extent freeing.
3177          */
3178         for (i = 0; i < nritems; i++) {
3179                 u64 disk_bytenr;
3180                 cond_resched();
3181
3182                 btrfs_item_key_to_cpu(leaf, &key, i);
3183
3184                 /* only extents have references, skip everything else */
3185                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3186                         continue;
3187
3188                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3189
3190                 /* inline extents live in the btree, they don't have refs */
3191                 if (btrfs_file_extent_type(leaf, fi) ==
3192                     BTRFS_FILE_EXTENT_INLINE)
3193                         continue;
3194
3195                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3196
3197                 /* holes don't have refs */
3198                 if (disk_bytenr == 0)
3199                         continue;
3200
3201                 sorted[refi].bytenr = disk_bytenr;
3202                 sorted[refi].slot = i;
3203                 refi++;
3204         }
3205
3206         if (refi == 0)
3207                 goto out;
3208
3209         sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3210
3211         for (i = 0; i < refi; i++) {
3212                 u64 disk_bytenr;
3213
3214                 disk_bytenr = sorted[i].bytenr;
3215                 slot = sorted[i].slot;
3216
3217                 cond_resched();
3218
3219                 btrfs_item_key_to_cpu(leaf, &key, slot);
3220                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3221                         continue;
3222
3223                 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3224
3225                 ret = btrfs_free_extent(trans, root, disk_bytenr,
3226                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
3227                                 leaf->start, leaf_owner, leaf_generation,
3228                                 key.objectid, 0);
3229                 BUG_ON(ret);
3230
3231                 atomic_inc(&root->fs_info->throttle_gen);
3232                 wake_up(&root->fs_info->transaction_throttle);
3233                 cond_resched();
3234         }
3235 out:
3236         kfree(sorted);
3237         return 0;
3238 }
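/*
 * The two-pass pattern above (collect bytenrs, sort, then free in
 * bytenr order) is what keeps the extent tree updates mostly
 * sequential.  A comparator of the shape sort() expects might look
 * like this sketch; the real refsort_cmp is defined elsewhere in this
 * file and may differ.
 */
#if 0
static int refsort_cmp_sketch(const void *a, const void *b)
{
        const struct refsort *ra = a;
        const struct refsort *rb = b;

        if (ra->bytenr < rb->bytenr)
                return -1;
        if (ra->bytenr > rb->bytenr)
                return 1;
        return 0;
}
#endif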
3239
3240 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3241                                         struct btrfs_root *root,
3242                                         struct btrfs_leaf_ref *ref)
3243 {
3244         int i;
3245         int ret;
3246         struct btrfs_extent_info *info;
3247         struct refsort *sorted;
3248
3249         if (ref->nritems == 0)
3250                 return 0;
3251
3252         sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
             BUG_ON(!sorted);
3253         for (i = 0; i < ref->nritems; i++) {
3254                 sorted[i].bytenr = ref->extents[i].bytenr;
3255                 sorted[i].slot = i;
3256         }
3257         sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3258
3259         /*
3260          * free the extents in sorted bytenr order so the updates to
3261          * the extent allocation tree stay mostly sequential
3262          */
3263         for (i = 0; i < ref->nritems; i++) {
3264                 info = ref->extents + sorted[i].slot;
3265                 ret = btrfs_free_extent(trans, root, info->bytenr,
3266                                           info->num_bytes, ref->bytenr,
3267                                           ref->owner, ref->generation,
3268                                           info->objectid, 0);
3269
3270                 atomic_inc(&root->fs_info->throttle_gen);
3271                 wake_up(&root->fs_info->transaction_throttle);
3272                 cond_resched();
3273
3274                 BUG_ON(ret);
3276         }
3277
3278         kfree(sorted);
3279         return 0;
3280 }
3281
3282 static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3283                                      struct btrfs_root *root, u64 start,
3284                                      u64 len, u32 *refs)
3285 {
3286         int ret;
3287
3288         ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3289         BUG_ON(ret);
3290
3291 #if 0 /* some debugging code in case we see problems here */
3292         /* if the refs count is one, it won't get increased again.  But
3293          * if the ref count is > 1, someone may be decreasing it at
3294          * the same time we are.
3295          */
3296         if (*refs != 1) {
3297                 struct extent_buffer *eb = NULL;
3298                 eb = btrfs_find_create_tree_block(root, start, len);
3299                 if (eb)
3300                         btrfs_tree_lock(eb);
3301
3302                 mutex_lock(&root->fs_info->alloc_mutex);
3303                 ret = lookup_extent_ref(NULL, root, start, len, refs);
3304                 BUG_ON(ret);
3305                 mutex_unlock(&root->fs_info->alloc_mutex);
3306
3307                 if (eb) {
3308                         btrfs_tree_unlock(eb);
3309                         free_extent_buffer(eb);
3310                 }
3311                 if (*refs == 1) {
3312                         printk(KERN_ERR "btrfs block %llu went down to one "
3313                                "during drop_snap\n", (unsigned long long)start);
3314                 }
3315
3316         }
3317 #endif
3318
3319         cond_resched();
3320         return ret;
3321 }
3322
3323 /*
3324  * this is used while deleting old snapshots, and it drops the refs
3325  * on a whole subtree starting from a level 1 node.
3326  *
3327  * The idea is to sort all the leaf pointers, and then drop the
3328  * ref on all the leaves in order.  Most of the time the leaves
3329  * will have ref cache entries, so no leaf IOs will be required to
3330  * find the extents they have references on.
3331  *
3332  * For each leaf, any references it has are also dropped in order
3333  *
3334  * This ends up dropping the references in something close to optimal
3335  * order for reading and modifying the extent allocation tree.
3336  */
3337 static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3338                                         struct btrfs_root *root,
3339                                         struct btrfs_path *path)
3340 {
3341         u64 bytenr;
3342         u64 root_owner;
3343         u64 root_gen;
3344         struct extent_buffer *eb = path->nodes[1];
3345         struct extent_buffer *leaf;
3346         struct btrfs_leaf_ref *ref;
3347         struct refsort *sorted = NULL;
3348         int nritems = btrfs_header_nritems(eb);
3349         int ret;
3350         int i;
3351         int refi = 0;
3352         int slot = path->slots[1];
3353         u32 blocksize = btrfs_level_size(root, 0);
3354         u32 refs;
3355
3356         if (nritems == 0)
3357                 goto out;
3358
3359         root_owner = btrfs_header_owner(eb);
3360         root_gen = btrfs_header_generation(eb);
3361         sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
             BUG_ON(!sorted);
3362
3363         /*
3364          * step one, sort all the leaf pointers so we don't scribble
3365          * randomly into the extent allocation tree
3366          */
3367         for (i = slot; i < nritems; i++) {
3368                 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3369                 sorted[refi].slot = i;
3370                 refi++;
3371         }
3372
3373         /*
3374          * nritems won't be zero, but if we're picking up drop_snapshot
3375          * after a crash, slot might be > 0, so double check things
3376          * just in case.
3377          */
3378         if (refi == 0)
3379                 goto out;
3380
3381         sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3382
3383         /*
3384          * the first loop frees everything the leaves point to
3385          */
3386         for (i = 0; i < refi; i++) {
3387                 u64 ptr_gen;
3388
3389                 bytenr = sorted[i].bytenr;
3390
3391                 /*
3392                  * check the reference count on this leaf.  If it is > 1
3393                  * we just decrement it below and don't update any
3394                  * of the refs the leaf points to.
3395                  */
3396                 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3397                                                 blocksize, &refs);
3398                 BUG_ON(ret);
3399                 if (refs != 1)
3400                         continue;
3401
3402                 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3403
3404                 /*
3405                  * the leaf only had one reference, which means the
3406                  * only thing pointing to this leaf is the snapshot
3407                  * we're deleting.  It isn't possible for the reference
3408                  * count to increase again later
3409                  *
3410                  * The reference cache is checked for the leaf,
3411                  * and if found we'll be able to drop any refs held by
3412                  * the leaf without needing to read it in.
3413                  */
3414                 ref = btrfs_lookup_leaf_ref(root, bytenr);
3415                 if (ref && ref->generation != ptr_gen) {
3416                         btrfs_free_leaf_ref(root, ref);
3417                         ref = NULL;
3418                 }
3419                 if (ref) {
3420                         ret = cache_drop_leaf_ref(trans, root, ref);
3421                         BUG_ON(ret);
3422                         btrfs_remove_leaf_ref(root, ref);
3423                         btrfs_free_leaf_ref(root, ref);
3424                 } else {
3425                         /*
3426                          * the leaf wasn't in the reference cache, so
3427                          * we have to read it.
3428                          */
3429                         leaf = read_tree_block(root, bytenr, blocksize,
3430                                                ptr_gen);
3431                         ret = btrfs_drop_leaf_ref(trans, root, leaf);
3432                         BUG_ON(ret);
3433                         free_extent_buffer(leaf);
3434                 }
3435                 atomic_inc(&root->fs_info->throttle_gen);
3436                 wake_up(&root->fs_info->transaction_throttle);
3437                 cond_resched();
3438         }
3439
3440         /*
3441          * run through the loop again to free the refs on the leaves.
3442          * This is faster than doing it in the loop above because
3443          * the leaves are likely to be clustered together.  We end up
3444          * working in nice chunks on the extent allocation tree.
3445          */
3446         for (i = 0; i < refi; i++) {
3447                 bytenr = sorted[i].bytenr;
3448                 ret = btrfs_free_extent(trans, root, bytenr,
3449                                         blocksize, eb->start,
3450                                         root_owner, root_gen, 0, 1);
3451                 BUG_ON(ret);
3452
3453                 atomic_inc(&root->fs_info->throttle_gen);
3454                 wake_up(&root->fs_info->transaction_throttle);
3455                 cond_resched();
3456         }
3457 out:
3458         kfree(sorted);
3459
3460         /*
3461          * update the path to show we've processed the entire level 1
3462          * node.  This will get saved into the root's drop_progress
3463          * field so these drops are not repeated if this transaction
3464          * commits.
3465          */
3466         path->slots[1] = nritems;
3467         return 0;
3468 }
3469
3470 /*
3471  * helper function for drop_snapshot, this walks down the tree dropping ref
3472  * counts as it goes.
3473  */
3474 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3475                                    struct btrfs_root *root,
3476                                    struct btrfs_path *path, int *level)
3477 {
3478         u64 root_owner;
3479         u64 root_gen;
3480         u64 bytenr;
3481         u64 ptr_gen;
3482         struct extent_buffer *next;
3483         struct extent_buffer *cur;
3484         struct extent_buffer *parent;
3485         u32 blocksize;
3486         int ret;
3487         u32 refs;
3488
3489         WARN_ON(*level < 0);
3490         WARN_ON(*level >= BTRFS_MAX_LEVEL);
3491         ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
3492                                 path->nodes[*level]->len, &refs);
3493         BUG_ON(ret);
3494         if (refs > 1)
3495                 goto out;
3496
3497         /*
3498          * walk down to the last node level and free all the leaves
3499          */
3500         while (*level >= 0) {
3501                 WARN_ON(*level < 0);
3502                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3503                 cur = path->nodes[*level];
3504
3505                 WARN_ON(btrfs_header_level(cur) != *level);
3507
3508                 if (path->slots[*level] >=
3509                     btrfs_header_nritems(cur))
3510                         break;
3511
3512                 /* the new code goes down to level 1 and does all the
3513                  * leaves pointed to that node in bulk.  So, this check
3514                  * for level 0 will always be false.
3515                  *
3516                  * But, the disk format allows the drop_snapshot_progress
3517                  * field in the root to leave things in a state where
3518                  * a leaf will need cleaning up here.  If someone crashes
3519                  * with the old code and then boots with the new code,
3520                  * we might find a leaf here.
3521                  */
3522                 if (*level == 0) {
3523                         ret = btrfs_drop_leaf_ref(trans, root, cur);
3524                         BUG_ON(ret);
3525                         break;
3526                 }
3527
3528                 /*
3529                  * once we get to level one, process the whole node
3530                  * at once, including everything below it.
3531                  */
3532                 if (*level == 1) {
3533                         ret = drop_level_one_refs(trans, root, path);
3534                         BUG_ON(ret);
3535                         break;
3536                 }
3537
3538                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3539                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3540                 blocksize = btrfs_level_size(root, *level - 1);
3541
3542                 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3543                                                 blocksize, &refs);
3544                 BUG_ON(ret);
3545
3546                 /*
3547                  * if there is more than one reference, we don't need
3548                  * to read that node to drop any references it has.  We
3549                  * just drop the ref we hold on that node and move on to the
3550                  * next slot in this level.
3551                  */
3552                 if (refs != 1) {
3553                         parent = path->nodes[*level];
3554                         root_owner = btrfs_header_owner(parent);
3555                         root_gen = btrfs_header_generation(parent);
3556                         path->slots[*level]++;
3557
3558                         ret = btrfs_free_extent(trans, root, bytenr,
3559                                                 blocksize, parent->start,
3560                                                 root_owner, root_gen,
3561                                                 *level - 1, 1);
3562                         BUG_ON(ret);
3563
3564                         atomic_inc(&root->fs_info->throttle_gen);
3565                         wake_up(&root->fs_info->transaction_throttle);
3566                         cond_resched();
3567
3568                         continue;
3569                 }
3570
3571                 /*
3572                  * we need to keep freeing things in the next level down.
3573                  * read the block and loop around to process it
3574                  */
3575                 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3576                 WARN_ON(*level <= 0);
3577                 if (path->nodes[*level-1])
3578                         free_extent_buffer(path->nodes[*level-1]);
3579                 path->nodes[*level-1] = next;
3580                 *level = btrfs_header_level(next);
3581                 path->slots[*level] = 0;
3582                 cond_resched();
3583         }
3584 out:
3585         WARN_ON(*level < 0);
3586         WARN_ON(*level >= BTRFS_MAX_LEVEL);
3587
3588         if (path->nodes[*level] == root->node) {
3589                 parent = path->nodes[*level];
3590                 bytenr = path->nodes[*level]->start;
3591         } else {
3592                 parent = path->nodes[*level + 1];
3593                 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3594         }
3595
3596         blocksize = btrfs_level_size(root, *level);
3597         root_owner = btrfs_header_owner(parent);
3598         root_gen = btrfs_header_generation(parent);
3599
3600         /*
3601          * cleanup and free the reference on the last node
3602          * we processed
3603          */
3604         ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3605                                   parent->start, root_owner, root_gen,
3606                                   *level, 1);
3607         free_extent_buffer(path->nodes[*level]);
3608         path->nodes[*level] = NULL;
3609
3610         *level += 1;
3611         BUG_ON(ret);
3612
3613         cond_resched();
3614         return 0;
3615 }
3616
3617 /*
3618  * helper function for drop_subtree, this function is similar to
3619  * walk_down_tree. The main difference is that it checks reference
3620  * counts while tree blocks are locked.
3621  */
3622 static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3623                                       struct btrfs_root *root,
3624                                       struct btrfs_path *path, int *level)
3625 {
3626         struct extent_buffer *next;
3627         struct extent_buffer *cur;
3628         struct extent_buffer *parent;
3629         u64 bytenr;
3630         u64 ptr_gen;
3631         u32 blocksize;
3632         u32 refs;
3633         int ret;
3634
3635         cur = path->nodes[*level];
3636         ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3637                                       &refs);
3638         BUG_ON(ret);
3639         if (refs > 1)
3640                 goto out;
3641
3642         while (*level >= 0) {
3643                 cur = path->nodes[*level];
3644                 if (*level == 0) {
3645                         ret = btrfs_drop_leaf_ref(trans, root, cur);
3646                         BUG_ON(ret);
3647                         clean_tree_block(trans, root, cur);
3648                         break;
3649                 }
3650                 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3651                         clean_tree_block(trans, root, cur);
3652                         break;
3653                 }
3654
3655                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3656                 blocksize = btrfs_level_size(root, *level - 1);
3657                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3658
3659                 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3660                 btrfs_tree_lock(next);
3661                 btrfs_set_lock_blocking(next);
3662
3663                 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3664                                               &refs);
3665                 BUG_ON(ret);
3666                 if (refs > 1) {
3667                         parent = path->nodes[*level];
3668                         ret = btrfs_free_extent(trans, root, bytenr,
3669                                         blocksize, parent->start,
3670                                         btrfs_header_owner(parent),
3671                                         btrfs_header_generation(parent),
3672                                         *level - 1, 1);
3673                         BUG_ON(ret);
3674                         path->slots[*level]++;
3675                         btrfs_tree_unlock(next);
3676                         free_extent_buffer(next);
3677                         continue;
3678                 }
3679
3680                 *level = btrfs_header_level(next);
3681                 path->nodes[*level] = next;
3682                 path->slots[*level] = 0;
3683                 path->locks[*level] = 1;
3684                 cond_resched();
3685         }
3686 out:
3687         parent = path->nodes[*level + 1];
3688         bytenr = path->nodes[*level]->start;
3689         blocksize = path->nodes[*level]->len;
3690
3691         ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3692                         parent->start, btrfs_header_owner(parent),
3693                         btrfs_header_generation(parent), *level, 1);
3694         BUG_ON(ret);
3695
3696         if (path->locks[*level]) {
3697                 btrfs_tree_unlock(path->nodes[*level]);
3698                 path->locks[*level] = 0;
3699         }
3700         free_extent_buffer(path->nodes[*level]);
3701         path->nodes[*level] = NULL;
3702         *level += 1;
3703         cond_resched();
3704         return 0;
3705 }
3706
3707 /*
3708  * helper for dropping snapshots.  This walks back up the tree in the path
3709  * to find the first node higher up where we haven't yet gone through
3710  * all the slots
3711  */
3712 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3713                                  struct btrfs_root *root,
3714                                  struct btrfs_path *path,
3715                                  int *level, int max_level)
3716 {
3717         u64 root_owner;
3718         u64 root_gen;
3719         struct btrfs_root_item *root_item = &root->root_item;
3720         int i;
3721         int slot;
3722         int ret;
3723
3724         for (i = *level; i < max_level && path->nodes[i]; i++) {
3725                 slot = path->slots[i];
3726                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3727                         struct extent_buffer *node;
3728                         struct btrfs_disk_key disk_key;
3729
3730                         /*
3731                          * there is more work to do in this level.
3732                          * Update the drop_progress marker to reflect
3733                          * the work we've done so far, and then bump
3734                          * the slot number
3735                          */
3736                         node = path->nodes[i];
3737                         path->slots[i]++;
3738                         *level = i;
3739                         WARN_ON(*level == 0);
3740                         btrfs_node_key(node, &disk_key, path->slots[i]);
3741                         memcpy(&root_item->drop_progress,
3742                                &disk_key, sizeof(disk_key));
3743                         root_item->drop_level = i;
3744                         return 0;
3745                 } else {
3746                         struct extent_buffer *parent;
3747
3748                         /*
3749                          * this whole node is done, free our reference
3750                          * on it and go up one level
3751                          */
3752                         if (path->nodes[*level] == root->node)
3753                                 parent = path->nodes[*level];
3754                         else
3755                                 parent = path->nodes[*level + 1];
3756
3757                         root_owner = btrfs_header_owner(parent);
3758                         root_gen = btrfs_header_generation(parent);
3759
3760                         clean_tree_block(trans, root, path->nodes[*level]);
3761                         ret = btrfs_free_extent(trans, root,
3762                                                 path->nodes[*level]->start,
3763                                                 path->nodes[*level]->len,
3764                                                 parent->start, root_owner,
3765                                                 root_gen, *level, 1);
3766                         BUG_ON(ret);
3767                         if (path->locks[*level]) {
3768                                 btrfs_tree_unlock(path->nodes[*level]);
3769                                 path->locks[*level] = 0;
3770                         }
3771                         free_extent_buffer(path->nodes[*level]);
3772                         path->nodes[*level] = NULL;
3773                         *level = i + 1;
3774                 }
3775         }
3776         return 1;
3777 }
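/*
 * Worked example of the resume marker recorded above, illustration
 * only: if slots 0 through 4 of a level 2 node are finished, slot 5 is
 * recorded as the resume point, and a later btrfs_drop_snapshot()
 * re-seeds its search from exactly that key so the finished slots are
 * never dropped twice.
 */
#if 0
        btrfs_node_key(path->nodes[2], &disk_key, 5);
        memcpy(&root_item->drop_progress, &disk_key, sizeof(disk_key));
        root_item->drop_level = 2;
#endif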
3778
3779 /*
3780  * drop the reference count on the tree rooted at 'root'.  This traverses
3781  * the tree freeing any blocks that have a ref count of zero after being
3782  * decremented.
3783  */
3784 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3785                         *root)
3786 {
3787         int ret = 0;
3788         int wret;
3789         int level;
3790         struct btrfs_path *path;
3791         int i;
3792         int orig_level;
3793         int update_count;
3794         struct btrfs_root_item *root_item = &root->root_item;
3795
3796         WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3797         path = btrfs_alloc_path();
3798         BUG_ON(!path);
3799
3800         level = btrfs_header_level(root->node);
3801         orig_level = level;
3802         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3803                 path->nodes[level] = root->node;
3804                 extent_buffer_get(root->node);
3805                 path->slots[level] = 0;
3806         } else {
3807                 struct btrfs_key key;
3808                 struct btrfs_disk_key found_key;
3809                 struct extent_buffer *node;
3810
3811                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3812                 level = root_item->drop_level;
3813                 path->lowest_level = level;
3814                 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3815                 if (wret < 0) {
3816                         ret = wret;
3817                         goto out;
3818                 }
3819                 node = path->nodes[level];
3820                 btrfs_node_key(node, &found_key, path->slots[level]);
3821                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3822                                sizeof(found_key)));
3823                 /*
3824                  * unlock our path, this is safe because only this
3825                  * function is allowed to delete this snapshot
3826                  */
3827                 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3828                         if (path->nodes[i] && path->locks[i]) {
3829                                 path->locks[i] = 0;
3830                                 btrfs_tree_unlock(path->nodes[i]);
3831                         }
3832                 }
3833         }
3834         while (1) {
3835                 unsigned long update;
3836                 wret = walk_down_tree(trans, root, path, &level);
3837                 if (wret > 0)
3838                         break;
3839                 if (wret < 0)
3840                         ret = wret;
3841
3842                 wret = walk_up_tree(trans, root, path, &level,
3843                                     BTRFS_MAX_LEVEL);
3844                 if (wret > 0)
3845                         break;
3846                 if (wret < 0)
3847                         ret = wret;
3848                 if (trans->transaction->in_commit ||
3849                     trans->transaction->delayed_refs.flushing) {
3850                         ret = -EAGAIN;
3851                         break;
3852                 }
3853                 atomic_inc(&root->fs_info->throttle_gen);
3854                 wake_up(&root->fs_info->transaction_throttle);
3855                 for (update_count = 0; update_count < 16; update_count++) {
3856                         update = trans->delayed_ref_updates;
3857                         trans->delayed_ref_updates = 0;
3858                         if (update)
3859                                 btrfs_run_delayed_refs(trans, root, update);
3860                         else
3861                                 break;
3862                 }
3863         }
3864         for (i = 0; i <= orig_level; i++) {
3865                 if (path->nodes[i]) {
3866                         free_extent_buffer(path->nodes[i]);
3867                         path->nodes[i] = NULL;
3868                 }
3869         }
3870 out:
3871         btrfs_free_path(path);
3872         return ret;
3873 }
3874
3875 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3876                         struct btrfs_root *root,
3877                         struct extent_buffer *node,
3878                         struct extent_buffer *parent)
3879 {
3880         struct btrfs_path *path;
3881         int level;
3882         int parent_level;
3883         int ret = 0;
3884         int wret;
3885
3886         path = btrfs_alloc_path();
3887         BUG_ON(!path);
3888
3889         btrfs_assert_tree_locked(parent);
3890         parent_level = btrfs_header_level(parent);
3891         extent_buffer_get(parent);
3892         path->nodes[parent_level] = parent;
3893         path->slots[parent_level] = btrfs_header_nritems(parent);
3894
3895         btrfs_assert_tree_locked(node);
3896         level = btrfs_header_level(node);
3897         extent_buffer_get(node);
3898         path->nodes[level] = node;
3899         path->slots[level] = 0;
3900
3901         while (1) {
3902                 wret = walk_down_subtree(trans, root, path, &level);
3903                 if (wret < 0)
3904                         ret = wret;
3905                 if (wret != 0)
3906                         break;
3907
3908                 wret = walk_up_tree(trans, root, path, &level, parent_level);
3909                 if (wret < 0)
3910                         ret = wret;
3911                 if (wret != 0)
3912                         break;
3913         }
3914
3915         btrfs_free_path(path);
3916         return ret;
3917 }
3918
3919 static unsigned long calc_ra(unsigned long start, unsigned long last,
3920                              unsigned long nr)
3921 {
3922         return min(last, start + nr - 1);
3923 }
3924
3925 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3926                                          u64 len)
3927 {
3928         u64 page_start;
3929         u64 page_end;
3930         unsigned long first_index;
3931         unsigned long last_index;
3932         unsigned long i;
3933         struct page *page;
3934         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3935         struct file_ra_state *ra;
3936         struct btrfs_ordered_extent *ordered;
3937         unsigned int total_read = 0;
3938         unsigned int total_dirty = 0;
3939         int ret = 0;
3940
3941         ra = kzalloc(sizeof(*ra), GFP_NOFS);
             if (!ra)
                     return -ENOMEM;
3942
3943         mutex_lock(&inode->i_mutex);
3944         first_index = start >> PAGE_CACHE_SHIFT;
3945         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3946
3947         /* make sure the dirty trick played by the caller works */
3948         ret = invalidate_inode_pages2_range(inode->i_mapping,
3949                                             first_index, last_index);
3950         if (ret)
3951                 goto out_unlock;
3952
3953         file_ra_state_init(ra, inode->i_mapping);
3954
3955         for (i = first_index; i <= last_index; i++) {
3956                 if (total_read % ra->ra_pages == 0) {
3957                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3958                                        calc_ra(i, last_index, ra->ra_pages));
3959                 }
3960                 total_read++;
3961 again:
3962                 BUG_ON(((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode));
3964                 page = grab_cache_page(inode->i_mapping, i);
3965                 if (!page) {
3966                         ret = -ENOMEM;
3967                         goto out_unlock;
3968                 }
3969                 if (!PageUptodate(page)) {
3970                         btrfs_readpage(NULL, page);
3971                         lock_page(page);
3972                         if (!PageUptodate(page)) {
3973                                 unlock_page(page);
3974                                 page_cache_release(page);
3975                                 ret = -EIO;
3976                                 goto out_unlock;
3977                         }
3978                 }
3979                 wait_on_page_writeback(page);
3980
3981                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3982                 page_end = page_start + PAGE_CACHE_SIZE - 1;
3983                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3984
3985                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3986                 if (ordered) {
3987                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3988                         unlock_page(page);
3989                         page_cache_release(page);
3990                         btrfs_start_ordered_extent(inode, ordered, 1);
3991                         btrfs_put_ordered_extent(ordered);
3992                         goto again;
3993                 }
3994                 set_page_extent_mapped(page);
3995
3996                 if (i == first_index)
3997                         set_extent_bits(io_tree, page_start, page_end,
3998                                         EXTENT_BOUNDARY, GFP_NOFS);
3999                 btrfs_set_extent_delalloc(inode, page_start, page_end);
4000
4001                 set_page_dirty(page);
4002                 total_dirty++;
4003
4004                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4005                 unlock_page(page);
4006                 page_cache_release(page);
4007         }
4008
4009 out_unlock:
4010         kfree(ra);
4011         mutex_unlock(&inode->i_mutex);
4012         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4013         return ret;
4014 }
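/*
 * Worked example of the page-index math above, illustration only:
 * with 4K pages (PAGE_CACHE_SHIFT == 12), start == 5K and len == 10K
 * cover bytes [5120, 15360), which is pages 1 through 3 inclusive.
 */
#if 0
        first_index = 5120 >> 12;               /* == 1 */
        last_index = (5120 + 10240 - 1) >> 12;  /* 15359 >> 12 == 3 */
#endif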
4015
4016 static noinline int relocate_data_extent(struct inode *reloc_inode,
4017                                          struct btrfs_key *extent_key,
4018                                          u64 offset)
4019 {
4020         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4021         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4022         struct extent_map *em;
4023         u64 start = extent_key->objectid - offset;
4024         u64 end = start + extent_key->offset - 1;
4025
4026         em = alloc_extent_map(GFP_NOFS);
4027         BUG_ON(!em || IS_ERR(em));
4028
4029         em->start = start;
4030         em->len = extent_key->offset;
4031         em->block_len = extent_key->offset;
4032         em->block_start = extent_key->objectid;
4033         em->bdev = root->fs_info->fs_devices->latest_bdev;
4034         set_bit(EXTENT_FLAG_PINNED, &em->flags);
4035
4036         /* set up extent map to cheat btrfs_readpage */
4037         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4038         while (1) {
4039                 int ret;
4040                 spin_lock(&em_tree->lock);
4041                 ret = add_extent_mapping(em_tree, em);
4042                 spin_unlock(&em_tree->lock);
4043                 if (ret != -EEXIST) {
4044                         free_extent_map(em);
4045                         break;
4046                 }
4047                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4048         }
4049         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4050
4051         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4052 }
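/*
 * Worked example of the mapping set up above, illustration only, with
 * invented numbers: extent_key->objectid == 1M (disk bytenr), offset
 * == 256K (delta between disk bytenr and file position) and
 * extent_key->offset == 128K (length) give an extent map over file
 * range [768K, 896K) pinned to disk bytes starting at 1M.
 */
#if 0
        start = 1048576 - 262144;       /* 786432, i.e. 768K */
        end = start + 131072 - 1;       /* 917503, last byte before 896K */
#endif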
4053
4054 struct btrfs_ref_path {
4055         u64 extent_start;
4056         u64 nodes[BTRFS_MAX_LEVEL];
4057         u64 root_objectid;
4058         u64 root_generation;
4059         u64 owner_objectid;
4060         u32 num_refs;
4061         int lowest_level;
4062         int current_level;
4063         int shared_level;
4064
4065         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4066         u64 new_nodes[BTRFS_MAX_LEVEL];
4067 };
4068
4069 struct disk_extent {
4070         u64 ram_bytes;
4071         u64 disk_bytenr;
4072         u64 disk_num_bytes;
4073         u64 offset;
4074         u64 num_bytes;
4075         u8 compression;
4076         u8 encryption;
4077         u16 other_encoding;
4078 };
4079
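     /*
      * these roots are never snapshotted, so their tree blocks are not
      * reference counted and relocation treats them specially.
      */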
4080 static int is_cowonly_root(u64 root_objectid)
4081 {
4082         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4083             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4084             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4085             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4086             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4087             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4088                 return 1;
4089         return 0;
4090 }
4091
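     /*
      * find the next path of back references for the extent at
      * ref_path->extent_start.  first_time (re)initializes the walk.
      * Returns 0 when a path up to a tree root was found, 1 once all
      * paths have been visited, and a negative errno on error.
      */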
4092 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4093                                     struct btrfs_root *extent_root,
4094                                     struct btrfs_ref_path *ref_path,
4095                                     int first_time)
4096 {
4097         struct extent_buffer *leaf;
4098         struct btrfs_path *path;
4099         struct btrfs_extent_ref *ref;
4100         struct btrfs_key key;
4101         struct btrfs_key found_key;
4102         u64 bytenr;
4103         u32 nritems;
4104         int level;
4105         int ret = 1;
4106
4107         path = btrfs_alloc_path();
4108         if (!path)
4109                 return -ENOMEM;
4110
4111         if (first_time) {
4112                 ref_path->lowest_level = -1;
4113                 ref_path->current_level = -1;
4114                 ref_path->shared_level = -1;
4115                 goto walk_up;
4116         }
4117 walk_down:
4118         level = ref_path->current_level - 1;
4119         while (level >= -1) {
4120                 u64 parent;
4121                 if (level < ref_path->lowest_level)
4122                         break;
4123
4124                 if (level >= 0)
4125                         bytenr = ref_path->nodes[level];
4126                 else
4127                         bytenr = ref_path->extent_start;
4128                 BUG_ON(bytenr == 0);
4129
4130                 parent = ref_path->nodes[level + 1];
4131                 ref_path->nodes[level + 1] = 0;
4132                 ref_path->current_level = level;
4133                 BUG_ON(parent == 0);
4134
4135                 key.objectid = bytenr;
4136                 key.offset = parent + 1;
4137                 key.type = BTRFS_EXTENT_REF_KEY;
4138
4139                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4140                 if (ret < 0)
4141                         goto out;
4142                 BUG_ON(ret == 0);
4143
4144                 leaf = path->nodes[0];
4145                 nritems = btrfs_header_nritems(leaf);
4146                 if (path->slots[0] >= nritems) {
4147                         ret = btrfs_next_leaf(extent_root, path);
4148                         if (ret < 0)
4149                                 goto out;
4150                         if (ret > 0)
4151                                 goto next;
4152                         leaf = path->nodes[0];
4153                 }
4154
4155                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4156                 if (found_key.objectid == bytenr &&
4157                     found_key.type == BTRFS_EXTENT_REF_KEY) {
4158                         if (level < ref_path->shared_level)
4159                                 ref_path->shared_level = level;
4160                         goto found;
4161                 }
4162 next:
4163                 level--;
4164                 btrfs_release_path(extent_root, path);
4165                 cond_resched();
4166         }
4167         /* reached lowest level */
4168         ret = 1;
4169         goto out;
4170 walk_up:
4171         level = ref_path->current_level;
4172         while (level < BTRFS_MAX_LEVEL - 1) {
4173                 u64 ref_objectid;
4174
4175                 if (level >= 0)
4176                         bytenr = ref_path->nodes[level];
4177                 else
4178                         bytenr = ref_path->extent_start;
4179
4180                 BUG_ON(bytenr == 0);
4181
4182                 key.objectid = bytenr;
4183                 key.offset = 0;
4184                 key.type = BTRFS_EXTENT_REF_KEY;
4185
4186                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4187                 if (ret < 0)
4188                         goto out;
4189
4190                 leaf = path->nodes[0];
4191                 nritems = btrfs_header_nritems(leaf);
4192                 if (path->slots[0] >= nritems) {
4193                         ret = btrfs_next_leaf(extent_root, path);
4194                         if (ret < 0)
4195                                 goto out;
4196                         if (ret > 0) {
4197                                 /* the extent was freed by someone */
4198                                 if (ref_path->lowest_level == level)
4199                                         goto out;
4200                                 btrfs_release_path(extent_root, path);
4201                                 goto walk_down;
4202                         }
4203                         leaf = path->nodes[0];
4204                 }
4205
4206                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4207                 if (found_key.objectid != bytenr ||
4208                                 found_key.type != BTRFS_EXTENT_REF_KEY) {
4209                         /* the extent was freed by someone */
4210                         if (ref_path->lowest_level == level) {
4211                                 ret = 1;
4212                                 goto out;
4213                         }
4214                         btrfs_release_path(extent_root, path);
4215                         goto walk_down;
4216                 }
4217 found:
4218                 ref = btrfs_item_ptr(leaf, path->slots[0],
4219                                 struct btrfs_extent_ref);
4220                 ref_objectid = btrfs_ref_objectid(leaf, ref);
4221                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4222                         if (first_time) {
4223                                 level = (int)ref_objectid;
4224                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
4225                                 ref_path->lowest_level = level;
4226                                 ref_path->current_level = level;
4227                                 ref_path->nodes[level] = bytenr;
4228                         } else {
4229                                 WARN_ON(ref_objectid != level);
4230                         }
4231                 } else {
4232                         WARN_ON(level != -1);
4233                 }
4234                 first_time = 0;
4235
4236                 if (ref_path->lowest_level == level) {
4237                         ref_path->owner_objectid = ref_objectid;
4238                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4239                 }
4240
4241                 /*
4242                  * the block is a tree root, or the block isn't in a
4243                  * reference counted tree.
4244                  */
4245                 if (found_key.objectid == found_key.offset ||
4246                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4247                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4248                         ref_path->root_generation =
4249                                 btrfs_ref_generation(leaf, ref);
4250                         if (level < 0) {
4251                                 /* special reference from the tree log */
4252                                 ref_path->nodes[0] = found_key.offset;
4253                                 ref_path->current_level = 0;
4254                         }
4255                         ret = 0;
4256                         goto out;
4257                 }
4258
4259                 level++;
4260                 BUG_ON(ref_path->nodes[level] != 0);
4261                 ref_path->nodes[level] = found_key.offset;
4262                 ref_path->current_level = level;
4263
4264                 /*
4265                  * the reference was created in the running transaction,
4266                  * no need to continue walking up.
4267                  */
4268                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4269                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4270                         ref_path->root_generation =
4271                                 btrfs_ref_generation(leaf, ref);
4272                         ret = 0;
4273                         goto out;
4274                 }
4275
4276                 btrfs_release_path(extent_root, path);
4277                 cond_resched();
4278         }
4279         /* reached max tree level, but no tree root found. */
4280         BUG();
4281 out:
4282         btrfs_free_path(path);
4283         return ret;
4284 }
4285
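     /*
      * btrfs_first_ref_path/btrfs_next_ref_path iterate over all the
      * reference paths of a single extent.  A sketch of the intended
      * usage, mirroring the loop in relocate_one_extent() below:
      *
      *     ret = btrfs_first_ref_path(trans, extent_root, ref_path,
      *                                extent_bytenr);
      *     while (!ret) {
      *             (process one reference path)
      *             ret = btrfs_next_ref_path(trans, extent_root, ref_path);
      *     }
      *
      * ret > 0 means all paths have been visited; ret < 0 is an error.
      */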
4286 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4287                                 struct btrfs_root *extent_root,
4288                                 struct btrfs_ref_path *ref_path,
4289                                 u64 extent_start)
4290 {
4291         memset(ref_path, 0, sizeof(*ref_path));
4292         ref_path->extent_start = extent_start;
4293
4294         return __next_ref_path(trans, extent_root, ref_path, 1);
4295 }
4296
4297 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4298                                struct btrfs_root *extent_root,
4299                                struct btrfs_ref_path *ref_path)
4300 {
4301         return __next_ref_path(trans, extent_root, ref_path, 0);
4302 }
4303
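     /*
      * look up the new locations of a relocated extent: collect the file
      * extents in the reloc inode that cover extent_key's range and
      * return them through *extents/*nr_extents.  With no_fragment set,
      * return 1 as soon as more than one new extent would be required.
      */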
4304 static noinline int get_new_locations(struct inode *reloc_inode,
4305                                       struct btrfs_key *extent_key,
4306                                       u64 offset, int no_fragment,
4307                                       struct disk_extent **extents,
4308                                       int *nr_extents)
4309 {
4310         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4311         struct btrfs_path *path;
4312         struct btrfs_file_extent_item *fi;
4313         struct extent_buffer *leaf;
4314         struct disk_extent *exts = *extents;
4315         struct btrfs_key found_key;
4316         u64 cur_pos;
4317         u64 last_byte;
4318         u32 nritems;
4319         int nr = 0;
4320         int max = *nr_extents;
4321         int ret;
4322
4323         WARN_ON(!no_fragment && *extents);
4324         if (!exts) {
4325                 max = 1;
4326                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4327                 if (!exts)
4328                         return -ENOMEM;
4329         }
4330
4331         path = btrfs_alloc_path();
4332         BUG_ON(!path);
4333
4334         cur_pos = extent_key->objectid - offset;
4335         last_byte = extent_key->objectid + extent_key->offset;
4336         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4337                                        cur_pos, 0);
4338         if (ret < 0)
4339                 goto out;
4340         if (ret > 0) {
4341                 ret = -ENOENT;
4342                 goto out;
4343         }
4344
4345         while (1) {
4346                 leaf = path->nodes[0];
4347                 nritems = btrfs_header_nritems(leaf);
4348                 if (path->slots[0] >= nritems) {
4349                         ret = btrfs_next_leaf(root, path);
4350                         if (ret < 0)
4351                                 goto out;
4352                         if (ret > 0)
4353                                 break;
4354                         leaf = path->nodes[0];
4355                 }
4356
4357                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4358                 if (found_key.offset != cur_pos ||
4359                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
4360                     found_key.objectid != reloc_inode->i_ino)
4361                         break;
4362
4363                 fi = btrfs_item_ptr(leaf, path->slots[0],
4364                                     struct btrfs_file_extent_item);
4365                 if (btrfs_file_extent_type(leaf, fi) !=
4366                     BTRFS_FILE_EXTENT_REG ||
4367                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4368                         break;
4369
4370                 if (nr == max) {
4371                         struct disk_extent *old = exts;
4372                         max *= 2;
4373                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
                             if (!exts) {
                                     /* keep the caller's buffer on failure */
                                     if (old != *extents)
                                             kfree(old);
                                     ret = -ENOMEM;
                                     goto out;
                             }
4374                         memcpy(exts, old, sizeof(*exts) * nr);
4375                         if (old != *extents)
4376                                 kfree(old);
4377                 }
4378
4379                 exts[nr].disk_bytenr =
4380                         btrfs_file_extent_disk_bytenr(leaf, fi);
4381                 exts[nr].disk_num_bytes =
4382                         btrfs_file_extent_disk_num_bytes(leaf, fi);
4383                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4384                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4385                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4386                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4387                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4388                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4389                                                                            fi);
4390                 BUG_ON(exts[nr].offset > 0);
4391                 BUG_ON(exts[nr].compression || exts[nr].encryption);
4392                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4393
4394                 cur_pos += exts[nr].num_bytes;
4395                 nr++;
4396
4397                 if (cur_pos + offset >= last_byte)
4398                         break;
4399
4400                 if (no_fragment) {
4401                         ret = 1;
4402                         goto out;
4403                 }
4404                 path->slots[0]++;
4405         }
4406
4407         BUG_ON(cur_pos + offset > last_byte);
4408         if (cur_pos + offset < last_byte) {
4409                 ret = -ENOENT;
4410                 goto out;
4411         }
4412         ret = 0;
4413 out:
4414         btrfs_free_path(path);
4415         if (ret) {
4416                 if (exts != *extents)
4417                         kfree(exts);
4418         } else {
4419                 *extents = exts;
4420                 *nr_extents = nr;
4421         }
4422         return ret;
4423 }
4424
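     /*
      * walk the file extent items in 'root' that point at the extent
      * being relocated and update them to the new location.  Only the
      * nr_extents == 1 case (updating the extent pointer in place) is
      * implemented; the multi-extent path below is compiled out.
      */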
4425 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4426                                         struct btrfs_root *root,
4427                                         struct btrfs_path *path,
4428                                         struct btrfs_key *extent_key,
4429                                         struct btrfs_key *leaf_key,
4430                                         struct btrfs_ref_path *ref_path,
4431                                         struct disk_extent *new_extents,
4432                                         int nr_extents)
4433 {
4434         struct extent_buffer *leaf;
4435         struct btrfs_file_extent_item *fi;
4436         struct inode *inode = NULL;
4437         struct btrfs_key key;
4438         u64 lock_start = 0;
4439         u64 lock_end = 0;
4440         u64 num_bytes;
4441         u64 ext_offset;
4442         u64 search_end = (u64)-1;
4443         u32 nritems;
4444                 int nr_scanned = 0;
4445         int extent_locked = 0;
4446         int extent_type;
4447         int ret;
4448
4449         memcpy(&key, leaf_key, sizeof(key));
4450         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4451                 if (key.objectid < ref_path->owner_objectid ||
4452                     (key.objectid == ref_path->owner_objectid &&
4453                      key.type < BTRFS_EXTENT_DATA_KEY)) {
4454                         key.objectid = ref_path->owner_objectid;
4455                         key.type = BTRFS_EXTENT_DATA_KEY;
4456                         key.offset = 0;
4457                 }
4458         }
4459
4460         while (1) {
4461                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4462                 if (ret < 0)
4463                         goto out;
4464
4465                 leaf = path->nodes[0];
4466                 nritems = btrfs_header_nritems(leaf);
4467 next:
4468                 if (extent_locked && ret > 0) {
4469                         /*
4470                          * the file extent item was modified by someone
4471                          * before the extent got locked.
4472                          */
4473                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4474                                       lock_end, GFP_NOFS);
4475                         extent_locked = 0;
4476                 }
4477
4478                 if (path->slots[0] >= nritems) {
4479                         if (++nr_scanned > 2)
4480                                 break;
4481
4482                         BUG_ON(extent_locked);
4483                         ret = btrfs_next_leaf(root, path);
4484                         if (ret < 0)
4485                                 goto out;
4486                         if (ret > 0)
4487                                 break;
4488                         leaf = path->nodes[0];
4489                         nritems = btrfs_header_nritems(leaf);
4490                 }
4491
4492                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4493
4494                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4495                         if ((key.objectid > ref_path->owner_objectid) ||
4496                             (key.objectid == ref_path->owner_objectid &&
4497                              key.type > BTRFS_EXTENT_DATA_KEY) ||
4498                             key.offset >= search_end)
4499                                 break;
4500                 }
4501
4502                 if (inode && key.objectid != inode->i_ino) {
4503                         BUG_ON(extent_locked);
4504                         btrfs_release_path(root, path);
4505                         mutex_unlock(&inode->i_mutex);
4506                         iput(inode);
4507                         inode = NULL;
4508                         continue;
4509                 }
4510
4511                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4512                         path->slots[0]++;
4513                         ret = 1;
4514                         goto next;
4515                 }
4516                 fi = btrfs_item_ptr(leaf, path->slots[0],
4517                                     struct btrfs_file_extent_item);
4518                 extent_type = btrfs_file_extent_type(leaf, fi);
4519                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4520                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4521                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4522                      extent_key->objectid)) {
4523                         path->slots[0]++;
4524                         ret = 1;
4525                         goto next;
4526                 }
4527
4528                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4529                 ext_offset = btrfs_file_extent_offset(leaf, fi);
4530
4531                 if (search_end == (u64)-1) {
4532                         search_end = key.offset - ext_offset +
4533                                 btrfs_file_extent_ram_bytes(leaf, fi);
4534                 }
4535
4536                 if (!extent_locked) {
4537                         lock_start = key.offset;
4538                         lock_end = lock_start + num_bytes - 1;
4539                 } else {
4540                         if (lock_start > key.offset ||
4541                             lock_end + 1 < key.offset + num_bytes) {
4542                                 unlock_extent(&BTRFS_I(inode)->io_tree,
4543                                               lock_start, lock_end, GFP_NOFS);
4544                                 extent_locked = 0;
4545                         }
4546                 }
4547
4548                 if (!inode) {
4549                         btrfs_release_path(root, path);
4550
4551                         inode = btrfs_iget_locked(root->fs_info->sb,
4552                                                   key.objectid, root);
                             /* btrfs_iget_locked may fail under memory pressure */
                             if (!inode) {
                                     key.offset = (u64)-1;
                                     goto skip;
                             }
4553                         if (inode->i_state & I_NEW) {
4554                                 BTRFS_I(inode)->root = root;
4555                                 BTRFS_I(inode)->location.objectid =
4556                                         key.objectid;
4557                                 BTRFS_I(inode)->location.type =
4558                                         BTRFS_INODE_ITEM_KEY;
4559                                 BTRFS_I(inode)->location.offset = 0;
4560                                 btrfs_read_locked_inode(inode);
4561                                 unlock_new_inode(inode);
4562                         }
4563                         /*
4564                          * some callers invoke btrfs_commit_transaction
4565                          * while holding the i_mutex, so we can't block
4566                          * on mutex_lock here.
4567                          */
4568                         if (is_bad_inode(inode) ||
4569                             !mutex_trylock(&inode->i_mutex)) {
4570                                 iput(inode);
4571                                 inode = NULL;
4572                                 key.offset = (u64)-1;
4573                                 goto skip;
4574                         }
4575                 }
4576
4577                 if (!extent_locked) {
4578                         struct btrfs_ordered_extent *ordered;
4579
4580                         btrfs_release_path(root, path);
4581
4582                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4583                                     lock_end, GFP_NOFS);
4584                         ordered = btrfs_lookup_first_ordered_extent(inode,
4585                                                                     lock_end);
4586                         if (ordered &&
4587                             ordered->file_offset <= lock_end &&
4588                             ordered->file_offset + ordered->len > lock_start) {
4589                                 unlock_extent(&BTRFS_I(inode)->io_tree,
4590                                               lock_start, lock_end, GFP_NOFS);
4591                                 btrfs_start_ordered_extent(inode, ordered, 1);
4592                                 btrfs_put_ordered_extent(ordered);
4593                                 key.offset += num_bytes;
4594                                 goto skip;
4595                         }
4596                         if (ordered)
4597                                 btrfs_put_ordered_extent(ordered);
4598
4599                         extent_locked = 1;
4600                         continue;
4601                 }
4602
4603                 if (nr_extents == 1) {
4604                         /* update extent pointer in place */
4605                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
4606                                                 new_extents[0].disk_bytenr);
4607                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4608                                                 new_extents[0].disk_num_bytes);
4609                         btrfs_mark_buffer_dirty(leaf);
4610
4611                         btrfs_drop_extent_cache(inode, key.offset,
4612                                                 key.offset + num_bytes - 1, 0);
4613
4614                         ret = btrfs_inc_extent_ref(trans, root,
4615                                                 new_extents[0].disk_bytenr,
4616                                                 new_extents[0].disk_num_bytes,
4617                                                 leaf->start,
4618                                                 root->root_key.objectid,
4619                                                 trans->transid,
4620                                                 key.objectid);
4621                         BUG_ON(ret);
4622
4623                         ret = btrfs_free_extent(trans, root,
4624                                                 extent_key->objectid,
4625                                                 extent_key->offset,
4626                                                 leaf->start,
4627                                                 btrfs_header_owner(leaf),
4628                                                 btrfs_header_generation(leaf),
4629                                                 key.objectid, 0);
4630                         BUG_ON(ret);
4631
4632                         btrfs_release_path(root, path);
4633                         key.offset += num_bytes;
4634                 } else {
4635                         BUG_ON(1);
4636 #if 0
4637                         u64 alloc_hint;
4638                         u64 extent_len;
4639                         int i;
4640                         /*
4641                          * drop the old extent pointer first, then insert
4642                          * the new pointers one by one
4643                          */
4644                         btrfs_release_path(root, path);
4645                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
4646                                                  key.offset + num_bytes,
4647                                                  key.offset, &alloc_hint);
4648                         BUG_ON(ret);
4649
4650                         for (i = 0; i < nr_extents; i++) {
4651                                 if (ext_offset >= new_extents[i].num_bytes) {
4652                                         ext_offset -= new_extents[i].num_bytes;
4653                                         continue;
4654                                 }
4655                                 extent_len = min(new_extents[i].num_bytes -
4656                                                  ext_offset, num_bytes);
4657
4658                                 ret = btrfs_insert_empty_item(trans, root,
4659                                                               path, &key,
4660                                                               sizeof(*fi));
4661                                 BUG_ON(ret);
4662
4663                                 leaf = path->nodes[0];
4664                                 fi = btrfs_item_ptr(leaf, path->slots[0],
4665                                                 struct btrfs_file_extent_item);
4666                                 btrfs_set_file_extent_generation(leaf, fi,
4667                                                         trans->transid);
4668                                 btrfs_set_file_extent_type(leaf, fi,
4669                                                         BTRFS_FILE_EXTENT_REG);
4670                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4671                                                 new_extents[i].disk_bytenr);
4672                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4673                                                 new_extents[i].disk_num_bytes);
4674                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
4675                                                 new_extents[i].ram_bytes);
4676
4677                                 btrfs_set_file_extent_compression(leaf, fi,
4678                                                 new_extents[i].compression);
4679                                 btrfs_set_file_extent_encryption(leaf, fi,
4680                                                 new_extents[i].encryption);
4681                                 btrfs_set_file_extent_other_encoding(leaf, fi,
4682                                                 new_extents[i].other_encoding);
4683
4684                                 btrfs_set_file_extent_num_bytes(leaf, fi,
4685                                                         extent_len);
4686                                 ext_offset += new_extents[i].offset;
4687                                 btrfs_set_file_extent_offset(leaf, fi,
4688                                                         ext_offset);
4689                                 btrfs_mark_buffer_dirty(leaf);
4690
4691                                 btrfs_drop_extent_cache(inode, key.offset,
4692                                                 key.offset + extent_len - 1, 0);
4693
4694                                 ret = btrfs_inc_extent_ref(trans, root,
4695                                                 new_extents[i].disk_bytenr,
4696                                                 new_extents[i].disk_num_bytes,
4697                                                 leaf->start,
4698                                                 root->root_key.objectid,
4699                                                 trans->transid, key.objectid);
4700                                 BUG_ON(ret);
4701                                 btrfs_release_path(root, path);
4702
4703                                 inode_add_bytes(inode, extent_len);
4704
4705                                 ext_offset = 0;
4706                                 num_bytes -= extent_len;
4707                                 key.offset += extent_len;
4708
4709                                 if (num_bytes == 0)
4710                                         break;
4711                         }
4712                         BUG_ON(i >= nr_extents);
4713 #endif
4714                 }
4715
4716                 if (extent_locked) {
4717                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4718                                       lock_end, GFP_NOFS);
4719                         extent_locked = 0;
4720                 }
4721 skip:
4722                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4723                     key.offset >= search_end)
4724                         break;
4725
4726                 cond_resched();
4727         }
4728         ret = 0;
4729 out:
4730         btrfs_release_path(root, path);
4731         if (inode) {
4732                 mutex_unlock(&inode->i_mutex);
4733                 if (extent_locked) {
4734                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4735                                       lock_end, GFP_NOFS);
4736                 }
4737                 iput(inode);
4738         }
4739         return ret;
4740 }
4741
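     /*
      * when a leaf of the reloc tree is COWed, copy the cached leaf ref
      * of the original leaf (at orig_start) over to the new leaf so the
      * leaf ref cache stays valid.
      */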
4742 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4743                                struct btrfs_root *root,
4744                                struct extent_buffer *buf, u64 orig_start)
4745 {
4746         int level;
4747         int ret;
4748
4749         BUG_ON(btrfs_header_generation(buf) != trans->transid);
4750         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4751
4752         level = btrfs_header_level(buf);
4753         if (level == 0) {
4754                 struct btrfs_leaf_ref *ref;
4755                 struct btrfs_leaf_ref *orig_ref;
4756
4757                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4758                 if (!orig_ref)
4759                         return -ENOENT;
4760
4761                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4762                 if (!ref) {
4763                         btrfs_free_leaf_ref(root, orig_ref);
4764                         return -ENOMEM;
4765                 }
4766
4767                 ref->nritems = orig_ref->nritems;
4768                 memcpy(ref->extents, orig_ref->extents,
4769                         sizeof(ref->extents[0]) * ref->nritems);
4770
4771                 btrfs_free_leaf_ref(root, orig_ref);
4772
4773                 ref->root_gen = trans->transid;
4774                 ref->bytenr = buf->start;
4775                 ref->owner = btrfs_header_owner(buf);
4776                 ref->generation = btrfs_header_generation(buf);
4777
4778                 ret = btrfs_add_leaf_ref(root, ref, 0);
4779                 WARN_ON(ret);
4780                 btrfs_free_leaf_ref(root, ref);
4781         }
4782         return 0;
4783 }
4784
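     /*
      * drop the cached extent mappings in target_root's inodes for every
      * file extent referenced by this leaf, forcing later lookups to
      * read the relocated locations from disk.
      */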
4785 static noinline int invalidate_extent_cache(struct btrfs_root *root,
4786                                         struct extent_buffer *leaf,
4787                                         struct btrfs_block_group_cache *group,
4788                                         struct btrfs_root *target_root)
4789 {
4790         struct btrfs_key key;
4791         struct inode *inode = NULL;
4792         struct btrfs_file_extent_item *fi;
4793         u64 num_bytes;
4794         u64 skip_objectid = 0;
4795         u32 nritems;
4796         u32 i;
4797
4798         nritems = btrfs_header_nritems(leaf);
4799         for (i = 0; i < nritems; i++) {
4800                 btrfs_item_key_to_cpu(leaf, &key, i);
4801                 if (key.objectid == skip_objectid ||
4802                     key.type != BTRFS_EXTENT_DATA_KEY)
4803                         continue;
4804                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4805                 if (btrfs_file_extent_type(leaf, fi) ==
4806                     BTRFS_FILE_EXTENT_INLINE)
4807                         continue;
4808                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4809                         continue;
4810                 if (!inode || inode->i_ino != key.objectid) {
4811                         iput(inode);
4812                         inode = btrfs_ilookup(target_root->fs_info->sb,
4813                                               key.objectid, target_root, 1);
4814                 }
4815                 if (!inode) {
4816                         skip_objectid = key.objectid;
4817                         continue;
4818                 }
4819                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4820
4821                 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4822                             key.offset + num_bytes - 1, GFP_NOFS);
4823                 btrfs_drop_extent_cache(inode, key.offset,
4824                                         key.offset + num_bytes - 1, 1);
4825                 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4826                               key.offset + num_bytes - 1, GFP_NOFS);
4827                 cond_resched();
4828         }
4829         iput(inode);
4830         return 0;
4831 }
4832
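     /*
      * for every file extent in the leaf that falls inside the block
      * group being relocated, look up its new location through the reloc
      * inode and rewrite the extent pointer, keeping the cached leaf ref
      * and the extent reference counts in sync.
      */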
4833 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4834                                         struct btrfs_root *root,
4835                                         struct extent_buffer *leaf,
4836                                         struct btrfs_block_group_cache *group,
4837                                         struct inode *reloc_inode)
4838 {
4839         struct btrfs_key key;
4840         struct btrfs_key extent_key;
4841         struct btrfs_file_extent_item *fi;
4842         struct btrfs_leaf_ref *ref;
4843         struct disk_extent *new_extent;
4844         u64 bytenr;
4845         u64 num_bytes;
4846         u32 nritems;
4847         u32 i;
4848         int ext_index;
4849         int nr_extent;
4850         int ret;
4851
4852         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4853         BUG_ON(!new_extent);
4854
4855         ref = btrfs_lookup_leaf_ref(root, leaf->start);
4856         BUG_ON(!ref);
4857
4858         ext_index = -1;
4859         nritems = btrfs_header_nritems(leaf);
4860         for (i = 0; i < nritems; i++) {
4861                 btrfs_item_key_to_cpu(leaf, &key, i);
4862                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4863                         continue;
4864                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4865                 if (btrfs_file_extent_type(leaf, fi) ==
4866                     BTRFS_FILE_EXTENT_INLINE)
4867                         continue;
4868                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4869                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4870                 if (bytenr == 0)
4871                         continue;
4872
4873                 ext_index++;
4874                 if (bytenr >= group->key.objectid + group->key.offset ||
4875                     bytenr + num_bytes <= group->key.objectid)
4876                         continue;
4877
4878                 extent_key.objectid = bytenr;
4879                 extent_key.offset = num_bytes;
4880                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4881                 nr_extent = 1;
4882                 ret = get_new_locations(reloc_inode, &extent_key,
4883                                         group->key.objectid, 1,
4884                                         &new_extent, &nr_extent);
4885                 if (ret > 0)
4886                         continue;
4887                 BUG_ON(ret < 0);
4888
4889                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4890                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4891                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4892                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4893
4894                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4895                                                 new_extent->disk_bytenr);
4896                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4897                                                 new_extent->disk_num_bytes);
4898                 btrfs_mark_buffer_dirty(leaf);
4899
4900                 ret = btrfs_inc_extent_ref(trans, root,
4901                                         new_extent->disk_bytenr,
4902                                         new_extent->disk_num_bytes,
4903                                         leaf->start,
4904                                         root->root_key.objectid,
4905                                         trans->transid, key.objectid);
4906                 BUG_ON(ret);
4907
4908                 ret = btrfs_free_extent(trans, root,
4909                                         bytenr, num_bytes, leaf->start,
4910                                         btrfs_header_owner(leaf),
4911                                         btrfs_header_generation(leaf),
4912                                         key.objectid, 0);
4913                 BUG_ON(ret);
4914                 cond_resched();
4915         }
4916         kfree(new_extent);
4917         BUG_ON(ext_index + 1 != ref->nritems);
4918         btrfs_free_leaf_ref(root, ref);
4919         return 0;
4920 }
4921
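     /*
      * detach the reloc tree from a subvol root: move it to the dead
      * reloc roots list and write back its final root item so it can be
      * dropped later.
      */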
4922 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4923                           struct btrfs_root *root)
4924 {
4925         struct btrfs_root *reloc_root;
4926         int ret;
4927
4928         if (root->reloc_root) {
4929                 reloc_root = root->reloc_root;
4930                 root->reloc_root = NULL;
4931                 list_add(&reloc_root->dead_list,
4932                          &root->fs_info->dead_reloc_roots);
4933
4934                 btrfs_set_root_bytenr(&reloc_root->root_item,
4935                                       reloc_root->node->start);
4936                 btrfs_set_root_level(&reloc_root->root_item,
4937                                      btrfs_header_level(reloc_root->node));
4938                 memset(&reloc_root->root_item.drop_progress, 0,
4939                         sizeof(struct btrfs_disk_key));
4940                 reloc_root->root_item.drop_level = 0;
4941
4942                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4943                                         &reloc_root->root_key,
4944                                         &reloc_root->root_item);
4945                 BUG_ON(ret);
4946         }
4947         return 0;
4948 }
4949
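     /*
      * drop all reloc roots on the dead list: free their blocks in
      * chunks, restarting the transaction whenever btrfs_drop_snapshot
      * returns -EAGAIN, then delete each root item from the tree root.
      */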
4950 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4951 {
4952         struct btrfs_trans_handle *trans;
4953         struct btrfs_root *reloc_root;
4954         struct btrfs_root *prev_root = NULL;
4955         struct list_head dead_roots;
4956         int ret;
4957         unsigned long nr;
4958
4959         INIT_LIST_HEAD(&dead_roots);
4960         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4961
4962         while (!list_empty(&dead_roots)) {
4963                 reloc_root = list_entry(dead_roots.prev,
4964                                         struct btrfs_root, dead_list);
4965                 list_del_init(&reloc_root->dead_list);
4966
4967                 BUG_ON(reloc_root->commit_root != NULL);
4968                 while (1) {
4969                         trans = btrfs_join_transaction(root, 1);
4970                         BUG_ON(!trans);
4971
4972                         mutex_lock(&root->fs_info->drop_mutex);
4973                         ret = btrfs_drop_snapshot(trans, reloc_root);
4974                         if (ret != -EAGAIN)
4975                                 break;
4976                         mutex_unlock(&root->fs_info->drop_mutex);
4977
4978                         nr = trans->blocks_used;
4979                         ret = btrfs_end_transaction(trans, root);
4980                         BUG_ON(ret);
4981                         btrfs_btree_balance_dirty(root, nr);
4982                 }
4983
4984                 free_extent_buffer(reloc_root->node);
4985
4986                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4987                                      &reloc_root->root_key);
4988                 BUG_ON(ret);
4989                 mutex_unlock(&root->fs_info->drop_mutex);
4990
4991                 nr = trans->blocks_used;
4992                 ret = btrfs_end_transaction(trans, root);
4993                 BUG_ON(ret);
4994                 btrfs_btree_balance_dirty(root, nr);
4995
4996                 kfree(prev_root);
4997                 prev_root = reloc_root;
4998         }
4999         if (prev_root) {
5000                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
5001                 kfree(prev_root);
5002         }
5003         return 0;
5004 }
5005
5006 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5007 {
5008         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5009         return 0;
5010 }
5011
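     /*
      * finish relocation work left over from a previous mount: if dead
      * reloc roots are found, commit a transaction so they get cleaned
      * up, then run orphan cleanup on the data relocation tree.
      */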
5012 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5013 {
5014         struct btrfs_root *reloc_root;
5015         struct btrfs_trans_handle *trans;
5016         struct btrfs_key location;
5017         int found;
5018         int ret;
5019
5020         mutex_lock(&root->fs_info->tree_reloc_mutex);
5021         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5022         BUG_ON(ret);
5023         found = !list_empty(&root->fs_info->dead_reloc_roots);
5024         mutex_unlock(&root->fs_info->tree_reloc_mutex);
5025
5026         if (found) {
5027                 trans = btrfs_start_transaction(root, 1);
5028                 BUG_ON(!trans);
5029                 ret = btrfs_commit_transaction(trans, root);
5030                 BUG_ON(ret);
5031         }
5032
5033         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5034         location.offset = (u64)-1;
5035         location.type = BTRFS_ROOT_ITEM_KEY;
5036
5037         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5038         BUG_ON(!reloc_root);
5039         btrfs_orphan_cleanup(reloc_root);
5040         return 0;
5041 }
5042
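     /*
      * create the reloc tree for a subvol if it doesn't exist yet: copy
      * the subvol's commit root, insert a root item keyed under
      * BTRFS_TREE_RELOC_OBJECTID and hook the result up as
      * root->reloc_root.
      */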
5043 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5044                                     struct btrfs_root *root)
5045 {
5046         struct btrfs_root *reloc_root;
5047         struct extent_buffer *eb;
5048         struct btrfs_root_item *root_item;
5049         struct btrfs_key root_key;
5050         int ret;
5051
5052         BUG_ON(!root->ref_cows);
5053         if (root->reloc_root)
5054                 return 0;
5055
5056         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5057         BUG_ON(!root_item);
5058
5059         ret = btrfs_copy_root(trans, root, root->commit_root,
5060                               &eb, BTRFS_TREE_RELOC_OBJECTID);
5061         BUG_ON(ret);
5062
5063         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5064         root_key.offset = root->root_key.objectid;
5065         root_key.type = BTRFS_ROOT_ITEM_KEY;
5066
5067         memcpy(root_item, &root->root_item, sizeof(*root_item));
5068         btrfs_set_root_refs(root_item, 0);
5069         btrfs_set_root_bytenr(root_item, eb->start);
5070         btrfs_set_root_level(root_item, btrfs_header_level(eb));
5071         btrfs_set_root_generation(root_item, trans->transid);
5072
5073         btrfs_tree_unlock(eb);
5074         free_extent_buffer(eb);
5075
5076         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5077                                 &root_key, root_item);
5078         BUG_ON(ret);
5079         kfree(root_item);
5080
5081         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5082                                                  &root_key);
5083         BUG_ON(!reloc_root);
5084         reloc_root->last_trans = trans->transid;
5085         reloc_root->commit_root = NULL;
5086         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5087
5088         root->reloc_root = reloc_root;
5089         return 0;
5090 }
5091
5092 /*
5093  * Core function of space balance.
5094  *
5095  * The idea is to use reloc trees to relocate tree blocks in reference
5096  * counted roots. There is one reloc tree for each subvol, and all
5097  * reloc trees share the same root key objectid. Reloc trees are
5098  * snapshots of the latest committed roots of subvols (root->commit_root).
5099  *
5100  * To relocate a tree block referenced by a subvol, there are two steps:
5101  * COW the block through the subvol's reloc tree, then update the block
5102  * pointer in the subvol to point to the new block. Since all reloc trees
5103  * share the same root key objectid, special handling for tree blocks
5104  * owned by them is easy. Once a tree block has been COWed in one reloc
5105  * tree, we can use the resulting new block directly when the same block
5106  * must be COWed again through another reloc tree. This way, relocated
5107  * tree blocks are shared between reloc trees, so they are also shared
5108  * between subvols.
5109  */
5110 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5111                                       struct btrfs_root *root,
5112                                       struct btrfs_path *path,
5113                                       struct btrfs_key *first_key,
5114                                       struct btrfs_ref_path *ref_path,
5115                                       struct btrfs_block_group_cache *group,
5116                                       struct inode *reloc_inode)
5117 {
5118         struct btrfs_root *reloc_root;
5119         struct extent_buffer *eb = NULL;
5120         struct btrfs_key *keys;
5121         u64 *nodes;
5122         int level;
5123         int shared_level;
5124         int lowest_level = 0;
5125         int ret;
5126
5127         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5128                 lowest_level = ref_path->owner_objectid;
5129
5130         if (!root->ref_cows) {
5131                 path->lowest_level = lowest_level;
5132                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5133                 BUG_ON(ret < 0);
5134                 path->lowest_level = 0;
5135                 btrfs_release_path(root, path);
5136                 return 0;
5137         }
5138
5139         mutex_lock(&root->fs_info->tree_reloc_mutex);
5140         ret = init_reloc_tree(trans, root);
5141         BUG_ON(ret);
5142         reloc_root = root->reloc_root;
5143
5144         shared_level = ref_path->shared_level;
5145         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5146
5147         keys = ref_path->node_keys;
5148         nodes = ref_path->new_nodes;
5149         memset(&keys[shared_level + 1], 0,
5150                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5151         memset(&nodes[shared_level + 1], 0,
5152                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5153
5154         if (nodes[lowest_level] == 0) {
5155                 path->lowest_level = lowest_level;
5156                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5157                                         0, 1);
5158                 BUG_ON(ret);
5159                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5160                         eb = path->nodes[level];
5161                         if (!eb || eb == reloc_root->node)
5162                                 break;
5163                         nodes[level] = eb->start;
5164                         if (level == 0)
5165                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
5166                         else
5167                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
5168                 }
5169                 if (nodes[0] &&
5170                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5171                         eb = path->nodes[0];
5172                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
5173                                                       group, reloc_inode);
5174                         BUG_ON(ret);
5175                 }
5176                 btrfs_release_path(reloc_root, path);
5177         } else {
5178                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5179                                        lowest_level);
5180                 BUG_ON(ret);
5181         }
5182
5183         /*
5184          * replace tree blocks in the fs tree with tree blocks in
5185          * the reloc tree.
5186          */
5187         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5188         BUG_ON(ret < 0);
5189
5190         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5191                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5192                                         0, 0);
5193                 BUG_ON(ret);
5194                 extent_buffer_get(path->nodes[0]);
5195                 eb = path->nodes[0];
5196                 btrfs_release_path(reloc_root, path);
5197                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
5198                 BUG_ON(ret);
5199                 free_extent_buffer(eb);
5200         }
5201
5202         mutex_unlock(&root->fs_info->tree_reloc_mutex);
5203         path->lowest_level = 0;
5204         return 0;
5205 }
5206
5207 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5208                                         struct btrfs_root *root,
5209                                         struct btrfs_path *path,
5210                                         struct btrfs_key *first_key,
5211                                         struct btrfs_ref_path *ref_path)
5212 {
5213         int ret;
5214
5215         ret = relocate_one_path(trans, root, path, first_key,
5216                                 ref_path, NULL, NULL);
5217         BUG_ON(ret);
5218
5219         return 0;
5220 }
5221
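     /*
      * special case for an extent record at bytenr zero: delete the
      * extent item instead of trying to relocate it.
      */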
5222 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5223                                     struct btrfs_root *extent_root,
5224                                     struct btrfs_path *path,
5225                                     struct btrfs_key *extent_key)
5226 {
5227         int ret;
5228
5229         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5230         if (ret)
5231                 goto out;
5232         ret = btrfs_del_item(trans, extent_root, path);
5233 out:
5234         btrfs_release_path(extent_root, path);
5235         return ret;
5236 }
5237
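     /*
      * read the fs root that a reference path belongs to.  cow-only
      * roots are keyed with offset 0, all other roots with (u64)-1 so
      * the latest root item is found.
      */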
5238 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5239                                                 struct btrfs_ref_path *ref_path)
5240 {
5241         struct btrfs_key root_key;
5242
5243         root_key.objectid = ref_path->root_objectid;
5244         root_key.type = BTRFS_ROOT_ITEM_KEY;
5245         if (is_cowonly_root(ref_path->root_objectid))
5246                 root_key.offset = 0;
5247         else
5248                 root_key.offset = (u64)-1;
5249
5250         return btrfs_read_fs_root_no_name(fs_info, &root_key);
5251 }
5252
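     /*
      * relocate one extent for every root that still references it.  On
      * pass 0 data extents are copied into the reloc inode; on pass 1
      * references are updated through the reloc trees; later passes fall
      * back to rewriting file extent pointers via replace_one_extent().
      * Tree blocks are handled by relocate_tree_block() on every pass.
      */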
5253 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5254                                         struct btrfs_path *path,
5255                                         struct btrfs_key *extent_key,
5256                                         struct btrfs_block_group_cache *group,
5257                                         struct inode *reloc_inode, int pass)
5258 {
5259         struct btrfs_trans_handle *trans;
5260         struct btrfs_root *found_root;
5261         struct btrfs_ref_path *ref_path = NULL;
5262         struct disk_extent *new_extents = NULL;
5263         int nr_extents = 0;
5264         int loops;
5265         int ret;
5266         int level;
5267         struct btrfs_key first_key;
5268         u64 prev_block = 0;
5269
5271         trans = btrfs_start_transaction(extent_root, 1);
5272         BUG_ON(!trans);
5273
5274         if (extent_key->objectid == 0) {
5275                 ret = del_extent_zero(trans, extent_root, path, extent_key);
5276                 goto out;
5277         }
5278
5279         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5280         if (!ref_path) {
5281                 ret = -ENOMEM;
5282                 goto out;
5283         }
5284
5285         for (loops = 0; ; loops++) {
5286                 if (loops == 0) {
5287                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5288                                                    extent_key->objectid);
5289                 } else {
5290                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5291                 }
5292                 if (ret < 0)
5293                         goto out;
5294                 if (ret > 0)
5295                         break;
5296
5297                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5298                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5299                         continue;
5300
5301                 found_root = read_ref_root(extent_root->fs_info, ref_path);
5302                 BUG_ON(!found_root);
5303                 /*
5304                  * for reference counted trees, only process reference
5305                  * paths rooted at the latest committed root.
5306                  */
5307                 if (found_root->ref_cows &&
5308                     ref_path->root_generation != found_root->root_key.offset)
5309                         continue;
5310
5311                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5312                         if (pass == 0) {
5313                                 /*
5314                                  * copy data extents to new locations
5315                                  */
5316                                 u64 group_start = group->key.objectid;
5317                                 ret = relocate_data_extent(reloc_inode,
5318                                                            extent_key,
5319                                                            group_start);
5320                                 if (ret < 0)
5321                                         goto out;
5322                                 break;
5323                         }
5324                         level = 0;
5325                 } else {
5326                         level = ref_path->owner_objectid;
5327                 }
5328
5329                 if (prev_block != ref_path->nodes[level]) {
5330                         struct extent_buffer *eb;
5331                         u64 block_start = ref_path->nodes[level];
5332                         u64 block_size = btrfs_level_size(found_root, level);
5333
5334                         eb = read_tree_block(found_root, block_start,
5335                                              block_size, 0);
                             if (!eb) {
                                     /* tree block read failed */
                                     ret = -EIO;
                                     goto out;
                             }
5336                         btrfs_tree_lock(eb);
5337                         BUG_ON(level != btrfs_header_level(eb));
5338
5339                         if (level == 0)
5340                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
5341                         else
5342                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
5343
5344                         btrfs_tree_unlock(eb);
5345                         free_extent_buffer(eb);
5346                         prev_block = block_start;
5347                 }
5348
5349                 mutex_lock(&extent_root->fs_info->trans_mutex);
5350                 btrfs_record_root_in_trans(found_root);
5351                 mutex_unlock(&extent_root->fs_info->trans_mutex);
5352                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5353                         /*
5354                          * try to update data extent references while
5355                          * keeping metadata shared between snapshots.
5356                          */
5357                         if (pass == 1) {
5358                                 ret = relocate_one_path(trans, found_root,
5359                                                 path, &first_key, ref_path,
5360                                                 group, reloc_inode);
5361                                 if (ret < 0)
5362                                         goto out;
5363                                 continue;
5364                         }
5365                         /*
5366                          * use fallback method to process the remaining
5367                          * references.
5368                          */
5369                         if (!new_extents) {
5370                                 u64 group_start = group->key.objectid;
5371                                 new_extents = kmalloc(sizeof(*new_extents),
5372                                                       GFP_NOFS);
                                     if (!new_extents) {
                                             ret = -ENOMEM;
                                             goto out;
                                     }
5373                                 nr_extents = 1;
5374                                 ret = get_new_locations(reloc_inode,
5375                                                         extent_key,
5376                                                         group_start, 1,
5377                                                         &new_extents,
5378                                                         &nr_extents);
5379                                 if (ret)
5380                                         goto out;
5381                         }
5382                         ret = replace_one_extent(trans, found_root,
5383                                                 path, extent_key,
5384                                                 &first_key, ref_path,
5385                                                 new_extents, nr_extents);
5386                 } else {
5387                         ret = relocate_tree_block(trans, found_root, path,
5388                                                   &first_key, ref_path);
5389                 }
5390                 if (ret < 0)
5391                         goto out;
5392         }
5393         ret = 0;
5394 out:
5395         btrfs_end_transaction(trans, extent_root);
5396         kfree(new_extents);
5397         kfree(ref_path);
5398         return ret;
5399 }
5400
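/*
 * pick the profile to use when a block group's chunk is reallocated
 * during relocation.  with a single rw device, mirroring (raid1/raid10)
 * degrades to dup and raid0 to single-device chunks; with multiple rw
 * devices, dup is promoted to raid1 and single chunks to raid0.
 */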
5401 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5402 {
5403         u64 num_devices;
5404         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5405                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5406
5407         num_devices = root->fs_info->fs_devices->rw_devices;
5408         if (num_devices == 1) {
5409                 stripped |= BTRFS_BLOCK_GROUP_DUP;
5410                 stripped = flags & ~stripped;
5411
5412                 /* turn raid0 into single device chunks */
5413                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5414                         return stripped;
5415
5416                 /* turn mirroring into duplication */
5417                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5418                              BTRFS_BLOCK_GROUP_RAID10))
5419                         return stripped | BTRFS_BLOCK_GROUP_DUP;
5420                 return flags;
5421         } else {
5422                 /* they already had raid on here, just return */
5423                 if (flags & stripped)
5424                         return flags;
5425
5426                 stripped |= BTRFS_BLOCK_GROUP_DUP;
5427                 stripped = flags & ~stripped;
5428
5429                 /* switch duplicated blocks with raid1 */
5430                 if (flags & BTRFS_BLOCK_GROUP_DUP)
5431                 /* switch duplicated chunks to raid1 */
5432
5433                 /* turn single device chunks into raid0 */
5434                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5435         }
5436         return flags;   /* not reached: both branches above return */
5437 }
5438
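/*
 * make sure there is a chunk with the right profile for the extents
 * about to be moved out of the block group we are shrinking.  the
 * request is padded by 2MB beyond the bytes still in use.
 */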
5439 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5440                      struct btrfs_block_group_cache *shrink_block_group,
5441                      int force)
5442 {
5443         struct btrfs_trans_handle *trans;
5444         u64 new_alloc_flags;
5445         u64 calc;
5446
5447         spin_lock(&shrink_block_group->lock);
5448         if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5449                 spin_unlock(&shrink_block_group->lock);
5450
5451                 trans = btrfs_start_transaction(root, 1);
5452                 spin_lock(&shrink_block_group->lock);
5453
5454                 new_alloc_flags = update_block_group_flags(root,
5455                                                    shrink_block_group->flags);
5456                 if (new_alloc_flags != shrink_block_group->flags) {
5457                         calc =
5458                              btrfs_block_group_used(&shrink_block_group->item);
5459                 } else {
5460                         calc = shrink_block_group->key.offset;
5461                 }
5462                 spin_unlock(&shrink_block_group->lock);
5463
5464                 do_chunk_alloc(trans, root->fs_info->extent_root,
5465                                calc + 2 * 1024 * 1024, new_alloc_flags, force);
5466
5467                 btrfs_end_transaction(trans, root);
5468         } else
5469                 spin_unlock(&shrink_block_group->lock);
5470         return 0;
5471 }
5472
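/*
 * insert a bare inode item for the relocation inode: a regular file
 * (mode 0600) of the given size, flagged nocompress so the relocated
 * data is copied byte for byte.
 */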
5473 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5474                                  struct btrfs_root *root,
5475                                  u64 objectid, u64 size)
5476 {
5477         struct btrfs_path *path;
5478         struct btrfs_inode_item *item;
5479         struct extent_buffer *leaf;
5480         int ret;
5481
5482         path = btrfs_alloc_path();
5483         if (!path)
5484                 return -ENOMEM;
5485
5486         path->leave_spinning = 1;
5487         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5488         if (ret)
5489                 goto out;
5490
5491         leaf = path->nodes[0];
5492         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5493         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5494         btrfs_set_inode_generation(leaf, item, 1);
5495         btrfs_set_inode_size(leaf, item, size);
5496         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5497         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5498         btrfs_mark_buffer_dirty(leaf);
5499         btrfs_release_path(root, path);
5500 out:
5501         btrfs_free_path(path);
5502         return ret;
5503 }
5504
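/*
 * create an inode in the data relocation tree that covers the block
 * group with a single preallocated file extent.  index_cnt remembers
 * the block group start so file offsets map back to the old disk
 * bytenrs, and the inode goes on the orphan list so it is cleaned up
 * if we crash mid-relocation.
 */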
5505 static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5506                                         struct btrfs_block_group_cache *group)
5507 {
5508         struct inode *inode = NULL;
5509         struct btrfs_trans_handle *trans;
5510         struct btrfs_root *root;
5511         struct btrfs_key root_key;
5512         u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5513         int err = 0;
5514
5515         root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5516         root_key.type = BTRFS_ROOT_ITEM_KEY;
5517         root_key.offset = (u64)-1;
5518         root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5519         if (IS_ERR(root))
5520                 return ERR_CAST(root);
5521
5522         trans = btrfs_start_transaction(root, 1);
5523         BUG_ON(!trans);
5524
5525         err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5526         if (err)
5527                 goto out;
5528
5529         err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5530         BUG_ON(err);
5531
5532         err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5533                                        group->key.offset, 0, group->key.offset,
5534                                        0, 0, 0);
5535         BUG_ON(err);
5536
5537         inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
             if (!inode) {
                     err = -ENOMEM;
                     goto out;
             }
5538         if (inode->i_state & I_NEW) {
5539                 BTRFS_I(inode)->root = root;
5540                 BTRFS_I(inode)->location.objectid = objectid;
5541                 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5542                 BTRFS_I(inode)->location.offset = 0;
5543                 btrfs_read_locked_inode(inode);
5544                 unlock_new_inode(inode);
5545                 BUG_ON(is_bad_inode(inode));
5546         } else {
5547                 BUG_ON(1);
5548         }
5549         BTRFS_I(inode)->index_cnt = group->key.objectid;
5550
5551         err = btrfs_orphan_add(trans, inode);
5552 out:
5553         btrfs_end_transaction(trans, root);
5554         if (err) {
5555                 if (inode)
5556                         iput(inode);
5557                 inode = ERR_PTR(err);
5558         }
5559         return inode;
5560 }
5561
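/*
 * copy the checksums of a data extent that was relocated through the
 * reloc inode.  file_pos + index_cnt gives the old disk bytenr; the
 * csums found there are rebased onto the start of the new ordered
 * extent and attached to it.
 */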
5562 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5563 {
5565         struct btrfs_ordered_sum *sums;
5566         struct btrfs_sector_sum *sector_sum;
5567         struct btrfs_ordered_extent *ordered;
5568         struct btrfs_root *root = BTRFS_I(inode)->root;
5569         struct list_head list;
5570         size_t offset;
5571         int ret;
5572         u64 disk_bytenr;
5573
5574         INIT_LIST_HEAD(&list);
5575
5576         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5577         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5578
5579         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5580         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
5581                                        disk_bytenr + len - 1, &list);
             BUG_ON(ret);
5582
5583         while (!list_empty(&list)) {
5584                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5585                 list_del_init(&sums->list);
5586
5587                 sector_sum = sums->sums;
5588                 sums->bytenr = ordered->start;
5589
5590                 offset = 0;
5591                 while (offset < sums->len) {
5592                         sector_sum->bytenr += ordered->start - disk_bytenr;
5593                         sector_sum++;
5594                         offset += root->sectorsize;
5595                 }
5596
5597                 btrfs_add_ordered_sum(inode, ordered, sums);
5598         }
5599         btrfs_put_ordered_extent(ordered);
5600         return 0;
5601 }
5602
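/*
 * move every extent out of the block group starting at group_start.
 * pass 0 copies data extents into the reloc inode; later passes update
 * references in place.  the extent tree is rescanned (with another
 * pass) until nothing in the group remains.
 */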
5603 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5604 {
5605         struct btrfs_trans_handle *trans;
5606         struct btrfs_path *path;
5607         struct btrfs_fs_info *info = root->fs_info;
5608         struct extent_buffer *leaf;
5609         struct inode *reloc_inode;
5610         struct btrfs_block_group_cache *block_group;
5611         struct btrfs_key key;
5612         u64 skipped;
5613         u64 cur_byte;
5614         u64 total_found;
5615         u32 nritems;
5616         int ret;
5617         int progress;
5618         int pass = 0;
5619
5620         root = root->fs_info->extent_root;
5621
5622         block_group = btrfs_lookup_block_group(info, group_start);
5623         BUG_ON(!block_group);
5624
5625         printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
5626                (unsigned long long)block_group->key.objectid,
5627                (unsigned long long)block_group->flags);
5628
5629         path = btrfs_alloc_path();
5630         BUG_ON(!path);
5631
5632         reloc_inode = create_reloc_inode(info, block_group);
5633         BUG_ON(IS_ERR(reloc_inode));
5634
5635         __alloc_chunk_for_shrink(root, block_group, 1);
5636         set_block_group_readonly(block_group);
5637
5638         btrfs_start_delalloc_inodes(info->tree_root);
5639         btrfs_wait_ordered_extents(info->tree_root, 0);
5640 again:
5641         skipped = 0;
5642         total_found = 0;
5643         progress = 0;
5644         key.objectid = block_group->key.objectid;
5645         key.offset = 0;
5646         key.type = 0;
5647         cur_byte = key.objectid;
5648
5649         trans = btrfs_start_transaction(info->tree_root, 1);
5650         btrfs_commit_transaction(trans, info->tree_root);
5651
5652         mutex_lock(&root->fs_info->cleaner_mutex);
5653         btrfs_clean_old_snapshots(info->tree_root);
5654         btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5655         mutex_unlock(&root->fs_info->cleaner_mutex);
5656
5657         trans = btrfs_start_transaction(info->tree_root, 1);
5658         btrfs_commit_transaction(trans, info->tree_root);
5659
5660         while (1) {
5661                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5662                 if (ret < 0)
5663                         goto out;
5664 next:
5665                 leaf = path->nodes[0];
5666                 nritems = btrfs_header_nritems(leaf);
5667                 if (path->slots[0] >= nritems) {
5668                         ret = btrfs_next_leaf(root, path);
5669                         if (ret < 0)
5670                                 goto out;
5671                         if (ret == 1) {
5672                                 ret = 0;
5673                                 break;
5674                         }
5675                         leaf = path->nodes[0];
5676                         nritems = btrfs_header_nritems(leaf);
5677                 }
5678
5679                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5680
5681                 if (key.objectid >= block_group->key.objectid +
5682                     block_group->key.offset)
5683                         break;
5684
5685                 if (progress && need_resched()) {
5686                         btrfs_release_path(root, path);
5687                         cond_resched();
5688                         progress = 0;
5689                         continue;
5690                 }
5691                 progress = 1;
5692
5693                 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5694                     key.objectid + key.offset <= cur_byte) {
5695                         path->slots[0]++;
5696                         goto next;
5697                 }
5698
5699                 total_found++;
5700                 cur_byte = key.objectid + key.offset;
5701                 btrfs_release_path(root, path);
5702
5703                 __alloc_chunk_for_shrink(root, block_group, 0);
5704                 ret = relocate_one_extent(root, path, &key, block_group,
5705                                           reloc_inode, pass);
5706                 BUG_ON(ret < 0);
5707                 if (ret > 0)
5708                         skipped++;
5709
5710                 key.objectid = cur_byte;
5711                 key.type = 0;
5712                 key.offset = 0;
5713         }
5714
5715         btrfs_release_path(root, path);
5716
5717         if (pass == 0) {
5718                 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5719                 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5720         }
5721
5722         if (total_found > 0) {
5723                 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5724                        (unsigned long long)total_found, pass);
5725                 pass++;
5726                 if (total_found == skipped && pass > 2) {
5727                         iput(reloc_inode);
5728                         reloc_inode = create_reloc_inode(info, block_group);
                             BUG_ON(IS_ERR(reloc_inode));
5729                         pass = 0;
5730                 }
5731                 goto again;
5732         }
5733
5734         /* delete reloc_inode */
5735         iput(reloc_inode);
5736
5737         /* unpin extents in this range */
5738         trans = btrfs_start_transaction(info->tree_root, 1);
5739         btrfs_commit_transaction(trans, info->tree_root);
5740
5741         spin_lock(&block_group->lock);
5742         WARN_ON(block_group->pinned > 0);
5743         WARN_ON(block_group->reserved > 0);
5744         WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5745         spin_unlock(&block_group->lock);
5746         btrfs_put_block_group(block_group);
5747         ret = 0;
5748 out:
5749         btrfs_free_path(path);
5750         return ret;
5751 }
5752
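/*
 * find the first block group item at or after key->objectid.  returns
 * 0 with the path pointing at the item, or -ENOENT if there is none.
 */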
5753 static int find_first_block_group(struct btrfs_root *root,
5754                 struct btrfs_path *path, struct btrfs_key *key)
5755 {
5756         int ret = 0;
5757         struct btrfs_key found_key;
5758         struct extent_buffer *leaf;
5759         int slot;
5760
5761         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5762         if (ret < 0)
5763                 goto out;
5764
5765         while (1) {
5766                 slot = path->slots[0];
5767                 leaf = path->nodes[0];
5768                 if (slot >= btrfs_header_nritems(leaf)) {
5769                         ret = btrfs_next_leaf(root, path);
5770                         if (ret == 0)
5771                                 continue;
5772                         if (ret < 0)
5773                                 goto out;
5774                         break;
5775                 }
5776                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5777
5778                 if (found_key.objectid >= key->objectid &&
5779                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5780                         ret = 0;
5781                         goto out;
5782                 }
5783                 path->slots[0]++;
5784         }
5785         ret = -ENOENT;
5786 out:
5787         return ret;
5788 }
5789
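/*
 * called in the final stages of unmount to tear down all block group
 * caches and the space_info structs behind them.
 */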
5790 int btrfs_free_block_groups(struct btrfs_fs_info *info)
5791 {
5792         struct btrfs_block_group_cache *block_group;
5793         struct btrfs_space_info *space_info;
5794         struct rb_node *n;
5795
5796         spin_lock(&info->block_group_cache_lock);
5797         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5798                 block_group = rb_entry(n, struct btrfs_block_group_cache,
5799                                        cache_node);
5800                 rb_erase(&block_group->cache_node,
5801                          &info->block_group_cache_tree);
5802                 spin_unlock(&info->block_group_cache_lock);
5803
5804                 btrfs_remove_free_space_cache(block_group);
5805                 down_write(&block_group->space_info->groups_sem);
5806                 list_del(&block_group->list);
5807                 up_write(&block_group->space_info->groups_sem);
5808
5809                 WARN_ON(atomic_read(&block_group->count) != 1);
5810                 kfree(block_group);
5811
5812                 spin_lock(&info->block_group_cache_lock);
5813         }
5814         spin_unlock(&info->block_group_cache_lock);
5815
5816         /* now that all the block groups are freed, go through and
5817          * free all the space_info structs.  This is only called during
5818          * the final stages of unmount, and so we know nobody is
5819          * using them.  We call synchronize_rcu() once before we start,
5820          * just to be on the safe side.
5821          */
5822         synchronize_rcu();
5823
5824         while (!list_empty(&info->space_info)) {
5825                 space_info = list_entry(info->space_info.next,
5826                                         struct btrfs_space_info,
5827                                         list);
5828
5829                 list_del(&space_info->list);
5830                 kfree(space_info);
5831         }
5832         return 0;
5833 }
5834
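/*
 * build the in-memory block group cache from the block group items in
 * the extent tree, wiring each group to its space_info and marking
 * read-only chunks.
 */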
5835 int btrfs_read_block_groups(struct btrfs_root *root)
5836 {
5837         struct btrfs_path *path;
5838         int ret;
5839         struct btrfs_block_group_cache *cache;
5840         struct btrfs_fs_info *info = root->fs_info;
5841         struct btrfs_space_info *space_info;
5842         struct btrfs_key key;
5843         struct btrfs_key found_key;
5844         struct extent_buffer *leaf;
5845
5846         root = info->extent_root;
5847         key.objectid = 0;
5848         key.offset = 0;
5849         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5850         path = btrfs_alloc_path();
5851         if (!path)
5852                 return -ENOMEM;
5853
5854         while (1) {
5855                 ret = find_first_block_group(root, path, &key);
5856                 if (ret > 0) {
5857                         ret = 0;
5858                         goto error;
5859                 }
5860                 if (ret != 0)
5861                         goto error;
5862
5863                 leaf = path->nodes[0];
5864                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5865                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5866                 if (!cache) {
5867                         ret = -ENOMEM;
5868                         goto error;
5869                 }
5870
5871                 atomic_set(&cache->count, 1);
5872                 spin_lock_init(&cache->lock);
5873                 spin_lock_init(&cache->tree_lock);
5874                 mutex_init(&cache->cache_mutex);
5875                 INIT_LIST_HEAD(&cache->list);
5876                 INIT_LIST_HEAD(&cache->cluster_list);
5877                 read_extent_buffer(leaf, &cache->item,
5878                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
5879                                    sizeof(cache->item));
5880                 memcpy(&cache->key, &found_key, sizeof(found_key));
5881
5882                 key.objectid = found_key.objectid + found_key.offset;
5883                 btrfs_release_path(root, path);
5884                 cache->flags = btrfs_block_group_flags(&cache->item);
5885
5886                 ret = update_space_info(info, cache->flags, found_key.offset,
5887                                         btrfs_block_group_used(&cache->item),
5888                                         &space_info);
5889                 BUG_ON(ret);
5890                 cache->space_info = space_info;
5891                 down_write(&space_info->groups_sem);
5892                 list_add_tail(&cache->list, &space_info->block_groups);
5893                 up_write(&space_info->groups_sem);
5894
5895                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5896                 BUG_ON(ret);
5897
5898                 set_avail_alloc_bits(root->fs_info, cache->flags);
5899                 if (btrfs_chunk_readonly(root, cache->key.objectid))
5900                         set_block_group_readonly(cache);
5901         }
5902         ret = 0;
5903 error:
5904         btrfs_free_path(path);
5905         return ret;
5906 }
5907
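/*
 * create a block group for a freshly allocated chunk: set up the cache
 * struct, account for it in the matching space_info and insert the
 * block group item into the extent tree.
 */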
5908 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5909                            struct btrfs_root *root, u64 bytes_used,
5910                            u64 type, u64 chunk_objectid, u64 chunk_offset,
5911                            u64 size)
5912 {
5913         int ret;
5914         struct btrfs_root *extent_root;
5915         struct btrfs_block_group_cache *cache;
5916
5917         extent_root = root->fs_info->extent_root;
5918
5919         root->fs_info->last_trans_log_full_commit = trans->transid;
5920
5921         cache = kzalloc(sizeof(*cache), GFP_NOFS);
5922         if (!cache)
5923                 return -ENOMEM;
5924
5925         cache->key.objectid = chunk_offset;
5926         cache->key.offset = size;
5927         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5928         atomic_set(&cache->count, 1);
5929         spin_lock_init(&cache->lock);
5930         spin_lock_init(&cache->tree_lock);
5931         mutex_init(&cache->cache_mutex);
5932         INIT_LIST_HEAD(&cache->list);
5933         INIT_LIST_HEAD(&cache->cluster_list);
5934
5935         btrfs_set_block_group_used(&cache->item, bytes_used);
5936         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5937         cache->flags = type;
5938         btrfs_set_block_group_flags(&cache->item, type);
5939
5940         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5941                                 &cache->space_info);
5942         BUG_ON(ret);
5943         down_write(&cache->space_info->groups_sem);
5944         list_add_tail(&cache->list, &cache->space_info->block_groups);
5945         up_write(&cache->space_info->groups_sem);
5946
5947         ret = btrfs_add_block_group_cache(root->fs_info, cache);
5948         BUG_ON(ret);
5949
5950         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5951                                 sizeof(cache->item));
5952         BUG_ON(ret);
5953
5954         set_avail_alloc_bits(extent_root->fs_info, type);
5955
5956         return 0;
5957 }
5958
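/*
 * remove an empty, read-only block group: unhook it from the caches
 * and its space_info, then delete its item from the extent tree.
 */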
5959 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5960                              struct btrfs_root *root, u64 group_start)
5961 {
5962         struct btrfs_path *path;
5963         struct btrfs_block_group_cache *block_group;
5964         struct btrfs_key key;
5965         int ret;
5966
5967         root = root->fs_info->extent_root;
5968
5969         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5970         BUG_ON(!block_group);
5971         BUG_ON(!block_group->ro);
5972
5973         memcpy(&key, &block_group->key, sizeof(key));
5974
5975         path = btrfs_alloc_path();
5976         BUG_ON(!path);
5977
5978         spin_lock(&root->fs_info->block_group_cache_lock);
5979         rb_erase(&block_group->cache_node,
5980                  &root->fs_info->block_group_cache_tree);
5981         spin_unlock(&root->fs_info->block_group_cache_lock);
5982         btrfs_remove_free_space_cache(block_group);
5983         down_write(&block_group->space_info->groups_sem);
5984         list_del(&block_group->list);
5985         up_write(&block_group->space_info->groups_sem);
5986
5987         spin_lock(&block_group->space_info->lock);
5988         block_group->space_info->total_bytes -= block_group->key.offset;
5989         block_group->space_info->bytes_readonly -= block_group->key.offset;
5990         block_group->space_info->full = 0;
5991         spin_unlock(&block_group->space_info->lock);
5992
5993         btrfs_put_block_group(block_group);     /* ref from the lookup above */
5994         btrfs_put_block_group(block_group);     /* ref held since the group was cached */
5995
5996         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5997         if (ret > 0)
5998                 ret = -EIO;
5999         if (ret < 0)
6000                 goto out;
6001
6002         ret = btrfs_del_item(trans, root, path);
6003 out:
6004         btrfs_free_path(path);
6005         return ret;
6006 }