Btrfs: do extent allocation and reference count updates in the background
[linux-2.6] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include "compat.h"
25 #include "hash.h"
26 #include "crc32c.h"
27 #include "ctree.h"
28 #include "disk-io.h"
29 #include "print-tree.h"
30 #include "transaction.h"
31 #include "volumes.h"
32 #include "locking.h"
33 #include "ref-cache.h"
34
35 #define PENDING_EXTENT_INSERT 0
36 #define PENDING_EXTENT_DELETE 1
37 #define PENDING_BACKREF_UPDATE 2
38
39 struct pending_extent_op {
40         int type;
41         u64 bytenr;
42         u64 num_bytes;
43         u64 parent;
44         u64 orig_parent;
45         u64 generation;
46         u64 orig_generation;
47         int level;
48         struct list_head list;
49         int del;
50 };
51
52 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
53                                          struct btrfs_root *root, u64 parent,
54                                          u64 root_objectid, u64 ref_generation,
55                                          u64 owner, struct btrfs_key *ins,
56                                          int ref_mod);
57 static int update_reserved_extents(struct btrfs_root *root,
58                                    u64 bytenr, u64 num, int reserve);
59 static int pin_down_bytes(struct btrfs_trans_handle *trans,
60                           struct btrfs_root *root,
61                           u64 bytenr, u64 num_bytes, int is_data);
62 static int update_block_group(struct btrfs_trans_handle *trans,
63                               struct btrfs_root *root,
64                               u64 bytenr, u64 num_bytes, int alloc,
65                               int mark_free);
66 static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
67                                         struct btrfs_root *root,
68                                         u64 bytenr, u64 num_bytes, u64 parent,
69                                         u64 root_objectid, u64 ref_generation,
70                                         u64 owner_objectid, int pin,
71                                         int ref_to_drop);
72
73 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
74                           struct btrfs_root *extent_root, u64 alloc_bytes,
75                           u64 flags, int force);
76
77 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
78 {
79         return (cache->flags & bits) == bits;
80 }
81
82 /*
83  * this adds the block group to the fs_info rb tree for the block group
84  * cache
85  */
86 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
87                                 struct btrfs_block_group_cache *block_group)
88 {
89         struct rb_node **p;
90         struct rb_node *parent = NULL;
91         struct btrfs_block_group_cache *cache;
92
93         spin_lock(&info->block_group_cache_lock);
94         p = &info->block_group_cache_tree.rb_node;
95
96         while (*p) {
97                 parent = *p;
98                 cache = rb_entry(parent, struct btrfs_block_group_cache,
99                                  cache_node);
100                 if (block_group->key.objectid < cache->key.objectid) {
101                         p = &(*p)->rb_left;
102                 } else if (block_group->key.objectid > cache->key.objectid) {
103                         p = &(*p)->rb_right;
104                 } else {
105                         spin_unlock(&info->block_group_cache_lock);
106                         return -EEXIST;
107                 }
108         }
109
110         rb_link_node(&block_group->cache_node, parent, p);
111         rb_insert_color(&block_group->cache_node,
112                         &info->block_group_cache_tree);
113         spin_unlock(&info->block_group_cache_lock);
114
115         return 0;
116 }
117
118 /*
119  * This will return the block group at or after bytenr if contains is 0, else
120  * it will return the block group that contains the bytenr
121  */
122 static struct btrfs_block_group_cache *
123 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
124                               int contains)
125 {
126         struct btrfs_block_group_cache *cache, *ret = NULL;
127         struct rb_node *n;
128         u64 end, start;
129
130         spin_lock(&info->block_group_cache_lock);
131         n = info->block_group_cache_tree.rb_node;
132
133         while (n) {
134                 cache = rb_entry(n, struct btrfs_block_group_cache,
135                                  cache_node);
136                 end = cache->key.objectid + cache->key.offset - 1;
137                 start = cache->key.objectid;
138
139                 if (bytenr < start) {
140                         if (!contains && (!ret || start < ret->key.objectid))
141                                 ret = cache;
142                         n = n->rb_left;
143                 } else if (bytenr > start) {
144                         if (contains && bytenr <= end) {
145                                 ret = cache;
146                                 break;
147                         }
148                         n = n->rb_right;
149                 } else {
150                         ret = cache;
151                         break;
152                 }
153         }
154         if (ret)
155                 atomic_inc(&ret->count);
156         spin_unlock(&info->block_group_cache_lock);
157
158         return ret;
159 }
160
161 /*
162  * this is only called by cache_block_group, since we could have freed extents
163  * we need to check the pinned_extents for any extents that can't be used yet
164  * since their free space will be released as soon as the transaction commits.
165  */
166 static int add_new_free_space(struct btrfs_block_group_cache *block_group,
167                               struct btrfs_fs_info *info, u64 start, u64 end)
168 {
169         u64 extent_start, extent_end, size;
170         int ret;
171
172         mutex_lock(&info->pinned_mutex);
173         while (start < end) {
174                 ret = find_first_extent_bit(&info->pinned_extents, start,
175                                             &extent_start, &extent_end,
176                                             EXTENT_DIRTY);
177                 if (ret)
178                         break;
179
180                 if (extent_start == start) {
181                         start = extent_end + 1;
182                 } else if (extent_start > start && extent_start < end) {
183                         size = extent_start - start;
184                         ret = btrfs_add_free_space(block_group, start,
185                                                    size);
186                         BUG_ON(ret);
187                         start = extent_end + 1;
188                 } else {
189                         break;
190                 }
191         }
192
193         if (start < end) {
194                 size = end - start;
195                 ret = btrfs_add_free_space(block_group, start, size);
196                 BUG_ON(ret);
197         }
198         mutex_unlock(&info->pinned_mutex);
199
200         return 0;
201 }
202
203 static int remove_sb_from_cache(struct btrfs_root *root,
204                                 struct btrfs_block_group_cache *cache)
205 {
206         u64 bytenr;
207         u64 *logical;
208         int stripe_len;
209         int i, nr, ret;
210
211         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
212                 bytenr = btrfs_sb_offset(i);
213                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
214                                        cache->key.objectid, bytenr, 0,
215                                        &logical, &nr, &stripe_len);
216                 BUG_ON(ret);
217                 while (nr--) {
218                         btrfs_remove_free_space(cache, logical[nr],
219                                                 stripe_len);
220                 }
221                 kfree(logical);
222         }
223         return 0;
224 }
225
226 static int cache_block_group(struct btrfs_root *root,
227                              struct btrfs_block_group_cache *block_group)
228 {
229         struct btrfs_path *path;
230         int ret = 0;
231         struct btrfs_key key;
232         struct extent_buffer *leaf;
233         int slot;
234         u64 last;
235
236         if (!block_group)
237                 return 0;
238
239         root = root->fs_info->extent_root;
240
241         if (block_group->cached)
242                 return 0;
243
244         path = btrfs_alloc_path();
245         if (!path)
246                 return -ENOMEM;
247
248         path->reada = 2;
249         /*
250          * we get into deadlocks with paths held by callers of this function.
251          * since the alloc_mutex is protecting things right now, just
252          * skip the locking here
253          */
254         path->skip_locking = 1;
255         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
256         key.objectid = last;
257         key.offset = 0;
258         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
259         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
260         if (ret < 0)
261                 goto err;
262
263         while (1) {
264                 leaf = path->nodes[0];
265                 slot = path->slots[0];
266                 if (slot >= btrfs_header_nritems(leaf)) {
267                         ret = btrfs_next_leaf(root, path);
268                         if (ret < 0)
269                                 goto err;
270                         if (ret == 0)
271                                 continue;
272                         else
273                                 break;
274                 }
275                 btrfs_item_key_to_cpu(leaf, &key, slot);
276                 if (key.objectid < block_group->key.objectid)
277                         goto next;
278
279                 if (key.objectid >= block_group->key.objectid +
280                     block_group->key.offset)
281                         break;
282
283                 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
284                         add_new_free_space(block_group, root->fs_info, last,
285                                            key.objectid);
286
287                         last = key.objectid + key.offset;
288                 }
289 next:
290                 path->slots[0]++;
291         }
292
293         add_new_free_space(block_group, root->fs_info, last,
294                            block_group->key.objectid +
295                            block_group->key.offset);
296
297         remove_sb_from_cache(root, block_group);
298         block_group->cached = 1;
299         ret = 0;
300 err:
301         btrfs_free_path(path);
302         return ret;
303 }
304
305 /*
306  * return the block group that starts at or after bytenr
307  */
308 static struct btrfs_block_group_cache *
309 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
310 {
311         struct btrfs_block_group_cache *cache;
312
313         cache = block_group_cache_tree_search(info, bytenr, 0);
314
315         return cache;
316 }
317
318 /*
319  * return the block group that contains teh given bytenr
320  */
321 struct btrfs_block_group_cache *btrfs_lookup_block_group(
322                                                  struct btrfs_fs_info *info,
323                                                  u64 bytenr)
324 {
325         struct btrfs_block_group_cache *cache;
326
327         cache = block_group_cache_tree_search(info, bytenr, 1);
328
329         return cache;
330 }
331
332 static inline void put_block_group(struct btrfs_block_group_cache *cache)
333 {
334         if (atomic_dec_and_test(&cache->count))
335                 kfree(cache);
336 }
337
338 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
339                                                   u64 flags)
340 {
341         struct list_head *head = &info->space_info;
342         struct btrfs_space_info *found;
343
344         rcu_read_lock();
345         list_for_each_entry_rcu(found, head, list) {
346                 if (found->flags == flags) {
347                         rcu_read_unlock();
348                         return found;
349                 }
350         }
351         rcu_read_unlock();
352         return NULL;
353 }
354
355 /*
356  * after adding space to the filesystem, we need to clear the full flags
357  * on all the space infos.
358  */
359 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
360 {
361         struct list_head *head = &info->space_info;
362         struct btrfs_space_info *found;
363
364         rcu_read_lock();
365         list_for_each_entry_rcu(found, head, list)
366                 found->full = 0;
367         rcu_read_unlock();
368 }
369
370 static u64 div_factor(u64 num, int factor)
371 {
372         if (factor == 10)
373                 return num;
374         num *= factor;
375         do_div(num, 10);
376         return num;
377 }
378
379 u64 btrfs_find_block_group(struct btrfs_root *root,
380                            u64 search_start, u64 search_hint, int owner)
381 {
382         struct btrfs_block_group_cache *cache;
383         u64 used;
384         u64 last = max(search_hint, search_start);
385         u64 group_start = 0;
386         int full_search = 0;
387         int factor = 9;
388         int wrapped = 0;
389 again:
390         while (1) {
391                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
392                 if (!cache)
393                         break;
394
395                 spin_lock(&cache->lock);
396                 last = cache->key.objectid + cache->key.offset;
397                 used = btrfs_block_group_used(&cache->item);
398
399                 if ((full_search || !cache->ro) &&
400                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
401                         if (used + cache->pinned + cache->reserved <
402                             div_factor(cache->key.offset, factor)) {
403                                 group_start = cache->key.objectid;
404                                 spin_unlock(&cache->lock);
405                                 put_block_group(cache);
406                                 goto found;
407                         }
408                 }
409                 spin_unlock(&cache->lock);
410                 put_block_group(cache);
411                 cond_resched();
412         }
413         if (!wrapped) {
414                 last = search_start;
415                 wrapped = 1;
416                 goto again;
417         }
418         if (!full_search && factor < 10) {
419                 last = search_start;
420                 full_search = 1;
421                 factor = 10;
422                 goto again;
423         }
424 found:
425         return group_start;
426 }
427
428 /* simple helper to search for an existing extent at a given offset */
429 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
430 {
431         int ret;
432         struct btrfs_key key;
433         struct btrfs_path *path;
434
435         path = btrfs_alloc_path();
436         BUG_ON(!path);
437         key.objectid = start;
438         key.offset = len;
439         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
440         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
441                                 0, 0);
442         btrfs_free_path(path);
443         return ret;
444 }
445
446 /*
447  * Back reference rules.  Back refs have three main goals:
448  *
449  * 1) differentiate between all holders of references to an extent so that
450  *    when a reference is dropped we can make sure it was a valid reference
451  *    before freeing the extent.
452  *
453  * 2) Provide enough information to quickly find the holders of an extent
454  *    if we notice a given block is corrupted or bad.
455  *
456  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
457  *    maintenance.  This is actually the same as #2, but with a slightly
458  *    different use case.
459  *
460  * File extents can be referenced by:
461  *
462  * - multiple snapshots, subvolumes, or different generations in one subvol
463  * - different files inside a single subvolume
464  * - different offsets inside a file (bookend extents in file.c)
465  *
466  * The extent ref structure has fields for:
467  *
468  * - Objectid of the subvolume root
469  * - Generation number of the tree holding the reference
470  * - objectid of the file holding the reference
471  * - number of references holding by parent node (alway 1 for tree blocks)
472  *
473  * Btree leaf may hold multiple references to a file extent. In most cases,
474  * these references are from same file and the corresponding offsets inside
475  * the file are close together.
476  *
477  * When a file extent is allocated the fields are filled in:
478  *     (root_key.objectid, trans->transid, inode objectid, 1)
479  *
480  * When a leaf is cow'd new references are added for every file extent found
481  * in the leaf.  It looks similar to the create case, but trans->transid will
482  * be different when the block is cow'd.
483  *
484  *     (root_key.objectid, trans->transid, inode objectid,
485  *      number of references in the leaf)
486  *
487  * When a file extent is removed either during snapshot deletion or
488  * file truncation, we find the corresponding back reference and check
489  * the following fields:
490  *
491  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
492  *      inode objectid)
493  *
494  * Btree extents can be referenced by:
495  *
496  * - Different subvolumes
497  * - Different generations of the same subvolume
498  *
499  * When a tree block is created, back references are inserted:
500  *
501  * (root->root_key.objectid, trans->transid, level, 1)
502  *
503  * When a tree block is cow'd, new back references are added for all the
504  * blocks it points to. If the tree block isn't in reference counted root,
505  * the old back references are removed. These new back references are of
506  * the form (trans->transid will have increased since creation):
507  *
508  * (root->root_key.objectid, trans->transid, level, 1)
509  *
510  * When a backref is in deleting, the following fields are checked:
511  *
512  * if backref was for a tree root:
513  *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
514  * else
515  *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
516  *
517  * Back Reference Key composing:
518  *
519  * The key objectid corresponds to the first byte in the extent, the key
520  * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
521  * byte of parent extent. If a extent is tree root, the key offset is set
522  * to the key objectid.
523  */
524
525 static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
526                                           struct btrfs_root *root,
527                                           struct btrfs_path *path,
528                                           u64 bytenr, u64 parent,
529                                           u64 ref_root, u64 ref_generation,
530                                           u64 owner_objectid, int del)
531 {
532         struct btrfs_key key;
533         struct btrfs_extent_ref *ref;
534         struct extent_buffer *leaf;
535         u64 ref_objectid;
536         int ret;
537
538         key.objectid = bytenr;
539         key.type = BTRFS_EXTENT_REF_KEY;
540         key.offset = parent;
541
542         ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
543         if (ret < 0)
544                 goto out;
545         if (ret > 0) {
546                 ret = -ENOENT;
547                 goto out;
548         }
549
550         leaf = path->nodes[0];
551         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
552         ref_objectid = btrfs_ref_objectid(leaf, ref);
553         if (btrfs_ref_root(leaf, ref) != ref_root ||
554             btrfs_ref_generation(leaf, ref) != ref_generation ||
555             (ref_objectid != owner_objectid &&
556              ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
557                 ret = -EIO;
558                 WARN_ON(1);
559                 goto out;
560         }
561         ret = 0;
562 out:
563         return ret;
564 }
565
566 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
567                                           struct btrfs_root *root,
568                                           struct btrfs_path *path,
569                                           u64 bytenr, u64 parent,
570                                           u64 ref_root, u64 ref_generation,
571                                           u64 owner_objectid,
572                                           int refs_to_add)
573 {
574         struct btrfs_key key;
575         struct extent_buffer *leaf;
576         struct btrfs_extent_ref *ref;
577         u32 num_refs;
578         int ret;
579
580         key.objectid = bytenr;
581         key.type = BTRFS_EXTENT_REF_KEY;
582         key.offset = parent;
583
584         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
585         if (ret == 0) {
586                 leaf = path->nodes[0];
587                 ref = btrfs_item_ptr(leaf, path->slots[0],
588                                      struct btrfs_extent_ref);
589                 btrfs_set_ref_root(leaf, ref, ref_root);
590                 btrfs_set_ref_generation(leaf, ref, ref_generation);
591                 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
592                 btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
593         } else if (ret == -EEXIST) {
594                 u64 existing_owner;
595
596                 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
597                 leaf = path->nodes[0];
598                 ref = btrfs_item_ptr(leaf, path->slots[0],
599                                      struct btrfs_extent_ref);
600                 if (btrfs_ref_root(leaf, ref) != ref_root ||
601                     btrfs_ref_generation(leaf, ref) != ref_generation) {
602                         ret = -EIO;
603                         WARN_ON(1);
604                         goto out;
605                 }
606
607                 num_refs = btrfs_ref_num_refs(leaf, ref);
608                 BUG_ON(num_refs == 0);
609                 btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
610
611                 existing_owner = btrfs_ref_objectid(leaf, ref);
612                 if (existing_owner != owner_objectid &&
613                     existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
614                         btrfs_set_ref_objectid(leaf, ref,
615                                         BTRFS_MULTIPLE_OBJECTIDS);
616                 }
617                 ret = 0;
618         } else {
619                 goto out;
620         }
621         btrfs_mark_buffer_dirty(path->nodes[0]);
622 out:
623         btrfs_release_path(root, path);
624         return ret;
625 }
626
627 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
628                                           struct btrfs_root *root,
629                                           struct btrfs_path *path,
630                                           int refs_to_drop)
631 {
632         struct extent_buffer *leaf;
633         struct btrfs_extent_ref *ref;
634         u32 num_refs;
635         int ret = 0;
636
637         leaf = path->nodes[0];
638         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
639         num_refs = btrfs_ref_num_refs(leaf, ref);
640         BUG_ON(num_refs < refs_to_drop);
641         num_refs -= refs_to_drop;
642         if (num_refs == 0) {
643                 ret = btrfs_del_item(trans, root, path);
644         } else {
645                 btrfs_set_ref_num_refs(leaf, ref, num_refs);
646                 btrfs_mark_buffer_dirty(leaf);
647         }
648         btrfs_release_path(root, path);
649         return ret;
650 }
651
652 #ifdef BIO_RW_DISCARD
653 static void btrfs_issue_discard(struct block_device *bdev,
654                                 u64 start, u64 len)
655 {
656         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
657 }
658 #endif
659
660 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
661                                 u64 num_bytes)
662 {
663 #ifdef BIO_RW_DISCARD
664         int ret;
665         u64 map_length = num_bytes;
666         struct btrfs_multi_bio *multi = NULL;
667
668         /* Tell the block device(s) that the sectors can be discarded */
669         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
670                               bytenr, &map_length, &multi, 0);
671         if (!ret) {
672                 struct btrfs_bio_stripe *stripe = multi->stripes;
673                 int i;
674
675                 if (map_length > num_bytes)
676                         map_length = num_bytes;
677
678                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
679                         btrfs_issue_discard(stripe->dev->bdev,
680                                             stripe->physical,
681                                             map_length);
682                 }
683                 kfree(multi);
684         }
685
686         return ret;
687 #else
688         return 0;
689 #endif
690 }
691
692 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
693                                      struct btrfs_root *root, u64 bytenr,
694                                      u64 num_bytes,
695                                      u64 orig_parent, u64 parent,
696                                      u64 orig_root, u64 ref_root,
697                                      u64 orig_generation, u64 ref_generation,
698                                      u64 owner_objectid)
699 {
700         int ret;
701         int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
702
703         ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
704                                        orig_parent, parent, orig_root,
705                                        ref_root, orig_generation,
706                                        ref_generation, owner_objectid, pin);
707         BUG_ON(ret);
708         return ret;
709 }
710
711 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
712                             struct btrfs_root *root, u64 bytenr,
713                             u64 num_bytes, u64 orig_parent, u64 parent,
714                             u64 ref_root, u64 ref_generation,
715                             u64 owner_objectid)
716 {
717         int ret;
718         if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
719             owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
720                 return 0;
721
722         ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
723                                         orig_parent, parent, ref_root,
724                                         ref_root, ref_generation,
725                                         ref_generation, owner_objectid);
726         return ret;
727 }
728 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
729                                   struct btrfs_root *root, u64 bytenr,
730                                   u64 num_bytes,
731                                   u64 orig_parent, u64 parent,
732                                   u64 orig_root, u64 ref_root,
733                                   u64 orig_generation, u64 ref_generation,
734                                   u64 owner_objectid)
735 {
736         int ret;
737
738         ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
739                                     ref_generation, owner_objectid,
740                                     BTRFS_ADD_DELAYED_REF, 0);
741         BUG_ON(ret);
742         return ret;
743 }
744
745 static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
746                           struct btrfs_root *root, u64 bytenr,
747                           u64 num_bytes, u64 parent, u64 ref_root,
748                           u64 ref_generation, u64 owner_objectid,
749                           int refs_to_add)
750 {
751         struct btrfs_path *path;
752         int ret;
753         struct btrfs_key key;
754         struct extent_buffer *l;
755         struct btrfs_extent_item *item;
756         u32 refs;
757
758         path = btrfs_alloc_path();
759         if (!path)
760                 return -ENOMEM;
761
762         path->reada = 1;
763         key.objectid = bytenr;
764         key.type = BTRFS_EXTENT_ITEM_KEY;
765         key.offset = num_bytes;
766
767         /* first find the extent item and update its reference count */
768         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
769                                 path, 0, 1);
770         if (ret < 0)
771                 return ret;
772
773         if (ret > 0) {
774                 WARN_ON(1);
775                 btrfs_free_path(path);
776                 return -EIO;
777         }
778         l = path->nodes[0];
779
780         btrfs_item_key_to_cpu(l, &key, path->slots[0]);
781         if (key.objectid != bytenr) {
782                 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
783                 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
784                        (unsigned long long)bytenr,
785                        (unsigned long long)key.objectid);
786                 BUG();
787         }
788         BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
789
790         item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
791
792         refs = btrfs_extent_refs(l, item);
793         btrfs_set_extent_refs(l, item, refs + refs_to_add);
794         btrfs_mark_buffer_dirty(path->nodes[0]);
795
796         btrfs_release_path(root->fs_info->extent_root, path);
797
798         path->reada = 1;
799         /* now insert the actual backref */
800         ret = insert_extent_backref(trans, root->fs_info->extent_root,
801                                     path, bytenr, parent,
802                                     ref_root, ref_generation,
803                                     owner_objectid, refs_to_add);
804         BUG_ON(ret);
805         btrfs_free_path(path);
806         return 0;
807 }
808
809 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
810                          struct btrfs_root *root,
811                          u64 bytenr, u64 num_bytes, u64 parent,
812                          u64 ref_root, u64 ref_generation,
813                          u64 owner_objectid)
814 {
815         int ret;
816         if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
817             owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
818                 return 0;
819
820         ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
821                                      0, ref_root, 0, ref_generation,
822                                      owner_objectid);
823         return ret;
824 }
825
826 static int drop_delayed_ref(struct btrfs_trans_handle *trans,
827                                         struct btrfs_root *root,
828                                         struct btrfs_delayed_ref_node *node)
829 {
830         int ret = 0;
831         struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
832
833         BUG_ON(node->ref_mod == 0);
834         ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
835                                   node->parent, ref->root, ref->generation,
836                                   ref->owner_objectid, ref->pin, node->ref_mod);
837
838         return ret;
839 }
840
841 /* helper function to actually process a single delayed ref entry */
842 static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
843                                         struct btrfs_root *root,
844                                         struct btrfs_delayed_ref_node *node,
845                                         int insert_reserved)
846 {
847         int ret;
848         struct btrfs_delayed_ref *ref;
849
850         if (node->parent == (u64)-1) {
851                 struct btrfs_delayed_ref_head *head;
852                 /*
853                  * we've hit the end of the chain and we were supposed
854                  * to insert this extent into the tree.  But, it got
855                  * deleted before we ever needed to insert it, so all
856                  * we have to do is clean up the accounting
857                  */
858                 if (insert_reserved) {
859                         update_reserved_extents(root, node->bytenr,
860                                                 node->num_bytes, 0);
861                 }
862                 head = btrfs_delayed_node_to_head(node);
863                 mutex_unlock(&head->mutex);
864                 return 0;
865         }
866
867         ref = btrfs_delayed_node_to_ref(node);
868         if (ref->action == BTRFS_ADD_DELAYED_REF) {
869                 if (insert_reserved) {
870                         struct btrfs_key ins;
871
872                         ins.objectid = node->bytenr;
873                         ins.offset = node->num_bytes;
874                         ins.type = BTRFS_EXTENT_ITEM_KEY;
875
876                         /* record the full extent allocation */
877                         ret = __btrfs_alloc_reserved_extent(trans, root,
878                                         node->parent, ref->root,
879                                         ref->generation, ref->owner_objectid,
880                                         &ins, node->ref_mod);
881                         update_reserved_extents(root, node->bytenr,
882                                                 node->num_bytes, 0);
883                 } else {
884                         /* just add one backref */
885                         ret = add_extent_ref(trans, root, node->bytenr,
886                                      node->num_bytes,
887                                      node->parent, ref->root, ref->generation,
888                                      ref->owner_objectid, node->ref_mod);
889                 }
890                 BUG_ON(ret);
891         } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
892                 WARN_ON(insert_reserved);
893                 ret = drop_delayed_ref(trans, root, node);
894         }
895         return 0;
896 }
897
898 static noinline struct btrfs_delayed_ref_node *
899 select_delayed_ref(struct btrfs_delayed_ref_head *head)
900 {
901         struct rb_node *node;
902         struct btrfs_delayed_ref_node *ref;
903         int action = BTRFS_ADD_DELAYED_REF;
904 again:
905         /*
906          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
907          * this prevents ref count from going down to zero when
908          * there still are pending delayed ref.
909          */
910         node = rb_prev(&head->node.rb_node);
911         while (1) {
912                 if (!node)
913                         break;
914                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
915                                 rb_node);
916                 if (ref->bytenr != head->node.bytenr)
917                         break;
918                 if (btrfs_delayed_node_to_ref(ref)->action == action)
919                         return ref;
920                 node = rb_prev(node);
921         }
922         if (action == BTRFS_ADD_DELAYED_REF) {
923                 action = BTRFS_DROP_DELAYED_REF;
924                 goto again;
925         }
926         return NULL;
927 }
928
929 /*
930  * this starts processing the delayed reference count updates and
931  * extent insertions we have queued up so far.  count can be
932  * 0, which means to process everything in the tree at the start
933  * of the run (but not newly added entries), or it can be some target
934  * number you'd like to process.
935  */
936 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
937                            struct btrfs_root *root, unsigned long count)
938 {
939         struct rb_node *node;
940         struct btrfs_delayed_ref_root *delayed_refs;
941         struct btrfs_delayed_ref_node *ref;
942         struct btrfs_delayed_ref_head *locked_ref = NULL;
943         int ret;
944         int must_insert_reserved = 0;
945         int run_all = count == (unsigned long)-1;
946
947         if (root == root->fs_info->extent_root)
948                 root = root->fs_info->tree_root;
949
950         delayed_refs = &trans->transaction->delayed_refs;
951 again:
952         spin_lock(&delayed_refs->lock);
953         if (count == 0)
954                 count = delayed_refs->num_entries;
955         while (1) {
956                 if (!locked_ref) {
957                         /*
958                          * no locked ref, go find something we can
959                          * process in the rbtree.  We start at
960                          * the beginning of the tree, there may be less
961                          * lock contention if we do something smarter here.
962                          */
963                         node = rb_first(&delayed_refs->root);
964                         if (!node) {
965                                 spin_unlock(&delayed_refs->lock);
966                                 break;
967                         }
968
969                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
970                                        rb_node);
971                         ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref);
972                         if (ret) {
973                                 spin_unlock(&delayed_refs->lock);
974                                 break;
975                         }
976                 }
977
978                 /*
979                  * record the must insert reserved flag before we
980                  * drop the spin lock.
981                  */
982                 must_insert_reserved = locked_ref->must_insert_reserved;
983                 locked_ref->must_insert_reserved = 0;
984
985                 /*
986                  * locked_ref is the head node, so we have to go one
987                  * node back for any delayed ref updates
988                  */
989
990                 ref = select_delayed_ref(locked_ref);
991                 if (!ref) {
992                         /* All delayed refs have been processed, Go ahead
993                          * and send the head node to run_one_delayed_ref,
994                          * so that any accounting fixes can happen
995                          */
996                         ref = &locked_ref->node;
997                         locked_ref = NULL;
998                 }
999
1000                 ref->in_tree = 0;
1001                 rb_erase(&ref->rb_node, &delayed_refs->root);
1002                 delayed_refs->num_entries--;
1003                 spin_unlock(&delayed_refs->lock);
1004
1005                 ret = run_one_delayed_ref(trans, root, ref,
1006                                           must_insert_reserved);
1007                 BUG_ON(ret);
1008                 btrfs_put_delayed_ref(ref);
1009
1010                 /* once we lock the head ref, we have to process all the
1011                  * entries for it.  So, we might end up doing more entries
1012                  * that count was asking us to do.
1013                  */
1014                 if (count > 0)
1015                         count--;
1016
1017                 /*
1018                  * we set locked_ref to null above if we're all done
1019                  * with this bytenr
1020                  */
1021                 if (!locked_ref && count == 0)
1022                         break;
1023
1024                 spin_lock(&delayed_refs->lock);
1025         }
1026         if (run_all) {
1027                 spin_lock(&delayed_refs->lock);
1028                 node = rb_first(&delayed_refs->root);
1029                 if (!node) {
1030                         spin_unlock(&delayed_refs->lock);
1031                         goto out;
1032                 }
1033
1034                 while (node) {
1035                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
1036                                        rb_node);
1037                         if (btrfs_delayed_ref_is_head(ref)) {
1038                                 struct btrfs_delayed_ref_head *head;
1039
1040                                 head = btrfs_delayed_node_to_head(ref);
1041                                 atomic_inc(&ref->refs);
1042
1043                                 spin_unlock(&delayed_refs->lock);
1044                                 mutex_lock(&head->mutex);
1045                                 mutex_unlock(&head->mutex);
1046
1047                                 btrfs_put_delayed_ref(ref);
1048                                 goto again;
1049                         }
1050                         node = rb_next(node);
1051                 }
1052                 spin_unlock(&delayed_refs->lock);
1053                 count = (unsigned long)-1;
1054                 schedule_timeout(1);
1055                 goto again;
1056         }
1057 out:
1058         return 0;
1059 }
1060
1061 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1062                           struct btrfs_root *root, u64 objectid, u64 bytenr)
1063 {
1064         struct btrfs_root *extent_root = root->fs_info->extent_root;
1065         struct btrfs_path *path;
1066         struct extent_buffer *leaf;
1067         struct btrfs_extent_ref *ref_item;
1068         struct btrfs_key key;
1069         struct btrfs_key found_key;
1070         u64 ref_root;
1071         u64 last_snapshot;
1072         u32 nritems;
1073         int ret;
1074
1075         key.objectid = bytenr;
1076         key.offset = (u64)-1;
1077         key.type = BTRFS_EXTENT_ITEM_KEY;
1078
1079         path = btrfs_alloc_path();
1080         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1081         if (ret < 0)
1082                 goto out;
1083         BUG_ON(ret == 0);
1084
1085         ret = -ENOENT;
1086         if (path->slots[0] == 0)
1087                 goto out;
1088
1089         path->slots[0]--;
1090         leaf = path->nodes[0];
1091         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1092
1093         if (found_key.objectid != bytenr ||
1094             found_key.type != BTRFS_EXTENT_ITEM_KEY)
1095                 goto out;
1096
1097         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1098         while (1) {
1099                 leaf = path->nodes[0];
1100                 nritems = btrfs_header_nritems(leaf);
1101                 if (path->slots[0] >= nritems) {
1102                         ret = btrfs_next_leaf(extent_root, path);
1103                         if (ret < 0)
1104                                 goto out;
1105                         if (ret == 0)
1106                                 continue;
1107                         break;
1108                 }
1109                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1110                 if (found_key.objectid != bytenr)
1111                         break;
1112
1113                 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1114                         path->slots[0]++;
1115                         continue;
1116                 }
1117
1118                 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1119                                           struct btrfs_extent_ref);
1120                 ref_root = btrfs_ref_root(leaf, ref_item);
1121                 if ((ref_root != root->root_key.objectid &&
1122                      ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1123                      objectid != btrfs_ref_objectid(leaf, ref_item)) {
1124                         ret = 1;
1125                         goto out;
1126                 }
1127                 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1128                         ret = 1;
1129                         goto out;
1130                 }
1131
1132                 path->slots[0]++;
1133         }
1134         ret = 0;
1135 out:
1136         btrfs_free_path(path);
1137         return ret;
1138 }
1139
1140 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1141                     struct extent_buffer *buf, u32 nr_extents)
1142 {
1143         struct btrfs_key key;
1144         struct btrfs_file_extent_item *fi;
1145         u64 root_gen;
1146         u32 nritems;
1147         int i;
1148         int level;
1149         int ret = 0;
1150         int shared = 0;
1151
1152         if (!root->ref_cows)
1153                 return 0;
1154
1155         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1156                 shared = 0;
1157                 root_gen = root->root_key.offset;
1158         } else {
1159                 shared = 1;
1160                 root_gen = trans->transid - 1;
1161         }
1162
1163         level = btrfs_header_level(buf);
1164         nritems = btrfs_header_nritems(buf);
1165
1166         if (level == 0) {
1167                 struct btrfs_leaf_ref *ref;
1168                 struct btrfs_extent_info *info;
1169
1170                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1171                 if (!ref) {
1172                         ret = -ENOMEM;
1173                         goto out;
1174                 }
1175
1176                 ref->root_gen = root_gen;
1177                 ref->bytenr = buf->start;
1178                 ref->owner = btrfs_header_owner(buf);
1179                 ref->generation = btrfs_header_generation(buf);
1180                 ref->nritems = nr_extents;
1181                 info = ref->extents;
1182
1183                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1184                         u64 disk_bytenr;
1185                         btrfs_item_key_to_cpu(buf, &key, i);
1186                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1187                                 continue;
1188                         fi = btrfs_item_ptr(buf, i,
1189                                             struct btrfs_file_extent_item);
1190                         if (btrfs_file_extent_type(buf, fi) ==
1191                             BTRFS_FILE_EXTENT_INLINE)
1192                                 continue;
1193                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1194                         if (disk_bytenr == 0)
1195                                 continue;
1196
1197                         info->bytenr = disk_bytenr;
1198                         info->num_bytes =
1199                                 btrfs_file_extent_disk_num_bytes(buf, fi);
1200                         info->objectid = key.objectid;
1201                         info->offset = key.offset;
1202                         info++;
1203                 }
1204
1205                 ret = btrfs_add_leaf_ref(root, ref, shared);
1206                 if (ret == -EEXIST && shared) {
1207                         struct btrfs_leaf_ref *old;
1208                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1209                         BUG_ON(!old);
1210                         btrfs_remove_leaf_ref(root, old);
1211                         btrfs_free_leaf_ref(root, old);
1212                         ret = btrfs_add_leaf_ref(root, ref, shared);
1213                 }
1214                 WARN_ON(ret);
1215                 btrfs_free_leaf_ref(root, ref);
1216         }
1217 out:
1218         return ret;
1219 }
1220
1221 /* when a block goes through cow, we update the reference counts of
1222  * everything that block points to.  The internal pointers of the block
1223  * can be in just about any order, and it is likely to have clusters of
1224  * things that are close together and clusters of things that are not.
1225  *
1226  * To help reduce the seeks that come with updating all of these reference
1227  * counts, sort them by byte number before actual updates are done.
1228  *
1229  * struct refsort is used to match byte number to slot in the btree block.
1230  * we sort based on the byte number and then use the slot to actually
1231  * find the item.
1232  *
1233  * struct refsort is smaller than strcut btrfs_item and smaller than
1234  * struct btrfs_key_ptr.  Since we're currently limited to the page size
1235  * for a btree block, there's no way for a kmalloc of refsorts for a
1236  * single node to be bigger than a page.
1237  */
1238 struct refsort {
1239         u64 bytenr;
1240         u32 slot;
1241 };
1242
1243 /*
1244  * for passing into sort()
1245  */
1246 static int refsort_cmp(const void *a_void, const void *b_void)
1247 {
1248         const struct refsort *a = a_void;
1249         const struct refsort *b = b_void;
1250
1251         if (a->bytenr < b->bytenr)
1252                 return -1;
1253         if (a->bytenr > b->bytenr)
1254                 return 1;
1255         return 0;
1256 }
1257
1258
1259 noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1260                            struct btrfs_root *root,
1261                            struct extent_buffer *orig_buf,
1262                            struct extent_buffer *buf, u32 *nr_extents)
1263 {
1264         u64 bytenr;
1265         u64 ref_root;
1266         u64 orig_root;
1267         u64 ref_generation;
1268         u64 orig_generation;
1269         struct refsort *sorted;
1270         u32 nritems;
1271         u32 nr_file_extents = 0;
1272         struct btrfs_key key;
1273         struct btrfs_file_extent_item *fi;
1274         int i;
1275         int level;
1276         int ret = 0;
1277         int faili = 0;
1278         int refi = 0;
1279         int slot;
1280         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1281                             u64, u64, u64, u64, u64, u64, u64, u64, u64);
1282
1283         ref_root = btrfs_header_owner(buf);
1284         ref_generation = btrfs_header_generation(buf);
1285         orig_root = btrfs_header_owner(orig_buf);
1286         orig_generation = btrfs_header_generation(orig_buf);
1287
1288         nritems = btrfs_header_nritems(buf);
1289         level = btrfs_header_level(buf);
1290
1291         sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1292         BUG_ON(!sorted);
1293
1294         if (root->ref_cows) {
1295                 process_func = __btrfs_inc_extent_ref;
1296         } else {
1297                 if (level == 0 &&
1298                     root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1299                         goto out;
1300                 if (level != 0 &&
1301                     root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1302                         goto out;
1303                 process_func = __btrfs_update_extent_ref;
1304         }
1305
1306         /*
1307          * we make two passes through the items.  In the first pass we
1308          * only record the byte number and slot.  Then we sort based on
1309          * byte number and do the actual work based on the sorted results
1310          */
1311         for (i = 0; i < nritems; i++) {
1312                 cond_resched();
1313                 if (level == 0) {
1314                         btrfs_item_key_to_cpu(buf, &key, i);
1315                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1316                                 continue;
1317                         fi = btrfs_item_ptr(buf, i,
1318                                             struct btrfs_file_extent_item);
1319                         if (btrfs_file_extent_type(buf, fi) ==
1320                             BTRFS_FILE_EXTENT_INLINE)
1321                                 continue;
1322                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1323                         if (bytenr == 0)
1324                                 continue;
1325
1326                         nr_file_extents++;
1327                         sorted[refi].bytenr = bytenr;
1328                         sorted[refi].slot = i;
1329                         refi++;
1330                 } else {
1331                         bytenr = btrfs_node_blockptr(buf, i);
1332                         sorted[refi].bytenr = bytenr;
1333                         sorted[refi].slot = i;
1334                         refi++;
1335                 }
1336         }
1337         /*
1338          * if refi == 0, we didn't actually put anything into the sorted
1339          * array and we're done
1340          */
1341         if (refi == 0)
1342                 goto out;
1343
1344         sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1345
1346         for (i = 0; i < refi; i++) {
1347                 cond_resched();
1348                 slot = sorted[i].slot;
1349                 bytenr = sorted[i].bytenr;
1350
1351                 if (level == 0) {
1352                         btrfs_item_key_to_cpu(buf, &key, slot);
1353                         fi = btrfs_item_ptr(buf, slot,
1354                                             struct btrfs_file_extent_item);
1355
1356                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1357                         if (bytenr == 0)
1358                                 continue;
1359
1360                         ret = process_func(trans, root, bytenr,
1361                                    btrfs_file_extent_disk_num_bytes(buf, fi),
1362                                    orig_buf->start, buf->start,
1363                                    orig_root, ref_root,
1364                                    orig_generation, ref_generation,
1365                                    key.objectid);
1366
1367                         if (ret) {
1368                                 faili = slot;
1369                                 WARN_ON(1);
1370                                 goto fail;
1371                         }
1372                 } else {
1373                         ret = process_func(trans, root, bytenr, buf->len,
1374                                            orig_buf->start, buf->start,
1375                                            orig_root, ref_root,
1376                                            orig_generation, ref_generation,
1377                                            level - 1);
1378                         if (ret) {
1379                                 faili = slot;
1380                                 WARN_ON(1);
1381                                 goto fail;
1382                         }
1383                 }
1384         }
1385 out:
1386         kfree(sorted);
1387         if (nr_extents) {
1388                 if (level == 0)
1389                         *nr_extents = nr_file_extents;
1390                 else
1391                         *nr_extents = nritems;
1392         }
1393         return 0;
1394 fail:
1395         kfree(sorted);
1396         WARN_ON(1);
1397         return ret;
1398 }
1399
1400 int btrfs_update_ref(struct btrfs_trans_handle *trans,
1401                      struct btrfs_root *root, struct extent_buffer *orig_buf,
1402                      struct extent_buffer *buf, int start_slot, int nr)
1403
1404 {
1405         u64 bytenr;
1406         u64 ref_root;
1407         u64 orig_root;
1408         u64 ref_generation;
1409         u64 orig_generation;
1410         struct btrfs_key key;
1411         struct btrfs_file_extent_item *fi;
1412         int i;
1413         int ret;
1414         int slot;
1415         int level;
1416
1417         BUG_ON(start_slot < 0);
1418         BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1419
1420         ref_root = btrfs_header_owner(buf);
1421         ref_generation = btrfs_header_generation(buf);
1422         orig_root = btrfs_header_owner(orig_buf);
1423         orig_generation = btrfs_header_generation(orig_buf);
1424         level = btrfs_header_level(buf);
1425
1426         if (!root->ref_cows) {
1427                 if (level == 0 &&
1428                     root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1429                         return 0;
1430                 if (level != 0 &&
1431                     root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1432                         return 0;
1433         }
1434
1435         for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1436                 cond_resched();
1437                 if (level == 0) {
1438                         btrfs_item_key_to_cpu(buf, &key, slot);
1439                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1440                                 continue;
1441                         fi = btrfs_item_ptr(buf, slot,
1442                                             struct btrfs_file_extent_item);
1443                         if (btrfs_file_extent_type(buf, fi) ==
1444                             BTRFS_FILE_EXTENT_INLINE)
1445                                 continue;
1446                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1447                         if (bytenr == 0)
1448                                 continue;
1449                         ret = __btrfs_update_extent_ref(trans, root, bytenr,
1450                                     btrfs_file_extent_disk_num_bytes(buf, fi),
1451                                     orig_buf->start, buf->start,
1452                                     orig_root, ref_root, orig_generation,
1453                                     ref_generation, key.objectid);
1454                         if (ret)
1455                                 goto fail;
1456                 } else {
1457                         bytenr = btrfs_node_blockptr(buf, slot);
1458                         ret = __btrfs_update_extent_ref(trans, root, bytenr,
1459                                             buf->len, orig_buf->start,
1460                                             buf->start, orig_root, ref_root,
1461                                             orig_generation, ref_generation,
1462                                             level - 1);
1463                         if (ret)
1464                                 goto fail;
1465                 }
1466         }
1467         return 0;
1468 fail:
1469         WARN_ON(1);
1470         return -1;
1471 }
1472
1473 static int write_one_cache_group(struct btrfs_trans_handle *trans,
1474                                  struct btrfs_root *root,
1475                                  struct btrfs_path *path,
1476                                  struct btrfs_block_group_cache *cache)
1477 {
1478         int ret;
1479         struct btrfs_root *extent_root = root->fs_info->extent_root;
1480         unsigned long bi;
1481         struct extent_buffer *leaf;
1482
1483         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1484         if (ret < 0)
1485                 goto fail;
1486         BUG_ON(ret);
1487
1488         leaf = path->nodes[0];
1489         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1490         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1491         btrfs_mark_buffer_dirty(leaf);
1492         btrfs_release_path(extent_root, path);
1493 fail:
1494         if (ret)
1495                 return ret;
1496         return 0;
1497
1498 }
1499
1500 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1501                                    struct btrfs_root *root)
1502 {
1503         struct btrfs_block_group_cache *cache, *entry;
1504         struct rb_node *n;
1505         int err = 0;
1506         int werr = 0;
1507         struct btrfs_path *path;
1508         u64 last = 0;
1509
1510         path = btrfs_alloc_path();
1511         if (!path)
1512                 return -ENOMEM;
1513
1514         while (1) {
1515                 cache = NULL;
1516                 spin_lock(&root->fs_info->block_group_cache_lock);
1517                 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1518                      n; n = rb_next(n)) {
1519                         entry = rb_entry(n, struct btrfs_block_group_cache,
1520                                          cache_node);
1521                         if (entry->dirty) {
1522                                 cache = entry;
1523                                 break;
1524                         }
1525                 }
1526                 spin_unlock(&root->fs_info->block_group_cache_lock);
1527
1528                 if (!cache)
1529                         break;
1530
1531                 cache->dirty = 0;
1532                 last += cache->key.offset;
1533
1534                 err = write_one_cache_group(trans, root,
1535                                             path, cache);
1536                 /*
1537                  * if we fail to write the cache group, we want
1538                  * to keep it marked dirty in hopes that a later
1539                  * write will work
1540                  */
1541                 if (err) {
1542                         werr = err;
1543                         continue;
1544                 }
1545         }
1546         btrfs_free_path(path);
1547         return werr;
1548 }
1549
1550 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1551 {
1552         struct btrfs_block_group_cache *block_group;
1553         int readonly = 0;
1554
1555         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1556         if (!block_group || block_group->ro)
1557                 readonly = 1;
1558         if (block_group)
1559                 put_block_group(block_group);
1560         return readonly;
1561 }
1562
1563 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1564                              u64 total_bytes, u64 bytes_used,
1565                              struct btrfs_space_info **space_info)
1566 {
1567         struct btrfs_space_info *found;
1568
1569         found = __find_space_info(info, flags);
1570         if (found) {
1571                 spin_lock(&found->lock);
1572                 found->total_bytes += total_bytes;
1573                 found->bytes_used += bytes_used;
1574                 found->full = 0;
1575                 spin_unlock(&found->lock);
1576                 *space_info = found;
1577                 return 0;
1578         }
1579         found = kzalloc(sizeof(*found), GFP_NOFS);
1580         if (!found)
1581                 return -ENOMEM;
1582
1583         INIT_LIST_HEAD(&found->block_groups);
1584         init_rwsem(&found->groups_sem);
1585         spin_lock_init(&found->lock);
1586         found->flags = flags;
1587         found->total_bytes = total_bytes;
1588         found->bytes_used = bytes_used;
1589         found->bytes_pinned = 0;
1590         found->bytes_reserved = 0;
1591         found->bytes_readonly = 0;
1592         found->bytes_delalloc = 0;
1593         found->full = 0;
1594         found->force_alloc = 0;
1595         *space_info = found;
1596         list_add_rcu(&found->list, &info->space_info);
1597         return 0;
1598 }
1599
1600 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1601 {
1602         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1603                                    BTRFS_BLOCK_GROUP_RAID1 |
1604                                    BTRFS_BLOCK_GROUP_RAID10 |
1605                                    BTRFS_BLOCK_GROUP_DUP);
1606         if (extra_flags) {
1607                 if (flags & BTRFS_BLOCK_GROUP_DATA)
1608                         fs_info->avail_data_alloc_bits |= extra_flags;
1609                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1610                         fs_info->avail_metadata_alloc_bits |= extra_flags;
1611                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1612                         fs_info->avail_system_alloc_bits |= extra_flags;
1613         }
1614 }
1615
1616 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1617 {
1618         spin_lock(&cache->space_info->lock);
1619         spin_lock(&cache->lock);
1620         if (!cache->ro) {
1621                 cache->space_info->bytes_readonly += cache->key.offset -
1622                                         btrfs_block_group_used(&cache->item);
1623                 cache->ro = 1;
1624         }
1625         spin_unlock(&cache->lock);
1626         spin_unlock(&cache->space_info->lock);
1627 }
1628
1629 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1630 {
1631         u64 num_devices = root->fs_info->fs_devices->rw_devices;
1632
1633         if (num_devices == 1)
1634                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1635         if (num_devices < 4)
1636                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1637
1638         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1639             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1640                       BTRFS_BLOCK_GROUP_RAID10))) {
1641                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1642         }
1643
1644         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1645             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1646                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1647         }
1648
1649         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1650             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1651              (flags & BTRFS_BLOCK_GROUP_RAID10) |
1652              (flags & BTRFS_BLOCK_GROUP_DUP)))
1653                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1654         return flags;
1655 }
1656
1657 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
1658 {
1659         struct btrfs_fs_info *info = root->fs_info;
1660         u64 alloc_profile;
1661
1662         if (data) {
1663                 alloc_profile = info->avail_data_alloc_bits &
1664                         info->data_alloc_profile;
1665                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1666         } else if (root == root->fs_info->chunk_root) {
1667                 alloc_profile = info->avail_system_alloc_bits &
1668                         info->system_alloc_profile;
1669                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1670         } else {
1671                 alloc_profile = info->avail_metadata_alloc_bits &
1672                         info->metadata_alloc_profile;
1673                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1674         }
1675
1676         return btrfs_reduce_alloc_profile(root, data);
1677 }
1678
1679 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
1680 {
1681         u64 alloc_target;
1682
1683         alloc_target = btrfs_get_alloc_profile(root, 1);
1684         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
1685                                                        alloc_target);
1686 }
1687
1688 /*
1689  * for now this just makes sure we have at least 5% of our metadata space free
1690  * for use.
1691  */
1692 int btrfs_check_metadata_free_space(struct btrfs_root *root)
1693 {
1694         struct btrfs_fs_info *info = root->fs_info;
1695         struct btrfs_space_info *meta_sinfo;
1696         u64 alloc_target, thresh;
1697         int committed = 0, ret;
1698
1699         /* get the space info for where the metadata will live */
1700         alloc_target = btrfs_get_alloc_profile(root, 0);
1701         meta_sinfo = __find_space_info(info, alloc_target);
1702
1703 again:
1704         spin_lock(&meta_sinfo->lock);
1705         if (!meta_sinfo->full)
1706                 thresh = meta_sinfo->total_bytes * 80;
1707         else
1708                 thresh = meta_sinfo->total_bytes * 95;
1709
1710         do_div(thresh, 100);
1711
1712         if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
1713             meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
1714                 struct btrfs_trans_handle *trans;
1715                 if (!meta_sinfo->full) {
1716                         meta_sinfo->force_alloc = 1;
1717                         spin_unlock(&meta_sinfo->lock);
1718
1719                         trans = btrfs_start_transaction(root, 1);
1720                         if (!trans)
1721                                 return -ENOMEM;
1722
1723                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1724                                              2 * 1024 * 1024, alloc_target, 0);
1725                         btrfs_end_transaction(trans, root);
1726                         goto again;
1727                 }
1728                 spin_unlock(&meta_sinfo->lock);
1729
1730                 if (!committed) {
1731                         committed = 1;
1732                         trans = btrfs_join_transaction(root, 1);
1733                         if (!trans)
1734                                 return -ENOMEM;
1735                         ret = btrfs_commit_transaction(trans, root);
1736                         if (ret)
1737                                 return ret;
1738                         goto again;
1739                 }
1740                 return -ENOSPC;
1741         }
1742         spin_unlock(&meta_sinfo->lock);
1743
1744         return 0;
1745 }
1746
1747 /*
1748  * This will check the space that the inode allocates from to make sure we have
1749  * enough space for bytes.
1750  */
1751 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
1752                                 u64 bytes)
1753 {
1754         struct btrfs_space_info *data_sinfo;
1755         int ret = 0, committed = 0;
1756
1757         /* make sure bytes are sectorsize aligned */
1758         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
1759
1760         data_sinfo = BTRFS_I(inode)->space_info;
1761 again:
1762         /* make sure we have enough space to handle the data first */
1763         spin_lock(&data_sinfo->lock);
1764         if (data_sinfo->total_bytes - data_sinfo->bytes_used -
1765             data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
1766             data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
1767             data_sinfo->bytes_may_use < bytes) {
1768                 struct btrfs_trans_handle *trans;
1769
1770                 /*
1771                  * if we don't have enough free bytes in this space then we need
1772                  * to alloc a new chunk.
1773                  */
1774                 if (!data_sinfo->full) {
1775                         u64 alloc_target;
1776
1777                         data_sinfo->force_alloc = 1;
1778                         spin_unlock(&data_sinfo->lock);
1779
1780                         alloc_target = btrfs_get_alloc_profile(root, 1);
1781                         trans = btrfs_start_transaction(root, 1);
1782                         if (!trans)
1783                                 return -ENOMEM;
1784
1785                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
1786                                              bytes + 2 * 1024 * 1024,
1787                                              alloc_target, 0);
1788                         btrfs_end_transaction(trans, root);
1789                         if (ret)
1790                                 return ret;
1791                         goto again;
1792                 }
1793                 spin_unlock(&data_sinfo->lock);
1794
1795                 /* commit the current transaction and try again */
1796                 if (!committed) {
1797                         committed = 1;
1798                         trans = btrfs_join_transaction(root, 1);
1799                         if (!trans)
1800                                 return -ENOMEM;
1801                         ret = btrfs_commit_transaction(trans, root);
1802                         if (ret)
1803                                 return ret;
1804                         goto again;
1805                 }
1806
1807                 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
1808                        ", %llu bytes_used, %llu bytes_reserved, "
1809                        "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
1810                        "%llu total\n", bytes, data_sinfo->bytes_delalloc,
1811                        data_sinfo->bytes_used, data_sinfo->bytes_reserved,
1812                        data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
1813                        data_sinfo->bytes_may_use, data_sinfo->total_bytes);
1814                 return -ENOSPC;
1815         }
1816         data_sinfo->bytes_may_use += bytes;
1817         BTRFS_I(inode)->reserved_bytes += bytes;
1818         spin_unlock(&data_sinfo->lock);
1819
1820         return btrfs_check_metadata_free_space(root);
1821 }
1822
1823 /*
1824  * if there was an error for whatever reason after calling
1825  * btrfs_check_data_free_space, call this so we can cleanup the counters.
1826  */
1827 void btrfs_free_reserved_data_space(struct btrfs_root *root,
1828                                     struct inode *inode, u64 bytes)
1829 {
1830         struct btrfs_space_info *data_sinfo;
1831
1832         /* make sure bytes are sectorsize aligned */
1833         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
1834
1835         data_sinfo = BTRFS_I(inode)->space_info;
1836         spin_lock(&data_sinfo->lock);
1837         data_sinfo->bytes_may_use -= bytes;
1838         BTRFS_I(inode)->reserved_bytes -= bytes;
1839         spin_unlock(&data_sinfo->lock);
1840 }
1841
1842 /* called when we are adding a delalloc extent to the inode's io_tree */
1843 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1844                                   u64 bytes)
1845 {
1846         struct btrfs_space_info *data_sinfo;
1847
1848         /* get the space info for where this inode will be storing its data */
1849         data_sinfo = BTRFS_I(inode)->space_info;
1850
1851         /* make sure we have enough space to handle the data first */
1852         spin_lock(&data_sinfo->lock);
1853         data_sinfo->bytes_delalloc += bytes;
1854
1855         /*
1856          * we are adding a delalloc extent without calling
1857          * btrfs_check_data_free_space first.  This happens on a weird
1858          * writepage condition, but shouldn't hurt our accounting
1859          */
1860         if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
1861                 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
1862                 BTRFS_I(inode)->reserved_bytes = 0;
1863         } else {
1864                 data_sinfo->bytes_may_use -= bytes;
1865                 BTRFS_I(inode)->reserved_bytes -= bytes;
1866         }
1867
1868         spin_unlock(&data_sinfo->lock);
1869 }
1870
1871 /* called when we are clearing an delalloc extent from the inode's io_tree */
1872 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1873                               u64 bytes)
1874 {
1875         struct btrfs_space_info *info;
1876
1877         info = BTRFS_I(inode)->space_info;
1878
1879         spin_lock(&info->lock);
1880         info->bytes_delalloc -= bytes;
1881         spin_unlock(&info->lock);
1882 }
1883
1884 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885                           struct btrfs_root *extent_root, u64 alloc_bytes,
1886                           u64 flags, int force)
1887 {
1888         struct btrfs_space_info *space_info;
1889         u64 thresh;
1890         int ret = 0;
1891
1892         mutex_lock(&extent_root->fs_info->chunk_mutex);
1893
1894         flags = btrfs_reduce_alloc_profile(extent_root, flags);
1895
1896         space_info = __find_space_info(extent_root->fs_info, flags);
1897         if (!space_info) {
1898                 ret = update_space_info(extent_root->fs_info, flags,
1899                                         0, 0, &space_info);
1900                 BUG_ON(ret);
1901         }
1902         BUG_ON(!space_info);
1903
1904         spin_lock(&space_info->lock);
1905         if (space_info->force_alloc) {
1906                 force = 1;
1907                 space_info->force_alloc = 0;
1908         }
1909         if (space_info->full) {
1910                 spin_unlock(&space_info->lock);
1911                 goto out;
1912         }
1913
1914         thresh = space_info->total_bytes - space_info->bytes_readonly;
1915         thresh = div_factor(thresh, 6);
1916         if (!force &&
1917            (space_info->bytes_used + space_info->bytes_pinned +
1918             space_info->bytes_reserved + alloc_bytes) < thresh) {
1919                 spin_unlock(&space_info->lock);
1920                 goto out;
1921         }
1922         spin_unlock(&space_info->lock);
1923
1924         ret = btrfs_alloc_chunk(trans, extent_root, flags);
1925         if (ret)
1926                 space_info->full = 1;
1927 out:
1928         mutex_unlock(&extent_root->fs_info->chunk_mutex);
1929         return ret;
1930 }
1931
1932 static int update_block_group(struct btrfs_trans_handle *trans,
1933                               struct btrfs_root *root,
1934                               u64 bytenr, u64 num_bytes, int alloc,
1935                               int mark_free)
1936 {
1937         struct btrfs_block_group_cache *cache;
1938         struct btrfs_fs_info *info = root->fs_info;
1939         u64 total = num_bytes;
1940         u64 old_val;
1941         u64 byte_in_group;
1942
1943         while (total) {
1944                 cache = btrfs_lookup_block_group(info, bytenr);
1945                 if (!cache)
1946                         return -1;
1947                 byte_in_group = bytenr - cache->key.objectid;
1948                 WARN_ON(byte_in_group > cache->key.offset);
1949
1950                 spin_lock(&cache->space_info->lock);
1951                 spin_lock(&cache->lock);
1952                 cache->dirty = 1;
1953                 old_val = btrfs_block_group_used(&cache->item);
1954                 num_bytes = min(total, cache->key.offset - byte_in_group);
1955                 if (alloc) {
1956                         old_val += num_bytes;
1957                         cache->space_info->bytes_used += num_bytes;
1958                         if (cache->ro)
1959                                 cache->space_info->bytes_readonly -= num_bytes;
1960                         btrfs_set_block_group_used(&cache->item, old_val);
1961                         spin_unlock(&cache->lock);
1962                         spin_unlock(&cache->space_info->lock);
1963                 } else {
1964                         old_val -= num_bytes;
1965                         cache->space_info->bytes_used -= num_bytes;
1966                         if (cache->ro)
1967                                 cache->space_info->bytes_readonly += num_bytes;
1968                         btrfs_set_block_group_used(&cache->item, old_val);
1969                         spin_unlock(&cache->lock);
1970                         spin_unlock(&cache->space_info->lock);
1971                         if (mark_free) {
1972                                 int ret;
1973
1974                                 ret = btrfs_discard_extent(root, bytenr,
1975                                                            num_bytes);
1976                                 WARN_ON(ret);
1977
1978                                 ret = btrfs_add_free_space(cache, bytenr,
1979                                                            num_bytes);
1980                                 WARN_ON(ret);
1981                         }
1982                 }
1983                 put_block_group(cache);
1984                 total -= num_bytes;
1985                 bytenr += num_bytes;
1986         }
1987         return 0;
1988 }
1989
1990 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1991 {
1992         struct btrfs_block_group_cache *cache;
1993         u64 bytenr;
1994
1995         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1996         if (!cache)
1997                 return 0;
1998
1999         bytenr = cache->key.objectid;
2000         put_block_group(cache);
2001
2002         return bytenr;
2003 }
2004
2005 int btrfs_update_pinned_extents(struct btrfs_root *root,
2006                                 u64 bytenr, u64 num, int pin)
2007 {
2008         u64 len;
2009         struct btrfs_block_group_cache *cache;
2010         struct btrfs_fs_info *fs_info = root->fs_info;
2011
2012         WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2013         if (pin) {
2014                 set_extent_dirty(&fs_info->pinned_extents,
2015                                 bytenr, bytenr + num - 1, GFP_NOFS);
2016         } else {
2017                 clear_extent_dirty(&fs_info->pinned_extents,
2018                                 bytenr, bytenr + num - 1, GFP_NOFS);
2019         }
2020         while (num > 0) {
2021                 cache = btrfs_lookup_block_group(fs_info, bytenr);
2022                 BUG_ON(!cache);
2023                 len = min(num, cache->key.offset -
2024                           (bytenr - cache->key.objectid));
2025                 if (pin) {
2026                         spin_lock(&cache->space_info->lock);
2027                         spin_lock(&cache->lock);
2028                         cache->pinned += len;
2029                         cache->space_info->bytes_pinned += len;
2030                         spin_unlock(&cache->lock);
2031                         spin_unlock(&cache->space_info->lock);
2032                         fs_info->total_pinned += len;
2033                 } else {
2034                         spin_lock(&cache->space_info->lock);
2035                         spin_lock(&cache->lock);
2036                         cache->pinned -= len;
2037                         cache->space_info->bytes_pinned -= len;
2038                         spin_unlock(&cache->lock);
2039                         spin_unlock(&cache->space_info->lock);
2040                         fs_info->total_pinned -= len;
2041                         if (cache->cached)
2042                                 btrfs_add_free_space(cache, bytenr, len);
2043                 }
2044                 put_block_group(cache);
2045                 bytenr += len;
2046                 num -= len;
2047         }
2048         return 0;
2049 }
2050
2051 static int update_reserved_extents(struct btrfs_root *root,
2052                                    u64 bytenr, u64 num, int reserve)
2053 {
2054         u64 len;
2055         struct btrfs_block_group_cache *cache;
2056         struct btrfs_fs_info *fs_info = root->fs_info;
2057
2058         while (num > 0) {
2059                 cache = btrfs_lookup_block_group(fs_info, bytenr);
2060                 BUG_ON(!cache);
2061                 len = min(num, cache->key.offset -
2062                           (bytenr - cache->key.objectid));
2063
2064                 spin_lock(&cache->space_info->lock);
2065                 spin_lock(&cache->lock);
2066                 if (reserve) {
2067                         cache->reserved += len;
2068                         cache->space_info->bytes_reserved += len;
2069                 } else {
2070                         cache->reserved -= len;
2071                         cache->space_info->bytes_reserved -= len;
2072                 }
2073                 spin_unlock(&cache->lock);
2074                 spin_unlock(&cache->space_info->lock);
2075                 put_block_group(cache);
2076                 bytenr += len;
2077                 num -= len;
2078         }
2079         return 0;
2080 }
2081
2082 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2083 {
2084         u64 last = 0;
2085         u64 start;
2086         u64 end;
2087         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2088         int ret;
2089
2090         mutex_lock(&root->fs_info->pinned_mutex);
2091         while (1) {
2092                 ret = find_first_extent_bit(pinned_extents, last,
2093                                             &start, &end, EXTENT_DIRTY);
2094                 if (ret)
2095                         break;
2096                 set_extent_dirty(copy, start, end, GFP_NOFS);
2097                 last = end + 1;
2098         }
2099         mutex_unlock(&root->fs_info->pinned_mutex);
2100         return 0;
2101 }
2102
2103 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2104                                struct btrfs_root *root,
2105                                struct extent_io_tree *unpin)
2106 {
2107         u64 start;
2108         u64 end;
2109         int ret;
2110
2111         mutex_lock(&root->fs_info->pinned_mutex);
2112         while (1) {
2113                 ret = find_first_extent_bit(unpin, 0, &start, &end,
2114                                             EXTENT_DIRTY);
2115                 if (ret)
2116                         break;
2117
2118                 ret = btrfs_discard_extent(root, start, end + 1 - start);
2119
2120                 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2121                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2122
2123                 if (need_resched()) {
2124                         mutex_unlock(&root->fs_info->pinned_mutex);
2125                         cond_resched();
2126                         mutex_lock(&root->fs_info->pinned_mutex);
2127                 }
2128         }
2129         mutex_unlock(&root->fs_info->pinned_mutex);
2130         return ret;
2131 }
2132
2133 static int pin_down_bytes(struct btrfs_trans_handle *trans,
2134                           struct btrfs_root *root,
2135                           u64 bytenr, u64 num_bytes, int is_data)
2136 {
2137         int err = 0;
2138         struct extent_buffer *buf;
2139
2140         if (is_data)
2141                 goto pinit;
2142
2143         buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2144         if (!buf)
2145                 goto pinit;
2146
2147         /* we can reuse a block if it hasn't been written
2148          * and it is from this transaction.  We can't
2149          * reuse anything from the tree log root because
2150          * it has tiny sub-transactions.
2151          */
2152         if (btrfs_buffer_uptodate(buf, 0) &&
2153             btrfs_try_tree_lock(buf)) {
2154                 u64 header_owner = btrfs_header_owner(buf);
2155                 u64 header_transid = btrfs_header_generation(buf);
2156                 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2157                     header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2158                     header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2159                     header_transid == trans->transid &&
2160                     !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2161                         clean_tree_block(NULL, root, buf);
2162                         btrfs_tree_unlock(buf);
2163                         free_extent_buffer(buf);
2164                         return 1;
2165                 }
2166                 btrfs_tree_unlock(buf);
2167         }
2168         free_extent_buffer(buf);
2169 pinit:
2170         btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2171
2172         BUG_ON(err < 0);
2173         return 0;
2174 }
2175
2176 /*
2177  * remove an extent from the root, returns 0 on success
2178  */
2179 static int __free_extent(struct btrfs_trans_handle *trans,
2180                          struct btrfs_root *root,
2181                          u64 bytenr, u64 num_bytes, u64 parent,
2182                          u64 root_objectid, u64 ref_generation,
2183                          u64 owner_objectid, int pin, int mark_free,
2184                          int refs_to_drop)
2185 {
2186         struct btrfs_path *path;
2187         struct btrfs_key key;
2188         struct btrfs_fs_info *info = root->fs_info;
2189         struct btrfs_root *extent_root = info->extent_root;
2190         struct extent_buffer *leaf;
2191         int ret;
2192         int extent_slot = 0;
2193         int found_extent = 0;
2194         int num_to_del = 1;
2195         struct btrfs_extent_item *ei;
2196         u32 refs;
2197
2198         key.objectid = bytenr;
2199         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2200         key.offset = num_bytes;
2201         path = btrfs_alloc_path();
2202         if (!path)
2203                 return -ENOMEM;
2204
2205         path->reada = 1;
2206         ret = lookup_extent_backref(trans, extent_root, path,
2207                                     bytenr, parent, root_objectid,
2208                                     ref_generation, owner_objectid, 1);
2209         if (ret == 0) {
2210                 struct btrfs_key found_key;
2211                 extent_slot = path->slots[0];
2212                 while (extent_slot > 0) {
2213                         extent_slot--;
2214                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2215                                               extent_slot);
2216                         if (found_key.objectid != bytenr)
2217                                 break;
2218                         if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2219                             found_key.offset == num_bytes) {
2220                                 found_extent = 1;
2221                                 break;
2222                         }
2223                         if (path->slots[0] - extent_slot > 5)
2224                                 break;
2225                 }
2226                 if (!found_extent) {
2227                         ret = remove_extent_backref(trans, extent_root, path,
2228                                                     refs_to_drop);
2229                         BUG_ON(ret);
2230                         btrfs_release_path(extent_root, path);
2231                         ret = btrfs_search_slot(trans, extent_root,
2232                                                 &key, path, -1, 1);
2233                         if (ret) {
2234                                 printk(KERN_ERR "umm, got %d back from search"
2235                                        ", was looking for %llu\n", ret,
2236                                        (unsigned long long)bytenr);
2237                                 btrfs_print_leaf(extent_root, path->nodes[0]);
2238                         }
2239                         BUG_ON(ret);
2240                         extent_slot = path->slots[0];
2241                 }
2242         } else {
2243                 btrfs_print_leaf(extent_root, path->nodes[0]);
2244                 WARN_ON(1);
2245                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2246                        "parent %llu root %llu gen %llu owner %llu\n",
2247                        (unsigned long long)bytenr,
2248                        (unsigned long long)parent,
2249                        (unsigned long long)root_objectid,
2250                        (unsigned long long)ref_generation,
2251                        (unsigned long long)owner_objectid);
2252         }
2253
2254         leaf = path->nodes[0];
2255         ei = btrfs_item_ptr(leaf, extent_slot,
2256                             struct btrfs_extent_item);
2257         refs = btrfs_extent_refs(leaf, ei);
2258
2259         /*
2260          * we're not allowed to delete the extent item if there
2261          * are other delayed ref updates pending
2262          */
2263
2264         BUG_ON(refs < refs_to_drop);
2265         refs -= refs_to_drop;
2266         btrfs_set_extent_refs(leaf, ei, refs);
2267         btrfs_mark_buffer_dirty(leaf);
2268
2269         if (refs == 0 && found_extent &&
2270             path->slots[0] == extent_slot + 1) {
2271                 struct btrfs_extent_ref *ref;
2272                 ref = btrfs_item_ptr(leaf, path->slots[0],
2273                                      struct btrfs_extent_ref);
2274                 BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
2275                 /* if the back ref and the extent are next to each other
2276                  * they get deleted below in one shot
2277                  */
2278                 path->slots[0] = extent_slot;
2279                 num_to_del = 2;
2280         } else if (found_extent) {
2281                 /* otherwise delete the extent back ref */
2282                 ret = remove_extent_backref(trans, extent_root, path,
2283                                             refs_to_drop);
2284                 BUG_ON(ret);
2285                 /* if refs are 0, we need to setup the path for deletion */
2286                 if (refs == 0) {
2287                         btrfs_release_path(extent_root, path);
2288                         ret = btrfs_search_slot(trans, extent_root, &key, path,
2289                                                 -1, 1);
2290                         BUG_ON(ret);
2291                 }
2292         }
2293
2294         if (refs == 0) {
2295                 u64 super_used;
2296                 u64 root_used;
2297
2298                 if (pin) {
2299                         mutex_lock(&root->fs_info->pinned_mutex);
2300                         ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2301                                 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2302                         mutex_unlock(&root->fs_info->pinned_mutex);
2303                         if (ret > 0)
2304                                 mark_free = 1;
2305                         BUG_ON(ret < 0);
2306                 }
2307                 /* block accounting for super block */
2308                 spin_lock(&info->delalloc_lock);
2309                 super_used = btrfs_super_bytes_used(&info->super_copy);
2310                 btrfs_set_super_bytes_used(&info->super_copy,
2311                                            super_used - num_bytes);
2312
2313                 /* block accounting for root item */
2314                 root_used = btrfs_root_used(&root->root_item);
2315                 btrfs_set_root_used(&root->root_item,
2316                                            root_used - num_bytes);
2317                 spin_unlock(&info->delalloc_lock);
2318                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2319                                       num_to_del);
2320                 BUG_ON(ret);
2321                 btrfs_release_path(extent_root, path);
2322
2323                 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2324                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2325                         BUG_ON(ret);
2326                 }
2327
2328                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2329                                          mark_free);
2330                 BUG_ON(ret);
2331         }
2332         btrfs_free_path(path);
2333         return ret;
2334 }
2335
2336 /*
2337  * remove an extent from the root, returns 0 on success
2338  */
2339 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2340                                         struct btrfs_root *root,
2341                                         u64 bytenr, u64 num_bytes, u64 parent,
2342                                         u64 root_objectid, u64 ref_generation,
2343                                         u64 owner_objectid, int pin,
2344                                         int refs_to_drop)
2345 {
2346         WARN_ON(num_bytes < root->sectorsize);
2347
2348         /*
2349          * if metadata always pin
2350          * if data pin when any transaction has committed this
2351          */
2352         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
2353             ref_generation != trans->transid)
2354                 pin = 1;
2355
2356         if (ref_generation != trans->transid)
2357                 pin = 1;
2358
2359         return __free_extent(trans, root, bytenr, num_bytes, parent,
2360                             root_objectid, ref_generation,
2361                             owner_objectid, pin, pin == 0, refs_to_drop);
2362 }
2363
2364 int btrfs_free_extent(struct btrfs_trans_handle *trans,
2365                       struct btrfs_root *root,
2366                       u64 bytenr, u64 num_bytes, u64 parent,
2367                       u64 root_objectid, u64 ref_generation,
2368                       u64 owner_objectid, int pin)
2369 {
2370         int ret;
2371
2372         /*
2373          * tree log blocks never actually go into the extent allocation
2374          * tree, just update pinning info and exit early.
2375          *
2376          * data extents referenced by the tree log do need to have
2377          * their reference counts bumped.
2378          */
2379         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
2380             owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2381                 mutex_lock(&root->fs_info->pinned_mutex);
2382                 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2383                 mutex_unlock(&root->fs_info->pinned_mutex);
2384                 update_reserved_extents(root, bytenr, num_bytes, 0);
2385                 ret = 0;
2386         } else {
2387                 ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
2388                                        root_objectid, ref_generation,
2389                                        owner_objectid,
2390                                        BTRFS_DROP_DELAYED_REF, 1);
2391         }
2392         return ret;
2393 }
2394
2395 static u64 stripe_align(struct btrfs_root *root, u64 val)
2396 {
2397         u64 mask = ((u64)root->stripesize - 1);
2398         u64 ret = (val + mask) & ~mask;
2399         return ret;
2400 }
2401
2402 /*
2403  * walks the btree of allocated extents and find a hole of a given size.
2404  * The key ins is changed to record the hole:
2405  * ins->objectid == block start
2406  * ins->flags = BTRFS_EXTENT_ITEM_KEY
2407  * ins->offset == number of blocks
2408  * Any available blocks before search_start are skipped.
2409  */
2410 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2411                                      struct btrfs_root *orig_root,
2412                                      u64 num_bytes, u64 empty_size,
2413                                      u64 search_start, u64 search_end,
2414                                      u64 hint_byte, struct btrfs_key *ins,
2415                                      u64 exclude_start, u64 exclude_nr,
2416                                      int data)
2417 {
2418         int ret = 0;
2419         struct btrfs_root *root = orig_root->fs_info->extent_root;
2420         u64 total_needed = num_bytes;
2421         u64 *last_ptr = NULL;
2422         u64 last_wanted = 0;
2423         struct btrfs_block_group_cache *block_group = NULL;
2424         int chunk_alloc_done = 0;
2425         int empty_cluster = 2 * 1024 * 1024;
2426         int allowed_chunk_alloc = 0;
2427         struct list_head *head = NULL, *cur = NULL;
2428         int loop = 0;
2429         int extra_loop = 0;
2430         struct btrfs_space_info *space_info;
2431
2432         WARN_ON(num_bytes < root->sectorsize);
2433         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2434         ins->objectid = 0;
2435         ins->offset = 0;
2436
2437         if (orig_root->ref_cows || empty_size)
2438                 allowed_chunk_alloc = 1;
2439
2440         if (data & BTRFS_BLOCK_GROUP_METADATA) {
2441                 last_ptr = &root->fs_info->last_alloc;
2442                 if (!btrfs_test_opt(root, SSD))
2443                         empty_cluster = 64 * 1024;
2444         }
2445
2446         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2447                 last_ptr = &root->fs_info->last_data_alloc;
2448
2449         if (last_ptr) {
2450                 if (*last_ptr) {
2451                         hint_byte = *last_ptr;
2452                         last_wanted = *last_ptr;
2453                 } else
2454                         empty_size += empty_cluster;
2455         } else {
2456                 empty_cluster = 0;
2457         }
2458         search_start = max(search_start, first_logical_byte(root, 0));
2459         search_start = max(search_start, hint_byte);
2460
2461         if (last_wanted && search_start != last_wanted) {
2462                 last_wanted = 0;
2463                 empty_size += empty_cluster;
2464         }
2465
2466         total_needed += empty_size;
2467         block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2468         if (!block_group)
2469                 block_group = btrfs_lookup_first_block_group(root->fs_info,
2470                                                              search_start);
2471         space_info = __find_space_info(root->fs_info, data);
2472
2473         down_read(&space_info->groups_sem);
2474         while (1) {
2475                 struct btrfs_free_space *free_space;
2476                 /*
2477                  * the only way this happens if our hint points to a block
2478                  * group thats not of the proper type, while looping this
2479                  * should never happen
2480                  */
2481                 if (empty_size)
2482                         extra_loop = 1;
2483
2484                 if (!block_group)
2485                         goto new_group_no_lock;
2486
2487                 if (unlikely(!block_group->cached)) {
2488                         mutex_lock(&block_group->cache_mutex);
2489                         ret = cache_block_group(root, block_group);
2490                         mutex_unlock(&block_group->cache_mutex);
2491                         if (ret)
2492                                 break;
2493                 }
2494
2495                 mutex_lock(&block_group->alloc_mutex);
2496                 if (unlikely(!block_group_bits(block_group, data)))
2497                         goto new_group;
2498
2499                 if (unlikely(block_group->ro))
2500                         goto new_group;
2501
2502                 free_space = btrfs_find_free_space(block_group, search_start,
2503                                                    total_needed);
2504                 if (free_space) {
2505                         u64 start = block_group->key.objectid;
2506                         u64 end = block_group->key.objectid +
2507                                 block_group->key.offset;
2508
2509                         search_start = stripe_align(root, free_space->offset);
2510
2511                         /* move on to the next group */
2512                         if (search_start + num_bytes >= search_end)
2513                                 goto new_group;
2514
2515                         /* move on to the next group */
2516                         if (search_start + num_bytes > end)
2517                                 goto new_group;
2518
2519                         if (last_wanted && search_start != last_wanted) {
2520                                 total_needed += empty_cluster;
2521                                 empty_size += empty_cluster;
2522                                 last_wanted = 0;
2523                                 /*
2524                                  * if search_start is still in this block group
2525                                  * then we just re-search this block group
2526                                  */
2527                                 if (search_start >= start &&
2528                                     search_start < end) {
2529                                         mutex_unlock(&block_group->alloc_mutex);
2530                                         continue;
2531                                 }
2532
2533                                 /* else we go to the next block group */
2534                                 goto new_group;
2535                         }
2536
2537                         if (exclude_nr > 0 &&
2538                             (search_start + num_bytes > exclude_start &&
2539                              search_start < exclude_start + exclude_nr)) {
2540                                 search_start = exclude_start + exclude_nr;
2541                                 /*
2542                                  * if search_start is still in this block group
2543                                  * then we just re-search this block group
2544                                  */
2545                                 if (search_start >= start &&
2546                                     search_start < end) {
2547                                         mutex_unlock(&block_group->alloc_mutex);
2548                                         last_wanted = 0;
2549                                         continue;
2550                                 }
2551
2552                                 /* else we go to the next block group */
2553                                 goto new_group;
2554                         }
2555
2556                         ins->objectid = search_start;
2557                         ins->offset = num_bytes;
2558
2559                         btrfs_remove_free_space_lock(block_group, search_start,
2560                                                      num_bytes);
2561                         /* we are all good, lets return */
2562                         mutex_unlock(&block_group->alloc_mutex);
2563                         break;
2564                 }
2565 new_group:
2566                 mutex_unlock(&block_group->alloc_mutex);
2567                 put_block_group(block_group);
2568                 block_group = NULL;
2569 new_group_no_lock:
2570                 /* don't try to compare new allocations against the
2571                  * last allocation any more
2572                  */
2573                 last_wanted = 0;
2574
2575                 /*
2576                  * Here's how this works.
2577                  * loop == 0: we were searching a block group via a hint
2578                  *              and didn't find anything, so we start at
2579                  *              the head of the block groups and keep searching
2580                  * loop == 1: we're searching through all of the block groups
2581                  *              if we hit the head again we have searched
2582                  *              all of the block groups for this space and we
2583                  *              need to try and allocate, if we cant error out.
2584                  * loop == 2: we allocated more space and are looping through
2585                  *              all of the block groups again.
2586                  */
2587                 if (loop == 0) {
2588                         head = &space_info->block_groups;
2589                         cur = head->next;
2590                         loop++;
2591                 } else if (loop == 1 && cur == head) {
2592                         int keep_going;
2593
2594                         /* at this point we give up on the empty_size
2595                          * allocations and just try to allocate the min
2596                          * space.
2597                          *
2598                          * The extra_loop field was set if an empty_size
2599                          * allocation was attempted above, and if this
2600                          * is try we need to try the loop again without
2601                          * the additional empty_size.
2602                          */
2603                         total_needed -= empty_size;
2604                         empty_size = 0;
2605                         keep_going = extra_loop;
2606                         loop++;
2607
2608                         if (allowed_chunk_alloc && !chunk_alloc_done) {
2609                                 up_read(&space_info->groups_sem);
2610                                 ret = do_chunk_alloc(trans, root, num_bytes +
2611                                                      2 * 1024 * 1024, data, 1);
2612                                 down_read(&space_info->groups_sem);
2613                                 if (ret < 0)
2614                                         goto loop_check;
2615                                 head = &space_info->block_groups;
2616                                 /*
2617                                  * we've allocated a new chunk, keep
2618                                  * trying
2619                                  */
2620                                 keep_going = 1;
2621                                 chunk_alloc_done = 1;
2622                         } else if (!allowed_chunk_alloc) {
2623                                 space_info->force_alloc = 1;
2624                         }
2625 loop_check:
2626                         if (keep_going) {
2627                                 cur = head->next;
2628                                 extra_loop = 0;
2629                         } else {
2630                                 break;
2631                         }
2632                 } else if (cur == head) {
2633                         break;
2634                 }
2635
2636                 block_group = list_entry(cur, struct btrfs_block_group_cache,
2637                                          list);
2638                 atomic_inc(&block_group->count);
2639
2640                 search_start = block_group->key.objectid;
2641                 cur = cur->next;
2642         }
2643
2644         /* we found what we needed */
2645         if (ins->objectid) {
2646                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2647                         trans->block_group = block_group->key.objectid;
2648
2649                 if (last_ptr)
2650                         *last_ptr = ins->objectid + ins->offset;
2651                 ret = 0;
2652         } else if (!ret) {
2653                 printk(KERN_ERR "btrfs searching for %llu bytes, "
2654                        "num_bytes %llu, loop %d, allowed_alloc %d\n",
2655                        (unsigned long long)total_needed,
2656                        (unsigned long long)num_bytes,
2657                        loop, allowed_chunk_alloc);
2658                 ret = -ENOSPC;
2659         }
2660         if (block_group)
2661                 put_block_group(block_group);
2662
2663         up_read(&space_info->groups_sem);
2664         return ret;
2665 }
2666
2667 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
2668 {
2669         struct btrfs_block_group_cache *cache;
2670
2671         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
2672                (unsigned long long)(info->total_bytes - info->bytes_used -
2673                                     info->bytes_pinned - info->bytes_reserved),
2674                (info->full) ? "" : "not ");
2675         printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
2676                " may_use=%llu, used=%llu\n", info->total_bytes,
2677                info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
2678                info->bytes_used);
2679
2680         down_read(&info->groups_sem);
2681         list_for_each_entry(cache, &info->block_groups, list) {
2682                 spin_lock(&cache->lock);
2683                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
2684                        "%llu pinned %llu reserved\n",
2685                        (unsigned long long)cache->key.objectid,
2686                        (unsigned long long)cache->key.offset,
2687                        (unsigned long long)btrfs_block_group_used(&cache->item),
2688                        (unsigned long long)cache->pinned,
2689                        (unsigned long long)cache->reserved);
2690                 btrfs_dump_free_space(cache, bytes);
2691                 spin_unlock(&cache->lock);
2692         }
2693         up_read(&info->groups_sem);
2694 }
2695
2696 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2697                                   struct btrfs_root *root,
2698                                   u64 num_bytes, u64 min_alloc_size,
2699                                   u64 empty_size, u64 hint_byte,
2700                                   u64 search_end, struct btrfs_key *ins,
2701                                   u64 data)
2702 {
2703         int ret;
2704         u64 search_start = 0;
2705         struct btrfs_fs_info *info = root->fs_info;
2706
2707         data = btrfs_get_alloc_profile(root, data);
2708 again:
2709         /*
2710          * the only place that sets empty_size is btrfs_realloc_node, which
2711          * is not called recursively on allocations
2712          */
2713         if (empty_size || root->ref_cows) {
2714                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
2715                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2716                                      2 * 1024 * 1024,
2717                                      BTRFS_BLOCK_GROUP_METADATA |
2718                                      (info->metadata_alloc_profile &
2719                                       info->avail_metadata_alloc_bits), 0);
2720                 }
2721                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2722                                      num_bytes + 2 * 1024 * 1024, data, 0);
2723         }
2724
2725         WARN_ON(num_bytes < root->sectorsize);
2726         ret = find_free_extent(trans, root, num_bytes, empty_size,
2727                                search_start, search_end, hint_byte, ins,
2728                                trans->alloc_exclude_start,
2729                                trans->alloc_exclude_nr, data);
2730
2731         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
2732                 num_bytes = num_bytes >> 1;
2733                 num_bytes = num_bytes & ~(root->sectorsize - 1);
2734                 num_bytes = max(num_bytes, min_alloc_size);
2735                 do_chunk_alloc(trans, root->fs_info->extent_root,
2736                                num_bytes, data, 1);
2737                 goto again;
2738         }
2739         if (ret) {
2740                 struct btrfs_space_info *sinfo;
2741
2742                 sinfo = __find_space_info(root->fs_info, data);
2743                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
2744                        "wanted %llu\n", (unsigned long long)data,
2745                        (unsigned long long)num_bytes);
2746                 dump_space_info(sinfo, num_bytes);
2747                 BUG();
2748         }
2749
2750         return ret;
2751 }
2752
2753 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2754 {
2755         struct btrfs_block_group_cache *cache;
2756         int ret = 0;
2757
2758         cache = btrfs_lookup_block_group(root->fs_info, start);
2759         if (!cache) {
2760                 printk(KERN_ERR "Unable to find block group for %llu\n",
2761                        (unsigned long long)start);
2762                 return -ENOSPC;
2763         }
2764
2765         ret = btrfs_discard_extent(root, start, len);
2766
2767         btrfs_add_free_space(cache, start, len);
2768         put_block_group(cache);
2769         update_reserved_extents(root, start, len, 0);
2770
2771         return ret;
2772 }
2773
2774 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2775                                   struct btrfs_root *root,
2776                                   u64 num_bytes, u64 min_alloc_size,
2777                                   u64 empty_size, u64 hint_byte,
2778                                   u64 search_end, struct btrfs_key *ins,
2779                                   u64 data)
2780 {
2781         int ret;
2782         ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2783                                      empty_size, hint_byte, search_end, ins,
2784                                      data);
2785         update_reserved_extents(root, ins->objectid, ins->offset, 1);
2786         return ret;
2787 }
2788
2789 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2790                                          struct btrfs_root *root, u64 parent,
2791                                          u64 root_objectid, u64 ref_generation,
2792                                          u64 owner, struct btrfs_key *ins,
2793                                          int ref_mod)
2794 {
2795         int ret;
2796         u64 super_used;
2797         u64 root_used;
2798         u64 num_bytes = ins->offset;
2799         u32 sizes[2];
2800         struct btrfs_fs_info *info = root->fs_info;
2801         struct btrfs_root *extent_root = info->extent_root;
2802         struct btrfs_extent_item *extent_item;
2803         struct btrfs_extent_ref *ref;
2804         struct btrfs_path *path;
2805         struct btrfs_key keys[2];
2806
2807         if (parent == 0)
2808                 parent = ins->objectid;
2809
2810         /* block accounting for super block */
2811         spin_lock(&info->delalloc_lock);
2812         super_used = btrfs_super_bytes_used(&info->super_copy);
2813         btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2814
2815         /* block accounting for root item */
2816         root_used = btrfs_root_used(&root->root_item);
2817         btrfs_set_root_used(&root->root_item, root_used + num_bytes);
2818         spin_unlock(&info->delalloc_lock);
2819
2820         memcpy(&keys[0], ins, sizeof(*ins));
2821         keys[1].objectid = ins->objectid;
2822         keys[1].type = BTRFS_EXTENT_REF_KEY;
2823         keys[1].offset = parent;
2824         sizes[0] = sizeof(*extent_item);
2825         sizes[1] = sizeof(*ref);
2826
2827         path = btrfs_alloc_path();
2828         BUG_ON(!path);
2829
2830         ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
2831                                        sizes, 2);
2832         BUG_ON(ret);
2833
2834         extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2835                                      struct btrfs_extent_item);
2836         btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
2837         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
2838                              struct btrfs_extent_ref);
2839
2840         btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
2841         btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
2842         btrfs_set_ref_objectid(path->nodes[0], ref, owner);
2843         btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
2844
2845         btrfs_mark_buffer_dirty(path->nodes[0]);
2846
2847         trans->alloc_exclude_start = 0;
2848         trans->alloc_exclude_nr = 0;
2849         btrfs_free_path(path);
2850
2851         if (ret)
2852                 goto out;
2853
2854         ret = update_block_group(trans, root, ins->objectid,
2855                                  ins->offset, 1, 0);
2856         if (ret) {
2857                 printk(KERN_ERR "btrfs update block group failed for %llu "
2858                        "%llu\n", (unsigned long long)ins->objectid,
2859                        (unsigned long long)ins->offset);
2860                 BUG();
2861         }
2862 out:
2863         return ret;
2864 }
2865
2866 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2867                                 struct btrfs_root *root, u64 parent,
2868                                 u64 root_objectid, u64 ref_generation,
2869                                 u64 owner, struct btrfs_key *ins)
2870 {
2871         int ret;
2872
2873         if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
2874                 return 0;
2875
2876         ret = btrfs_add_delayed_ref(trans, ins->objectid,
2877                                     ins->offset, parent, root_objectid,
2878                                     ref_generation, owner,
2879                                     BTRFS_ADD_DELAYED_EXTENT, 0);
2880         BUG_ON(ret);
2881         return ret;
2882 }
2883
2884 /*
2885  * this is used by the tree logging recovery code.  It records that
2886  * an extent has been allocated and makes sure to clear the free
2887  * space cache bits as well
2888  */
2889 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
2890                                 struct btrfs_root *root, u64 parent,
2891                                 u64 root_objectid, u64 ref_generation,
2892                                 u64 owner, struct btrfs_key *ins)
2893 {
2894         int ret;
2895         struct btrfs_block_group_cache *block_group;
2896
2897         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
2898         mutex_lock(&block_group->cache_mutex);
2899         cache_block_group(root, block_group);
2900         mutex_unlock(&block_group->cache_mutex);
2901
2902         ret = btrfs_remove_free_space(block_group, ins->objectid,
2903                                       ins->offset);
2904         BUG_ON(ret);
2905         put_block_group(block_group);
2906         ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
2907                                             ref_generation, owner, ins, 1);
2908         return ret;
2909 }
2910
2911 /*
2912  * finds a free extent and does all the dirty work required for allocation
2913  * returns the key for the extent through ins, and a tree buffer for
2914  * the first block of the extent through buf.
2915  *
2916  * returns 0 if everything worked, non-zero otherwise.
2917  */
2918 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
2919                        struct btrfs_root *root,
2920                        u64 num_bytes, u64 parent, u64 min_alloc_size,
2921                        u64 root_objectid, u64 ref_generation,
2922                        u64 owner_objectid, u64 empty_size, u64 hint_byte,
2923                        u64 search_end, struct btrfs_key *ins, u64 data)
2924 {
2925         int ret;
2926         ret = __btrfs_reserve_extent(trans, root, num_bytes,
2927                                      min_alloc_size, empty_size, hint_byte,
2928                                      search_end, ins, data);
2929         BUG_ON(ret);
2930         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
2931                 ret = btrfs_add_delayed_ref(trans, ins->objectid,
2932                                             ins->offset, parent, root_objectid,
2933                                             ref_generation, owner_objectid,
2934                                             BTRFS_ADD_DELAYED_EXTENT, 0);
2935                 BUG_ON(ret);
2936         }
2937         update_reserved_extents(root, ins->objectid, ins->offset, 1);
2938         return ret;
2939 }
2940
2941 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2942                                             struct btrfs_root *root,
2943                                             u64 bytenr, u32 blocksize,
2944                                             int level)
2945 {
2946         struct extent_buffer *buf;
2947
2948         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
2949         if (!buf)
2950                 return ERR_PTR(-ENOMEM);
2951         btrfs_set_header_generation(buf, trans->transid);
2952         btrfs_set_buffer_lockdep_class(buf, level);
2953         btrfs_tree_lock(buf);
2954         clean_tree_block(trans, root, buf);
2955
2956         btrfs_set_lock_blocking(buf);
2957         btrfs_set_buffer_uptodate(buf);
2958
2959         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2960                 set_extent_dirty(&root->dirty_log_pages, buf->start,
2961                          buf->start + buf->len - 1, GFP_NOFS);
2962         } else {
2963                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
2964                          buf->start + buf->len - 1, GFP_NOFS);
2965         }
2966         trans->blocks_used++;
2967         /* this returns a buffer locked for blocking */
2968         return buf;
2969 }
2970
2971 /*
2972  * helper function to allocate a block for a given tree
2973  * returns the tree buffer or NULL.
2974  */
2975 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2976                                              struct btrfs_root *root,
2977                                              u32 blocksize, u64 parent,
2978                                              u64 root_objectid,
2979                                              u64 ref_generation,
2980                                              int level,
2981                                              u64 hint,
2982                                              u64 empty_size)
2983 {
2984         struct btrfs_key ins;
2985         int ret;
2986         struct extent_buffer *buf;
2987
2988         ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
2989                                  root_objectid, ref_generation, level,
2990                                  empty_size, hint, (u64)-1, &ins, 0);
2991         if (ret) {
2992                 BUG_ON(ret > 0);
2993                 return ERR_PTR(ret);
2994         }
2995
2996         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
2997                                     blocksize, level);
2998         return buf;
2999 }
3000
3001 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3002                         struct btrfs_root *root, struct extent_buffer *leaf)
3003 {
3004         u64 leaf_owner;
3005         u64 leaf_generation;
3006         struct refsort *sorted;
3007         struct btrfs_key key;
3008         struct btrfs_file_extent_item *fi;
3009         int i;
3010         int nritems;
3011         int ret;
3012         int refi = 0;
3013         int slot;
3014
3015         BUG_ON(!btrfs_is_leaf(leaf));
3016         nritems = btrfs_header_nritems(leaf);
3017         leaf_owner = btrfs_header_owner(leaf);
3018         leaf_generation = btrfs_header_generation(leaf);
3019
3020         sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3021         /* we do this loop twice.  The first time we build a list
3022          * of the extents we have a reference on, then we sort the list
3023          * by bytenr.  The second time around we actually do the
3024          * extent freeing.
3025          */
3026         for (i = 0; i < nritems; i++) {
3027                 u64 disk_bytenr;
3028                 cond_resched();
3029
3030                 btrfs_item_key_to_cpu(leaf, &key, i);
3031
3032                 /* only extents have references, skip everything else */
3033                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3034                         continue;
3035
3036                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3037
3038                 /* inline extents live in the btree, they don't have refs */
3039                 if (btrfs_file_extent_type(leaf, fi) ==
3040                     BTRFS_FILE_EXTENT_INLINE)
3041                         continue;
3042
3043                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3044
3045                 /* holes don't have refs */
3046                 if (disk_bytenr == 0)
3047                         continue;
3048
3049                 sorted[refi].bytenr = disk_bytenr;
3050                 sorted[refi].slot = i;
3051                 refi++;
3052         }
3053
3054         if (refi == 0)
3055                 goto out;
3056
3057         sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3058
3059         for (i = 0; i < refi; i++) {
3060                 u64 disk_bytenr;
3061
3062                 disk_bytenr = sorted[i].bytenr;
3063                 slot = sorted[i].slot;
3064
3065                 cond_resched();
3066
3067                 btrfs_item_key_to_cpu(leaf, &key, slot);
3068                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3069                         continue;
3070
3071                 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3072
3073                 ret = btrfs_free_extent(trans, root, disk_bytenr,
3074                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
3075                                 leaf->start, leaf_owner, leaf_generation,
3076                                 key.objectid, 0);
3077                 BUG_ON(ret);
3078
3079                 atomic_inc(&root->fs_info->throttle_gen);
3080                 wake_up(&root->fs_info->transaction_throttle);
3081                 cond_resched();
3082         }
3083 out:
3084         kfree(sorted);
3085         return 0;
3086 }
3087
3088 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3089                                         struct btrfs_root *root,
3090                                         struct btrfs_leaf_ref *ref)
3091 {
3092         int i;
3093         int ret;
3094         struct btrfs_extent_info *info;
3095         struct refsort *sorted;
3096
3097         if (ref->nritems == 0)
3098                 return 0;
3099
3100         sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3101         for (i = 0; i < ref->nritems; i++) {
3102                 sorted[i].bytenr = ref->extents[i].bytenr;
3103                 sorted[i].slot = i;
3104         }
3105         sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3106
3107         /*
3108          * the items in the ref were sorted when the ref was inserted
3109          * into the ref cache, so this is already in order
3110          */
3111         for (i = 0; i < ref->nritems; i++) {
3112                 info = ref->extents + sorted[i].slot;
3113                 ret = btrfs_free_extent(trans, root, info->bytenr,
3114                                           info->num_bytes, ref->bytenr,
3115                                           ref->owner, ref->generation,
3116                                           info->objectid, 0);
3117
3118                 atomic_inc(&root->fs_info->throttle_gen);
3119                 wake_up(&root->fs_info->transaction_throttle);
3120                 cond_resched();
3121
3122                 BUG_ON(ret);
3123                 info++;
3124         }
3125
3126         kfree(sorted);
3127         return 0;
3128 }
3129
3130 static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
3131                                      struct btrfs_root *root, u64 start,
3132                                      u64 len, u32 *refs)
3133 {
3134         int ret;
3135
3136         ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
3137         BUG_ON(ret);
3138
3139 #if 0 /* some debugging code in case we see problems here */
3140         /* if the refs count is one, it won't get increased again.  But
3141          * if the ref count is > 1, someone may be decreasing it at
3142          * the same time we are.
3143          */
3144         if (*refs != 1) {
3145                 struct extent_buffer *eb = NULL;
3146                 eb = btrfs_find_create_tree_block(root, start, len);
3147                 if (eb)
3148                         btrfs_tree_lock(eb);
3149
3150                 mutex_lock(&root->fs_info->alloc_mutex);
3151                 ret = lookup_extent_ref(NULL, root, start, len, refs);
3152                 BUG_ON(ret);
3153                 mutex_unlock(&root->fs_info->alloc_mutex);
3154
3155                 if (eb) {
3156                         btrfs_tree_unlock(eb);
3157                         free_extent_buffer(eb);
3158                 }
3159                 if (*refs == 1) {
3160                         printk(KERN_ERR "btrfs block %llu went down to one "
3161                                "during drop_snap\n", (unsigned long long)start);
3162                 }
3163
3164         }
3165 #endif
3166
3167         cond_resched();
3168         return ret;
3169 }
3170
3171 /*
3172  * this is used while deleting old snapshots, and it drops the refs
3173  * on a whole subtree starting from a level 1 node.
3174  *
3175  * The idea is to sort all the leaf pointers, and then drop the
3176  * ref on all the leaves in order.  Most of the time the leaves
3177  * will have ref cache entries, so no leaf IOs will be required to
3178  * find the extents they have references on.
3179  *
3180  * For each leaf, any references it has are also dropped in order
3181  *
3182  * This ends up dropping the references in something close to optimal
3183  * order for reading and modifying the extent allocation tree.
3184  */
3185 static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3186                                         struct btrfs_root *root,
3187                                         struct btrfs_path *path)
3188 {
3189         u64 bytenr;
3190         u64 root_owner;
3191         u64 root_gen;
3192         struct extent_buffer *eb = path->nodes[1];
3193         struct extent_buffer *leaf;
3194         struct btrfs_leaf_ref *ref;
3195         struct refsort *sorted = NULL;
3196         int nritems = btrfs_header_nritems(eb);
3197         int ret;
3198         int i;
3199         int refi = 0;
3200         int slot = path->slots[1];
3201         u32 blocksize = btrfs_level_size(root, 0);
3202         u32 refs;
3203
3204         if (nritems == 0)
3205                 goto out;
3206
3207         root_owner = btrfs_header_owner(eb);
3208         root_gen = btrfs_header_generation(eb);
3209         sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3210
3211         /*
3212          * step one, sort all the leaf pointers so we don't scribble
3213          * randomly into the extent allocation tree
3214          */
3215         for (i = slot; i < nritems; i++) {
3216                 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3217                 sorted[refi].slot = i;
3218                 refi++;
3219         }
3220
3221         /*
3222          * nritems won't be zero, but if we're picking up drop_snapshot
3223          * after a crash, slot might be > 0, so double check things
3224          * just in case.
3225          */
3226         if (refi == 0)
3227                 goto out;
3228
3229         sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3230
3231         /*
3232          * the first loop frees everything the leaves point to
3233          */
3234         for (i = 0; i < refi; i++) {
3235                 u64 ptr_gen;
3236
3237                 bytenr = sorted[i].bytenr;
3238
3239                 /*
3240                  * check the reference count on this leaf.  If it is > 1
3241                  * we just decrement it below and don't update any
3242                  * of the refs the leaf points to.
3243                  */
3244                 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3245                                                 blocksize, &refs);
3246                 BUG_ON(ret);
3247                 if (refs != 1)
3248                         continue;
3249
3250                 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3251
3252                 /*
3253                  * the leaf only had one reference, which means the
3254                  * only thing pointing to this leaf is the snapshot
3255                  * we're deleting.  It isn't possible for the reference
3256                  * count to increase again later
3257                  *
3258                  * The reference cache is checked for the leaf,
3259                  * and if found we'll be able to drop any refs held by
3260                  * the leaf without needing to read it in.
3261                  */
3262                 ref = btrfs_lookup_leaf_ref(root, bytenr);
3263                 if (ref && ref->generation != ptr_gen) {
3264                         btrfs_free_leaf_ref(root, ref);
3265                         ref = NULL;
3266                 }
3267                 if (ref) {
3268                         ret = cache_drop_leaf_ref(trans, root, ref);
3269                         BUG_ON(ret);
3270                         btrfs_remove_leaf_ref(root, ref);
3271                         btrfs_free_leaf_ref(root, ref);
3272                 } else {
3273                         /*
3274                          * the leaf wasn't in the reference cache, so
3275                          * we have to read it.
3276                          */
3277                         leaf = read_tree_block(root, bytenr, blocksize,
3278                                                ptr_gen);
3279                         ret = btrfs_drop_leaf_ref(trans, root, leaf);
3280                         BUG_ON(ret);
3281                         free_extent_buffer(leaf);
3282                 }
3283                 atomic_inc(&root->fs_info->throttle_gen);
3284                 wake_up(&root->fs_info->transaction_throttle);
3285                 cond_resched();
3286         }
3287
3288         /*
3289          * run through the loop again to free the refs on the leaves.
3290          * This is faster than doing it in the loop above because
3291          * the leaves are likely to be clustered together.  We end up
3292          * working in nice chunks on the extent allocation tree.
3293          */
3294         for (i = 0; i < refi; i++) {
3295                 bytenr = sorted[i].bytenr;
3296                 ret = btrfs_free_extent(trans, root, bytenr,
3297                                         blocksize, eb->start,
3298                                         root_owner, root_gen, 0, 1);
3299                 BUG_ON(ret);
3300
3301                 atomic_inc(&root->fs_info->throttle_gen);
3302                 wake_up(&root->fs_info->transaction_throttle);
3303                 cond_resched();
3304         }
3305 out:
3306         kfree(sorted);
3307
3308         /*
3309          * update the path to show we've processed the entire level 1
3310          * node.  This will get saved into the root's drop_snapshot_progress
3311          * field so these drops are not repeated again if this transaction
3312          * commits.
3313          */
3314         path->slots[1] = nritems;
3315         return 0;
3316 }
3317
3318 /*
3319  * helper function for drop_snapshot, this walks down the tree dropping ref
3320  * counts as it goes.
3321  */
3322 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3323                                    struct btrfs_root *root,
3324                                    struct btrfs_path *path, int *level)
3325 {
3326         u64 root_owner;
3327         u64 root_gen;
3328         u64 bytenr;
3329         u64 ptr_gen;
3330         struct extent_buffer *next;
3331         struct extent_buffer *cur;
3332         struct extent_buffer *parent;
3333         u32 blocksize;
3334         int ret;
3335         u32 refs;
3336
3337         WARN_ON(*level < 0);
3338         WARN_ON(*level >= BTRFS_MAX_LEVEL);
3339         ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
3340                                 path->nodes[*level]->len, &refs);
3341         BUG_ON(ret);
3342         if (refs > 1)
3343                 goto out;
3344
3345         /*
3346          * walk down to the last node level and free all the leaves
3347          */
3348         while (*level >= 0) {
3349                 WARN_ON(*level < 0);
3350                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3351                 cur = path->nodes[*level];
3352
3353                 if (btrfs_header_level(cur) != *level)
3354                         WARN_ON(1);
3355
3356                 if (path->slots[*level] >=
3357                     btrfs_header_nritems(cur))
3358                         break;
3359
3360                 /* the new code goes down to level 1 and does all the
3361                  * leaves pointed to that node in bulk.  So, this check
3362                  * for level 0 will always be false.
3363                  *
3364                  * But, the disk format allows the drop_snapshot_progress
3365                  * field in the root to leave things in a state where
3366                  * a leaf will need cleaning up here.  If someone crashes
3367                  * with the old code and then boots with the new code,
3368                  * we might find a leaf here.
3369                  */
3370                 if (*level == 0) {
3371                         ret = btrfs_drop_leaf_ref(trans, root, cur);
3372                         BUG_ON(ret);
3373                         break;
3374                 }
3375
3376                 /*
3377                  * once we get to level one, process the whole node
3378                  * at once, including everything below it.
3379                  */
3380                 if (*level == 1) {
3381                         ret = drop_level_one_refs(trans, root, path);
3382                         BUG_ON(ret);
3383                         break;
3384                 }
3385
3386                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3387                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3388                 blocksize = btrfs_level_size(root, *level - 1);
3389
3390                 ret = drop_snap_lookup_refcount(trans, root, bytenr,
3391                                                 blocksize, &refs);
3392                 BUG_ON(ret);
3393
3394                 /*
3395                  * if there is more than one reference, we don't need
3396                  * to read that node to drop any references it has.  We
3397                  * just drop the ref we hold on that node and move on to the
3398                  * next slot in this level.
3399                  */
3400                 if (refs != 1) {
3401                         parent = path->nodes[*level];
3402                         root_owner = btrfs_header_owner(parent);
3403                         root_gen = btrfs_header_generation(parent);
3404                         path->slots[*level]++;
3405
3406                         ret = btrfs_free_extent(trans, root, bytenr,
3407                                                 blocksize, parent->start,
3408                                                 root_owner, root_gen,
3409                                                 *level - 1, 1);
3410                         BUG_ON(ret);
3411
3412                         atomic_inc(&root->fs_info->throttle_gen);
3413                         wake_up(&root->fs_info->transaction_throttle);
3414                         cond_resched();
3415
3416                         continue;
3417                 }
3418
3419                 /*
3420                  * we need to keep freeing things in the next level down.
3421                  * read the block and loop around to process it
3422                  */
3423                 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3424                 WARN_ON(*level <= 0);
3425                 if (path->nodes[*level-1])
3426                         free_extent_buffer(path->nodes[*level-1]);
3427                 path->nodes[*level-1] = next;
3428                 *level = btrfs_header_level(next);
3429                 path->slots[*level] = 0;
3430                 cond_resched();
3431         }
3432 out:
3433         WARN_ON(*level < 0);
3434         WARN_ON(*level >= BTRFS_MAX_LEVEL);
3435
3436         if (path->nodes[*level] == root->node) {
3437                 parent = path->nodes[*level];
3438                 bytenr = path->nodes[*level]->start;
3439         } else {
3440                 parent = path->nodes[*level + 1];
3441                 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3442         }
3443
3444         blocksize = btrfs_level_size(root, *level);
3445         root_owner = btrfs_header_owner(parent);
3446         root_gen = btrfs_header_generation(parent);
3447
3448         /*
3449          * cleanup and free the reference on the last node
3450          * we processed
3451          */
3452         ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3453                                   parent->start, root_owner, root_gen,
3454                                   *level, 1);
3455         free_extent_buffer(path->nodes[*level]);
3456         path->nodes[*level] = NULL;
3457
3458         *level += 1;
3459         BUG_ON(ret);
3460
3461         cond_resched();
3462         return 0;
3463 }
3464
3465 /*
3466  * helper function for drop_subtree, this function is similar to
3467  * walk_down_tree. The main difference is that it checks reference
3468  * counts while tree blocks are locked.
3469  */
3470 static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3471                                       struct btrfs_root *root,
3472                                       struct btrfs_path *path, int *level)
3473 {
3474         struct extent_buffer *next;
3475         struct extent_buffer *cur;
3476         struct extent_buffer *parent;
3477         u64 bytenr;
3478         u64 ptr_gen;
3479         u32 blocksize;
3480         u32 refs;
3481         int ret;
3482
3483         cur = path->nodes[*level];
3484         ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3485                                       &refs);
3486         BUG_ON(ret);
3487         if (refs > 1)
3488                 goto out;
3489
3490         while (*level >= 0) {
3491                 cur = path->nodes[*level];
3492                 if (*level == 0) {
3493                         ret = btrfs_drop_leaf_ref(trans, root, cur);
3494                         BUG_ON(ret);
3495                         clean_tree_block(trans, root, cur);
3496                         break;
3497                 }
3498                 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3499                         clean_tree_block(trans, root, cur);
3500                         break;
3501                 }
3502
3503                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3504                 blocksize = btrfs_level_size(root, *level - 1);
3505                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3506
3507                 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3508                 btrfs_tree_lock(next);
3509                 btrfs_set_lock_blocking(next);
3510
3511                 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3512                                               &refs);
3513                 BUG_ON(ret);
3514                 if (refs > 1) {
3515                         parent = path->nodes[*level];
3516                         ret = btrfs_free_extent(trans, root, bytenr,
3517                                         blocksize, parent->start,
3518                                         btrfs_header_owner(parent),
3519                                         btrfs_header_generation(parent),
3520                                         *level - 1, 1);
3521                         BUG_ON(ret);
3522                         path->slots[*level]++;
3523                         btrfs_tree_unlock(next);
3524                         free_extent_buffer(next);
3525                         continue;
3526                 }
3527
3528                 *level = btrfs_header_level(next);
3529                 path->nodes[*level] = next;
3530                 path->slots[*level] = 0;
3531                 path->locks[*level] = 1;
3532                 cond_resched();
3533         }
3534 out:
3535         parent = path->nodes[*level + 1];
3536         bytenr = path->nodes[*level]->start;
3537         blocksize = path->nodes[*level]->len;
3538
3539         ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3540                         parent->start, btrfs_header_owner(parent),
3541                         btrfs_header_generation(parent), *level, 1);
3542         BUG_ON(ret);
3543
3544         if (path->locks[*level]) {
3545                 btrfs_tree_unlock(path->nodes[*level]);
3546                 path->locks[*level] = 0;
3547         }
3548         free_extent_buffer(path->nodes[*level]);
3549         path->nodes[*level] = NULL;
3550         *level += 1;
3551         cond_resched();
3552         return 0;
3553 }
3554
3555 /*
3556  * helper for dropping snapshots.  This walks back up the tree in the path
3557  * to find the first node higher up where we haven't yet gone through
3558  * all the slots
3559  */
3560 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3561                                  struct btrfs_root *root,
3562                                  struct btrfs_path *path,
3563                                  int *level, int max_level)
3564 {
3565         u64 root_owner;
3566         u64 root_gen;
3567         struct btrfs_root_item *root_item = &root->root_item;
3568         int i;
3569         int slot;
3570         int ret;
3571
3572         for (i = *level; i < max_level && path->nodes[i]; i++) {
3573                 slot = path->slots[i];
3574                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3575                         struct extent_buffer *node;
3576                         struct btrfs_disk_key disk_key;
3577
3578                         /*
3579                          * there is more work to do in this level.
3580                          * Update the drop_progress marker to reflect
3581                          * the work we've done so far, and then bump
3582                          * the slot number
3583                          */
3584                         node = path->nodes[i];
3585                         path->slots[i]++;
3586                         *level = i;
3587                         WARN_ON(*level == 0);
3588                         btrfs_node_key(node, &disk_key, path->slots[i]);
3589                         memcpy(&root_item->drop_progress,
3590                                &disk_key, sizeof(disk_key));
3591                         root_item->drop_level = i;
3592                         return 0;
3593                 } else {
3594                         struct extent_buffer *parent;
3595
3596                         /*
3597                          * this whole node is done, free our reference
3598                          * on it and go up one level
3599                          */
3600                         if (path->nodes[*level] == root->node)
3601                                 parent = path->nodes[*level];
3602                         else
3603                                 parent = path->nodes[*level + 1];
3604
3605                         root_owner = btrfs_header_owner(parent);
3606                         root_gen = btrfs_header_generation(parent);
3607
3608                         clean_tree_block(trans, root, path->nodes[*level]);
3609                         ret = btrfs_free_extent(trans, root,
3610                                                 path->nodes[*level]->start,
3611                                                 path->nodes[*level]->len,
3612                                                 parent->start, root_owner,
3613                                                 root_gen, *level, 1);
3614                         BUG_ON(ret);
3615                         if (path->locks[*level]) {
3616                                 btrfs_tree_unlock(path->nodes[*level]);
3617                                 path->locks[*level] = 0;
3618                         }
3619                         free_extent_buffer(path->nodes[*level]);
3620                         path->nodes[*level] = NULL;
3621                         *level = i + 1;
3622                 }
3623         }
3624         return 1;
3625 }
3626
3627 /*
3628  * drop the reference count on the tree rooted at 'snap'.  This traverses
3629  * the tree freeing any blocks that have a ref count of zero after being
3630  * decremented.
3631  */
3632 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3633                         *root)
3634 {
3635         int ret = 0;
3636         int wret;
3637         int level;
3638         struct btrfs_path *path;
3639         int i;
3640         int orig_level;
3641         struct btrfs_root_item *root_item = &root->root_item;
3642
3643         WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3644         path = btrfs_alloc_path();
3645         BUG_ON(!path);
3646
3647         level = btrfs_header_level(root->node);
3648         orig_level = level;
3649         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3650                 path->nodes[level] = root->node;
3651                 extent_buffer_get(root->node);
3652                 path->slots[level] = 0;
3653         } else {
3654                 struct btrfs_key key;
3655                 struct btrfs_disk_key found_key;
3656                 struct extent_buffer *node;
3657
3658                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3659                 level = root_item->drop_level;
3660                 path->lowest_level = level;
3661                 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3662                 if (wret < 0) {
3663                         ret = wret;
3664                         goto out;
3665                 }
3666                 node = path->nodes[level];
3667                 btrfs_node_key(node, &found_key, path->slots[level]);
3668                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3669                                sizeof(found_key)));
3670                 /*
3671                  * unlock our path, this is safe because only this
3672                  * function is allowed to delete this snapshot
3673                  */
3674                 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3675                         if (path->nodes[i] && path->locks[i]) {
3676                                 path->locks[i] = 0;
3677                                 btrfs_tree_unlock(path->nodes[i]);
3678                         }
3679                 }
3680         }
3681         while (1) {
3682                 wret = walk_down_tree(trans, root, path, &level);
3683                 if (wret > 0)
3684                         break;
3685                 if (wret < 0)
3686                         ret = wret;
3687
3688                 wret = walk_up_tree(trans, root, path, &level,
3689                                     BTRFS_MAX_LEVEL);
3690                 if (wret > 0)
3691                         break;
3692                 if (wret < 0)
3693                         ret = wret;
3694                 if (trans->transaction->in_commit) {
3695                         ret = -EAGAIN;
3696                         break;
3697                 }
3698                 atomic_inc(&root->fs_info->throttle_gen);
3699                 wake_up(&root->fs_info->transaction_throttle);
3700         }
3701         for (i = 0; i <= orig_level; i++) {
3702                 if (path->nodes[i]) {
3703                         free_extent_buffer(path->nodes[i]);
3704                         path->nodes[i] = NULL;
3705                 }
3706         }
3707 out:
3708         btrfs_free_path(path);
3709         return ret;
3710 }
3711
3712 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3713                         struct btrfs_root *root,
3714                         struct extent_buffer *node,
3715                         struct extent_buffer *parent)
3716 {
3717         struct btrfs_path *path;
3718         int level;
3719         int parent_level;
3720         int ret = 0;
3721         int wret;
3722
3723         path = btrfs_alloc_path();
3724         BUG_ON(!path);
3725
3726         btrfs_assert_tree_locked(parent);
3727         parent_level = btrfs_header_level(parent);
3728         extent_buffer_get(parent);
3729         path->nodes[parent_level] = parent;
3730         path->slots[parent_level] = btrfs_header_nritems(parent);
3731
3732         btrfs_assert_tree_locked(node);
3733         level = btrfs_header_level(node);
3734         extent_buffer_get(node);
3735         path->nodes[level] = node;
3736         path->slots[level] = 0;
3737
3738         while (1) {
3739                 wret = walk_down_subtree(trans, root, path, &level);
3740                 if (wret < 0)
3741                         ret = wret;
3742                 if (wret != 0)
3743                         break;
3744
3745                 wret = walk_up_tree(trans, root, path, &level, parent_level);
3746                 if (wret < 0)
3747                         ret = wret;
3748                 if (wret != 0)
3749                         break;
3750         }
3751
3752         btrfs_free_path(path);
3753         return ret;
3754 }
3755
3756 static unsigned long calc_ra(unsigned long start, unsigned long last,
3757                              unsigned long nr)
3758 {
3759         return min(last, start + nr - 1);
3760 }
3761
3762 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3763                                          u64 len)
3764 {
3765         u64 page_start;
3766         u64 page_end;
3767         unsigned long first_index;
3768         unsigned long last_index;
3769         unsigned long i;
3770         struct page *page;
3771         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3772         struct file_ra_state *ra;
3773         struct btrfs_ordered_extent *ordered;
3774         unsigned int total_read = 0;
3775         unsigned int total_dirty = 0;
3776         int ret = 0;
3777
3778         ra = kzalloc(sizeof(*ra), GFP_NOFS);
3779
3780         mutex_lock(&inode->i_mutex);
3781         first_index = start >> PAGE_CACHE_SHIFT;
3782         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3783
3784         /* make sure the dirty trick played by the caller work */
3785         ret = invalidate_inode_pages2_range(inode->i_mapping,
3786                                             first_index, last_index);
3787         if (ret)
3788                 goto out_unlock;
3789
3790         file_ra_state_init(ra, inode->i_mapping);
3791
3792         for (i = first_index ; i <= last_index; i++) {
3793                 if (total_read % ra->ra_pages == 0) {
3794                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3795                                        calc_ra(i, last_index, ra->ra_pages));
3796                 }
3797                 total_read++;
3798 again:
3799                 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3800                         BUG_ON(1);
3801                 page = grab_cache_page(inode->i_mapping, i);
3802                 if (!page) {
3803                         ret = -ENOMEM;
3804                         goto out_unlock;
3805                 }
3806                 if (!PageUptodate(page)) {
3807                         btrfs_readpage(NULL, page);
3808                         lock_page(page);
3809                         if (!PageUptodate(page)) {
3810                                 unlock_page(page);
3811                                 page_cache_release(page);
3812                                 ret = -EIO;
3813                                 goto out_unlock;
3814                         }
3815                 }
3816                 wait_on_page_writeback(page);
3817
3818                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3819                 page_end = page_start + PAGE_CACHE_SIZE - 1;
3820                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3821
3822                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3823                 if (ordered) {
3824                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3825                         unlock_page(page);
3826                         page_cache_release(page);
3827                         btrfs_start_ordered_extent(inode, ordered, 1);
3828                         btrfs_put_ordered_extent(ordered);
3829                         goto again;
3830                 }
3831                 set_page_extent_mapped(page);
3832
3833                 if (i == first_index)
3834                         set_extent_bits(io_tree, page_start, page_end,
3835                                         EXTENT_BOUNDARY, GFP_NOFS);
3836                 btrfs_set_extent_delalloc(inode, page_start, page_end);
3837
3838                 set_page_dirty(page);
3839                 total_dirty++;
3840
3841                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3842                 unlock_page(page);
3843                 page_cache_release(page);
3844         }
3845
3846 out_unlock:
3847         kfree(ra);
3848         mutex_unlock(&inode->i_mutex);
3849         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
3850         return ret;
3851 }
3852
3853 static noinline int relocate_data_extent(struct inode *reloc_inode,
3854                                          struct btrfs_key *extent_key,
3855                                          u64 offset)
3856 {
3857         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
3858         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
3859         struct extent_map *em;
3860         u64 start = extent_key->objectid - offset;
3861         u64 end = start + extent_key->offset - 1;
3862
3863         em = alloc_extent_map(GFP_NOFS);
3864         BUG_ON(!em || IS_ERR(em));
3865
3866         em->start = start;
3867         em->len = extent_key->offset;
3868         em->block_len = extent_key->offset;
3869         em->block_start = extent_key->objectid;
3870         em->bdev = root->fs_info->fs_devices->latest_bdev;
3871         set_bit(EXTENT_FLAG_PINNED, &em->flags);
3872
3873         /* setup extent map to cheat btrfs_readpage */
3874         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
3875         while (1) {
3876                 int ret;
3877                 spin_lock(&em_tree->lock);
3878                 ret = add_extent_mapping(em_tree, em);
3879                 spin_unlock(&em_tree->lock);
3880                 if (ret != -EEXIST) {
3881                         free_extent_map(em);
3882                         break;
3883                 }
3884                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
3885         }
3886         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
3887
3888         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
3889 }
3890
3891 struct btrfs_ref_path {
3892         u64 extent_start;
3893         u64 nodes[BTRFS_MAX_LEVEL];
3894         u64 root_objectid;
3895         u64 root_generation;
3896         u64 owner_objectid;
3897         u32 num_refs;
3898         int lowest_level;
3899         int current_level;
3900         int shared_level;
3901
3902         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
3903         u64 new_nodes[BTRFS_MAX_LEVEL];
3904 };
3905
3906 struct disk_extent {
3907         u64 ram_bytes;
3908         u64 disk_bytenr;
3909         u64 disk_num_bytes;
3910         u64 offset;
3911         u64 num_bytes;
3912         u8 compression;
3913         u8 encryption;
3914         u16 other_encoding;
3915 };
3916
3917 static int is_cowonly_root(u64 root_objectid)
3918 {
3919         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
3920             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
3921             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
3922             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
3923             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
3924             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
3925                 return 1;
3926         return 0;
3927 }
3928
3929 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
3930                                     struct btrfs_root *extent_root,
3931                                     struct btrfs_ref_path *ref_path,
3932                                     int first_time)
3933 {
3934         struct extent_buffer *leaf;
3935         struct btrfs_path *path;
3936         struct btrfs_extent_ref *ref;
3937         struct btrfs_key key;
3938         struct btrfs_key found_key;
3939         u64 bytenr;
3940         u32 nritems;
3941         int level;
3942         int ret = 1;
3943
3944         path = btrfs_alloc_path();
3945         if (!path)
3946                 return -ENOMEM;
3947
3948         if (first_time) {
3949                 ref_path->lowest_level = -1;
3950                 ref_path->current_level = -1;
3951                 ref_path->shared_level = -1;
3952                 goto walk_up;
3953         }
3954 walk_down:
3955         level = ref_path->current_level - 1;
3956         while (level >= -1) {
3957                 u64 parent;
3958                 if (level < ref_path->lowest_level)
3959                         break;
3960
3961                 if (level >= 0)
3962                         bytenr = ref_path->nodes[level];
3963                 else
3964                         bytenr = ref_path->extent_start;
3965                 BUG_ON(bytenr == 0);
3966
3967                 parent = ref_path->nodes[level + 1];
3968                 ref_path->nodes[level + 1] = 0;
3969                 ref_path->current_level = level;
3970                 BUG_ON(parent == 0);
3971
3972                 key.objectid = bytenr;
3973                 key.offset = parent + 1;
3974                 key.type = BTRFS_EXTENT_REF_KEY;
3975
3976                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
3977                 if (ret < 0)
3978                         goto out;
3979                 BUG_ON(ret == 0);
3980
3981                 leaf = path->nodes[0];
3982                 nritems = btrfs_header_nritems(leaf);
3983                 if (path->slots[0] >= nritems) {
3984                         ret = btrfs_next_leaf(extent_root, path);
3985                         if (ret < 0)
3986                                 goto out;
3987                         if (ret > 0)
3988                                 goto next;
3989                         leaf = path->nodes[0];
3990                 }
3991
3992                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3993                 if (found_key.objectid == bytenr &&
3994                     found_key.type == BTRFS_EXTENT_REF_KEY) {
3995                         if (level < ref_path->shared_level)
3996                                 ref_path->shared_level = level;
3997                         goto found;
3998                 }
3999 next:
4000                 level--;
4001                 btrfs_release_path(extent_root, path);
4002                 cond_resched();
4003         }
4004         /* reached lowest level */
4005         ret = 1;
4006         goto out;
4007 walk_up:
4008         level = ref_path->current_level;
4009         while (level < BTRFS_MAX_LEVEL - 1) {
4010                 u64 ref_objectid;
4011
4012                 if (level >= 0)
4013                         bytenr = ref_path->nodes[level];
4014                 else
4015                         bytenr = ref_path->extent_start;
4016
4017                 BUG_ON(bytenr == 0);
4018
4019                 key.objectid = bytenr;
4020                 key.offset = 0;
4021                 key.type = BTRFS_EXTENT_REF_KEY;
4022
4023                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4024                 if (ret < 0)
4025                         goto out;
4026
4027                 leaf = path->nodes[0];
4028                 nritems = btrfs_header_nritems(leaf);
4029                 if (path->slots[0] >= nritems) {
4030                         ret = btrfs_next_leaf(extent_root, path);
4031                         if (ret < 0)
4032                                 goto out;
4033                         if (ret > 0) {
4034                                 /* the extent was freed by someone */
4035                                 if (ref_path->lowest_level == level)
4036                                         goto out;
4037                                 btrfs_release_path(extent_root, path);
4038                                 goto walk_down;
4039                         }
4040                         leaf = path->nodes[0];
4041                 }
4042
4043                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4044                 if (found_key.objectid != bytenr ||
4045                                 found_key.type != BTRFS_EXTENT_REF_KEY) {
4046                         /* the extent was freed by someone */
4047                         if (ref_path->lowest_level == level) {
4048                                 ret = 1;
4049                                 goto out;
4050                         }
4051                         btrfs_release_path(extent_root, path);
4052                         goto walk_down;
4053                 }
4054 found:
4055                 ref = btrfs_item_ptr(leaf, path->slots[0],
4056                                 struct btrfs_extent_ref);
4057                 ref_objectid = btrfs_ref_objectid(leaf, ref);
4058                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4059                         if (first_time) {
4060                                 level = (int)ref_objectid;
4061                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
4062                                 ref_path->lowest_level = level;
4063                                 ref_path->current_level = level;
4064                                 ref_path->nodes[level] = bytenr;
4065                         } else {
4066                                 WARN_ON(ref_objectid != level);
4067                         }
4068                 } else {
4069                         WARN_ON(level != -1);
4070                 }
4071                 first_time = 0;
4072
4073                 if (ref_path->lowest_level == level) {
4074                         ref_path->owner_objectid = ref_objectid;
4075                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4076                 }
4077
4078                 /*
4079                  * the block is tree root or the block isn't in reference
4080                  * counted tree.
4081                  */
4082                 if (found_key.objectid == found_key.offset ||
4083                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4084                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4085                         ref_path->root_generation =
4086                                 btrfs_ref_generation(leaf, ref);
4087                         if (level < 0) {
4088                                 /* special reference from the tree log */
4089                                 ref_path->nodes[0] = found_key.offset;
4090                                 ref_path->current_level = 0;
4091                         }
4092                         ret = 0;
4093                         goto out;
4094                 }
4095
4096                 level++;
4097                 BUG_ON(ref_path->nodes[level] != 0);
4098                 ref_path->nodes[level] = found_key.offset;
4099                 ref_path->current_level = level;
4100
4101                 /*
4102                  * the reference was created in the running transaction,
4103                  * no need to continue walking up.
4104                  */
4105                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4106                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4107                         ref_path->root_generation =
4108                                 btrfs_ref_generation(leaf, ref);
4109                         ret = 0;
4110                         goto out;
4111                 }
4112
4113                 btrfs_release_path(extent_root, path);
4114                 cond_resched();
4115         }
4116         /* reached max tree level, but no tree root found. */
4117         BUG();
4118 out:
4119         btrfs_free_path(path);
4120         return ret;
4121 }
4122
4123 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4124                                 struct btrfs_root *extent_root,
4125                                 struct btrfs_ref_path *ref_path,
4126                                 u64 extent_start)
4127 {
4128         memset(ref_path, 0, sizeof(*ref_path));
4129         ref_path->extent_start = extent_start;
4130
4131         return __next_ref_path(trans, extent_root, ref_path, 1);
4132 }
4133
4134 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4135                                struct btrfs_root *extent_root,
4136                                struct btrfs_ref_path *ref_path)
4137 {
4138         return __next_ref_path(trans, extent_root, ref_path, 0);
4139 }
4140
4141 static noinline int get_new_locations(struct inode *reloc_inode,
4142                                       struct btrfs_key *extent_key,
4143                                       u64 offset, int no_fragment,
4144                                       struct disk_extent **extents,
4145                                       int *nr_extents)
4146 {
4147         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4148         struct btrfs_path *path;
4149         struct btrfs_file_extent_item *fi;
4150         struct extent_buffer *leaf;
4151         struct disk_extent *exts = *extents;
4152         struct btrfs_key found_key;
4153         u64 cur_pos;
4154         u64 last_byte;
4155         u32 nritems;
4156         int nr = 0;
4157         int max = *nr_extents;
4158         int ret;
4159
4160         WARN_ON(!no_fragment && *extents);
4161         if (!exts) {
4162                 max = 1;
4163                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4164                 if (!exts)
4165                         return -ENOMEM;
4166         }
4167
4168         path = btrfs_alloc_path();
4169         BUG_ON(!path);
4170
4171         cur_pos = extent_key->objectid - offset;
4172         last_byte = extent_key->objectid + extent_key->offset;
4173         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4174                                        cur_pos, 0);
4175         if (ret < 0)
4176                 goto out;
4177         if (ret > 0) {
4178                 ret = -ENOENT;
4179                 goto out;
4180         }
4181
4182         while (1) {
4183                 leaf = path->nodes[0];
4184                 nritems = btrfs_header_nritems(leaf);
4185                 if (path->slots[0] >= nritems) {
4186                         ret = btrfs_next_leaf(root, path);
4187                         if (ret < 0)
4188                                 goto out;
4189                         if (ret > 0)
4190                                 break;
4191                         leaf = path->nodes[0];
4192                 }
4193
4194                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4195                 if (found_key.offset != cur_pos ||
4196                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
4197                     found_key.objectid != reloc_inode->i_ino)
4198                         break;
4199
4200                 fi = btrfs_item_ptr(leaf, path->slots[0],
4201                                     struct btrfs_file_extent_item);
4202                 if (btrfs_file_extent_type(leaf, fi) !=
4203                     BTRFS_FILE_EXTENT_REG ||
4204                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4205                         break;
4206
4207                 if (nr == max) {
4208                         struct disk_extent *old = exts;
4209                         max *= 2;
4210                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4211                         memcpy(exts, old, sizeof(*exts) * nr);
4212                         if (old != *extents)
4213                                 kfree(old);
4214                 }
4215
4216                 exts[nr].disk_bytenr =
4217                         btrfs_file_extent_disk_bytenr(leaf, fi);
4218                 exts[nr].disk_num_bytes =
4219                         btrfs_file_extent_disk_num_bytes(leaf, fi);
4220                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4221                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4222                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4223                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4224                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4225                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4226                                                                            fi);
4227                 BUG_ON(exts[nr].offset > 0);
4228                 BUG_ON(exts[nr].compression || exts[nr].encryption);
4229                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4230
4231                 cur_pos += exts[nr].num_bytes;
4232                 nr++;
4233
4234                 if (cur_pos + offset >= last_byte)
4235                         break;
4236
4237                 if (no_fragment) {
4238                         ret = 1;
4239                         goto out;
4240                 }
4241                 path->slots[0]++;
4242         }
4243
4244         BUG_ON(cur_pos + offset > last_byte);
4245         if (cur_pos + offset < last_byte) {
4246                 ret = -ENOENT;
4247                 goto out;
4248         }
4249         ret = 0;
4250 out:
4251         btrfs_free_path(path);
4252         if (ret) {
4253                 if (exts != *extents)
4254                         kfree(exts);
4255         } else {
4256                 *extents = exts;
4257                 *nr_extents = nr;
4258         }
4259         return ret;
4260 }
4261
4262 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4263                                         struct btrfs_root *root,
4264                                         struct btrfs_path *path,
4265                                         struct btrfs_key *extent_key,
4266                                         struct btrfs_key *leaf_key,
4267                                         struct btrfs_ref_path *ref_path,
4268                                         struct disk_extent *new_extents,
4269                                         int nr_extents)
4270 {
4271         struct extent_buffer *leaf;
4272         struct btrfs_file_extent_item *fi;
4273         struct inode *inode = NULL;
4274         struct btrfs_key key;
4275         u64 lock_start = 0;
4276         u64 lock_end = 0;
4277         u64 num_bytes;
4278         u64 ext_offset;
4279         u64 search_end = (u64)-1;
4280         u32 nritems;
4281         int nr_scaned = 0;
4282         int extent_locked = 0;
4283         int extent_type;
4284         int ret;
4285
4286         memcpy(&key, leaf_key, sizeof(key));
4287         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4288                 if (key.objectid < ref_path->owner_objectid ||
4289                     (key.objectid == ref_path->owner_objectid &&
4290                      key.type < BTRFS_EXTENT_DATA_KEY)) {
4291                         key.objectid = ref_path->owner_objectid;
4292                         key.type = BTRFS_EXTENT_DATA_KEY;
4293                         key.offset = 0;
4294                 }
4295         }
4296
4297         while (1) {
4298                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4299                 if (ret < 0)
4300                         goto out;
4301
4302                 leaf = path->nodes[0];
4303                 nritems = btrfs_header_nritems(leaf);
4304 next:
4305                 if (extent_locked && ret > 0) {
4306                         /*
4307                          * the file extent item was modified by someone
4308                          * before the extent got locked.
4309                          */
4310                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4311                                       lock_end, GFP_NOFS);
4312                         extent_locked = 0;
4313                 }
4314
4315                 if (path->slots[0] >= nritems) {
4316                         if (++nr_scaned > 2)
4317                                 break;
4318
4319                         BUG_ON(extent_locked);
4320                         ret = btrfs_next_leaf(root, path);
4321                         if (ret < 0)
4322                                 goto out;
4323                         if (ret > 0)
4324                                 break;
4325                         leaf = path->nodes[0];
4326                         nritems = btrfs_header_nritems(leaf);
4327                 }
4328
4329                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4330
4331                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4332                         if ((key.objectid > ref_path->owner_objectid) ||
4333                             (key.objectid == ref_path->owner_objectid &&
4334                              key.type > BTRFS_EXTENT_DATA_KEY) ||
4335                             key.offset >= search_end)
4336                                 break;
4337                 }
4338
4339                 if (inode && key.objectid != inode->i_ino) {
4340                         BUG_ON(extent_locked);
4341                         btrfs_release_path(root, path);
4342                         mutex_unlock(&inode->i_mutex);
4343                         iput(inode);
4344                         inode = NULL;
4345                         continue;
4346                 }
4347
4348                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4349                         path->slots[0]++;
4350                         ret = 1;
4351                         goto next;
4352                 }
4353                 fi = btrfs_item_ptr(leaf, path->slots[0],
4354                                     struct btrfs_file_extent_item);
4355                 extent_type = btrfs_file_extent_type(leaf, fi);
4356                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4357                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4358                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4359                      extent_key->objectid)) {
4360                         path->slots[0]++;
4361                         ret = 1;
4362                         goto next;
4363                 }
4364
4365                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4366                 ext_offset = btrfs_file_extent_offset(leaf, fi);
4367
4368                 if (search_end == (u64)-1) {
4369                         search_end = key.offset - ext_offset +
4370                                 btrfs_file_extent_ram_bytes(leaf, fi);
4371                 }
4372
4373                 if (!extent_locked) {
4374                         lock_start = key.offset;
4375                         lock_end = lock_start + num_bytes - 1;
4376                 } else {
4377                         if (lock_start > key.offset ||
4378                             lock_end + 1 < key.offset + num_bytes) {
4379                                 unlock_extent(&BTRFS_I(inode)->io_tree,
4380                                               lock_start, lock_end, GFP_NOFS);
4381                                 extent_locked = 0;
4382                         }
4383                 }
4384
4385                 if (!inode) {
4386                         btrfs_release_path(root, path);
4387
4388                         inode = btrfs_iget_locked(root->fs_info->sb,
4389                                                   key.objectid, root);
4390                         if (inode->i_state & I_NEW) {
4391                                 BTRFS_I(inode)->root = root;
4392                                 BTRFS_I(inode)->location.objectid =
4393                                         key.objectid;
4394                                 BTRFS_I(inode)->location.type =
4395                                         BTRFS_INODE_ITEM_KEY;
4396                                 BTRFS_I(inode)->location.offset = 0;
4397                                 btrfs_read_locked_inode(inode);
4398                                 unlock_new_inode(inode);
4399                         }
4400                         /*
4401                          * some code call btrfs_commit_transaction while
4402                          * holding the i_mutex, so we can't use mutex_lock
4403                          * here.
4404                          */
4405                         if (is_bad_inode(inode) ||
4406                             !mutex_trylock(&inode->i_mutex)) {
4407                                 iput(inode);
4408                                 inode = NULL;
4409                                 key.offset = (u64)-1;
4410                                 goto skip;
4411                         }
4412                 }
4413
4414                 if (!extent_locked) {
4415                         struct btrfs_ordered_extent *ordered;
4416
4417                         btrfs_release_path(root, path);
4418
4419                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4420                                     lock_end, GFP_NOFS);
4421                         ordered = btrfs_lookup_first_ordered_extent(inode,
4422                                                                     lock_end);
4423                         if (ordered &&
4424                             ordered->file_offset <= lock_end &&
4425                             ordered->file_offset + ordered->len > lock_start) {
4426                                 unlock_extent(&BTRFS_I(inode)->io_tree,
4427                                               lock_start, lock_end, GFP_NOFS);
4428                                 btrfs_start_ordered_extent(inode, ordered, 1);
4429                                 btrfs_put_ordered_extent(ordered);
4430                                 key.offset += num_bytes;
4431                                 goto skip;
4432                         }
4433                         if (ordered)
4434                                 btrfs_put_ordered_extent(ordered);
4435
4436                         extent_locked = 1;
4437                         continue;
4438                 }
4439
4440                 if (nr_extents == 1) {
4441                         /* update extent pointer in place */
4442                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
4443                                                 new_extents[0].disk_bytenr);
4444                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4445                                                 new_extents[0].disk_num_bytes);
4446                         btrfs_mark_buffer_dirty(leaf);
4447
4448                         btrfs_drop_extent_cache(inode, key.offset,
4449                                                 key.offset + num_bytes - 1, 0);
4450
4451                         ret = btrfs_inc_extent_ref(trans, root,
4452                                                 new_extents[0].disk_bytenr,
4453                                                 new_extents[0].disk_num_bytes,
4454                                                 leaf->start,
4455                                                 root->root_key.objectid,
4456                                                 trans->transid,
4457                                                 key.objectid);
4458                         BUG_ON(ret);
4459
4460                         ret = btrfs_free_extent(trans, root,
4461                                                 extent_key->objectid,
4462                                                 extent_key->offset,
4463                                                 leaf->start,
4464                                                 btrfs_header_owner(leaf),
4465                                                 btrfs_header_generation(leaf),
4466                                                 key.objectid, 0);
4467                         BUG_ON(ret);
4468
4469                         btrfs_release_path(root, path);
4470                         key.offset += num_bytes;
4471                 } else {
4472                         BUG_ON(1);
4473 #if 0
4474                         u64 alloc_hint;
4475                         u64 extent_len;
4476                         int i;
4477                         /*
4478                          * drop old extent pointer at first, then insert the
4479                          * new pointers one bye one
4480                          */
4481                         btrfs_release_path(root, path);
4482                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
4483                                                  key.offset + num_bytes,
4484                                                  key.offset, &alloc_hint);
4485                         BUG_ON(ret);
4486
4487                         for (i = 0; i < nr_extents; i++) {
4488                                 if (ext_offset >= new_extents[i].num_bytes) {
4489                                         ext_offset -= new_extents[i].num_bytes;
4490                                         continue;
4491                                 }
4492                                 extent_len = min(new_extents[i].num_bytes -
4493                                                  ext_offset, num_bytes);
4494
4495                                 ret = btrfs_insert_empty_item(trans, root,
4496                                                               path, &key,
4497                                                               sizeof(*fi));
4498                                 BUG_ON(ret);
4499
4500                                 leaf = path->nodes[0];
4501                                 fi = btrfs_item_ptr(leaf, path->slots[0],
4502                                                 struct btrfs_file_extent_item);
4503                                 btrfs_set_file_extent_generation(leaf, fi,
4504                                                         trans->transid);
4505                                 btrfs_set_file_extent_type(leaf, fi,
4506                                                         BTRFS_FILE_EXTENT_REG);
4507                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4508                                                 new_extents[i].disk_bytenr);
4509                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4510                                                 new_extents[i].disk_num_bytes);
4511                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
4512                                                 new_extents[i].ram_bytes);
4513
4514                                 btrfs_set_file_extent_compression(leaf, fi,
4515                                                 new_extents[i].compression);
4516                                 btrfs_set_file_extent_encryption(leaf, fi,
4517                                                 new_extents[i].encryption);
4518                                 btrfs_set_file_extent_other_encoding(leaf, fi,
4519                                                 new_extents[i].other_encoding);
4520
4521                                 btrfs_set_file_extent_num_bytes(leaf, fi,
4522                                                         extent_len);
4523                                 ext_offset += new_extents[i].offset;
4524                                 btrfs_set_file_extent_offset(leaf, fi,
4525                                                         ext_offset);
4526                                 btrfs_mark_buffer_dirty(leaf);
4527
4528                                 btrfs_drop_extent_cache(inode, key.offset,
4529                                                 key.offset + extent_len - 1, 0);
4530
4531                                 ret = btrfs_inc_extent_ref(trans, root,
4532                                                 new_extents[i].disk_bytenr,
4533                                                 new_extents[i].disk_num_bytes,
4534                                                 leaf->start,
4535                                                 root->root_key.objectid,
4536                                                 trans->transid, key.objectid);
4537                                 BUG_ON(ret);
4538                                 btrfs_release_path(root, path);
4539
4540                                 inode_add_bytes(inode, extent_len);
4541
4542                                 ext_offset = 0;
4543                                 num_bytes -= extent_len;
4544                                 key.offset += extent_len;
4545
4546                                 if (num_bytes == 0)
4547                                         break;
4548                         }
4549                         BUG_ON(i >= nr_extents);
4550 #endif
4551                 }
4552
4553                 if (extent_locked) {
4554                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4555                                       lock_end, GFP_NOFS);
4556                         extent_locked = 0;
4557                 }
4558 skip:
4559                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4560                     key.offset >= search_end)
4561                         break;
4562
4563                 cond_resched();
4564         }
4565         ret = 0;
4566 out:
4567         btrfs_release_path(root, path);
4568         if (inode) {
4569                 mutex_unlock(&inode->i_mutex);
4570                 if (extent_locked) {
4571                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4572                                       lock_end, GFP_NOFS);
4573                 }
4574                 iput(inode);
4575         }
4576         return ret;
4577 }
4578
4579 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4580                                struct btrfs_root *root,
4581                                struct extent_buffer *buf, u64 orig_start)
4582 {
4583         int level;
4584         int ret;
4585
4586         BUG_ON(btrfs_header_generation(buf) != trans->transid);
4587         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4588
4589         level = btrfs_header_level(buf);
4590         if (level == 0) {
4591                 struct btrfs_leaf_ref *ref;
4592                 struct btrfs_leaf_ref *orig_ref;
4593
4594                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4595                 if (!orig_ref)
4596                         return -ENOENT;
4597
4598                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4599                 if (!ref) {
4600                         btrfs_free_leaf_ref(root, orig_ref);
4601                         return -ENOMEM;
4602                 }
4603
4604                 ref->nritems = orig_ref->nritems;
4605                 memcpy(ref->extents, orig_ref->extents,
4606                         sizeof(ref->extents[0]) * ref->nritems);
4607
4608                 btrfs_free_leaf_ref(root, orig_ref);
4609
4610                 ref->root_gen = trans->transid;
4611                 ref->bytenr = buf->start;
4612                 ref->owner = btrfs_header_owner(buf);
4613                 ref->generation = btrfs_header_generation(buf);
4614
4615                 ret = btrfs_add_leaf_ref(root, ref, 0);
4616                 WARN_ON(ret);
4617                 btrfs_free_leaf_ref(root, ref);
4618         }
4619         return 0;
4620 }
4621
4622 static noinline int invalidate_extent_cache(struct btrfs_root *root,
4623                                         struct extent_buffer *leaf,
4624                                         struct btrfs_block_group_cache *group,
4625                                         struct btrfs_root *target_root)
4626 {
4627         struct btrfs_key key;
4628         struct inode *inode = NULL;
4629         struct btrfs_file_extent_item *fi;
4630         u64 num_bytes;
4631         u64 skip_objectid = 0;
4632         u32 nritems;
4633         u32 i;
4634
4635         nritems = btrfs_header_nritems(leaf);
4636         for (i = 0; i < nritems; i++) {
4637                 btrfs_item_key_to_cpu(leaf, &key, i);
4638                 if (key.objectid == skip_objectid ||
4639                     key.type != BTRFS_EXTENT_DATA_KEY)
4640                         continue;
4641                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4642                 if (btrfs_file_extent_type(leaf, fi) ==
4643                     BTRFS_FILE_EXTENT_INLINE)
4644                         continue;
4645                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4646                         continue;
4647                 if (!inode || inode->i_ino != key.objectid) {
4648                         iput(inode);
4649                         inode = btrfs_ilookup(target_root->fs_info->sb,
4650                                               key.objectid, target_root, 1);
4651                 }
4652                 if (!inode) {
4653                         skip_objectid = key.objectid;
4654                         continue;
4655                 }
4656                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4657
4658                 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4659                             key.offset + num_bytes - 1, GFP_NOFS);
4660                 btrfs_drop_extent_cache(inode, key.offset,
4661                                         key.offset + num_bytes - 1, 1);
4662                 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4663                               key.offset + num_bytes - 1, GFP_NOFS);
4664                 cond_resched();
4665         }
4666         iput(inode);
4667         return 0;
4668 }
4669
4670 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4671                                         struct btrfs_root *root,
4672                                         struct extent_buffer *leaf,
4673                                         struct btrfs_block_group_cache *group,
4674                                         struct inode *reloc_inode)
4675 {
4676         struct btrfs_key key;
4677         struct btrfs_key extent_key;
4678         struct btrfs_file_extent_item *fi;
4679         struct btrfs_leaf_ref *ref;
4680         struct disk_extent *new_extent;
4681         u64 bytenr;
4682         u64 num_bytes;
4683         u32 nritems;
4684         u32 i;
4685         int ext_index;
4686         int nr_extent;
4687         int ret;
4688
4689         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4690         BUG_ON(!new_extent);
4691
4692         ref = btrfs_lookup_leaf_ref(root, leaf->start);
4693         BUG_ON(!ref);
4694
4695         ext_index = -1;
4696         nritems = btrfs_header_nritems(leaf);
4697         for (i = 0; i < nritems; i++) {
4698                 btrfs_item_key_to_cpu(leaf, &key, i);
4699                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4700                         continue;
4701                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4702                 if (btrfs_file_extent_type(leaf, fi) ==
4703                     BTRFS_FILE_EXTENT_INLINE)
4704                         continue;
4705                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4706                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4707                 if (bytenr == 0)
4708                         continue;
4709
4710                 ext_index++;
4711                 if (bytenr >= group->key.objectid + group->key.offset ||
4712                     bytenr + num_bytes <= group->key.objectid)
4713                         continue;
4714
4715                 extent_key.objectid = bytenr;
4716                 extent_key.offset = num_bytes;
4717                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4718                 nr_extent = 1;
4719                 ret = get_new_locations(reloc_inode, &extent_key,
4720                                         group->key.objectid, 1,
4721                                         &new_extent, &nr_extent);
4722                 if (ret > 0)
4723                         continue;
4724                 BUG_ON(ret < 0);
4725
4726                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4727                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4728                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4729                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4730
4731                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4732                                                 new_extent->disk_bytenr);
4733                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4734                                                 new_extent->disk_num_bytes);
4735                 btrfs_mark_buffer_dirty(leaf);
4736
4737                 ret = btrfs_inc_extent_ref(trans, root,
4738                                         new_extent->disk_bytenr,
4739                                         new_extent->disk_num_bytes,
4740                                         leaf->start,
4741                                         root->root_key.objectid,
4742                                         trans->transid, key.objectid);
4743                 BUG_ON(ret);
4744
4745                 ret = btrfs_free_extent(trans, root,
4746                                         bytenr, num_bytes, leaf->start,
4747                                         btrfs_header_owner(leaf),
4748                                         btrfs_header_generation(leaf),
4749                                         key.objectid, 0);
4750                 BUG_ON(ret);
4751                 cond_resched();
4752         }
4753         kfree(new_extent);
4754         BUG_ON(ext_index + 1 != ref->nritems);
4755         btrfs_free_leaf_ref(root, ref);
4756         return 0;
4757 }
4758
4759 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4760                           struct btrfs_root *root)
4761 {
4762         struct btrfs_root *reloc_root;
4763         int ret;
4764
4765         if (root->reloc_root) {
4766                 reloc_root = root->reloc_root;
4767                 root->reloc_root = NULL;
4768                 list_add(&reloc_root->dead_list,
4769                          &root->fs_info->dead_reloc_roots);
4770
4771                 btrfs_set_root_bytenr(&reloc_root->root_item,
4772                                       reloc_root->node->start);
4773                 btrfs_set_root_level(&root->root_item,
4774                                      btrfs_header_level(reloc_root->node));
4775                 memset(&reloc_root->root_item.drop_progress, 0,
4776                         sizeof(struct btrfs_disk_key));
4777                 reloc_root->root_item.drop_level = 0;
4778
4779                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4780                                         &reloc_root->root_key,
4781                                         &reloc_root->root_item);
4782                 BUG_ON(ret);
4783         }
4784         return 0;
4785 }
4786
4787 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4788 {
4789         struct btrfs_trans_handle *trans;
4790         struct btrfs_root *reloc_root;
4791         struct btrfs_root *prev_root = NULL;
4792         struct list_head dead_roots;
4793         int ret;
4794         unsigned long nr;
4795
4796         INIT_LIST_HEAD(&dead_roots);
4797         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4798
4799         while (!list_empty(&dead_roots)) {
4800                 reloc_root = list_entry(dead_roots.prev,
4801                                         struct btrfs_root, dead_list);
4802                 list_del_init(&reloc_root->dead_list);
4803
4804                 BUG_ON(reloc_root->commit_root != NULL);
4805                 while (1) {
4806                         trans = btrfs_join_transaction(root, 1);
4807                         BUG_ON(!trans);
4808
4809                         mutex_lock(&root->fs_info->drop_mutex);
4810                         ret = btrfs_drop_snapshot(trans, reloc_root);
4811                         if (ret != -EAGAIN)
4812                                 break;
4813                         mutex_unlock(&root->fs_info->drop_mutex);
4814
4815                         nr = trans->blocks_used;
4816                         ret = btrfs_end_transaction(trans, root);
4817                         BUG_ON(ret);
4818                         btrfs_btree_balance_dirty(root, nr);
4819                 }
4820
4821                 free_extent_buffer(reloc_root->node);
4822
4823                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4824                                      &reloc_root->root_key);
4825                 BUG_ON(ret);
4826                 mutex_unlock(&root->fs_info->drop_mutex);
4827
4828                 nr = trans->blocks_used;
4829                 ret = btrfs_end_transaction(trans, root);
4830                 BUG_ON(ret);
4831                 btrfs_btree_balance_dirty(root, nr);
4832
4833                 kfree(prev_root);
4834                 prev_root = reloc_root;
4835         }
4836         if (prev_root) {
4837                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
4838                 kfree(prev_root);
4839         }
4840         return 0;
4841 }
4842
4843 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
4844 {
4845         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
4846         return 0;
4847 }
4848
4849 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
4850 {
4851         struct btrfs_root *reloc_root;
4852         struct btrfs_trans_handle *trans;
4853         struct btrfs_key location;
4854         int found;
4855         int ret;
4856
4857         mutex_lock(&root->fs_info->tree_reloc_mutex);
4858         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
4859         BUG_ON(ret);
4860         found = !list_empty(&root->fs_info->dead_reloc_roots);
4861         mutex_unlock(&root->fs_info->tree_reloc_mutex);
4862
4863         if (found) {
4864                 trans = btrfs_start_transaction(root, 1);
4865                 BUG_ON(!trans);
4866                 ret = btrfs_commit_transaction(trans, root);
4867                 BUG_ON(ret);
4868         }
4869
4870         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
4871         location.offset = (u64)-1;
4872         location.type = BTRFS_ROOT_ITEM_KEY;
4873
4874         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
4875         BUG_ON(!reloc_root);
4876         btrfs_orphan_cleanup(reloc_root);
4877         return 0;
4878 }
4879
4880 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
4881                                     struct btrfs_root *root)
4882 {
4883         struct btrfs_root *reloc_root;
4884         struct extent_buffer *eb;
4885         struct btrfs_root_item *root_item;
4886         struct btrfs_key root_key;
4887         int ret;
4888
4889         BUG_ON(!root->ref_cows);
4890         if (root->reloc_root)
4891                 return 0;
4892
4893         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
4894         BUG_ON(!root_item);
4895
4896         ret = btrfs_copy_root(trans, root, root->commit_root,
4897                               &eb, BTRFS_TREE_RELOC_OBJECTID);
4898         BUG_ON(ret);
4899
4900         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4901         root_key.offset = root->root_key.objectid;
4902         root_key.type = BTRFS_ROOT_ITEM_KEY;
4903
4904         memcpy(root_item, &root->root_item, sizeof(root_item));
4905         btrfs_set_root_refs(root_item, 0);
4906         btrfs_set_root_bytenr(root_item, eb->start);
4907         btrfs_set_root_level(root_item, btrfs_header_level(eb));
4908         btrfs_set_root_generation(root_item, trans->transid);
4909
4910         btrfs_tree_unlock(eb);
4911         free_extent_buffer(eb);
4912
4913         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
4914                                 &root_key, root_item);
4915         BUG_ON(ret);
4916         kfree(root_item);
4917
4918         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
4919                                                  &root_key);
4920         BUG_ON(!reloc_root);
4921         reloc_root->last_trans = trans->transid;
4922         reloc_root->commit_root = NULL;
4923         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
4924
4925         root->reloc_root = reloc_root;
4926         return 0;
4927 }
4928
4929 /*
4930  * Core function of space balance.
4931  *
4932  * The idea is using reloc trees to relocate tree blocks in reference
4933  * counted roots. There is one reloc tree for each subvol, and all
4934  * reloc trees share same root key objectid. Reloc trees are snapshots
4935  * of the latest committed roots of subvols (root->commit_root).
4936  *
4937  * To relocate a tree block referenced by a subvol, there are two steps.
4938  * COW the block through subvol's reloc tree, then update block pointer
4939  * in the subvol to point to the new block. Since all reloc trees share
4940  * same root key objectid, doing special handing for tree blocks owned
4941  * by them is easy. Once a tree block has been COWed in one reloc tree,
4942  * we can use the resulting new block directly when the same block is
4943  * required to COW again through other reloc trees. By this way, relocated
4944  * tree blocks are shared between reloc trees, so they are also shared
4945  * between subvols.
4946  */
4947 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
4948                                       struct btrfs_root *root,
4949                                       struct btrfs_path *path,
4950                                       struct btrfs_key *first_key,
4951                                       struct btrfs_ref_path *ref_path,
4952                                       struct btrfs_block_group_cache *group,
4953                                       struct inode *reloc_inode)
4954 {
4955         struct btrfs_root *reloc_root;
4956         struct extent_buffer *eb = NULL;
4957         struct btrfs_key *keys;
4958         u64 *nodes;
4959         int level;
4960         int shared_level;
4961         int lowest_level = 0;
4962         int ret;
4963
4964         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
4965                 lowest_level = ref_path->owner_objectid;
4966
4967         if (!root->ref_cows) {
4968                 path->lowest_level = lowest_level;
4969                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
4970                 BUG_ON(ret < 0);
4971                 path->lowest_level = 0;
4972                 btrfs_release_path(root, path);
4973                 return 0;
4974         }
4975
4976         mutex_lock(&root->fs_info->tree_reloc_mutex);
4977         ret = init_reloc_tree(trans, root);
4978         BUG_ON(ret);
4979         reloc_root = root->reloc_root;
4980
4981         shared_level = ref_path->shared_level;
4982         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
4983
4984         keys = ref_path->node_keys;
4985         nodes = ref_path->new_nodes;
4986         memset(&keys[shared_level + 1], 0,
4987                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
4988         memset(&nodes[shared_level + 1], 0,
4989                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
4990
4991         if (nodes[lowest_level] == 0) {
4992                 path->lowest_level = lowest_level;
4993                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
4994                                         0, 1);
4995                 BUG_ON(ret);
4996                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
4997                         eb = path->nodes[level];
4998                         if (!eb || eb == reloc_root->node)
4999                                 break;
5000                         nodes[level] = eb->start;
5001                         if (level == 0)
5002                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
5003                         else
5004                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
5005                 }
5006                 if (nodes[0] &&
5007                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5008                         eb = path->nodes[0];
5009                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
5010                                                       group, reloc_inode);
5011                         BUG_ON(ret);
5012                 }
5013                 btrfs_release_path(reloc_root, path);
5014         } else {
5015                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5016                                        lowest_level);
5017                 BUG_ON(ret);
5018         }
5019
5020         /*
5021          * replace tree blocks in the fs tree with tree blocks in
5022          * the reloc tree.
5023          */
5024         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5025         BUG_ON(ret < 0);
5026
5027         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5028                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5029                                         0, 0);
5030                 BUG_ON(ret);
5031                 extent_buffer_get(path->nodes[0]);
5032                 eb = path->nodes[0];
5033                 btrfs_release_path(reloc_root, path);
5034                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
5035                 BUG_ON(ret);
5036                 free_extent_buffer(eb);
5037         }
5038
5039         mutex_unlock(&root->fs_info->tree_reloc_mutex);
5040         path->lowest_level = 0;
5041         return 0;
5042 }
5043
5044 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5045                                         struct btrfs_root *root,
5046                                         struct btrfs_path *path,
5047                                         struct btrfs_key *first_key,
5048                                         struct btrfs_ref_path *ref_path)
5049 {
5050         int ret;
5051
5052         ret = relocate_one_path(trans, root, path, first_key,
5053                                 ref_path, NULL, NULL);
5054         BUG_ON(ret);
5055
5056         return 0;
5057 }
5058
5059 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5060                                     struct btrfs_root *extent_root,
5061                                     struct btrfs_path *path,
5062                                     struct btrfs_key *extent_key)
5063 {
5064         int ret;
5065
5066         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5067         if (ret)
5068                 goto out;
5069         ret = btrfs_del_item(trans, extent_root, path);
5070 out:
5071         btrfs_release_path(extent_root, path);
5072         return ret;
5073 }
5074
5075 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5076                                                 struct btrfs_ref_path *ref_path)
5077 {
5078         struct btrfs_key root_key;
5079
5080         root_key.objectid = ref_path->root_objectid;
5081         root_key.type = BTRFS_ROOT_ITEM_KEY;
5082         if (is_cowonly_root(ref_path->root_objectid))
5083                 root_key.offset = 0;
5084         else
5085                 root_key.offset = (u64)-1;
5086
5087         return btrfs_read_fs_root_no_name(fs_info, &root_key);
5088 }
5089
5090 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5091                                         struct btrfs_path *path,
5092                                         struct btrfs_key *extent_key,
5093                                         struct btrfs_block_group_cache *group,
5094                                         struct inode *reloc_inode, int pass)
5095 {
5096         struct btrfs_trans_handle *trans;
5097         struct btrfs_root *found_root;
5098         struct btrfs_ref_path *ref_path = NULL;
5099         struct disk_extent *new_extents = NULL;
5100         int nr_extents = 0;
5101         int loops;
5102         int ret;
5103         int level;
5104         struct btrfs_key first_key;
5105         u64 prev_block = 0;
5106
5107
5108         trans = btrfs_start_transaction(extent_root, 1);
5109         BUG_ON(!trans);
5110
5111         if (extent_key->objectid == 0) {
5112                 ret = del_extent_zero(trans, extent_root, path, extent_key);
5113                 goto out;
5114         }
5115
5116         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5117         if (!ref_path) {
5118                 ret = -ENOMEM;
5119                 goto out;
5120         }
5121
5122         for (loops = 0; ; loops++) {
5123                 if (loops == 0) {
5124                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5125                                                    extent_key->objectid);
5126                 } else {
5127                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5128                 }
5129                 if (ret < 0)
5130                         goto out;
5131                 if (ret > 0)
5132                         break;
5133
5134                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5135                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5136                         continue;
5137
5138                 found_root = read_ref_root(extent_root->fs_info, ref_path);
5139                 BUG_ON(!found_root);
5140                 /*
5141                  * for reference counted tree, only process reference paths
5142                  * rooted at the latest committed root.
5143                  */
5144                 if (found_root->ref_cows &&
5145                     ref_path->root_generation != found_root->root_key.offset)
5146                         continue;
5147
5148                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5149                         if (pass == 0) {
5150                                 /*
5151                                  * copy data extents to new locations
5152                                  */
5153                                 u64 group_start = group->key.objectid;
5154                                 ret = relocate_data_extent(reloc_inode,
5155                                                            extent_key,
5156                                                            group_start);
5157                                 if (ret < 0)
5158                                         goto out;
5159                                 break;
5160                         }
5161                         level = 0;
5162                 } else {
5163                         level = ref_path->owner_objectid;
5164                 }
5165
5166                 if (prev_block != ref_path->nodes[level]) {
5167                         struct extent_buffer *eb;
5168                         u64 block_start = ref_path->nodes[level];
5169                         u64 block_size = btrfs_level_size(found_root, level);
5170
5171                         eb = read_tree_block(found_root, block_start,
5172                                              block_size, 0);
5173                         btrfs_tree_lock(eb);
5174                         BUG_ON(level != btrfs_header_level(eb));
5175
5176                         if (level == 0)
5177                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
5178                         else
5179                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
5180
5181                         btrfs_tree_unlock(eb);
5182                         free_extent_buffer(eb);
5183                         prev_block = block_start;
5184                 }
5185
5186                 mutex_lock(&extent_root->fs_info->trans_mutex);
5187                 btrfs_record_root_in_trans(found_root);
5188                 mutex_unlock(&extent_root->fs_info->trans_mutex);
5189                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5190                         /*
5191                          * try to update data extent references while
5192                          * keeping metadata shared between snapshots.
5193                          */
5194                         if (pass == 1) {
5195                                 ret = relocate_one_path(trans, found_root,
5196                                                 path, &first_key, ref_path,
5197                                                 group, reloc_inode);
5198                                 if (ret < 0)
5199                                         goto out;
5200                                 continue;
5201                         }
5202                         /*
5203                          * use fallback method to process the remaining
5204                          * references.
5205                          */
5206                         if (!new_extents) {
5207                                 u64 group_start = group->key.objectid;
5208                                 new_extents = kmalloc(sizeof(*new_extents),
5209                                                       GFP_NOFS);
5210                                 nr_extents = 1;
5211                                 ret = get_new_locations(reloc_inode,
5212                                                         extent_key,
5213                                                         group_start, 1,
5214                                                         &new_extents,
5215                                                         &nr_extents);
5216                                 if (ret)
5217                                         goto out;
5218                         }
5219                         ret = replace_one_extent(trans, found_root,
5220                                                 path, extent_key,
5221                                                 &first_key, ref_path,
5222                                                 new_extents, nr_extents);
5223                 } else {
5224                         ret = relocate_tree_block(trans, found_root, path,
5225                                                   &first_key, ref_path);
5226                 }
5227                 if (ret < 0)
5228                         goto out;
5229         }
5230         ret = 0;
5231 out:
5232         btrfs_end_transaction(trans, extent_root);
5233         kfree(new_extents);
5234         kfree(ref_path);
5235         return ret;
5236 }
5237
5238 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5239 {
5240         u64 num_devices;
5241         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5242                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5243
5244         num_devices = root->fs_info->fs_devices->rw_devices;
5245         if (num_devices == 1) {
5246                 stripped |= BTRFS_BLOCK_GROUP_DUP;
5247                 stripped = flags & ~stripped;
5248
5249                 /* turn raid0 into single device chunks */
5250                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5251                         return stripped;
5252
5253                 /* turn mirroring into duplication */
5254                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5255                              BTRFS_BLOCK_GROUP_RAID10))
5256                         return stripped | BTRFS_BLOCK_GROUP_DUP;
5257                 return flags;
5258         } else {
5259                 /* they already had raid on here, just return */
5260                 if (flags & stripped)
5261                         return flags;
5262
5263                 stripped |= BTRFS_BLOCK_GROUP_DUP;
5264                 stripped = flags & ~stripped;
5265
5266                 /* switch duplicated blocks with raid1 */
5267                 if (flags & BTRFS_BLOCK_GROUP_DUP)
5268                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
5269
5270                 /* turn single device chunks into raid0 */
5271                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5272         }
5273         return flags;
5274 }
5275
5276 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5277                      struct btrfs_block_group_cache *shrink_block_group,
5278                      int force)
5279 {
5280         struct btrfs_trans_handle *trans;
5281         u64 new_alloc_flags;
5282         u64 calc;
5283
5284         spin_lock(&shrink_block_group->lock);
5285         if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5286                 spin_unlock(&shrink_block_group->lock);
5287
5288                 trans = btrfs_start_transaction(root, 1);
5289                 spin_lock(&shrink_block_group->lock);
5290
5291                 new_alloc_flags = update_block_group_flags(root,
5292                                                    shrink_block_group->flags);
5293                 if (new_alloc_flags != shrink_block_group->flags) {
5294                         calc =
5295                              btrfs_block_group_used(&shrink_block_group->item);
5296                 } else {
5297                         calc = shrink_block_group->key.offset;
5298                 }
5299                 spin_unlock(&shrink_block_group->lock);
5300
5301                 do_chunk_alloc(trans, root->fs_info->extent_root,
5302                                calc + 2 * 1024 * 1024, new_alloc_flags, force);
5303
5304                 btrfs_end_transaction(trans, root);
5305         } else
5306                 spin_unlock(&shrink_block_group->lock);
5307         return 0;
5308 }
5309
5310 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5311                                  struct btrfs_root *root,
5312                                  u64 objectid, u64 size)
5313 {
5314         struct btrfs_path *path;
5315         struct btrfs_inode_item *item;
5316         struct extent_buffer *leaf;
5317         int ret;
5318
5319         path = btrfs_alloc_path();
5320         if (!path)
5321                 return -ENOMEM;
5322
5323         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5324         if (ret)
5325                 goto out;
5326
5327         leaf = path->nodes[0];
5328         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5329         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5330         btrfs_set_inode_generation(leaf, item, 1);
5331         btrfs_set_inode_size(leaf, item, size);
5332         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5333         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5334         btrfs_mark_buffer_dirty(leaf);
5335         btrfs_release_path(root, path);
5336 out:
5337         btrfs_free_path(path);
5338         return ret;
5339 }
5340
5341 static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5342                                         struct btrfs_block_group_cache *group)
5343 {
5344         struct inode *inode = NULL;
5345         struct btrfs_trans_handle *trans;
5346         struct btrfs_root *root;
5347         struct btrfs_key root_key;
5348         u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5349         int err = 0;
5350
5351         root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5352         root_key.type = BTRFS_ROOT_ITEM_KEY;
5353         root_key.offset = (u64)-1;
5354         root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5355         if (IS_ERR(root))
5356                 return ERR_CAST(root);
5357
5358         trans = btrfs_start_transaction(root, 1);
5359         BUG_ON(!trans);
5360
5361         err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5362         if (err)
5363                 goto out;
5364
5365         err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5366         BUG_ON(err);
5367
5368         err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5369                                        group->key.offset, 0, group->key.offset,
5370                                        0, 0, 0);
5371         BUG_ON(err);
5372
5373         inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5374         if (inode->i_state & I_NEW) {
5375                 BTRFS_I(inode)->root = root;
5376                 BTRFS_I(inode)->location.objectid = objectid;
5377                 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5378                 BTRFS_I(inode)->location.offset = 0;
5379                 btrfs_read_locked_inode(inode);
5380                 unlock_new_inode(inode);
5381                 BUG_ON(is_bad_inode(inode));
5382         } else {
5383                 BUG_ON(1);
5384         }
5385         BTRFS_I(inode)->index_cnt = group->key.objectid;
5386
5387         err = btrfs_orphan_add(trans, inode);
5388 out:
5389         btrfs_end_transaction(trans, root);
5390         if (err) {
5391                 if (inode)
5392                         iput(inode);
5393                 inode = ERR_PTR(err);
5394         }
5395         return inode;
5396 }
5397
5398 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5399 {
5400
5401         struct btrfs_ordered_sum *sums;
5402         struct btrfs_sector_sum *sector_sum;
5403         struct btrfs_ordered_extent *ordered;
5404         struct btrfs_root *root = BTRFS_I(inode)->root;
5405         struct list_head list;
5406         size_t offset;
5407         int ret;
5408         u64 disk_bytenr;
5409
5410         INIT_LIST_HEAD(&list);
5411
5412         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5413         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5414
5415         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5416         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
5417                                        disk_bytenr + len - 1, &list);
5418
5419         while (!list_empty(&list)) {
5420                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5421                 list_del_init(&sums->list);
5422
5423                 sector_sum = sums->sums;
5424                 sums->bytenr = ordered->start;
5425
5426                 offset = 0;
5427                 while (offset < sums->len) {
5428                         sector_sum->bytenr += ordered->start - disk_bytenr;
5429                         sector_sum++;
5430                         offset += root->sectorsize;
5431                 }
5432
5433                 btrfs_add_ordered_sum(inode, ordered, sums);
5434         }
5435         btrfs_put_ordered_extent(ordered);
5436         return 0;
5437 }
5438
5439 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5440 {
5441         struct btrfs_trans_handle *trans;
5442         struct btrfs_path *path;
5443         struct btrfs_fs_info *info = root->fs_info;
5444         struct extent_buffer *leaf;
5445         struct inode *reloc_inode;
5446         struct btrfs_block_group_cache *block_group;
5447         struct btrfs_key key;
5448         u64 skipped;
5449         u64 cur_byte;
5450         u64 total_found;
5451         u32 nritems;
5452         int ret;
5453         int progress;
5454         int pass = 0;
5455
5456         root = root->fs_info->extent_root;
5457
5458         block_group = btrfs_lookup_block_group(info, group_start);
5459         BUG_ON(!block_group);
5460
5461         printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
5462                (unsigned long long)block_group->key.objectid,
5463                (unsigned long long)block_group->flags);
5464
5465         path = btrfs_alloc_path();
5466         BUG_ON(!path);
5467
5468         reloc_inode = create_reloc_inode(info, block_group);
5469         BUG_ON(IS_ERR(reloc_inode));
5470
5471         __alloc_chunk_for_shrink(root, block_group, 1);
5472         set_block_group_readonly(block_group);
5473
5474         btrfs_start_delalloc_inodes(info->tree_root);
5475         btrfs_wait_ordered_extents(info->tree_root, 0);
5476 again:
5477         skipped = 0;
5478         total_found = 0;
5479         progress = 0;
5480         key.objectid = block_group->key.objectid;
5481         key.offset = 0;
5482         key.type = 0;
5483         cur_byte = key.objectid;
5484
5485         trans = btrfs_start_transaction(info->tree_root, 1);
5486         btrfs_commit_transaction(trans, info->tree_root);
5487
5488         mutex_lock(&root->fs_info->cleaner_mutex);
5489         btrfs_clean_old_snapshots(info->tree_root);
5490         btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5491         mutex_unlock(&root->fs_info->cleaner_mutex);
5492
5493         trans = btrfs_start_transaction(info->tree_root, 1);
5494         btrfs_commit_transaction(trans, info->tree_root);
5495
5496         while (1) {
5497                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5498                 if (ret < 0)
5499                         goto out;
5500 next:
5501                 leaf = path->nodes[0];
5502                 nritems = btrfs_header_nritems(leaf);
5503                 if (path->slots[0] >= nritems) {
5504                         ret = btrfs_next_leaf(root, path);
5505                         if (ret < 0)
5506                                 goto out;
5507                         if (ret == 1) {
5508                                 ret = 0;
5509                                 break;
5510                         }
5511                         leaf = path->nodes[0];
5512                         nritems = btrfs_header_nritems(leaf);
5513                 }
5514
5515                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5516
5517                 if (key.objectid >= block_group->key.objectid +
5518                     block_group->key.offset)
5519                         break;
5520
5521                 if (progress && need_resched()) {
5522                         btrfs_release_path(root, path);
5523                         cond_resched();
5524                         progress = 0;
5525                         continue;
5526                 }
5527                 progress = 1;
5528
5529                 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5530                     key.objectid + key.offset <= cur_byte) {
5531                         path->slots[0]++;
5532                         goto next;
5533                 }
5534
5535                 total_found++;
5536                 cur_byte = key.objectid + key.offset;
5537                 btrfs_release_path(root, path);
5538
5539                 __alloc_chunk_for_shrink(root, block_group, 0);
5540                 ret = relocate_one_extent(root, path, &key, block_group,
5541                                           reloc_inode, pass);
5542                 BUG_ON(ret < 0);
5543                 if (ret > 0)
5544                         skipped++;
5545
5546                 key.objectid = cur_byte;
5547                 key.type = 0;
5548                 key.offset = 0;
5549         }
5550
5551         btrfs_release_path(root, path);
5552
5553         if (pass == 0) {
5554                 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5555                 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5556         }
5557
5558         if (total_found > 0) {
5559                 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5560                        (unsigned long long)total_found, pass);
5561                 pass++;
5562                 if (total_found == skipped && pass > 2) {
5563                         iput(reloc_inode);
5564                         reloc_inode = create_reloc_inode(info, block_group);
5565                         pass = 0;
5566                 }
5567                 goto again;
5568         }
5569
5570         /* delete reloc_inode */
5571         iput(reloc_inode);
5572
5573         /* unpin extents in this range */
5574         trans = btrfs_start_transaction(info->tree_root, 1);
5575         btrfs_commit_transaction(trans, info->tree_root);
5576
5577         spin_lock(&block_group->lock);
5578         WARN_ON(block_group->pinned > 0);
5579         WARN_ON(block_group->reserved > 0);
5580         WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5581         spin_unlock(&block_group->lock);
5582         put_block_group(block_group);
5583         ret = 0;
5584 out:
5585         btrfs_free_path(path);
5586         return ret;
5587 }
5588
5589 static int find_first_block_group(struct btrfs_root *root,
5590                 struct btrfs_path *path, struct btrfs_key *key)
5591 {
5592         int ret = 0;
5593         struct btrfs_key found_key;
5594         struct extent_buffer *leaf;
5595         int slot;
5596
5597         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5598         if (ret < 0)
5599                 goto out;
5600
5601         while (1) {
5602                 slot = path->slots[0];
5603                 leaf = path->nodes[0];
5604                 if (slot >= btrfs_header_nritems(leaf)) {
5605                         ret = btrfs_next_leaf(root, path);
5606                         if (ret == 0)
5607                                 continue;
5608                         if (ret < 0)
5609                                 goto out;
5610                         break;
5611                 }
5612                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5613
5614                 if (found_key.objectid >= key->objectid &&
5615                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5616                         ret = 0;
5617                         goto out;
5618                 }
5619                 path->slots[0]++;
5620         }
5621         ret = -ENOENT;
5622 out:
5623         return ret;
5624 }
5625
5626 int btrfs_free_block_groups(struct btrfs_fs_info *info)
5627 {
5628         struct btrfs_block_group_cache *block_group;
5629         struct btrfs_space_info *space_info;
5630         struct rb_node *n;
5631
5632         spin_lock(&info->block_group_cache_lock);
5633         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5634                 block_group = rb_entry(n, struct btrfs_block_group_cache,
5635                                        cache_node);
5636                 rb_erase(&block_group->cache_node,
5637                          &info->block_group_cache_tree);
5638                 spin_unlock(&info->block_group_cache_lock);
5639
5640                 btrfs_remove_free_space_cache(block_group);
5641                 down_write(&block_group->space_info->groups_sem);
5642                 list_del(&block_group->list);
5643                 up_write(&block_group->space_info->groups_sem);
5644
5645                 WARN_ON(atomic_read(&block_group->count) != 1);
5646                 kfree(block_group);
5647
5648                 spin_lock(&info->block_group_cache_lock);
5649         }
5650         spin_unlock(&info->block_group_cache_lock);
5651
5652         /* now that all the block groups are freed, go through and
5653          * free all the space_info structs.  This is only called during
5654          * the final stages of unmount, and so we know nobody is
5655          * using them.  We call synchronize_rcu() once before we start,
5656          * just to be on the safe side.
5657          */
5658         synchronize_rcu();
5659
5660         while(!list_empty(&info->space_info)) {
5661                 space_info = list_entry(info->space_info.next,
5662                                         struct btrfs_space_info,
5663                                         list);
5664
5665                 list_del(&space_info->list);
5666                 kfree(space_info);
5667         }
5668         return 0;
5669 }
5670
5671 int btrfs_read_block_groups(struct btrfs_root *root)
5672 {
5673         struct btrfs_path *path;
5674         int ret;
5675         struct btrfs_block_group_cache *cache;
5676         struct btrfs_fs_info *info = root->fs_info;
5677         struct btrfs_space_info *space_info;
5678         struct btrfs_key key;
5679         struct btrfs_key found_key;
5680         struct extent_buffer *leaf;
5681
5682         root = info->extent_root;
5683         key.objectid = 0;
5684         key.offset = 0;
5685         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5686         path = btrfs_alloc_path();
5687         if (!path)
5688                 return -ENOMEM;
5689
5690         while (1) {
5691                 ret = find_first_block_group(root, path, &key);
5692                 if (ret > 0) {
5693                         ret = 0;
5694                         goto error;
5695                 }
5696                 if (ret != 0)
5697                         goto error;
5698
5699                 leaf = path->nodes[0];
5700                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5701                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5702                 if (!cache) {
5703                         ret = -ENOMEM;
5704                         break;
5705                 }
5706
5707                 atomic_set(&cache->count, 1);
5708                 spin_lock_init(&cache->lock);
5709                 mutex_init(&cache->alloc_mutex);
5710                 mutex_init(&cache->cache_mutex);
5711                 INIT_LIST_HEAD(&cache->list);
5712                 read_extent_buffer(leaf, &cache->item,
5713                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
5714                                    sizeof(cache->item));
5715                 memcpy(&cache->key, &found_key, sizeof(found_key));
5716
5717                 key.objectid = found_key.objectid + found_key.offset;
5718                 btrfs_release_path(root, path);
5719                 cache->flags = btrfs_block_group_flags(&cache->item);
5720
5721                 ret = update_space_info(info, cache->flags, found_key.offset,
5722                                         btrfs_block_group_used(&cache->item),
5723                                         &space_info);
5724                 BUG_ON(ret);
5725                 cache->space_info = space_info;
5726                 down_write(&space_info->groups_sem);
5727                 list_add_tail(&cache->list, &space_info->block_groups);
5728                 up_write(&space_info->groups_sem);
5729
5730                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5731                 BUG_ON(ret);
5732
5733                 set_avail_alloc_bits(root->fs_info, cache->flags);
5734                 if (btrfs_chunk_readonly(root, cache->key.objectid))
5735                         set_block_group_readonly(cache);
5736         }
5737         ret = 0;
5738 error:
5739         btrfs_free_path(path);
5740         return ret;
5741 }
5742
5743 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5744                            struct btrfs_root *root, u64 bytes_used,
5745                            u64 type, u64 chunk_objectid, u64 chunk_offset,
5746                            u64 size)
5747 {
5748         int ret;
5749         struct btrfs_root *extent_root;
5750         struct btrfs_block_group_cache *cache;
5751
5752         extent_root = root->fs_info->extent_root;
5753
5754         root->fs_info->last_trans_new_blockgroup = trans->transid;
5755
5756         cache = kzalloc(sizeof(*cache), GFP_NOFS);
5757         if (!cache)
5758                 return -ENOMEM;
5759
5760         cache->key.objectid = chunk_offset;
5761         cache->key.offset = size;
5762         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5763         atomic_set(&cache->count, 1);
5764         spin_lock_init(&cache->lock);
5765         mutex_init(&cache->alloc_mutex);
5766         mutex_init(&cache->cache_mutex);
5767         INIT_LIST_HEAD(&cache->list);
5768
5769         btrfs_set_block_group_used(&cache->item, bytes_used);
5770         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5771         cache->flags = type;
5772         btrfs_set_block_group_flags(&cache->item, type);
5773
5774         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5775                                 &cache->space_info);
5776         BUG_ON(ret);
5777         down_write(&cache->space_info->groups_sem);
5778         list_add_tail(&cache->list, &cache->space_info->block_groups);
5779         up_write(&cache->space_info->groups_sem);
5780
5781         ret = btrfs_add_block_group_cache(root->fs_info, cache);
5782         BUG_ON(ret);
5783
5784         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5785                                 sizeof(cache->item));
5786         BUG_ON(ret);
5787
5788         set_avail_alloc_bits(extent_root->fs_info, type);
5789
5790         return 0;
5791 }
5792
5793 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5794                              struct btrfs_root *root, u64 group_start)
5795 {
5796         struct btrfs_path *path;
5797         struct btrfs_block_group_cache *block_group;
5798         struct btrfs_key key;
5799         int ret;
5800
5801         root = root->fs_info->extent_root;
5802
5803         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5804         BUG_ON(!block_group);
5805         BUG_ON(!block_group->ro);
5806
5807         memcpy(&key, &block_group->key, sizeof(key));
5808
5809         path = btrfs_alloc_path();
5810         BUG_ON(!path);
5811
5812         spin_lock(&root->fs_info->block_group_cache_lock);
5813         rb_erase(&block_group->cache_node,
5814                  &root->fs_info->block_group_cache_tree);
5815         spin_unlock(&root->fs_info->block_group_cache_lock);
5816         btrfs_remove_free_space_cache(block_group);
5817         down_write(&block_group->space_info->groups_sem);
5818         list_del(&block_group->list);
5819         up_write(&block_group->space_info->groups_sem);
5820
5821         spin_lock(&block_group->space_info->lock);
5822         block_group->space_info->total_bytes -= block_group->key.offset;
5823         block_group->space_info->bytes_readonly -= block_group->key.offset;
5824         spin_unlock(&block_group->space_info->lock);
5825         block_group->space_info->full = 0;
5826
5827         put_block_group(block_group);
5828         put_block_group(block_group);
5829
5830         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5831         if (ret > 0)
5832                 ret = -EIO;
5833         if (ret < 0)
5834                 goto out;
5835
5836         ret = btrfs_del_item(trans, root, path);
5837 out:
5838         btrfs_free_path(path);
5839         return ret;
5840 }