Btrfs: async block group caching
fs/btrfs/extent-tree.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

static int update_reserved_extents(struct btrfs_root *root,
                                   u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc,
                              int mark_free);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
                                struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);

static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);

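/*
 * Has the async caching kthread finished filling in this block group's
 * free space cache?  The smp_mb() orders this lockless read of ->cached
 * against the update made by the caching kthread.
 */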
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
        smp_mb();
        return cache->cached == BTRFS_CACHE_FINISHED;
}

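/* return true if the block group's flags include all of the given bits */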
static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
        return (cache->flags & bits) == bits;
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                struct btrfs_block_group_cache *block_group)
{
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct btrfs_block_group_cache *cache;

        spin_lock(&info->block_group_cache_lock);
        p = &info->block_group_cache_tree.rb_node;

        while (*p) {
                parent = *p;
                cache = rb_entry(parent, struct btrfs_block_group_cache,
                                 cache_node);
                if (block_group->key.objectid < cache->key.objectid) {
                        p = &(*p)->rb_left;
                } else if (block_group->key.objectid > cache->key.objectid) {
                        p = &(*p)->rb_right;
                } else {
                        spin_unlock(&info->block_group_cache_lock);
                        return -EEXIST;
                }
        }

        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);
        spin_unlock(&info->block_group_cache_lock);

        return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                              int contains)
{
        struct btrfs_block_group_cache *cache, *ret = NULL;
        struct rb_node *n;
        u64 end, start;

        spin_lock(&info->block_group_cache_lock);
        n = info->block_group_cache_tree.rb_node;

        while (n) {
                cache = rb_entry(n, struct btrfs_block_group_cache,
                                 cache_node);
                end = cache->key.objectid + cache->key.offset - 1;
                start = cache->key.objectid;

                if (bytenr < start) {
                        if (!contains && (!ret || start < ret->key.objectid))
                                ret = cache;
                        n = n->rb_left;
                } else if (bytenr > start) {
                        if (contains && bytenr <= end) {
                                ret = cache;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        ret = cache;
                        break;
                }
        }
        if (ret)
                atomic_inc(&ret->count);
        spin_unlock(&info->block_group_cache_lock);

        return ret;
}

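/*
 * Drop the EXTENT_LOCKED bits that remove_sb_from_cache() took on the
 * superblock mirror ranges in pinned_extents.
 */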
void btrfs_free_super_mirror_extents(struct btrfs_fs_info *info)
{
        u64 start, end, last = 0;
        int ret;

        while (1) {
                ret = find_first_extent_bit(&info->pinned_extents, last,
                                            &start, &end, EXTENT_LOCKED);
                if (ret)
                        break;

                unlock_extent(&info->pinned_extents, start, end, GFP_NOFS);
                last = end + 1;
        }
}

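/*
 * Lock the ranges covered by every superblock mirror that falls inside
 * this block group, so the caching code never hands them out as free
 * space.
 */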
static int remove_sb_from_cache(struct btrfs_root *root,
                                struct btrfs_block_group_cache *cache)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 bytenr;
        u64 *logical;
        int stripe_len;
        int i, nr, ret;

        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
                BUG_ON(ret);
                while (nr--) {
                        try_lock_extent(&fs_info->pinned_extents,
                                        logical[nr],
                                        logical[nr] + stripe_len - 1, GFP_NOFS);
                }
                kfree(logical);
        }

        return 0;
}

/*
 * This is only called by cache_block_group.  Since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't
 * be used yet, since their free space will be released as soon as the
 * transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
{
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;

        while (start < end) {
                ret = find_first_extent_bit(&info->pinned_extents, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY|EXTENT_LOCKED|
                                            EXTENT_DELALLOC);
                if (ret)
                        break;

                if (extent_start == start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret);
                        start = extent_end + 1;
                } else {
                        break;
                }
        }

        if (start < end) {
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }

        return total_added;
}

DEFINE_MUTEX(discard_mutex);

/*
 * If async kthreads are running when we cross transactions, we mark any
 * pinned extents with EXTENT_DELALLOC and then let the caching kthreads
 * clean those extents up when they are done.  We also run this from
 * btrfs_finish_extent_commit in case there were some pinned extents that
 * were missed because we had already cached that block group.
 */
static void btrfs_discard_pinned_extents(struct btrfs_fs_info *fs_info,
                                         struct btrfs_block_group_cache *cache)
{
        u64 start, end, last;
        int ret;

        if (!cache)
                last = 0;
        else
                last = cache->key.objectid;

        mutex_lock(&discard_mutex);
        while (1) {
                ret = find_first_extent_bit(&fs_info->pinned_extents, last,
                                            &start, &end, EXTENT_DELALLOC);
                if (ret)
                        break;

                if (cache && start >= cache->key.objectid + cache->key.offset)
                        break;

                if (!cache) {
                        cache = btrfs_lookup_block_group(fs_info, start);
                        BUG_ON(!cache);

                        start = max(start, cache->key.objectid);
                        end = min(end, cache->key.objectid + cache->key.offset - 1);

                        if (block_group_cache_done(cache))
                                btrfs_add_free_space(cache, start,
                                                     end - start + 1);
                        cache = NULL;
                } else {
                        start = max(start, cache->key.objectid);
                        end = min(end, cache->key.objectid + cache->key.offset - 1);
                        btrfs_add_free_space(cache, start, end - start + 1);
                }

                clear_extent_bits(&fs_info->pinned_extents, start, end,
                                  EXTENT_DELALLOC, GFP_NOFS);
                last = end + 1;

                if (need_resched()) {
                        mutex_unlock(&discard_mutex);
                        cond_resched();
                        mutex_lock(&discard_mutex);
                }
        }
        mutex_unlock(&discard_mutex);
}

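/*
 * The async caching kthread: walk the extent tree (via the commit root,
 * so no tree locks are needed) and record the gaps between extent items
 * as free space in the block group's free space cache.  Waiters are
 * woken for every 2MB of free space found and again when caching is
 * finished.
 */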
static int caching_kthread(void *data)
{
        struct btrfs_block_group_cache *block_group = data;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        u64 last = 0;
        struct btrfs_path *path;
        int ret = 0;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        int slot;
        u64 total_found = 0;

        BUG_ON(!fs_info);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        atomic_inc(&fs_info->async_caching_threads);
        atomic_inc(&block_group->space_info->caching_threads);
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
again:
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_root->commit_root_sem);

        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since it's read-only.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = 2;

        key.objectid = last;
        key.offset = 0;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;

        while (1) {
                smp_mb();
                if (block_group->fs_info->closing)
                        break;

                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(fs_info->extent_root, path);
                        if (ret < 0)
                                goto err;
                        else if (ret)
                                break;

                        if (need_resched()) {
                                btrfs_release_path(fs_info->extent_root, path);
                                up_read(&fs_info->extent_root->commit_root_sem);
                                cond_resched();
                                goto again;
                        }

                        continue;
                }
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid < block_group->key.objectid)
                        goto next;

                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;

                if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        last = key.objectid + key.offset;
                }

                if (total_found > (1024 * 1024 * 2)) {
                        total_found = 0;
                        wake_up(&block_group->caching_q);
                }
next:
                path->slots[0]++;
        }
        ret = 0;

        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);

        spin_lock(&block_group->lock);
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

err:
        btrfs_free_path(path);
        up_read(&fs_info->extent_root->commit_root_sem);
        atomic_dec(&fs_info->async_caching_threads);
        atomic_dec(&block_group->space_info->caching_threads);
        wake_up(&block_group->caching_q);

        if (!ret)
                btrfs_discard_pinned_extents(fs_info, block_group);

        return 0;
}

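/*
 * Kick off the async kthread that caches free space for this block
 * group, unless caching has already been started or has finished.
 */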
static int cache_block_group(struct btrfs_block_group_cache *cache)
{
        struct task_struct *tsk;
        int ret = 0;

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
                return ret;
        }
        cache->cached = BTRFS_CACHE_STARTED;
        spin_unlock(&cache->lock);

        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
                          cache->key.objectid);
        if (IS_ERR(tsk)) {
                ret = PTR_ERR(tsk);
                printk(KERN_ERR "error running thread %d\n", ret);
                BUG();
        }

        return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 0);

        return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 1);

        return cache;
}

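/* drop a reference on a block group, freeing it when the last ref is gone */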
void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
        if (atomic_dec_and_test(&cache->count))
                kfree(cache);
}

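/*
 * Find the space_info whose flags match the given allocation flags
 * (data, metadata, system), or NULL if none exists yet.
 */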
static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

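/* scale num by factor/10, e.g. factor 9 returns 90% of num */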
static u64 div_factor(u64 num, int factor)
{
        if (factor == 10)
                return num;
        num *= factor;
        do_div(num, 10);
        return num;
}

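/*
 * Find a metadata block group that still has plenty of room, starting at
 * search_hint.  The first pass skips read-only groups and looks for one
 * that is less than 90% used; if that fails we wrap around to
 * search_start and finally retry a full search at factor 10, allowing
 * completely full groups.
 */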
u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
{
        struct btrfs_block_group_cache *cache;
        u64 used;
        u64 last = max(search_hint, search_start);
        u64 group_start = 0;
        int full_search = 0;
        int factor = 9;
        int wrapped = 0;
again:
        while (1) {
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                if (!cache)
                        break;

                spin_lock(&cache->lock);
                last = cache->key.objectid + cache->key.offset;
                used = btrfs_block_group_used(&cache->item);

                if ((full_search || !cache->ro) &&
                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
                        if (used + cache->pinned + cache->reserved <
                            div_factor(cache->key.offset, factor)) {
                                group_start = cache->key.objectid;
                                spin_unlock(&cache->lock);
                                btrfs_put_block_group(cache);
                                goto found;
                        }
                }
                spin_unlock(&cache->lock);
                btrfs_put_block_group(cache);
                cond_resched();
        }
        if (!wrapped) {
                last = search_start;
                wrapped = 1;
                goto again;
        }
        if (!full_search && factor < 10) {
                last = search_start;
                full_search = 1;
                factor = 10;
                goto again;
        }
found:
        return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        BUG_ON(!path);
        key.objectid = start;
        key.offset = len;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
                                0, 0);
        btrfs_free_path(path);
        return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually full back refs are generic, and can
 * be used in all the cases where implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead: every time a tree block
 * gets COWed, we have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs and add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block and increase lower level extents' reference counts.  The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block and increase lower level extents' reference counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent, and the
 * key type is used to differentiate between types of back refs.  The
 * meaning of the key offset depends on the type of back ref.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, implicit back refs are used.
 * The fields are filled in as:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * consist of a key only.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in the
 * tree block info structure.
 */
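
/*
 * A sketch of the key composition described above (illustrative only;
 * the numbers are made up):
 *
 * A data extent at bytenr 4096 referenced by inode 257 at file offset 0
 * in the subvolume tree 5 gets an implicit back ref item with key
 *
 *     (4096, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * where the hash is computed by hash_extent_data_ref() below.  If the
 * same extent were instead referenced by a shared leaf at bytenr 16384,
 * the full back ref key would be
 *
 *     (4096, BTRFS_SHARED_DATA_REF_KEY, 16384)
 */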

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
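/*
 * Convert a v0 extent item (which only carried a ref count) into the
 * current extent item format, extending the item to make room for the
 * flags, generation and, for tree blocks, the tree_block_info.
 */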
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_extent_item *item;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_tree_block_info *bi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u32 new_size = sizeof(*item);
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                while (1) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0);
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
                                path->slots[0]++;
                                continue;
                        }
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_ref_v0);
                        owner = btrfs_ref_objectid_v0(leaf, ref0);
                        break;
                }
        }
        btrfs_release_path(root, path);

        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);

        new_size -= sizeof(*ei0);
        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret);

        ret = btrfs_extend_item(trans, root, path, new_size);
        BUG_ON(ret);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        } else {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif

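/*
 * Hash the (root objectid, inode objectid, file offset) triple that
 * identifies an implicit data back ref; the result is used as the key
 * offset for EXTENT_DATA_REF items.  Two crc32c sums are combined, with
 * the root's hash in the high bits.
 */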
static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        u32 high_crc = ~(u32)0;
        u32 low_crc = ~(u32)0;
        __le64 lenum;

        lenum = cpu_to_le64(root_objectid);
        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

        return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
                                     struct btrfs_extent_data_ref *ref)
{
        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
                                    btrfs_extent_data_ref_objectid(leaf, ref),
                                    btrfs_extent_data_ref_offset(leaf, ref));
}

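/* does this extent_data_ref item match the given root/inode/offset? */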
static int match_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_extent_data_ref *ref,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
            btrfs_extent_data_ref_offset(leaf, ref) != offset)
                return 0;
        return 1;
}

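/*
 * Look up the data back ref item for an extent.  Shared refs are keyed
 * directly by the parent block; implicit refs are keyed by hash, so on a
 * collision we walk forward comparing each candidate with
 * match_extent_data_ref().
 */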
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid,
                                           u64 owner, u64 offset)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref;
        struct extent_buffer *leaf;
        u32 nritems;
        int ret;
        int recow;
        int err = -ENOENT;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
        }
again:
        recow = 0;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0) {
                err = ret;
                goto fail;
        }

        if (parent) {
                if (!ret)
                        return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                btrfs_release_path(root, path);
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0) {
                        err = ret;
                        goto fail;
                }
                if (!ret)
                        return 0;
#endif
                goto fail;
        }

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
        while (1) {
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                err = ret;
                        if (ret)
                                goto fail;

                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != bytenr ||
                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
                        goto fail;

                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);

                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
                                btrfs_release_path(root, path);
                                goto again;
                        }
                        err = 0;
                        break;
                }
                path->slots[0]++;
        }
fail:
        return err;
}

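/*
 * Insert a new data back ref item, or bump the ref count if a matching
 * item already exists.  Hash collisions for implicit refs are handled by
 * probing the following key offsets.
 */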
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid, u64 owner,
                                           u64 offset, int refs_to_add)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u32 size;
        u32 num_refs;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
                size = sizeof(struct btrfs_shared_data_ref);
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
                size = sizeof(struct btrfs_extent_data_ref);
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
        if (ret && ret != -EEXIST)
                goto fail;

        leaf = path->nodes[0];
        if (parent) {
                struct btrfs_shared_data_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_shared_data_ref);
                if (ret == 0) {
                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
                }
        } else {
                struct btrfs_extent_data_ref *ref;
                while (ret == -EEXIST) {
                        ref = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_data_ref);
                        if (match_extent_data_ref(leaf, ref, root_objectid,
                                                  owner, offset))
                                break;
                        btrfs_release_path(root, path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      size);
                        if (ret && ret != -EEXIST)
                                goto fail;

                        leaf = path->nodes[0];
                }
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);
                if (ret == 0) {
                        btrfs_set_extent_data_ref_root(leaf, ref,
                                                       root_objectid);
                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
                }
        }
        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
fail:
        btrfs_release_path(root, path);
        return ret;
}

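/*
 * Drop refs_to_drop references from the data back ref item at the current
 * path position, deleting the item when the count hits zero.
 */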
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           int refs_to_drop)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref1 = NULL;
        struct btrfs_shared_data_ref *ref2 = NULL;
        struct extent_buffer *leaf;
        u32 num_refs = 0;
        int ret = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                BUG();
        }

        BUG_ON(num_refs < refs_to_drop);
        num_refs -= refs_to_drop;

        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                else {
                        struct btrfs_extent_ref_v0 *ref0;
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_extent_ref_v0);
                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
                }
#endif
                btrfs_mark_buffer_dirty(leaf);
        }
        return ret;
}

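/*
 * Return the ref count stored in a data back ref, which may live either
 * inline in the extent item (iref != NULL) or in a standalone item.
 */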
static noinline u32 extent_data_ref_count(struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_data_ref *ref1;
        struct btrfs_shared_data_ref *ref2;
        u32 num_refs = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (iref) {
                if (btrfs_extent_inline_ref_type(leaf, iref) ==
                    BTRFS_EXTENT_DATA_REF_KEY) {
                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
                } else {
                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
                }
        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                WARN_ON(1);
        }
        return num_refs;
}

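/*
 * Look up the back ref item for a tree block: a shared ref keyed by the
 * parent block, or an implicit ref keyed by the owner root's objectid.
 */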
static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (ret == -ENOENT && parent) {
                btrfs_release_path(root, path);
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0)
                        ret = -ENOENT;
        }
#endif
        return ret;
}

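/* insert a tree block back ref item; tree block refs carry no payload */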
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
        btrfs_release_path(root, path);
        return ret;
}

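/*
 * Pick the back ref key type: tree block vs data is decided by the owner
 * (tree roots have objectids below BTRFS_FIRST_FREE_OBJECTID), shared vs
 * implicit by whether a parent block is given.
 */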
static inline int extent_ref_type(u64 parent, u64 owner)
{
        int type;
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                if (parent > 0)
                        type = BTRFS_SHARED_BLOCK_REF_KEY;
                else
                        type = BTRFS_TREE_BLOCK_REF_KEY;
        } else {
                if (parent > 0)
                        type = BTRFS_SHARED_DATA_REF_KEY;
                else
                        type = BTRFS_EXTENT_DATA_REF_KEY;
        }
        return type;
}

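/*
 * Find the key that follows the current path position by walking up the
 * path until some level has a next slot.  Returns 1 if the path is at
 * the last key in the tree.
 */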
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
{
        for (; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                        break;
                if (path->slots[level] + 1 >=
                    btrfs_header_nritems(path->nodes[level]))
                        continue;
                if (level == 0)
                        btrfs_item_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                else
                        btrfs_node_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                return 0;
        }
        return 1;
}

/*
 * Look for an inline back ref.  If the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *       items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        int err = 0;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        want = extent_ref_type(parent, owner);
        if (insert) {
                extra_size = btrfs_extent_inline_ref_size(want);
                path->keep_locks = 1;
        } else
                extra_size = -1;
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0) {
                err = ret;
                goto out;
        }
        BUG_ON(ret);

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (item_size < sizeof(*ei)) {
                if (!insert) {
                        err = -ENOENT;
                        goto out;
                }
                ret = convert_extent_item_v0(trans, root, path, owner,
                                             extra_size);
                if (ret < 0) {
                        err = ret;
                        goto out;
                }
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        }
#endif
        BUG_ON(item_size < sizeof(*ei));

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        } else {
                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
        }

        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
                        WARN_ON(ptr > end);
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_extent_inline_ref_type(leaf, iref);
                if (want < type)
                        break;
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                err = 0;
                                break;
                        }
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
                                        break;
                        }
                }
                ptr += btrfs_extent_inline_ref_size(type);
        }
        if (err == -ENOENT && insert) {
                if (item_size + extra_size >=
                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
                        err = -EAGAIN;
                        goto out;
                }
                /*
                 * To add a new inline back ref, we have to make sure
                 * there is no corresponding back ref item.
                 * For simplicity, we just do not add a new inline back
                 * ref if there is any kind of item for this block.
                 */
                if (find_next_key(path, 0, &key) == 0 &&
                    key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        err = -EAGAIN;
                        goto out;
                }
        }
        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
        if (insert) {
                path->keep_locks = 0;
                btrfs_unlock_up_safe(path, 1);
        }
        return err;
}

1321 /*
1322  * helper to add new inline back ref
1323  */
1324 static noinline_for_stack
1325 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1326                                 struct btrfs_root *root,
1327                                 struct btrfs_path *path,
1328                                 struct btrfs_extent_inline_ref *iref,
1329                                 u64 parent, u64 root_objectid,
1330                                 u64 owner, u64 offset, int refs_to_add,
1331                                 struct btrfs_delayed_extent_op *extent_op)
1332 {
1333         struct extent_buffer *leaf;
1334         struct btrfs_extent_item *ei;
1335         unsigned long ptr;
1336         unsigned long end;
1337         unsigned long item_offset;
1338         u64 refs;
1339         int size;
1340         int type;
1341         int ret;
1342
1343         leaf = path->nodes[0];
1344         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1345         item_offset = (unsigned long)iref - (unsigned long)ei;
1346
1347         type = extent_ref_type(parent, owner);
1348         size = btrfs_extent_inline_ref_size(type);
1349
1350         ret = btrfs_extend_item(trans, root, path, size);
1351         BUG_ON(ret);
1352
1353         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1354         refs = btrfs_extent_refs(leaf, ei);
1355         refs += refs_to_add;
1356         btrfs_set_extent_refs(leaf, ei, refs);
1357         if (extent_op)
1358                 __run_delayed_extent_op(extent_op, leaf, ei);
1359
1360         ptr = (unsigned long)ei + item_offset;
1361         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1362         if (ptr < end - size)
1363                 memmove_extent_buffer(leaf, ptr + size, ptr,
1364                                       end - size - ptr);
1365
1366         iref = (struct btrfs_extent_inline_ref *)ptr;
1367         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1368         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1369                 struct btrfs_extent_data_ref *dref;
1370                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1371                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1372                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1373                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1374                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1375         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1376                 struct btrfs_shared_data_ref *sref;
1377                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1378                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1379                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1380         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1381                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1382         } else {
1383                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1384         }
1385         btrfs_mark_buffer_dirty(leaf);
1386         return 0;
1387 }
1388
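/*
 * look up a backref for the given extent.  The inline backref is
 * tried first; if there is none, fall back to the keyed tree block
 * ref or extent data ref item.  On success with an inline ref,
 * *ref_ret points at it.
 */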
1389 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1390                                  struct btrfs_root *root,
1391                                  struct btrfs_path *path,
1392                                  struct btrfs_extent_inline_ref **ref_ret,
1393                                  u64 bytenr, u64 num_bytes, u64 parent,
1394                                  u64 root_objectid, u64 owner, u64 offset)
1395 {
1396         int ret;
1397
1398         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1399                                            bytenr, num_bytes, parent,
1400                                            root_objectid, owner, offset, 0);
1401         if (ret != -ENOENT)
1402                 return ret;
1403
1404         btrfs_release_path(root, path);
1405         *ref_ret = NULL;
1406
1407         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1408                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1409                                             root_objectid);
1410         } else {
1411                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1412                                              root_objectid, owner, offset);
1413         }
1414         return ret;
1415 }
1416
1417 /*
1418  * helper to update/remove inline back ref
1419  */
1420 static noinline_for_stack
1421 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1422                                  struct btrfs_root *root,
1423                                  struct btrfs_path *path,
1424                                  struct btrfs_extent_inline_ref *iref,
1425                                  int refs_to_mod,
1426                                  struct btrfs_delayed_extent_op *extent_op)
1427 {
1428         struct extent_buffer *leaf;
1429         struct btrfs_extent_item *ei;
1430         struct btrfs_extent_data_ref *dref = NULL;
1431         struct btrfs_shared_data_ref *sref = NULL;
1432         unsigned long ptr;
1433         unsigned long end;
1434         u32 item_size;
1435         int size;
1436         int type;
1437         int ret;
1438         u64 refs;
1439
1440         leaf = path->nodes[0];
1441         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1442         refs = btrfs_extent_refs(leaf, ei);
1443         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1444         refs += refs_to_mod;
1445         btrfs_set_extent_refs(leaf, ei, refs);
1446         if (extent_op)
1447                 __run_delayed_extent_op(extent_op, leaf, ei);
1448
1449         type = btrfs_extent_inline_ref_type(leaf, iref);
1450
1451         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1452                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1453                 refs = btrfs_extent_data_ref_count(leaf, dref);
1454         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1455                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1456                 refs = btrfs_shared_data_ref_count(leaf, sref);
1457         } else {
1458                 refs = 1;
1459                 BUG_ON(refs_to_mod != -1);
1460         }
1461
1462         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1463         refs += refs_to_mod;
1464
1465         if (refs > 0) {
1466                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1467                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1468                 else
1469                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1470         } else {
1471                 size = btrfs_extent_inline_ref_size(type);
1472                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1473                 ptr = (unsigned long)iref;
1474                 end = (unsigned long)ei + item_size;
1475                 if (ptr + size < end)
1476                         memmove_extent_buffer(leaf, ptr, ptr + size,
1477                                               end - ptr - size);
1478                 item_size -= size;
1479                 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1480                 BUG_ON(ret);
1481         }
1482         btrfs_mark_buffer_dirty(leaf);
1483         return 0;
1484 }
1485
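/*
 * insert a new inline backref, or add refs_to_add to an existing one.
 * Returns -EAGAIN when there is no room left for an inline ref and
 * the caller must fall back to a keyed backref item.
 */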
1486 static noinline_for_stack
1487 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1488                                  struct btrfs_root *root,
1489                                  struct btrfs_path *path,
1490                                  u64 bytenr, u64 num_bytes, u64 parent,
1491                                  u64 root_objectid, u64 owner,
1492                                  u64 offset, int refs_to_add,
1493                                  struct btrfs_delayed_extent_op *extent_op)
1494 {
1495         struct btrfs_extent_inline_ref *iref;
1496         int ret;
1497
1498         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1499                                            bytenr, num_bytes, parent,
1500                                            root_objectid, owner, offset, 1);
1501         if (ret == 0) {
1502                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1503                 ret = update_inline_extent_backref(trans, root, path, iref,
1504                                                    refs_to_add, extent_op);
1505         } else if (ret == -ENOENT) {
1506                 ret = setup_inline_extent_backref(trans, root, path, iref,
1507                                                   parent, root_objectid,
1508                                                   owner, offset, refs_to_add,
1509                                                   extent_op);
1510         }
1511         return ret;
1512 }
1513
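/*
 * insert a keyed backref item: a tree block ref when the owner is a
 * tree level (owner < BTRFS_FIRST_FREE_OBJECTID), otherwise an
 * extent data ref.
 */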
1514 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1515                                  struct btrfs_root *root,
1516                                  struct btrfs_path *path,
1517                                  u64 bytenr, u64 parent, u64 root_objectid,
1518                                  u64 owner, u64 offset, int refs_to_add)
1519 {
1520         int ret;
1521         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1522                 BUG_ON(refs_to_add != 1);
1523                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1524                                             parent, root_objectid);
1525         } else {
1526                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1527                                              parent, root_objectid,
1528                                              owner, offset, refs_to_add);
1529         }
1530         return ret;
1531 }
1532
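/*
 * drop refs_to_drop references from a backref.  Inline backrefs are
 * updated (or removed) in place, keyed data refs have their count
 * decremented, and other keyed backref items are deleted outright.
 */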
1533 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1534                                  struct btrfs_root *root,
1535                                  struct btrfs_path *path,
1536                                  struct btrfs_extent_inline_ref *iref,
1537                                  int refs_to_drop, int is_data)
1538 {
1539         int ret;
1540
1541         BUG_ON(!is_data && refs_to_drop != 1);
1542         if (iref) {
1543                 ret = update_inline_extent_backref(trans, root, path, iref,
1544                                                    -refs_to_drop, NULL);
1545         } else if (is_data) {
1546                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1547         } else {
1548                 ret = btrfs_del_item(trans, root, path);
1549         }
1550         return ret;
1551 }
1552
1553 #ifdef BIO_RW_DISCARD
1554 static void btrfs_issue_discard(struct block_device *bdev,
1555                                 u64 start, u64 len)
1556 {
1557         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
1558 }
1559 #endif
1560
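/*
 * map the logical extent onto its physical stripes and ask each
 * underlying device to discard its copy.  This compiles away to a
 * no-op on kernels without BIO_RW_DISCARD.
 */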
1561 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1562                                 u64 num_bytes)
1563 {
1564 #ifdef BIO_RW_DISCARD
1565         int ret;
1566         u64 map_length = num_bytes;
1567         struct btrfs_multi_bio *multi = NULL;
1568
1569         /* Tell the block device(s) that the sectors can be discarded */
1570         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1571                               bytenr, &map_length, &multi, 0);
1572         if (!ret) {
1573                 struct btrfs_bio_stripe *stripe = multi->stripes;
1574                 int i;
1575
1576                 if (map_length > num_bytes)
1577                         map_length = num_bytes;
1578
1579                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1580                         btrfs_issue_discard(stripe->dev->bdev,
1581                                             stripe->physical,
1582                                             map_length);
1583                 }
1584                 kfree(multi);
1585         }
1586
1587         return ret;
1588 #else
1589         return 0;
1590 #endif
1591 }
1592
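/*
 * add a reference to an extent by queueing a delayed ref.  Owners
 * below BTRFS_FIRST_FREE_OBJECTID are tree blocks (the owner is then
 * the level); everything else is file data.
 */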
1593 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1594                          struct btrfs_root *root,
1595                          u64 bytenr, u64 num_bytes, u64 parent,
1596                          u64 root_objectid, u64 owner, u64 offset)
1597 {
1598         int ret;
1599         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1600                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1601
1602         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1603                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1604                                         parent, root_objectid, (int)owner,
1605                                         BTRFS_ADD_DELAYED_REF, NULL);
1606         } else {
1607                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1608                                         parent, root_objectid, owner, offset,
1609                                         BTRFS_ADD_DELAYED_REF, NULL);
1610         }
1611         return ret;
1612 }
1613
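/*
 * do the actual ref increment for a delayed ref: try the inline
 * backref first, and if the extent item is too full (-EAGAIN), bump
 * the ref count on the extent item and insert a keyed backref
 * instead.
 */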
1614 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1615                                   struct btrfs_root *root,
1616                                   u64 bytenr, u64 num_bytes,
1617                                   u64 parent, u64 root_objectid,
1618                                   u64 owner, u64 offset, int refs_to_add,
1619                                   struct btrfs_delayed_extent_op *extent_op)
1620 {
1621         struct btrfs_path *path;
1622         struct extent_buffer *leaf;
1623         struct btrfs_extent_item *item;
1624         u64 refs;
1625         int ret;
1626         int err = 0;
1627
1628         path = btrfs_alloc_path();
1629         if (!path)
1630                 return -ENOMEM;
1631
1632         path->reada = 1;
1633         path->leave_spinning = 1;
1634         /* this will set up the path even if it fails to insert the back ref */
1635         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1636                                            path, bytenr, num_bytes, parent,
1637                                            root_objectid, owner, offset,
1638                                            refs_to_add, extent_op);
1639         if (ret == 0)
1640                 goto out;
1641
1642         if (ret != -EAGAIN) {
1643                 err = ret;
1644                 goto out;
1645         }
1646
1647         leaf = path->nodes[0];
1648         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1649         refs = btrfs_extent_refs(leaf, item);
1650         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1651         if (extent_op)
1652                 __run_delayed_extent_op(extent_op, leaf, item);
1653
1654         btrfs_mark_buffer_dirty(leaf);
1655         btrfs_release_path(root->fs_info->extent_root, path);
1656
1657         path->reada = 1;
1658         path->leave_spinning = 1;
1659
1660         /* now insert the actual backref */
1661         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1662                                     path, bytenr, parent, root_objectid,
1663                                     owner, offset, refs_to_add);
1664         BUG_ON(ret);
1665 out:
1666         btrfs_free_path(path);
1667         return err;
1668 }
1669
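/*
 * process one delayed data ref: allocate the reserved extent and
 * insert its backref, add a ref to an existing extent, or drop a
 * ref, depending on the action recorded in the node.
 */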
1670 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1671                                 struct btrfs_root *root,
1672                                 struct btrfs_delayed_ref_node *node,
1673                                 struct btrfs_delayed_extent_op *extent_op,
1674                                 int insert_reserved)
1675 {
1676         int ret = 0;
1677         struct btrfs_delayed_data_ref *ref;
1678         struct btrfs_key ins;
1679         u64 parent = 0;
1680         u64 ref_root = 0;
1681         u64 flags = 0;
1682
1683         ins.objectid = node->bytenr;
1684         ins.offset = node->num_bytes;
1685         ins.type = BTRFS_EXTENT_ITEM_KEY;
1686
1687         ref = btrfs_delayed_node_to_data_ref(node);
1688         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1689                 parent = ref->parent;
1690         else
1691                 ref_root = ref->root;
1692
1693         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1694                 if (extent_op) {
1695                         BUG_ON(extent_op->update_key);
1696                         flags |= extent_op->flags_to_set;
1697                 }
1698                 ret = alloc_reserved_file_extent(trans, root,
1699                                                  parent, ref_root, flags,
1700                                                  ref->objectid, ref->offset,
1701                                                  &ins, node->ref_mod);
1702                 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1703         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1704                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1705                                              node->num_bytes, parent,
1706                                              ref_root, ref->objectid,
1707                                              ref->offset, node->ref_mod,
1708                                              extent_op);
1709         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1710                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1711                                           node->num_bytes, parent,
1712                                           ref_root, ref->objectid,
1713                                           ref->offset, node->ref_mod,
1714                                           extent_op);
1715         } else {
1716                 BUG();
1717         }
1718         return ret;
1719 }
1720
1721 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1722                                     struct extent_buffer *leaf,
1723                                     struct btrfs_extent_item *ei)
1724 {
1725         u64 flags = btrfs_extent_flags(leaf, ei);
1726         if (extent_op->update_flags) {
1727                 flags |= extent_op->flags_to_set;
1728                 btrfs_set_extent_flags(leaf, ei, flags);
1729         }
1730
1731         if (extent_op->update_key) {
1732                 struct btrfs_tree_block_info *bi;
1733                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1734                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1735                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1736         }
1737 }
1738
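/*
 * apply a delayed extent op (flag and/or key update) directly to the
 * extent item in the extent tree.
 */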
1739 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1740                                  struct btrfs_root *root,
1741                                  struct btrfs_delayed_ref_node *node,
1742                                  struct btrfs_delayed_extent_op *extent_op)
1743 {
1744         struct btrfs_key key;
1745         struct btrfs_path *path;
1746         struct btrfs_extent_item *ei;
1747         struct extent_buffer *leaf;
1748         u32 item_size;
1749         int ret;
1750         int err = 0;
1751
1752         path = btrfs_alloc_path();
1753         if (!path)
1754                 return -ENOMEM;
1755
1756         key.objectid = node->bytenr;
1757         key.type = BTRFS_EXTENT_ITEM_KEY;
1758         key.offset = node->num_bytes;
1759
1760         path->reada = 1;
1761         path->leave_spinning = 1;
1762         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1763                                 path, 0, 1);
1764         if (ret < 0) {
1765                 err = ret;
1766                 goto out;
1767         }
1768         if (ret > 0) {
1769                 err = -EIO;
1770                 goto out;
1771         }
1772
1773         leaf = path->nodes[0];
1774         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1775 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1776         if (item_size < sizeof(*ei)) {
1777                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1778                                              path, (u64)-1, 0);
1779                 if (ret < 0) {
1780                         err = ret;
1781                         goto out;
1782                 }
1783                 leaf = path->nodes[0];
1784                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1785         }
1786 #endif
1787         BUG_ON(item_size < sizeof(*ei));
1788         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1789         __run_delayed_extent_op(extent_op, leaf, ei);
1790
1791         btrfs_mark_buffer_dirty(leaf);
1792 out:
1793         btrfs_free_path(path);
1794         return err;
1795 }
1796
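/*
 * process one delayed tree ref, the metadata counterpart of
 * run_delayed_data_ref.  Tree refs always carry a ref_mod of 1.
 */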
1797 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1798                                 struct btrfs_root *root,
1799                                 struct btrfs_delayed_ref_node *node,
1800                                 struct btrfs_delayed_extent_op *extent_op,
1801                                 int insert_reserved)
1802 {
1803         int ret = 0;
1804         struct btrfs_delayed_tree_ref *ref;
1805         struct btrfs_key ins;
1806         u64 parent = 0;
1807         u64 ref_root = 0;
1808
1809         ins.objectid = node->bytenr;
1810         ins.offset = node->num_bytes;
1811         ins.type = BTRFS_EXTENT_ITEM_KEY;
1812
1813         ref = btrfs_delayed_node_to_tree_ref(node);
1814         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1815                 parent = ref->parent;
1816         else
1817                 ref_root = ref->root;
1818
1819         BUG_ON(node->ref_mod != 1);
1820         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1821                 BUG_ON(!extent_op || !extent_op->update_flags ||
1822                        !extent_op->update_key);
1823                 ret = alloc_reserved_tree_block(trans, root,
1824                                                 parent, ref_root,
1825                                                 extent_op->flags_to_set,
1826                                                 &extent_op->key,
1827                                                 ref->level, &ins);
1828                 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1829         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1830                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1831                                              node->num_bytes, parent, ref_root,
1832                                              ref->level, 0, 1, extent_op);
1833         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1834                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1835                                           node->num_bytes, parent, ref_root,
1836                                           ref->level, 0, 1, extent_op);
1837         } else {
1838                 BUG();
1839         }
1840         return ret;
1841 }
1842
1844 /* helper function to actually process a single delayed ref entry */
1845 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1846                                struct btrfs_root *root,
1847                                struct btrfs_delayed_ref_node *node,
1848                                struct btrfs_delayed_extent_op *extent_op,
1849                                int insert_reserved)
1850 {
1851         int ret;
1852         if (btrfs_delayed_ref_is_head(node)) {
1853                 struct btrfs_delayed_ref_head *head;
1854                 /*
1855                  * we've hit the end of the chain and we were supposed
1856                  * to insert this extent into the tree.  But it got
1857                  * deleted before we ever needed to insert it, so all
1858                  * we have to do is clean up the accounting.
1859                  */
1860                 BUG_ON(extent_op);
1861                 head = btrfs_delayed_node_to_head(node);
1862                 if (insert_reserved) {
1863                         if (head->is_data) {
1864                                 ret = btrfs_del_csums(trans, root,
1865                                                       node->bytenr,
1866                                                       node->num_bytes);
1867                                 BUG_ON(ret);
1868                         }
1869                         btrfs_update_pinned_extents(root, node->bytenr,
1870                                                     node->num_bytes, 1, 0);
1871                         update_reserved_extents(root, node->bytenr,
1872                                                 node->num_bytes, 0);
1873                 }
1874                 mutex_unlock(&head->mutex);
1875                 return 0;
1876         }
1877
1878         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1879             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1880                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
1881                                            insert_reserved);
1882         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1883                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1884                 ret = run_delayed_data_ref(trans, root, node, extent_op,
1885                                            insert_reserved);
1886         else
1887                 BUG();
1888         return ret;
1889 }
1890
1891 static noinline struct btrfs_delayed_ref_node *
1892 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1893 {
1894         struct rb_node *node;
1895         struct btrfs_delayed_ref_node *ref;
1896         int action = BTRFS_ADD_DELAYED_REF;
1897 again:
1898         /*
1899          * select delayed refs of type BTRFS_ADD_DELAYED_REF first.
1900          * this prevents the ref count from going down to zero while
1901          * there are still pending delayed refs.
1902          */
1903         node = rb_prev(&head->node.rb_node);
1904         while (1) {
1905                 if (!node)
1906                         break;
1907                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1908                                 rb_node);
1909                 if (ref->bytenr != head->node.bytenr)
1910                         break;
1911                 if (ref->action == action)
1912                         return ref;
1913                 node = rb_prev(node);
1914         }
1915         if (action == BTRFS_ADD_DELAYED_REF) {
1916                 action = BTRFS_DROP_DELAYED_REF;
1917                 goto again;
1918         }
1919         return NULL;
1920 }
1921
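/*
 * run the delayed refs for all the heads on the given cluster list.
 * Returns the number of refs (head nodes included) that were
 * processed.
 */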
1922 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
1923                                        struct btrfs_root *root,
1924                                        struct list_head *cluster)
1925 {
1926         struct btrfs_delayed_ref_root *delayed_refs;
1927         struct btrfs_delayed_ref_node *ref;
1928         struct btrfs_delayed_ref_head *locked_ref = NULL;
1929         struct btrfs_delayed_extent_op *extent_op;
1930         int ret;
1931         int count = 0;
1932         int must_insert_reserved = 0;
1933
1934         delayed_refs = &trans->transaction->delayed_refs;
1935         while (1) {
1936                 if (!locked_ref) {
1937                         /* pick a new head ref from the cluster list */
1938                         if (list_empty(cluster))
1939                                 break;
1940
1941                         locked_ref = list_entry(cluster->next,
1942                                      struct btrfs_delayed_ref_head, cluster);
1943
1944                         /* grab the lock that says we are going to process
1945                          * all the refs for this head */
1946                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
1947
1948                         /*
1949                          * we may have dropped the spin lock to get the head
1950                          * mutex lock, and that might have given someone else
1951                          * time to free the head.  If that's true, it has been
1952                          * removed from our list and we can move on.
1953                          */
1954                         if (ret == -EAGAIN) {
1955                                 locked_ref = NULL;
1956                                 count++;
1957                                 continue;
1958                         }
1959                 }
1960
1961                 /*
1962                  * record the must insert reserved flag before we
1963                  * drop the spin lock.
1964                  */
1965                 must_insert_reserved = locked_ref->must_insert_reserved;
1966                 locked_ref->must_insert_reserved = 0;
1967
1968                 extent_op = locked_ref->extent_op;
1969                 locked_ref->extent_op = NULL;
1970
1971                 /*
1972                  * locked_ref is the head node, so we have to go one
1973                  * node back for any delayed ref updates
1974                  */
1975                 ref = select_delayed_ref(locked_ref);
1976                 if (!ref) {
1977                         /* all delayed refs have been processed, go ahead
1978                          * and send the head node to run_one_delayed_ref,
1979                          * so that any accounting fixes can happen
1980                          */
1981                         ref = &locked_ref->node;
1982
1983                         if (extent_op && must_insert_reserved) {
1984                                 kfree(extent_op);
1985                                 extent_op = NULL;
1986                         }
1987
1988                         if (extent_op) {
1989                                 spin_unlock(&delayed_refs->lock);
1990
1991                                 ret = run_delayed_extent_op(trans, root,
1992                                                             ref, extent_op);
1993                                 BUG_ON(ret);
1994                                 kfree(extent_op);
1995
1996                                 cond_resched();
1997                                 spin_lock(&delayed_refs->lock);
1998                                 continue;
1999                         }
2000
2001                         list_del_init(&locked_ref->cluster);
2002                         locked_ref = NULL;
2003                 }
2004
2005                 ref->in_tree = 0;
2006                 rb_erase(&ref->rb_node, &delayed_refs->root);
2007                 delayed_refs->num_entries--;
2008
2009                 spin_unlock(&delayed_refs->lock);
2010
2011                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2012                                           must_insert_reserved);
2013                 BUG_ON(ret);
2014
2015                 btrfs_put_delayed_ref(ref);
2016                 kfree(extent_op);
2017                 count++;
2018
2019                 cond_resched();
2020                 spin_lock(&delayed_refs->lock);
2021         }
2022         return count;
2023 }
2024
2025 /*
2026  * this starts processing the delayed reference count updates and
2027  * extent insertions we have queued up so far.  count can be
2028  * 0, which means to process everything in the tree at the start
2029  * of the run (but not newly added entries), or it can be some target
2030  * number you'd like to process.
2031  */
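/*
 * An illustrative call, matching how the commit path below flushes
 * everything queued so far:
 *
 *	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 *	BUG_ON(ret);
 */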
2032 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2033                            struct btrfs_root *root, unsigned long count)
2034 {
2035         struct rb_node *node;
2036         struct btrfs_delayed_ref_root *delayed_refs;
2037         struct btrfs_delayed_ref_node *ref;
2038         struct list_head cluster;
2039         int ret;
2040         int run_all = count == (unsigned long)-1;
2041         int run_most = 0;
2042
2043         if (root == root->fs_info->extent_root)
2044                 root = root->fs_info->tree_root;
2045
2046         delayed_refs = &trans->transaction->delayed_refs;
2047         INIT_LIST_HEAD(&cluster);
2048 again:
2049         spin_lock(&delayed_refs->lock);
2050         if (count == 0) {
2051                 count = delayed_refs->num_entries * 2;
2052                 run_most = 1;
2053         }
2054         while (1) {
2055                 if (!(run_all || run_most) &&
2056                     delayed_refs->num_heads_ready < 64)
2057                         break;
2058
2059                 /*
2060                  * go find something we can process in the rbtree.  We
2061                  * start at run_delayed_start, and then build a cluster
2062                  * of refs to process starting at the first head we are
2063                  * able to lock
2064                  */
2065                 ret = btrfs_find_ref_cluster(trans, &cluster,
2066                                              delayed_refs->run_delayed_start);
2067                 if (ret)
2068                         break;
2069
2070                 ret = run_clustered_refs(trans, root, &cluster);
2071                 BUG_ON(ret < 0);
2072
2073                 count -= min_t(unsigned long, ret, count);
2074
2075                 if (count == 0)
2076                         break;
2077         }
2078
2079         if (run_all) {
2080                 node = rb_first(&delayed_refs->root);
2081                 if (!node)
2082                         goto out;
2083                 count = (unsigned long)-1;
2084
2085                 while (node) {
2086                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2087                                        rb_node);
2088                         if (btrfs_delayed_ref_is_head(ref)) {
2089                                 struct btrfs_delayed_ref_head *head;
2090
2091                                 head = btrfs_delayed_node_to_head(ref);
2092                                 atomic_inc(&ref->refs);
2093
2094                                 spin_unlock(&delayed_refs->lock);
2095                                 mutex_lock(&head->mutex);
2096                                 mutex_unlock(&head->mutex);
2097
2098                                 btrfs_put_delayed_ref(ref);
2099                                 cond_resched();
2100                                 goto again;
2101                         }
2102                         node = rb_next(node);
2103                 }
2104                 spin_unlock(&delayed_refs->lock);
2105                 schedule_timeout(1);
2106                 goto again;
2107         }
2108 out:
2109         spin_unlock(&delayed_refs->lock);
2110         return 0;
2111 }
2112
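/*
 * record a flag update for an extent item as a delayed extent op.
 * An illustrative caller, marking a tree block with a full backref:
 *
 *	ret = btrfs_set_disk_extent_flags(trans, root, buf->start,
 *					  buf->len,
 *					  BTRFS_BLOCK_FLAG_FULL_BACKREF, 0);
 */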
2113 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2114                                 struct btrfs_root *root,
2115                                 u64 bytenr, u64 num_bytes, u64 flags,
2116                                 int is_data)
2117 {
2118         struct btrfs_delayed_extent_op *extent_op;
2119         int ret;
2120
2121         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2122         if (!extent_op)
2123                 return -ENOMEM;
2124
2125         extent_op->flags_to_set = flags;
2126         extent_op->update_flags = 1;
2127         extent_op->update_key = 0;
2128         extent_op->is_data = is_data ? 1 : 0;
2129
2130         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2131         if (ret)
2132                 kfree(extent_op);
2133         return ret;
2134 }
2135
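/*
 * returns 0 when the only pending delayed ref on this extent is our
 * own data ref, 1 when the extent may be referenced by someone else,
 * -ENOENT when there are no pending refs for it, and -EAGAIN when
 * the head was contended and the caller should retry.
 */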
2136 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2137                                       struct btrfs_root *root,
2138                                       struct btrfs_path *path,
2139                                       u64 objectid, u64 offset, u64 bytenr)
2140 {
2141         struct btrfs_delayed_ref_head *head;
2142         struct btrfs_delayed_ref_node *ref;
2143         struct btrfs_delayed_data_ref *data_ref;
2144         struct btrfs_delayed_ref_root *delayed_refs;
2145         struct rb_node *node;
2146         int ret = 0;
2147
2148         ret = -ENOENT;
2149         delayed_refs = &trans->transaction->delayed_refs;
2150         spin_lock(&delayed_refs->lock);
2151         head = btrfs_find_delayed_ref_head(trans, bytenr);
2152         if (!head)
2153                 goto out;
2154
2155         if (!mutex_trylock(&head->mutex)) {
2156                 atomic_inc(&head->node.refs);
2157                 spin_unlock(&delayed_refs->lock);
2158
2159                 btrfs_release_path(root->fs_info->extent_root, path);
2160
2161                 mutex_lock(&head->mutex);
2162                 mutex_unlock(&head->mutex);
2163                 btrfs_put_delayed_ref(&head->node);
2164                 return -EAGAIN;
2165         }
2166
2167         node = rb_prev(&head->node.rb_node);
2168         if (!node)
2169                 goto out_unlock;
2170
2171         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2172
2173         if (ref->bytenr != bytenr)
2174                 goto out_unlock;
2175
2176         ret = 1;
2177         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2178                 goto out_unlock;
2179
2180         data_ref = btrfs_delayed_node_to_data_ref(ref);
2181
2182         node = rb_prev(node);
2183         if (node) {
2184                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2185                 if (ref->bytenr == bytenr)
2186                         goto out_unlock;
2187         }
2188
2189         if (data_ref->root != root->root_key.objectid ||
2190             data_ref->objectid != objectid || data_ref->offset != offset)
2191                 goto out_unlock;
2192
2193         ret = 0;
2194 out_unlock:
2195         mutex_unlock(&head->mutex);
2196 out:
2197         spin_unlock(&delayed_refs->lock);
2198         return ret;
2199 }
2200
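/*
 * the on-disk counterpart of check_delayed_ref: returns 0 only when
 * the extent item is newer than the last snapshot and carries exactly
 * one inline data ref that belongs to us, 1 when the extent may be
 * shared, and -ENOENT when no matching extent item is found.
 */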
2201 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2202                                         struct btrfs_root *root,
2203                                         struct btrfs_path *path,
2204                                         u64 objectid, u64 offset, u64 bytenr)
2205 {
2206         struct btrfs_root *extent_root = root->fs_info->extent_root;
2207         struct extent_buffer *leaf;
2208         struct btrfs_extent_data_ref *ref;
2209         struct btrfs_extent_inline_ref *iref;
2210         struct btrfs_extent_item *ei;
2211         struct btrfs_key key;
2212         u32 item_size;
2213         int ret;
2214
2215         key.objectid = bytenr;
2216         key.offset = (u64)-1;
2217         key.type = BTRFS_EXTENT_ITEM_KEY;
2218
2219         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2220         if (ret < 0)
2221                 goto out;
2222         BUG_ON(ret == 0);
2223
2224         ret = -ENOENT;
2225         if (path->slots[0] == 0)
2226                 goto out;
2227
2228         path->slots[0]--;
2229         leaf = path->nodes[0];
2230         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2231
2232         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2233                 goto out;
2234
2235         ret = 1;
2236         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2237 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2238         if (item_size < sizeof(*ei)) {
2239                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2240                 goto out;
2241         }
2242 #endif
2243         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2244
2245         if (item_size != sizeof(*ei) +
2246             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2247                 goto out;
2248
2249         if (btrfs_extent_generation(leaf, ei) <=
2250             btrfs_root_last_snapshot(&root->root_item))
2251                 goto out;
2252
2253         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2254         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2255             BTRFS_EXTENT_DATA_REF_KEY)
2256                 goto out;
2257
2258         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2259         if (btrfs_extent_refs(leaf, ei) !=
2260             btrfs_extent_data_ref_count(leaf, ref) ||
2261             btrfs_extent_data_ref_root(leaf, ref) !=
2262             root->root_key.objectid ||
2263             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2264             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2265                 goto out;
2266
2267         ret = 0;
2268 out:
2269         return ret;
2270 }
2271
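/*
 * returns 0 when the data extent at bytenr is referenced only by this
 * inode in this root (both on disk and in the delayed ref tree), and
 * nonzero when it may be shared.
 */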
2272 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2273                           struct btrfs_root *root,
2274                           u64 objectid, u64 offset, u64 bytenr)
2275 {
2276         struct btrfs_path *path;
2277         int ret;
2278         int ret2;
2279
2280         path = btrfs_alloc_path();
2281         if (!path)
2282                 return -ENOMEM;
2283
2284         do {
2285                 ret = check_committed_ref(trans, root, path, objectid,
2286                                           offset, bytenr);
2287                 if (ret && ret != -ENOENT)
2288                         goto out;
2289
2290                 ret2 = check_delayed_ref(trans, root, path, objectid,
2291                                          offset, bytenr);
2292         } while (ret2 == -EAGAIN);
2293
2294         if (ret2 && ret2 != -ENOENT) {
2295                 ret = ret2;
2296                 goto out;
2297         }
2298
2299         if (ret != -ENOENT || ret2 != -ENOENT)
2300                 ret = 0;
2301 out:
2302         btrfs_free_path(path);
2303         return ret;
2304 }
2305
2306 #if 0
2307 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2308                     struct extent_buffer *buf, u32 nr_extents)
2309 {
2310         struct btrfs_key key;
2311         struct btrfs_file_extent_item *fi;
2312         u64 root_gen;
2313         u32 nritems;
2314         int i;
2315         int level;
2316         int ret = 0;
2317         int shared = 0;
2318
2319         if (!root->ref_cows)
2320                 return 0;
2321
2322         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2323                 shared = 0;
2324                 root_gen = root->root_key.offset;
2325         } else {
2326                 shared = 1;
2327                 root_gen = trans->transid - 1;
2328         }
2329
2330         level = btrfs_header_level(buf);
2331         nritems = btrfs_header_nritems(buf);
2332
2333         if (level == 0) {
2334                 struct btrfs_leaf_ref *ref;
2335                 struct btrfs_extent_info *info;
2336
2337                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2338                 if (!ref) {
2339                         ret = -ENOMEM;
2340                         goto out;
2341                 }
2342
2343                 ref->root_gen = root_gen;
2344                 ref->bytenr = buf->start;
2345                 ref->owner = btrfs_header_owner(buf);
2346                 ref->generation = btrfs_header_generation(buf);
2347                 ref->nritems = nr_extents;
2348                 info = ref->extents;
2349
2350                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2351                         u64 disk_bytenr;
2352                         btrfs_item_key_to_cpu(buf, &key, i);
2353                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2354                                 continue;
2355                         fi = btrfs_item_ptr(buf, i,
2356                                             struct btrfs_file_extent_item);
2357                         if (btrfs_file_extent_type(buf, fi) ==
2358                             BTRFS_FILE_EXTENT_INLINE)
2359                                 continue;
2360                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2361                         if (disk_bytenr == 0)
2362                                 continue;
2363
2364                         info->bytenr = disk_bytenr;
2365                         info->num_bytes =
2366                                 btrfs_file_extent_disk_num_bytes(buf, fi);
2367                         info->objectid = key.objectid;
2368                         info->offset = key.offset;
2369                         info++;
2370                 }
2371
2372                 ret = btrfs_add_leaf_ref(root, ref, shared);
2373                 if (ret == -EEXIST && shared) {
2374                         struct btrfs_leaf_ref *old;
2375                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2376                         BUG_ON(!old);
2377                         btrfs_remove_leaf_ref(root, old);
2378                         btrfs_free_leaf_ref(root, old);
2379                         ret = btrfs_add_leaf_ref(root, ref, shared);
2380                 }
2381                 WARN_ON(ret);
2382                 btrfs_free_leaf_ref(root, ref);
2383         }
2384 out:
2385         return ret;
2386 }
2387
2388 /* when a block goes through cow, we update the reference counts of
2389  * everything that block points to.  The internal pointers of the block
2390  * can be in just about any order, and it is likely to have clusters of
2391  * things that are close together and clusters of things that are not.
2392  *
2393  * To help reduce the seeks that come with updating all of these reference
2394  * counts, sort them by byte number before actual updates are done.
2395  *
2396  * struct refsort is used to match byte number to slot in the btree block.
2397  * we sort based on the byte number and then use the slot to actually
2398  * find the item.
2399  *
2400  * struct refsort is smaller than struct btrfs_item and smaller than
2401  * struct btrfs_key_ptr.  Since we're currently limited to the page size
2402  * for a btree block, there's no way for a kmalloc of refsorts for a
2403  * single node to be bigger than a page.
2404  */
2405 struct refsort {
2406         u64 bytenr;
2407         u32 slot;
2408 };
2409
2410 /*
2411  * for passing into sort()
2412  */
2413 static int refsort_cmp(const void *a_void, const void *b_void)
2414 {
2415         const struct refsort *a = a_void;
2416         const struct refsort *b = b_void;
2417
2418         if (a->bytenr < b->bytenr)
2419                 return -1;
2420         if (a->bytenr > b->bytenr)
2421                 return 1;
2422         return 0;
2423 }
2424 #endif
2425
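/*
 * add or drop refs on everything the given buffer points to: file
 * extent disk bytenrs in a leaf, child block pointers in a node.
 * full_backref selects whether the refs are parented by this buffer
 * or owned directly by ref_root.
 */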
2426 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2427                            struct btrfs_root *root,
2428                            struct extent_buffer *buf,
2429                            int full_backref, int inc)
2430 {
2431         u64 bytenr;
2432         u64 num_bytes;
2433         u64 parent;
2434         u64 ref_root;
2435         u32 nritems;
2436         struct btrfs_key key;
2437         struct btrfs_file_extent_item *fi;
2438         int i;
2439         int level;
2440         int ret = 0;
2441         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2442                             u64, u64, u64, u64, u64, u64);
2443
2444         ref_root = btrfs_header_owner(buf);
2445         nritems = btrfs_header_nritems(buf);
2446         level = btrfs_header_level(buf);
2447
2448         if (!root->ref_cows && level == 0)
2449                 return 0;
2450
2451         if (inc)
2452                 process_func = btrfs_inc_extent_ref;
2453         else
2454                 process_func = btrfs_free_extent;
2455
2456         if (full_backref)
2457                 parent = buf->start;
2458         else
2459                 parent = 0;
2460
2461         for (i = 0; i < nritems; i++) {
2462                 if (level == 0) {
2463                         btrfs_item_key_to_cpu(buf, &key, i);
2464                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2465                                 continue;
2466                         fi = btrfs_item_ptr(buf, i,
2467                                             struct btrfs_file_extent_item);
2468                         if (btrfs_file_extent_type(buf, fi) ==
2469                             BTRFS_FILE_EXTENT_INLINE)
2470                                 continue;
2471                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2472                         if (bytenr == 0)
2473                                 continue;
2474
2475                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2476                         key.offset -= btrfs_file_extent_offset(buf, fi);
2477                         ret = process_func(trans, root, bytenr, num_bytes,
2478                                            parent, ref_root, key.objectid,
2479                                            key.offset);
2480                         if (ret)
2481                                 goto fail;
2482                 } else {
2483                         bytenr = btrfs_node_blockptr(buf, i);
2484                         num_bytes = btrfs_level_size(root, level - 1);
2485                         ret = process_func(trans, root, bytenr, num_bytes,
2486                                            parent, ref_root, level - 1, 0);
2487                         if (ret)
2488                                 goto fail;
2489                 }
2490         }
2491         return 0;
2492 fail:
2493         BUG();
2494         return ret;
2495 }
2496
2497 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2498                   struct extent_buffer *buf, int full_backref)
2499 {
2500         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2501 }
2502
2503 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2504                   struct extent_buffer *buf, int full_backref)
2505 {
2506         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2507 }
2508
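/*
 * copy the in-memory block group item for this cache back into its
 * slot in the extent tree.
 */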
2509 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2510                                  struct btrfs_root *root,
2511                                  struct btrfs_path *path,
2512                                  struct btrfs_block_group_cache *cache)
2513 {
2514         int ret;
2515         struct btrfs_root *extent_root = root->fs_info->extent_root;
2516         unsigned long bi;
2517         struct extent_buffer *leaf;
2518
2519         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2520         if (ret < 0)
2521                 goto fail;
2522         BUG_ON(ret);
2523
2524         leaf = path->nodes[0];
2525         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2526         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2527         btrfs_mark_buffer_dirty(leaf);
2528         btrfs_release_path(extent_root, path);
2529 fail:
2530         if (ret)
2531                 return ret;
2532         return 0;
2534 }
2535
2536 static struct btrfs_block_group_cache *
2537 next_block_group(struct btrfs_root *root,
2538                  struct btrfs_block_group_cache *cache)
2539 {
2540         struct rb_node *node;
2541         spin_lock(&root->fs_info->block_group_cache_lock);
2542         node = rb_next(&cache->cache_node);
2543         btrfs_put_block_group(cache);
2544         if (node) {
2545                 cache = rb_entry(node, struct btrfs_block_group_cache,
2546                                  cache_node);
2547                 atomic_inc(&cache->count);
2548         } else
2549                 cache = NULL;
2550         spin_unlock(&root->fs_info->block_group_cache_lock);
2551         return cache;
2552 }
2553
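/*
 * run the delayed refs and then write out every dirty block group
 * item, restarting from the beginning until a full pass finds
 * nothing left dirty.
 */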
2554 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2555                                    struct btrfs_root *root)
2556 {
2557         struct btrfs_block_group_cache *cache;
2558         int err = 0;
2559         struct btrfs_path *path;
2560         u64 last = 0;
2561
2562         path = btrfs_alloc_path();
2563         if (!path)
2564                 return -ENOMEM;
2565
2566         while (1) {
2567                 if (last == 0) {
2568                         err = btrfs_run_delayed_refs(trans, root,
2569                                                      (unsigned long)-1);
2570                         BUG_ON(err);
2571                 }
2572
2573                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2574                 while (cache) {
2575                         if (cache->dirty)
2576                                 break;
2577                         cache = next_block_group(root, cache);
2578                 }
2579                 if (!cache) {
2580                         if (last == 0)
2581                                 break;
2582                         last = 0;
2583                         continue;
2584                 }
2585
2586                 cache->dirty = 0;
2587                 last = cache->key.objectid + cache->key.offset;
2588
2589                 err = write_one_cache_group(trans, root, path, cache);
2590                 BUG_ON(err);
2591                 btrfs_put_block_group(cache);
2592         }
2593
2594         btrfs_free_path(path);
2595         return 0;
2596 }
2597
2598 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2599 {
2600         struct btrfs_block_group_cache *block_group;
2601         int readonly = 0;
2602
2603         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2604         if (!block_group || block_group->ro)
2605                 readonly = 1;
2606         if (block_group)
2607                 btrfs_put_block_group(block_group);
2608         return readonly;
2609 }
2610
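/*
 * add total_bytes/bytes_used to the space info matching the given
 * flags, allocating and initializing a new space info when this is
 * the first block group of its kind.
 */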
2611 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2612                              u64 total_bytes, u64 bytes_used,
2613                              struct btrfs_space_info **space_info)
2614 {
2615         struct btrfs_space_info *found;
2616
2617         found = __find_space_info(info, flags);
2618         if (found) {
2619                 spin_lock(&found->lock);
2620                 found->total_bytes += total_bytes;
2621                 found->bytes_used += bytes_used;
2622                 found->full = 0;
2623                 spin_unlock(&found->lock);
2624                 *space_info = found;
2625                 return 0;
2626         }
2627         found = kzalloc(sizeof(*found), GFP_NOFS);
2628         if (!found)
2629                 return -ENOMEM;
2630
2631         INIT_LIST_HEAD(&found->block_groups);
2632         init_rwsem(&found->groups_sem);
2633         spin_lock_init(&found->lock);
2634         found->flags = flags;
2635         found->total_bytes = total_bytes;
2636         found->bytes_used = bytes_used;
2637         found->bytes_pinned = 0;
2638         found->bytes_reserved = 0;
2639         found->bytes_readonly = 0;
2640         found->bytes_delalloc = 0;
2641         found->full = 0;
2642         found->force_alloc = 0;
2643         *space_info = found;
2644         list_add_rcu(&found->list, &info->space_info);
2645         atomic_set(&found->caching_threads, 0);
2646         return 0;
2647 }
2648
2649 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2650 {
2651         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2652                                    BTRFS_BLOCK_GROUP_RAID1 |
2653                                    BTRFS_BLOCK_GROUP_RAID10 |
2654                                    BTRFS_BLOCK_GROUP_DUP);
2655         if (extra_flags) {
2656                 if (flags & BTRFS_BLOCK_GROUP_DATA)
2657                         fs_info->avail_data_alloc_bits |= extra_flags;
2658                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
2659                         fs_info->avail_metadata_alloc_bits |= extra_flags;
2660                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2661                         fs_info->avail_system_alloc_bits |= extra_flags;
2662         }
2663 }
2664
2665 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2666 {
2667         spin_lock(&cache->space_info->lock);
2668         spin_lock(&cache->lock);
2669         if (!cache->ro) {
2670                 cache->space_info->bytes_readonly += cache->key.offset -
2671                                         btrfs_block_group_used(&cache->item);
2672                 cache->ro = 1;
2673         }
2674         spin_unlock(&cache->lock);
2675         spin_unlock(&cache->space_info->lock);
2676 }
2677
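/*
 * reduce the requested allocation profile to one the currently
 * available devices can provide: e.g. with a single rw device the
 * RAID0/RAID1 bits are cleared, and redundant combinations such as
 * DUP together with RAID1/RAID10 are collapsed.
 */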
2678 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2679 {
2680         u64 num_devices = root->fs_info->fs_devices->rw_devices;
2681
2682         if (num_devices == 1)
2683                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2684         if (num_devices < 4)
2685                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2686
2687         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2688             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2689                       BTRFS_BLOCK_GROUP_RAID10))) {
2690                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
2691         }
2692
2693         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2694             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
2695                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2696         }
2697
2698         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2699             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
2700              (flags & BTRFS_BLOCK_GROUP_RAID10) |
2701              (flags & BTRFS_BLOCK_GROUP_DUP)))
2702                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2703         return flags;
2704 }
2705
2706 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2707 {
2708         struct btrfs_fs_info *info = root->fs_info;
2709         u64 alloc_profile;
2710
2711         if (data) {
2712                 alloc_profile = info->avail_data_alloc_bits &
2713                         info->data_alloc_profile;
2714                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2715         } else if (root == root->fs_info->chunk_root) {
2716                 alloc_profile = info->avail_system_alloc_bits &
2717                         info->system_alloc_profile;
2718                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2719         } else {
2720                 alloc_profile = info->avail_metadata_alloc_bits &
2721                         info->metadata_alloc_profile;
2722                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2723         }
2724
2725         return btrfs_reduce_alloc_profile(root, data);
2726 }
2727
2728 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2729 {
2730         u64 alloc_target;
2731
2732         alloc_target = btrfs_get_alloc_profile(root, 1);
2733         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2734                                                        alloc_target);
2735 }
2736
2737 /*
2738  * for now this just makes sure we have enough metadata space free: at least
2739  * 20% of the total until the space info is full, and at least 5% after that.
2740  */
2741 int btrfs_check_metadata_free_space(struct btrfs_root *root)
2742 {
2743         struct btrfs_fs_info *info = root->fs_info;
2744         struct btrfs_space_info *meta_sinfo;
2745         u64 alloc_target, thresh;
2746         int committed = 0, ret;
2747
2748         /* get the space info for where the metadata will live */
2749         alloc_target = btrfs_get_alloc_profile(root, 0);
2750         meta_sinfo = __find_space_info(info, alloc_target);
2751
2752 again:
2753         spin_lock(&meta_sinfo->lock);
2754         if (!meta_sinfo->full)
2755                 thresh = meta_sinfo->total_bytes * 80;
2756         else
2757                 thresh = meta_sinfo->total_bytes * 95;
2758
2759         do_div(thresh, 100);
2760
2761         if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2762             meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2763                 struct btrfs_trans_handle *trans;
2764                 if (!meta_sinfo->full) {
2765                         meta_sinfo->force_alloc = 1;
2766                         spin_unlock(&meta_sinfo->lock);
2767
2768                         trans = btrfs_start_transaction(root, 1);
2769                         if (!trans)
2770                                 return -ENOMEM;
2771
2772                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2773                                              2 * 1024 * 1024, alloc_target, 0);
2774                         btrfs_end_transaction(trans, root);
2775                         goto again;
2776                 }
2777                 spin_unlock(&meta_sinfo->lock);
2778
2779                 if (!committed) {
2780                         committed = 1;
2781                         trans = btrfs_join_transaction(root, 1);
2782                         if (!trans)
2783                                 return -ENOMEM;
2784                         ret = btrfs_commit_transaction(trans, root);
2785                         if (ret)
2786                                 return ret;
2787                         goto again;
2788                 }
2789                 return -ENOSPC;
2790         }
2791         spin_unlock(&meta_sinfo->lock);
2792
2793         return 0;
2794 }
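
/*
 * Illustrative sketch (user space, made-up byte counts): the threshold
 * math above.  do_div() divides a u64 in place in the kernel; plain
 * 64-bit division stands in for it here.
 */
#include <stdio.h>
#include <stdint.h>

static int demo_metadata_ok(uint64_t total, uint64_t committed, int full)
{
        uint64_t thresh = total * (full ? 95 : 80);

        thresh /= 100;          /* stands in for do_div(thresh, 100) */
        return committed <= thresh;
}

int main(void)
{
        /* 1 GiB pool, 900 MiB committed: over the 80% line, under 95% */
        uint64_t gib = 1024ULL * 1024 * 1024;
        uint64_t committed = 900ULL * 1024 * 1024;

        printf("not full: %d\n", demo_metadata_ok(gib, committed, 0));
        printf("full:     %d\n", demo_metadata_ok(gib, committed, 1));
        return 0;
}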
2795
2796 /*
2797  * This will check the space info that the inode allocates from to make sure
2798  * we have enough room for the requested number of bytes.
2799  */
2800 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2801                                 u64 bytes)
2802 {
2803         struct btrfs_space_info *data_sinfo;
2804         int ret = 0, committed = 0;
2805
2806         /* make sure bytes are sectorsize aligned */
2807         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2808
2809         data_sinfo = BTRFS_I(inode)->space_info;
2810 again:
2811         /* make sure we have enough space to handle the data first */
2812         spin_lock(&data_sinfo->lock);
2813         if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2814             data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2815             data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2816             data_sinfo->bytes_may_use < bytes) {
2817                 struct btrfs_trans_handle *trans;
2818
2819                 /*
2820                  * if we don't have enough free bytes in this space then we need
2821                  * to alloc a new chunk.
2822                  */
2823                 if (!data_sinfo->full) {
2824                         u64 alloc_target;
2825
2826                         data_sinfo->force_alloc = 1;
2827                         spin_unlock(&data_sinfo->lock);
2828
2829                         alloc_target = btrfs_get_alloc_profile(root, 1);
2830                         trans = btrfs_start_transaction(root, 1);
2831                         if (!trans)
2832                                 return -ENOMEM;
2833
2834                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2835                                              bytes + 2 * 1024 * 1024,
2836                                              alloc_target, 0);
2837                         btrfs_end_transaction(trans, root);
2838                         if (ret)
2839                                 return ret;
2840                         goto again;
2841                 }
2842                 spin_unlock(&data_sinfo->lock);
2843
2844                 /* commit the current transaction and try again */
2845                 if (!committed) {
2846                         committed = 1;
2847                         trans = btrfs_join_transaction(root, 1);
2848                         if (!trans)
2849                                 return -ENOMEM;
2850                         ret = btrfs_commit_transaction(trans, root);
2851                         if (ret)
2852                                 return ret;
2853                         goto again;
2854                 }
2855
2856                 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2857                        ", %llu bytes_used, %llu bytes_reserved, "
2858                        "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
2859                        "%llu total\n", (unsigned long long)bytes,
2860                        (unsigned long long)data_sinfo->bytes_delalloc,
2861                        (unsigned long long)data_sinfo->bytes_used,
2862                        (unsigned long long)data_sinfo->bytes_reserved,
2863                        (unsigned long long)data_sinfo->bytes_pinned,
2864                        (unsigned long long)data_sinfo->bytes_readonly,
2865                        (unsigned long long)data_sinfo->bytes_may_use,
2866                        (unsigned long long)data_sinfo->total_bytes);
2867                 return -ENOSPC;
2868         }
2869         data_sinfo->bytes_may_use += bytes;
2870         BTRFS_I(inode)->reserved_bytes += bytes;
2871         spin_unlock(&data_sinfo->lock);
2872
2873         return btrfs_check_metadata_free_space(root);
2874 }
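
/*
 * Illustrative sketch: the availability test above as a stand-alone
 * predicate.  The struct mirrors the btrfs_space_info fields used here
 * but is a user-space stand-in, not the kernel structure.
 */
#include <stdio.h>
#include <stdint.h>

struct demo_space_info {
        uint64_t total_bytes;
        uint64_t bytes_used;
        uint64_t bytes_delalloc;
        uint64_t bytes_reserved;
        uint64_t bytes_pinned;
        uint64_t bytes_readonly;
        uint64_t bytes_may_use;
};

/* true when 'bytes' more may-use bytes still fit in this space info */
static int demo_data_space_available(const struct demo_space_info *s,
                                     uint64_t bytes)
{
        return s->total_bytes - s->bytes_used - s->bytes_delalloc -
               s->bytes_reserved - s->bytes_pinned - s->bytes_readonly -
               s->bytes_may_use >= bytes;
}

int main(void)
{
        struct demo_space_info s = { .total_bytes = 1 << 20,
                                     .bytes_used = 1 << 19 };

        printf("%d\n", demo_data_space_available(&s, 1 << 18)); /* 1 */
        return 0;
}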
2875
2876 /*
2877  * if there was an error for whatever reason after calling
2878  * btrfs_check_data_free_space, call this so we can clean up the counters.
2879  */
2880 void btrfs_free_reserved_data_space(struct btrfs_root *root,
2881                                     struct inode *inode, u64 bytes)
2882 {
2883         struct btrfs_space_info *data_sinfo;
2884
2885         /* make sure bytes are sectorsize aligned */
2886         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2887
2888         data_sinfo = BTRFS_I(inode)->space_info;
2889         spin_lock(&data_sinfo->lock);
2890         data_sinfo->bytes_may_use -= bytes;
2891         BTRFS_I(inode)->reserved_bytes -= bytes;
2892         spin_unlock(&data_sinfo->lock);
2893 }
2894
2895 /* called when we are adding a delalloc extent to the inode's io_tree */
2896 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2897                                   u64 bytes)
2898 {
2899         struct btrfs_space_info *data_sinfo;
2900
2901         /* get the space info for where this inode will be storing its data */
2902         data_sinfo = BTRFS_I(inode)->space_info;
2903
2904         /* make sure we have enough space to handle the data first */
2905         spin_lock(&data_sinfo->lock);
2906         data_sinfo->bytes_delalloc += bytes;
2907
2908         /*
2909          * we are adding a delalloc extent without calling
2910          * btrfs_check_data_free_space first.  This happens on a weird
2911          * writepage condition, but shouldn't hurt our accounting
2912          */
2913         if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2914                 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2915                 BTRFS_I(inode)->reserved_bytes = 0;
2916         } else {
2917                 data_sinfo->bytes_may_use -= bytes;
2918                 BTRFS_I(inode)->reserved_bytes -= bytes;
2919         }
2920
2921         spin_unlock(&data_sinfo->lock);
2922 }
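
/*
 * Illustrative sketch (demo values): the underflow guard above.  When more
 * delalloc bytes arrive than were ever reserved, only the reserved amount
 * is subtracted from may_use, so the counter can't wrap below zero.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t bytes_may_use = 4096, reserved = 4096;
        uint64_t bytes = 8192;  /* delalloc without a matching reservation */

        if (bytes > reserved) {
                bytes_may_use -= reserved;
                reserved = 0;
        } else {
                bytes_may_use -= bytes;
                reserved -= bytes;
        }
        printf("may_use=%llu reserved=%llu\n",
               (unsigned long long)bytes_may_use,
               (unsigned long long)reserved);
        return 0;
}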
2923
2924 /* called when we are clearing a delalloc extent from the inode's io_tree */
2925 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2926                               u64 bytes)
2927 {
2928         struct btrfs_space_info *info;
2929
2930         info = BTRFS_I(inode)->space_info;
2931
2932         spin_lock(&info->lock);
2933         info->bytes_delalloc -= bytes;
2934         spin_unlock(&info->lock);
2935 }
2936
2937 static void force_metadata_allocation(struct btrfs_fs_info *info)
2938 {
2939         struct list_head *head = &info->space_info;
2940         struct btrfs_space_info *found;
2941
2942         rcu_read_lock();
2943         list_for_each_entry_rcu(found, head, list) {
2944                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
2945                         found->force_alloc = 1;
2946         }
2947         rcu_read_unlock();
2948 }
2949
2950 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2951                           struct btrfs_root *extent_root, u64 alloc_bytes,
2952                           u64 flags, int force)
2953 {
2954         struct btrfs_space_info *space_info;
2955         struct btrfs_fs_info *fs_info = extent_root->fs_info;
2956         u64 thresh;
2957         int ret = 0;
2958
2959         mutex_lock(&fs_info->chunk_mutex);
2960
2961         flags = btrfs_reduce_alloc_profile(extent_root, flags);
2962
2963         space_info = __find_space_info(extent_root->fs_info, flags);
2964         if (!space_info) {
2965                 ret = update_space_info(extent_root->fs_info, flags,
2966                                         0, 0, &space_info);
2967                 BUG_ON(ret);
2968         }
2969         BUG_ON(!space_info);
2970
2971         spin_lock(&space_info->lock);
2972         if (space_info->force_alloc) {
2973                 force = 1;
2974                 space_info->force_alloc = 0;
2975         }
2976         if (space_info->full) {
2977                 spin_unlock(&space_info->lock);
2978                 goto out;
2979         }
2980
2981         thresh = space_info->total_bytes - space_info->bytes_readonly;
2982         thresh = div_factor(thresh, 6);
2983         if (!force &&
2984            (space_info->bytes_used + space_info->bytes_pinned +
2985             space_info->bytes_reserved + alloc_bytes) < thresh) {
2986                 spin_unlock(&space_info->lock);
2987                 goto out;
2988         }
2989         spin_unlock(&space_info->lock);
2990
2991         /*
2992          * if we're doing a data chunk, go ahead and make sure that
2993          * we keep a reasonable number of metadata chunks allocated in the
2994          * FS as well.
2995          */
2996         if (flags & BTRFS_BLOCK_GROUP_DATA) {
2997                 fs_info->data_chunk_allocations++;
2998                 if (!(fs_info->data_chunk_allocations %
2999                       fs_info->metadata_ratio))
3000                         force_metadata_allocation(fs_info);
3001         }
3002
3003         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3004         if (ret)
3005                 space_info->full = 1;
3006 out:
3007         mutex_unlock(&extent_root->fs_info->chunk_mutex);
3008         return ret;
3009 }
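
/*
 * Illustrative sketch: the metadata_ratio trigger above, modeled in user
 * space.  Every Nth data chunk allocation forces a metadata chunk; the
 * ratio of 8 is just a demo input.
 */
#include <stdio.h>

int main(void)
{
        unsigned int data_chunk_allocations = 0;
        unsigned int metadata_ratio = 8;
        int i;

        for (i = 0; i < 20; i++) {
                data_chunk_allocations++;
                if (!(data_chunk_allocations % metadata_ratio))
                        printf("alloc %u: force metadata chunk\n",
                               data_chunk_allocations);
        }
        return 0;
}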
3010
3011 static int update_block_group(struct btrfs_trans_handle *trans,
3012                               struct btrfs_root *root,
3013                               u64 bytenr, u64 num_bytes, int alloc,
3014                               int mark_free)
3015 {
3016         struct btrfs_block_group_cache *cache;
3017         struct btrfs_fs_info *info = root->fs_info;
3018         u64 total = num_bytes;
3019         u64 old_val;
3020         u64 byte_in_group;
3021
3022         /* block accounting for super block */
3023         spin_lock(&info->delalloc_lock);
3024         old_val = btrfs_super_bytes_used(&info->super_copy);
3025         if (alloc)
3026                 old_val += num_bytes;
3027         else
3028                 old_val -= num_bytes;
3029         btrfs_set_super_bytes_used(&info->super_copy, old_val);
3030
3031         /* block accounting for root item */
3032         old_val = btrfs_root_used(&root->root_item);
3033         if (alloc)
3034                 old_val += num_bytes;
3035         else
3036                 old_val -= num_bytes;
3037         btrfs_set_root_used(&root->root_item, old_val);
3038         spin_unlock(&info->delalloc_lock);
3039
3040         while (total) {
3041                 cache = btrfs_lookup_block_group(info, bytenr);
3042                 if (!cache)
3043                         return -1;
3044                 byte_in_group = bytenr - cache->key.objectid;
3045                 WARN_ON(byte_in_group > cache->key.offset);
3046
3047                 spin_lock(&cache->space_info->lock);
3048                 spin_lock(&cache->lock);
3049                 cache->dirty = 1;
3050                 old_val = btrfs_block_group_used(&cache->item);
3051                 num_bytes = min(total, cache->key.offset - byte_in_group);
3052                 if (alloc) {
3053                         old_val += num_bytes;
3054                         cache->space_info->bytes_used += num_bytes;
3055                         if (cache->ro)
3056                                 cache->space_info->bytes_readonly -= num_bytes;
3057                         btrfs_set_block_group_used(&cache->item, old_val);
3058                         spin_unlock(&cache->lock);
3059                         spin_unlock(&cache->space_info->lock);
3060                 } else {
3061                         old_val -= num_bytes;
3062                         cache->space_info->bytes_used -= num_bytes;
3063                         if (cache->ro)
3064                                 cache->space_info->bytes_readonly += num_bytes;
3065                         btrfs_set_block_group_used(&cache->item, old_val);
3066                         spin_unlock(&cache->lock);
3067                         spin_unlock(&cache->space_info->lock);
3068                         if (mark_free) {
3069                                 int ret;
3070
3071                                 ret = btrfs_discard_extent(root, bytenr,
3072                                                            num_bytes);
3073                                 WARN_ON(ret);
3074
3075                                 ret = btrfs_add_free_space(cache, bytenr,
3076                                                            num_bytes);
3077                                 WARN_ON(ret);
3078                         }
3079                 }
3080                 btrfs_put_block_group(cache);
3081                 total -= num_bytes;
3082                 bytenr += num_bytes;
3083         }
3084         return 0;
3085 }
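
/*
 * Illustrative sketch: the while (total) walk above clamps each step to
 * the end of the current block group via min(total, key.offset -
 * byte_in_group).  Fixed-size demo groups stand in for the rb-tree lookup.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_GROUP_SIZE (256ULL * 1024 * 1024)  /* assumed group length */

static uint64_t demo_min(uint64_t a, uint64_t b)
{
        return a < b ? a : b;
}

int main(void)
{
        uint64_t bytenr = DEMO_GROUP_SIZE - 4096;  /* straddles a boundary */
        uint64_t total = 12288;

        while (total) {
                uint64_t group_start = bytenr - (bytenr % DEMO_GROUP_SIZE);
                uint64_t byte_in_group = bytenr - group_start;
                uint64_t num_bytes = demo_min(total, DEMO_GROUP_SIZE -
                                              byte_in_group);

                printf("group %llu: update %llu bytes\n",
                       (unsigned long long)group_start,
                       (unsigned long long)num_bytes);
                total -= num_bytes;
                bytenr += num_bytes;
        }
        return 0;
}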
3086
3087 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3088 {
3089         struct btrfs_block_group_cache *cache;
3090         u64 bytenr;
3091
3092         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
3093         if (!cache)
3094                 return 0;
3095
3096         bytenr = cache->key.objectid;
3097         btrfs_put_block_group(cache);
3098
3099         return bytenr;
3100 }
3101
3102 int btrfs_update_pinned_extents(struct btrfs_root *root,
3103                                 u64 bytenr, u64 num, int pin, int mark_free)
3104 {
3105         u64 len;
3106         struct btrfs_block_group_cache *cache;
3107         struct btrfs_fs_info *fs_info = root->fs_info;
3108
3109         if (pin) {
3110                 set_extent_dirty(&fs_info->pinned_extents,
3111                                 bytenr, bytenr + num - 1, GFP_NOFS);
3112         } else {
3113                 clear_extent_dirty(&fs_info->pinned_extents,
3114                                 bytenr, bytenr + num - 1, GFP_NOFS);
3115         }
3116
3117         while (num > 0) {
3118                 cache = btrfs_lookup_block_group(fs_info, bytenr);
3119                 BUG_ON(!cache);
3120                 len = min(num, cache->key.offset -
3121                           (bytenr - cache->key.objectid));
3122                 if (pin) {
3123                         spin_lock(&cache->space_info->lock);
3124                         spin_lock(&cache->lock);
3125                         cache->pinned += len;
3126                         cache->space_info->bytes_pinned += len;
3127                         spin_unlock(&cache->lock);
3128                         spin_unlock(&cache->space_info->lock);
3129                         fs_info->total_pinned += len;
3130                 } else {
3131                         spin_lock(&cache->space_info->lock);
3132                         spin_lock(&cache->lock);
3133                         cache->pinned -= len;
3134                         cache->space_info->bytes_pinned -= len;
3135                         spin_unlock(&cache->lock);
3136                         spin_unlock(&cache->space_info->lock);
3137                         fs_info->total_pinned -= len;
3138                         if (block_group_cache_done(cache) && mark_free)
3139                                 btrfs_add_free_space(cache, bytenr, len);
3140                 }
3141                 btrfs_put_block_group(cache);
3142                 bytenr += len;
3143                 num -= len;
3144         }
3145         return 0;
3146 }
3147
3148 static int update_reserved_extents(struct btrfs_root *root,
3149                                    u64 bytenr, u64 num, int reserve)
3150 {
3151         u64 len;
3152         struct btrfs_block_group_cache *cache;
3153         struct btrfs_fs_info *fs_info = root->fs_info;
3154
3155         while (num > 0) {
3156                 cache = btrfs_lookup_block_group(fs_info, bytenr);
3157                 BUG_ON(!cache);
3158                 len = min(num, cache->key.offset -
3159                           (bytenr - cache->key.objectid));
3160
3161                 spin_lock(&cache->space_info->lock);
3162                 spin_lock(&cache->lock);
3163                 if (reserve) {
3164                         cache->reserved += len;
3165                         cache->space_info->bytes_reserved += len;
3166                 } else {
3167                         cache->reserved -= len;
3168                         cache->space_info->bytes_reserved -= len;
3169                 }
3170                 spin_unlock(&cache->lock);
3171                 spin_unlock(&cache->space_info->lock);
3172                 btrfs_put_block_group(cache);
3173                 bytenr += len;
3174                 num -= len;
3175         }
3176         return 0;
3177 }
3178
3179 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
3180 {
3181         u64 last = 0;
3182         u64 start;
3183         u64 end;
3184         bool caching_kthreads = false;
3185         struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
3186         int ret;
3187
3188         if (atomic_read(&root->fs_info->async_caching_threads))
3189                 caching_kthreads = true;
3190
3191         while (1) {
3192                 ret = find_first_extent_bit(pinned_extents, last,
3193                                             &start, &end, EXTENT_DIRTY);
3194                 if (ret)
3195                         break;
3196
3197                 /*
3198                  * we need to make sure that the pinned extents don't go away
3199                  * while we are caching block groups
3200                  */
3201                 if (unlikely(caching_kthreads))
3202                         set_extent_delalloc(pinned_extents, start, end,
3203                                             GFP_NOFS);
3204
3205                 set_extent_dirty(copy, start, end, GFP_NOFS);
3206                 last = end + 1;
3207         }
3208         return 0;
3209 }
3210
3211 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3212                                struct btrfs_root *root,
3213                                struct extent_io_tree *unpin)
3214 {
3215         u64 start;
3216         u64 end;
3217         int ret;
3218         int mark_free = 1;
3219
3220         ret = find_first_extent_bit(&root->fs_info->pinned_extents, 0,
3221                                     &start, &end, EXTENT_DELALLOC);
3222         if (!ret)
3223                 mark_free = 0;
3224
3225         while (1) {
3226                 ret = find_first_extent_bit(unpin, 0, &start, &end,
3227                                             EXTENT_DIRTY);
3228                 if (ret)
3229                         break;
3230
3231                 ret = btrfs_discard_extent(root, start, end + 1 - start);
3232
3233                 /* unlocks the pinned mutex */
3234                 btrfs_update_pinned_extents(root, start, end + 1 - start, 0,
3235                                             mark_free);
3236                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3237
3238                 cond_resched();
3239         }
3240
3241         if (unlikely(!mark_free))
3242                 btrfs_discard_pinned_extents(root->fs_info, NULL);
3243
3244         return ret;
3245 }
3246
3247 static int pin_down_bytes(struct btrfs_trans_handle *trans,
3248                           struct btrfs_root *root,
3249                           struct btrfs_path *path,
3250                           u64 bytenr, u64 num_bytes, int is_data,
3251                           struct extent_buffer **must_clean)
3252 {
3253         int err = 0;
3254         struct extent_buffer *buf;
3255
3256         if (is_data)
3257                 goto pinit;
3258
3259         buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3260         if (!buf)
3261                 goto pinit;
3262
3263         /* we can reuse a block if it hasn't been written
3264          * and it is from this transaction.  We can't
3265          * reuse anything from the tree log root because
3266          * it has tiny sub-transactions.
3267          */
3268         if (btrfs_buffer_uptodate(buf, 0) &&
3269             btrfs_try_tree_lock(buf)) {
3270                 u64 header_owner = btrfs_header_owner(buf);
3271                 u64 header_transid = btrfs_header_generation(buf);
3272                 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3273                     header_transid == trans->transid &&
3274                     !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3275                         *must_clean = buf;
3276                         return 1;
3277                 }
3278                 btrfs_tree_unlock(buf);
3279         }
3280         free_extent_buffer(buf);
3281 pinit:
3282         btrfs_set_path_blocking(path);
3283         /* unlocks the pinned mutex */
3284         btrfs_update_pinned_extents(root, bytenr, num_bytes, 1, 0);
3285
3286         BUG_ON(err < 0);
3287         return 0;
3288 }
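
/*
 * Illustrative sketch: the reuse test above as a stand-alone predicate.
 * Field names and DEMO_TREE_LOG_OBJECTID are demo stand-ins, not the
 * kernel definitions.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_TREE_LOG_OBJECTID 0ULL     /* stand-in for the log tree id */

struct demo_buf {
        uint64_t owner;
        uint64_t generation;
        int written;
};

static int demo_can_reuse(const struct demo_buf *buf, uint64_t transid)
{
        return buf->owner != DEMO_TREE_LOG_OBJECTID &&
               buf->generation == transid &&
               !buf->written;
}

int main(void)
{
        struct demo_buf buf = { .owner = 5, .generation = 100, .written = 0 };

        printf("%d\n", demo_can_reuse(&buf, 100));      /* 1: this transaction */
        printf("%d\n", demo_can_reuse(&buf, 101));      /* 0: older block */
        return 0;
}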
3289
3290
3291 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3292                                 struct btrfs_root *root,
3293                                 u64 bytenr, u64 num_bytes, u64 parent,
3294                                 u64 root_objectid, u64 owner_objectid,
3295                                 u64 owner_offset, int refs_to_drop,
3296                                 struct btrfs_delayed_extent_op *extent_op)
3297 {
3298         struct btrfs_key key;
3299         struct btrfs_path *path;
3300         struct btrfs_fs_info *info = root->fs_info;
3301         struct btrfs_root *extent_root = info->extent_root;
3302         struct extent_buffer *leaf;
3303         struct btrfs_extent_item *ei;
3304         struct btrfs_extent_inline_ref *iref;
3305         int ret;
3306         int is_data;
3307         int extent_slot = 0;
3308         int found_extent = 0;
3309         int num_to_del = 1;
3310         u32 item_size;
3311         u64 refs;
3312
3313         path = btrfs_alloc_path();
3314         if (!path)
3315                 return -ENOMEM;
3316
3317         path->reada = 1;
3318         path->leave_spinning = 1;
3319
3320         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3321         BUG_ON(!is_data && refs_to_drop != 1);
3322
3323         ret = lookup_extent_backref(trans, extent_root, path, &iref,
3324                                     bytenr, num_bytes, parent,
3325                                     root_objectid, owner_objectid,
3326                                     owner_offset);
3327         if (ret == 0) {
3328                 extent_slot = path->slots[0];
3329                 while (extent_slot >= 0) {
3330                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3331                                               extent_slot);
3332                         if (key.objectid != bytenr)
3333                                 break;
3334                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3335                             key.offset == num_bytes) {
3336                                 found_extent = 1;
3337                                 break;
3338                         }
3339                         if (path->slots[0] - extent_slot > 5)
3340                                 break;
3341                         extent_slot--;
3342                 }
3343 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3344                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
3345                 if (found_extent && item_size < sizeof(*ei))
3346                         found_extent = 0;
3347 #endif
3348                 if (!found_extent) {
3349                         BUG_ON(iref);
3350                         ret = remove_extent_backref(trans, extent_root, path,
3351                                                     NULL, refs_to_drop,
3352                                                     is_data);
3353                         BUG_ON(ret);
3354                         btrfs_release_path(extent_root, path);
3355                         path->leave_spinning = 1;
3356
3357                         key.objectid = bytenr;
3358                         key.type = BTRFS_EXTENT_ITEM_KEY;
3359                         key.offset = num_bytes;
3360
3361                         ret = btrfs_search_slot(trans, extent_root,
3362                                                 &key, path, -1, 1);
3363                         if (ret) {
3364                                 printk(KERN_ERR "umm, got %d back from search"
3365                                        ", was looking for %llu\n", ret,
3366                                        (unsigned long long)bytenr);
3367                                 btrfs_print_leaf(extent_root, path->nodes[0]);
3368                         }
3369                         BUG_ON(ret);
3370                         extent_slot = path->slots[0];
3371                 }
3372         } else {
3373                 btrfs_print_leaf(extent_root, path->nodes[0]);
3374                 WARN_ON(1);
3375                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
3376                        "parent %llu root %llu  owner %llu offset %llu\n",
3377                        (unsigned long long)bytenr,
3378                        (unsigned long long)parent,
3379                        (unsigned long long)root_objectid,
3380                        (unsigned long long)owner_objectid,
3381                        (unsigned long long)owner_offset);
3382         }
3383
3384         leaf = path->nodes[0];
3385         item_size = btrfs_item_size_nr(leaf, extent_slot);
3386 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3387         if (item_size < sizeof(*ei)) {
3388                 BUG_ON(found_extent || extent_slot != path->slots[0]);
3389                 ret = convert_extent_item_v0(trans, extent_root, path,
3390                                              owner_objectid, 0);
3391                 BUG_ON(ret < 0);
3392
3393                 btrfs_release_path(extent_root, path);
3394                 path->leave_spinning = 1;
3395
3396                 key.objectid = bytenr;
3397                 key.type = BTRFS_EXTENT_ITEM_KEY;
3398                 key.offset = num_bytes;
3399
3400                 ret = btrfs_search_slot(trans, extent_root, &key, path,
3401                                         -1, 1);
3402                 if (ret) {
3403                         printk(KERN_ERR "umm, got %d back from search"
3404                                ", was looking for %llu\n", ret,
3405                                (unsigned long long)bytenr);
3406                         btrfs_print_leaf(extent_root, path->nodes[0]);
3407                 }
3408                 BUG_ON(ret);
3409                 extent_slot = path->slots[0];
3410                 leaf = path->nodes[0];
3411                 item_size = btrfs_item_size_nr(leaf, extent_slot);
3412         }
3413 #endif
3414         BUG_ON(item_size < sizeof(*ei));
3415         ei = btrfs_item_ptr(leaf, extent_slot,
3416                             struct btrfs_extent_item);
3417         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3418                 struct btrfs_tree_block_info *bi;
3419                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3420                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3421                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3422         }
3423
3424         refs = btrfs_extent_refs(leaf, ei);
3425         BUG_ON(refs < refs_to_drop);
3426         refs -= refs_to_drop;
3427
3428         if (refs > 0) {
3429                 if (extent_op)
3430                         __run_delayed_extent_op(extent_op, leaf, ei);
3431                 /*
3432                  * In the case of inline back ref, reference count will
3433                  * be updated by remove_extent_backref
3434                  */
3435                 if (iref) {
3436                         BUG_ON(!found_extent);
3437                 } else {
3438                         btrfs_set_extent_refs(leaf, ei, refs);
3439                         btrfs_mark_buffer_dirty(leaf);
3440                 }
3441                 if (found_extent) {
3442                         ret = remove_extent_backref(trans, extent_root, path,
3443                                                     iref, refs_to_drop,
3444                                                     is_data);
3445                         BUG_ON(ret);
3446                 }
3447         } else {
3448                 int mark_free = 0;
3449                 struct extent_buffer *must_clean = NULL;
3450
3451                 if (found_extent) {
3452                         BUG_ON(is_data && refs_to_drop !=
3453                                extent_data_ref_count(root, path, iref));
3454                         if (iref) {
3455                                 BUG_ON(path->slots[0] != extent_slot);
3456                         } else {
3457                                 BUG_ON(path->slots[0] != extent_slot + 1);
3458                                 path->slots[0] = extent_slot;
3459                                 num_to_del = 2;
3460                         }
3461                 }
3462
3463                 ret = pin_down_bytes(trans, root, path, bytenr,
3464                                      num_bytes, is_data, &must_clean);
3465                 if (ret > 0)
3466                         mark_free = 1;
3467                 BUG_ON(ret < 0);
3468                 /*
3469                  * it is going to be very rare for someone to be waiting
3470                  * on the block we're freeing.  del_items might need to
3471                  * schedule, so rather than get fancy, just force it
3472                  * to blocking here
3473                  */
3474                 if (must_clean)
3475                         btrfs_set_lock_blocking(must_clean);
3476
3477                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3478                                       num_to_del);
3479                 BUG_ON(ret);
3480                 btrfs_release_path(extent_root, path);
3481
3482                 if (must_clean) {
3483                         clean_tree_block(NULL, root, must_clean);
3484                         btrfs_tree_unlock(must_clean);
3485                         free_extent_buffer(must_clean);
3486                 }
3487
3488                 if (is_data) {
3489                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3490                         BUG_ON(ret);
3491                 } else {
3492                         invalidate_mapping_pages(info->btree_inode->i_mapping,
3493                              bytenr >> PAGE_CACHE_SHIFT,
3494                              (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3495                 }
3496
3497                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
3498                                          mark_free);
3499                 BUG_ON(ret);
3500         }
3501         btrfs_free_path(path);
3502         return ret;
3503 }
3504
3505 /*
3506  * when we free an extent, it is possible (and likely) that we free the last
3507  * delayed ref for that extent as well.  This searches the delayed ref tree for
3508  * a given extent, and if there are no other delayed refs to be processed, it
3509  * removes it from the tree.
3510  */
3511 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3512                                       struct btrfs_root *root, u64 bytenr)
3513 {
3514         struct btrfs_delayed_ref_head *head;
3515         struct btrfs_delayed_ref_root *delayed_refs;
3516         struct btrfs_delayed_ref_node *ref;
3517         struct rb_node *node;
3518         int ret;
3519
3520         delayed_refs = &trans->transaction->delayed_refs;
3521         spin_lock(&delayed_refs->lock);
3522         head = btrfs_find_delayed_ref_head(trans, bytenr);
3523         if (!head)
3524                 goto out;
3525
3526         node = rb_prev(&head->node.rb_node);
3527         if (!node)
3528                 goto out;
3529
3530         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3531
3532         /* there are still entries for this ref, we can't drop it */
3533         if (ref->bytenr == bytenr)
3534                 goto out;
3535
3536         if (head->extent_op) {
3537                 if (!head->must_insert_reserved)
3538                         goto out;
3539                 kfree(head->extent_op);
3540                 head->extent_op = NULL;
3541         }
3542
3543         /*
3544          * waiting for the lock here would deadlock.  If someone else has it
3545          * locked they are already in the process of dropping it anyway
3546          */
3547         if (!mutex_trylock(&head->mutex))
3548                 goto out;
3549
3550         /*
3551          * at this point we have a head with no other entries.  Go
3552          * ahead and process it.
3553          */
3554         head->node.in_tree = 0;
3555         rb_erase(&head->node.rb_node, &delayed_refs->root);
3556
3557         delayed_refs->num_entries--;
3558
3559         /*
3560          * we don't take a ref on the node because we're removing it from the
3561          * tree, so we just steal the ref the tree was holding.
3562          */
3563         delayed_refs->num_heads--;
3564         if (list_empty(&head->cluster))
3565                 delayed_refs->num_heads_ready--;
3566
3567         list_del_init(&head->cluster);
3568         spin_unlock(&delayed_refs->lock);
3569
3570         ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
3571                                   &head->node, head->extent_op,
3572                                   head->must_insert_reserved);
3573         BUG_ON(ret);
3574         btrfs_put_delayed_ref(&head->node);
3575         return 0;
3576 out:
3577         spin_unlock(&delayed_refs->lock);
3578         return 0;
3579 }
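
/*
 * Illustrative sketch: the "is this head the only remaining entry?" test
 * above, modeled on a sorted array instead of the delayed-ref rb-tree.
 * Refs sort by bytenr with the head last for its bytenr, so a predecessor
 * with the same bytenr means other refs are still queued.
 */
#include <stdio.h>
#include <stdint.h>

static int demo_head_is_last_ref(const uint64_t *sorted, int head_idx)
{
        return head_idx == 0 || sorted[head_idx - 1] != sorted[head_idx];
}

int main(void)
{
        uint64_t many[] = { 4096, 8192, 8192 };  /* head for 8192 is last */
        uint64_t lone[] = { 4096, 8192 };

        printf("%d\n", demo_head_is_last_ref(many, 2)); /* 0: a ref remains */
        printf("%d\n", demo_head_is_last_ref(lone, 1)); /* 1: head is alone */
        return 0;
}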
3580
3581 int btrfs_free_extent(struct btrfs_trans_handle *trans,
3582                       struct btrfs_root *root,
3583                       u64 bytenr, u64 num_bytes, u64 parent,
3584                       u64 root_objectid, u64 owner, u64 offset)
3585 {
3586         int ret;
3587
3588         /*
3589          * tree log blocks never actually go into the extent allocation
3590          * tree, just update pinning info and exit early.
3591          */
3592         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
3593                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3594                 /* unlocks the pinned mutex */
3595                 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1, 0);
3596                 update_reserved_extents(root, bytenr, num_bytes, 0);
3597                 ret = 0;
3598         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
3599                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
3600                                         parent, root_objectid, (int)owner,
3601                                         BTRFS_DROP_DELAYED_REF, NULL);
3602                 BUG_ON(ret);
3603                 ret = check_ref_cleanup(trans, root, bytenr);
3604                 BUG_ON(ret);
3605         } else {
3606                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
3607                                         parent, root_objectid, owner,
3608                                         offset, BTRFS_DROP_DELAYED_REF, NULL);
3609                 BUG_ON(ret);
3610         }
3611         return ret;
3612 }
3613
3614 static u64 stripe_align(struct btrfs_root *root, u64 val)
3615 {
3616         u64 mask = ((u64)root->stripesize - 1);
3617         u64 ret = (val + mask) & ~mask;
3618         return ret;
3619 }
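
/*
 * Illustrative sketch: (val + mask) & ~mask rounds up only because
 * stripesize is a power of two, making the mask all low bits.  A quick
 * user-space check:
 */
#include <assert.h>
#include <stdint.h>

static uint64_t demo_align(uint64_t val, uint64_t stripesize)
{
        uint64_t mask = stripesize - 1;

        return (val + mask) & ~mask;
}

int main(void)
{
        assert(demo_align(0, 4096) == 0);
        assert(demo_align(1, 4096) == 4096);
        assert(demo_align(4096, 4096) == 4096);
        assert(demo_align(4097, 4096) == 8192);
        return 0;
}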
3620
3621 /*
3622  * when we wait for progress in the block group caching, it's because
3623  * our allocation attempt failed at least once.  So, we must sleep
3624  * and let some progress happen before we try again.
3625  *
3626  * This function will sleep at least once waiting for new free space to
3627  * show up, and then it will check the block group free space numbers
3628  * for our min num_bytes.  Another option is to have it go ahead
3629  * and look in the rbtree for a free extent of a given size, but this
3630  * is a good start.
3631  */
3632 static noinline int
3633 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3634                                 u64 num_bytes)
3635 {
3636         DEFINE_WAIT(wait);
3637
3638         prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3639
3640         if (block_group_cache_done(cache)) {
3641                 finish_wait(&cache->caching_q, &wait);
3642                 return 0;
3643         }
3644         schedule();
3645         finish_wait(&cache->caching_q, &wait);
3646
3647         wait_event(cache->caching_q, block_group_cache_done(cache) ||
3648                    (cache->free_space >= num_bytes));
3649         return 0;
3650 }
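
/*
 * A minimal sketch of the open-coded sleep above, using the same wait
 * queue calls; illustrative only, not wired into this file.  Putting
 * ourselves on the queue with prepare_to_wait() before re-checking the
 * condition means a wakeup between the check and schedule() is not lost.
 */
static void demo_sleep_once(wait_queue_head_t *wq, int *done)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
        if (!*done)                     /* re-check after queueing */
                schedule();             /* sleep until woken */
        finish_wait(wq, &wait);
}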
3651
3652 enum btrfs_loop_type {
3653         LOOP_CACHED_ONLY = 0,
3654         LOOP_CACHING_NOWAIT = 1,
3655         LOOP_CACHING_WAIT = 2,
3656         LOOP_ALLOC_CHUNK = 3,
3657         LOOP_NO_EMPTY_SIZE = 4,
3658 };
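
/*
 * Illustrative sketch (demo names, simplified): the retry gate at the
 * bottom of find_free_extent, reduced to a predicate.  A failed pass
 * escalates one loop level while this returns true; the real loop also
 * resets empty_size/empty_cluster once it reaches the chunk-alloc stage.
 */
#include <stdio.h>

enum demo_loop { DEMO_CACHED_ONLY, DEMO_CACHING_NOWAIT, DEMO_CACHING_WAIT,
                 DEMO_ALLOC_CHUNK, DEMO_NO_EMPTY_SIZE };

static int demo_should_retry(int loop, int found_uncached_bg,
                             int empty_size, int empty_cluster,
                             int allowed_chunk_alloc)
{
        if (loop >= DEMO_NO_EMPTY_SIZE)
                return 0;
        return found_uncached_bg || empty_size || empty_cluster ||
               allowed_chunk_alloc;
}

int main(void)
{
        int loop = DEMO_CACHED_ONLY;

        /* uncached groups exist, so every level up to the last retries */
        while (demo_should_retry(loop, 1, 0, 0, 0))
                printf("pass %d failed, escalating\n", loop++);
        printf("pass %d failed, -ENOSPC\n", loop);
        return 0;
}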
3659
3660 /*
3661  * walks the btree of allocated extents and finds a hole of a given size.
3662  * The key ins is changed to record the hole:
3663  * ins->objectid == block start
3664  * ins->flags = BTRFS_EXTENT_ITEM_KEY
3665  * ins->offset == number of blocks
3666  * Any available blocks before search_start are skipped.
3667  */
3668 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3669                                      struct btrfs_root *orig_root,
3670                                      u64 num_bytes, u64 empty_size,
3671                                      u64 search_start, u64 search_end,
3672                                      u64 hint_byte, struct btrfs_key *ins,
3673                                      u64 exclude_start, u64 exclude_nr,
3674                                      int data)
3675 {
3676         int ret = 0;
3677         struct btrfs_root *root = orig_root->fs_info->extent_root;
3678         struct btrfs_free_cluster *last_ptr = NULL;
3679         struct btrfs_block_group_cache *block_group = NULL;
3680         int empty_cluster = 2 * 1024 * 1024;
3681         int allowed_chunk_alloc = 0;
3682         struct btrfs_space_info *space_info;
3683         int last_ptr_loop = 0;
3684         int loop = 0;
3685         bool found_uncached_bg = false;
3686
3687         WARN_ON(num_bytes < root->sectorsize);
3688         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3689         ins->objectid = 0;
3690         ins->offset = 0;
3691
3692         space_info = __find_space_info(root->fs_info, data);
3693
3694         if (orig_root->ref_cows || empty_size)
3695                 allowed_chunk_alloc = 1;
3696
3697         if (data & BTRFS_BLOCK_GROUP_METADATA) {
3698                 last_ptr = &root->fs_info->meta_alloc_cluster;
3699                 if (!btrfs_test_opt(root, SSD))
3700                         empty_cluster = 64 * 1024;
3701         }
3702
3703         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
3704                 last_ptr = &root->fs_info->data_alloc_cluster;
3705         }
3706
3707         if (last_ptr) {
3708                 spin_lock(&last_ptr->lock);
3709                 if (last_ptr->block_group)
3710                         hint_byte = last_ptr->window_start;
3711                 spin_unlock(&last_ptr->lock);
3712         }
3713
3714         search_start = max(search_start, first_logical_byte(root, 0));
3715         search_start = max(search_start, hint_byte);
3716
3717         if (!last_ptr)
3718                 empty_cluster = 0;
3719
3720         if (search_start == hint_byte) {
3721                 block_group = btrfs_lookup_block_group(root->fs_info,
3722                                                        search_start);
3723                 /*
3724                  * we don't want to use the block group if it doesn't match our
3725                  * allocation bits, or if it's not cached.
3726                  */
3727                 if (block_group && block_group_bits(block_group, data) &&
3728                     block_group_cache_done(block_group)) {
3729                         down_read(&space_info->groups_sem);
3730                         if (list_empty(&block_group->list) ||
3731                             block_group->ro) {
3732                                 /*
3733                                  * someone is removing this block group,
3734                                  * we can't jump into the have_block_group
3735                                  * target because our list pointers are not
3736                                  * valid
3737                                  */
3738                                 btrfs_put_block_group(block_group);
3739                                 up_read(&space_info->groups_sem);
3740                         } else
3741                                 goto have_block_group;
3742                 } else if (block_group) {
3743                         btrfs_put_block_group(block_group);
3744                 }
3745         }
3746
3747 search:
3748         down_read(&space_info->groups_sem);
3749         list_for_each_entry(block_group, &space_info->block_groups, list) {
3750                 u64 offset;
3751                 int cached;
3752
3753                 atomic_inc(&block_group->count);
3754                 search_start = block_group->key.objectid;
3755
3756 have_block_group:
3757                 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
3758                         /*
3759                          * we want to start caching kthreads, but not too many
3760                          * right off the bat so we don't overwhelm the system,
3761                          * so only start them if there are less than 2 and we're
3762                          * in the initial allocation phase.
3763                          */
3764                         if (loop > LOOP_CACHING_NOWAIT ||
3765                             atomic_read(&space_info->caching_threads) < 2) {
3766                                 ret = cache_block_group(block_group);
3767                                 BUG_ON(ret);
3768                         }
3769                 }
3770
3771                 cached = block_group_cache_done(block_group);
3772                 if (unlikely(!cached)) {
3773                         found_uncached_bg = true;
3774
3775                         /* if we only want cached bgs, loop */
3776                         if (loop == LOOP_CACHED_ONLY)
3777                                 goto loop;
3778                 }
3779
3780                 if (unlikely(block_group->ro))
3781                         goto loop;
3782
3783                 if (last_ptr) {
3784                         /*
3785                          * the refill lock keeps out other
3786                          * people trying to start a new cluster
3787                          */
3788                         spin_lock(&last_ptr->refill_lock);
3789                         if (last_ptr->block_group &&
3790                             (last_ptr->block_group->ro ||
3791                             !block_group_bits(last_ptr->block_group, data))) {
3792                                 offset = 0;
3793                                 goto refill_cluster;
3794                         }
3795
3796                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
3797                                                  num_bytes, search_start);
3798                         if (offset) {
3799                                 /* we have a block, we're done */
3800                                 spin_unlock(&last_ptr->refill_lock);
3801                                 goto checks;
3802                         }
3803
3804                         spin_lock(&last_ptr->lock);
3805                         /*
3806                          * whoops, this cluster doesn't actually point to
3807                          * this block group.  Get a ref on the block
3808                          * group is does point to and try again
3809                          */
3810                         if (!last_ptr_loop && last_ptr->block_group &&
3811                             last_ptr->block_group != block_group) {
3812
3813                                 btrfs_put_block_group(block_group);
3814                                 block_group = last_ptr->block_group;
3815                                 atomic_inc(&block_group->count);
3816                                 spin_unlock(&last_ptr->lock);
3817                                 spin_unlock(&last_ptr->refill_lock);
3818
3819                                 last_ptr_loop = 1;
3820                                 search_start = block_group->key.objectid;
3821                                 /*
3822                                  * we know this block group is properly
3823                                  * in the list because
3824                                  * btrfs_remove_block_group drops the
3825                                  * cluster before it removes the block
3826                                  * group from the list
3827                                  */
3828                                 goto have_block_group;
3829                         }
3830                         spin_unlock(&last_ptr->lock);
3831 refill_cluster:
3832                         /*
3833                          * this cluster didn't work out, free it and
3834                          * start over
3835                          */
3836                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3837
3838                         last_ptr_loop = 0;
3839
3840                         /* allocate a cluster in this block group */
3841                         ret = btrfs_find_space_cluster(trans, root,
3842                                                block_group, last_ptr,
3843                                                offset, num_bytes,
3844                                                empty_cluster + empty_size);
3845                         if (ret == 0) {
3846                                 /*
3847                                  * now pull our allocation out of this
3848                                  * cluster
3849                                  */
3850                                 offset = btrfs_alloc_from_cluster(block_group,
3851                                                   last_ptr, num_bytes,
3852                                                   search_start);
3853                                 if (offset) {
3854                                         /* we found one, proceed */
3855                                         spin_unlock(&last_ptr->refill_lock);
3856                                         goto checks;
3857                                 }
3858                         } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3859                                 spin_unlock(&last_ptr->refill_lock);
3860
3861                                 wait_block_group_cache_progress(block_group,
3862                                        num_bytes + empty_cluster + empty_size);
3863                                 goto have_block_group;
3864                         }
3865
3866                         /*
3867                          * at this point we either didn't find a cluster
3868                          * or we weren't able to allocate a block from our
3869                          * cluster.  Free the cluster we've been trying
3870                          * to use, and go to the next block group
3871                          */
3872                         if (loop < LOOP_NO_EMPTY_SIZE) {
3873                                 btrfs_return_cluster_to_free_space(NULL,
3874                                                                    last_ptr);
3875                                 spin_unlock(&last_ptr->refill_lock);
3876                                 goto loop;
3877                         }
3878                         spin_unlock(&last_ptr->refill_lock);
3879                 }
3880
3881                 offset = btrfs_find_space_for_alloc(block_group, search_start,
3882                                                     num_bytes, empty_size);
3883                 if (!offset &&
3884                     (cached || loop == LOOP_CACHING_NOWAIT)) {
3885                         goto loop;
3886                 } else if (!offset && !cached &&
3887                            loop > LOOP_CACHING_NOWAIT) {
3888                         wait_block_group_cache_progress(block_group,
3889                                         num_bytes + empty_size);
3890                         goto have_block_group;
3891                 }
3892 checks:
3893                 search_start = stripe_align(root, offset);
3894                 /* move on to the next group */
3895                 if (search_start + num_bytes >= search_end) {
3896                         btrfs_add_free_space(block_group, offset, num_bytes);
3897                         goto loop;
3898                 }
3899
3900                 /* move on to the next group */
3901                 if (search_start + num_bytes >
3902                     block_group->key.objectid + block_group->key.offset) {
3903                         btrfs_add_free_space(block_group, offset, num_bytes);
3904                         goto loop;
3905                 }
3906
3907                 if (exclude_nr > 0 &&
3908                     (search_start + num_bytes > exclude_start &&
3909                      search_start < exclude_start + exclude_nr)) {
3910                         search_start = exclude_start + exclude_nr;
3911
3912                         btrfs_add_free_space(block_group, offset, num_bytes);
3913                         /*
3914                          * if search_start is still in this block group
3915                          * then we just re-search this block group
3916                          */
3917                         if (search_start >= block_group->key.objectid &&
3918                             search_start < (block_group->key.objectid +
3919                                             block_group->key.offset))
3920                                 goto have_block_group;
3921                         goto loop;
3922                 }
3923
3924                 ins->objectid = search_start;
3925                 ins->offset = num_bytes;
3926
3927                 if (offset < search_start)
3928                         btrfs_add_free_space(block_group, offset,
3929                                              search_start - offset);
3930                 BUG_ON(offset > search_start);
3931
3932                 /* we are all good, let's return */
3933                 break;
3934 loop:
3935                 btrfs_put_block_group(block_group);
3936         }
3937         up_read(&space_info->groups_sem);
3938
3939         /* LOOP_CACHED_ONLY, only search fully cached block groups
3940          * LOOP_CACHING_NOWAIT, search partially cached block groups, but
3941          *                      don't wait for them to finish caching
3942          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3943          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3944          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3945          *                      again
3946          */
3947         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
3948             (found_uncached_bg || empty_size || empty_cluster ||
3949              allowed_chunk_alloc)) {
3950                 if (found_uncached_bg) {
3951                         found_uncached_bg = false;
3952                         if (loop < LOOP_CACHING_WAIT) {
3953                                 loop++;
3954                                 goto search;
3955                         }
3956                 }
3957
3958                 if (loop == LOOP_ALLOC_CHUNK) {
3959                         empty_size = 0;
3960                         empty_cluster = 0;
3961                 }
3962
3963                 if (allowed_chunk_alloc) {
3964                         ret = do_chunk_alloc(trans, root, num_bytes +
3965                                              2 * 1024 * 1024, data, 1);
3966                         allowed_chunk_alloc = 0;
3967                 } else {
3968                         space_info->force_alloc = 1;
3969                 }
3970
3971                 if (loop < LOOP_NO_EMPTY_SIZE) {
3972                         loop++;
3973                         goto search;
3974                 }
3975                 ret = -ENOSPC;
3976         } else if (!ins->objectid) {
3977                 ret = -ENOSPC;
3978         }
3979
3980         /* we found what we needed */
3981         if (ins->objectid) {
3982                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
3983                         trans->block_group = block_group->key.objectid;
3984
3985                 btrfs_put_block_group(block_group);
3986                 ret = 0;
3987         }
3988
3989         return ret;
3990 }
3991
3992 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3993 {
3994         struct btrfs_block_group_cache *cache;
3995
3996         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3997                (unsigned long long)(info->total_bytes - info->bytes_used -
3998                                     info->bytes_pinned - info->bytes_reserved),
3999                (info->full) ? "" : "not ");
4000         printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4001                " may_use=%llu, used=%llu\n",
4002                (unsigned long long)info->total_bytes,
4003                (unsigned long long)info->bytes_pinned,
4004                (unsigned long long)info->bytes_delalloc,
4005                (unsigned long long)info->bytes_may_use,
4006                (unsigned long long)info->bytes_used);
4007
4008         down_read(&info->groups_sem);
4009         list_for_each_entry(cache, &info->block_groups, list) {
4010                 spin_lock(&cache->lock);
4011                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4012                        "%llu pinned %llu reserved\n",
4013                        (unsigned long long)cache->key.objectid,
4014                        (unsigned long long)cache->key.offset,
4015                        (unsigned long long)btrfs_block_group_used(&cache->item),
4016                        (unsigned long long)cache->pinned,
4017                        (unsigned long long)cache->reserved);
4018                 btrfs_dump_free_space(cache, bytes);
4019                 spin_unlock(&cache->lock);
4020         }
4021         up_read(&info->groups_sem);
4022 }
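
/*
 * with purely hypothetical numbers, the output of the function above
 * would look roughly like:
 *
 *   space_info has 1048576 free, is not full
 *   space_info total=8388608, pinned=0, delalloc=0, may_use=0, used=7340032
 *   block group 12582912 has 8388608 bytes, 7340032 used 0 pinned 0 reserved
 *
 * followed by the free space dump for each block group.
 */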
4023
4024 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4025                                   struct btrfs_root *root,
4026                                   u64 num_bytes, u64 min_alloc_size,
4027                                   u64 empty_size, u64 hint_byte,
4028                                   u64 search_end, struct btrfs_key *ins,
4029                                   u64 data)
4030 {
4031         int ret;
4032         u64 search_start = 0;
4033         struct btrfs_fs_info *info = root->fs_info;
4034
4035         data = btrfs_get_alloc_profile(root, data);
4036 again:
4037         /*
4038          * the only place that sets empty_size is btrfs_realloc_node, which
4039          * is not called recursively on allocations
4040          */
4041         if (empty_size || root->ref_cows) {
4042                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
4043                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4044                                      2 * 1024 * 1024,
4045                                      BTRFS_BLOCK_GROUP_METADATA |
4046                                      (info->metadata_alloc_profile &
4047                                       info->avail_metadata_alloc_bits), 0);
4048                 }
4049                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4050                                      num_bytes + 2 * 1024 * 1024, data, 0);
4051         }
4052
4053         WARN_ON(num_bytes < root->sectorsize);
4054         ret = find_free_extent(trans, root, num_bytes, empty_size,
4055                                search_start, search_end, hint_byte, ins,
4056                                trans->alloc_exclude_start,
4057                                trans->alloc_exclude_nr, data);
4058
4059         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4060                 num_bytes = num_bytes >> 1;
4061                 num_bytes = num_bytes & ~(root->sectorsize - 1);
4062                 num_bytes = max(num_bytes, min_alloc_size);
4063                 do_chunk_alloc(trans, root->fs_info->extent_root,
4064                                num_bytes, data, 1);
4065                 goto again;
4066         }
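        /*
         * a worked example of the fallback above: with a 4K sector size
         * and min_alloc_size of 64K, a failed 1M request retries at
         * 512K, 256K, 128K and finally 64K.  the mask keeps each retry
         * sector aligned, and once num_bytes reaches min_alloc_size the
         * -ENOSPC is reported below instead of retrying yet again.
         */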
4067         if (ret == -ENOSPC) {
4068                 struct btrfs_space_info *sinfo;
4069
4070                 sinfo = __find_space_info(root->fs_info, data);
4071                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4072                        "wanted %llu\n", (unsigned long long)data,
4073                        (unsigned long long)num_bytes);
4074                 dump_space_info(sinfo, num_bytes);
4075         }
4076
4077         return ret;
4078 }
4079
4080 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4081 {
4082         struct btrfs_block_group_cache *cache;
4083         int ret = 0;
4084
4085         cache = btrfs_lookup_block_group(root->fs_info, start);
4086         if (!cache) {
4087                 printk(KERN_ERR "Unable to find block group for %llu\n",
4088                        (unsigned long long)start);
4089                 return -ENOSPC;
4090         }
4091
4092         ret = btrfs_discard_extent(root, start, len);
4093
4094         btrfs_add_free_space(cache, start, len);
4095         btrfs_put_block_group(cache);
4096         update_reserved_extents(root, start, len, 0);
4097
4098         return ret;
4099 }
4100
4101 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4102                                   struct btrfs_root *root,
4103                                   u64 num_bytes, u64 min_alloc_size,
4104                                   u64 empty_size, u64 hint_byte,
4105                                   u64 search_end, struct btrfs_key *ins,
4106                                   u64 data)
4107 {
4108         int ret;
4109         ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
4110                                      empty_size, hint_byte, search_end, ins,
4111                                      data);
4112         if (!ret)
4113                 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4114
4115         return ret;
4116 }
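
/*
 * a hypothetical caller in the data write path might reserve space for
 * a delalloc range roughly as follows (the surrounding variable names
 * are assumptions, not taken from this file):
 *
 *      struct btrfs_key ins;
 *      ret = btrfs_reserve_extent(trans, root, num_bytes,
 *                                 root->sectorsize, 0, alloc_hint,
 *                                 (u64)-1, &ins, 1);
 *      if (ret)
 *              return ret;
 *
 * on success, ins.objectid holds the start of the reservation,
 * ins.offset its length, and the range is already marked reserved.
 */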
4117
4118 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4119                                       struct btrfs_root *root,
4120                                       u64 parent, u64 root_objectid,
4121                                       u64 flags, u64 owner, u64 offset,
4122                                       struct btrfs_key *ins, int ref_mod)
4123 {
4124         int ret;
4125         struct btrfs_fs_info *fs_info = root->fs_info;
4126         struct btrfs_extent_item *extent_item;
4127         struct btrfs_extent_inline_ref *iref;
4128         struct btrfs_path *path;
4129         struct extent_buffer *leaf;
4130         int type;
4131         u32 size;
4132
4133         if (parent > 0)
4134                 type = BTRFS_SHARED_DATA_REF_KEY;
4135         else
4136                 type = BTRFS_EXTENT_DATA_REF_KEY;
4137
4138         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4139
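        /*
         * the size above covers an extent item with the backref packed
         * inline right behind it; for a data extent the item ends up
         * laid out roughly as:
         *
         *   [ btrfs_extent_item | inline ref type+offset | ref struct ]
         */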
4140         path = btrfs_alloc_path();
4141         BUG_ON(!path);
4142
4143         path->leave_spinning = 1;
4144         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4145                                       ins, size);
4146         BUG_ON(ret);
4147
4148         leaf = path->nodes[0];
4149         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4150                                      struct btrfs_extent_item);
4151         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4152         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4153         btrfs_set_extent_flags(leaf, extent_item,
4154                                flags | BTRFS_EXTENT_FLAG_DATA);
4155
4156         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4157         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4158         if (parent > 0) {
4159                 struct btrfs_shared_data_ref *ref;
4160                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4161                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4162                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4163         } else {
4164                 struct btrfs_extent_data_ref *ref;
4165                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4166                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4167                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4168                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4169                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4170         }
4171
4172         btrfs_mark_buffer_dirty(path->nodes[0]);
4173         btrfs_free_path(path);
4174
4175         ret = update_block_group(trans, root, ins->objectid, ins->offset,
4176                                  1, 0);
4177         if (ret) {
4178                 printk(KERN_ERR "btrfs update block group failed for %llu "
4179                        "%llu\n", (unsigned long long)ins->objectid,
4180                        (unsigned long long)ins->offset);
4181                 BUG();
4182         }
4183         return ret;
4184 }
4185
4186 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4187                                      struct btrfs_root *root,
4188                                      u64 parent, u64 root_objectid,
4189                                      u64 flags, struct btrfs_disk_key *key,
4190                                      int level, struct btrfs_key *ins)
4191 {
4192         int ret;
4193         struct btrfs_fs_info *fs_info = root->fs_info;
4194         struct btrfs_extent_item *extent_item;
4195         struct btrfs_tree_block_info *block_info;
4196         struct btrfs_extent_inline_ref *iref;
4197         struct btrfs_path *path;
4198         struct extent_buffer *leaf;
4199         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
4200
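        /*
         * tree blocks carry an extra btrfs_tree_block_info between the
         * extent item and the inline ref, so the layout is roughly:
         *
         *   [ btrfs_extent_item | tree_block_info (key, level) | iref ]
         */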
4201         path = btrfs_alloc_path();
4202         BUG_ON(!path);
4203
4204         path->leave_spinning = 1;
4205         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4206                                       ins, size);
4207         BUG_ON(ret);
4208
4209         leaf = path->nodes[0];
4210         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4211                                      struct btrfs_extent_item);
4212         btrfs_set_extent_refs(leaf, extent_item, 1);
4213         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4214         btrfs_set_extent_flags(leaf, extent_item,
4215                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4216         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4217
4218         btrfs_set_tree_block_key(leaf, block_info, key);
4219         btrfs_set_tree_block_level(leaf, block_info, level);
4220
4221         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4222         if (parent > 0) {
4223                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4224                 btrfs_set_extent_inline_ref_type(leaf, iref,
4225                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4226                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4227         } else {
4228                 btrfs_set_extent_inline_ref_type(leaf, iref,
4229                                                  BTRFS_TREE_BLOCK_REF_KEY);
4230                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
4231         }
4232
4233         btrfs_mark_buffer_dirty(leaf);
4234         btrfs_free_path(path);
4235
4236         ret = update_block_group(trans, root, ins->objectid, ins->offset,
4237                                  1, 0);
4238         if (ret) {
4239                 printk(KERN_ERR "btrfs update block group failed for %llu "
4240                        "%llu\n", (unsigned long long)ins->objectid,
4241                        (unsigned long long)ins->offset);
4242                 BUG();
4243         }
4244         return ret;
4245 }
4246
4247 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4248                                      struct btrfs_root *root,
4249                                      u64 root_objectid, u64 owner,
4250                                      u64 offset, struct btrfs_key *ins)
4251 {
4252         int ret;
4253
4254         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
4255
4256         ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
4257                                          0, root_objectid, owner, offset,
4258                                          BTRFS_ADD_DELAYED_EXTENT, NULL);
4259         return ret;
4260 }
4261
4262 /*
4263  * this is used by the tree logging recovery code.  It records that
4264  * an extent has been allocated and makes sure to clear the free
4265  * space cache bits as well
4266  */
4267 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4268                                    struct btrfs_root *root,
4269                                    u64 root_objectid, u64 owner, u64 offset,
4270                                    struct btrfs_key *ins)
4271 {
4272         int ret;
4273         struct btrfs_block_group_cache *block_group;
4274
4275         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4276         cache_block_group(block_group);
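        /*
         * block group caching is asynchronous, so the free space index
         * for this group may still be loading; wait until caching is
         * done so the btrfs_remove_free_space() call below can find
         * the extent.
         */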
4277         wait_event(block_group->caching_q,
4278                    block_group_cache_done(block_group));
4279
4280         ret = btrfs_remove_free_space(block_group, ins->objectid,
4281                                       ins->offset);
4282         BUG_ON(ret);
4283         btrfs_put_block_group(block_group);
4284         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4285                                          0, owner, offset, ins, 1);
4286         return ret;
4287 }
4288
4289 /*
4290  * finds a free extent and does all the dirty work required for allocation
4291  * returns the key for the extent through ins.
4292  *
4294  * returns 0 if everything worked, non-zero otherwise.
4295  */
4296 static int alloc_tree_block(struct btrfs_trans_handle *trans,
4297                             struct btrfs_root *root,
4298                             u64 num_bytes, u64 parent, u64 root_objectid,
4299                             struct btrfs_disk_key *key, int level,
4300                             u64 empty_size, u64 hint_byte, u64 search_end,
4301                             struct btrfs_key *ins)
4302 {
4303         int ret;
4304         u64 flags = 0;
4305
4306         ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4307                                      empty_size, hint_byte, search_end,
4308                                      ins, 0);
4309         if (ret)
4310                 return ret;
4311
4312         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4313                 if (parent == 0)
4314                         parent = ins->objectid;
4315                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4316         } else
4317                 BUG_ON(parent > 0);
4318
4319         update_reserved_extents(root, ins->objectid, ins->offset, 1);
4320         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4321                 struct btrfs_delayed_extent_op *extent_op;
4322                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4323                 BUG_ON(!extent_op);
4324                 if (key)
4325                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
4326                 else
4327                         memset(&extent_op->key, 0, sizeof(extent_op->key));
4328                 extent_op->flags_to_set = flags;
4329                 extent_op->update_key = 1;
4330                 extent_op->update_flags = 1;
4331                 extent_op->is_data = 0;
4332
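                /*
                 * the key and flags stashed in extent_op are applied to
                 * the new extent item later, when the delayed ref added
                 * below is actually run.
                 */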
4333                 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4334                                         ins->offset, parent, root_objectid,
4335                                         level, BTRFS_ADD_DELAYED_EXTENT,
4336                                         extent_op);
4337                 BUG_ON(ret);
4338         }
4339         return ret;
4340 }
4341
4342 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4343                                             struct btrfs_root *root,
4344                                             u64 bytenr, u32 blocksize,
4345                                             int level)
4346 {
4347         struct extent_buffer *buf;
4348
4349         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
4350         if (!buf)
4351                 return ERR_PTR(-ENOMEM);
4352         btrfs_set_header_generation(buf, trans->transid);
4353         btrfs_set_buffer_lockdep_class(buf, level);
4354         btrfs_tree_lock(buf);
4355         clean_tree_block(trans, root, buf);
4356
4357         btrfs_set_lock_blocking(buf);
4358         btrfs_set_buffer_uptodate(buf);
4359
4360         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4361                 set_extent_dirty(&root->dirty_log_pages, buf->start,
4362                          buf->start + buf->len - 1, GFP_NOFS);
4363         } else {
4364                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4365                          buf->start + buf->len - 1, GFP_NOFS);
4366         }
4367         trans->blocks_used++;
4368         /* this returns a buffer locked for blocking */
4369         return buf;
4370 }
4371
4372 /*
4373  * helper function to allocate a block for a given tree.
4374  * returns the tree buffer or an ERR_PTR on failure.
4375  */
4376 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4377                                         struct btrfs_root *root, u32 blocksize,
4378                                         u64 parent, u64 root_objectid,
4379                                         struct btrfs_disk_key *key, int level,
4380                                         u64 hint, u64 empty_size)
4381 {
4382         struct btrfs_key ins;
4383         int ret;
4384         struct extent_buffer *buf;
4385
4386         ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
4387                                key, level, empty_size, hint, (u64)-1, &ins);
4388         if (ret) {
4389                 BUG_ON(ret > 0);
4390                 return ERR_PTR(ret);
4391         }
4392
4393         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4394                                     blocksize, level);
4395         return buf;
4396 }
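
/*
 * a minimal sketch of how a caller might consume the result; this is
 * illustrative, not a call site from this file:
 *
 *      buf = btrfs_alloc_free_block(trans, root, blocksize, 0,
 *                                   root->root_key.objectid, &key,
 *                                   level, hint, 0);
 *      if (IS_ERR(buf))
 *              return PTR_ERR(buf);
 *      ... fill in the new block ...
 *      btrfs_mark_buffer_dirty(buf);
 *      btrfs_tree_unlock(buf);
 *      free_extent_buffer(buf);
 *
 * the buffer comes back locked, so the caller owns both the lock and
 * a reference.
 */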
4397
4398 #if 0
4399 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
4400                         struct btrfs_root *root, struct extent_buffer *leaf)
4401 {
4402         u64 disk_bytenr;
4403         u64 num_bytes;
4404         struct btrfs_key key;
4405         struct btrfs_file_extent_item *fi;
4406         u32 nritems;
4407         int i;
4408         int ret;
4409
4410         BUG_ON(!btrfs_is_leaf(leaf));
4411         nritems = btrfs_header_nritems(leaf);
4412
4413         for (i = 0; i < nritems; i++) {
4414                 cond_resched();
4415                 btrfs_item_key_to_cpu(leaf, &key, i);
4416
4417                 /* only extents have references, skip everything else */
4418                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4419                         continue;
4420
4421                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4422
4423                 /* inline extents live in the btree, they don't have refs */
4424                 if (btrfs_file_extent_type(leaf, fi) ==
4425                     BTRFS_FILE_EXTENT_INLINE)
4426                         continue;
4427
4428                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4429
4430                 /* holes don't have refs */
4431                 if (disk_bytenr == 0)
4432                         continue;
4433
4434                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4435                 ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
4436                                         leaf->start, 0, key.objectid, 0);
4437                 BUG_ON(ret);
4438         }
4439         return 0;
4440 }
4441
4442 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
4443                                         struct btrfs_root *root,
4444                                         struct btrfs_leaf_ref *ref)
4445 {
4446         int i;
4447         int ret;
4448         struct btrfs_extent_info *info;
4449         struct refsort *sorted;
4450
4451         if (ref->nritems == 0)
4452                 return 0;
4453
4454         sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
4455         for (i = 0; i < ref->nritems; i++) {
4456                 sorted[i].bytenr = ref->extents[i].bytenr;
4457                 sorted[i].slot = i;
4458         }
4459         sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
4460
4461         /*
4462          * we just sorted the extents by bytenr, so the frees
4463          * below walk the extent allocation tree in order
4464          */
4465         for (i = 0; i < ref->nritems; i++) {
4466                 info = ref->extents + sorted[i].slot;
4467                 ret = btrfs_free_extent(trans, root, info->bytenr,
4468                                           info->num_bytes, ref->bytenr,
4469                                           ref->owner, ref->generation,
4470                                           info->objectid, 0);
4471
4472                 atomic_inc(&root->fs_info->throttle_gen);
4473                 wake_up(&root->fs_info->transaction_throttle);
4474                 cond_resched();
4475
4476                 BUG_ON(ret);
4477                 info++;
4478         }
4479
4480         kfree(sorted);
4481         return 0;
4482 }
4483
4484
4485 static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
4486                                      struct btrfs_root *root, u64 start,
4487                                      u64 len, u32 *refs)
4488 {
4489         int ret;
4490
4491         ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
4492         BUG_ON(ret);
4493
4494 #if 0 /* some debugging code in case we see problems here */
4495         /* if the refs count is one, it won't get increased again.  But
4496          * if the ref count is > 1, someone may be decreasing it at
4497          * the same time we are.
4498          */
4499         if (*refs != 1) {
4500                 struct extent_buffer *eb = NULL;
4501                 eb = btrfs_find_create_tree_block(root, start, len);
4502                 if (eb)
4503                         btrfs_tree_lock(eb);
4504
4505                 mutex_lock(&root->fs_info->alloc_mutex);
4506                 ret = lookup_extent_ref(NULL, root, start, len, refs);
4507                 BUG_ON(ret);
4508                 mutex_unlock(&root->fs_info->alloc_mutex);
4509
4510                 if (eb) {
4511                         btrfs_tree_unlock(eb);
4512                         free_extent_buffer(eb);
4513                 }
4514                 if (*refs == 1) {
4515                         printk(KERN_ERR "btrfs block %llu went down to one "
4516                                "during drop_snap\n", (unsigned long long)start);
4517                 }
4518
4519         }
4520 #endif
4521
4522         cond_resched();
4523         return ret;
4524 }
4525
4526
4527 /*
4528  * this is used while deleting old snapshots, and it drops the refs
4529  * on a whole subtree starting from a level 1 node.
4530  *
4531  * The idea is to sort all the leaf pointers, and then drop the
4532  * ref on all the leaves in order.  Most of the time the leaves
4533  * will have ref cache entries, so no leaf IOs will be required to
4534  * find the extents they have references on.
4535  *
4536  * For each leaf, any references it has are also dropped in order
4537  *
4538  * This ends up dropping the references in something close to optimal
4539  * order for reading and modifying the extent allocation tree.
4540  */
4541 static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4542                                         struct btrfs_root *root,
4543                                         struct btrfs_path *path)
4544 {
4545         u64 bytenr;
4546         u64 root_owner;
4547         u64 root_gen;
4548         struct extent_buffer *eb = path->nodes[1];
4549         struct extent_buffer *leaf;
4550         struct btrfs_leaf_ref *ref;
4551         struct refsort *sorted = NULL;
4552         int nritems = btrfs_header_nritems(eb);
4553         int ret;
4554         int i;
4555         int refi = 0;
4556         int slot = path->slots[1];
4557         u32 blocksize = btrfs_level_size(root, 0);
4558         u32 refs;
4559
4560         if (nritems == 0)
4561                 goto out;
4562
4563         root_owner = btrfs_header_owner(eb);
4564         root_gen = btrfs_header_generation(eb);
4565         sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
4566
4567         /*
4568          * step one, sort all the leaf pointers so we don't scribble
4569          * randomly into the extent allocation tree
4570          */
4571         for (i = slot; i < nritems; i++) {
4572                 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
4573                 sorted[refi].slot = i;
4574                 refi++;
4575         }
4576
4577         /*
4578          * nritems won't be zero, but if we're picking up drop_snapshot
4579          * after a crash, slot might be > 0, so double check things
4580          * just in case.
4581          */
4582         if (refi == 0)
4583                 goto out;
4584
4585         sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
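        /*
         * struct refsort is declared earlier in this file; roughly, the
         * sort above operates on entries of the form:
         *
         *      struct refsort {
         *              u64 bytenr;
         *              u32 slot;
         *      };
         *
         * with refsort_cmp ordering them by ascending bytenr.
         */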
4586
4587         /*
4588          * the first loop frees everything the leaves point to
4589          */
4590         for (i = 0; i < refi; i++) {
4591                 u64 ptr_gen;
4592
4593                 bytenr = sorted[i].bytenr;
4594
4595                 /*
4596                  * check the reference count on this leaf.  If it is > 1
4597                  * we just decrement it below and don't update any
4598                  * of the refs the leaf points to.
4599                  */
4600                 ret = drop_snap_lookup_refcount(trans, root, bytenr,
4601                                                 blocksize, &refs);
4602                 BUG_ON(ret);
4603                 if (refs != 1)
4604                         continue;
4605
4606                 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
4607
4608                 /*
4609                  * the leaf only had one reference, which means the
4610                  * only thing pointing to this leaf is the snapshot
4611                  * we're deleting.  It isn't possible for the reference
4612                  * count to increase again later
4613                  *
4614                  * The reference cache is checked for the leaf,
4615                  * and if found we'll be able to drop any refs held by
4616                  * the leaf without needing to read it in.
4617                  */
4618                 ref = btrfs_lookup_leaf_ref(root, bytenr);
4619                 if (ref && ref->generation != ptr_gen) {
4620                         btrfs_free_leaf_ref(root, ref);
4621                         ref = NULL;
4622                 }
4623                 if (ref) {
4624                         ret = cache_drop_leaf_ref(trans, root, ref);
4625                         BUG_ON(ret);
4626                         btrfs_remove_leaf_ref(root, ref);
4627                         btrfs_free_leaf_ref(root, ref);
4628                 } else {
4629                         /*
4630                          * the leaf wasn't in the reference cache, so
4631                          * we have to read it.
4632                          */
4633                         leaf = read_tree_block(root, bytenr, blocksize,
4634                                                ptr_gen);
4635                         ret = btrfs_drop_leaf_ref(trans, root, leaf);
4636                         BUG_ON(ret);
4637                         free_extent_buffer(leaf);
4638                 }
4639                 atomic_inc(&root->fs_info->throttle_gen);
4640                 wake_up(&root->fs_info->transaction_throttle);
4641                 cond_resched();
4642         }
4643
4644         /*
4645          * run through the loop again to free the refs on the leaves.
4646          * This is faster than doing it in the loop above because
4647          * the leaves are likely to be clustered together.  We end up
4648          * working in nice chunks on the extent allocation tree.
4649          */
4650         for (i = 0; i < refi; i++) {
4651                 bytenr = sorted[i].bytenr;
4652                 ret = btrfs_free_extent(trans, root, bytenr,
4653                                         blocksize, eb->start,
4654                                         root_owner, root_gen, 0, 1);
4655                 BUG_ON(ret);
4656
4657                 atomic_inc(&root->fs_info->throttle_gen);
4658                 wake_up(&root->fs_info->transaction_throttle);
4659                 cond_resched();
4660         }
4661 out:
4662         kfree(sorted);
4663
4664         /*
4665          * update the path to show we've processed the entire level 1
4666          * node.  This will get saved into the root's drop_snapshot_progress
4667          * field so these drops are not repeated again if this transaction
4668          * commits.
4669          */
4670         path->slots[1] = nritems;
4671         return 0;
4672 }
4673
4674 /*
4675  * helper function for drop_snapshot, this walks down the tree dropping ref
4676  * counts as it goes.
4677  */
4678 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4679                                    struct btrfs_root *root,
4680                                    struct btrfs_path *path, int *level)
4681 {
4682         u64 root_owner;
4683         u64 root_gen;
4684         u64 bytenr;
4685         u64 ptr_gen;
4686         struct extent_buffer *next;
4687         struct extent_buffer *cur;
4688         struct extent_buffer *parent;
4689         u32 blocksize;
4690         int ret;
4691         u32 refs;
4692
4693         WARN_ON(*level < 0);
4694         WARN_ON(*level >= BTRFS_MAX_LEVEL);
4695         ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4696                                 path->nodes[*level]->len, &refs);
4697         BUG_ON(ret);
4698         if (refs > 1)
4699                 goto out;
4700
4701         /*
4702          * walk down to the last node level and free all the leaves
4703          */
4704         while (*level >= 0) {
4705                 WARN_ON(*level < 0);
4706                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4707                 cur = path->nodes[*level];
4708
4709                 if (btrfs_header_level(cur) != *level)
4710                         WARN_ON(1);
4711
4712                 if (path->slots[*level] >=
4713                     btrfs_header_nritems(cur))
4714                         break;
4715
4716                 /* the new code goes down to level 1 and does all the
4717                  * leaves pointed to by that node in bulk.  So, this check
4718                  * for level 0 will always be false.
4719                  *
4720                  * But, the disk format allows the drop_snapshot_progress
4721                  * field in the root to leave things in a state where
4722                  * a leaf will need cleaning up here.  If someone crashes
4723                  * with the old code and then boots with the new code,
4724                  * we might find a leaf here.
4725                  */
4726                 if (*level == 0) {
4727                         ret = btrfs_drop_leaf_ref(trans, root, cur);
4728                         BUG_ON(ret);
4729                         break;
4730                 }
4731
4732                 /*
4733                  * once we get to level one, process the whole node
4734                  * at once, including everything below it.
4735                  */
4736                 if (*level == 1) {
4737                         ret = drop_level_one_refs(trans, root, path);
4738                         BUG_ON(ret);
4739                         break;
4740                 }
4741
4742                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4743                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4744                 blocksize = btrfs_level_size(root, *level - 1);
4745
4746                 ret = drop_snap_lookup_refcount(trans, root, bytenr,
4747                                                 blocksize, &refs);
4748                 BUG_ON(ret);
4749
4750                 /*
4751                  * if there is more than one reference, we don't need
4752                  * to read that node to drop any references it has.  We
4753                  * just drop the ref we hold on that node and move on to the
4754                  * next slot in this level.
4755                  */
4756                 if (refs != 1) {
4757                         parent = path->nodes[*level];
4758                         root_owner = btrfs_header_owner(parent);
4759                         root_gen = btrfs_header_generation(parent);
4760                         path->slots[*level]++;
4761
4762                         ret = btrfs_free_extent(trans, root, bytenr,
4763                                                 blocksize, parent->start,
4764                                                 root_owner, root_gen,
4765                                                 *level - 1, 1);
4766                         BUG_ON(ret);
4767
4768                         atomic_inc(&root->fs_info->throttle_gen);
4769                         wake_up(&root->fs_info->transaction_throttle);
4770                         cond_resched();
4771
4772                         continue;
4773                 }
4774
4775                 /*
4776                  * we need to keep freeing things in the next level down.
4777                  * read the block and loop around to process it
4778                  */
4779                 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4780                 WARN_ON(*level <= 0);
4781                 if (path->nodes[*level-1])
4782                         free_extent_buffer(path->nodes[*level-1]);
4783                 path->nodes[*level-1] = next;
4784                 *level = btrfs_header_level(next);
4785                 path->slots[*level] = 0;
4786                 cond_resched();
4787         }
4788 out:
4789         WARN_ON(*level < 0);
4790         WARN_ON(*level >= BTRFS_MAX_LEVEL);
4791
4792         if (path->nodes[*level] == root->node) {
4793                 parent = path->nodes[*level];
4794                 bytenr = path->nodes[*level]->start;
4795         } else {
4796                 parent = path->nodes[*level + 1];
4797                 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4798         }
4799
4800         blocksize = btrfs_level_size(root, *level);
4801         root_owner = btrfs_header_owner(parent);
4802         root_gen = btrfs_header_generation(parent);
4803
4804         /*
4805          * cleanup and free the reference on the last node
4806          * we processed
4807          */
4808         ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4809                                   parent->start, root_owner, root_gen,
4810                                   *level, 1);
4811         free_extent_buffer(path->nodes[*level]);
4812         path->nodes[*level] = NULL;
4813
4814         *level += 1;
4815         BUG_ON(ret);
4816
4817         cond_resched();
4818         return 0;
4819 }
4820 #endif
4821
4822 struct walk_control {
4823         u64 refs[BTRFS_MAX_LEVEL];
4824         u64 flags[BTRFS_MAX_LEVEL];
4825         struct btrfs_key update_progress;
4826         int stage;
4827         int level;
4828         int shared_level;
4829         int update_ref;
4830         int keep_locks;
4831 };
4832
4833 #define DROP_REFERENCE  1
4834 #define UPDATE_BACKREF  2
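
/*
 * a rough guide to the fields above: refs[] and flags[] cache the
 * reference count and extent flags of the block held at each level of
 * the path, stage is DROP_REFERENCE or UPDATE_BACKREF, shared_level
 * remembers the level where a shared block flipped us into
 * UPDATE_BACKREF, and update_progress is the key the DROP_REFERENCE
 * stage resumes from once back refs have been updated.
 */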
4835
4836 /*
4837  * helper to process a tree block while walking down the tree.
4838  *
4839  * when wc->stage == DROP_REFERENCE, this function checks the
4840  * reference count of the block. if the block is shared and
4841  * we need to update back refs for the subtree rooted at the
4842  * block, this function changes wc->stage to UPDATE_BACKREF
4843  *
4844  * when wc->stage == UPDATE_BACKREF, this function updates
4845  * back refs for pointers in the block.
4846  *
4847  * NOTE: return value 1 means we should stop walking down.
4848  */
4849 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4850                                    struct btrfs_root *root,
4851                                    struct btrfs_path *path,
4852                                    struct walk_control *wc)
4853 {
4854         int level = wc->level;
4855         struct extent_buffer *eb = path->nodes[level];
4856         struct btrfs_key key;
4857         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4858         int ret;
4859
4860         if (wc->stage == UPDATE_BACKREF &&
4861             btrfs_header_owner(eb) != root->root_key.objectid)
4862                 return 1;
4863
4864         /*
4865          * when the reference count of a tree block is 1, it won't increase
4866          * again. once the full backref flag is set, we never clear it.
4867          */
4868         if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4869             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
4870                 BUG_ON(!path->locks[level]);
4871                 ret = btrfs_lookup_extent_info(trans, root,
4872                                                eb->start, eb->len,
4873                                                &wc->refs[level],
4874                                                &wc->flags[level]);
4875                 BUG_ON(ret);
4876                 BUG_ON(wc->refs[level] == 0);
4877         }
4878
4879         if (wc->stage == DROP_REFERENCE &&
4880             wc->update_ref && wc->refs[level] > 1) {
4881                 BUG_ON(eb == root->node);
4882                 BUG_ON(path->slots[level] > 0);
4883                 if (level == 0)
4884                         btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4885                 else
4886                         btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4887                 if (btrfs_header_owner(eb) == root->root_key.objectid &&
4888                     btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4889                         wc->stage = UPDATE_BACKREF;
4890                         wc->shared_level = level;
4891                 }
4892         }
4893
4894         if (wc->stage == DROP_REFERENCE) {
4895                 if (wc->refs[level] > 1)
4896                         return 1;
4897
4898                 if (path->locks[level] && !wc->keep_locks) {
4899                         btrfs_tree_unlock(eb);
4900                         path->locks[level] = 0;
4901                 }
4902                 return 0;
4903         }
4904
4905         /* wc->stage == UPDATE_BACKREF */
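        /*
         * converting to a full backref takes three steps below: add
         * full backrefs for every pointer in the block, drop the old
         * implicit backrefs, then set BTRFS_BLOCK_FLAG_FULL_BACKREF on
         * the extent item itself.
         */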
4906         if (!(wc->flags[level] & flag)) {
4907                 BUG_ON(!path->locks[level]);
4908                 ret = btrfs_inc_ref(trans, root, eb, 1);
4909                 BUG_ON(ret);
4910                 ret = btrfs_dec_ref(trans, root, eb, 0);
4911                 BUG_ON(ret);
4912                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
4913                                                   eb->len, flag, 0);
4914                 BUG_ON(ret);
4915                 wc->flags[level] |= flag;
4916         }
4917
4918         /*
4919          * the block is shared by multiple trees, so it's not good to
4920          * keep the tree lock
4921          */
4922         if (path->locks[level] && level > 0) {
4923                 btrfs_tree_unlock(eb);
4924                 path->locks[level] = 0;
4925         }
4926         return 0;
4927 }
4928
4929 /*
4930  * helper to process a tree block while walking up the tree.
4931  *
4932  * when wc->stage == DROP_REFERENCE, this function drops
4933  * reference count on the block.
4934  *
4935  * when wc->stage == UPDATE_BACKREF, this function changes
4936  * wc->stage back to DROP_REFERENCE if we changed wc->stage
4937  * to UPDATE_BACKREF previously while processing the block.
4938  *
4939  * NOTE: return value 1 means we should stop walking up.
4940  */
4941 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4942                                  struct btrfs_root *root,
4943                                  struct btrfs_path *path,
4944                                  struct walk_control *wc)
4945 {
4946         int ret = 0;
4947         int level = wc->level;
4948         struct extent_buffer *eb = path->nodes[level];
4949         u64 parent = 0;
4950
4951         if (wc->stage == UPDATE_BACKREF) {
4952                 BUG_ON(wc->shared_level < level);
4953                 if (level < wc->shared_level)
4954                         goto out;
4955
4956                 BUG_ON(wc->refs[level] <= 1);
4957                 ret = find_next_key(path, level + 1, &wc->update_progress);
4958                 if (ret > 0)
4959                         wc->update_ref = 0;
4960
4961                 wc->stage = DROP_REFERENCE;
4962                 wc->shared_level = -1;
4963                 path->slots[level] = 0;
4964
4965                 /*
4966                  * check the reference count again if the block isn't
4967                  * locked. we should start walking down the tree again
4968                  * if the reference count is one.
4969                  */
4970                 if (!path->locks[level]) {
4971                         BUG_ON(level == 0);
4972                         btrfs_tree_lock(eb);
4973                         btrfs_set_lock_blocking(eb);
4974                         path->locks[level] = 1;
4975
4976                         ret = btrfs_lookup_extent_info(trans, root,
4977                                                        eb->start, eb->len,
4978                                                        &wc->refs[level],
4979                                                        &wc->flags[level]);
4980                         BUG_ON(ret);
4981                         BUG_ON(wc->refs[level] == 0);
4982                         if (wc->refs[level] == 1) {
4983                                 btrfs_tree_unlock(eb);
4984                                 path->locks[level] = 0;
4985                                 return 1;
4986                         }
4987                 } else {
4988                         BUG_ON(level != 0);
4989                 }
4990         }
4991
4992         /* wc->stage == DROP_REFERENCE */
4993         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
4994
4995         if (wc->refs[level] == 1) {
4996                 if (level == 0) {
4997                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4998                                 ret = btrfs_dec_ref(trans, root, eb, 1);
4999                         else
5000                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5001                         BUG_ON(ret);
5002                 }
5003                 /* make block locked assertion in clean_tree_block happy */
5004                 if (!path->locks[level] &&
5005                     btrfs_header_generation(eb) == trans->transid) {
5006                         btrfs_tree_lock(eb);
5007                         btrfs_set_lock_blocking(eb);
5008                         path->locks[level] = 1;
5009                 }
5010                 clean_tree_block(trans, root, eb);
5011         }
5012
5013         if (eb == root->node) {
5014                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5015                         parent = eb->start;
5016                 else
5017                         BUG_ON(root->root_key.objectid !=
5018                                btrfs_header_owner(eb));
5019         } else {
5020                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5021                         parent = path->nodes[level + 1]->start;
5022                 else
5023                         BUG_ON(root->root_key.objectid !=
5024                                btrfs_header_owner(path->nodes[level + 1]));
5025         }
5026
5027         ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
5028                                 root->root_key.objectid, level, 0);
5029         BUG_ON(ret);
5030 out:
5031         wc->refs[level] = 0;
5032         wc->flags[level] = 0;
5033         return ret;
5034 }
5035
5036 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5037                                    struct btrfs_root *root,
5038                                    struct btrfs_path *path,
5039                                    struct walk_control *wc)
5040 {
5041         struct extent_buffer *next;
5042         struct extent_buffer *cur;
5043         u64 bytenr;
5044         u64 ptr_gen;
5045         u32 blocksize;
5046         int level = wc->level;
5047         int ret;
5048
5049         while (level >= 0) {
5050                 cur = path->nodes[level];
5051                 BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
5052
5053                 ret = walk_down_proc(trans, root, path, wc);
5054                 if (ret > 0)
5055                         break;
5056
5057                 if (level == 0)
5058                         break;
5059
5060                 bytenr = btrfs_node_blockptr(cur, path->slots[level]);
5061                 blocksize = btrfs_level_size(root, level - 1);
5062                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
5063
5064                 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
5065                 btrfs_tree_lock(next);
5066                 btrfs_set_lock_blocking(next);
5067
5068                 level--;
5069                 BUG_ON(level != btrfs_header_level(next));
5070                 path->nodes[level] = next;
5071                 path->slots[level] = 0;
5072                 path->locks[level] = 1;
5073                 wc->level = level;
5074         }
5075         return 0;
5076 }
5077
5078 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5079                                  struct btrfs_root *root,
5080                                  struct btrfs_path *path,
5081                                  struct walk_control *wc, int max_level)
5082 {
5083         int level = wc->level;
5084         int ret;
5085
5086         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5087         while (level < max_level && path->nodes[level]) {
5088                 wc->level = level;
5089                 if (path->slots[level] + 1 <
5090                     btrfs_header_nritems(path->nodes[level])) {
5091                         path->slots[level]++;
5092                         return 0;
5093                 } else {
5094                         ret = walk_up_proc(trans, root, path, wc);
5095                         if (ret > 0)
5096                                 return 0;
5097
5098                         if (path->locks[level]) {
5099                                 btrfs_tree_unlock(path->nodes[level]);
5100                                 path->locks[level] = 0;
5101                         }
5102                         free_extent_buffer(path->nodes[level]);
5103                         path->nodes[level] = NULL;
5104                         level++;
5105                 }
5106         }
5107         return 1;
5108 }
5109
5110 /*
5111  * drop a subvolume tree.
5112  *
5113  * this function traverses the tree freeing any blocks that are
5114  * only referenced by the tree.
5115  *
5116  * when a shared tree block is found, this function decreases its
5117  * reference count by one. if update_ref is true, this function
5118  * also makes sure backrefs for the shared block and all lower level
5119  * blocks are properly updated.
5120  */
5121 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5122 {
5123         struct btrfs_path *path;
5124         struct btrfs_trans_handle *trans;
5125         struct btrfs_root *tree_root = root->fs_info->tree_root;
5126         struct btrfs_root_item *root_item = &root->root_item;
5127         struct walk_control *wc;
5128         struct btrfs_key key;
5129         int err = 0;
5130         int ret;
5131         int level;
5132
5133         path = btrfs_alloc_path();
5134         BUG_ON(!path);
5135
5136         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5137         BUG_ON(!wc);
5138
5139         trans = btrfs_start_transaction(tree_root, 1);
5140
5141         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5142                 level = btrfs_header_level(root->node);
5143                 path->nodes[level] = btrfs_lock_root_node(root);
5144                 btrfs_set_lock_blocking(path->nodes[level]);
5145                 path->slots[level] = 0;
5146                 path->locks[level] = 1;
5147                 memset(&wc->update_progress, 0,
5148                        sizeof(wc->update_progress));
5149         } else {
5150                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5151                 memcpy(&wc->update_progress, &key,
5152                        sizeof(wc->update_progress));
5153
5154                 level = root_item->drop_level;
5155                 BUG_ON(level == 0);
5156                 path->lowest_level = level;
5157                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5158                 path->lowest_level = 0;
5159                 if (ret < 0) {
5160                         err = ret;
5161                         goto out;
5162                 }
5163                 btrfs_node_key_to_cpu(path->nodes[level], &key,
5164                                       path->slots[level]);
5165                 WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
5166
5167                 /*
5168                  * unlock our path, this is safe because only this
5169                  * function is allowed to delete this snapshot
5170                  */
5171                 btrfs_unlock_up_safe(path, 0);
5172
5173                 level = btrfs_header_level(root->node);
5174                 while (1) {
5175                         btrfs_tree_lock(path->nodes[level]);
5176                         btrfs_set_lock_blocking(path->nodes[level]);
5177
5178                         ret = btrfs_lookup_extent_info(trans, root,
5179                                                 path->nodes[level]->start,
5180                                                 path->nodes[level]->len,
5181                                                 &wc->refs[level],
5182                                                 &wc->flags[level]);
5183                         BUG_ON(ret);
5184                         BUG_ON(wc->refs[level] == 0);
5185
5186                         if (level == root_item->drop_level)
5187                                 break;
5188
5189                         btrfs_tree_unlock(path->nodes[level]);
5190                         WARN_ON(wc->refs[level] != 1);
5191                         level--;
5192                 }
5193         }
5194
5195         wc->level = level;
5196         wc->shared_level = -1;
5197         wc->stage = DROP_REFERENCE;
5198         wc->update_ref = update_ref;
5199         wc->keep_locks = 0;
5200
5201         while (1) {
5202                 ret = walk_down_tree(trans, root, path, wc);
5203                 if (ret < 0) {
5204                         err = ret;
5205                         break;
5206                 }
5207
5208                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5209                 if (ret < 0) {
5210                         err = ret;
5211                         break;
5212                 }
5213
5214                 if (ret > 0) {
5215                         BUG_ON(wc->stage != DROP_REFERENCE);
5216                         break;
5217                 }
5218
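                /*
                 * record how far we've gotten; drop_progress and
                 * drop_level are what allow the code at the top of
                 * this function to resume the drop after a commit or
                 * a crash.
                 */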
5219                 if (wc->stage == DROP_REFERENCE) {
5220                         level = wc->level;
5221                         btrfs_node_key(path->nodes[level],
5222                                        &root_item->drop_progress,
5223                                        path->slots[level]);
5224                         root_item->drop_level = level;
5225                 }
5226
5227                 BUG_ON(wc->level == 0);
5228                 if (trans->transaction->in_commit ||
5229                     trans->transaction->delayed_refs.flushing) {
5230                         ret = btrfs_update_root(trans, tree_root,
5231                                                 &root->root_key,
5232                                                 root_item);
5233                         BUG_ON(ret);
5234
5235                         btrfs_end_transaction(trans, tree_root);
5236                         trans = btrfs_start_transaction(tree_root, 1);
5237                 } else {
5238                         unsigned long update;
5239                         update = trans->delayed_ref_updates;
5240                         trans->delayed_ref_updates = 0;
5241                         if (update)
5242                                 btrfs_run_delayed_refs(trans, tree_root,
5243                                                        update);
5244                 }
5245         }
5246         btrfs_release_path(root, path);
5247         BUG_ON(err);
5248
5249         ret = btrfs_del_root(trans, tree_root, &root->root_key);
5250         BUG_ON(ret);
5251
5252         free_extent_buffer(root->node);
5253         free_extent_buffer(root->commit_root);
5254         kfree(root);
5255 out:
5256         btrfs_end_transaction(trans, tree_root);
5257         kfree(wc);
5258         btrfs_free_path(path);
5259         return err;
5260 }
5261
5262 /*
5263  * drop subtree rooted at tree block 'node'.
5264  *
5265  * NOTE: this function will unlock and release tree block 'node'
5266  */
5267 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5268                         struct btrfs_root *root,
5269                         struct extent_buffer *node,
5270                         struct extent_buffer *parent)
5271 {
5272         struct btrfs_path *path;
5273         struct walk_control *wc;
5274         int level;
5275         int parent_level;
5276         int ret = 0;
5277         int wret;
5278
5279         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5280
5281         path = btrfs_alloc_path();
5282         BUG_ON(!path);
5283
5284         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5285         BUG_ON(!wc);
5286
5287         btrfs_assert_tree_locked(parent);
5288         parent_level = btrfs_header_level(parent);
5289         extent_buffer_get(parent);
5290         path->nodes[parent_level] = parent;
5291         path->slots[parent_level] = btrfs_header_nritems(parent);
5292
5293         btrfs_assert_tree_locked(node);
5294         level = btrfs_header_level(node);
5295         path->nodes[level] = node;
5296         path->slots[level] = 0;
5297         path->locks[level] = 1;
5298
5299         wc->refs[parent_level] = 1;
5300         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5301         wc->level = level;
5302         wc->shared_level = -1;
5303         wc->stage = DROP_REFERENCE;
5304         wc->update_ref = 0;
5305         wc->keep_locks = 1;
5306
5307         while (1) {
5308                 wret = walk_down_tree(trans, root, path, wc);
5309                 if (wret < 0) {
5310                         ret = wret;
5311                         break;
5312                 }
5313
5314                 wret = walk_up_tree(trans, root, path, wc, parent_level);
5315                 if (wret < 0)
5316                         ret = wret;
5317                 if (wret != 0)
5318                         break;
5319         }
5320
5321         kfree(wc);
5322         btrfs_free_path(path);
5323         return ret;
5324 }
5325
5326 #if 0
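/*
 * e.g. calc_ra(100, 120, 32) returns 120: the read-ahead window would
 * extend to page 131, but is clamped to the last page of the range.
 */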
5327 static unsigned long calc_ra(unsigned long start, unsigned long last,
5328                              unsigned long nr)
5329 {
5330         return min(last, start + nr - 1);
5331 }
5332
5333 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
5334                                          u64 len)
5335 {
5336         u64 page_start;
5337         u64 page_end;
5338         unsigned long first_index;
5339         unsigned long last_index;
5340         unsigned long i;
5341         struct page *page;
5342         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5343         struct file_ra_state *ra;
5344         struct btrfs_ordered_extent *ordered;
5345         unsigned int total_read = 0;
5346         unsigned int total_dirty = 0;
5347         int ret = 0;
5348
5349         ra = kzalloc(sizeof(*ra), GFP_NOFS);
5350
5351         mutex_lock(&inode->i_mutex);
5352         first_index = start >> PAGE_CACHE_SHIFT;
5353         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
5354
5355         /* make sure the dirty trick played by the caller works */
5356         ret = invalidate_inode_pages2_range(inode->i_mapping,
5357                                             first_index, last_index);
5358         if (ret)
5359                 goto out_unlock;
5360
5361         file_ra_state_init(ra, inode->i_mapping);
5362
5363         for (i = first_index; i <= last_index; i++) {
5364                 if (total_read % ra->ra_pages == 0) {
5365                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
5366                                        calc_ra(i, last_index, ra->ra_pages));
5367                 }
5368                 total_read++;
5369 again:
5370                 BUG_ON(((u64)i << PAGE_CACHE_SHIFT) >
5371                        i_size_read(inode));
5372                 page = grab_cache_page(inode->i_mapping, i);
5373                 if (!page) {
5374                         ret = -ENOMEM;
5375                         goto out_unlock;
5376                 }
5377                 if (!PageUptodate(page)) {
5378                         btrfs_readpage(NULL, page);
5379                         lock_page(page);
5380                         if (!PageUptodate(page)) {
5381                                 unlock_page(page);
5382                                 page_cache_release(page);
5383                                 ret = -EIO;
5384                                 goto out_unlock;
5385                         }
5386                 }
5387                 wait_on_page_writeback(page);
5388
5389                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
5390                 page_end = page_start + PAGE_CACHE_SIZE - 1;
5391                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
5392
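                /*
                 * a pending ordered extent means IO is still in flight
                 * for this range; drop our locks, wait for it to finish
                 * and retry the page
                 */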
5393                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5394                 if (ordered) {
5395                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5396                         unlock_page(page);
5397                         page_cache_release(page);
5398                         btrfs_start_ordered_extent(inode, ordered, 1);
5399                         btrfs_put_ordered_extent(ordered);
5400                         goto again;
5401                 }
5402                 set_page_extent_mapped(page);
5403
5404                 if (i == first_index)
5405                         set_extent_bits(io_tree, page_start, page_end,
5406                                         EXTENT_BOUNDARY, GFP_NOFS);
5407                 btrfs_set_extent_delalloc(inode, page_start, page_end);
5408
5409                 set_page_dirty(page);
5410                 total_dirty++;
5411
5412                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5413                 unlock_page(page);
5414                 page_cache_release(page);
5415         }
5416
5417 out_unlock:
5418         kfree(ra);
5419         mutex_unlock(&inode->i_mutex);
5420         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
5421         return ret;
5422 }
5423
5424 static noinline int relocate_data_extent(struct inode *reloc_inode,
5425                                          struct btrfs_key *extent_key,
5426                                          u64 offset)
5427 {
5428         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
5429         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
5430         struct extent_map *em;
5431         u64 start = extent_key->objectid - offset;
5432         u64 end = start + extent_key->offset - 1;
5433
5434         em = alloc_extent_map(GFP_NOFS);
5435         BUG_ON(!em || IS_ERR(em));
5436
5437         em->start = start;
5438         em->len = extent_key->offset;
5439         em->block_len = extent_key->offset;
5440         em->block_start = extent_key->objectid;
5441         em->bdev = root->fs_info->fs_devices->latest_bdev;
5442         set_bit(EXTENT_FLAG_PINNED, &em->flags);
5443
5444         /* set up an extent map to cheat btrfs_readpage */
5445         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
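        /*
         * loop until our pinned mapping is the only one in the range:
         * -EEXIST means an old mapping overlaps, so drop the cached
         * extents for the range and try again
         */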
5446         while (1) {
5447                 int ret;
5448                 spin_lock(&em_tree->lock);
5449                 ret = add_extent_mapping(em_tree, em);
5450                 spin_unlock(&em_tree->lock);
5451                 if (ret != -EEXIST) {
5452                         free_extent_map(em);
5453                         break;
5454                 }
5455                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
5456         }
5457         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5458
5459         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
5460 }
5461
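/*
 * a ref_path describes a single chain of back references from an
 * extent up towards a tree root: nodes[] holds the bytenr of the
 * referencing block at each level, lowest_level is where the walk
 * starts and shared_level tracks the highest level that is still
 * shared with the previously returned path.  node_keys[]/new_nodes[]
 * are filled in later by relocate_one_path() with the relocated copies.
 */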
5462 struct btrfs_ref_path {
5463         u64 extent_start;
5464         u64 nodes[BTRFS_MAX_LEVEL];
5465         u64 root_objectid;
5466         u64 root_generation;
5467         u64 owner_objectid;
5468         u32 num_refs;
5469         int lowest_level;
5470         int current_level;
5471         int shared_level;
5472
5473         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
5474         u64 new_nodes[BTRFS_MAX_LEVEL];
5475 };
5476
5477 struct disk_extent {
5478         u64 ram_bytes;
5479         u64 disk_bytenr;
5480         u64 disk_num_bytes;
5481         u64 offset;
5482         u64 num_bytes;
5483         u8 compression;
5484         u8 encryption;
5485         u16 other_encoding;
5486 };
5487
5488 static int is_cowonly_root(u64 root_objectid)
5489 {
5490         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
5491             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
5492             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
5493             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
5494             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5495             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
5496                 return 1;
5497         return 0;
5498 }
5499
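/*
 * walk to the next reference path for ref_path->extent_start.
 *
 * on the first call the path is walked up from the extent until a
 * tree root is found; subsequent calls step down to the deepest level
 * with an unvisited backref and walk up again.  returns 0 when a path
 * was found, 1 when no more paths exist and < 0 on error.
 */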
5500 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
5501                                     struct btrfs_root *extent_root,
5502                                     struct btrfs_ref_path *ref_path,
5503                                     int first_time)
5504 {
5505         struct extent_buffer *leaf;
5506         struct btrfs_path *path;
5507         struct btrfs_extent_ref *ref;
5508         struct btrfs_key key;
5509         struct btrfs_key found_key;
5510         u64 bytenr;
5511         u32 nritems;
5512         int level;
5513         int ret = 1;
5514
5515         path = btrfs_alloc_path();
5516         if (!path)
5517                 return -ENOMEM;
5518
5519         if (first_time) {
5520                 ref_path->lowest_level = -1;
5521                 ref_path->current_level = -1;
5522                 ref_path->shared_level = -1;
5523                 goto walk_up;
5524         }
5525 walk_down:
5526         level = ref_path->current_level - 1;
5527         while (level >= -1) {
5528                 u64 parent;
5529                 if (level < ref_path->lowest_level)
5530                         break;
5531
5532                 if (level >= 0)
5533                         bytenr = ref_path->nodes[level];
5534                 else
5535                         bytenr = ref_path->extent_start;
5536                 BUG_ON(bytenr == 0);
5537
5538                 parent = ref_path->nodes[level + 1];
5539                 ref_path->nodes[level + 1] = 0;
5540                 ref_path->current_level = level;
5541                 BUG_ON(parent == 0);
5542
5543                 key.objectid = bytenr;
5544                 key.offset = parent + 1;
5545                 key.type = BTRFS_EXTENT_REF_KEY;
5546
5547                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5548                 if (ret < 0)
5549                         goto out;
5550                 BUG_ON(ret == 0);
5551
5552                 leaf = path->nodes[0];
5553                 nritems = btrfs_header_nritems(leaf);
5554                 if (path->slots[0] >= nritems) {
5555                         ret = btrfs_next_leaf(extent_root, path);
5556                         if (ret < 0)
5557                                 goto out;
5558                         if (ret > 0)
5559                                 goto next;
5560                         leaf = path->nodes[0];
5561                 }
5562
5563                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5564                 if (found_key.objectid == bytenr &&
5565                     found_key.type == BTRFS_EXTENT_REF_KEY) {
5566                         if (level < ref_path->shared_level)
5567                                 ref_path->shared_level = level;
5568                         goto found;
5569                 }
5570 next:
5571                 level--;
5572                 btrfs_release_path(extent_root, path);
5573                 cond_resched();
5574         }
5575         /* reached lowest level */
5576         ret = 1;
5577         goto out;
5578 walk_up:
5579         level = ref_path->current_level;
5580         while (level < BTRFS_MAX_LEVEL - 1) {
5581                 u64 ref_objectid;
5582
5583                 if (level >= 0)
5584                         bytenr = ref_path->nodes[level];
5585                 else
5586                         bytenr = ref_path->extent_start;
5587
5588                 BUG_ON(bytenr == 0);
5589
5590                 key.objectid = bytenr;
5591                 key.offset = 0;
5592                 key.type = BTRFS_EXTENT_REF_KEY;
5593
5594                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5595                 if (ret < 0)
5596                         goto out;
5597
5598                 leaf = path->nodes[0];
5599                 nritems = btrfs_header_nritems(leaf);
5600                 if (path->slots[0] >= nritems) {
5601                         ret = btrfs_next_leaf(extent_root, path);
5602                         if (ret < 0)
5603                                 goto out;
5604                         if (ret > 0) {
5605                                 /* the extent was freed by someone */
5606                                 if (ref_path->lowest_level == level)
5607                                         goto out;
5608                                 btrfs_release_path(extent_root, path);
5609                                 goto walk_down;
5610                         }
5611                         leaf = path->nodes[0];
5612                 }
5613
5614                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5615                 if (found_key.objectid != bytenr ||
5616                                 found_key.type != BTRFS_EXTENT_REF_KEY) {
5617                         /* the extent was freed by someone */
5618                         if (ref_path->lowest_level == level) {
5619                                 ret = 1;
5620                                 goto out;
5621                         }
5622                         btrfs_release_path(extent_root, path);
5623                         goto walk_down;
5624                 }
5625 found:
5626                 ref = btrfs_item_ptr(leaf, path->slots[0],
5627                                 struct btrfs_extent_ref);
5628                 ref_objectid = btrfs_ref_objectid(leaf, ref);
5629                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5630                         if (first_time) {
5631                                 level = (int)ref_objectid;
5632                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
5633                                 ref_path->lowest_level = level;
5634                                 ref_path->current_level = level;
5635                                 ref_path->nodes[level] = bytenr;
5636                         } else {
5637                                 WARN_ON(ref_objectid != level);
5638                         }
5639                 } else {
5640                         WARN_ON(level != -1);
5641                 }
5642                 first_time = 0;
5643
5644                 if (ref_path->lowest_level == level) {
5645                         ref_path->owner_objectid = ref_objectid;
5646                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
5647                 }
5648
5649                 /*
5650                  * the block is a tree root or the block isn't in a
5651                  * reference counted tree.
5652                  */
5653                 if (found_key.objectid == found_key.offset ||
5654                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
5655                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
5656                         ref_path->root_generation =
5657                                 btrfs_ref_generation(leaf, ref);
5658                         if (level < 0) {
5659                                 /* special reference from the tree log */
5660                                 ref_path->nodes[0] = found_key.offset;
5661                                 ref_path->current_level = 0;
5662                         }
5663                         ret = 0;
5664                         goto out;
5665                 }
5666
5667                 level++;
5668                 BUG_ON(ref_path->nodes[level] != 0);
5669                 ref_path->nodes[level] = found_key.offset;
5670                 ref_path->current_level = level;
5671
5672                 /*
5673                  * the reference was created in the running transaction,
5674                  * no need to continue walking up.
5675                  */
5676                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
5677                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
5678                         ref_path->root_generation =
5679                                 btrfs_ref_generation(leaf, ref);
5680                         ret = 0;
5681                         goto out;
5682                 }
5683
5684                 btrfs_release_path(extent_root, path);
5685                 cond_resched();
5686         }
5687         /* reached max tree level, but no tree root found. */
5688         BUG();
5689 out:
5690         btrfs_free_path(path);
5691         return ret;
5692 }
5693
5694 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
5695                                 struct btrfs_root *extent_root,
5696                                 struct btrfs_ref_path *ref_path,
5697                                 u64 extent_start)
5698 {
5699         memset(ref_path, 0, sizeof(*ref_path));
5700         ref_path->extent_start = extent_start;
5701
5702         return __next_ref_path(trans, extent_root, ref_path, 1);
5703 }
5704
5705 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
5706                                struct btrfs_root *extent_root,
5707                                struct btrfs_ref_path *ref_path)
5708 {
5709         return __next_ref_path(trans, extent_root, ref_path, 0);
5710 }
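/*
 * Typical iteration over all reference paths of an extent, as done by
 * relocate_one_extent() below (sketch only):
 *
 *	ret = btrfs_first_ref_path(trans, extent_root, ref_path, bytenr);
 *	while (ret == 0) {
 *		// ref_path now describes one chain up to a tree root
 *		ret = btrfs_next_ref_path(trans, extent_root, ref_path);
 *	}
 *	// ret > 0: no more paths, ret < 0: error
 */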
5711
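/*
 * collect the file extents in 'reloc_inode' that now hold the data of
 * the extent being relocated.  with 'no_fragment' set the data must be
 * covered by a single extent (1 is returned otherwise); the resulting
 * array is returned through 'extents'/'nr_extents'.
 */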
5712 static noinline int get_new_locations(struct inode *reloc_inode,
5713                                       struct btrfs_key *extent_key,
5714                                       u64 offset, int no_fragment,
5715                                       struct disk_extent **extents,
5716                                       int *nr_extents)
5717 {
5718         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
5719         struct btrfs_path *path;
5720         struct btrfs_file_extent_item *fi;
5721         struct extent_buffer *leaf;
5722         struct disk_extent *exts = *extents;
5723         struct btrfs_key found_key;
5724         u64 cur_pos;
5725         u64 last_byte;
5726         u32 nritems;
5727         int nr = 0;
5728         int max = *nr_extents;
5729         int ret;
5730
5731         WARN_ON(!no_fragment && *extents);
5732         if (!exts) {
5733                 max = 1;
5734                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
5735                 if (!exts)
5736                         return -ENOMEM;
5737         }
5738
5739         path = btrfs_alloc_path();
5740         BUG_ON(!path);
5741
5742         cur_pos = extent_key->objectid - offset;
5743         last_byte = extent_key->objectid + extent_key->offset;
5744         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
5745                                        cur_pos, 0);
5746         if (ret < 0)
5747                 goto out;
5748         if (ret > 0) {
5749                 ret = -ENOENT;
5750                 goto out;
5751         }
5752
5753         while (1) {
5754                 leaf = path->nodes[0];
5755                 nritems = btrfs_header_nritems(leaf);
5756                 if (path->slots[0] >= nritems) {
5757                         ret = btrfs_next_leaf(root, path);
5758                         if (ret < 0)
5759                                 goto out;
5760                         if (ret > 0)
5761                                 break;
5762                         leaf = path->nodes[0];
5763                 }
5764
5765                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5766                 if (found_key.offset != cur_pos ||
5767                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
5768                     found_key.objectid != reloc_inode->i_ino)
5769                         break;
5770
5771                 fi = btrfs_item_ptr(leaf, path->slots[0],
5772                                     struct btrfs_file_extent_item);
5773                 if (btrfs_file_extent_type(leaf, fi) !=
5774                     BTRFS_FILE_EXTENT_REG ||
5775                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
5776                         break;
5777
5778                 if (nr == max) {
5779                         struct disk_extent *old = exts;
5780                         max *= 2;
5781                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
                             if (!exts) {
                                     exts = old;
                                     ret = -ENOMEM;
                                     goto out;
                             }
5782                         memcpy(exts, old, sizeof(*exts) * nr);
5783                         if (old != *extents)
5784                                 kfree(old);
5785                 }
5786
5787                 exts[nr].disk_bytenr =
5788                         btrfs_file_extent_disk_bytenr(leaf, fi);
5789                 exts[nr].disk_num_bytes =
5790                         btrfs_file_extent_disk_num_bytes(leaf, fi);
5791                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
5792                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5793                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
5794                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
5795                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
5796                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
5797                                                                            fi);
5798                 BUG_ON(exts[nr].offset > 0);
5799                 BUG_ON(exts[nr].compression || exts[nr].encryption);
5800                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
5801
5802                 cur_pos += exts[nr].num_bytes;
5803                 nr++;
5804
5805                 if (cur_pos + offset >= last_byte)
5806                         break;
5807
5808                 if (no_fragment) {
5809                         ret = 1;
5810                         goto out;
5811                 }
5812                 path->slots[0]++;
5813         }
5814
5815         BUG_ON(cur_pos + offset > last_byte);
5816         if (cur_pos + offset < last_byte) {
5817                 ret = -ENOENT;
5818                 goto out;
5819         }
5820         ret = 0;
5821 out:
5822         btrfs_free_path(path);
5823         if (ret) {
5824                 if (exts != *extents)
5825                         kfree(exts);
5826         } else {
5827                 *extents = exts;
5828                 *nr_extents = nr;
5829         }
5830         return ret;
5831 }
5832
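/*
 * find every file extent item in the tree described by 'ref_path' that
 * points at 'extent_key' and repoint it at the relocated copy in
 * 'new_extents', adding refs for the new extent and dropping the old
 * ones
 */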
5833 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
5834                                         struct btrfs_root *root,
5835                                         struct btrfs_path *path,
5836                                         struct btrfs_key *extent_key,
5837                                         struct btrfs_key *leaf_key,
5838                                         struct btrfs_ref_path *ref_path,
5839                                         struct disk_extent *new_extents,
5840                                         int nr_extents)
5841 {
5842         struct extent_buffer *leaf;
5843         struct btrfs_file_extent_item *fi;
5844         struct inode *inode = NULL;
5845         struct btrfs_key key;
5846         u64 lock_start = 0;
5847         u64 lock_end = 0;
5848         u64 num_bytes;
5849         u64 ext_offset;
5850         u64 search_end = (u64)-1;
5851         u32 nritems;
5852         int nr_scanned = 0;
5853         int extent_locked = 0;
5854         int extent_type;
5855         int ret;
5856
5857         memcpy(&key, leaf_key, sizeof(key));
5858         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
5859                 if (key.objectid < ref_path->owner_objectid ||
5860                     (key.objectid == ref_path->owner_objectid &&
5861                      key.type < BTRFS_EXTENT_DATA_KEY)) {
5862                         key.objectid = ref_path->owner_objectid;
5863                         key.type = BTRFS_EXTENT_DATA_KEY;
5864                         key.offset = 0;
5865                 }
5866         }
5867
5868         while (1) {
5869                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
5870                 if (ret < 0)
5871                         goto out;
5872
5873                 leaf = path->nodes[0];
5874                 nritems = btrfs_header_nritems(leaf);
5875 next:
5876                 if (extent_locked && ret > 0) {
5877                         /*
5878                          * the file extent item was modified by someone
5879                          * before the extent got locked.
5880                          */
5881                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5882                                       lock_end, GFP_NOFS);
5883                         extent_locked = 0;
5884                 }
5885
5886                 if (path->slots[0] >= nritems) {
5887                         if (++nr_scanned > 2)
5888                                 break;
5889
5890                         BUG_ON(extent_locked);
5891                         ret = btrfs_next_leaf(root, path);
5892                         if (ret < 0)
5893                                 goto out;
5894                         if (ret > 0)
5895                                 break;
5896                         leaf = path->nodes[0];
5897                         nritems = btrfs_header_nritems(leaf);
5898                 }
5899
5900                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5901
5902                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
5903                         if ((key.objectid > ref_path->owner_objectid) ||
5904                             (key.objectid == ref_path->owner_objectid &&
5905                              key.type > BTRFS_EXTENT_DATA_KEY) ||
5906                             key.offset >= search_end)
5907                                 break;
5908                 }
5909
5910                 if (inode && key.objectid != inode->i_ino) {
5911                         BUG_ON(extent_locked);
5912                         btrfs_release_path(root, path);
5913                         mutex_unlock(&inode->i_mutex);
5914                         iput(inode);
5915                         inode = NULL;
5916                         continue;
5917                 }
5918
5919                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
5920                         path->slots[0]++;
5921                         ret = 1;
5922                         goto next;
5923                 }
5924                 fi = btrfs_item_ptr(leaf, path->slots[0],
5925                                     struct btrfs_file_extent_item);
5926                 extent_type = btrfs_file_extent_type(leaf, fi);
5927                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
5928                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
5929                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
5930                      extent_key->objectid)) {
5931                         path->slots[0]++;
5932                         ret = 1;
5933                         goto next;
5934                 }
5935
5936                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5937                 ext_offset = btrfs_file_extent_offset(leaf, fi);
5938
5939                 if (search_end == (u64)-1) {
5940                         search_end = key.offset - ext_offset +
5941                                 btrfs_file_extent_ram_bytes(leaf, fi);
5942                 }
5943
5944                 if (!extent_locked) {
5945                         lock_start = key.offset;
5946                         lock_end = lock_start + num_bytes - 1;
5947                 } else {
5948                         if (lock_start > key.offset ||
5949                             lock_end + 1 < key.offset + num_bytes) {
5950                                 unlock_extent(&BTRFS_I(inode)->io_tree,
5951                                               lock_start, lock_end, GFP_NOFS);
5952                                 extent_locked = 0;
5953                         }
5954                 }
5955
5956                 if (!inode) {
5957                         btrfs_release_path(root, path);
5958
5959                         inode = btrfs_iget_locked(root->fs_info->sb,
5960                                                   key.objectid, root);
                             if (!inode) {
                                     key.offset = (u64)-1;
                                     goto skip;
                             }
5961                         if (inode->i_state & I_NEW) {
5962                                 BTRFS_I(inode)->root = root;
5963                                 BTRFS_I(inode)->location.objectid =
5964                                         key.objectid;
5965                                 BTRFS_I(inode)->location.type =
5966                                         BTRFS_INODE_ITEM_KEY;
5967                                 BTRFS_I(inode)->location.offset = 0;
5968                                 btrfs_read_locked_inode(inode);
5969                                 unlock_new_inode(inode);
5970                         }
5971                         /*
5972                          * some code calls btrfs_commit_transaction while
5973                          * holding the i_mutex, so we can't use mutex_lock
5974                          * here.
5975                          */
5976                         if (is_bad_inode(inode) ||
5977                             !mutex_trylock(&inode->i_mutex)) {
5978                                 iput(inode);
5979                                 inode = NULL;
5980                                 key.offset = (u64)-1;
5981                                 goto skip;
5982                         }
5983                 }
5984
5985                 if (!extent_locked) {
5986                         struct btrfs_ordered_extent *ordered;
5987
5988                         btrfs_release_path(root, path);
5989
5990                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5991                                     lock_end, GFP_NOFS);
5992                         ordered = btrfs_lookup_first_ordered_extent(inode,
5993                                                                     lock_end);
5994                         if (ordered &&
5995                             ordered->file_offset <= lock_end &&
5996                             ordered->file_offset + ordered->len > lock_start) {
5997                                 unlock_extent(&BTRFS_I(inode)->io_tree,
5998                                               lock_start, lock_end, GFP_NOFS);
5999                                 btrfs_start_ordered_extent(inode, ordered, 1);
6000                                 btrfs_put_ordered_extent(ordered);
6001                                 key.offset += num_bytes;
6002                                 goto skip;
6003                         }
6004                         if (ordered)
6005                                 btrfs_put_ordered_extent(ordered);
6006
6007                         extent_locked = 1;
6008                         continue;
6009                 }
6010
6011                 if (nr_extents == 1) {
6012                         /* update extent pointer in place */
6013                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
6014                                                 new_extents[0].disk_bytenr);
6015                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6016                                                 new_extents[0].disk_num_bytes);
6017                         btrfs_mark_buffer_dirty(leaf);
6018
6019                         btrfs_drop_extent_cache(inode, key.offset,
6020                                                 key.offset + num_bytes - 1, 0);
6021
6022                         ret = btrfs_inc_extent_ref(trans, root,
6023                                                 new_extents[0].disk_bytenr,
6024                                                 new_extents[0].disk_num_bytes,
6025                                                 leaf->start,
6026                                                 root->root_key.objectid,
6027                                                 trans->transid,
6028                                                 key.objectid);
6029                         BUG_ON(ret);
6030
6031                         ret = btrfs_free_extent(trans, root,
6032                                                 extent_key->objectid,
6033                                                 extent_key->offset,
6034                                                 leaf->start,
6035                                                 btrfs_header_owner(leaf),
6036                                                 btrfs_header_generation(leaf),
6037                                                 key.objectid, 0);
6038                         BUG_ON(ret);
6039
6040                         btrfs_release_path(root, path);
6041                         key.offset += num_bytes;
6042                 } else {
6043                         BUG();
6044 #if 0
6045                         u64 alloc_hint;
6046                         u64 extent_len;
6047                         int i;
6048                         /*
6049                          * drop the old extent pointer first, then insert the
6050                          * new pointers one by one
6051                          */
6052                         btrfs_release_path(root, path);
6053                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
6054                                                  key.offset + num_bytes,
6055                                                  key.offset, &alloc_hint);
6056                         BUG_ON(ret);
6057
6058                         for (i = 0; i < nr_extents; i++) {
6059                                 if (ext_offset >= new_extents[i].num_bytes) {
6060                                         ext_offset -= new_extents[i].num_bytes;
6061                                         continue;
6062                                 }
6063                                 extent_len = min(new_extents[i].num_bytes -
6064                                                  ext_offset, num_bytes);
6065
6066                                 ret = btrfs_insert_empty_item(trans, root,
6067                                                               path, &key,
6068                                                               sizeof(*fi));
6069                                 BUG_ON(ret);
6070
6071                                 leaf = path->nodes[0];
6072                                 fi = btrfs_item_ptr(leaf, path->slots[0],
6073                                                 struct btrfs_file_extent_item);
6074                                 btrfs_set_file_extent_generation(leaf, fi,
6075                                                         trans->transid);
6076                                 btrfs_set_file_extent_type(leaf, fi,
6077                                                         BTRFS_FILE_EXTENT_REG);
6078                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6079                                                 new_extents[i].disk_bytenr);
6080                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6081                                                 new_extents[i].disk_num_bytes);
6082                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
6083                                                 new_extents[i].ram_bytes);
6084
6085                                 btrfs_set_file_extent_compression(leaf, fi,
6086                                                 new_extents[i].compression);
6087                                 btrfs_set_file_extent_encryption(leaf, fi,
6088                                                 new_extents[i].encryption);
6089                                 btrfs_set_file_extent_other_encoding(leaf, fi,
6090                                                 new_extents[i].other_encoding);
6091
6092                                 btrfs_set_file_extent_num_bytes(leaf, fi,
6093                                                         extent_len);
6094                                 ext_offset += new_extents[i].offset;
6095                                 btrfs_set_file_extent_offset(leaf, fi,
6096                                                         ext_offset);
6097                                 btrfs_mark_buffer_dirty(leaf);
6098
6099                                 btrfs_drop_extent_cache(inode, key.offset,
6100                                                 key.offset + extent_len - 1, 0);
6101
6102                                 ret = btrfs_inc_extent_ref(trans, root,
6103                                                 new_extents[i].disk_bytenr,
6104                                                 new_extents[i].disk_num_bytes,
6105                                                 leaf->start,
6106                                                 root->root_key.objectid,
6107                                                 trans->transid, key.objectid);
6108                                 BUG_ON(ret);
6109                                 btrfs_release_path(root, path);
6110
6111                                 inode_add_bytes(inode, extent_len);
6112
6113                                 ext_offset = 0;
6114                                 num_bytes -= extent_len;
6115                                 key.offset += extent_len;
6116
6117                                 if (num_bytes == 0)
6118                                         break;
6119                         }
6120                         BUG_ON(i >= nr_extents);
6121 #endif
6122                 }
6123
6124                 if (extent_locked) {
6125                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6126                                       lock_end, GFP_NOFS);
6127                         extent_locked = 0;
6128                 }
6129 skip:
6130                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
6131                     key.offset >= search_end)
6132                         break;
6133
6134                 cond_resched();
6135         }
6136         ret = 0;
6137 out:
6138         btrfs_release_path(root, path);
6139         if (inode) {
6140                 mutex_unlock(&inode->i_mutex);
6141                 if (extent_locked) {
6142                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6143                                       lock_end, GFP_NOFS);
6144                 }
6145                 iput(inode);
6146         }
6147         return ret;
6148 }
6149
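/*
 * when a block of a reloc tree is COWed, carry the cached leaf ref of
 * the original block 'orig_start' over to the new copy 'buf'
 */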
6150 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
6151                                struct btrfs_root *root,
6152                                struct extent_buffer *buf, u64 orig_start)
6153 {
6154         int level;
6155         int ret;
6156
6157         BUG_ON(btrfs_header_generation(buf) != trans->transid);
6158         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6159
6160         level = btrfs_header_level(buf);
6161         if (level == 0) {
6162                 struct btrfs_leaf_ref *ref;
6163                 struct btrfs_leaf_ref *orig_ref;
6164
6165                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
6166                 if (!orig_ref)
6167                         return -ENOENT;
6168
6169                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
6170                 if (!ref) {
6171                         btrfs_free_leaf_ref(root, orig_ref);
6172                         return -ENOMEM;
6173                 }
6174
6175                 ref->nritems = orig_ref->nritems;
6176                 memcpy(ref->extents, orig_ref->extents,
6177                         sizeof(ref->extents[0]) * ref->nritems);
6178
6179                 btrfs_free_leaf_ref(root, orig_ref);
6180
6181                 ref->root_gen = trans->transid;
6182                 ref->bytenr = buf->start;
6183                 ref->owner = btrfs_header_owner(buf);
6184                 ref->generation = btrfs_header_generation(buf);
6185
6186                 ret = btrfs_add_leaf_ref(root, ref, 0);
6187                 WARN_ON(ret);
6188                 btrfs_free_leaf_ref(root, ref);
6189         }
6190         return 0;
6191 }
6192
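/*
 * drop the cached extent mappings in 'target_root' for every file
 * extent referenced by 'leaf', so the pages get remapped to the
 * relocated extents on the next read
 */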
6193 static noinline int invalidate_extent_cache(struct btrfs_root *root,
6194                                         struct extent_buffer *leaf,
6195                                         struct btrfs_block_group_cache *group,
6196                                         struct btrfs_root *target_root)
6197 {
6198         struct btrfs_key key;
6199         struct inode *inode = NULL;
6200         struct btrfs_file_extent_item *fi;
6201         u64 num_bytes;
6202         u64 skip_objectid = 0;
6203         u32 nritems;
6204         u32 i;
6205
6206         nritems = btrfs_header_nritems(leaf);
6207         for (i = 0; i < nritems; i++) {
6208                 btrfs_item_key_to_cpu(leaf, &key, i);
6209                 if (key.objectid == skip_objectid ||
6210                     key.type != BTRFS_EXTENT_DATA_KEY)
6211                         continue;
6212                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6213                 if (btrfs_file_extent_type(leaf, fi) ==
6214                     BTRFS_FILE_EXTENT_INLINE)
6215                         continue;
6216                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6217                         continue;
6218                 if (!inode || inode->i_ino != key.objectid) {
6219                         iput(inode);
6220                         inode = btrfs_ilookup(target_root->fs_info->sb,
6221                                               key.objectid, target_root, 1);
6222                 }
6223                 if (!inode) {
6224                         skip_objectid = key.objectid;
6225                         continue;
6226                 }
6227                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6228
6229                 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
6230                             key.offset + num_bytes - 1, GFP_NOFS);
6231                 btrfs_drop_extent_cache(inode, key.offset,
6232                                         key.offset + num_bytes - 1, 1);
6233                 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
6234                               key.offset + num_bytes - 1, GFP_NOFS);
6235                 cond_resched();
6236         }
6237         iput(inode);
6238         return 0;
6239 }
6240
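/*
 * for every file extent in 'leaf' that falls inside block group
 * 'group', look up its relocated copy via get_new_locations() and
 * rewrite both the file extent item and the cached leaf ref
 */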
6241 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
6242                                         struct btrfs_root *root,
6243                                         struct extent_buffer *leaf,
6244                                         struct btrfs_block_group_cache *group,
6245                                         struct inode *reloc_inode)
6246 {
6247         struct btrfs_key key;
6248         struct btrfs_key extent_key;
6249         struct btrfs_file_extent_item *fi;
6250         struct btrfs_leaf_ref *ref;
6251         struct disk_extent *new_extent;
6252         u64 bytenr;
6253         u64 num_bytes;
6254         u32 nritems;
6255         u32 i;
6256         int ext_index;
6257         int nr_extent;
6258         int ret;
6259
6260         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
6261         BUG_ON(!new_extent);
6262
6263         ref = btrfs_lookup_leaf_ref(root, leaf->start);
6264         BUG_ON(!ref);
6265
6266         ext_index = -1;
6267         nritems = btrfs_header_nritems(leaf);
6268         for (i = 0; i < nritems; i++) {
6269                 btrfs_item_key_to_cpu(leaf, &key, i);
6270                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
6271                         continue;
6272                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6273                 if (btrfs_file_extent_type(leaf, fi) ==
6274                     BTRFS_FILE_EXTENT_INLINE)
6275                         continue;
6276                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6277                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
6278                 if (bytenr == 0)
6279                         continue;
6280
6281                 ext_index++;
6282                 if (bytenr >= group->key.objectid + group->key.offset ||
6283                     bytenr + num_bytes <= group->key.objectid)
6284                         continue;
6285
6286                 extent_key.objectid = bytenr;
6287                 extent_key.offset = num_bytes;
6288                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
6289                 nr_extent = 1;
6290                 ret = get_new_locations(reloc_inode, &extent_key,
6291                                         group->key.objectid, 1,
6292                                         &new_extent, &nr_extent);
6293                 if (ret > 0)
6294                         continue;
6295                 BUG_ON(ret < 0);
6296
6297                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
6298                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
6299                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
6300                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
6301
6302                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6303                                                 new_extent->disk_bytenr);
6304                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6305                                                 new_extent->disk_num_bytes);
6306                 btrfs_mark_buffer_dirty(leaf);
6307
6308                 ret = btrfs_inc_extent_ref(trans, root,
6309                                         new_extent->disk_bytenr,
6310                                         new_extent->disk_num_bytes,
6311                                         leaf->start,
6312                                         root->root_key.objectid,
6313                                         trans->transid, key.objectid);
6314                 BUG_ON(ret);
6315
6316                 ret = btrfs_free_extent(trans, root,
6317                                         bytenr, num_bytes, leaf->start,
6318                                         btrfs_header_owner(leaf),
6319                                         btrfs_header_generation(leaf),
6320                                         key.objectid, 0);
6321                 BUG_ON(ret);
6322                 cond_resched();
6323         }
6324         kfree(new_extent);
6325         BUG_ON(ext_index + 1 != ref->nritems);
6326         btrfs_free_leaf_ref(root, ref);
6327         return 0;
6328 }
6329
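/*
 * detach the reloc tree from 'root', record its final bytenr/level in
 * the root item and queue it on the dead list for
 * btrfs_drop_dead_reloc_roots()
 */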
6330 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
6331                           struct btrfs_root *root)
6332 {
6333         struct btrfs_root *reloc_root;
6334         int ret;
6335
6336         if (root->reloc_root) {
6337                 reloc_root = root->reloc_root;
6338                 root->reloc_root = NULL;
6339                 list_add(&reloc_root->dead_list,
6340                          &root->fs_info->dead_reloc_roots);
6341
6342                 btrfs_set_root_bytenr(&reloc_root->root_item,
6343                                       reloc_root->node->start);
6344                 btrfs_set_root_level(&reloc_root->root_item,
6345                                      btrfs_header_level(reloc_root->node));
6346                 memset(&reloc_root->root_item.drop_progress, 0,
6347                         sizeof(struct btrfs_disk_key));
6348                 reloc_root->root_item.drop_level = 0;
6349
6350                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
6351                                         &reloc_root->root_key,
6352                                         &reloc_root->root_item);
6353                 BUG_ON(ret);
6354         }
6355         return 0;
6356 }
6357
6358 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
6359 {
6360         struct btrfs_trans_handle *trans;
6361         struct btrfs_root *reloc_root;
6362         struct btrfs_root *prev_root = NULL;
6363         struct list_head dead_roots;
6364         int ret;
6365         unsigned long nr;
6366
6367         INIT_LIST_HEAD(&dead_roots);
6368         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
6369
6370         while (!list_empty(&dead_roots)) {
6371                 reloc_root = list_entry(dead_roots.prev,
6372                                         struct btrfs_root, dead_list);
6373                 list_del_init(&reloc_root->dead_list);
6374
6375                 BUG_ON(reloc_root->commit_root != NULL);
6376                 while (1) {
6377                         trans = btrfs_join_transaction(root, 1);
6378                         BUG_ON(!trans);
6379
6380                         mutex_lock(&root->fs_info->drop_mutex);
6381                         ret = btrfs_drop_snapshot(trans, reloc_root);
6382                         if (ret != -EAGAIN)
6383                                 break;
6384                         mutex_unlock(&root->fs_info->drop_mutex);
6385
6386                         nr = trans->blocks_used;
6387                         ret = btrfs_end_transaction(trans, root);
6388                         BUG_ON(ret);
6389                         btrfs_btree_balance_dirty(root, nr);
6390                 }
6391
6392                 free_extent_buffer(reloc_root->node);
6393
6394                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
6395                                      &reloc_root->root_key);
6396                 BUG_ON(ret);
6397                 mutex_unlock(&root->fs_info->drop_mutex);
6398
6399                 nr = trans->blocks_used;
6400                 ret = btrfs_end_transaction(trans, root);
6401                 BUG_ON(ret);
6402                 btrfs_btree_balance_dirty(root, nr);
6403
6404                 kfree(prev_root);
6405                 prev_root = reloc_root;
6406         }
6407         if (prev_root) {
6408                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
6409                 kfree(prev_root);
6410         }
6411         return 0;
6412 }
6413
6414 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
6415 {
6416         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
6417         return 0;
6418 }
6419
6420 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
6421 {
6422         struct btrfs_root *reloc_root;
6423         struct btrfs_trans_handle *trans;
6424         struct btrfs_key location;
6425         int found;
6426         int ret;
6427
6428         mutex_lock(&root->fs_info->tree_reloc_mutex);
6429         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
6430         BUG_ON(ret);
6431         found = !list_empty(&root->fs_info->dead_reloc_roots);
6432         mutex_unlock(&root->fs_info->tree_reloc_mutex);
6433
6434         if (found) {
6435                 trans = btrfs_start_transaction(root, 1);
6436                 BUG_ON(!trans);
6437                 ret = btrfs_commit_transaction(trans, root);
6438                 BUG_ON(ret);
6439         }
6440
6441         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6442         location.offset = (u64)-1;
6443         location.type = BTRFS_ROOT_ITEM_KEY;
6444
6445         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
6446         BUG_ON(!reloc_root);
6447         btrfs_orphan_cleanup(reloc_root);
6448         return 0;
6449 }
6450
6451 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
6452                                     struct btrfs_root *root)
6453 {
6454         struct btrfs_root *reloc_root;
6455         struct extent_buffer *eb;
6456         struct btrfs_root_item *root_item;
6457         struct btrfs_key root_key;
6458         int ret;
6459
6460         BUG_ON(!root->ref_cows);
6461         if (root->reloc_root)
6462                 return 0;
6463
6464         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
6465         BUG_ON(!root_item);
6466
6467         ret = btrfs_copy_root(trans, root, root->commit_root,
6468                               &eb, BTRFS_TREE_RELOC_OBJECTID);
6469         BUG_ON(ret);
6470
6471         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
6472         root_key.offset = root->root_key.objectid;
6473         root_key.type = BTRFS_ROOT_ITEM_KEY;
6474
6475         memcpy(root_item, &root->root_item, sizeof(*root_item));
6476         btrfs_set_root_refs(root_item, 0);
6477         btrfs_set_root_bytenr(root_item, eb->start);
6478         btrfs_set_root_level(root_item, btrfs_header_level(eb));
6479         btrfs_set_root_generation(root_item, trans->transid);
6480
6481         btrfs_tree_unlock(eb);
6482         free_extent_buffer(eb);
6483
6484         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
6485                                 &root_key, root_item);
6486         BUG_ON(ret);
6487         kfree(root_item);
6488
6489         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
6490                                                  &root_key);
6491         BUG_ON(!reloc_root);
6492         reloc_root->last_trans = trans->transid;
6493         reloc_root->commit_root = NULL;
6494         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
6495
6496         root->reloc_root = reloc_root;
6497         return 0;
6498 }
6499
6500 /*
6501  * Core function of space balance.
6502  *
6503  * The idea is to use reloc trees to relocate tree blocks in reference
6504  * counted roots. There is one reloc tree for each subvol, and all
6505  * reloc trees share the same root key objectid. Reloc trees are
6506  * snapshots of the latest committed roots of subvols (root->commit_root).
6507  *
6508  * To relocate a tree block referenced by a subvol, there are two steps.
6509  * COW the block through the subvol's reloc tree, then update the block
6510  * pointer in the subvol to point to the new block. Since all reloc
6511  * trees share the same root key objectid, special handling for tree
6512  * blocks owned by them is easy. Once a tree block has been COWed in one
6513  * reloc tree, we can use the resulting new block directly when the same
6514  * block needs to be COWed again through another reloc tree. In this
6515  * way, relocated tree blocks are shared between reloc trees, and thus
6516  * also between subvols.
6517  */
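/*
 * Illustrative example (not from the original source): if subvols S1
 * and S2 both reference block B, relocating B proceeds roughly as
 *
 *	COW B through S1's reloc tree  -> new block B'
 *	btrfs_merge_path() in S1       -> S1 points at B'
 *	COW B through S2's reloc tree  -> finds and reuses B'
 *	btrfs_merge_path() in S2       -> S2 points at B'
 *
 * so B' ends up shared by both subvols.
 */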
6518 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
6519                                       struct btrfs_root *root,
6520                                       struct btrfs_path *path,
6521                                       struct btrfs_key *first_key,
6522                                       struct btrfs_ref_path *ref_path,
6523                                       struct btrfs_block_group_cache *group,
6524                                       struct inode *reloc_inode)
6525 {
6526         struct btrfs_root *reloc_root;
6527         struct extent_buffer *eb = NULL;
6528         struct btrfs_key *keys;
6529         u64 *nodes;
6530         int level;
6531         int shared_level;
6532         int lowest_level = 0;
6533         int ret;
6534
6535         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
6536                 lowest_level = ref_path->owner_objectid;
6537
6538         if (!root->ref_cows) {
6539                 path->lowest_level = lowest_level;
6540                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
6541                 BUG_ON(ret < 0);
6542                 path->lowest_level = 0;
6543                 btrfs_release_path(root, path);
6544                 return 0;
6545         }
6546
6547         mutex_lock(&root->fs_info->tree_reloc_mutex);
6548         ret = init_reloc_tree(trans, root);
6549         BUG_ON(ret);
6550         reloc_root = root->reloc_root;
6551
6552         shared_level = ref_path->shared_level;
6553         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
6554
6555         keys = ref_path->node_keys;
6556         nodes = ref_path->new_nodes;
6557         memset(&keys[shared_level + 1], 0,
6558                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
6559         memset(&nodes[shared_level + 1], 0,
6560                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
6561
6562         if (nodes[lowest_level] == 0) {
6563                 path->lowest_level = lowest_level;
6564                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
6565                                         0, 1);
6566                 BUG_ON(ret);
6567                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
6568                         eb = path->nodes[level];
6569                         if (!eb || eb == reloc_root->node)
6570                                 break;
6571                         nodes[level] = eb->start;
6572                         if (level == 0)
6573                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
6574                         else
6575                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
6576                 }
6577                 if (nodes[0] &&
6578                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6579                         eb = path->nodes[0];
6580                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
6581                                                       group, reloc_inode);
6582                         BUG_ON(ret);
6583                 }
6584                 btrfs_release_path(reloc_root, path);
6585         } else {
6586                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
6587                                        lowest_level);
6588                 BUG_ON(ret);
6589         }
6590
6591         /*
6592          * replace tree blocks in the fs tree with tree blocks in
6593          * the reloc tree.
6594          */
6595         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
6596         BUG_ON(ret < 0);
6597
6598         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6599                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
6600                                         0, 0);
6601                 BUG_ON(ret);
6602                 extent_buffer_get(path->nodes[0]);
6603                 eb = path->nodes[0];
6604                 btrfs_release_path(reloc_root, path);
6605                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
6606                 BUG_ON(ret);
6607                 free_extent_buffer(eb);
6608         }
6609
6610         mutex_unlock(&root->fs_info->tree_reloc_mutex);
6611         path->lowest_level = 0;
6612         return 0;
6613 }
6614
6615 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
6616                                         struct btrfs_root *root,
6617                                         struct btrfs_path *path,
6618                                         struct btrfs_key *first_key,
6619                                         struct btrfs_ref_path *ref_path)
6620 {
6621         int ret;
6622
6623         ret = relocate_one_path(trans, root, path, first_key,
6624                                 ref_path, NULL, NULL);
6625         BUG_ON(ret);
6626
6627         return 0;
6628 }
6629
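     /*
      * an extent item with objectid 0 is bogus (bytenr 0 is never a
      * valid allocated extent); if relocation trips over one, just
      * delete it from the extent tree.
      */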
6630 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
6631                                     struct btrfs_root *extent_root,
6632                                     struct btrfs_path *path,
6633                                     struct btrfs_key *extent_key)
6634 {
6635         int ret;
6636
6637         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
6638         if (ret)
6639                 goto out;
6640         ret = btrfs_del_item(trans, extent_root, path);
6641 out:
6642         btrfs_release_path(extent_root, path);
6643         return ret;
6644 }
6645
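     /*
      * find the root a reference path starts from.  cow-only roots are
      * keyed with offset 0; reference counted roots use offset (u64)-1
      * so the lookup returns the most recent root item.
      */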
6646 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
6647                                                 struct btrfs_ref_path *ref_path)
6648 {
6649         struct btrfs_key root_key;
6650
6651         root_key.objectid = ref_path->root_objectid;
6652         root_key.type = BTRFS_ROOT_ITEM_KEY;
6653         if (is_cowonly_root(ref_path->root_objectid))
6654                 root_key.offset = 0;
6655         else
6656                 root_key.offset = (u64)-1;
6657
6658         return btrfs_read_fs_root_no_name(fs_info, &root_key);
6659 }
6660
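     /*
      * relocate all references to the extent described by @extent_key
      * out of @group.  on pass 0, data extents are simply copied into
      * @reloc_inode; on pass 1, data references are updated in place
      * where the metadata can stay shared between snapshots; any
      * remaining references fall back to get_new_locations() plus
      * replace_one_extent().  tree blocks always go through
      * relocate_tree_block().
      */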
6661 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
6662                                         struct btrfs_path *path,
6663                                         struct btrfs_key *extent_key,
6664                                         struct btrfs_block_group_cache *group,
6665                                         struct inode *reloc_inode, int pass)
6666 {
6667         struct btrfs_trans_handle *trans;
6668         struct btrfs_root *found_root;
6669         struct btrfs_ref_path *ref_path = NULL;
6670         struct disk_extent *new_extents = NULL;
6671         int nr_extents = 0;
6672         int loops;
6673         int ret;
6674         int level;
6675         struct btrfs_key first_key;
6676         u64 prev_block = 0;
6677
6679         trans = btrfs_start_transaction(extent_root, 1);
6680         BUG_ON(!trans);
6681
6682         if (extent_key->objectid == 0) {
6683                 ret = del_extent_zero(trans, extent_root, path, extent_key);
6684                 goto out;
6685         }
6686
6687         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
6688         if (!ref_path) {
6689                 ret = -ENOMEM;
6690                 goto out;
6691         }
6692
6693         for (loops = 0; ; loops++) {
6694                 if (loops == 0) {
6695                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
6696                                                    extent_key->objectid);
6697                 } else {
6698                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
6699                 }
6700                 if (ret < 0)
6701                         goto out;
6702                 if (ret > 0)
6703                         break;
6704
6705                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
6706                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
6707                         continue;
6708
6709                 found_root = read_ref_root(extent_root->fs_info, ref_path);
6710                 BUG_ON(!found_root);
6711                 /*
6712                  * for reference counted trees, only process reference paths
6713                  * rooted at the latest committed root.
6714                  */
6715                 if (found_root->ref_cows &&
6716                     ref_path->root_generation != found_root->root_key.offset)
6717                         continue;
6718
6719                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6720                         if (pass == 0) {
6721                                 /*
6722                                  * copy data extents to new locations
6723                                  */
6724                                 u64 group_start = group->key.objectid;
6725                                 ret = relocate_data_extent(reloc_inode,
6726                                                            extent_key,
6727                                                            group_start);
6728                                 if (ret < 0)
6729                                         goto out;
6730                                 break;
6731                         }
6732                         level = 0;
6733                 } else {
6734                         level = ref_path->owner_objectid;
6735                 }
6736
6737                 if (prev_block != ref_path->nodes[level]) {
6738                         struct extent_buffer *eb;
6739                         u64 block_start = ref_path->nodes[level];
6740                         u64 block_size = btrfs_level_size(found_root, level);
6741
6742                         eb = read_tree_block(found_root, block_start,
6743                                              block_size, 0);
                             BUG_ON(!eb);
6744                         btrfs_tree_lock(eb);
6745                         BUG_ON(level != btrfs_header_level(eb));
6746
6747                         if (level == 0)
6748                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
6749                         else
6750                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
6751
6752                         btrfs_tree_unlock(eb);
6753                         free_extent_buffer(eb);
6754                         prev_block = block_start;
6755                 }
6756
6757                 mutex_lock(&extent_root->fs_info->trans_mutex);
6758                 btrfs_record_root_in_trans(found_root);
6759                 mutex_unlock(&extent_root->fs_info->trans_mutex);
6760                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6761                         /*
6762                          * try to update data extent references while
6763                          * keeping metadata shared between snapshots.
6764                          */
6765                         if (pass == 1) {
6766                                 ret = relocate_one_path(trans, found_root,
6767                                                 path, &first_key, ref_path,
6768                                                 group, reloc_inode);
6769                                 if (ret < 0)
6770                                         goto out;
6771                                 continue;
6772                         }
6773                         /*
6774                          * use fallback method to process the remaining
6775                          * references.
6776                          */
6777                         if (!new_extents) {
6778                                 u64 group_start = group->key.objectid;
6779                                 new_extents = kmalloc(sizeof(*new_extents),
6780                                                       GFP_NOFS);
                                     if (!new_extents) {
                                             ret = -ENOMEM;
                                             goto out;
                                     }
6781                                 nr_extents = 1;
6782                                 ret = get_new_locations(reloc_inode,
6783                                                         extent_key,
6784                                                         group_start, 1,
6785                                                         &new_extents,
6786                                                         &nr_extents);
6787                                 if (ret)
6788                                         goto out;
6789                         }
6790                         ret = replace_one_extent(trans, found_root,
6791                                                 path, extent_key,
6792                                                 &first_key, ref_path,
6793                                                 new_extents, nr_extents);
6794                 } else {
6795                         ret = relocate_tree_block(trans, found_root, path,
6796                                                   &first_key, ref_path);
6797                 }
6798                 if (ret < 0)
6799                         goto out;
6800         }
6801         ret = 0;
6802 out:
6803         btrfs_end_transaction(trans, extent_root);
6804         kfree(new_extents);
6805         kfree(ref_path);
6806         return ret;
6807 }
6808 #endif
6809
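     /*
      * decide which raid profile a block group should use after its
      * contents are rewritten, based on the number of writable devices:
      * on a single device, mirroring degrades to DUP and striping to
      * single device chunks; with multiple devices, DUP becomes raid1
      * and single device chunks become raid0.
      */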
6810 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6811 {
6812         u64 num_devices;
6813         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6814                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6815
6816         num_devices = root->fs_info->fs_devices->rw_devices;
6817         if (num_devices == 1) {
6818                 stripped |= BTRFS_BLOCK_GROUP_DUP;
6819                 stripped = flags & ~stripped;
6820
6821                 /* turn raid0 into single device chunks */
6822                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
6823                         return stripped;
6824
6825                 /* turn mirroring into duplication */
6826                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
6827                              BTRFS_BLOCK_GROUP_RAID10))
6828                         return stripped | BTRFS_BLOCK_GROUP_DUP;
6829                 return flags;
6830         } else {
6831                 /* they already had raid on here, just return */
6832                 if (flags & stripped)
6833                         return flags;
6834
6835                 stripped |= BTRFS_BLOCK_GROUP_DUP;
6836                 stripped = flags & ~stripped;
6837
6838                 /* switch duplicated blocks with raid1 */
6839                 if (flags & BTRFS_BLOCK_GROUP_DUP)
6840                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
6841
6842                 /* turn single device chunks into raid0 */
6843                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
6844         }
6845         return flags;
6846 }
6847
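     /*
      * make sure the data in the block group we are about to shrink has
      * somewhere to go: if the group still holds used or reserved bytes,
      * allocate a new chunk with the (possibly restriped) flags, sized
      * to what is left plus a little slack.
      */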
6848 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
6849                      struct btrfs_block_group_cache *shrink_block_group,
6850                      int force)
6851 {
6852         struct btrfs_trans_handle *trans;
6853         u64 new_alloc_flags;
6854         u64 calc;
6855
6856         spin_lock(&shrink_block_group->lock);
6857         if (btrfs_block_group_used(&shrink_block_group->item) +
6858             shrink_block_group->reserved > 0) {
6859                 spin_unlock(&shrink_block_group->lock);
6860
6861                 trans = btrfs_start_transaction(root, 1);
6862                 spin_lock(&shrink_block_group->lock);
6863
6864                 new_alloc_flags = update_block_group_flags(root,
6865                                                    shrink_block_group->flags);
6866                 if (new_alloc_flags != shrink_block_group->flags) {
6867                         calc =
6868                              btrfs_block_group_used(&shrink_block_group->item);
6869                 } else {
6870                         calc = shrink_block_group->key.offset;
6871                 }
6872                 spin_unlock(&shrink_block_group->lock);
6873
6874                 do_chunk_alloc(trans, root->fs_info->extent_root,
6875                                calc + 2 * 1024 * 1024, new_alloc_flags, force);
6876
6877                 btrfs_end_transaction(trans, root);
6878         } else
6879                 spin_unlock(&shrink_block_group->lock);
6880         return 0;
6881 }
6882
6883
6884 int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
6885                                          struct btrfs_block_group_cache *group)
6887 {
6888         __alloc_chunk_for_shrink(root, group, 1);
6889         set_block_group_readonly(group);
6890         return 0;
6891 }
6892
6893 #if 0
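     /*
      * insert a bare inode item for the reloc inode: a regular file of
      * the given size, generation 1, mode 0600, with compression
      * disabled.  the caller is responsible for putting it on the
      * orphan list.
      */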
6894 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6895                                  struct btrfs_root *root,
6896                                  u64 objectid, u64 size)
6897 {
6898         struct btrfs_path *path;
6899         struct btrfs_inode_item *item;
6900         struct extent_buffer *leaf;
6901         int ret;
6902
6903         path = btrfs_alloc_path();
6904         if (!path)
6905                 return -ENOMEM;
6906
6907         path->leave_spinning = 1;
6908         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6909         if (ret)
6910                 goto out;
6911
6912         leaf = path->nodes[0];
6913         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
6914         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
6915         btrfs_set_inode_generation(leaf, item, 1);
6916         btrfs_set_inode_size(leaf, item, size);
6917         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
6918         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
6919         btrfs_mark_buffer_dirty(leaf);
6920         btrfs_release_path(root, path);
6921 out:
6922         btrfs_free_path(path);
6923         return ret;
6924 }
6925
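     /*
      * create an inode in the data reloc tree that covers the whole
      * block group.  relocated data is copied into this inode, and the
      * inode is added to the orphan list so the copies get cleaned up
      * if we crash before relocation finishes.
      */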
6926 static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
6927                                         struct btrfs_block_group_cache *group)
6928 {
6929         struct inode *inode = NULL;
6930         struct btrfs_trans_handle *trans;
6931         struct btrfs_root *root;
6932         struct btrfs_key root_key;
6933         u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
6934         int err = 0;
6935
6936         root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6937         root_key.type = BTRFS_ROOT_ITEM_KEY;
6938         root_key.offset = (u64)-1;
6939         root = btrfs_read_fs_root_no_name(fs_info, &root_key);
6940         if (IS_ERR(root))
6941                 return ERR_CAST(root);
6942
6943         trans = btrfs_start_transaction(root, 1);
6944         BUG_ON(!trans);
6945
6946         err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
6947         if (err)
6948                 goto out;
6949
6950         err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
6951         BUG_ON(err);
6952
6953         err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
6954                                        group->key.offset, 0, group->key.offset,
6955                                        0, 0, 0);
6956         BUG_ON(err);
6957
6958         inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
             if (!inode) {
                     err = -ENOMEM;
                     goto out;
             }
6959         if (inode->i_state & I_NEW) {
6960                 BTRFS_I(inode)->root = root;
6961                 BTRFS_I(inode)->location.objectid = objectid;
6962                 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
6963                 BTRFS_I(inode)->location.offset = 0;
6964                 btrfs_read_locked_inode(inode);
6965                 unlock_new_inode(inode);
6966                 BUG_ON(is_bad_inode(inode));
6967         } else {
6968                 BUG_ON(1);
6969         }
6970         BTRFS_I(inode)->index_cnt = group->key.objectid;
6971
6972         err = btrfs_orphan_add(trans, inode);
6973 out:
6974         btrfs_end_transaction(trans, root);
6975         if (err) {
6976                 if (inode)
6977                         iput(inode);
6978                 inode = ERR_PTR(err);
6979         }
6980         return inode;
6981 }
6982
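     /*
      * after data has been written through the reloc inode, clone the
      * checksums that covered the old disk location onto the new
      * ordered extent so the relocated copy still has valid csums.
      */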
6983 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
6984 {
6986         struct btrfs_ordered_sum *sums;
6987         struct btrfs_sector_sum *sector_sum;
6988         struct btrfs_ordered_extent *ordered;
6989         struct btrfs_root *root = BTRFS_I(inode)->root;
6990         struct list_head list;
6991         size_t offset;
6992         int ret;
6993         u64 disk_bytenr;
6994
6995         INIT_LIST_HEAD(&list);
6996
6997         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
6998         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
6999
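             /*
              * index_cnt holds the start of the block group this inode
              * shadows, so a file offset maps directly to the extent's
              * old location on disk.
              */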
7000         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
7001         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
7002                                        disk_bytenr + len - 1, &list);
             BUG_ON(ret);
7003
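             /*
              * rebase each checksum from the old location onto the new
              * ordered extent:
              * new bytenr = old bytenr + (ordered->start - disk_bytenr)
              */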
7004         while (!list_empty(&list)) {
7005                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
7006                 list_del_init(&sums->list);
7007
7008                 sector_sum = sums->sums;
7009                 sums->bytenr = ordered->start;
7010
7011                 offset = 0;
7012                 while (offset < sums->len) {
7013                         sector_sum->bytenr += ordered->start - disk_bytenr;
7014                         sector_sum++;
7015                         offset += root->sectorsize;
7016                 }
7017
7018                 btrfs_add_ordered_sum(inode, ordered, sums);
7019         }
7020         btrfs_put_ordered_extent(ordered);
7021         return 0;
7022 }
7023
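     /*
      * relocate everything inside the block group starting at
      * @group_start.  extents are moved in multiple passes (copy the
      * data first, then update references) until a scan of the group
      * finds nothing left to do.
      */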
7024 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
7025 {
7026         struct btrfs_trans_handle *trans;
7027         struct btrfs_path *path;
7028         struct btrfs_fs_info *info = root->fs_info;
7029         struct extent_buffer *leaf;
7030         struct inode *reloc_inode;
7031         struct btrfs_block_group_cache *block_group;
7032         struct btrfs_key key;
7033         u64 skipped;
7034         u64 cur_byte;
7035         u64 total_found;
7036         u32 nritems;
7037         int ret;
7038         int progress;
7039         int pass = 0;
7040
7041         root = root->fs_info->extent_root;
7042
7043         block_group = btrfs_lookup_block_group(info, group_start);
7044         BUG_ON(!block_group);
7045
7046         printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
7047                (unsigned long long)block_group->key.objectid,
7048                (unsigned long long)block_group->flags);
7049
7050         path = btrfs_alloc_path();
7051         BUG_ON(!path);
7052
7053         reloc_inode = create_reloc_inode(info, block_group);
7054         BUG_ON(IS_ERR(reloc_inode));
7055
7056         __alloc_chunk_for_shrink(root, block_group, 1);
7057         set_block_group_readonly(block_group);
7058
7059         btrfs_start_delalloc_inodes(info->tree_root);
7060         btrfs_wait_ordered_extents(info->tree_root, 0);
7061 again:
7062         skipped = 0;
7063         total_found = 0;
7064         progress = 0;
7065         key.objectid = block_group->key.objectid;
7066         key.offset = 0;
7067         key.type = 0;
7068         cur_byte = key.objectid;
7069
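             /*
              * commit an empty transaction to flush out everything done
              * so far, let the cleaner drop old snapshots and stale leaf
              * refs, then commit again so the trees we are about to scan
              * are stable.
              */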
7070         trans = btrfs_start_transaction(info->tree_root, 1);
7071         btrfs_commit_transaction(trans, info->tree_root);
7072
7073         mutex_lock(&root->fs_info->cleaner_mutex);
7074         btrfs_clean_old_snapshots(info->tree_root);
7075         btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
7076         mutex_unlock(&root->fs_info->cleaner_mutex);
7077
7078         trans = btrfs_start_transaction(info->tree_root, 1);
7079         btrfs_commit_transaction(trans, info->tree_root);
7080
7081         while (1) {
7082                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7083                 if (ret < 0)
7084                         goto out;
7085 next:
7086                 leaf = path->nodes[0];
7087                 nritems = btrfs_header_nritems(leaf);
7088                 if (path->slots[0] >= nritems) {
7089                         ret = btrfs_next_leaf(root, path);
7090                         if (ret < 0)
7091                                 goto out;
7092                         if (ret == 1) {
7093                                 ret = 0;
7094                                 break;
7095                         }
7096                         leaf = path->nodes[0];
7097                         nritems = btrfs_header_nritems(leaf);
7098                 }
7099
7100                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7101
7102                 if (key.objectid >= block_group->key.objectid +
7103                     block_group->key.offset)
7104                         break;
7105
7106                 if (progress && need_resched()) {
7107                         btrfs_release_path(root, path);
7108                         cond_resched();
7109                         progress = 0;
7110                         continue;
7111                 }
7112                 progress = 1;
7113
7114                 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
7115                     key.objectid + key.offset <= cur_byte) {
7116                         path->slots[0]++;
7117                         goto next;
7118                 }
7119
7120                 total_found++;
7121                 cur_byte = key.objectid + key.offset;
7122                 btrfs_release_path(root, path);
7123
7124                 __alloc_chunk_for_shrink(root, block_group, 0);
7125                 ret = relocate_one_extent(root, path, &key, block_group,
7126                                           reloc_inode, pass);
7127                 BUG_ON(ret < 0);
7128                 if (ret > 0)
7129                         skipped++;
7130
7131                 key.objectid = cur_byte;
7132                 key.type = 0;
7133                 key.offset = 0;
7134         }
7135
7136         btrfs_release_path(root, path);
7137
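             /*
              * pass 0 only scheduled copies of the data; wait for the
              * writes to the reloc inode to finish and drop its page
              * cache before the reference updating passes run.
              */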
7138         if (pass == 0) {
7139                 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
7140                 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
7141         }
7142
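             /*
              * extents were found, so another pass is needed.  if we are
              * past pass 2 and a pass skipped everything it found, start
              * over with a fresh reloc inode.
              */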
7143         if (total_found > 0) {
7144                 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
7145                        (unsigned long long)total_found, pass);
7146                 pass++;
7147                 if (total_found == skipped && pass > 2) {
7148                         iput(reloc_inode);
7149                         reloc_inode = create_reloc_inode(info, block_group);
                             BUG_ON(IS_ERR(reloc_inode));
7150                         pass = 0;
7151                 }
7152                 goto again;
7153         }
7154
7155         /* delete reloc_inode */
7156         iput(reloc_inode);
7157
7158         /* unpin extents in this range */
7159         trans = btrfs_start_transaction(info->tree_root, 1);
7160         btrfs_commit_transaction(trans, info->tree_root);
7161
7162         spin_lock(&block_group->lock);
7163         WARN_ON(block_group->pinned > 0);
7164         WARN_ON(block_group->reserved > 0);
7165         WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
7166         spin_unlock(&block_group->lock);
7167         btrfs_put_block_group(block_group);
7168         ret = 0;
7169 out:
7170         btrfs_free_path(path);
7171         return ret;
7172 }
7173 #endif
7174
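     /*
      * find the first block group item in the extent tree whose objectid
      * is at or past @key, leaving @path pointing at it.
      */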
7175 static int find_first_block_group(struct btrfs_root *root,
7176                 struct btrfs_path *path, struct btrfs_key *key)
7177 {
7178         int ret = 0;
7179         struct btrfs_key found_key;
7180         struct extent_buffer *leaf;
7181         int slot;
7182
7183         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7184         if (ret < 0)
7185                 goto out;
7186
7187         while (1) {
7188                 slot = path->slots[0];
7189                 leaf = path->nodes[0];
7190                 if (slot >= btrfs_header_nritems(leaf)) {
7191                         ret = btrfs_next_leaf(root, path);
7192                         if (ret == 0)
7193                                 continue;
7194                         if (ret < 0)
7195                                 goto out;
7196                         break;
7197                 }
7198                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7199
7200                 if (found_key.objectid >= key->objectid &&
7201                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7202                         ret = 0;
7203                         goto out;
7204                 }
7205                 path->slots[0]++;
7206         }
7207         ret = -ENOENT;
7208 out:
7209         return ret;
7210 }
7211
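     /*
      * called during unmount: remove and free every cached block group,
      * waiting out any async caching thread that is still filling one
      * in, then free the space_info structs.
      */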
7212 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7213 {
7214         struct btrfs_block_group_cache *block_group;
7215         struct btrfs_space_info *space_info;
7216         struct rb_node *n;
7217
7218         spin_lock(&info->block_group_cache_lock);
7219         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7220                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7221                                        cache_node);
7222                 rb_erase(&block_group->cache_node,
7223                          &info->block_group_cache_tree);
7224                 spin_unlock(&info->block_group_cache_lock);
7225
7226                 down_write(&block_group->space_info->groups_sem);
7227                 list_del(&block_group->list);
7228                 up_write(&block_group->space_info->groups_sem);
7229
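                     /*
                      * the async caching kthread may still be filling in
                      * this group's free space cache; wait for it to
                      * finish before tearing the cache down.
                      */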
7230                 if (block_group->cached == BTRFS_CACHE_STARTED)
7231                         wait_event(block_group->caching_q,
7232                                    block_group_cache_done(block_group));
7233
7234                 btrfs_remove_free_space_cache(block_group);
7235
7236                 WARN_ON(atomic_read(&block_group->count) != 1);
7237                 kfree(block_group);
7238
7239                 spin_lock(&info->block_group_cache_lock);
7240         }
7241         spin_unlock(&info->block_group_cache_lock);
7242
7243         /* now that all the block groups are freed, go through and
7244          * free all the space_info structs.  This is only called during
7245          * the final stages of unmount, and so we know nobody is
7246          * using them.  We call synchronize_rcu() once before we start,
7247          * just to be on the safe side.
7248          */
7249         synchronize_rcu();
7250
7251         while (!list_empty(&info->space_info)) {
7252                 space_info = list_entry(info->space_info.next,
7253                                         struct btrfs_space_info,
7254                                         list);
7255
7256                 list_del(&space_info->list);
7257                 kfree(space_info);
7258         }
7259         return 0;
7260 }
7261
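     /*
      * load all block group items from the extent tree at mount time.
      * completely full and completely empty groups are marked cached
      * right away; everything else is left for the async caching
      * threads to fill in later.
      */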
7262 int btrfs_read_block_groups(struct btrfs_root *root)
7263 {
7264         struct btrfs_path *path;
7265         int ret;
7266         struct btrfs_block_group_cache *cache;
7267         struct btrfs_fs_info *info = root->fs_info;
7268         struct btrfs_space_info *space_info;
7269         struct btrfs_key key;
7270         struct btrfs_key found_key;
7271         struct extent_buffer *leaf;
7272
7273         root = info->extent_root;
7274         key.objectid = 0;
7275         key.offset = 0;
7276         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7277         path = btrfs_alloc_path();
7278         if (!path)
7279                 return -ENOMEM;
7280
7281         while (1) {
7282                 ret = find_first_block_group(root, path, &key);
7283                 if (ret > 0) {
7284                         ret = 0;
7285                         goto error;
7286                 }
7287                 if (ret != 0)
7288                         goto error;
7289
7290                 leaf = path->nodes[0];
7291                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7292                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7293                 if (!cache) {
7294                         ret = -ENOMEM;
7295                         break;
7296                 }
7297
7298                 atomic_set(&cache->count, 1);
7299                 spin_lock_init(&cache->lock);
7300                 spin_lock_init(&cache->tree_lock);
7301                 cache->fs_info = info;
7302                 init_waitqueue_head(&cache->caching_q);
7303                 INIT_LIST_HEAD(&cache->list);
7304                 INIT_LIST_HEAD(&cache->cluster_list);
7305
7306                 /*
7307                  * we only want to have 32k of ram per block group for keeping
7308                  * track of free space, and if we pass 1/2 of that we want to
7309                  * start converting things over to using bitmaps
7310                  */
7311                 cache->extents_thresh = ((1024 * 32) / 2) /
7312                         sizeof(struct btrfs_free_space);
7313
7314                 read_extent_buffer(leaf, &cache->item,
7315                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7316                                    sizeof(cache->item));
7317                 memcpy(&cache->key, &found_key, sizeof(found_key));
7318
7319                 key.objectid = found_key.objectid + found_key.offset;
7320                 btrfs_release_path(root, path);
7321                 cache->flags = btrfs_block_group_flags(&cache->item);
7322                 cache->sectorsize = root->sectorsize;
7323
7324                 remove_sb_from_cache(root, cache);
7325
7326                 /*
7327                  * check for two cases, either we are full, and therefore
7328                  * don't need to bother with the caching work since we won't
7329                  * find any space, or we are empty, and we can just add all
7330                  * the space in and be done with it.  This saves us _alot_ of
7331                  * the space in and be done with it.  This saves us a lot of
7332                  */
7333                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7334                         cache->cached = BTRFS_CACHE_FINISHED;
7335                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7336                         cache->cached = BTRFS_CACHE_FINISHED;
7337                         add_new_free_space(cache, root->fs_info,
7338                                            found_key.objectid,
7339                                            found_key.objectid +
7340                                            found_key.offset);
7341                 }
7342
7343                 ret = update_space_info(info, cache->flags, found_key.offset,
7344                                         btrfs_block_group_used(&cache->item),
7345                                         &space_info);
7346                 BUG_ON(ret);
7347                 cache->space_info = space_info;
7348                 down_write(&space_info->groups_sem);
7349                 list_add_tail(&cache->list, &space_info->block_groups);
7350                 up_write(&space_info->groups_sem);
7351
7352                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7353                 BUG_ON(ret);
7354
7355                 set_avail_alloc_bits(root->fs_info, cache->flags);
7356                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7357                         set_block_group_readonly(cache);
7358         }
7359         ret = 0;
7360 error:
7361         btrfs_free_path(path);
7362         return ret;
7363 }
7364
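     /*
      * create a new block group for a freshly allocated chunk.  the
      * group starts out empty (minus the superblock mirrors), so its
      * free space is fully known up front and no caching pass is
      * needed.
      */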
7365 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7366                            struct btrfs_root *root, u64 bytes_used,
7367                            u64 type, u64 chunk_objectid, u64 chunk_offset,
7368                            u64 size)
7369 {
7370         int ret;
7371         struct btrfs_root *extent_root;
7372         struct btrfs_block_group_cache *cache;
7373
7374         extent_root = root->fs_info->extent_root;
7375
7376         root->fs_info->last_trans_log_full_commit = trans->transid;
7377
7378         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7379         if (!cache)
7380                 return -ENOMEM;
7381
7382         cache->key.objectid = chunk_offset;
7383         cache->key.offset = size;
7384         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7385         cache->sectorsize = root->sectorsize;
7386
7387         /*
7388          * we only want to have 32k of ram per block group for keeping track
7389          * of free space, and if we pass 1/2 of that we want to start
7390          * converting things over to using bitmaps
7391          */
7392         cache->extents_thresh = ((1024 * 32) / 2) /
7393                 sizeof(struct btrfs_free_space);
7394         atomic_set(&cache->count, 1);
7395         spin_lock_init(&cache->lock);
7396         spin_lock_init(&cache->tree_lock);
7397         init_waitqueue_head(&cache->caching_q);
7398         INIT_LIST_HEAD(&cache->list);
7399         INIT_LIST_HEAD(&cache->cluster_list);
7400
7401         btrfs_set_block_group_used(&cache->item, bytes_used);
7402         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7403         cache->flags = type;
7404         btrfs_set_block_group_flags(&cache->item, type);
7405
7406         cache->cached = BTRFS_CACHE_FINISHED;
7407         remove_sb_from_cache(root, cache);
7408
7409         add_new_free_space(cache, root->fs_info, chunk_offset,
7410                            chunk_offset + size);
7411
7412         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7413                                 &cache->space_info);
7414         BUG_ON(ret);
7415         down_write(&cache->space_info->groups_sem);
7416         list_add_tail(&cache->list, &cache->space_info->block_groups);
7417         up_write(&cache->space_info->groups_sem);
7418
7419         ret = btrfs_add_block_group_cache(root->fs_info, cache);
7420         BUG_ON(ret);
7421
7422         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7423                                 sizeof(cache->item));
7424         BUG_ON(ret);
7425
7426         set_avail_alloc_bits(extent_root->fs_info, type);
7427
7428         return 0;
7429 }
7430
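     /*
      * remove an empty, read-only block group: detach it from the
      * allocation clusters, drop its free space cache and delete its
      * item from the extent tree.
      */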
7431 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7432                              struct btrfs_root *root, u64 group_start)
7433 {
7434         struct btrfs_path *path;
7435         struct btrfs_block_group_cache *block_group;
7436         struct btrfs_free_cluster *cluster;
7437         struct btrfs_key key;
7438         int ret;
7439
7440         root = root->fs_info->extent_root;
7441
7442         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7443         BUG_ON(!block_group);
7444         BUG_ON(!block_group->ro);
7445
7446         memcpy(&key, &block_group->key, sizeof(key));
7447
7448         /* make sure this block group isn't part of an allocation cluster */
7449         cluster = &root->fs_info->data_alloc_cluster;
7450         spin_lock(&cluster->refill_lock);
7451         btrfs_return_cluster_to_free_space(block_group, cluster);
7452         spin_unlock(&cluster->refill_lock);
7453
7454         /*
7455          * make sure this block group isn't part of a metadata
7456          * allocation cluster
7457          */
7458         cluster = &root->fs_info->meta_alloc_cluster;
7459         spin_lock(&cluster->refill_lock);
7460         btrfs_return_cluster_to_free_space(block_group, cluster);
7461         spin_unlock(&cluster->refill_lock);
7462
7463         path = btrfs_alloc_path();
7464         BUG_ON(!path);
7465
7466         spin_lock(&root->fs_info->block_group_cache_lock);
7467         rb_erase(&block_group->cache_node,
7468                  &root->fs_info->block_group_cache_tree);
7469         spin_unlock(&root->fs_info->block_group_cache_lock);
7470
7471         down_write(&block_group->space_info->groups_sem);
7472         /*
7473          * we must use list_del_init so people can check to see if they
7474          * are still on the list after taking the semaphore
7475          */
7476         list_del_init(&block_group->list);
7477         up_write(&block_group->space_info->groups_sem);
7478
7479         if (block_group->cached == BTRFS_CACHE_STARTED)
7480                 wait_event(block_group->caching_q,
7481                            block_group_cache_done(block_group));
7482
7483         btrfs_remove_free_space_cache(block_group);
7484
7485         spin_lock(&block_group->space_info->lock);
7486         block_group->space_info->total_bytes -= block_group->key.offset;
7487         block_group->space_info->bytes_readonly -= block_group->key.offset;
7488         spin_unlock(&block_group->space_info->lock);
7489         block_group->space_info->full = 0;
7490
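             /*
              * drop two references: one taken by the lookup at the top
              * of this function, and the one the block group cache tree
              * held before we erased the group from it above.
              */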
7491         btrfs_put_block_group(block_group);
7492         btrfs_put_block_group(block_group);
7493
7494         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7495         if (ret > 0)
7496                 ret = -EIO;
7497         if (ret < 0)
7498                 goto out;
7499
7500         ret = btrfs_del_item(trans, root, path);
7501 out:
7502         btrfs_free_path(path);
7503         return ret;
7504 }