Btrfs: fix deadlocks and stalls on dead root removal
fs/btrfs/transaction.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "ref-cache.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

static noinline void put_transaction(struct btrfs_transaction *transaction)
{
        WARN_ON(transaction->use_count == 0);
        transaction->use_count--;
        if (transaction->use_count == 0) {
                list_del_init(&transaction->list);
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans;
        cur_trans = root->fs_info->running_transaction;
        if (!cur_trans) {
                cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
                                             GFP_NOFS);
                BUG_ON(!cur_trans);
                root->fs_info->generation++;
                cur_trans->num_writers = 1;
                cur_trans->num_joined = 0;
                cur_trans->transid = root->fs_info->generation;
                init_waitqueue_head(&cur_trans->writer_wait);
                init_waitqueue_head(&cur_trans->commit_wait);
                cur_trans->in_commit = 0;
                cur_trans->blocked = 0;
                cur_trans->use_count = 1;
                cur_trans->commit_done = 0;
                cur_trans->start_time = get_seconds();

                cur_trans->delayed_refs.root.rb_node = NULL;
                cur_trans->delayed_refs.num_entries = 0;
                cur_trans->delayed_refs.num_heads_ready = 0;
                cur_trans->delayed_refs.num_heads = 0;
                cur_trans->delayed_refs.flushing = 0;
                cur_trans->delayed_refs.run_delayed_start = 0;
                spin_lock_init(&cur_trans->delayed_refs.lock);

                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
                extent_io_tree_init(&cur_trans->dirty_pages,
                                     root->fs_info->btree_inode->i_mapping,
                                     GFP_NOFS);
                spin_lock(&root->fs_info->new_trans_lock);
                root->fs_info->running_transaction = cur_trans;
                spin_unlock(&root->fs_info->new_trans_lock);
        } else {
                cur_trans->num_writers++;
                cur_trans->num_joined++;
        }

        return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
{
        struct btrfs_dirty_root *dirty;
        u64 running_trans_id = root->fs_info->running_transaction->transid;
        if (root->ref_cows && root->last_trans < running_trans_id) {
                WARN_ON(root == root->fs_info->extent_root);
                if (root->root_item.refs != 0) {
                        radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                                   (unsigned long)root->root_key.objectid,
                                   BTRFS_ROOT_TRANS_TAG);

                        dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
                        BUG_ON(!dirty);
                        dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
                        BUG_ON(!dirty->root);
                        dirty->latest_root = root;
                        INIT_LIST_HEAD(&dirty->list);

                        root->commit_root = btrfs_root_node(root);

                        memcpy(dirty->root, root, sizeof(*root));
                        spin_lock_init(&dirty->root->node_lock);
                        spin_lock_init(&dirty->root->list_lock);
                        mutex_init(&dirty->root->objectid_mutex);
                        mutex_init(&dirty->root->log_mutex);
                        INIT_LIST_HEAD(&dirty->root->dead_list);
                        dirty->root->node = root->commit_root;
                        dirty->root->commit_root = NULL;

                        spin_lock(&root->list_lock);
                        list_add(&dirty->root->dead_list, &root->dead_list);
                        spin_unlock(&root->list_lock);

                        root->dirty_root = dirty;
                } else {
                        WARN_ON(1);
                }
                root->last_trans = running_trans_id;
        }
        return 0;
}
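
/*
 * Illustrative sketch (not part of the original file): callers that dirty a
 * reference counted root without going through start_transaction() take
 * trans_mutex around the record call, mirroring the pattern drop_dirty_roots()
 * uses further down.  This assumes a transaction is currently running; the
 * helper name is hypothetical.
 */
static void example_record_root_locked(struct btrfs_root *root)
{
        /* trans_mutex protects fs_info->running_transaction */
        mutex_lock(&root->fs_info->trans_mutex);
        btrfs_record_root_in_trans(root);
        mutex_unlock(&root->fs_info->trans_mutex);
}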

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans;

        cur_trans = root->fs_info->running_transaction;
        if (cur_trans && cur_trans->blocked) {
                DEFINE_WAIT(wait);
                cur_trans->use_count++;
                while (1) {
                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (cur_trans->blocked) {
                                mutex_unlock(&root->fs_info->trans_mutex);
                                schedule();
                                mutex_lock(&root->fs_info->trans_mutex);
                                finish_wait(&root->fs_info->transaction_wait,
                                            &wait);
                        } else {
                                finish_wait(&root->fs_info->transaction_wait,
                                            &wait);
                                break;
                        }
                }
                put_transaction(cur_trans);
        }
}

static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                                             int num_blocks, int wait)
{
        struct btrfs_trans_handle *h =
                kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        int ret;

        mutex_lock(&root->fs_info->trans_mutex);
        if (!root->fs_info->log_root_recovering &&
            ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
                wait_current_trans(root);
        ret = join_transaction(root);
        BUG_ON(ret);

        btrfs_record_root_in_trans(root);
        h->transid = root->fs_info->running_transaction->transid;
        h->transaction = root->fs_info->running_transaction;
        h->blocks_reserved = num_blocks;
        h->blocks_used = 0;
        h->block_group = 0;
        h->alloc_exclude_nr = 0;
        h->alloc_exclude_start = 0;
        h->delayed_ref_updates = 0;

        root->fs_info->running_transaction->use_count++;
        mutex_unlock(&root->fs_info->trans_mutex);
        return h;
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_blocks)
{
        return start_transaction(root, num_blocks, 1);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
                                                   int num_blocks)
{
        return start_transaction(root, num_blocks, 0);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks)
{
        return start_transaction(r, num_blocks, 2);
}
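
/*
 * Illustrative sketch (not part of the original file) of the handle
 * lifecycle for the three variants above: btrfs_start_transaction() waits
 * out a blocked commit, btrfs_join_transaction() hops straight into the
 * running transaction, and the ioctl variant always waits.  The function
 * name is hypothetical.
 */
static int example_transaction_user(struct btrfs_root *root)
{
        struct btrfs_trans_handle *trans;

        /* reserve one block of metadata and bump num_writers */
        trans = btrfs_start_transaction(root, 1);

        /* ... btree modifications would go here ... */

        /* drop num_writers and wake anyone waiting on writer_wait */
        return btrfs_end_transaction(trans, root);
}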

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
{
        DEFINE_WAIT(wait);
        mutex_lock(&root->fs_info->trans_mutex);
        while (!commit->commit_done) {
                prepare_to_wait(&commit->commit_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit->commit_done)
                        break;
                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
                mutex_lock(&root->fs_info->trans_mutex);
        }
        mutex_unlock(&root->fs_info->trans_mutex);
        finish_wait(&commit->commit_wait, &wait);
        return 0;
}

/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
        struct btrfs_fs_info *info = root->fs_info;
        int harder_count = 0;

harder:
        if (atomic_read(&info->throttles)) {
                DEFINE_WAIT(wait);
                int thr;
                thr = atomic_read(&info->throttle_gen);

                do {
                        prepare_to_wait(&info->transaction_throttle,
                                        &wait, TASK_UNINTERRUPTIBLE);
                        if (!atomic_read(&info->throttles)) {
                                finish_wait(&info->transaction_throttle, &wait);
                                break;
                        }
                        schedule();
                        finish_wait(&info->transaction_throttle, &wait);
                } while (thr == atomic_read(&info->throttle_gen));
                harder_count++;

                if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
                    harder_count < 2)
                        goto harder;

                if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
                    harder_count < 10)
                        goto harder;

                if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
                    harder_count < 20)
                        goto harder;
        }
}

void btrfs_throttle(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->trans_mutex);
        if (!root->fs_info->open_ioctl_trans)
                wait_current_trans(root);
        mutex_unlock(&root->fs_info->trans_mutex);
        throttle_on_drops(root);
}

static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, int throttle)
{
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;

        while (count < 4) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
                if (cur &&
                    trans->transaction->delayed_refs.num_heads_ready > 64) {
                        trans->delayed_ref_updates = 0;

                        /*
                         * do a full flush if the transaction is trying
                         * to close
                         */
                        if (trans->transaction->delayed_refs.flushing)
                                cur = 0;
                        btrfs_run_delayed_refs(trans, root, cur);
                } else {
                        break;
                }
                count++;
        }

        mutex_lock(&info->trans_mutex);
        cur_trans = info->running_transaction;
        WARN_ON(cur_trans != trans->transaction);
        WARN_ON(cur_trans->num_writers < 1);
        cur_trans->num_writers--;

        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
        put_transaction(cur_trans);
        mutex_unlock(&info->trans_mutex);
        memset(trans, 0, sizeof(*trans));
        kmem_cache_free(btrfs_trans_handle_cachep, trans);

        if (throttle)
                throttle_on_drops(root);

        return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
{
        return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
{
        return __btrfs_end_transaction(trans, root, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
                                        struct extent_io_tree *dirty_pages)
{
        int ret;
        int err = 0;
        int werr = 0;
        struct page *page;
        struct inode *btree_inode = root->fs_info->btree_inode;
        u64 start = 0;
        u64 end;
        unsigned long index;

        while (1) {
                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
                        break;
                while (start <= end) {
                        cond_resched();

                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
                        page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;

                        btree_lock_page_hook(page);
                        if (!page->mapping) {
                                unlock_page(page);
                                page_cache_release(page);
                                continue;
                        }

                        if (PageWriteback(page)) {
                                if (PageDirty(page))
                                        wait_on_page_writeback(page);
                                else {
                                        unlock_page(page);
                                        page_cache_release(page);
                                        continue;
                                }
                        }
                        err = write_one_page(page, 0);
                        if (err)
                                werr = err;
                        page_cache_release(page);
                }
        }
        while (1) {
                ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
                        break;

                clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
                while (start <= end) {
                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
                        page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;
                        if (PageDirty(page)) {
                                btree_lock_page_hook(page);
                                wait_on_page_writeback(page);
                                err = write_one_page(page, 0);
                                if (err)
                                        werr = err;
                        }
                        wait_on_page_writeback(page);
                        page_cache_release(page);
                        cond_resched();
                }
        }
        if (err)
                werr = err;
        return werr;
}
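
/*
 * Worked example (illustrative, not in the original): both loops above step
 * through the dirty range one page at a time using
 *
 *      index = start >> PAGE_CACHE_SHIFT;
 *      start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
 *
 * With 4K pages (PAGE_CACHE_SHIFT == 12), a start of 0x3400 maps to page
 * index 3, and the next iteration resumes at 0x4000, the first byte of
 * page 4, so every page overlapping [start, end] is visited exactly once.
 */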

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root)
{
        if (!trans || !trans->transaction) {
                struct inode *btree_inode;
                btree_inode = root->fs_info->btree_inode;
                return filemap_write_and_wait(btree_inode->i_mapping);
        }
        return btrfs_write_and_wait_marked_extents(root,
                                           &trans->transaction->dirty_pages);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        int ret;
        u64 old_root_bytenr;
        struct btrfs_root *tree_root = root->fs_info->tree_root;

        btrfs_write_dirty_block_groups(trans, root);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start)
                        break;
                btrfs_set_root_bytenr(&root->root_item,
                                       root->node->start);
                btrfs_set_root_level(&root->root_item,
                                     btrfs_header_level(root->node));
                btrfs_set_root_generation(&root->root_item, trans->transid);

                ret = btrfs_update_root(trans, tree_root,
                                        &root->root_key,
                                        &root->root_item);
                BUG_ON(ret);
                btrfs_write_dirty_block_groups(trans, root);

                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
                BUG_ON(ret);
        }
        return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct list_head *next;
        struct extent_buffer *eb;
        int ret;

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        eb = btrfs_lock_root_node(fs_info->tree_root);
        btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
                root = list_entry(next, struct btrfs_root, dirty_list);

                update_cowonly_root(trans, root);

                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
                BUG_ON(ret);
        }
        return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
{
        struct btrfs_dirty_root *dirty;

        dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
        if (!dirty)
                return -ENOMEM;
        dirty->root = root;
        dirty->latest_root = latest;

        mutex_lock(&root->fs_info->trans_mutex);
        list_add(&dirty->list, &latest->fs_info->dead_roots);
        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
}
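
/*
 * Illustrative sketch (not part of the original file): a simplified caller
 * that schedules an old snapshot root for deletion and then reaps the dead
 * list.  "snap" and "latest" are hypothetical placeholders; error handling
 * is elided.
 */
static int example_schedule_dead_root(struct btrfs_root *snap,
                                      struct btrfs_root *latest)
{
        int ret;

        /* queue snap on fs_info->dead_roots under trans_mutex */
        ret = btrfs_add_dead_root(snap, latest);
        if (ret)
                return ret;

        /* drains dead_roots via drop_dirty_roots()/btrfs_drop_snapshot() */
        return btrfs_clean_old_snapshots(latest->fs_info->tree_root);
}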

/*
 * at transaction commit time we need to schedule the old roots for
 * deletion via btrfs_drop_snapshot.  This runs through all the
 * reference counted roots that were modified in the current
 * transaction and puts them into the drop list
 */
static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
                                    struct radix_tree_root *radix,
                                    struct list_head *list)
{
        struct btrfs_dirty_root *dirty;
        struct btrfs_root *gang[8];
        struct btrfs_root *root;
        int i;
        int ret;
        int err = 0;
        u32 refs;

        while (1) {
                ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
                                                 ARRAY_SIZE(gang),
                                                 BTRFS_ROOT_TRANS_TAG);
                if (ret == 0)
                        break;
                for (i = 0; i < ret; i++) {
                        root = gang[i];
                        radix_tree_tag_clear(radix,
                                     (unsigned long)root->root_key.objectid,
                                     BTRFS_ROOT_TRANS_TAG);

                        BUG_ON(!root->ref_tree);
                        dirty = root->dirty_root;

                        btrfs_free_log(trans, root);
                        btrfs_free_reloc_root(trans, root);

                        if (root->commit_root == root->node) {
                                WARN_ON(root->node->start !=
                                        btrfs_root_bytenr(&root->root_item));

                                free_extent_buffer(root->commit_root);
                                root->commit_root = NULL;
                                root->dirty_root = NULL;

                                spin_lock(&root->list_lock);
                                list_del_init(&dirty->root->dead_list);
                                spin_unlock(&root->list_lock);

                                kfree(dirty->root);
                                kfree(dirty);

                                /* make sure to update the root on disk
                                 * so we get any updates to the block used
                                 * counts
                                 */
                                err = btrfs_update_root(trans,
                                                root->fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
                                continue;
                        }

                        memset(&root->root_item.drop_progress, 0,
                               sizeof(struct btrfs_disk_key));
                        root->root_item.drop_level = 0;
                        root->commit_root = NULL;
                        root->dirty_root = NULL;
                        root->root_key.offset = root->fs_info->generation;
                        btrfs_set_root_bytenr(&root->root_item,
                                              root->node->start);
                        btrfs_set_root_level(&root->root_item,
                                             btrfs_header_level(root->node));
                        btrfs_set_root_generation(&root->root_item,
                                                  root->root_key.offset);

                        err = btrfs_insert_root(trans, root->fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
                        if (err)
                                break;

                        refs = btrfs_root_refs(&dirty->root->root_item);
                        btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
                        err = btrfs_update_root(trans, root->fs_info->tree_root,
                                                &dirty->root->root_key,
                                                &dirty->root->root_item);

                        BUG_ON(err);
                        if (refs == 1) {
                                list_add(&dirty->list, list);
                        } else {
                                WARN_ON(1);
                                free_extent_buffer(dirty->root->node);
                                kfree(dirty->root);
                                kfree(dirty);
                        }
                }
        }
        return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
        struct btrfs_fs_info *info = root->fs_info;
        int ret;
        struct btrfs_trans_handle *trans;
        unsigned long nr;

        smp_mb();
        if (root->defrag_running)
                return 0;
        trans = btrfs_start_transaction(root, 1);
        while (1) {
                root->defrag_running = 1;
                ret = btrfs_defrag_leaves(trans, root, cacheonly);
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(info->tree_root, nr);
                cond_resched();

                trans = btrfs_start_transaction(root, 1);
                if (root->fs_info->closing || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
        smp_mb();
        btrfs_end_transaction(trans, root);
        return 0;
}

/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
        DEFINE_WAIT(wait);

        mutex_lock(&info->trans_mutex);
        while (info->running_transaction &&
               info->running_transaction->delayed_refs.flushing) {
                prepare_to_wait(&info->transaction_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                mutex_unlock(&info->trans_mutex);

                atomic_dec(&info->throttles);
                wake_up(&info->transaction_throttle);

                schedule();

                atomic_inc(&info->throttles);
                mutex_lock(&info->trans_mutex);
                finish_wait(&info->transaction_wait, &wait);
        }
        mutex_unlock(&info->trans_mutex);
        return 0;
}

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                                     struct list_head *list)
{
        struct btrfs_dirty_root *dirty;
        struct btrfs_trans_handle *trans;
        unsigned long nr;
        u64 num_bytes;
        u64 bytes_used;
        u64 max_useless;
        int ret = 0;
        int err;

        while (!list_empty(list)) {
                struct btrfs_root *root;

                dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
                list_del_init(&dirty->list);

                num_bytes = btrfs_root_used(&dirty->root->root_item);
                root = dirty->latest_root;
                atomic_inc(&root->fs_info->throttles);

                while (1) {
                        /*
                         * we don't want to jump in and create a bunch of
                         * delayed refs if the transaction is starting to close
                         */
                        wait_transaction_pre_flush(tree_root->fs_info);
                        trans = btrfs_start_transaction(tree_root, 1);

                        /*
                         * we've joined a transaction, make sure it isn't
                         * closing right now
                         */
                        if (trans->transaction->delayed_refs.flushing) {
                                btrfs_end_transaction(trans, tree_root);
                                continue;
                        }

                        mutex_lock(&root->fs_info->drop_mutex);
                        ret = btrfs_drop_snapshot(trans, dirty->root);
                        if (ret != -EAGAIN)
                                break;
                        mutex_unlock(&root->fs_info->drop_mutex);

                        err = btrfs_update_root(trans,
                                        tree_root,
                                        &dirty->root->root_key,
                                        &dirty->root->root_item);
                        if (err)
                                ret = err;
                        nr = trans->blocks_used;
                        ret = btrfs_end_transaction(trans, tree_root);
                        BUG_ON(ret);

                        btrfs_btree_balance_dirty(tree_root, nr);
                        cond_resched();
                }
                BUG_ON(ret);
                atomic_dec(&root->fs_info->throttles);
                wake_up(&root->fs_info->transaction_throttle);

                num_bytes -= btrfs_root_used(&dirty->root->root_item);
                bytes_used = btrfs_root_used(&root->root_item);
                if (num_bytes) {
                        mutex_lock(&root->fs_info->trans_mutex);
                        btrfs_record_root_in_trans(root);
                        mutex_unlock(&root->fs_info->trans_mutex);
                        btrfs_set_root_used(&root->root_item,
                                            bytes_used - num_bytes);
                }

                ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
                if (ret) {
                        BUG();
                        break;
                }
                mutex_unlock(&root->fs_info->drop_mutex);

                spin_lock(&root->list_lock);
                list_del_init(&dirty->root->dead_list);
                if (!list_empty(&root->dead_list)) {
                        struct btrfs_root *oldest;
                        oldest = list_entry(root->dead_list.prev,
                                            struct btrfs_root, dead_list);
                        max_useless = oldest->root_key.offset - 1;
                } else {
                        max_useless = root->root_key.offset - 1;
                }
                spin_unlock(&root->list_lock);

                nr = trans->blocks_used;
                ret = btrfs_end_transaction(trans, tree_root);
                BUG_ON(ret);

                ret = btrfs_remove_leaf_refs(root, max_useless, 0);
                BUG_ON(ret);

                free_extent_buffer(dirty->root->node);
                kfree(dirty->root);
                kfree(dirty);

                btrfs_btree_balance_dirty(tree_root, nr);
                cond_resched();
        }
        return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                   struct btrfs_fs_info *fs_info,
                                   struct btrfs_pending_snapshot *pending)
{
        struct btrfs_key key;
        struct btrfs_root_item *new_root_item;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root = pending->root;
        struct extent_buffer *tmp;
        struct extent_buffer *old;
        int ret;
        u64 objectid;

        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
        if (!new_root_item) {
                ret = -ENOMEM;
                goto fail;
        }
        ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
        if (ret)
                goto fail;

        btrfs_record_root_in_trans(root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

        key.objectid = objectid;
        key.offset = trans->transid;
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

        old = btrfs_lock_root_node(root);
        btrfs_cow_block(trans, root, old, NULL, 0, &old);

        btrfs_copy_root(trans, root, old, &tmp, objectid);
        btrfs_tree_unlock(old);
        free_extent_buffer(old);

        btrfs_set_root_bytenr(new_root_item, tmp->start);
        btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
        btrfs_set_root_generation(new_root_item, trans->transid);
        ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
                                new_root_item);
        btrfs_tree_unlock(tmp);
        free_extent_buffer(tmp);
        if (ret)
                goto fail;

        key.offset = (u64)-1;
        memcpy(&pending->root_key, &key, sizeof(key));
fail:
        kfree(new_root_item);
        return ret;
}

static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
                                   struct btrfs_pending_snapshot *pending)
{
        int ret;
        int namelen;
        u64 index = 0;
        struct btrfs_trans_handle *trans;
        struct inode *parent_inode;
        struct inode *inode;
        struct btrfs_root *parent_root;

        parent_inode = pending->dentry->d_parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
        trans = btrfs_join_transaction(parent_root, 1);

        /*
         * insert the directory item
         */
        namelen = strlen(pending->name);
        ret = btrfs_set_inode_index(parent_inode, &index);
        ret = btrfs_insert_dir_item(trans, parent_root,
                            pending->name, namelen,
                            parent_inode->i_ino,
                            &pending->root_key, BTRFS_FT_DIR, index);

        if (ret)
                goto fail;

        btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);

        /* add the backref first */
        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
                                 pending->root_key.objectid,
                                 BTRFS_ROOT_BACKREF_KEY,
                                 parent_root->root_key.objectid,
                                 parent_inode->i_ino, index, pending->name,
                                 namelen);

        BUG_ON(ret);

        /* now add the forward ref */
        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
                                 parent_root->root_key.objectid,
                                 BTRFS_ROOT_REF_KEY,
                                 pending->root_key.objectid,
                                 parent_inode->i_ino, index, pending->name,
                                 namelen);

        inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
        d_instantiate(pending->dentry, inode);
fail:
        btrfs_end_transaction(trans, fs_info->fs_root);
        return ret;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
                                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_pending_snapshot *pending;
        struct list_head *head = &trans->transaction->pending_snapshots;
        int ret;

        list_for_each_entry(pending, head, list) {
                ret = create_pending_snapshot(trans, fs_info, pending);
                BUG_ON(ret);
        }
        return 0;
}

static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
                                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_pending_snapshot *pending;
        struct list_head *head = &trans->transaction->pending_snapshots;
        int ret;

        while (!list_empty(head)) {
                pending = list_entry(head->next,
                                     struct btrfs_pending_snapshot, list);
                ret = finish_pending_snapshot(fs_info, pending);
                BUG_ON(ret);
                list_del(&pending->list);
                kfree(pending->name);
                kfree(pending);
        }
        return 0;
}

int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
{
        unsigned long joined = 0;
        unsigned long timeout = 1;
        struct btrfs_transaction *cur_trans;
        struct btrfs_transaction *prev_trans = NULL;
        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
        struct list_head dirty_fs_roots;
        struct extent_io_tree *pinned_copy;
        DEFINE_WAIT(wait);
        int ret;
        int should_grow = 0;
        unsigned long now = get_seconds();
        int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

        btrfs_run_ordered_operations(root, 0);

        /* make a pass through all the delayed refs we have so far
         * any running procs may add more while we are here
         */
        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);

        cur_trans = trans->transaction;
        /*
         * set the flushing flag so procs in this transaction have to
         * start sending their work down.
         */
        cur_trans->delayed_refs.flushing = 1;

        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);

        mutex_lock(&root->fs_info->trans_mutex);
        INIT_LIST_HEAD(&dirty_fs_roots);
        if (cur_trans->in_commit) {
                cur_trans->use_count++;
                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);

                ret = wait_for_commit(root, cur_trans);
                BUG_ON(ret);

                mutex_lock(&root->fs_info->trans_mutex);
                put_transaction(cur_trans);
                mutex_unlock(&root->fs_info->trans_mutex);

                return 0;
        }

        pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
        if (!pinned_copy)
                return -ENOMEM;

        extent_io_tree_init(pinned_copy,
                             root->fs_info->btree_inode->i_mapping, GFP_NOFS);

        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (!prev_trans->commit_done) {
                        prev_trans->use_count++;
                        mutex_unlock(&root->fs_info->trans_mutex);

                        wait_for_commit(root, prev_trans);

                        mutex_lock(&root->fs_info->trans_mutex);
                        put_transaction(prev_trans);
                }
        }

        if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
                should_grow = 1;

        do {
                int snap_pending = 0;
                joined = cur_trans->num_joined;
                if (!list_empty(&trans->transaction->pending_snapshots))
                        snap_pending = 1;

                WARN_ON(cur_trans != trans->transaction);
                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);

                if (cur_trans->num_writers > 1)
                        timeout = MAX_SCHEDULE_TIMEOUT;
                else if (should_grow)
                        timeout = 1;

                mutex_unlock(&root->fs_info->trans_mutex);

                if (flush_on_commit || snap_pending) {
                        if (flush_on_commit)
                                btrfs_start_delalloc_inodes(root);
                        ret = btrfs_wait_ordered_extents(root, 1);
                        BUG_ON(ret);
                }

                /*
                 * rename doesn't use btrfs_join_transaction, so once we
                 * set the transaction to blocked above, we aren't going
                 * to get any new ordered operations.  We can safely run
                 * it here and know for sure that nothing new will be added
                 * to the list
                 */
                btrfs_run_ordered_operations(root, 1);

                smp_mb();
                if (cur_trans->num_writers > 1 || should_grow)
                        schedule_timeout(timeout);

                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
        } while (cur_trans->num_writers > 1 ||
                 (should_grow && cur_trans->num_joined != joined));

        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        WARN_ON(cur_trans != trans->transaction);

        /* btrfs_commit_tree_roots is responsible for getting the
         * various roots consistent with each other.  Every pointer
         * in the tree of tree roots has to point to the most up to date
         * root for every subvolume and other tree.  So, we have to keep
         * the tree logging code from jumping in and changing any
         * of the trees.
         *
         * At this point in the commit, there can't be any tree-log
         * writers, but a little lower down we drop the trans mutex
         * and let new people in.  By holding the tree_log_mutex
         * from now until after the super is written, we avoid races
         * with the tree-log code.
         */
        mutex_lock(&root->fs_info->tree_log_mutex);
        /*
         * keep tree reloc code from adding new reloc trees
         */
        mutex_lock(&root->fs_info->tree_reloc_mutex);

        ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
                              &dirty_fs_roots);
        BUG_ON(ret);

        /* add_dirty_roots gets rid of all the tree log roots; it is now
         * safe to free the root of tree log roots
         */
        btrfs_free_log_root_tree(trans, root->fs_info);

        ret = btrfs_commit_tree_roots(trans, root);
        BUG_ON(ret);

        cur_trans = root->fs_info->running_transaction;
        spin_lock(&root->fs_info->new_trans_lock);
        root->fs_info->running_transaction = NULL;
        spin_unlock(&root->fs_info->new_trans_lock);
        btrfs_set_super_generation(&root->fs_info->super_copy,
                                   cur_trans->transid);
        btrfs_set_super_root(&root->fs_info->super_copy,
                             root->fs_info->tree_root->node->start);
        btrfs_set_super_root_level(&root->fs_info->super_copy,
                           btrfs_header_level(root->fs_info->tree_root->node));

        btrfs_set_super_chunk_root(&root->fs_info->super_copy,
                                   chunk_root->node->start);
        btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
                                         btrfs_header_level(chunk_root->node));
        btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
                                btrfs_header_generation(chunk_root->node));

        if (!root->fs_info->log_root_recovering) {
                btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
                btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
        }

        memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
               sizeof(root->fs_info->super_copy));

        btrfs_copy_pinned(root, pinned_copy);

        trans->transaction->blocked = 0;

        wake_up(&root->fs_info->transaction_throttle);
        wake_up(&root->fs_info->transaction_wait);

        mutex_unlock(&root->fs_info->trans_mutex);
        ret = btrfs_write_and_wait_transaction(trans, root);
        BUG_ON(ret);
        write_ctree_super(trans, root, 0);

        /*
         * the super is written, we can safely allow the tree-loggers
         * to go about their business
         */
        mutex_unlock(&root->fs_info->tree_log_mutex);

        btrfs_finish_extent_commit(trans, root, pinned_copy);
        kfree(pinned_copy);

        btrfs_drop_dead_reloc_roots(root);
        mutex_unlock(&root->fs_info->tree_reloc_mutex);

        /* do the directory inserts of any pending snapshot creations */
        finish_pending_snapshots(trans, root->fs_info);

        mutex_lock(&root->fs_info->trans_mutex);

        cur_trans->commit_done = 1;

        root->fs_info->last_trans_committed = cur_trans->transid;
        wake_up(&cur_trans->commit_wait);

        put_transaction(cur_trans);
        put_transaction(cur_trans);

        list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
        if (root->fs_info->closing)
                list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);

        mutex_unlock(&root->fs_info->trans_mutex);

        kmem_cache_free(btrfs_trans_handle_cachep, trans);

        if (root->fs_info->closing)
                drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
        return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
        struct list_head dirty_roots;
        INIT_LIST_HEAD(&dirty_roots);
again:
        mutex_lock(&root->fs_info->trans_mutex);
        list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
        mutex_unlock(&root->fs_info->trans_mutex);

        if (!list_empty(&dirty_roots)) {
                drop_dirty_roots(root, &dirty_roots);
                goto again;
        }
        return 0;
}
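
/*
 * Illustrative sketch (not part of the original file): forcing everything
 * to disk and then reaping dead roots, roughly the work the transaction and
 * cleaner kthreads split between them.  Error handling is elided and the
 * function name is hypothetical.
 */
static int example_sync_and_reap(struct btrfs_root *root)
{
        struct btrfs_trans_handle *trans;

        /* hop into the running transaction (or open a new one) */
        trans = btrfs_join_transaction(root, 1);

        /* writes dirty tree blocks and the super, wakes all waiters */
        btrfs_commit_transaction(trans, root);

        /* drop any snapshots the commit moved onto dead_roots */
        return btrfs_clean_old_snapshots(root->fs_info->tree_root);
}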