fs/btrfs/transaction.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "ref-cache.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

static noinline void put_transaction(struct btrfs_transaction *transaction)
{
        WARN_ON(transaction->use_count == 0);
        transaction->use_count--;
        if (transaction->use_count == 0) {
                list_del_init(&transaction->list);
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
}
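
/*
 * Usage sketch (illustrative, not from the original file): use_count is
 * taken and dropped under trans_mutex, so a caller that needs to hold
 * the running transaction across a sleep follows the pattern seen in
 * wait_current_trans() below:
 *
 *	mutex_lock(&root->fs_info->trans_mutex);
 *	cur_trans = root->fs_info->running_transaction;
 *	cur_trans->use_count++;
 *	...sleep, possibly dropping and retaking trans_mutex...
 *	put_transaction(cur_trans);
 *	mutex_unlock(&root->fs_info->trans_mutex);
 */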

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans;

        cur_trans = root->fs_info->running_transaction;
        if (!cur_trans) {
                cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
                                             GFP_NOFS);
                BUG_ON(!cur_trans);
                root->fs_info->generation++;
                cur_trans->num_writers = 1;
                cur_trans->num_joined = 0;
                cur_trans->transid = root->fs_info->generation;
                init_waitqueue_head(&cur_trans->writer_wait);
                init_waitqueue_head(&cur_trans->commit_wait);
                cur_trans->in_commit = 0;
                cur_trans->blocked = 0;
                cur_trans->use_count = 1;
                cur_trans->commit_done = 0;
                cur_trans->start_time = get_seconds();

                cur_trans->delayed_refs.root.rb_node = NULL;
                cur_trans->delayed_refs.num_entries = 0;
                cur_trans->delayed_refs.num_heads_ready = 0;
                cur_trans->delayed_refs.num_heads = 0;
                cur_trans->delayed_refs.flushing = 0;
                cur_trans->delayed_refs.run_delayed_start = 0;
                spin_lock_init(&cur_trans->delayed_refs.lock);

                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
                extent_io_tree_init(&cur_trans->dirty_pages,
                                     root->fs_info->btree_inode->i_mapping,
                                     GFP_NOFS);
                spin_lock(&root->fs_info->new_trans_lock);
                root->fs_info->running_transaction = cur_trans;
                spin_unlock(&root->fs_info->new_trans_lock);
        } else {
                cur_trans->num_writers++;
                cur_trans->num_joined++;
        }

        return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
{
        struct btrfs_dirty_root *dirty;
        u64 running_trans_id = root->fs_info->running_transaction->transid;

        if (root->ref_cows && root->last_trans < running_trans_id) {
                WARN_ON(root == root->fs_info->extent_root);
                if (root->root_item.refs != 0) {
                        radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                                   (unsigned long)root->root_key.objectid,
                                   BTRFS_ROOT_TRANS_TAG);

                        dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
                        BUG_ON(!dirty);
                        dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
                        BUG_ON(!dirty->root);
                        dirty->latest_root = root;
                        INIT_LIST_HEAD(&dirty->list);

                        root->commit_root = btrfs_root_node(root);

                        memcpy(dirty->root, root, sizeof(*root));
                        spin_lock_init(&dirty->root->node_lock);
                        spin_lock_init(&dirty->root->list_lock);
                        mutex_init(&dirty->root->objectid_mutex);
                        mutex_init(&dirty->root->log_mutex);
                        INIT_LIST_HEAD(&dirty->root->dead_list);
                        dirty->root->node = root->commit_root;
                        dirty->root->commit_root = NULL;

                        spin_lock(&root->list_lock);
                        list_add(&dirty->root->dead_list, &root->dead_list);
                        spin_unlock(&root->list_lock);

                        root->dirty_root = dirty;
                } else {
                        WARN_ON(1);
                }
                root->last_trans = running_trans_id;
        }
        return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
        struct btrfs_transaction *cur_trans;

        cur_trans = root->fs_info->running_transaction;
        if (cur_trans && cur_trans->blocked) {
                DEFINE_WAIT(wait);
                cur_trans->use_count++;
                while (1) {
                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (cur_trans->blocked) {
                                mutex_unlock(&root->fs_info->trans_mutex);
                                schedule();
                                mutex_lock(&root->fs_info->trans_mutex);
                                finish_wait(&root->fs_info->transaction_wait,
                                            &wait);
                        } else {
                                finish_wait(&root->fs_info->transaction_wait,
                                            &wait);
                                break;
                        }
                }
                put_transaction(cur_trans);
        }
}

static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                                                    int num_blocks, int wait)
{
        struct btrfs_trans_handle *h =
                kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        int ret;

        mutex_lock(&root->fs_info->trans_mutex);
        if (!root->fs_info->log_root_recovering &&
            ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
                wait_current_trans(root);
        ret = join_transaction(root);
        BUG_ON(ret);

        btrfs_record_root_in_trans(root);
        h->transid = root->fs_info->running_transaction->transid;
        h->transaction = root->fs_info->running_transaction;
        h->blocks_reserved = num_blocks;
        h->blocks_used = 0;
        h->block_group = 0;
        h->alloc_exclude_nr = 0;
        h->alloc_exclude_start = 0;
        h->delayed_ref_updates = 0;

        root->fs_info->running_transaction->use_count++;
        mutex_unlock(&root->fs_info->trans_mutex);
        return h;
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_blocks)
{
        return start_transaction(root, num_blocks, 1);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
                                                  int num_blocks)
{
        return start_transaction(root, num_blocks, 0);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks)
{
        return start_transaction(r, num_blocks, 2);
}
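
/*
 * Caller sketch (illustrative, not from the original file): the usual
 * pattern pairs one of the helpers above with btrfs_end_transaction(),
 * declared later in this file, against the same root:
 *
 *	struct btrfs_trans_handle *trans;
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	...modify btree items under this handle...
 *	btrfs_end_transaction(trans, root);
 */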

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
{
        DEFINE_WAIT(wait);

        mutex_lock(&root->fs_info->trans_mutex);
        while (!commit->commit_done) {
                prepare_to_wait(&commit->commit_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit->commit_done)
                        break;
                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
                mutex_lock(&root->fs_info->trans_mutex);
        }
        mutex_unlock(&root->fs_info->trans_mutex);
        finish_wait(&commit->commit_wait, &wait);
        return 0;
}

/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
        struct btrfs_fs_info *info = root->fs_info;
        int harder_count = 0;

harder:
        if (atomic_read(&info->throttles)) {
                DEFINE_WAIT(wait);
                int thr;
                thr = atomic_read(&info->throttle_gen);

                do {
                        prepare_to_wait(&info->transaction_throttle,
                                        &wait, TASK_UNINTERRUPTIBLE);
                        if (!atomic_read(&info->throttles)) {
                                finish_wait(&info->transaction_throttle, &wait);
                                break;
                        }
                        schedule();
                        finish_wait(&info->transaction_throttle, &wait);
                } while (thr == atomic_read(&info->throttle_gen));
                harder_count++;

                if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
                    harder_count < 2)
                        goto harder;

                if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
                    harder_count < 10)
                        goto harder;

                if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
                    harder_count < 20)
                        goto harder;
        }
}

void btrfs_throttle(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->trans_mutex);
        if (!root->fs_info->open_ioctl_trans)
                wait_current_trans(root);
        mutex_unlock(&root->fs_info->trans_mutex);
        throttle_on_drops(root);
}

static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root, int throttle)
{
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;

        while (count < 4) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
                if (cur &&
                    trans->transaction->delayed_refs.num_heads_ready > 64) {
                        trans->delayed_ref_updates = 0;

                        /*
                         * do a full flush if the transaction is trying
                         * to close
                         */
                        if (trans->transaction->delayed_refs.flushing)
                                cur = 0;
                        btrfs_run_delayed_refs(trans, root, cur);
                } else {
                        break;
                }
                count++;
        }

        mutex_lock(&info->trans_mutex);
        cur_trans = info->running_transaction;
        WARN_ON(cur_trans != trans->transaction);
        WARN_ON(cur_trans->num_writers < 1);
        cur_trans->num_writers--;

        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
        put_transaction(cur_trans);
        mutex_unlock(&info->trans_mutex);
        memset(trans, 0, sizeof(*trans));
        kmem_cache_free(btrfs_trans_handle_cachep, trans);

        if (throttle)
                throttle_on_drops(root);

        return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
{
        return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
{
        return __btrfs_end_transaction(trans, root, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
                                        struct extent_io_tree *dirty_pages)
{
        int ret;
        int err = 0;
        int werr = 0;
        struct page *page;
        struct inode *btree_inode = root->fs_info->btree_inode;
        u64 start = 0;
        u64 end;
        unsigned long index;

        while (1) {
                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
                        break;
                while (start <= end) {
                        cond_resched();

                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
                        page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;

                        btree_lock_page_hook(page);
                        if (!page->mapping) {
                                unlock_page(page);
                                page_cache_release(page);
                                continue;
                        }

                        if (PageWriteback(page)) {
                                if (PageDirty(page))
                                        wait_on_page_writeback(page);
                                else {
                                        unlock_page(page);
                                        page_cache_release(page);
                                        continue;
                                }
                        }
                        err = write_one_page(page, 0);
                        if (err)
                                werr = err;
                        page_cache_release(page);
                }
        }
        while (1) {
                ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
                        break;

                clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
                while (start <= end) {
                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
                        page = find_get_page(btree_inode->i_mapping, index);
                        if (!page)
                                continue;
                        if (PageDirty(page)) {
                                btree_lock_page_hook(page);
                                wait_on_page_writeback(page);
                                err = write_one_page(page, 0);
                                if (err)
                                        werr = err;
                        }
                        wait_on_page_writeback(page);
                        page_cache_release(page);
                        cond_resched();
                }
        }
        if (err)
                werr = err;
        return werr;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root)
{
        if (!trans || !trans->transaction) {
                struct inode *btree_inode;
                btree_inode = root->fs_info->btree_inode;
                return filemap_write_and_wait(btree_inode->i_mapping);
        }
        return btrfs_write_and_wait_marked_extents(root,
                                           &trans->transaction->dirty_pages);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        int ret;
        u64 old_root_bytenr;
        struct btrfs_root *tree_root = root->fs_info->tree_root;

        btrfs_write_dirty_block_groups(trans, root);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start)
                        break;
                btrfs_set_root_bytenr(&root->root_item,
                                       root->node->start);
                btrfs_set_root_level(&root->root_item,
                                     btrfs_header_level(root->node));
                btrfs_set_root_generation(&root->root_item, trans->transid);

                ret = btrfs_update_root(trans, tree_root,
                                        &root->root_key,
                                        &root->root_item);
                BUG_ON(ret);
                btrfs_write_dirty_block_groups(trans, root);

                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
                BUG_ON(ret);
        }
        return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct list_head *next;
        struct extent_buffer *eb;
        int ret;

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        eb = btrfs_lock_root_node(fs_info->tree_root);
        btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
                root = list_entry(next, struct btrfs_root, dirty_list);

                update_cowonly_root(trans, root);

                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
                BUG_ON(ret);
        }
        return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
{
        struct btrfs_dirty_root *dirty;

        dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
        if (!dirty)
                return -ENOMEM;
        dirty->root = root;
        dirty->latest_root = latest;

        mutex_lock(&root->fs_info->trans_mutex);
        list_add(&dirty->list, &latest->fs_info->dead_roots);
        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
}

/*
 * at transaction commit time we need to schedule the old roots for
 * deletion via btrfs_drop_snapshot.  This runs through all the
 * reference counted roots that were modified in the current
 * transaction and puts them into the drop list
 */
static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
                                    struct radix_tree_root *radix,
                                    struct list_head *list)
{
        struct btrfs_dirty_root *dirty;
        struct btrfs_root *gang[8];
        struct btrfs_root *root;
        int i;
        int ret;
        int err = 0;
        u32 refs;

        while (1) {
                ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
                                                 ARRAY_SIZE(gang),
                                                 BTRFS_ROOT_TRANS_TAG);
                if (ret == 0)
                        break;
                for (i = 0; i < ret; i++) {
                        root = gang[i];
                        radix_tree_tag_clear(radix,
                                     (unsigned long)root->root_key.objectid,
                                     BTRFS_ROOT_TRANS_TAG);

                        BUG_ON(!root->ref_tree);
                        dirty = root->dirty_root;

                        btrfs_free_log(trans, root);
                        btrfs_free_reloc_root(trans, root);

                        if (root->commit_root == root->node) {
                                WARN_ON(root->node->start !=
                                        btrfs_root_bytenr(&root->root_item));

                                free_extent_buffer(root->commit_root);
                                root->commit_root = NULL;
                                root->dirty_root = NULL;

                                spin_lock(&root->list_lock);
                                list_del_init(&dirty->root->dead_list);
                                spin_unlock(&root->list_lock);

                                kfree(dirty->root);
                                kfree(dirty);

                                /* make sure to update the root on disk
                                 * so we get any updates to the block used
                                 * counts
                                 */
                                err = btrfs_update_root(trans,
                                                root->fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
                                continue;
                        }

                        memset(&root->root_item.drop_progress, 0,
                               sizeof(struct btrfs_disk_key));
                        root->root_item.drop_level = 0;
                        root->commit_root = NULL;
                        root->dirty_root = NULL;
                        root->root_key.offset = root->fs_info->generation;
                        btrfs_set_root_bytenr(&root->root_item,
                                              root->node->start);
                        btrfs_set_root_level(&root->root_item,
                                             btrfs_header_level(root->node));
                        btrfs_set_root_generation(&root->root_item,
                                                  root->root_key.offset);

                        err = btrfs_insert_root(trans, root->fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
                        if (err)
                                break;

                        refs = btrfs_root_refs(&dirty->root->root_item);
                        btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
                        err = btrfs_update_root(trans, root->fs_info->tree_root,
                                                &dirty->root->root_key,
                                                &dirty->root->root_item);

                        BUG_ON(err);
                        if (refs == 1) {
                                list_add(&dirty->list, list);
                        } else {
                                WARN_ON(1);
                                free_extent_buffer(dirty->root->node);
                                kfree(dirty->root);
                                kfree(dirty);
                        }
                }
        }
        return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
        struct btrfs_fs_info *info = root->fs_info;
        int ret;
        struct btrfs_trans_handle *trans;
        unsigned long nr;

        smp_mb();
        if (root->defrag_running)
                return 0;
        trans = btrfs_start_transaction(root, 1);
        while (1) {
                root->defrag_running = 1;
                ret = btrfs_defrag_leaves(trans, root, cacheonly);
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(info->tree_root, nr);
                cond_resched();

                trans = btrfs_start_transaction(root, 1);
                if (root->fs_info->closing || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
        smp_mb();
        btrfs_end_transaction(trans, root);
        return 0;
}
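
/*
 * Usage sketch (illustrative, not from the original file): passing
 * cacheonly == 1 restricts the defrag to btree blocks already cached in
 * memory, so a caller that must not trigger reads would use:
 *
 *	btrfs_defrag_root(root, 1);
 */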

/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
        DEFINE_WAIT(wait);

        mutex_lock(&info->trans_mutex);
        while (info->running_transaction &&
               info->running_transaction->delayed_refs.flushing) {
                prepare_to_wait(&info->transaction_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                mutex_unlock(&info->trans_mutex);
                schedule();
                mutex_lock(&info->trans_mutex);
                finish_wait(&info->transaction_wait, &wait);
        }
        mutex_unlock(&info->trans_mutex);
        return 0;
}

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                                     struct list_head *list)
{
        struct btrfs_dirty_root *dirty;
        struct btrfs_trans_handle *trans;
        unsigned long nr;
        u64 num_bytes;
        u64 bytes_used;
        u64 max_useless;
        int ret = 0;
        int err;

        while (!list_empty(list)) {
                struct btrfs_root *root;

                dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
                list_del_init(&dirty->list);

                num_bytes = btrfs_root_used(&dirty->root->root_item);
                root = dirty->latest_root;
                atomic_inc(&root->fs_info->throttles);

                while (1) {
                        /*
                         * we don't want to jump in and create a bunch of
                         * delayed refs if the transaction is starting to close
                         */
                        wait_transaction_pre_flush(tree_root->fs_info);
                        trans = btrfs_start_transaction(tree_root, 1);

                        /*
                         * we've joined a transaction, make sure it isn't
                         * closing right now
                         */
                        if (trans->transaction->delayed_refs.flushing) {
                                btrfs_end_transaction(trans, tree_root);
                                continue;
                        }

                        mutex_lock(&root->fs_info->drop_mutex);
                        ret = btrfs_drop_snapshot(trans, dirty->root);
                        if (ret != -EAGAIN)
                                break;
                        mutex_unlock(&root->fs_info->drop_mutex);

                        err = btrfs_update_root(trans,
                                        tree_root,
                                        &dirty->root->root_key,
                                        &dirty->root->root_item);
                        if (err)
                                ret = err;
                        nr = trans->blocks_used;
                        ret = btrfs_end_transaction(trans, tree_root);
                        BUG_ON(ret);

                        btrfs_btree_balance_dirty(tree_root, nr);
                        cond_resched();
                }
                BUG_ON(ret);
                atomic_dec(&root->fs_info->throttles);
                wake_up(&root->fs_info->transaction_throttle);

                num_bytes -= btrfs_root_used(&dirty->root->root_item);
                bytes_used = btrfs_root_used(&root->root_item);
                if (num_bytes) {
                        mutex_lock(&root->fs_info->trans_mutex);
                        btrfs_record_root_in_trans(root);
                        mutex_unlock(&root->fs_info->trans_mutex);
                        btrfs_set_root_used(&root->root_item,
                                            bytes_used - num_bytes);
                }

                ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
                if (ret) {
                        BUG();
                        break;
                }
                mutex_unlock(&root->fs_info->drop_mutex);

                spin_lock(&root->list_lock);
                list_del_init(&dirty->root->dead_list);
                if (!list_empty(&root->dead_list)) {
                        struct btrfs_root *oldest;
                        oldest = list_entry(root->dead_list.prev,
                                            struct btrfs_root, dead_list);
                        max_useless = oldest->root_key.offset - 1;
                } else {
                        max_useless = root->root_key.offset - 1;
                }
                spin_unlock(&root->list_lock);

                nr = trans->blocks_used;
                ret = btrfs_end_transaction(trans, tree_root);
                BUG_ON(ret);

                ret = btrfs_remove_leaf_refs(root, max_useless, 0);
                BUG_ON(ret);

                free_extent_buffer(dirty->root->node);
                kfree(dirty->root);
                kfree(dirty);

                btrfs_btree_balance_dirty(tree_root, nr);
                cond_resched();
        }
        return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                   struct btrfs_fs_info *fs_info,
                                   struct btrfs_pending_snapshot *pending)
{
        struct btrfs_key key;
        struct btrfs_root_item *new_root_item;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root = pending->root;
        struct extent_buffer *tmp;
        struct extent_buffer *old;
        int ret;
        u64 objectid;

        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
        if (!new_root_item) {
                ret = -ENOMEM;
                goto fail;
        }
        ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
        if (ret)
                goto fail;

        btrfs_record_root_in_trans(root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

        key.objectid = objectid;
        key.offset = trans->transid;
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

        old = btrfs_lock_root_node(root);
        btrfs_cow_block(trans, root, old, NULL, 0, &old);

        btrfs_copy_root(trans, root, old, &tmp, objectid);
        btrfs_tree_unlock(old);
        free_extent_buffer(old);

        btrfs_set_root_bytenr(new_root_item, tmp->start);
        btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
        btrfs_set_root_generation(new_root_item, trans->transid);
        ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
                                new_root_item);
        btrfs_tree_unlock(tmp);
        free_extent_buffer(tmp);
        if (ret)
                goto fail;

        key.offset = (u64)-1;
        memcpy(&pending->root_key, &key, sizeof(key));
fail:
        kfree(new_root_item);
        return ret;
}

static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
                                   struct btrfs_pending_snapshot *pending)
{
        int ret;
        int namelen;
        u64 index = 0;
        struct btrfs_trans_handle *trans;
        struct inode *parent_inode;
        struct inode *inode;
        struct btrfs_root *parent_root;

        parent_inode = pending->dentry->d_parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
        trans = btrfs_join_transaction(parent_root, 1);

        /*
         * insert the directory item
         */
        namelen = strlen(pending->name);
        ret = btrfs_set_inode_index(parent_inode, &index);
        ret = btrfs_insert_dir_item(trans, parent_root,
                            pending->name, namelen,
                            parent_inode->i_ino,
                            &pending->root_key, BTRFS_FT_DIR, index);

        if (ret)
                goto fail;

        btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);

        /* add the backref first */
        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
                                 pending->root_key.objectid,
                                 BTRFS_ROOT_BACKREF_KEY,
                                 parent_root->root_key.objectid,
                                 parent_inode->i_ino, index, pending->name,
                                 namelen);

        BUG_ON(ret);

        /* now add the forward ref */
        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
                                 parent_root->root_key.objectid,
                                 BTRFS_ROOT_REF_KEY,
                                 pending->root_key.objectid,
                                 parent_inode->i_ino, index, pending->name,
                                 namelen);

        inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
        d_instantiate(pending->dentry, inode);
fail:
        btrfs_end_transaction(trans, fs_info->fs_root);
        return ret;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
                                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_pending_snapshot *pending;
        struct list_head *head = &trans->transaction->pending_snapshots;
        int ret;

        list_for_each_entry(pending, head, list) {
                ret = create_pending_snapshot(trans, fs_info, pending);
                BUG_ON(ret);
        }
        return 0;
}

static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
                                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_pending_snapshot *pending;
        struct list_head *head = &trans->transaction->pending_snapshots;
        int ret;

        while (!list_empty(head)) {
                pending = list_entry(head->next,
                                     struct btrfs_pending_snapshot, list);
                ret = finish_pending_snapshot(fs_info, pending);
                BUG_ON(ret);
                list_del(&pending->list);
                kfree(pending->name);
                kfree(pending);
        }
        return 0;
}

int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
{
        unsigned long joined = 0;
        unsigned long timeout = 1;
        struct btrfs_transaction *cur_trans;
        struct btrfs_transaction *prev_trans = NULL;
        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
        struct list_head dirty_fs_roots;
        struct extent_io_tree *pinned_copy;
        DEFINE_WAIT(wait);
        int ret;
        int should_grow = 0;
        unsigned long now = get_seconds();
        int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

        btrfs_run_ordered_operations(root, 0);

        /* make a pass through all the delayed refs we have so far;
         * any running procs may add more while we are here
         */
        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);

        cur_trans = trans->transaction;
        /*
         * set the flushing flag so procs in this transaction have to
         * start sending their work down.
         */
        cur_trans->delayed_refs.flushing = 1;

        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);

        mutex_lock(&root->fs_info->trans_mutex);
        INIT_LIST_HEAD(&dirty_fs_roots);
        if (cur_trans->in_commit) {
                cur_trans->use_count++;
                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);

                ret = wait_for_commit(root, cur_trans);
                BUG_ON(ret);

                mutex_lock(&root->fs_info->trans_mutex);
                put_transaction(cur_trans);
                mutex_unlock(&root->fs_info->trans_mutex);

                return 0;
        }

        pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
        if (!pinned_copy)
                return -ENOMEM;

        extent_io_tree_init(pinned_copy,
                             root->fs_info->btree_inode->i_mapping, GFP_NOFS);

        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (!prev_trans->commit_done) {
                        prev_trans->use_count++;
                        mutex_unlock(&root->fs_info->trans_mutex);

                        wait_for_commit(root, prev_trans);

                        mutex_lock(&root->fs_info->trans_mutex);
                        put_transaction(prev_trans);
                }
        }

        if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
                should_grow = 1;

        do {
                int snap_pending = 0;
                joined = cur_trans->num_joined;
                if (!list_empty(&trans->transaction->pending_snapshots))
                        snap_pending = 1;

                WARN_ON(cur_trans != trans->transaction);
                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);

                if (cur_trans->num_writers > 1)
                        timeout = MAX_SCHEDULE_TIMEOUT;
                else if (should_grow)
                        timeout = 1;

                mutex_unlock(&root->fs_info->trans_mutex);

                if (flush_on_commit || snap_pending) {
                        if (flush_on_commit)
                                btrfs_start_delalloc_inodes(root);
                        ret = btrfs_wait_ordered_extents(root, 1);
                        BUG_ON(ret);
                }

                /*
                 * rename doesn't use btrfs_join_transaction, so once we
                 * set the transaction to blocked above, we aren't going
                 * to get any new ordered operations.  We can safely run
                 * it here and know for sure that nothing new will be
                 * added to the list
                 */
                btrfs_run_ordered_operations(root, 1);

                smp_mb();
                if (cur_trans->num_writers > 1 || should_grow)
                        schedule_timeout(timeout);

                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
        } while (cur_trans->num_writers > 1 ||
                 (should_grow && cur_trans->num_joined != joined));

        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);

        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        BUG_ON(ret);

        WARN_ON(cur_trans != trans->transaction);

        /* btrfs_commit_tree_roots is responsible for getting the
         * various roots consistent with each other.  Every pointer
         * in the tree of tree roots has to point to the most up to date
         * root for every subvolume and other tree.  So, we have to keep
         * the tree logging code from jumping in and changing any
         * of the trees.
         *
         * At this point in the commit, there can't be any tree-log
         * writers, but a little lower down we drop the trans mutex
         * and let new people in.  By holding the tree_log_mutex
         * from now until after the super is written, we avoid races
         * with the tree-log code.
         */
        mutex_lock(&root->fs_info->tree_log_mutex);
        /*
         * keep tree reloc code from adding new reloc trees
         */
        mutex_lock(&root->fs_info->tree_reloc_mutex);

        ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
                              &dirty_fs_roots);
        BUG_ON(ret);

        /* add_dirty_roots gets rid of all the tree log roots; it is now
         * safe to free the root of tree log roots
         */
        btrfs_free_log_root_tree(trans, root->fs_info);

        ret = btrfs_commit_tree_roots(trans, root);
        BUG_ON(ret);

        cur_trans = root->fs_info->running_transaction;
        spin_lock(&root->fs_info->new_trans_lock);
        root->fs_info->running_transaction = NULL;
        spin_unlock(&root->fs_info->new_trans_lock);
        btrfs_set_super_generation(&root->fs_info->super_copy,
                                   cur_trans->transid);
        btrfs_set_super_root(&root->fs_info->super_copy,
                             root->fs_info->tree_root->node->start);
        btrfs_set_super_root_level(&root->fs_info->super_copy,
                           btrfs_header_level(root->fs_info->tree_root->node));

        btrfs_set_super_chunk_root(&root->fs_info->super_copy,
                                   chunk_root->node->start);
        btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
                                         btrfs_header_level(chunk_root->node));
        btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
                                btrfs_header_generation(chunk_root->node));

        if (!root->fs_info->log_root_recovering) {
                btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
                btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
        }

        memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
               sizeof(root->fs_info->super_copy));

        btrfs_copy_pinned(root, pinned_copy);

        trans->transaction->blocked = 0;

        wake_up(&root->fs_info->transaction_throttle);
        wake_up(&root->fs_info->transaction_wait);

        mutex_unlock(&root->fs_info->trans_mutex);
        ret = btrfs_write_and_wait_transaction(trans, root);
        BUG_ON(ret);
        write_ctree_super(trans, root, 0);

        /*
         * the super is written; we can safely allow the tree-loggers
         * to go about their business
         */
        mutex_unlock(&root->fs_info->tree_log_mutex);

        btrfs_finish_extent_commit(trans, root, pinned_copy);
        kfree(pinned_copy);

        btrfs_drop_dead_reloc_roots(root);
        mutex_unlock(&root->fs_info->tree_reloc_mutex);

        /* do the directory inserts of any pending snapshot creations */
        finish_pending_snapshots(trans, root->fs_info);

        mutex_lock(&root->fs_info->trans_mutex);

        cur_trans->commit_done = 1;

        root->fs_info->last_trans_committed = cur_trans->transid;
        wake_up(&cur_trans->commit_wait);

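        /* drop two references: one for this handle (taken in
         * start_transaction) and the base reference installed when
         * join_transaction made this the running transaction
         */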
        put_transaction(cur_trans);
        put_transaction(cur_trans);

        list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
        if (root->fs_info->closing)
                list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);

        mutex_unlock(&root->fs_info->trans_mutex);

        kmem_cache_free(btrfs_trans_handle_cachep, trans);

        if (root->fs_info->closing)
                drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
        return ret;
}
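
/*
 * Commit sketch (illustrative, not from the original file):
 * btrfs_commit_transaction() consumes the handle (it frees trans
 * itself), so a full commit from a caller holding no locks is simply:
 *
 *	struct btrfs_trans_handle *trans;
 *	int ret;
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	...modify btree items under this handle...
 *	ret = btrfs_commit_transaction(trans, root);
 */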

/*
 * interface function to delete all the snapshots we have scheduled for
 * deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
        struct list_head dirty_roots;

        INIT_LIST_HEAD(&dirty_roots);
again:
        mutex_lock(&root->fs_info->trans_mutex);
        list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
        mutex_unlock(&root->fs_info->trans_mutex);

        if (!list_empty(&dirty_roots)) {
                drop_dirty_roots(root, &dirty_roots);
                goto again;
        }
        return 0;
}