Btrfs: Fix cow semantic in run_delalloc_nocow()
[linux-2.6] / fs / btrfs / inode.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/smp_lock.h>
30 #include <linux/backing-dev.h>
31 #include <linux/mpage.h>
32 #include <linux/swap.h>
33 #include <linux/writeback.h>
34 #include <linux/statfs.h>
35 #include <linux/compat.h>
36 #include <linux/bit_spinlock.h>
37 #include <linux/version.h>
38 #include <linux/xattr.h>
39 #include <linux/posix_acl.h>
40 #include <linux/falloc.h>
41 #include "compat.h"
42 #include "ctree.h"
43 #include "disk-io.h"
44 #include "transaction.h"
45 #include "btrfs_inode.h"
46 #include "ioctl.h"
47 #include "print-tree.h"
48 #include "volumes.h"
49 #include "ordered-data.h"
50 #include "xattr.h"
51 #include "tree-log.h"
52 #include "ref-cache.h"
53 #include "compression.h"
54
55 struct btrfs_iget_args {
56         u64 ino;
57         struct btrfs_root *root;
58 };
59
60 static struct inode_operations btrfs_dir_inode_operations;
61 static struct inode_operations btrfs_symlink_inode_operations;
62 static struct inode_operations btrfs_dir_ro_inode_operations;
63 static struct inode_operations btrfs_special_inode_operations;
64 static struct inode_operations btrfs_file_inode_operations;
65 static struct address_space_operations btrfs_aops;
66 static struct address_space_operations btrfs_symlink_aops;
67 static struct file_operations btrfs_dir_file_operations;
68 static struct extent_io_ops btrfs_extent_io_ops;
69
70 static struct kmem_cache *btrfs_inode_cachep;
71 struct kmem_cache *btrfs_trans_handle_cachep;
72 struct kmem_cache *btrfs_transaction_cachep;
73 struct kmem_cache *btrfs_bit_radix_cachep;
74 struct kmem_cache *btrfs_path_cachep;
75
76 #define S_SHIFT 12
77 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
78         [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
79         [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
80         [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
81         [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
82         [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
83         [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
84         [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
85 };
86
87 static void btrfs_truncate(struct inode *inode);
88 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89 static noinline int cow_file_range(struct inode *inode,
90                                    struct page *locked_page,
91                                    u64 start, u64 end, int *page_started,
92                                    unsigned long *nr_written, int unlock);
93
94 /*
95  * a very lame attempt at stopping writes when the FS is 85% full.  There
96  * are countless ways this is incorrect, but it is better than nothing.
97  */
98 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99                            int for_del)
100 {
101         u64 total;
102         u64 used;
103         u64 thresh;
104         unsigned long flags;
105         int ret = 0;
106
107         spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
108         total = btrfs_super_total_bytes(&root->fs_info->super_copy);
109         used = btrfs_super_bytes_used(&root->fs_info->super_copy);
110         if (for_del)
111                 thresh = total * 90;
112         else
113                 thresh = total * 85;
114
115         do_div(thresh, 100);
116
117         if (used + root->fs_info->delalloc_bytes + num_required > thresh)
118                 ret = -ENOSPC;
119         spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
120         return ret;
121 }
122
123 /*
124  * this does all the hard work for inserting an inline extent into
125  * the btree.  The caller should have done a btrfs_drop_extents so that
126  * no overlapping inline items exist in the btree
127  */
128 static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
129                                 struct btrfs_root *root, struct inode *inode,
130                                 u64 start, size_t size, size_t compressed_size,
131                                 struct page **compressed_pages)
132 {
133         struct btrfs_key key;
134         struct btrfs_path *path;
135         struct extent_buffer *leaf;
136         struct page *page = NULL;
137         char *kaddr;
138         unsigned long ptr;
139         struct btrfs_file_extent_item *ei;
140         int err = 0;
141         int ret;
142         size_t cur_size = size;
143         size_t datasize;
144         unsigned long offset;
145         int use_compress = 0;
146
147         if (compressed_size && compressed_pages) {
148                 use_compress = 1;
149                 cur_size = compressed_size;
150         }
151
152         path = btrfs_alloc_path(); if (!path)
153                 return -ENOMEM;
154
155         btrfs_set_trans_block_group(trans, inode);
156
157         key.objectid = inode->i_ino;
158         key.offset = start;
159         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
160         inode_add_bytes(inode, size);
161         datasize = btrfs_file_extent_calc_inline_size(cur_size);
162
163         inode_add_bytes(inode, size);
164         ret = btrfs_insert_empty_item(trans, root, path, &key,
165                                       datasize);
166         BUG_ON(ret);
167         if (ret) {
168                 err = ret;
169                 printk("got bad ret %d\n", ret);
170                 goto fail;
171         }
172         leaf = path->nodes[0];
173         ei = btrfs_item_ptr(leaf, path->slots[0],
174                             struct btrfs_file_extent_item);
175         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
176         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
177         btrfs_set_file_extent_encryption(leaf, ei, 0);
178         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
179         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
180         ptr = btrfs_file_extent_inline_start(ei);
181
182         if (use_compress) {
183                 struct page *cpage;
184                 int i = 0;
185                 while(compressed_size > 0) {
186                         cpage = compressed_pages[i];
187                         cur_size = min_t(unsigned long, compressed_size,
188                                        PAGE_CACHE_SIZE);
189
190                         kaddr = kmap(cpage);
191                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
192                         kunmap(cpage);
193
194                         i++;
195                         ptr += cur_size;
196                         compressed_size -= cur_size;
197                 }
198                 btrfs_set_file_extent_compression(leaf, ei,
199                                                   BTRFS_COMPRESS_ZLIB);
200         } else {
201                 page = find_get_page(inode->i_mapping,
202                                      start >> PAGE_CACHE_SHIFT);
203                 btrfs_set_file_extent_compression(leaf, ei, 0);
204                 kaddr = kmap_atomic(page, KM_USER0);
205                 offset = start & (PAGE_CACHE_SIZE - 1);
206                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
207                 kunmap_atomic(kaddr, KM_USER0);
208                 page_cache_release(page);
209         }
210         btrfs_mark_buffer_dirty(leaf);
211         btrfs_free_path(path);
212
213         BTRFS_I(inode)->disk_i_size = inode->i_size;
214         btrfs_update_inode(trans, root, inode);
215         return 0;
216 fail:
217         btrfs_free_path(path);
218         return err;
219 }
220
221
222 /*
223  * conditionally insert an inline extent into the file.  This
224  * does the checks required to make sure the data is small enough
225  * to fit as an inline extent.
226  */
227 static int cow_file_range_inline(struct btrfs_trans_handle *trans,
228                                  struct btrfs_root *root,
229                                  struct inode *inode, u64 start, u64 end,
230                                  size_t compressed_size,
231                                  struct page **compressed_pages)
232 {
233         u64 isize = i_size_read(inode);
234         u64 actual_end = min(end + 1, isize);
235         u64 inline_len = actual_end - start;
236         u64 aligned_end = (end + root->sectorsize - 1) &
237                         ~((u64)root->sectorsize - 1);
238         u64 hint_byte;
239         u64 data_len = inline_len;
240         int ret;
241
242         if (compressed_size)
243                 data_len = compressed_size;
244
245         if (start > 0 ||
246             actual_end >= PAGE_CACHE_SIZE ||
247             data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
248             (!compressed_size &&
249             (actual_end & (root->sectorsize - 1)) == 0) ||
250             end + 1 < isize ||
251             data_len > root->fs_info->max_inline) {
252                 return 1;
253         }
254
255         ret = btrfs_drop_extents(trans, root, inode, start,
256                                  aligned_end, start, &hint_byte);
257         BUG_ON(ret);
258
259         if (isize > actual_end)
260                 inline_len = min_t(u64, isize, actual_end);
261         ret = insert_inline_extent(trans, root, inode, start,
262                                    inline_len, compressed_size,
263                                    compressed_pages);
264         BUG_ON(ret);
265         btrfs_drop_extent_cache(inode, start, aligned_end, 0);
266         return 0;
267 }
268
269 struct async_extent {
270         u64 start;
271         u64 ram_size;
272         u64 compressed_size;
273         struct page **pages;
274         unsigned long nr_pages;
275         struct list_head list;
276 };
277
278 struct async_cow {
279         struct inode *inode;
280         struct btrfs_root *root;
281         struct page *locked_page;
282         u64 start;
283         u64 end;
284         struct list_head extents;
285         struct btrfs_work work;
286 };
287
288 static noinline int add_async_extent(struct async_cow *cow,
289                                      u64 start, u64 ram_size,
290                                      u64 compressed_size,
291                                      struct page **pages,
292                                      unsigned long nr_pages)
293 {
294         struct async_extent *async_extent;
295
296         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
297         async_extent->start = start;
298         async_extent->ram_size = ram_size;
299         async_extent->compressed_size = compressed_size;
300         async_extent->pages = pages;
301         async_extent->nr_pages = nr_pages;
302         list_add_tail(&async_extent->list, &cow->extents);
303         return 0;
304 }
305
306 /*
307  * we create compressed extents in two phases.  The first
308  * phase compresses a range of pages that have already been
309  * locked (both pages and state bits are locked).
310  *
311  * This is done inside an ordered work queue, and the compression
312  * is spread across many cpus.  The actual IO submission is step
313  * two, and the ordered work queue takes care of making sure that
314  * happens in the same order things were put onto the queue by
315  * writepages and friends.
316  *
317  * If this code finds it can't get good compression, it puts an
318  * entry onto the work queue to write the uncompressed bytes.  This
319  * makes sure that both compressed inodes and uncompressed inodes
320  * are written in the same order that pdflush sent them down.
321  */
322 static noinline int compress_file_range(struct inode *inode,
323                                         struct page *locked_page,
324                                         u64 start, u64 end,
325                                         struct async_cow *async_cow,
326                                         int *num_added)
327 {
328         struct btrfs_root *root = BTRFS_I(inode)->root;
329         struct btrfs_trans_handle *trans;
330         u64 num_bytes;
331         u64 orig_start;
332         u64 disk_num_bytes;
333         u64 blocksize = root->sectorsize;
334         u64 actual_end;
335         int ret = 0;
336         struct page **pages = NULL;
337         unsigned long nr_pages;
338         unsigned long nr_pages_ret = 0;
339         unsigned long total_compressed = 0;
340         unsigned long total_in = 0;
341         unsigned long max_compressed = 128 * 1024;
342         unsigned long max_uncompressed = 128 * 1024;
343         int i;
344         int will_compress;
345
346         orig_start = start;
347
348 again:
349         will_compress = 0;
350         nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351         nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352
353         actual_end = min_t(u64, i_size_read(inode), end + 1);
354         total_compressed = actual_end - start;
355
356         /* we want to make sure that amount of ram required to uncompress
357          * an extent is reasonable, so we limit the total size in ram
358          * of a compressed extent to 128k.  This is a crucial number
359          * because it also controls how easily we can spread reads across
360          * cpus for decompression.
361          *
362          * We also want to make sure the amount of IO required to do
363          * a random read is reasonably small, so we limit the size of
364          * a compressed extent to 128k.
365          */
366         total_compressed = min(total_compressed, max_uncompressed);
367         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
368         num_bytes = max(blocksize,  num_bytes);
369         disk_num_bytes = num_bytes;
370         total_in = 0;
371         ret = 0;
372
373         /*
374          * we do compression for mount -o compress and when the
375          * inode has not been flagged as nocompress.  This flag can
376          * change at any time if we discover bad compression ratios.
377          */
378         if (!btrfs_test_flag(inode, NOCOMPRESS) &&
379             btrfs_test_opt(root, COMPRESS)) {
380                 WARN_ON(pages);
381                 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
382
383                 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
384                                                 total_compressed, pages,
385                                                 nr_pages, &nr_pages_ret,
386                                                 &total_in,
387                                                 &total_compressed,
388                                                 max_compressed);
389
390                 if (!ret) {
391                         unsigned long offset = total_compressed &
392                                 (PAGE_CACHE_SIZE - 1);
393                         struct page *page = pages[nr_pages_ret - 1];
394                         char *kaddr;
395
396                         /* zero the tail end of the last page, we might be
397                          * sending it down to disk
398                          */
399                         if (offset) {
400                                 kaddr = kmap_atomic(page, KM_USER0);
401                                 memset(kaddr + offset, 0,
402                                        PAGE_CACHE_SIZE - offset);
403                                 kunmap_atomic(kaddr, KM_USER0);
404                         }
405                         will_compress = 1;
406                 }
407         }
408         if (start == 0) {
409                 trans = btrfs_join_transaction(root, 1);
410                 BUG_ON(!trans);
411                 btrfs_set_trans_block_group(trans, inode);
412
413                 /* lets try to make an inline extent */
414                 if (ret || total_in < (actual_end - start)) {
415                         /* we didn't compress the entire range, try
416                          * to make an uncompressed inline extent.
417                          */
418                         ret = cow_file_range_inline(trans, root, inode,
419                                                     start, end, 0, NULL);
420                 } else {
421                         /* try making a compressed inline extent */
422                         ret = cow_file_range_inline(trans, root, inode,
423                                                     start, end,
424                                                     total_compressed, pages);
425                 }
426                 btrfs_end_transaction(trans, root);
427                 if (ret == 0) {
428                         /*
429                          * inline extent creation worked, we don't need
430                          * to create any more async work items.  Unlock
431                          * and free up our temp pages.
432                          */
433                         extent_clear_unlock_delalloc(inode,
434                                                      &BTRFS_I(inode)->io_tree,
435                                                      start, end, NULL, 1, 0,
436                                                      0, 1, 1, 1);
437                         ret = 0;
438                         goto free_pages_out;
439                 }
440         }
441
442         if (will_compress) {
443                 /*
444                  * we aren't doing an inline extent round the compressed size
445                  * up to a block size boundary so the allocator does sane
446                  * things
447                  */
448                 total_compressed = (total_compressed + blocksize - 1) &
449                         ~(blocksize - 1);
450
451                 /*
452                  * one last check to make sure the compression is really a
453                  * win, compare the page count read with the blocks on disk
454                  */
455                 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
456                         ~(PAGE_CACHE_SIZE - 1);
457                 if (total_compressed >= total_in) {
458                         will_compress = 0;
459                 } else {
460                         disk_num_bytes = total_compressed;
461                         num_bytes = total_in;
462                 }
463         }
464         if (!will_compress && pages) {
465                 /*
466                  * the compression code ran but failed to make things smaller,
467                  * free any pages it allocated and our page pointer array
468                  */
469                 for (i = 0; i < nr_pages_ret; i++) {
470                         WARN_ON(pages[i]->mapping);
471                         page_cache_release(pages[i]);
472                 }
473                 kfree(pages);
474                 pages = NULL;
475                 total_compressed = 0;
476                 nr_pages_ret = 0;
477
478                 /* flag the file so we don't compress in the future */
479                 btrfs_set_flag(inode, NOCOMPRESS);
480         }
481         if (will_compress) {
482                 *num_added += 1;
483
484                 /* the async work queues will take care of doing actual
485                  * allocation on disk for these compressed pages,
486                  * and will submit them to the elevator.
487                  */
488                 add_async_extent(async_cow, start, num_bytes,
489                                  total_compressed, pages, nr_pages_ret);
490
491                 if (start + num_bytes < end) {
492                         start += num_bytes;
493                         pages = NULL;
494                         cond_resched();
495                         goto again;
496                 }
497         } else {
498                 /*
499                  * No compression, but we still need to write the pages in
500                  * the file we've been given so far.  redirty the locked
501                  * page if it corresponds to our extent and set things up
502                  * for the async work queue to run cow_file_range to do
503                  * the normal delalloc dance
504                  */
505                 if (page_offset(locked_page) >= start &&
506                     page_offset(locked_page) <= end) {
507                         __set_page_dirty_nobuffers(locked_page);
508                         /* unlocked later on in the async handlers */
509                 }
510                 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
511                 *num_added += 1;
512         }
513
514 out:
515         return 0;
516
517 free_pages_out:
518         for (i = 0; i < nr_pages_ret; i++) {
519                 WARN_ON(pages[i]->mapping);
520                 page_cache_release(pages[i]);
521         }
522         if (pages)
523                 kfree(pages);
524
525         goto out;
526 }
527
528 /*
529  * phase two of compressed writeback.  This is the ordered portion
530  * of the code, which only gets called in the order the work was
531  * queued.  We walk all the async extents created by compress_file_range
532  * and send them down to the disk.
533  */
534 static noinline int submit_compressed_extents(struct inode *inode,
535                                               struct async_cow *async_cow)
536 {
537         struct async_extent *async_extent;
538         u64 alloc_hint = 0;
539         struct btrfs_trans_handle *trans;
540         struct btrfs_key ins;
541         struct extent_map *em;
542         struct btrfs_root *root = BTRFS_I(inode)->root;
543         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
544         struct extent_io_tree *io_tree;
545         int ret;
546
547         if (list_empty(&async_cow->extents))
548                 return 0;
549
550         trans = btrfs_join_transaction(root, 1);
551
552         while(!list_empty(&async_cow->extents)) {
553                 async_extent = list_entry(async_cow->extents.next,
554                                           struct async_extent, list);
555                 list_del(&async_extent->list);
556
557                 io_tree = &BTRFS_I(inode)->io_tree;
558
559                 /* did the compression code fall back to uncompressed IO? */
560                 if (!async_extent->pages) {
561                         int page_started = 0;
562                         unsigned long nr_written = 0;
563
564                         lock_extent(io_tree, async_extent->start,
565                                     async_extent->start + async_extent->ram_size - 1,
566                                     GFP_NOFS);
567
568                         /* allocate blocks */
569                         cow_file_range(inode, async_cow->locked_page,
570                                        async_extent->start,
571                                        async_extent->start +
572                                        async_extent->ram_size - 1,
573                                        &page_started, &nr_written, 0);
574
575                         /*
576                          * if page_started, cow_file_range inserted an
577                          * inline extent and took care of all the unlocking
578                          * and IO for us.  Otherwise, we need to submit
579                          * all those pages down to the drive.
580                          */
581                         if (!page_started)
582                                 extent_write_locked_range(io_tree,
583                                                   inode, async_extent->start,
584                                                   async_extent->start +
585                                                   async_extent->ram_size - 1,
586                                                   btrfs_get_extent,
587                                                   WB_SYNC_ALL);
588                         kfree(async_extent);
589                         cond_resched();
590                         continue;
591                 }
592
593                 lock_extent(io_tree, async_extent->start,
594                             async_extent->start + async_extent->ram_size - 1,
595                             GFP_NOFS);
596                 /*
597                  * here we're doing allocation and writeback of the
598                  * compressed pages
599                  */
600                 btrfs_drop_extent_cache(inode, async_extent->start,
601                                         async_extent->start +
602                                         async_extent->ram_size - 1, 0);
603
604                 ret = btrfs_reserve_extent(trans, root,
605                                            async_extent->compressed_size,
606                                            async_extent->compressed_size,
607                                            0, alloc_hint,
608                                            (u64)-1, &ins, 1);
609                 BUG_ON(ret);
610                 em = alloc_extent_map(GFP_NOFS);
611                 em->start = async_extent->start;
612                 em->len = async_extent->ram_size;
613                 em->orig_start = em->start;
614
615                 em->block_start = ins.objectid;
616                 em->block_len = ins.offset;
617                 em->bdev = root->fs_info->fs_devices->latest_bdev;
618                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
619                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
620
621                 while(1) {
622                         spin_lock(&em_tree->lock);
623                         ret = add_extent_mapping(em_tree, em);
624                         spin_unlock(&em_tree->lock);
625                         if (ret != -EEXIST) {
626                                 free_extent_map(em);
627                                 break;
628                         }
629                         btrfs_drop_extent_cache(inode, async_extent->start,
630                                                 async_extent->start +
631                                                 async_extent->ram_size - 1, 0);
632                 }
633
634                 ret = btrfs_add_ordered_extent(inode, async_extent->start,
635                                                ins.objectid,
636                                                async_extent->ram_size,
637                                                ins.offset,
638                                                BTRFS_ORDERED_COMPRESSED);
639                 BUG_ON(ret);
640
641                 btrfs_end_transaction(trans, root);
642
643                 /*
644                  * clear dirty, set writeback and unlock the pages.
645                  */
646                 extent_clear_unlock_delalloc(inode,
647                                              &BTRFS_I(inode)->io_tree,
648                                              async_extent->start,
649                                              async_extent->start +
650                                              async_extent->ram_size - 1,
651                                              NULL, 1, 1, 0, 1, 1, 0);
652
653                 ret = btrfs_submit_compressed_write(inode,
654                                          async_extent->start,
655                                          async_extent->ram_size,
656                                          ins.objectid,
657                                          ins.offset, async_extent->pages,
658                                          async_extent->nr_pages);
659
660                 BUG_ON(ret);
661                 trans = btrfs_join_transaction(root, 1);
662                 alloc_hint = ins.objectid + ins.offset;
663                 kfree(async_extent);
664                 cond_resched();
665         }
666
667         btrfs_end_transaction(trans, root);
668         return 0;
669 }
670
671 /*
672  * when extent_io.c finds a delayed allocation range in the file,
673  * the call backs end up in this code.  The basic idea is to
674  * allocate extents on disk for the range, and create ordered data structs
675  * in ram to track those extents.
676  *
677  * locked_page is the page that writepage had locked already.  We use
678  * it to make sure we don't do extra locks or unlocks.
679  *
680  * *page_started is set to one if we unlock locked_page and do everything
681  * required to start IO on it.  It may be clean and already done with
682  * IO when we return.
683  */
684 static noinline int cow_file_range(struct inode *inode,
685                                    struct page *locked_page,
686                                    u64 start, u64 end, int *page_started,
687                                    unsigned long *nr_written,
688                                    int unlock)
689 {
690         struct btrfs_root *root = BTRFS_I(inode)->root;
691         struct btrfs_trans_handle *trans;
692         u64 alloc_hint = 0;
693         u64 num_bytes;
694         unsigned long ram_size;
695         u64 disk_num_bytes;
696         u64 cur_alloc_size;
697         u64 blocksize = root->sectorsize;
698         u64 actual_end;
699         struct btrfs_key ins;
700         struct extent_map *em;
701         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
702         int ret = 0;
703
704         trans = btrfs_join_transaction(root, 1);
705         BUG_ON(!trans);
706         btrfs_set_trans_block_group(trans, inode);
707
708         actual_end = min_t(u64, i_size_read(inode), end + 1);
709
710         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
711         num_bytes = max(blocksize,  num_bytes);
712         disk_num_bytes = num_bytes;
713         ret = 0;
714
715         if (start == 0) {
716                 /* lets try to make an inline extent */
717                 ret = cow_file_range_inline(trans, root, inode,
718                                             start, end, 0, NULL);
719                 if (ret == 0) {
720                         extent_clear_unlock_delalloc(inode,
721                                                      &BTRFS_I(inode)->io_tree,
722                                                      start, end, NULL, 1, 1,
723                                                      1, 1, 1, 1);
724                         *nr_written = *nr_written +
725                              (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
726                         *page_started = 1;
727                         ret = 0;
728                         goto out;
729                 }
730         }
731
732         BUG_ON(disk_num_bytes >
733                btrfs_super_total_bytes(&root->fs_info->super_copy));
734
735         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
736
737         while(disk_num_bytes > 0) {
738                 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
739                 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
740                                            root->sectorsize, 0, alloc_hint,
741                                            (u64)-1, &ins, 1);
742                 if (ret) {
743                         BUG();
744                 }
745                 em = alloc_extent_map(GFP_NOFS);
746                 em->start = start;
747                 em->orig_start = em->start;
748
749                 ram_size = ins.offset;
750                 em->len = ins.offset;
751
752                 em->block_start = ins.objectid;
753                 em->block_len = ins.offset;
754                 em->bdev = root->fs_info->fs_devices->latest_bdev;
755                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
756
757                 while(1) {
758                         spin_lock(&em_tree->lock);
759                         ret = add_extent_mapping(em_tree, em);
760                         spin_unlock(&em_tree->lock);
761                         if (ret != -EEXIST) {
762                                 free_extent_map(em);
763                                 break;
764                         }
765                         btrfs_drop_extent_cache(inode, start,
766                                                 start + ram_size - 1, 0);
767                 }
768
769                 cur_alloc_size = ins.offset;
770                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
771                                                ram_size, cur_alloc_size, 0);
772                 BUG_ON(ret);
773
774                 if (disk_num_bytes < cur_alloc_size) {
775                         printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
776                                cur_alloc_size);
777                         break;
778                 }
779                 /* we're not doing compressed IO, don't unlock the first
780                  * page (which the caller expects to stay locked), don't
781                  * clear any dirty bits and don't set any writeback bits
782                  */
783                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
784                                              start, start + ram_size - 1,
785                                              locked_page, unlock, 1,
786                                              1, 0, 0, 0);
787                 disk_num_bytes -= cur_alloc_size;
788                 num_bytes -= cur_alloc_size;
789                 alloc_hint = ins.objectid + ins.offset;
790                 start += cur_alloc_size;
791         }
792 out:
793         ret = 0;
794         btrfs_end_transaction(trans, root);
795
796         return ret;
797 }
798
799 /*
800  * work queue call back to started compression on a file and pages
801  */
802 static noinline void async_cow_start(struct btrfs_work *work)
803 {
804         struct async_cow *async_cow;
805         int num_added = 0;
806         async_cow = container_of(work, struct async_cow, work);
807
808         compress_file_range(async_cow->inode, async_cow->locked_page,
809                             async_cow->start, async_cow->end, async_cow,
810                             &num_added);
811         if (num_added == 0)
812                 async_cow->inode = NULL;
813 }
814
815 /*
816  * work queue call back to submit previously compressed pages
817  */
818 static noinline void async_cow_submit(struct btrfs_work *work)
819 {
820         struct async_cow *async_cow;
821         struct btrfs_root *root;
822         unsigned long nr_pages;
823
824         async_cow = container_of(work, struct async_cow, work);
825
826         root = async_cow->root;
827         nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
828                 PAGE_CACHE_SHIFT;
829
830         atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
831
832         if (atomic_read(&root->fs_info->async_delalloc_pages) <
833             5 * 1042 * 1024 &&
834             waitqueue_active(&root->fs_info->async_submit_wait))
835                 wake_up(&root->fs_info->async_submit_wait);
836
837         if (async_cow->inode) {
838                 submit_compressed_extents(async_cow->inode, async_cow);
839         }
840 }
841
842 static noinline void async_cow_free(struct btrfs_work *work)
843 {
844         struct async_cow *async_cow;
845         async_cow = container_of(work, struct async_cow, work);
846         kfree(async_cow);
847 }
848
849 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
850                                 u64 start, u64 end, int *page_started,
851                                 unsigned long *nr_written)
852 {
853         struct async_cow *async_cow;
854         struct btrfs_root *root = BTRFS_I(inode)->root;
855         unsigned long nr_pages;
856         u64 cur_end;
857         int limit = 10 * 1024 * 1042;
858
859         if (!btrfs_test_opt(root, COMPRESS)) {
860                 return cow_file_range(inode, locked_page, start, end,
861                                       page_started, nr_written, 1);
862         }
863
864         clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
865                          EXTENT_DELALLOC, 1, 0, GFP_NOFS);
866         while(start < end) {
867                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
868                 async_cow->inode = inode;
869                 async_cow->root = root;
870                 async_cow->locked_page = locked_page;
871                 async_cow->start = start;
872
873                 if (btrfs_test_flag(inode, NOCOMPRESS))
874                         cur_end = end;
875                 else
876                         cur_end = min(end, start + 512 * 1024 - 1);
877
878                 async_cow->end = cur_end;
879                 INIT_LIST_HEAD(&async_cow->extents);
880
881                 async_cow->work.func = async_cow_start;
882                 async_cow->work.ordered_func = async_cow_submit;
883                 async_cow->work.ordered_free = async_cow_free;
884                 async_cow->work.flags = 0;
885
886                 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
887                         PAGE_CACHE_SHIFT;
888                 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
889
890                 btrfs_queue_worker(&root->fs_info->delalloc_workers,
891                                    &async_cow->work);
892
893                 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
894                         wait_event(root->fs_info->async_submit_wait,
895                            (atomic_read(&root->fs_info->async_delalloc_pages) <
896                             limit));
897                 }
898
899                 while(atomic_read(&root->fs_info->async_submit_draining) &&
900                       atomic_read(&root->fs_info->async_delalloc_pages)) {
901                         wait_event(root->fs_info->async_submit_wait,
902                           (atomic_read(&root->fs_info->async_delalloc_pages) ==
903                            0));
904                 }
905
906                 *nr_written += nr_pages;
907                 start = cur_end + 1;
908         }
909         *page_started = 1;
910         return 0;
911 }
912
913 /*
914  * when nowcow writeback call back.  This checks for snapshots or COW copies
915  * of the extents that exist in the file, and COWs the file as required.
916  *
917  * If no cow copies or snapshots exist, we write directly to the existing
918  * blocks on disk
919  */
920 static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
921                               u64 start, u64 end, int *page_started, int force,
922                               unsigned long *nr_written)
923 {
924         struct btrfs_root *root = BTRFS_I(inode)->root;
925         struct btrfs_trans_handle *trans;
926         struct extent_buffer *leaf;
927         struct btrfs_path *path;
928         struct btrfs_file_extent_item *fi;
929         struct btrfs_key found_key;
930         u64 cow_start;
931         u64 cur_offset;
932         u64 extent_end;
933         u64 disk_bytenr;
934         u64 num_bytes;
935         int extent_type;
936         int ret;
937         int type;
938         int nocow;
939         int check_prev = 1;
940
941         path = btrfs_alloc_path();
942         BUG_ON(!path);
943         trans = btrfs_join_transaction(root, 1);
944         BUG_ON(!trans);
945
946         cow_start = (u64)-1;
947         cur_offset = start;
948         while (1) {
949                 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
950                                                cur_offset, 0);
951                 BUG_ON(ret < 0);
952                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
953                         leaf = path->nodes[0];
954                         btrfs_item_key_to_cpu(leaf, &found_key,
955                                               path->slots[0] - 1);
956                         if (found_key.objectid == inode->i_ino &&
957                             found_key.type == BTRFS_EXTENT_DATA_KEY)
958                                 path->slots[0]--;
959                 }
960                 check_prev = 0;
961 next_slot:
962                 leaf = path->nodes[0];
963                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
964                         ret = btrfs_next_leaf(root, path);
965                         if (ret < 0)
966                                 BUG_ON(1);
967                         if (ret > 0)
968                                 break;
969                         leaf = path->nodes[0];
970                 }
971
972                 nocow = 0;
973                 disk_bytenr = 0;
974                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
975
976                 if (found_key.objectid > inode->i_ino ||
977                     found_key.type > BTRFS_EXTENT_DATA_KEY ||
978                     found_key.offset > end)
979                         break;
980
981                 if (found_key.offset > cur_offset) {
982                         extent_end = found_key.offset;
983                         goto out_check;
984                 }
985
986                 fi = btrfs_item_ptr(leaf, path->slots[0],
987                                     struct btrfs_file_extent_item);
988                 extent_type = btrfs_file_extent_type(leaf, fi);
989
990                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
991                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
992                         struct btrfs_block_group_cache *block_group;
993                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
994                         extent_end = found_key.offset +
995                                 btrfs_file_extent_num_bytes(leaf, fi);
996                         if (extent_end <= start) {
997                                 path->slots[0]++;
998                                 goto next_slot;
999                         }
1000                         if (btrfs_file_extent_compression(leaf, fi) ||
1001                             btrfs_file_extent_encryption(leaf, fi) ||
1002                             btrfs_file_extent_other_encoding(leaf, fi))
1003                                 goto out_check;
1004                         if (disk_bytenr == 0)
1005                                 goto out_check;
1006                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1007                                 goto out_check;
1008                         if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
1009                                 goto out_check;
1010                         block_group = btrfs_lookup_block_group(root->fs_info,
1011                                                                disk_bytenr);
1012                         if (!block_group || block_group->ro)
1013                                 goto out_check;
1014                         disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1015                         nocow = 1;
1016                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1017                         extent_end = found_key.offset +
1018                                 btrfs_file_extent_inline_len(leaf, fi);
1019                         extent_end = ALIGN(extent_end, root->sectorsize);
1020                 } else {
1021                         BUG_ON(1);
1022                 }
1023 out_check:
1024                 if (extent_end <= start) {
1025                         path->slots[0]++;
1026                         goto next_slot;
1027                 }
1028                 if (!nocow) {
1029                         if (cow_start == (u64)-1)
1030                                 cow_start = cur_offset;
1031                         cur_offset = extent_end;
1032                         if (cur_offset > end)
1033                                 break;
1034                         path->slots[0]++;
1035                         goto next_slot;
1036                 }
1037
1038                 btrfs_release_path(root, path);
1039                 if (cow_start != (u64)-1) {
1040                         ret = cow_file_range(inode, locked_page, cow_start,
1041                                         found_key.offset - 1, page_started,
1042                                         nr_written, 1);
1043                         BUG_ON(ret);
1044                         cow_start = (u64)-1;
1045                 }
1046
1047                 disk_bytenr += cur_offset - found_key.offset;
1048                 num_bytes = min(end + 1, extent_end) - cur_offset;
1049                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1050                         struct extent_map *em;
1051                         struct extent_map_tree *em_tree;
1052                         em_tree = &BTRFS_I(inode)->extent_tree;
1053                         em = alloc_extent_map(GFP_NOFS);
1054                         em->start = cur_offset;
1055                         em->orig_start = em->start;
1056                         em->len = num_bytes;
1057                         em->block_len = num_bytes;
1058                         em->block_start = disk_bytenr;
1059                         em->bdev = root->fs_info->fs_devices->latest_bdev;
1060                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
1061                         while (1) {
1062                                 spin_lock(&em_tree->lock);
1063                                 ret = add_extent_mapping(em_tree, em);
1064                                 spin_unlock(&em_tree->lock);
1065                                 if (ret != -EEXIST) {
1066                                         free_extent_map(em);
1067                                         break;
1068                                 }
1069                                 btrfs_drop_extent_cache(inode, em->start,
1070                                                 em->start + em->len - 1, 0);
1071                         }
1072                         type = BTRFS_ORDERED_PREALLOC;
1073                 } else {
1074                         type = BTRFS_ORDERED_NOCOW;
1075                 }
1076
1077                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1078                                                num_bytes, num_bytes, type);
1079                 BUG_ON(ret);
1080
1081                 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1082                                         cur_offset, cur_offset + num_bytes - 1,
1083                                         locked_page, 1, 1, 1, 0, 0, 0);
1084                 cur_offset = extent_end;
1085                 if (cur_offset > end)
1086                         break;
1087         }
1088         btrfs_release_path(root, path);
1089
1090         if (cur_offset <= end && cow_start == (u64)-1)
1091                 cow_start = cur_offset;
1092         if (cow_start != (u64)-1) {
1093                 ret = cow_file_range(inode, locked_page, cow_start, end,
1094                                      page_started, nr_written, 1);
1095                 BUG_ON(ret);
1096         }
1097
1098         ret = btrfs_end_transaction(trans, root);
1099         BUG_ON(ret);
1100         btrfs_free_path(path);
1101         return 0;
1102 }
1103
1104 /*
1105  * extent_io.c call back to do delayed allocation processing
1106  */
1107 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1108                               u64 start, u64 end, int *page_started,
1109                               unsigned long *nr_written)
1110 {
1111         struct btrfs_root *root = BTRFS_I(inode)->root;
1112         int ret;
1113
1114         if (btrfs_test_opt(root, NODATACOW) ||
1115             btrfs_test_flag(inode, NODATACOW))
1116                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1117                                         page_started, 1, nr_written);
1118         else if (btrfs_test_flag(inode, PREALLOC))
1119                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1120                                         page_started, 0, nr_written);
1121         else
1122                 ret = cow_file_range_async(inode, locked_page, start, end,
1123                                      page_started, nr_written);
1124
1125         return ret;
1126 }
1127
1128 /*
1129  * extent_io.c set_bit_hook, used to track delayed allocation
1130  * bytes in this file, and to maintain the list of inodes that
1131  * have pending delalloc work to be done.
1132  */
1133 int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1134                        unsigned long old, unsigned long bits)
1135 {
1136         unsigned long flags;
1137         if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1138                 struct btrfs_root *root = BTRFS_I(inode)->root;
1139                 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
1140                 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1141                 root->fs_info->delalloc_bytes += end - start + 1;
1142                 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1143                         list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1144                                       &root->fs_info->delalloc_inodes);
1145                 }
1146                 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
1147         }
1148         return 0;
1149 }
1150
1151 /*
1152  * extent_io.c clear_bit_hook, see set_bit_hook for why
1153  */
1154 int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1155                          unsigned long old, unsigned long bits)
1156 {
1157         if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1158                 struct btrfs_root *root = BTRFS_I(inode)->root;
1159                 unsigned long flags;
1160
1161                 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
1162                 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1163                         printk("warning: delalloc account %Lu %Lu\n",
1164                                end - start + 1, root->fs_info->delalloc_bytes);
1165                         root->fs_info->delalloc_bytes = 0;
1166                         BTRFS_I(inode)->delalloc_bytes = 0;
1167                 } else {
1168                         root->fs_info->delalloc_bytes -= end - start + 1;
1169                         BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1170                 }
1171                 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1172                     !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1173                         list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1174                 }
1175                 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
1176         }
1177         return 0;
1178 }
1179
1180 /*
1181  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1182  * we don't create bios that span stripes or chunks
1183  */
1184 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1185                          size_t size, struct bio *bio,
1186                          unsigned long bio_flags)
1187 {
1188         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1189         struct btrfs_mapping_tree *map_tree;
1190         u64 logical = (u64)bio->bi_sector << 9;
1191         u64 length = 0;
1192         u64 map_length;
1193         int ret;
1194
1195         if (bio_flags & EXTENT_BIO_COMPRESSED)
1196                 return 0;
1197
1198         length = bio->bi_size;
1199         map_tree = &root->fs_info->mapping_tree;
1200         map_length = length;
1201         ret = btrfs_map_block(map_tree, READ, logical,
1202                               &map_length, NULL, 0);
1203
1204         if (map_length < length + size) {
1205                 return 1;
1206         }
1207         return 0;
1208 }
1209
1210 /*
1211  * in order to insert checksums into the metadata in large chunks,
1212  * we wait until bio submission time.   All the pages in the bio are
1213  * checksummed and sums are attached onto the ordered extent record.
1214  *
1215  * At IO completion time the cums attached on the ordered extent record
1216  * are inserted into the btree
1217  */
1218 int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
1219                           int mirror_num, unsigned long bio_flags)
1220 {
1221         struct btrfs_root *root = BTRFS_I(inode)->root;
1222         int ret = 0;
1223
1224         ret = btrfs_csum_one_bio(root, inode, bio);
1225         BUG_ON(ret);
1226         return 0;
1227 }
1228
1229 /*
1230  * in order to insert checksums into the metadata in large chunks,
1231  * we wait until bio submission time.   All the pages in the bio are
1232  * checksummed and sums are attached onto the ordered extent record.
1233  *
1234  * At IO completion time the cums attached on the ordered extent record
1235  * are inserted into the btree
1236  */
1237 int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1238                           int mirror_num, unsigned long bio_flags)
1239 {
1240         struct btrfs_root *root = BTRFS_I(inode)->root;
1241         return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1242 }
1243
1244 /*
1245  * extent_io.c submission hook. This does the right thing for csum calculation on write,
1246  * or reading the csums from the tree before a read
1247  */
1248 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1249                           int mirror_num, unsigned long bio_flags)
1250 {
1251         struct btrfs_root *root = BTRFS_I(inode)->root;
1252         int ret = 0;
1253         int skip_sum;
1254
1255         ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1256         BUG_ON(ret);
1257
1258         skip_sum = btrfs_test_opt(root, NODATASUM) ||
1259                 btrfs_test_flag(inode, NODATASUM);
1260
1261         if (!(rw & (1 << BIO_RW))) {
1262
1263                 if (bio_flags & EXTENT_BIO_COMPRESSED)
1264                         return btrfs_submit_compressed_read(inode, bio,
1265                                                     mirror_num, bio_flags);
1266                 else if (!skip_sum)
1267                         btrfs_lookup_bio_sums(root, inode, bio);
1268                 goto mapit;
1269         } else if (!skip_sum) {
1270                 /* we're doing a write, do the async checksumming */
1271                 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1272                                    inode, rw, bio, mirror_num,
1273                                    bio_flags, __btrfs_submit_bio_start,
1274                                    __btrfs_submit_bio_done);
1275         }
1276
1277 mapit:
1278         return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1279 }
1280
1281 /*
1282  * given a list of ordered sums record them in the inode.  This happens
1283  * at IO completion time based on sums calculated at bio submission time.
1284  */
1285 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1286                              struct inode *inode, u64 file_offset,
1287                              struct list_head *list)
1288 {
1289         struct list_head *cur;
1290         struct btrfs_ordered_sum *sum;
1291
1292         btrfs_set_trans_block_group(trans, inode);
1293         list_for_each(cur, list) {
1294                 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1295                 btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
1296                                        inode, sum);
1297         }
1298         return 0;
1299 }
1300
1301 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1302 {
1303         if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
1304                 WARN_ON(1);
1305         }
1306         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1307                                    GFP_NOFS);
1308 }
1309
1310 /* see btrfs_writepage_start_hook for details on why this is required */
1311 struct btrfs_writepage_fixup {
1312         struct page *page;
1313         struct btrfs_work work;
1314 };
1315
1316 void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1317 {
1318         struct btrfs_writepage_fixup *fixup;
1319         struct btrfs_ordered_extent *ordered;
1320         struct page *page;
1321         struct inode *inode;
1322         u64 page_start;
1323         u64 page_end;
1324
1325         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1326         page = fixup->page;
1327 again:
1328         lock_page(page);
1329         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1330                 ClearPageChecked(page);
1331                 goto out_page;
1332         }
1333
1334         inode = page->mapping->host;
1335         page_start = page_offset(page);
1336         page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1337
1338         lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1339
1340         /* already ordered? We're done */
1341         if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1342                              EXTENT_ORDERED, 0)) {
1343                 goto out;
1344         }
1345
1346         ordered = btrfs_lookup_ordered_extent(inode, page_start);
1347         if (ordered) {
1348                 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1349                               page_end, GFP_NOFS);
1350                 unlock_page(page);
1351                 btrfs_start_ordered_extent(inode, ordered, 1);
1352                 goto again;
1353         }
1354
1355         btrfs_set_extent_delalloc(inode, page_start, page_end);
1356         ClearPageChecked(page);
1357 out:
1358         unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1359 out_page:
1360         unlock_page(page);
1361         page_cache_release(page);
1362 }
1363
1364 /*
1365  * There are a few paths in the higher layers of the kernel that directly
1366  * set the page dirty bit without asking the filesystem if it is a
1367  * good idea.  This causes problems because we want to make sure COW
1368  * properly happens and the data=ordered rules are followed.
1369  *
1370  * In our case any range that doesn't have the ORDERED bit set
1371  * hasn't been properly setup for IO.  We kick off an async process
1372  * to fix it up.  The async helper will wait for ordered extents, set
1373  * the delalloc bit and make it safe to write the page.
1374  */
1375 int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1376 {
1377         struct inode *inode = page->mapping->host;
1378         struct btrfs_writepage_fixup *fixup;
1379         struct btrfs_root *root = BTRFS_I(inode)->root;
1380         int ret;
1381
1382         ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1383                              EXTENT_ORDERED, 0);
1384         if (ret)
1385                 return 0;
1386
1387         if (PageChecked(page))
1388                 return -EAGAIN;
1389
1390         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1391         if (!fixup)
1392                 return -EAGAIN;
1393
1394         SetPageChecked(page);
1395         page_cache_get(page);
1396         fixup->work.func = btrfs_writepage_fixup_worker;
1397         fixup->page = page;
1398         btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1399         return -EAGAIN;
1400 }
1401
1402 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1403                                        struct inode *inode, u64 file_pos,
1404                                        u64 disk_bytenr, u64 disk_num_bytes,
1405                                        u64 num_bytes, u64 ram_bytes,
1406                                        u8 compression, u8 encryption,
1407                                        u16 other_encoding, int extent_type)
1408 {
1409         struct btrfs_root *root = BTRFS_I(inode)->root;
1410         struct btrfs_file_extent_item *fi;
1411         struct btrfs_path *path;
1412         struct extent_buffer *leaf;
1413         struct btrfs_key ins;
1414         u64 hint;
1415         int ret;
1416
1417         path = btrfs_alloc_path();
1418         BUG_ON(!path);
1419
1420         ret = btrfs_drop_extents(trans, root, inode, file_pos,
1421                                  file_pos + num_bytes, file_pos, &hint);
1422         BUG_ON(ret);
1423
1424         ins.objectid = inode->i_ino;
1425         ins.offset = file_pos;
1426         ins.type = BTRFS_EXTENT_DATA_KEY;
1427         ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1428         BUG_ON(ret);
1429         leaf = path->nodes[0];
1430         fi = btrfs_item_ptr(leaf, path->slots[0],
1431                             struct btrfs_file_extent_item);
1432         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1433         btrfs_set_file_extent_type(leaf, fi, extent_type);
1434         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1435         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1436         btrfs_set_file_extent_offset(leaf, fi, 0);
1437         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1438         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1439         btrfs_set_file_extent_compression(leaf, fi, compression);
1440         btrfs_set_file_extent_encryption(leaf, fi, encryption);
1441         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1442         btrfs_mark_buffer_dirty(leaf);
1443
1444         inode_add_bytes(inode, num_bytes);
1445         btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1446
1447         ins.objectid = disk_bytenr;
1448         ins.offset = disk_num_bytes;
1449         ins.type = BTRFS_EXTENT_ITEM_KEY;
1450         ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1451                                           root->root_key.objectid,
1452                                           trans->transid, inode->i_ino, &ins);
1453         BUG_ON(ret);
1454
1455         btrfs_free_path(path);
1456         return 0;
1457 }
1458
1459 /* as ordered data IO finishes, this gets called so we can finish
1460  * an ordered extent if the range of bytes in the file it covers are
1461  * fully written.
1462  */
1463 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1464 {
1465         struct btrfs_root *root = BTRFS_I(inode)->root;
1466         struct btrfs_trans_handle *trans;
1467         struct btrfs_ordered_extent *ordered_extent;
1468         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1469         int compressed = 0;
1470         int ret;
1471
1472         ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1473         if (!ret)
1474                 return 0;
1475
1476         trans = btrfs_join_transaction(root, 1);
1477
1478         ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1479         BUG_ON(!ordered_extent);
1480         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1481                 goto nocow;
1482
1483         lock_extent(io_tree, ordered_extent->file_offset,
1484                     ordered_extent->file_offset + ordered_extent->len - 1,
1485                     GFP_NOFS);
1486
1487         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1488                 compressed = 1;
1489         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1490                 BUG_ON(compressed);
1491                 ret = btrfs_mark_extent_written(trans, root, inode,
1492                                                 ordered_extent->file_offset,
1493                                                 ordered_extent->file_offset +
1494                                                 ordered_extent->len);
1495                 BUG_ON(ret);
1496         } else {
1497                 ret = insert_reserved_file_extent(trans, inode,
1498                                                 ordered_extent->file_offset,
1499                                                 ordered_extent->start,
1500                                                 ordered_extent->disk_len,
1501                                                 ordered_extent->len,
1502                                                 ordered_extent->len,
1503                                                 compressed, 0, 0,
1504                                                 BTRFS_FILE_EXTENT_REG);
1505                 BUG_ON(ret);
1506         }
1507         unlock_extent(io_tree, ordered_extent->file_offset,
1508                     ordered_extent->file_offset + ordered_extent->len - 1,
1509                     GFP_NOFS);
1510 nocow:
1511         add_pending_csums(trans, inode, ordered_extent->file_offset,
1512                           &ordered_extent->list);
1513
1514         mutex_lock(&BTRFS_I(inode)->extent_mutex);
1515         btrfs_ordered_update_i_size(inode, ordered_extent);
1516         btrfs_update_inode(trans, root, inode);
1517         btrfs_remove_ordered_extent(inode, ordered_extent);
1518         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1519
1520         /* once for us */
1521         btrfs_put_ordered_extent(ordered_extent);
1522         /* once for the tree */
1523         btrfs_put_ordered_extent(ordered_extent);
1524
1525         btrfs_end_transaction(trans, root);
1526         return 0;
1527 }
1528
1529 int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1530                                 struct extent_state *state, int uptodate)
1531 {
1532         return btrfs_finish_ordered_io(page->mapping->host, start, end);
1533 }
1534
1535 /*
1536  * When IO fails, either with EIO or csum verification fails, we
1537  * try other mirrors that might have a good copy of the data.  This
1538  * io_failure_record is used to record state as we go through all the
1539  * mirrors.  If another mirror has good data, the page is set up to date
1540  * and things continue.  If a good mirror can't be found, the original
1541  * bio end_io callback is called to indicate things have failed.
1542  */
1543 struct io_failure_record {
1544         struct page *page;      /* NOTE(review): btrfs_io_failed_hook never assigns this; confirm it is used */
1545         u64 start;              /* file offset of the first byte of the failed range */
1546         u64 len;                /* length of the failed range in bytes (end - start + 1) */
1547         u64 logical;            /* em->block_start plus the offset of start within the extent map */
1548         int last_mirror;        /* bumped on every retry and passed to submit_bio_hook; compared against num_copies */
1549 };
1550
1551 int btrfs_io_failed_hook(struct bio *failed_bio,
1552                          struct page *page, u64 start, u64 end,
1553                          struct extent_state *state)
1554 {
1555         struct io_failure_record *failrec = NULL;
1556         u64 private;
1557         struct extent_map *em;
1558         struct inode *inode = page->mapping->host;
1559         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1560         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1561         struct bio *bio;
1562         int num_copies;
1563         int ret;
1564         int rw;
1565         u64 logical;
1566         unsigned long bio_flags = 0;
1567
1568         ret = get_state_private(failure_tree, start, &private);
1569         if (ret) {
1570                 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1571                 if (!failrec)
1572                         return -ENOMEM;
1573                 failrec->start = start;
1574                 failrec->len = end - start + 1;
1575                 failrec->last_mirror = 0;
1576
1577                 spin_lock(&em_tree->lock);
1578                 em = lookup_extent_mapping(em_tree, start, failrec->len);
1579                 if (em->start > start || em->start + em->len < start) {
1580                         free_extent_map(em);
1581                         em = NULL;
1582                 }
1583                 spin_unlock(&em_tree->lock);
1584
1585                 if (!em || IS_ERR(em)) {
1586                         kfree(failrec);
1587                         return -EIO;
1588                 }
1589                 logical = start - em->start;
1590                 logical = em->block_start + logical;
1591                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1592                         bio_flags = EXTENT_BIO_COMPRESSED;
1593                 failrec->logical = logical;
1594                 free_extent_map(em);
1595                 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1596                                 EXTENT_DIRTY, GFP_NOFS);
1597                 set_state_private(failure_tree, start,
1598                                  (u64)(unsigned long)failrec);
1599         } else {
1600                 failrec = (struct io_failure_record *)(unsigned long)private;
1601         }
1602         num_copies = btrfs_num_copies(
1603                               &BTRFS_I(inode)->root->fs_info->mapping_tree,
1604                               failrec->logical, failrec->len);
1605         failrec->last_mirror++;
1606         if (!state) {
1607                 spin_lock_irq(&BTRFS_I(inode)->io_tree.lock);
1608                 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1609                                                     failrec->start,
1610                                                     EXTENT_LOCKED);
1611                 if (state && state->start != failrec->start)
1612                         state = NULL;
1613                 spin_unlock_irq(&BTRFS_I(inode)->io_tree.lock);
1614         }
1615         if (!state || failrec->last_mirror > num_copies) {
1616                 set_state_private(failure_tree, failrec->start, 0);
1617                 clear_extent_bits(failure_tree, failrec->start,
1618                                   failrec->start + failrec->len - 1,
1619                                   EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1620                 kfree(failrec);
1621                 return -EIO;
1622         }
1623         bio = bio_alloc(GFP_NOFS, 1);
1624         bio->bi_private = state;
1625         bio->bi_end_io = failed_bio->bi_end_io;
1626         bio->bi_sector = failrec->logical >> 9;
1627         bio->bi_bdev = failed_bio->bi_bdev;
1628         bio->bi_size = 0;
1629         bio_add_page(bio, page, failrec->len, start - page_offset(page));
1630         if (failed_bio->bi_rw & (1 << BIO_RW))
1631                 rw = WRITE;
1632         else
1633                 rw = READ;
1634
1635         BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1636                                                       failrec->last_mirror,
1637                                                       bio_flags);
1638         return 0;
1639 }
1640
1641 /*
1642  * each time an IO finishes, we do a fast check in the IO failure tree
1643  * to see if we need to process or clean up an io_failure_record
1644  */
1645 int btrfs_clean_io_failures(struct inode *inode, u64 start)
1646 {
1647         u64 private;
1648         u64 private_failure;
1649         struct io_failure_record *failure;
1650         int ret;
1651
1652         private = 0;
1653         if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1654                              (u64)-1, 1, EXTENT_DIRTY)) {
1655                 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1656                                         start, &private_failure);
1657                 if (ret == 0) {
1658                         failure = (struct io_failure_record *)(unsigned long)
1659                                    private_failure;
1660                         set_state_private(&BTRFS_I(inode)->io_failure_tree,
1661                                           failure->start, 0);
1662                         clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1663                                           failure->start,
1664                                           failure->start + failure->len - 1,
1665                                           EXTENT_DIRTY | EXTENT_LOCKED,
1666                                           GFP_NOFS);
1667                         kfree(failure);
1668                 }
1669         }
1670         return 0;
1671 }
1672
1673 /*
1674  * when reads are done, we need to check csums to verify the data is correct
1675  * if there's a match, we allow the bio to finish.  If not, we go through
1676  * the io_failure_record routines to find good copies
1677  */
1678 int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1679                                struct extent_state *state)
1680 {
1681         size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1682         struct inode *inode = page->mapping->host;
1683         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1684         char *kaddr;
1685         u64 private = ~(u32)0;
1686         int ret;
1687         struct btrfs_root *root = BTRFS_I(inode)->root;
1688         u32 csum = ~(u32)0;
1689         unsigned long flags;
1690
1691         if (btrfs_test_opt(root, NODATASUM) ||
1692             btrfs_test_flag(inode, NODATASUM))
1693                 return 0;
1694         if (state && state->start == start) {
1695                 private = state->private;
1696                 ret = 0;
1697         } else {
1698                 ret = get_state_private(io_tree, start, &private);
1699         }
1700         local_irq_save(flags);
1701         kaddr = kmap_atomic(page, KM_IRQ0);
1702         if (ret) {
1703                 goto zeroit;
1704         }
1705         csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
1706         btrfs_csum_final(csum, (char *)&csum);
1707         if (csum != private) {
1708                 goto zeroit;
1709         }
1710         kunmap_atomic(kaddr, KM_IRQ0);
1711         local_irq_restore(flags);
1712
1713         /* if the io failure tree for this inode is non-empty,
1714          * check to see if we've recovered from a failed IO
1715          */
1716         btrfs_clean_io_failures(inode, start);
1717         return 0;
1718
1719 zeroit:
1720         printk("btrfs csum failed ino %lu off %llu csum %u private %Lu\n",
1721                page->mapping->host->i_ino, (unsigned long long)start, csum,
1722                private);
1723         memset(kaddr + offset, 1, end - start + 1);
1724         flush_dcache_page(page);
1725         kunmap_atomic(kaddr, KM_IRQ0);
1726         local_irq_restore(flags);
1727         if (private == 0)
1728                 return 0;
1729         return -EIO;
1730 }
1731
1732 /*
1733  * This creates an orphan entry for the given inode in case something goes
1734  * wrong in the middle of an unlink/truncate.
1735  */
1736 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1737 {
1738         struct btrfs_root *root = BTRFS_I(inode)->root;
1739         int ret = 0;
1740
1741         spin_lock(&root->list_lock);
1742
1743         /* already on the orphan list, we're good */
1744         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1745                 spin_unlock(&root->list_lock);
1746                 return 0;
1747         }
1748
1749         list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1750
1751         spin_unlock(&root->list_lock);
1752
1753         /*
1754          * insert an orphan item to track this unlinked/truncated file
1755          */
1756         ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1757
1758         return ret;
1759 }
1760
1761 /*
1762  * We have done the truncate/delete so we can go ahead and remove the orphan
1763  * item for this particular inode.
1764  */
1765 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1766 {
1767         struct btrfs_root *root = BTRFS_I(inode)->root;
1768         int ret = 0;
1769
1770         spin_lock(&root->list_lock);
1771
1772         if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1773                 spin_unlock(&root->list_lock);
1774                 return 0;
1775         }
1776
1777         list_del_init(&BTRFS_I(inode)->i_orphan);
1778         if (!trans) {
1779                 spin_unlock(&root->list_lock);
1780                 return 0;
1781         }
1782
1783         spin_unlock(&root->list_lock);
1784
1785         ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1786
1787         return ret;
1788 }
1789
1790 /*
1791  * this cleans up any orphans that may be left on the list from the last use
1792  * of this root.
1793  */
1794 void btrfs_orphan_cleanup(struct btrfs_root *root)
1795 {
1796         struct btrfs_path *path;
1797         struct extent_buffer *leaf;
1798         struct btrfs_item *item;
1799         struct btrfs_key key, found_key;
1800         struct btrfs_trans_handle *trans;
1801         struct inode *inode;
1802         int ret = 0, nr_unlink = 0, nr_truncate = 0;
1803
1804         path = btrfs_alloc_path();
1805         if (!path)
1806                 return;
1807         path->reada = -1;
1808
1809         key.objectid = BTRFS_ORPHAN_OBJECTID;
1810         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1811         key.offset = (u64)-1;
1812
1813
1814         while (1) {
1815                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1816                 if (ret < 0) {
1817                         printk(KERN_ERR "Error searching slot for orphan: %d"
1818                                "\n", ret);
1819                         break;
1820                 }
1821
1822                 /*
1823                  * if ret == 0 means we found what we were searching for, which
1824                  * is weird, but possible, so only screw with path if we didnt
1825                  * find the key and see if we have stuff that matches
1826                  */
1827                 if (ret > 0) {
1828                         if (path->slots[0] == 0)
1829                                 break;
1830                         path->slots[0]--;
1831                 }
1832
1833                 /* pull out the item */
1834                 leaf = path->nodes[0];
1835                 item = btrfs_item_nr(leaf, path->slots[0]);
1836                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1837
1838                 /* make sure the item matches what we want */
1839                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1840                         break;
1841                 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1842                         break;
1843
1844                 /* release the path since we're done with it */
1845                 btrfs_release_path(root, path);
1846
1847                 /*
1848                  * this is where we are basically btrfs_lookup, without the
1849                  * crossing root thing.  we store the inode number in the
1850                  * offset of the orphan item.
1851                  */
1852                 inode = btrfs_iget_locked(root->fs_info->sb,
1853                                           found_key.offset, root);
1854                 if (!inode)
1855                         break;
1856
1857                 if (inode->i_state & I_NEW) {
1858                         BTRFS_I(inode)->root = root;
1859
1860                         /* have to set the location manually */
1861                         BTRFS_I(inode)->location.objectid = inode->i_ino;
1862                         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1863                         BTRFS_I(inode)->location.offset = 0;
1864
1865                         btrfs_read_locked_inode(inode);
1866                         unlock_new_inode(inode);
1867                 }
1868
1869                 /*
1870                  * add this inode to the orphan list so btrfs_orphan_del does
1871                  * the proper thing when we hit it
1872                  */
1873                 spin_lock(&root->list_lock);
1874                 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1875                 spin_unlock(&root->list_lock);
1876
1877                 /*
1878                  * if this is a bad inode, means we actually succeeded in
1879                  * removing the inode, but not the orphan record, which means
1880                  * we need to manually delete the orphan since iput will just
1881                  * do a destroy_inode
1882                  */
1883                 if (is_bad_inode(inode)) {
1884                         trans = btrfs_start_transaction(root, 1);
1885                         btrfs_orphan_del(trans, inode);
1886                         btrfs_end_transaction(trans, root);
1887                         iput(inode);
1888                         continue;
1889                 }
1890
1891                 /* if we have links, this was a truncate, lets do that */
1892                 if (inode->i_nlink) {
1893                         nr_truncate++;
1894                         btrfs_truncate(inode);
1895                 } else {
1896                         nr_unlink++;
1897                 }
1898
1899                 /* this will do delete_inode and everything for us */
1900                 iput(inode);
1901         }
1902
1903         if (nr_unlink)
1904                 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1905         if (nr_truncate)
1906                 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1907
1908         btrfs_free_path(path);
1909 }
1910
1911 /*
1912  * read an inode from the btree into the in-memory inode
1913  */
1914 void btrfs_read_locked_inode(struct inode *inode)
1915 {
1916         struct btrfs_path *path;
1917         struct extent_buffer *leaf;
1918         struct btrfs_inode_item *inode_item;
1919         struct btrfs_timespec *tspec;
1920         struct btrfs_root *root = BTRFS_I(inode)->root;
1921         struct btrfs_key location;
1922         u64 alloc_group_block;
1923         u32 rdev;
1924         int ret;
1925
1926         path = btrfs_alloc_path();
1927         BUG_ON(!path);
1928         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1929
1930         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1931         if (ret)
1932                 goto make_bad;
1933
1934         leaf = path->nodes[0];
1935         inode_item = btrfs_item_ptr(leaf, path->slots[0],
1936                                     struct btrfs_inode_item);
1937
1938         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1939         inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1940         inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1941         inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1942         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1943
1944         tspec = btrfs_inode_atime(inode_item);
1945         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1946         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1947
1948         tspec = btrfs_inode_mtime(inode_item);
1949         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1950         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1951
1952         tspec = btrfs_inode_ctime(inode_item);
1953         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1954         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1955
1956         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
1957         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
1958         inode->i_generation = BTRFS_I(inode)->generation;
1959         inode->i_rdev = 0;
1960         rdev = btrfs_inode_rdev(leaf, inode_item);
1961
1962         BTRFS_I(inode)->index_cnt = (u64)-1;
1963
1964         alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
1965         BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
1966                                                        alloc_group_block);
1967         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
1968         if (!BTRFS_I(inode)->block_group) {
1969                 BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
1970                                                  NULL, 0,
1971                                                  BTRFS_BLOCK_GROUP_METADATA, 0);
1972         }
1973         btrfs_free_path(path);
1974         inode_item = NULL;
1975
1976         switch (inode->i_mode & S_IFMT) {
1977         case S_IFREG:
1978                 inode->i_mapping->a_ops = &btrfs_aops;
1979                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
1980                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
1981                 inode->i_fop = &btrfs_file_operations;
1982                 inode->i_op = &btrfs_file_inode_operations;
1983                 break;
1984         case S_IFDIR:
1985                 inode->i_fop = &btrfs_dir_file_operations;
1986                 if (root == root->fs_info->tree_root)
1987                         inode->i_op = &btrfs_dir_ro_inode_operations;
1988                 else
1989                         inode->i_op = &btrfs_dir_inode_operations;
1990                 break;
1991         case S_IFLNK:
1992                 inode->i_op = &btrfs_symlink_inode_operations;
1993                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
1994                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
1995                 break;
1996         default:
1997                 init_special_inode(inode, inode->i_mode, rdev);
1998                 break;
1999         }
2000         return;
2001
2002 make_bad:
2003         btrfs_free_path(path);
2004         make_bad_inode(inode);
2005 }
2006
2007 /*
2008  * given a leaf and an inode, copy the inode fields into the leaf
2009  */
2010 static void fill_inode_item(struct btrfs_trans_handle *trans,
2011                             struct extent_buffer *leaf,
2012                             struct btrfs_inode_item *item,
2013                             struct inode *inode)
2014 {
2015         btrfs_set_inode_uid(leaf, item, inode->i_uid);
2016         btrfs_set_inode_gid(leaf, item, inode->i_gid);
2017         btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2018         btrfs_set_inode_mode(leaf, item, inode->i_mode);
2019         btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2020
2021         btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2022                                inode->i_atime.tv_sec);
2023         btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2024                                 inode->i_atime.tv_nsec);
2025
2026         btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2027                                inode->i_mtime.tv_sec);
2028         btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2029                                 inode->i_mtime.tv_nsec);
2030
2031         btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2032                                inode->i_ctime.tv_sec);
2033         btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2034                                 inode->i_ctime.tv_nsec);
2035
2036         btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2037         btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2038         btrfs_set_inode_transid(leaf, item, trans->transid);
2039         btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2040         btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2041         btrfs_set_inode_block_group(leaf, item,
2042                                     BTRFS_I(inode)->block_group->key.objectid);
2043 }
2044
2045 /*
2046  * copy everything in the in-memory inode into the btree.
2047  */
2048 int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
2049                               struct btrfs_root *root,
2050                               struct inode *inode)
2051 {
2052         struct btrfs_inode_item *inode_item;
2053         struct btrfs_path *path;
2054         struct extent_buffer *leaf;
2055         int ret;
2056
2057         path = btrfs_alloc_path();
2058         BUG_ON(!path);
2059         ret = btrfs_lookup_inode(trans, root, path,
2060                                  &BTRFS_I(inode)->location, 1);
2061         if (ret) {
2062                 if (ret > 0)
2063                         ret = -ENOENT;
2064                 goto failed;
2065         }
2066
2067         leaf = path->nodes[0];
2068         inode_item = btrfs_item_ptr(leaf, path->slots[0],
2069                                   struct btrfs_inode_item);
2070
2071         fill_inode_item(trans, leaf, inode_item, inode);
2072         btrfs_mark_buffer_dirty(leaf);
2073         btrfs_set_inode_last_trans(trans, inode);
2074         ret = 0;
2075 failed:
2076         btrfs_free_path(path);
2077         return ret;
2078 }
2079
2080
2081 /*
2082  * unlink helper that gets used here in inode.c and in the tree logging
2083  * recovery code.  It remove a link in a directory with a given name, and
2084  * also drops the back refs in the inode to the directory
2085  */
2086 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2087                        struct btrfs_root *root,
2088                        struct inode *dir, struct inode *inode,
2089                        const char *name, int name_len)
2090 {
2091         struct btrfs_path *path;
2092         int ret = 0;
2093         struct extent_buffer *leaf;
2094         struct btrfs_dir_item *di;
2095         struct btrfs_key key;
2096         u64 index;
2097
2098         path = btrfs_alloc_path();
2099         if (!path) {
2100                 ret = -ENOMEM;
2101                 goto err;
2102         }
2103
2104         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2105                                     name, name_len, -1);
2106         if (IS_ERR(di)) {
2107                 ret = PTR_ERR(di);
2108                 goto err;
2109         }
2110         if (!di) {
2111                 ret = -ENOENT;
2112                 goto err;
2113         }
2114         leaf = path->nodes[0];
2115         btrfs_dir_item_key_to_cpu(leaf, di, &key);
2116         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2117         if (ret)
2118                 goto err;
2119         btrfs_release_path(root, path);
2120
2121         ret = btrfs_del_inode_ref(trans, root, name, name_len,
2122                                   inode->i_ino,
2123                                   dir->i_ino, &index);
2124         if (ret) {
2125                 printk("failed to delete reference to %.*s, "
2126                        "inode %lu parent %lu\n", name_len, name,
2127                        inode->i_ino, dir->i_ino);
2128                 goto err;
2129         }
2130
2131         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2132                                          index, name, name_len, -1);
2133         if (IS_ERR(di)) {
2134                 ret = PTR_ERR(di);
2135                 goto err;
2136         }
2137         if (!di) {
2138                 ret = -ENOENT;
2139                 goto err;
2140         }
2141         ret = btrfs_delete_one_dir_name(trans, root, path, di);
2142         btrfs_release_path(root, path);
2143
2144         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2145                                          inode, dir->i_ino);
2146         BUG_ON(ret != 0 && ret != -ENOENT);
2147         if (ret != -ENOENT)
2148                 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2149
2150         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2151                                            dir, index);
2152         BUG_ON(ret);
2153 err:
2154         btrfs_free_path(path);
2155         if (ret)
2156                 goto out;
2157
2158         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2159         inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2160         btrfs_update_inode(trans, root, dir);
2161         btrfs_drop_nlink(inode);
2162         ret = btrfs_update_inode(trans, root, inode);
2163         dir->i_sb->s_dirt = 1;
2164 out:
2165         return ret;
2166 }
2167
2168 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2169 {
2170         struct btrfs_root *root;
2171         struct btrfs_trans_handle *trans;
2172         struct inode *inode = dentry->d_inode;
2173         int ret;
2174         unsigned long nr = 0;
2175
2176         root = BTRFS_I(dir)->root;
2177
2178         ret = btrfs_check_free_space(root, 1, 1);
2179         if (ret)
2180                 goto fail;
2181
2182         trans = btrfs_start_transaction(root, 1);
2183
2184         btrfs_set_trans_block_group(trans, dir);
2185         ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2186                                  dentry->d_name.name, dentry->d_name.len);
2187
2188         if (inode->i_nlink == 0)
2189                 ret = btrfs_orphan_add(trans, inode);
2190
2191         nr = trans->blocks_used;
2192
2193         btrfs_end_transaction_throttle(trans, root);
2194 fail:
2195         btrfs_btree_balance_dirty(root, nr);
2196         return ret;
2197 }
2198
2199 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2200 {
2201         struct inode *inode = dentry->d_inode;
2202         int err = 0;
2203         int ret;
2204         struct btrfs_root *root = BTRFS_I(dir)->root;
2205         struct btrfs_trans_handle *trans;
2206         unsigned long nr = 0;
2207
2208         /*
2209          * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2210          * the root of a subvolume or snapshot
2211          */
2212         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2213             inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2214                 return -ENOTEMPTY;
2215         }
2216
2217         ret = btrfs_check_free_space(root, 1, 1);
2218         if (ret)
2219                 goto fail;
2220
2221         trans = btrfs_start_transaction(root, 1);
2222         btrfs_set_trans_block_group(trans, dir);
2223
2224         err = btrfs_orphan_add(trans, inode);
2225         if (err)
2226                 goto fail_trans;
2227
2228         /* now the directory is empty */
2229         err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2230                                  dentry->d_name.name, dentry->d_name.len);
2231         if (!err) {
2232                 btrfs_i_size_write(inode, 0);
2233         }
2234
2235 fail_trans:
2236         nr = trans->blocks_used;
2237         ret = btrfs_end_transaction_throttle(trans, root);
2238 fail:
2239         btrfs_btree_balance_dirty(root, nr);
2240
2241         if (ret && !err)
2242                 err = ret;
2243         return err;
2244 }
2245
2246 /*
2247  * when truncating bytes in a file, it is possible to avoid reading
2248  * the leaves that contain only checksum items.  This can be the
2249  * majority of the IO required to delete a large file, but it must
2250  * be done carefully.
2251  *
2252  * The keys in the level just above the leaves are checked to make sure
2253  * the lowest key in a given leaf is a csum key, and starts at an offset
2254  * after the new size.
2255  *
2256  * Then the key for the next leaf is checked to make sure it also has
2257  * a checksum item for the same file.  If it does, we know our target leaf
2258  * contains only checksum items, and it can be safely freed without reading
2259  * it.
2260  *
2261  * This is just an optimization targeted at large files.  It may do
2262  * nothing.  It will return 0 unless things went badly.
 *
 * The caller's path is used as scratch space: lowest_level is raised to 1
 * for the searches, and on every exit lowest_level and keep_locks are set
 * back to 0 and the path is released.
 *
 * The only non-zero return value is a negative result from
 * btrfs_search_slot().  A failing btrfs_del_leaf() trips BUG_ON, and a
 * failure to allocate or add the leaf ref only triggers WARN_ON.
2263  */
2264 static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
2265                                      struct btrfs_root *root,
2266                                      struct btrfs_path *path,
2267                                      struct inode *inode, u64 new_size)
2268 {
2269         struct btrfs_key key;
2270         int ret;
2271         int nritems;
2272         struct btrfs_key found_key;
2273         struct btrfs_key other_key;
2274         struct btrfs_leaf_ref *ref;
2275         u64 leaf_gen;
2276         u64 leaf_start;
2277
        /* search down to level 1 only; path->nodes[1] is what we inspect */
2278         path->lowest_level = 1;
2279         key.objectid = inode->i_ino;
2280         key.type = BTRFS_CSUM_ITEM_KEY;
2281         key.offset = new_size;
2282 again:
2283         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2284         if (ret < 0)
2285                 goto out;
2286
        /* no level 1 node in the path, nothing we can drop wholesale */
2287         if (path->nodes[1] == NULL) {
2288                 ret = 0;
2289                 goto out;
2290         }
2291         ret = 0;
2292         btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
2293         nritems = btrfs_header_nritems(path->nodes[1]);
2294
2295         if (!nritems)
2296                 goto out;
2297
        /* the slot is past the end of this node, move on to the next node */
2298         if (path->slots[1] >= nritems)
2299                 goto next_node;
2300
2301         /* did we find a key greater than anything we want to delete? */
2302         if (found_key.objectid > inode->i_ino ||
2303            (found_key.objectid == inode->i_ino && found_key.type > key.type))
2304                 goto out;
2305
2306         /* we check the next key in the node to make sure the leaf contains
2307          * only checksum items.  This comparison doesn't work if our
2308          * leaf is the last one in the node
2309          */
2310         if (path->slots[1] + 1 >= nritems) {
2311 next_node:
2312                 /* search forward from the last key in the node, this
2313                  * will bring us into the next node in the tree
2314                  */
2315                 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
2316
2317                 /* unlikely, but we inc below, so check to be safe */
2318                 if (found_key.offset == (u64)-1)
2319                         goto out;
2320
2321                 /* search_forward needs a path with locks held, do the
2322                  * search again for the original key.  It is possible
2323                  * this will race with a balance and return a path that
2324                  * we could modify, but this drop is just an optimization
2325                  * and is allowed to miss some leaves.
2326                  */
2327                 btrfs_release_path(root, path);
2328                 found_key.offset++;
2329
2330                 /* setup a max key for search_forward */
2331                 other_key.offset = (u64)-1;
2332                 other_key.type = key.type;
2333                 other_key.objectid = key.objectid;
2334
2335                 path->keep_locks = 1;
2336                 ret = btrfs_search_forward(root, &found_key, &other_key,
2337                                            path, 0, 0);
2338                 path->keep_locks = 0;
                /*
                 * stop (successfully) if the forward search failed or left
                 * found_key outside this inode's csum items
                 */
2339                 if (ret || found_key.objectid != key.objectid ||
2340                     found_key.type != key.type) {
2341                         ret = 0;
2342                         goto out;
2343                 }
2344
                /* restart the main search from the key we just found */
2345                 key.offset = found_key.offset;
2346                 btrfs_release_path(root, path);
2347                 cond_resched();
2348                 goto again;
2349         }
2350
2351         /* we know there's one more slot after us in the tree,
2352          * read that key so we can verify it is also a checksum item
2353          */
2354         btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
2355
        /* this leaf starts with a key from a lower objectid, skip it */
2356         if (found_key.objectid < inode->i_ino)
2357                 goto next_key;
2358
        /*
         * this leaf does not start with one of our csum keys, or starts
         * below new_size, so it may hold items that must be kept
         */
2359         if (found_key.type != key.type || found_key.offset < new_size)
2360                 goto next_key;
2361
2362         /*
2363          * if the key for the next leaf isn't a csum key from this objectid,
2364          * we can't be sure there aren't good items inside this leaf.
2365          * Bail out
2366          */
2367         if (other_key.objectid != inode->i_ino || other_key.type != key.type)
2368                 goto out;
2369
        /* remember the leaf's location and generation before deleting it */
2370         leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
2371         leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
2372         /*
2373          * it is safe to delete this leaf, it contains only
2374          * csum items from this inode at an offset >= new_size
2375          */
2376         ret = btrfs_del_leaf(trans, root, path, leaf_start);
2377         BUG_ON(ret);
2378
        /*
         * for ref_cows roots, when the leaf pointer's generation is older
         * than this transaction, add a leaf ref with nritems == 0 for the
         * deleted leaf.  Failures here only warn.
         */
2379         if (root->ref_cows && leaf_gen < trans->transid) {
2380                 ref = btrfs_alloc_leaf_ref(root, 0);
2381                 if (ref) {
2382                         ref->root_gen = root->root_key.offset;
2383                         ref->bytenr = leaf_start;
2384                         ref->owner = 0;
2385                         ref->generation = leaf_gen;
2386                         ref->nritems = 0;
2387
2388                         ret = btrfs_add_leaf_ref(root, ref, 0);
2389                         WARN_ON(ret);
2390                         btrfs_free_leaf_ref(root, ref);
2391                 } else {
2392                         WARN_ON(1);
2393                 }
2394         }
2395 next_key:
2396         btrfs_release_path(root, path);
2397
        /*
         * continue from the next leaf's first key, but only while it is
         * one of our csum keys and moves the search offset forward
         */
2398         if (other_key.objectid == inode->i_ino &&
2399             other_key.type == key.type && other_key.offset > key.offset) {
2400                 key.offset = other_key.offset;
2401                 cond_resched();
2402                 goto again;
2403         }
2404         ret = 0;
2405 out:
2406         /* fixup any changes we've made to the path */
2407         path->lowest_level = 0;
2408         path->keep_locks = 0;
2409         btrfs_release_path(root, path);
2410         return ret;
2411 }
2412
2413 /*
2414  * this can truncate away extent items, csum items and directory items.
2415  * It starts at a high offset and removes keys until it can't find
2416  * any higher than new_size
2417  *
2418  * csum items that cross the new i_size are truncated to the new size
2419  * as well.
2420  *
2421  * min_type is the minimum key type to truncate down to.  If set to 0, this
2422  * will kill all the items on this inode, including the INODE_ITEM_KEY.
2423  */
2424 noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2425                                         struct btrfs_root *root,
2426                                         struct inode *inode,
2427                                         u64 new_size, u32 min_type)
2428 {
2429         int ret;
2430         struct btrfs_path *path;
2431         struct btrfs_key key;
2432         struct btrfs_key found_key;
2433         u32 found_type;
2434         struct extent_buffer *leaf;
2435         struct btrfs_file_extent_item *fi;
2436         u64 extent_start = 0;
2437         u64 extent_num_bytes = 0;
2438         u64 item_end = 0;
2439         u64 root_gen = 0;
2440         u64 root_owner = 0;
2441         int found_extent;
2442         int del_item;
2443         int pending_del_nr = 0;
2444         int pending_del_slot = 0;
2445         int extent_type = -1;
2446         int encoding;
2447         u64 mask = root->sectorsize - 1;
2448
2449         if (root->ref_cows)
2450                 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2451         path = btrfs_alloc_path();
2452         path->reada = -1;
2453         BUG_ON(!path);
2454
2455         /* FIXME, add redo link to tree so we don't leak on crash */
2456         key.objectid = inode->i_ino;
2457         key.offset = (u64)-1;
2458         key.type = (u8)-1;
2459
2460         btrfs_init_path(path);
2461
2462         ret = drop_csum_leaves(trans, root, path, inode, new_size);
2463         BUG_ON(ret);
2464
2465 search_again:
2466         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2467         if (ret < 0) {
2468                 goto error;
2469         }
2470         if (ret > 0) {
2471                 /* there are no items in the tree for us to truncate, we're
2472                  * done
2473                  */
2474                 if (path->slots[0] == 0) {
2475                         ret = 0;
2476                         goto error;
2477                 }
2478                 path->slots[0]--;
2479         }
2480
2481         while(1) {
2482                 fi = NULL;
2483                 leaf = path->nodes[0];
2484                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2485                 found_type = btrfs_key_type(&found_key);
2486                 encoding = 0;
2487
2488                 if (found_key.objectid != inode->i_ino)
2489                         break;
2490
2491                 if (found_type < min_type)
2492                         break;
2493
2494                 item_end = found_key.offset;
2495                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2496                         fi = btrfs_item_ptr(leaf, path->slots[0],
2497                                             struct btrfs_file_extent_item);
2498                         extent_type = btrfs_file_extent_type(leaf, fi);
2499                         encoding = btrfs_file_extent_compression(leaf, fi);
2500                         encoding |= btrfs_file_extent_encryption(leaf, fi);
2501                         encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2502
2503                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2504                                 item_end +=
2505                                     btrfs_file_extent_num_bytes(leaf, fi);
2506                         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2507                                 item_end += btrfs_file_extent_inline_len(leaf,
2508                                                                          fi);
2509                         }
2510                         item_end--;
2511                 }
2512                 if (found_type == BTRFS_CSUM_ITEM_KEY) {
2513                         ret = btrfs_csum_truncate(trans, root, path,
2514                                                   new_size);
2515                         BUG_ON(ret);
2516                 }
2517                 if (item_end < new_size) {
2518                         if (found_type == BTRFS_DIR_ITEM_KEY) {
2519                                 found_type = BTRFS_INODE_ITEM_KEY;
2520                         } else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
2521                                 found_type = BTRFS_CSUM_ITEM_KEY;
2522                         } else if (found_type == BTRFS_EXTENT_DATA_KEY) {
2523                                 found_type = BTRFS_XATTR_ITEM_KEY;
2524                         } else if (found_type == BTRFS_XATTR_ITEM_KEY) {
2525                                 found_type = BTRFS_INODE_REF_KEY;
2526                         } else if (found_type) {
2527                                 found_type--;
2528                         } else {
2529                                 break;
2530                         }
2531                         btrfs_set_key_type(&key, found_type);
2532                         goto next;
2533                 }
2534                 if (found_key.offset >= new_size)
2535                         del_item = 1;
2536                 else
2537                         del_item = 0;
2538                 found_extent = 0;
2539
2540                 /* FIXME, shrink the extent if the ref count is only 1 */
2541                 if (found_type != BTRFS_EXTENT_DATA_KEY)
2542                         goto delete;
2543
2544                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2545                         u64 num_dec;
2546                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2547                         if (!del_item && !encoding) {
2548                                 u64 orig_num_bytes =
2549                                         btrfs_file_extent_num_bytes(leaf, fi);
2550                                 extent_num_bytes = new_size -
2551                                         found_key.offset + root->sectorsize - 1;
2552                                 extent_num_bytes = extent_num_bytes &
2553                                         ~((u64)root->sectorsize - 1);
2554                                 btrfs_set_file_extent_num_bytes(leaf, fi,
2555                                                          extent_num_bytes);
2556                                 num_dec = (orig_num_bytes -
2557                                            extent_num_bytes);
2558                                 if (root->ref_cows && extent_start != 0)
2559                                         inode_sub_bytes(inode, num_dec);
2560                                 btrfs_mark_buffer_dirty(leaf);
2561                         } else {
2562                                 extent_num_bytes =
2563                                         btrfs_file_extent_disk_num_bytes(leaf,
2564                                                                          fi);
2565                                 /* FIXME blocksize != 4096 */
2566                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2567                                 if (extent_start != 0) {
2568                                         found_extent = 1;
2569                                         if (root->ref_cows)
2570                                                 inode_sub_bytes(inode, num_dec);
2571                                 }
2572                                 root_gen = btrfs_header_generation(leaf);
2573                                 root_owner = btrfs_header_owner(leaf);
2574                         }
2575                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2576                         /*
2577                          * we can't truncate inline items that have had
2578                          * special encodings
2579                          */
2580                         if (!del_item &&
2581                             btrfs_file_extent_compression(leaf, fi) == 0 &&
2582                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
2583                             btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2584                                 u32 size = new_size - found_key.offset;
2585
2586                                 if (root->ref_cows) {
2587                                         inode_sub_bytes(inode, item_end + 1 -
2588                                                         new_size);
2589                                 }
2590                                 size =
2591                                     btrfs_file_extent_calc_inline_size(size);
2592                                 ret = btrfs_truncate_item(trans, root, path,
2593                                                           size, 1);
2594                                 BUG_ON(ret);
2595                         } else if (root->ref_cows) {
2596                                 inode_sub_bytes(inode, item_end + 1 -
2597                                                 found_key.offset);
2598                         }
2599                 }
2600 delete:
2601                 if (del_item) {
2602                         if (!pending_del_nr) {
2603                                 /* no pending yet, add ourselves */
2604                                 pending_del_slot = path->slots[0];
2605                                 pending_del_nr = 1;
2606                         } else if (pending_del_nr &&
2607                                    path->slots[0] + 1 == pending_del_slot) {
2608                                 /* hop on the pending chunk */
2609                                 pending_del_nr++;
2610                                 pending_del_slot = path->slots[0];
2611                         } else {
2612                                 printk("bad pending slot %d pending_del_nr %d pending_del_slot %d\n", path->slots[0], pending_del_nr, pending_del_slot);
2613                         }
2614                 } else {
2615                         break;
2616                 }
2617                 if (found_extent) {
2618                         ret = btrfs_free_extent(trans, root, extent_start,
2619                                                 extent_num_bytes,
2620                                                 leaf->start, root_owner,
2621                                                 root_gen, inode->i_ino, 0);
2622                         BUG_ON(ret);
2623                 }
2624 next:
2625                 if (path->slots[0] == 0) {
2626                         if (pending_del_nr)
2627                                 goto del_pending;
2628                         btrfs_release_path(root, path);
2629                         goto search_again;
2630                 }
2631
2632                 path->slots[0]--;
2633                 if (pending_del_nr &&
2634                     path->slots[0] + 1 != pending_del_slot) {
2635                         struct btrfs_key debug;
2636 del_pending:
2637                         btrfs_item_key_to_cpu(path->nodes[0], &debug,
2638                                               pending_del_slot);
2639                         ret = btrfs_del_items(trans, root, path,
2640                                               pending_del_slot,
2641                                               pending_del_nr);
2642                         BUG_ON(ret);
2643                         pending_del_nr = 0;
2644                         btrfs_release_path(root, path);
2645                         goto search_again;
2646                 }
2647         }
2648         ret = 0;
2649 error:
2650         if (pending_del_nr) {
2651                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2652                                       pending_del_nr);
2653         }
2654         btrfs_free_path(path);
2655         inode->i_sb->s_dirt = 1;
2656         return ret;
2657 }
2658
2659 /*
2660  * taken from block_truncate_page, but does cow as it zeros out
2661  * any bytes left in the last page in the file.
 *
 * mapping is the address space of the file, from is the byte offset from
 * which the rest of the containing page is zeroed.  The whole page is
 * marked delalloc and dirtied.
 *
 * Returns 0 when from is aligned to root->sectorsize (nothing is done) or
 * when the page was zeroed and dirtied, -ENOMEM when the first
 * grab_cache_page() fails, and -EIO when the page is still not uptodate
 * after btrfs_readpage().
2662  */
2663 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2664 {
2665         struct inode *inode = mapping->host;
2666         struct btrfs_root *root = BTRFS_I(inode)->root;
2667         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2668         struct btrfs_ordered_extent *ordered;
2669         char *kaddr;
2670         u32 blocksize = root->sectorsize;
2671         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2672         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2673         struct page *page;
2674         int ret = 0;
2675         u64 page_start;
2676         u64 page_end;
2677
        /* from sits on a block boundary, there is no partial block to zero */
2678         if ((offset & (blocksize - 1)) == 0)
2679                 goto out;
2680
        /*
         * NOTE(review): ret is only guaranteed to be -ENOMEM on the first
         * pass; after a jump back to "again" it may hold the result of
         * btrfs_readpage(), which is then what a failing grab_cache_page()
         * returns.
         */
2681         ret = -ENOMEM;
2682 again:
2683         page = grab_cache_page(mapping, index);
2684         if (!page)
2685                 goto out;
2686
2687         page_start = page_offset(page);
2688         page_end = page_start + PAGE_CACHE_SIZE - 1;
2689
2690         if (!PageUptodate(page)) {
2691                 ret = btrfs_readpage(NULL, page);
                /* the page is locked again here before it is inspected */
2692                 lock_page(page);
                /* the page no longer belongs to this mapping, start over */
2693                 if (page->mapping != mapping) {
2694                         unlock_page(page);
2695                         page_cache_release(page);
2696                         goto again;
2697                 }
2698                 if (!PageUptodate(page)) {
2699                         ret = -EIO;
2700                         goto out_unlock;
2701                 }
2702         }
2703         wait_on_page_writeback(page);
2704
2705         lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2706         set_page_extent_mapped(page);
2707
        /*
         * an ordered extent exists at page_start: drop our locks, start it
         * (third argument 1 -- presumably "wait", confirm), then retry
         */
2708         ordered = btrfs_lookup_ordered_extent(inode, page_start);
2709         if (ordered) {
2710                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2711                 unlock_page(page);
2712                 page_cache_release(page);
2713                 btrfs_start_ordered_extent(inode, ordered, 1);
2714                 btrfs_put_ordered_extent(ordered);
2715                 goto again;
2716         }
2717
2718         btrfs_set_extent_delalloc(inode, page_start, page_end);
2719         ret = 0;
        /*
         * NOTE(review): offset was masked with PAGE_CACHE_SIZE-1 above, so
         * this test is always true.
         */
2720         if (offset != PAGE_CACHE_SIZE) {
2721                 kaddr = kmap(page);
2722                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2723                 flush_dcache_page(page);
2724                 kunmap(page);
2725         }
2726         ClearPageChecked(page);
2727         set_page_dirty(page);
2728         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2729
2730 out_unlock:
2731         unlock_page(page);
2732         page_cache_release(page);
2733 out:
2734         return ret;
2735 }
2736
2737 int btrfs_cont_expand(struct inode *inode, loff_t size)
2738 {
2739         struct btrfs_trans_handle *trans;
2740         struct btrfs_root *root = BTRFS_I(inode)->root;
2741         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2742         struct extent_map *em;
2743         u64 mask = root->sectorsize - 1;
2744         u64 hole_start = (inode->i_size + mask) & ~mask;
2745         u64 block_end = (size + mask) & ~mask;
2746         u64 last_byte;
2747         u64 cur_offset;
2748         u64 hole_size;
2749         int err;
2750
2751         if (size <= hole_start)
2752                 return 0;
2753
2754         err = btrfs_check_free_space(root, 1, 0);
2755         if (err)
2756                 return err;
2757
2758         btrfs_truncate_page(inode->i_mapping, inode->i_size);
2759
2760         while (1) {
2761                 struct btrfs_ordered_extent *ordered;
2762                 btrfs_wait_ordered_range(inode, hole_start,
2763                                          block_end - hole_start);
2764                 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2765                 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2766                 if (!ordered)
2767                         break;
2768                 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2769                 btrfs_put_ordered_extent(ordered);
2770         }
2771
2772         trans = btrfs_start_transaction(root, 1);
2773         btrfs_set_trans_block_group(trans, inode);
2774
2775         cur_offset = hole_start;
2776         while (1) {
2777                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2778                                 block_end - cur_offset, 0);
2779                 BUG_ON(IS_ERR(em) || !em);
2780                 last_byte = min(extent_map_end(em), block_end);
2781                 last_byte = (last_byte + mask) & ~mask;
2782                 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2783                         u64 hint_byte = 0;
2784                         hole_size = last_byte - cur_offset;
2785                         err = btrfs_drop_extents(trans, root, inode,
2786                                                  cur_offset,
2787                                                  cur_offset + hole_size,
2788                                                  cur_offset, &hint_byte);
2789                         if (err)
2790                                 break;
2791                         err = btrfs_insert_file_extent(trans, root,
2792                                         inode->i_ino, cur_offset, 0,
2793                                         0, hole_size, 0, hole_size,
2794                                         0, 0, 0);
2795                         btrfs_drop_extent_cache(inode, hole_start,
2796                                         last_byte - 1, 0);
2797                 }
2798                 free_extent_map(em);
2799                 cur_offset = last_byte;
2800                 if (err || cur_offset >= block_end)
2801                         break;
2802         }
2803
2804         btrfs_end_transaction(trans, root);
2805         unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2806         return err;
2807 }
2808
2809 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2810 {
2811         struct inode *inode = dentry->d_inode;
2812         int err;
2813
2814         err = inode_change_ok(inode, attr);
2815         if (err)
2816                 return err;
2817
2818         if (S_ISREG(inode->i_mode) &&
2819             attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2820                 err = btrfs_cont_expand(inode, attr->ia_size);
2821                 if (err)
2822                         return err;
2823         }
2824
2825         err = inode_setattr(inode, attr);
2826
2827         if (!err && ((attr->ia_valid & ATTR_MODE)))
2828                 err = btrfs_acl_chmod(inode);
2829         return err;
2830 }
2831
2832 void btrfs_delete_inode(struct inode *inode)
2833 {
2834         struct btrfs_trans_handle *trans;
2835         struct btrfs_root *root = BTRFS_I(inode)->root;
2836         unsigned long nr;
2837         int ret;
2838
2839         truncate_inode_pages(&inode->i_data, 0);
2840         if (is_bad_inode(inode)) {
2841                 btrfs_orphan_del(NULL, inode);
2842                 goto no_delete;
2843         }
2844         btrfs_wait_ordered_range(inode, 0, (u64)-1);
2845
2846         btrfs_i_size_write(inode, 0);
2847         trans = btrfs_start_transaction(root, 1);
2848
2849         btrfs_set_trans_block_group(trans, inode);
2850         ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2851         if (ret) {
2852                 btrfs_orphan_del(NULL, inode);
2853                 goto no_delete_lock;
2854         }
2855
2856         btrfs_orphan_del(trans, inode);
2857
2858         nr = trans->blocks_used;
2859         clear_inode(inode);
2860
2861         btrfs_end_transaction(trans, root);
2862         btrfs_btree_balance_dirty(root, nr);
2863         return;
2864
2865 no_delete_lock:
2866         nr = trans->blocks_used;
2867         btrfs_end_transaction(trans, root);
2868         btrfs_btree_balance_dirty(root, nr);
2869 no_delete:
2870         clear_inode(inode);
2871 }
2872
2873 /*
2874  * this returns the key found in the dir entry in the location pointer.
2875  * If no dir entries were found, location->objectid is 0.
2876  */
2877 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2878                                struct btrfs_key *location)
2879 {
2880         const char *name = dentry->d_name.name;
2881         int namelen = dentry->d_name.len;
2882         struct btrfs_dir_item *di;
2883         struct btrfs_path *path;
2884         struct btrfs_root *root = BTRFS_I(dir)->root;
2885         int ret = 0;
2886
2887         path = btrfs_alloc_path();
2888         BUG_ON(!path);
2889
2890         di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2891                                     namelen, 0);
2892         if (IS_ERR(di))
2893                 ret = PTR_ERR(di);
2894         if (!di || IS_ERR(di)) {
2895                 goto out_err;
2896         }
2897         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2898 out:
2899         btrfs_free_path(path);
2900         return ret;
2901 out_err:
2902         location->objectid = 0;
2903         goto out;
2904 }
2905
2906 /*
2907  * when we hit a tree root in a directory, the btrfs part of the inode
2908  * needs to be changed to reflect the root directory of the tree root.  This
2909  * is kind of like crossing a mount point.
2910  */
2911 static int fixup_tree_root_location(struct btrfs_root *root,
2912                              struct btrfs_key *location,
2913                              struct btrfs_root **sub_root,
2914                              struct dentry *dentry)
2915 {
2916         struct btrfs_root_item *ri;
2917
2918         if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2919                 return 0;
2920         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2921                 return 0;
2922
2923         *sub_root = btrfs_read_fs_root(root->fs_info, location,
2924                                         dentry->d_name.name,
2925                                         dentry->d_name.len);
2926         if (IS_ERR(*sub_root))
2927                 return PTR_ERR(*sub_root);
2928
2929         ri = &(*sub_root)->root_item;
2930         location->objectid = btrfs_root_dirid(ri);
2931         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2932         location->offset = 0;
2933
2934         return 0;
2935 }
2936
2937 static noinline void init_btrfs_i(struct inode *inode)
2938 {
2939         struct btrfs_inode *bi = BTRFS_I(inode);
2940
2941         bi->i_acl = NULL;
2942         bi->i_default_acl = NULL;
2943
2944         bi->generation = 0;
2945         bi->last_trans = 0;
2946         bi->logged_trans = 0;
2947         bi->delalloc_bytes = 0;
2948         bi->disk_i_size = 0;
2949         bi->flags = 0;
2950         bi->index_cnt = (u64)-1;
2951         bi->log_dirty_trans = 0;
2952         extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2953         extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2954                              inode->i_mapping, GFP_NOFS);
2955         extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2956                              inode->i_mapping, GFP_NOFS);
2957         INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
2958         btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2959         mutex_init(&BTRFS_I(inode)->csum_mutex);
2960         mutex_init(&BTRFS_I(inode)->extent_mutex);
2961         mutex_init(&BTRFS_I(inode)->log_mutex);
2962 }
2963
2964 static int btrfs_init_locked_inode(struct inode *inode, void *p)
2965 {
2966         struct btrfs_iget_args *args = p;
2967         inode->i_ino = args->ino;
2968         init_btrfs_i(inode);
2969         BTRFS_I(inode)->root = args->root;
2970         return 0;
2971 }
2972
2973 static int btrfs_find_actor(struct inode *inode, void *opaque)
2974 {
2975         struct btrfs_iget_args *args = opaque;
2976         return (args->ino == inode->i_ino &&
2977                 args->root == BTRFS_I(inode)->root);
2978 }
2979
2980 struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2981                             struct btrfs_root *root, int wait)
2982 {
2983         struct inode *inode;
2984         struct btrfs_iget_args args;
2985         args.ino = objectid;
2986         args.root = root;
2987
2988         if (wait) {
2989                 inode = ilookup5(s, objectid, btrfs_find_actor,
2990                                  (void *)&args);
2991         } else {
2992                 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
2993                                         (void *)&args);
2994         }
2995         return inode;
2996 }
2997
2998 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2999                                 struct btrfs_root *root)
3000 {
3001         struct inode *inode;
3002         struct btrfs_iget_args args;
3003         args.ino = objectid;
3004         args.root = root;
3005
3006         inode = iget5_locked(s, objectid, btrfs_find_actor,
3007                              btrfs_init_locked_inode,
3008                              (void *)&args);
3009         return inode;
3010 }
3011
3012 /* Get an inode object given its location and corresponding root.
3013  * Returns in *is_new if the inode was read from disk
3014  */
3015 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3016                          struct btrfs_root *root, int *is_new)
3017 {
3018         struct inode *inode;
3019
3020         inode = btrfs_iget_locked(s, location->objectid, root);
3021         if (!inode)
3022                 return ERR_PTR(-EACCES);
3023
3024         if (inode->i_state & I_NEW) {
3025                 BTRFS_I(inode)->root = root;
3026                 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3027                 btrfs_read_locked_inode(inode);
3028                 unlock_new_inode(inode);
3029                 if (is_new)
3030                         *is_new = 1;
3031         } else {
3032                 if (is_new)
3033                         *is_new = 0;
3034         }
3035
3036         return inode;
3037 }
3038
3039 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3040 {
3041         struct inode * inode;
3042         struct btrfs_inode *bi = BTRFS_I(dir);
3043         struct btrfs_root *root = bi->root;
3044         struct btrfs_root *sub_root = root;
3045         struct btrfs_key location;
3046         int ret, new;
3047
3048         if (dentry->d_name.len > BTRFS_NAME_LEN)
3049                 return ERR_PTR(-ENAMETOOLONG);
3050
3051         ret = btrfs_inode_by_name(dir, dentry, &location);
3052
3053         if (ret < 0)
3054                 return ERR_PTR(ret);
3055
3056         inode = NULL;
3057         if (location.objectid) {
3058                 ret = fixup_tree_root_location(root, &location, &sub_root,
3059                                                 dentry);
3060                 if (ret < 0)
3061                         return ERR_PTR(ret);
3062                 if (ret > 0)
3063                         return ERR_PTR(-ENOENT);
3064                 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3065                 if (IS_ERR(inode))
3066                         return ERR_CAST(inode);
3067         }
3068         return inode;
3069 }
3070
3071 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3072                                    struct nameidata *nd)
3073 {
3074         struct inode *inode;
3075
3076         if (dentry->d_name.len > BTRFS_NAME_LEN)
3077                 return ERR_PTR(-ENAMETOOLONG);
3078
3079         inode = btrfs_lookup_dentry(dir, dentry);
3080         if (IS_ERR(inode))
3081                 return ERR_CAST(inode);
3082
3083         return d_splice_alias(inode, dentry);
3084 }
3085
3086 static unsigned char btrfs_filetype_table[] = {
3087         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3088 };
3089
3090 static int btrfs_real_readdir(struct file *filp, void *dirent,
3091                               filldir_t filldir)
3092 {
3093         struct inode *inode = filp->f_dentry->d_inode;
3094         struct btrfs_root *root = BTRFS_I(inode)->root;
3095         struct btrfs_item *item;
3096         struct btrfs_dir_item *di;
3097         struct btrfs_key key;
3098         struct btrfs_key found_key;
3099         struct btrfs_path *path;
3100         int ret;
3101         u32 nritems;
3102         struct extent_buffer *leaf;
3103         int slot;
3104         int advance;
3105         unsigned char d_type;
3106         int over = 0;
3107         u32 di_cur;
3108         u32 di_total;
3109         u32 di_len;
3110         int key_type = BTRFS_DIR_INDEX_KEY;
3111         char tmp_name[32];
3112         char *name_ptr;
3113         int name_len;
3114
3115         /* FIXME, use a real flag for deciding about the key type */
3116         if (root->fs_info->tree_root == root)
3117                 key_type = BTRFS_DIR_ITEM_KEY;
3118
3119         /* special case for "." */
3120         if (filp->f_pos == 0) {
3121                 over = filldir(dirent, ".", 1,
3122                                1, inode->i_ino,
3123                                DT_DIR);
3124                 if (over)
3125                         return 0;
3126                 filp->f_pos = 1;
3127         }
3128         /* special case for .., just use the back ref */
3129         if (filp->f_pos == 1) {
3130                 u64 pino = parent_ino(filp->f_path.dentry);
3131                 over = filldir(dirent, "..", 2,
3132                                2, pino, DT_DIR);
3133                 if (over)
3134                         return 0;
3135                 filp->f_pos = 2;
3136         }
3137         path = btrfs_alloc_path();
3138         path->reada = 2;
3139
3140         btrfs_set_key_type(&key, key_type);
3141         key.offset = filp->f_pos;
3142         key.objectid = inode->i_ino;
3143
3144         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3145         if (ret < 0)
3146                 goto err;
3147         advance = 0;
3148
3149         while (1) {
3150                 leaf = path->nodes[0];
3151                 nritems = btrfs_header_nritems(leaf);
3152                 slot = path->slots[0];
3153                 if (advance || slot >= nritems) {
3154                         if (slot >= nritems - 1) {
3155                                 ret = btrfs_next_leaf(root, path);
3156                                 if (ret)
3157                                         break;
3158                                 leaf = path->nodes[0];
3159                                 nritems = btrfs_header_nritems(leaf);
3160                                 slot = path->slots[0];
3161                         } else {
3162                                 slot++;
3163                                 path->slots[0]++;
3164                         }
3165                 }
3166
3167                 advance = 1;
3168                 item = btrfs_item_nr(leaf, slot);
3169                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3170
3171                 if (found_key.objectid != key.objectid)
3172                         break;
3173                 if (btrfs_key_type(&found_key) != key_type)
3174                         break;
3175                 if (found_key.offset < filp->f_pos)
3176                         continue;
3177
3178                 filp->f_pos = found_key.offset;
3179
3180                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3181                 di_cur = 0;
3182                 di_total = btrfs_item_size(leaf, item);
3183
3184                 while (di_cur < di_total) {
3185                         struct btrfs_key location;
3186
3187                         name_len = btrfs_dir_name_len(leaf, di);
3188                         if (name_len <= sizeof(tmp_name)) {
3189                                 name_ptr = tmp_name;
3190                         } else {
3191                                 name_ptr = kmalloc(name_len, GFP_NOFS);
3192                                 if (!name_ptr) {
3193                                         ret = -ENOMEM;
3194                                         goto err;
3195                                 }
3196                         }
3197                         read_extent_buffer(leaf, name_ptr,
3198                                            (unsigned long)(di + 1), name_len);
3199
3200                         d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3201                         btrfs_dir_item_key_to_cpu(leaf, di, &location);
3202
3203                         /* is this a reference to our own snapshot? If so
3204                          * skip it
3205                          */
3206                         if (location.type == BTRFS_ROOT_ITEM_KEY &&
3207                             location.objectid == root->root_key.objectid) {
3208                                 over = 0;
3209                                 goto skip;
3210                         }
3211                         over = filldir(dirent, name_ptr, name_len,
3212                                        found_key.offset, location.objectid,
3213                                        d_type);
3214
3215 skip:
3216                         if (name_ptr != tmp_name)
3217                                 kfree(name_ptr);
3218
3219                         if (over)
3220                                 goto nopos;
3221                         di_len = btrfs_dir_name_len(leaf, di) +
3222                                  btrfs_dir_data_len(leaf, di) + sizeof(*di);
3223                         di_cur += di_len;
3224                         di = (struct btrfs_dir_item *)((char *)di + di_len);
3225                 }
3226         }
3227
3228         /* Reached end of directory/root. Bump pos past the last item. */
3229         if (key_type == BTRFS_DIR_INDEX_KEY)
3230                 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
3231         else
3232                 filp->f_pos++;
3233 nopos:
3234         ret = 0;
3235 err:
3236         btrfs_free_path(path);
3237         return ret;
3238 }
3239
3240 int btrfs_write_inode(struct inode *inode, int wait)
3241 {
3242         struct btrfs_root *root = BTRFS_I(inode)->root;
3243         struct btrfs_trans_handle *trans;
3244         int ret = 0;
3245
3246         if (root->fs_info->btree_inode == inode)
3247                 return 0;
3248
3249         if (wait) {
3250                 trans = btrfs_join_transaction(root, 1);
3251                 btrfs_set_trans_block_group(trans, inode);
3252                 ret = btrfs_commit_transaction(trans, root);
3253         }
3254         return ret;
3255 }
3256
3257 /*
3258  * This is somewhat expensive, updating the tree every time the
3259  * inode changes.  But, it is most likely to find the inode in cache.
3260  * FIXME, needs more benchmarking...there are no reasons other than performance
3261  * to keep or drop this code.
3262  */
3263 void btrfs_dirty_inode(struct inode *inode)
3264 {
3265         struct btrfs_root *root = BTRFS_I(inode)->root;
3266         struct btrfs_trans_handle *trans;
3267
3268         trans = btrfs_join_transaction(root, 1);
3269         btrfs_set_trans_block_group(trans, inode);
3270         btrfs_update_inode(trans, root, inode);
3271         btrfs_end_transaction(trans, root);
3272 }
3273
3274 /*
3275  * find the highest existing sequence number in a directory
3276  * and then set the in-memory index_cnt variable to reflect
3277  * free sequence numbers
3278  */
3279 static int btrfs_set_inode_index_count(struct inode *inode)
3280 {
3281         struct btrfs_root *root = BTRFS_I(inode)->root;
3282         struct btrfs_key key, found_key;
3283         struct btrfs_path *path;
3284         struct extent_buffer *leaf;
3285         int ret;
3286
3287         key.objectid = inode->i_ino;
3288         btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3289         key.offset = (u64)-1;
3290
3291         path = btrfs_alloc_path();
3292         if (!path)
3293                 return -ENOMEM;
3294
3295         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3296         if (ret < 0)
3297                 goto out;
3298         /* FIXME: we should be able to handle this */
3299         if (ret == 0)
3300                 goto out;
3301         ret = 0;
3302
3303         /*
3304          * MAGIC NUMBER EXPLANATION:
3305          * since we search a directory based on f_pos we have to start at 2
3306          * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3307          * else has to start at 2
3308          */
3309         if (path->slots[0] == 0) {
3310                 BTRFS_I(inode)->index_cnt = 2;
3311                 goto out;
3312         }
3313
3314         path->slots[0]--;
3315
3316         leaf = path->nodes[0];
3317         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3318
3319         if (found_key.objectid != inode->i_ino ||
3320             btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3321                 BTRFS_I(inode)->index_cnt = 2;
3322                 goto out;
3323         }
3324
3325         BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3326 out:
3327         btrfs_free_path(path);
3328         return ret;
3329 }
3330
3331 /*
3332  * helper to find a free sequence number in a given directory.  This current
3333  * code is very simple, later versions will do smarter things in the btree
3334  */
3335 int btrfs_set_inode_index(struct inode *dir, u64 *index)
3336 {
3337         int ret = 0;
3338
3339         if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3340                 ret = btrfs_set_inode_index_count(dir);
3341                 if (ret) {
3342                         return ret;
3343                 }
3344         }
3345
3346         *index = BTRFS_I(dir)->index_cnt;
3347         BTRFS_I(dir)->index_cnt++;
3348
3349         return ret;
3350 }
3351
3352 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3353                                      struct btrfs_root *root,
3354                                      struct inode *dir,
3355                                      const char *name, int name_len,
3356                                      u64 ref_objectid,
3357                                      u64 objectid,
3358                                      struct btrfs_block_group_cache *group,
3359                                      int mode, u64 *index)
3360 {
3361         struct inode *inode;
3362         struct btrfs_inode_item *inode_item;
3363         struct btrfs_block_group_cache *new_inode_group;
3364         struct btrfs_key *location;
3365         struct btrfs_path *path;
3366         struct btrfs_inode_ref *ref;
3367         struct btrfs_key key[2];
3368         u32 sizes[2];
3369         unsigned long ptr;
3370         int ret;
3371         int owner;
3372
3373         path = btrfs_alloc_path();
3374         BUG_ON(!path);
3375
3376         inode = new_inode(root->fs_info->sb);
3377         if (!inode)
3378                 return ERR_PTR(-ENOMEM);
3379
3380         if (dir) {
3381                 ret = btrfs_set_inode_index(dir, index);
3382                 if (ret)
3383                         return ERR_PTR(ret);
3384         }
3385         /*
3386          * index_cnt is ignored for everything but a dir,
3387          * btrfs_get_inode_index_count has an explanation for the magic
3388          * number
3389          */
3390         init_btrfs_i(inode);
3391         BTRFS_I(inode)->index_cnt = 2;
3392         BTRFS_I(inode)->root = root;
3393         BTRFS_I(inode)->generation = trans->transid;
3394
3395         if (mode & S_IFDIR)
3396                 owner = 0;
3397         else
3398                 owner = 1;
3399         new_inode_group = btrfs_find_block_group(root, group, 0,
3400                                        BTRFS_BLOCK_GROUP_METADATA, owner);
3401         if (!new_inode_group) {
3402                 printk("find_block group failed\n");
3403                 new_inode_group = group;
3404         }
3405         BTRFS_I(inode)->block_group = new_inode_group;
3406
3407         key[0].objectid = objectid;
3408         btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3409         key[0].offset = 0;
3410
3411         key[1].objectid = objectid;
3412         btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3413         key[1].offset = ref_objectid;
3414
3415         sizes[0] = sizeof(struct btrfs_inode_item);
3416         sizes[1] = name_len + sizeof(*ref);
3417
3418         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3419         if (ret != 0)
3420                 goto fail;
3421
3422         if (objectid > root->highest_inode)
3423                 root->highest_inode = objectid;
3424
3425         inode->i_uid = current_fsuid();
3426         inode->i_gid = current_fsgid();
3427         inode->i_mode = mode;
3428         inode->i_ino = objectid;
3429         inode_set_bytes(inode, 0);
3430         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3431         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3432                                   struct btrfs_inode_item);
3433         fill_inode_item(trans, path->nodes[0], inode_item, inode);
3434
3435         ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3436                              struct btrfs_inode_ref);
3437         btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3438         btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3439         ptr = (unsigned long)(ref + 1);
3440         write_extent_buffer(path->nodes[0], name, ptr, name_len);
3441
3442         btrfs_mark_buffer_dirty(path->nodes[0]);
3443         btrfs_free_path(path);
3444
3445         location = &BTRFS_I(inode)->location;
3446         location->objectid = objectid;
3447         location->offset = 0;
3448         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3449
3450         insert_inode_hash(inode);
3451         return inode;
3452 fail:
3453         if (dir)
3454                 BTRFS_I(dir)->index_cnt--;
3455         btrfs_free_path(path);
3456         return ERR_PTR(ret);
3457 }
3458
3459 static inline u8 btrfs_inode_type(struct inode *inode)
3460 {
3461         return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3462 }
3463
3464 /*
3465  * utility function to add 'inode' into 'parent_inode' with
3466  * a give name and a given sequence number.
3467  * if 'add_backref' is true, also insert a backref from the
3468  * inode to the parent directory.
3469  */
3470 int btrfs_add_link(struct btrfs_trans_handle *trans,
3471                    struct inode *parent_inode, struct inode *inode,
3472                    const char *name, int name_len, int add_backref, u64 index)
3473 {
3474         int ret;
3475         struct btrfs_key key;
3476         struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3477
3478         key.objectid = inode->i_ino;
3479         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3480         key.offset = 0;
3481
3482         ret = btrfs_insert_dir_item(trans, root, name, name_len,
3483                                     parent_inode->i_ino,
3484                                     &key, btrfs_inode_type(inode),
3485                                     index);
3486         if (ret == 0) {
3487                 if (add_backref) {
3488                         ret = btrfs_insert_inode_ref(trans, root,
3489                                                      name, name_len,
3490                                                      inode->i_ino,
3491                                                      parent_inode->i_ino,
3492                                                      index);
3493                 }
3494                 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3495                                    name_len * 2);
3496                 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3497                 ret = btrfs_update_inode(trans, root, parent_inode);
3498         }
3499         return ret;
3500 }
3501
3502 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3503                             struct dentry *dentry, struct inode *inode,
3504                             int backref, u64 index)
3505 {
3506         int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3507                                  inode, dentry->d_name.name,
3508                                  dentry->d_name.len, backref, index);
3509         if (!err) {
3510                 d_instantiate(dentry, inode);
3511                 return 0;
3512         }
3513         if (err > 0)
3514                 err = -EEXIST;
3515         return err;
3516 }
3517
3518 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3519                         int mode, dev_t rdev)
3520 {
3521         struct btrfs_trans_handle *trans;
3522         struct btrfs_root *root = BTRFS_I(dir)->root;
3523         struct inode *inode = NULL;
3524         int err;
3525         int drop_inode = 0;
3526         u64 objectid;
3527         unsigned long nr = 0;
3528         u64 index = 0;
3529
3530         if (!new_valid_dev(rdev))
3531                 return -EINVAL;
3532
3533         err = btrfs_check_free_space(root, 1, 0);
3534         if (err)
3535                 goto fail;
3536
3537         trans = btrfs_start_transaction(root, 1);
3538         btrfs_set_trans_block_group(trans, dir);
3539
3540         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3541         if (err) {
3542                 err = -ENOSPC;
3543                 goto out_unlock;
3544         }
3545
3546         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3547                                 dentry->d_name.len,
3548                                 dentry->d_parent->d_inode->i_ino, objectid,
3549                                 BTRFS_I(dir)->block_group, mode, &index);
3550         err = PTR_ERR(inode);
3551         if (IS_ERR(inode))
3552                 goto out_unlock;
3553
3554         err = btrfs_init_acl(inode, dir);
3555         if (err) {
3556                 drop_inode = 1;
3557                 goto out_unlock;
3558         }
3559
3560         btrfs_set_trans_block_group(trans, inode);
3561         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3562         if (err)
3563                 drop_inode = 1;
3564         else {
3565                 inode->i_op = &btrfs_special_inode_operations;
3566                 init_special_inode(inode, inode->i_mode, rdev);
3567                 btrfs_update_inode(trans, root, inode);
3568         }
3569         dir->i_sb->s_dirt = 1;
3570         btrfs_update_inode_block_group(trans, inode);
3571         btrfs_update_inode_block_group(trans, dir);
3572 out_unlock:
3573         nr = trans->blocks_used;
3574         btrfs_end_transaction_throttle(trans, root);
3575 fail:
3576         if (drop_inode) {
3577                 inode_dec_link_count(inode);
3578                 iput(inode);
3579         }
3580         btrfs_btree_balance_dirty(root, nr);
3581         return err;
3582 }
3583
3584 static int btrfs_create(struct inode *dir, struct dentry *dentry,
3585                         int mode, struct nameidata *nd)
3586 {
3587         struct btrfs_trans_handle *trans;
3588         struct btrfs_root *root = BTRFS_I(dir)->root;
3589         struct inode *inode = NULL;
3590         int err;
3591         int drop_inode = 0;
3592         unsigned long nr = 0;
3593         u64 objectid;
3594         u64 index = 0;
3595
3596         err = btrfs_check_free_space(root, 1, 0);
3597         if (err)
3598                 goto fail;
3599         trans = btrfs_start_transaction(root, 1);
3600         btrfs_set_trans_block_group(trans, dir);
3601
3602         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3603         if (err) {
3604                 err = -ENOSPC;
3605                 goto out_unlock;
3606         }
3607
3608         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3609                                 dentry->d_name.len,
3610                                 dentry->d_parent->d_inode->i_ino,
3611                                 objectid, BTRFS_I(dir)->block_group, mode,
3612                                 &index);
3613         err = PTR_ERR(inode);
3614         if (IS_ERR(inode))
3615                 goto out_unlock;
3616
3617         err = btrfs_init_acl(inode, dir);
3618         if (err) {
3619                 drop_inode = 1;
3620                 goto out_unlock;
3621         }
3622
3623         btrfs_set_trans_block_group(trans, inode);
3624         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3625         if (err)
3626                 drop_inode = 1;
3627         else {
3628                 inode->i_mapping->a_ops = &btrfs_aops;
3629                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3630                 inode->i_fop = &btrfs_file_operations;
3631                 inode->i_op = &btrfs_file_inode_operations;
3632                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3633         }
3634         dir->i_sb->s_dirt = 1;
3635         btrfs_update_inode_block_group(trans, inode);
3636         btrfs_update_inode_block_group(trans, dir);
3637 out_unlock:
3638         nr = trans->blocks_used;
3639         btrfs_end_transaction_throttle(trans, root);
3640 fail:
3641         if (drop_inode) {
3642                 inode_dec_link_count(inode);
3643                 iput(inode);
3644         }
3645         btrfs_btree_balance_dirty(root, nr);
3646         return err;
3647 }
3648
3649 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3650                       struct dentry *dentry)
3651 {
3652         struct btrfs_trans_handle *trans;
3653         struct btrfs_root *root = BTRFS_I(dir)->root;
3654         struct inode *inode = old_dentry->d_inode;
3655         u64 index;
3656         unsigned long nr = 0;
3657         int err;
3658         int drop_inode = 0;
3659
3660         if (inode->i_nlink == 0)
3661                 return -ENOENT;
3662
3663         btrfs_inc_nlink(inode);
3664         err = btrfs_check_free_space(root, 1, 0);
3665         if (err)
3666                 goto fail;
3667         err = btrfs_set_inode_index(dir, &index);
3668         if (err)
3669                 goto fail;
3670
3671         trans = btrfs_start_transaction(root, 1);
3672
3673         btrfs_set_trans_block_group(trans, dir);
3674         atomic_inc(&inode->i_count);
3675
3676         err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3677
3678         if (err)
3679                 drop_inode = 1;
3680
3681         dir->i_sb->s_dirt = 1;
3682         btrfs_update_inode_block_group(trans, dir);
3683         err = btrfs_update_inode(trans, root, inode);
3684
3685         if (err)
3686                 drop_inode = 1;
3687
3688         nr = trans->blocks_used;
3689         btrfs_end_transaction_throttle(trans, root);
3690 fail:
3691         if (drop_inode) {
3692                 inode_dec_link_count(inode);
3693                 iput(inode);
3694         }
3695         btrfs_btree_balance_dirty(root, nr);
3696         return err;
3697 }
3698
3699 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3700 {
3701         struct inode *inode = NULL;
3702         struct btrfs_trans_handle *trans;
3703         struct btrfs_root *root = BTRFS_I(dir)->root;
3704         int err = 0;
3705         int drop_on_err = 0;
3706         u64 objectid = 0;
3707         u64 index = 0;
3708         unsigned long nr = 1;
3709
3710         err = btrfs_check_free_space(root, 1, 0);
3711         if (err)
3712                 goto out_unlock;
3713
3714         trans = btrfs_start_transaction(root, 1);
3715         btrfs_set_trans_block_group(trans, dir);
3716
3717         if (IS_ERR(trans)) {
3718                 err = PTR_ERR(trans);
3719                 goto out_unlock;
3720         }
3721
3722         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3723         if (err) {
3724                 err = -ENOSPC;
3725                 goto out_unlock;
3726         }
3727
3728         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3729                                 dentry->d_name.len,
3730                                 dentry->d_parent->d_inode->i_ino, objectid,
3731                                 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3732                                 &index);
3733         if (IS_ERR(inode)) {
3734                 err = PTR_ERR(inode);
3735                 goto out_fail;
3736         }
3737
3738         drop_on_err = 1;
3739
3740         err = btrfs_init_acl(inode, dir);
3741         if (err)
3742                 goto out_fail;
3743
3744         inode->i_op = &btrfs_dir_inode_operations;
3745         inode->i_fop = &btrfs_dir_file_operations;
3746         btrfs_set_trans_block_group(trans, inode);
3747
3748         btrfs_i_size_write(inode, 0);
3749         err = btrfs_update_inode(trans, root, inode);
3750         if (err)
3751                 goto out_fail;
3752
3753         err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3754                                  inode, dentry->d_name.name,
3755                                  dentry->d_name.len, 0, index);
3756         if (err)
3757                 goto out_fail;
3758
3759         d_instantiate(dentry, inode);
3760         drop_on_err = 0;
3761         dir->i_sb->s_dirt = 1;
3762         btrfs_update_inode_block_group(trans, inode);
3763         btrfs_update_inode_block_group(trans, dir);
3764
3765 out_fail:
3766         nr = trans->blocks_used;
3767         btrfs_end_transaction_throttle(trans, root);
3768
3769 out_unlock:
3770         if (drop_on_err)
3771                 iput(inode);
3772         btrfs_btree_balance_dirty(root, nr);
3773         return err;
3774 }
3775
3776 /* helper for btfs_get_extent.  Given an existing extent in the tree,
3777  * and an extent that you want to insert, deal with overlap and insert
3778  * the new extent into the tree.
3779  */
3780 static int merge_extent_mapping(struct extent_map_tree *em_tree,
3781                                 struct extent_map *existing,
3782                                 struct extent_map *em,
3783                                 u64 map_start, u64 map_len)
3784 {
3785         u64 start_diff;
3786
3787         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3788         start_diff = map_start - em->start;
3789         em->start = map_start;
3790         em->len = map_len;
3791         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3792             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3793                 em->block_start += start_diff;
3794                 em->block_len -= start_diff;
3795         }
3796         return add_extent_mapping(em_tree, em);
3797 }
3798
3799 static noinline int uncompress_inline(struct btrfs_path *path,
3800                                       struct inode *inode, struct page *page,
3801                                       size_t pg_offset, u64 extent_offset,
3802                                       struct btrfs_file_extent_item *item)
3803 {
3804         int ret;
3805         struct extent_buffer *leaf = path->nodes[0];
3806         char *tmp;
3807         size_t max_size;
3808         unsigned long inline_size;
3809         unsigned long ptr;
3810
3811         WARN_ON(pg_offset != 0);
3812         max_size = btrfs_file_extent_ram_bytes(leaf, item);
3813         inline_size = btrfs_file_extent_inline_item_len(leaf,
3814                                         btrfs_item_nr(leaf, path->slots[0]));
3815         tmp = kmalloc(inline_size, GFP_NOFS);
3816         ptr = btrfs_file_extent_inline_start(item);
3817
3818         read_extent_buffer(leaf, tmp, ptr, inline_size);
3819
3820         max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3821         ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3822                                     inline_size, max_size);
3823         if (ret) {
3824                 char *kaddr = kmap_atomic(page, KM_USER0);
3825                 unsigned long copy_size = min_t(u64,
3826                                   PAGE_CACHE_SIZE - pg_offset,
3827                                   max_size - extent_offset);
3828                 memset(kaddr + pg_offset, 0, copy_size);
3829                 kunmap_atomic(kaddr, KM_USER0);
3830         }
3831         kfree(tmp);
3832         return 0;
3833 }
3834
3835 /*
3836  * a bit scary, this does extent mapping from logical file offset to the disk.
3837  * the ugly parts come from merging extents from the disk with the
3838  * in-ram representation.  This gets more complex because of the data=ordered code,
3839  * where the in-ram extents might be locked pending data=ordered completion.
3840  *
3841  * This also copies inline extents directly into the page.
3842  */
3843 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3844                                     size_t pg_offset, u64 start, u64 len,
3845                                     int create)
3846 {
3847         int ret;
3848         int err = 0;
3849         u64 bytenr;
3850         u64 extent_start = 0;
3851         u64 extent_end = 0;
3852         u64 objectid = inode->i_ino;
3853         u32 found_type;
3854         struct btrfs_path *path = NULL;
3855         struct btrfs_root *root = BTRFS_I(inode)->root;
3856         struct btrfs_file_extent_item *item;
3857         struct extent_buffer *leaf;
3858         struct btrfs_key found_key;
3859         struct extent_map *em = NULL;
3860         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3861         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3862         struct btrfs_trans_handle *trans = NULL;
3863         int compressed;
3864
3865 again:
3866         spin_lock(&em_tree->lock);
3867         em = lookup_extent_mapping(em_tree, start, len);
3868         if (em)
3869                 em->bdev = root->fs_info->fs_devices->latest_bdev;
3870         spin_unlock(&em_tree->lock);
3871
3872         if (em) {
3873                 if (em->start > start || em->start + em->len <= start)
3874                         free_extent_map(em);
3875                 else if (em->block_start == EXTENT_MAP_INLINE && page)
3876                         free_extent_map(em);
3877                 else
3878                         goto out;
3879         }
3880         em = alloc_extent_map(GFP_NOFS);
3881         if (!em) {
3882                 err = -ENOMEM;
3883                 goto out;
3884         }
3885         em->bdev = root->fs_info->fs_devices->latest_bdev;
3886         em->start = EXTENT_MAP_HOLE;
3887         em->orig_start = EXTENT_MAP_HOLE;
3888         em->len = (u64)-1;
3889         em->block_len = (u64)-1;
3890
3891         if (!path) {
3892                 path = btrfs_alloc_path();
3893                 BUG_ON(!path);
3894         }
3895
3896         ret = btrfs_lookup_file_extent(trans, root, path,
3897                                        objectid, start, trans != NULL);
3898         if (ret < 0) {
3899                 err = ret;
3900                 goto out;
3901         }
3902
3903         if (ret != 0) {
3904                 if (path->slots[0] == 0)
3905                         goto not_found;
3906                 path->slots[0]--;
3907         }
3908
3909         leaf = path->nodes[0];
3910         item = btrfs_item_ptr(leaf, path->slots[0],
3911                               struct btrfs_file_extent_item);
3912         /* are we inside the extent that was found? */
3913         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3914         found_type = btrfs_key_type(&found_key);
3915         if (found_key.objectid != objectid ||
3916             found_type != BTRFS_EXTENT_DATA_KEY) {
3917                 goto not_found;
3918         }
3919
3920         found_type = btrfs_file_extent_type(leaf, item);
3921         extent_start = found_key.offset;
3922         compressed = btrfs_file_extent_compression(leaf, item);
3923         if (found_type == BTRFS_FILE_EXTENT_REG ||
3924             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3925                 extent_end = extent_start +
3926                        btrfs_file_extent_num_bytes(leaf, item);
3927         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3928                 size_t size;
3929                 size = btrfs_file_extent_inline_len(leaf, item);
3930                 extent_end = (extent_start + size + root->sectorsize - 1) &
3931                         ~((u64)root->sectorsize - 1);
3932         }
3933
3934         if (start >= extent_end) {
3935                 path->slots[0]++;
3936                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3937                         ret = btrfs_next_leaf(root, path);
3938                         if (ret < 0) {
3939                                 err = ret;
3940                                 goto out;
3941                         }
3942                         if (ret > 0)
3943                                 goto not_found;
3944                         leaf = path->nodes[0];
3945                 }
3946                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3947                 if (found_key.objectid != objectid ||
3948                     found_key.type != BTRFS_EXTENT_DATA_KEY)
3949                         goto not_found;
3950                 if (start + len <= found_key.offset)
3951                         goto not_found;
3952                 em->start = start;
3953                 em->len = found_key.offset - start;
3954                 goto not_found_em;
3955         }
3956
3957         if (found_type == BTRFS_FILE_EXTENT_REG ||
3958             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3959                 em->start = extent_start;
3960                 em->len = extent_end - extent_start;
3961                 em->orig_start = extent_start -
3962                                  btrfs_file_extent_offset(leaf, item);
3963                 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
3964                 if (bytenr == 0) {
3965                         em->block_start = EXTENT_MAP_HOLE;
3966                         goto insert;
3967                 }
3968                 if (compressed) {
3969                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3970                         em->block_start = bytenr;
3971                         em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
3972                                                                          item);
3973                 } else {
3974                         bytenr += btrfs_file_extent_offset(leaf, item);
3975                         em->block_start = bytenr;
3976                         em->block_len = em->len;
3977                         if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
3978                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
3979                 }
3980                 goto insert;
3981         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3982                 unsigned long ptr;
3983                 char *map;
3984                 size_t size;
3985                 size_t extent_offset;
3986                 size_t copy_size;
3987
3988                 em->block_start = EXTENT_MAP_INLINE;
3989                 if (!page || create) {
3990                         em->start = extent_start;
3991                         em->len = extent_end - extent_start;
3992                         goto out;
3993                 }
3994
3995                 size = btrfs_file_extent_inline_len(leaf, item);
3996                 extent_offset = page_offset(page) + pg_offset - extent_start;
3997                 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
3998                                 size - extent_offset);
3999                 em->start = extent_start + extent_offset;
4000                 em->len = (copy_size + root->sectorsize - 1) &
4001                         ~((u64)root->sectorsize - 1);
4002                 em->orig_start = EXTENT_MAP_INLINE;
4003                 if (compressed)
4004                         set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4005                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4006                 if (create == 0 && !PageUptodate(page)) {
4007                         if (btrfs_file_extent_compression(leaf, item) ==
4008                             BTRFS_COMPRESS_ZLIB) {
4009                                 ret = uncompress_inline(path, inode, page,
4010                                                         pg_offset,
4011                                                         extent_offset, item);
4012                                 BUG_ON(ret);
4013                         } else {
4014                                 map = kmap(page);
4015                                 read_extent_buffer(leaf, map + pg_offset, ptr,
4016                                                    copy_size);
4017                                 kunmap(page);
4018                         }
4019                         flush_dcache_page(page);
4020                 } else if (create && PageUptodate(page)) {
4021                         if (!trans) {
4022                                 kunmap(page);
4023                                 free_extent_map(em);
4024                                 em = NULL;
4025                                 btrfs_release_path(root, path);
4026                                 trans = btrfs_join_transaction(root, 1);
4027                                 goto again;
4028                         }
4029                         map = kmap(page);
4030                         write_extent_buffer(leaf, map + pg_offset, ptr,
4031                                             copy_size);
4032                         kunmap(page);
4033                         btrfs_mark_buffer_dirty(leaf);
4034                 }
4035                 set_extent_uptodate(io_tree, em->start,
4036                                     extent_map_end(em) - 1, GFP_NOFS);
4037                 goto insert;
4038         } else {
4039                 printk("unkknown found_type %d\n", found_type);
4040                 WARN_ON(1);
4041         }
4042 not_found:
4043         em->start = start;
4044         em->len = len;
4045 not_found_em:
4046         em->block_start = EXTENT_MAP_HOLE;
4047         set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4048 insert:
4049         btrfs_release_path(root, path);
4050         if (em->start > start || extent_map_end(em) <= start) {
4051                 printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
4052                 err = -EIO;
4053                 goto out;
4054         }
4055
4056         err = 0;
4057         spin_lock(&em_tree->lock);
4058         ret = add_extent_mapping(em_tree, em);
4059         /* it is possible that someone inserted the extent into the tree
4060          * while we had the lock dropped.  It is also possible that
4061          * an overlapping map exists in the tree
4062          */
4063         if (ret == -EEXIST) {
4064                 struct extent_map *existing;
4065
4066                 ret = 0;
4067
4068                 existing = lookup_extent_mapping(em_tree, start, len);
4069                 if (existing && (existing->start > start ||
4070                     existing->start + existing->len <= start)) {
4071                         free_extent_map(existing);
4072                         existing = NULL;
4073                 }
4074                 if (!existing) {
4075                         existing = lookup_extent_mapping(em_tree, em->start,
4076                                                          em->len);
4077                         if (existing) {
4078                                 err = merge_extent_mapping(em_tree, existing,
4079                                                            em, start,
4080                                                            root->sectorsize);
4081                                 free_extent_map(existing);
4082                                 if (err) {
4083                                         free_extent_map(em);
4084                                         em = NULL;
4085                                 }
4086                         } else {
4087                                 err = -EIO;
4088                                 printk("failing to insert %Lu %Lu\n",
4089                                        start, len);
4090                                 free_extent_map(em);
4091                                 em = NULL;
4092                         }
4093                 } else {
4094                         free_extent_map(em);
4095                         em = existing;
4096                         err = 0;
4097                 }
4098         }
4099         spin_unlock(&em_tree->lock);
4100 out:
4101         if (path)
4102                 btrfs_free_path(path);
4103         if (trans) {
4104                 ret = btrfs_end_transaction(trans, root);
4105                 if (!err) {
4106                         err = ret;
4107                 }
4108         }
4109         if (err) {
4110                 free_extent_map(em);
4111                 WARN_ON(1);
4112                 return ERR_PTR(err);
4113         }
4114         return em;
4115 }
4116
4117 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4118                         const struct iovec *iov, loff_t offset,
4119                         unsigned long nr_segs)
4120 {
4121         return -EINVAL;
4122 }
4123
4124 static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
4125 {
4126         return extent_bmap(mapping, iblock, btrfs_get_extent);
4127 }
4128
4129 int btrfs_readpage(struct file *file, struct page *page)
4130 {
4131         struct extent_io_tree *tree;
4132         tree = &BTRFS_I(page->mapping->host)->io_tree;
4133         return extent_read_full_page(tree, page, btrfs_get_extent);
4134 }
4135
4136 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4137 {
4138         struct extent_io_tree *tree;
4139
4140
4141         if (current->flags & PF_MEMALLOC) {
4142                 redirty_page_for_writepage(wbc, page);
4143                 unlock_page(page);
4144                 return 0;
4145         }
4146         tree = &BTRFS_I(page->mapping->host)->io_tree;
4147         return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4148 }
4149
4150 int btrfs_writepages(struct address_space *mapping,
4151                      struct writeback_control *wbc)
4152 {
4153         struct extent_io_tree *tree;
4154
4155         tree = &BTRFS_I(mapping->host)->io_tree;
4156         return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4157 }
4158
4159 static int
4160 btrfs_readpages(struct file *file, struct address_space *mapping,
4161                 struct list_head *pages, unsigned nr_pages)
4162 {
4163         struct extent_io_tree *tree;
4164         tree = &BTRFS_I(mapping->host)->io_tree;
4165         return extent_readpages(tree, mapping, pages, nr_pages,
4166                                 btrfs_get_extent);
4167 }
4168 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4169 {
4170         struct extent_io_tree *tree;
4171         struct extent_map_tree *map;
4172         int ret;
4173
4174         tree = &BTRFS_I(page->mapping->host)->io_tree;
4175         map = &BTRFS_I(page->mapping->host)->extent_tree;
4176         ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4177         if (ret == 1) {
4178                 ClearPagePrivate(page);
4179                 set_page_private(page, 0);
4180                 page_cache_release(page);
4181         }
4182         return ret;
4183 }
4184
4185 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4186 {
4187         if (PageWriteback(page) || PageDirty(page))
4188                 return 0;
4189         return __btrfs_releasepage(page, gfp_flags);
4190 }
4191
4192 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4193 {
4194         struct extent_io_tree *tree;
4195         struct btrfs_ordered_extent *ordered;
4196         u64 page_start = page_offset(page);
4197         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4198
4199         wait_on_page_writeback(page);
4200         tree = &BTRFS_I(page->mapping->host)->io_tree;
4201         if (offset) {
4202                 btrfs_releasepage(page, GFP_NOFS);
4203                 return;
4204         }
4205
4206         lock_extent(tree, page_start, page_end, GFP_NOFS);
4207         ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4208                                            page_offset(page));
4209         if (ordered) {
4210                 /*
4211                  * IO on this page will never be started, so we need
4212                  * to account for any ordered extents now
4213                  */
4214                 clear_extent_bit(tree, page_start, page_end,
4215                                  EXTENT_DIRTY | EXTENT_DELALLOC |
4216                                  EXTENT_LOCKED, 1, 0, GFP_NOFS);
4217                 btrfs_finish_ordered_io(page->mapping->host,
4218                                         page_start, page_end);
4219                 btrfs_put_ordered_extent(ordered);
4220                 lock_extent(tree, page_start, page_end, GFP_NOFS);
4221         }
4222         clear_extent_bit(tree, page_start, page_end,
4223                  EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4224                  EXTENT_ORDERED,
4225                  1, 1, GFP_NOFS);
4226         __btrfs_releasepage(page, GFP_NOFS);
4227
4228         ClearPageChecked(page);
4229         if (PagePrivate(page)) {
4230                 ClearPagePrivate(page);
4231                 set_page_private(page, 0);
4232                 page_cache_release(page);
4233         }
4234 }
4235
4236 /*
4237  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4238  * called from a page fault handler when a page is first dirtied. Hence we must
4239  * be careful to check for EOF conditions here. We set the page up correctly
4240  * for a written page which means we get ENOSPC checking when writing into
4241  * holes and correct delalloc and unwritten extent mapping on filesystems that
4242  * support these features.
4243  *
4244  * We are not allowed to take the i_mutex here so we have to play games to
4245  * protect against truncate races as the page could now be beyond EOF.  Because
4246  * vmtruncate() writes the inode size before removing pages, once we have the
4247  * page lock we can determine safely if the page is beyond EOF. If it is not
4248  * beyond EOF, then the page is guaranteed safe against truncation until we
4249  * unlock the page.
4250  */
4251 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4252 {
4253         struct inode *inode = fdentry(vma->vm_file)->d_inode;
4254         struct btrfs_root *root = BTRFS_I(inode)->root;
4255         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4256         struct btrfs_ordered_extent *ordered;
4257         char *kaddr;
4258         unsigned long zero_start;
4259         loff_t size;
4260         int ret;
4261         u64 page_start;
4262         u64 page_end;
4263
4264         ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4265         if (ret)
4266                 goto out;
4267
4268         ret = -EINVAL;
4269 again:
4270         lock_page(page);
4271         size = i_size_read(inode);
4272         page_start = page_offset(page);
4273         page_end = page_start + PAGE_CACHE_SIZE - 1;
4274
4275         if ((page->mapping != inode->i_mapping) ||
4276             (page_start >= size)) {
4277                 /* page got truncated out from underneath us */
4278                 goto out_unlock;
4279         }
4280         wait_on_page_writeback(page);
4281
4282         lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4283         set_page_extent_mapped(page);
4284
4285         /*
4286          * we can't set the delalloc bits if there are pending ordered
4287          * extents.  Drop our locks and wait for them to finish
4288          */
4289         ordered = btrfs_lookup_ordered_extent(inode, page_start);
4290         if (ordered) {
4291                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4292                 unlock_page(page);
4293                 btrfs_start_ordered_extent(inode, ordered, 1);
4294                 btrfs_put_ordered_extent(ordered);
4295                 goto again;
4296         }
4297
4298         btrfs_set_extent_delalloc(inode, page_start, page_end);
4299         ret = 0;
4300
4301         /* page is wholly or partially inside EOF */
4302         if (page_start + PAGE_CACHE_SIZE > size)
4303                 zero_start = size & ~PAGE_CACHE_MASK;
4304         else
4305                 zero_start = PAGE_CACHE_SIZE;
4306
4307         if (zero_start != PAGE_CACHE_SIZE) {
4308                 kaddr = kmap(page);
4309                 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4310                 flush_dcache_page(page);
4311                 kunmap(page);
4312         }
4313         ClearPageChecked(page);
4314         set_page_dirty(page);
4315         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4316
4317 out_unlock:
4318         unlock_page(page);
4319 out:
4320         return ret;
4321 }
4322
4323 static void btrfs_truncate(struct inode *inode)
4324 {
4325         struct btrfs_root *root = BTRFS_I(inode)->root;
4326         int ret;
4327         struct btrfs_trans_handle *trans;
4328         unsigned long nr;
4329         u64 mask = root->sectorsize - 1;
4330
4331         if (!S_ISREG(inode->i_mode))
4332                 return;
4333         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4334                 return;
4335
4336         btrfs_truncate_page(inode->i_mapping, inode->i_size);
4337         btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4338
4339         trans = btrfs_start_transaction(root, 1);
4340         btrfs_set_trans_block_group(trans, inode);
4341         btrfs_i_size_write(inode, inode->i_size);
4342
4343         ret = btrfs_orphan_add(trans, inode);
4344         if (ret)
4345                 goto out;
4346         /* FIXME, add redo link to tree so we don't leak on crash */
4347         ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4348                                       BTRFS_EXTENT_DATA_KEY);
4349         btrfs_update_inode(trans, root, inode);
4350
4351         ret = btrfs_orphan_del(trans, inode);
4352         BUG_ON(ret);
4353
4354 out:
4355         nr = trans->blocks_used;
4356         ret = btrfs_end_transaction_throttle(trans, root);
4357         BUG_ON(ret);
4358         btrfs_btree_balance_dirty(root, nr);
4359 }
4360
4361 /*
4362  * Invalidate a single dcache entry at the root of the filesystem.
4363  * Needed after creation of snapshot or subvolume.
4364  */
4365 void btrfs_invalidate_dcache_root(struct inode *dir, char *name,
4366                                   int namelen)
4367 {
4368         struct dentry *alias, *entry;
4369         struct qstr qstr;
4370
4371         alias = d_find_alias(dir);
4372         if (alias) {
4373                 qstr.name = name;
4374                 qstr.len = namelen;
4375                 /* change me if btrfs ever gets a d_hash operation */
4376                 qstr.hash = full_name_hash(qstr.name, qstr.len);
4377                 entry = d_lookup(alias, &qstr);
4378                 dput(alias);
4379                 if (entry) {
4380                         d_invalidate(entry);
4381                         dput(entry);
4382                 }
4383         }
4384 }
4385
4386 /*
4387  * create a new subvolume directory/inode (helper for the ioctl).
4388  */
4389 int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
4390                 struct btrfs_trans_handle *trans, u64 new_dirid,
4391                 struct btrfs_block_group_cache *block_group)
4392 {
4393         struct inode *inode;
4394         int error;
4395         u64 index = 0;
4396
4397         inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4398                                 new_dirid, block_group, S_IFDIR | 0700, &index);
4399         if (IS_ERR(inode))
4400                 return PTR_ERR(inode);
4401         inode->i_op = &btrfs_dir_inode_operations;
4402         inode->i_fop = &btrfs_dir_file_operations;
4403
4404         inode->i_nlink = 1;
4405         btrfs_i_size_write(inode, 0);
4406
4407         error = btrfs_update_inode(trans, new_root, inode);
4408         if (error)
4409                 return error;
4410
4411         d_instantiate(dentry, inode);
4412         return 0;
4413 }
4414
4415 /* helper function for file defrag and space balancing.  This
4416  * forces readahead on a given range of bytes in an inode
4417  */
4418 unsigned long btrfs_force_ra(struct address_space *mapping,
4419                               struct file_ra_state *ra, struct file *file,
4420                               pgoff_t offset, pgoff_t last_index)
4421 {
4422         pgoff_t req_size = last_index - offset + 1;
4423
4424         page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4425         return offset + req_size;
4426 }
4427
4428 struct inode *btrfs_alloc_inode(struct super_block *sb)
4429 {
4430         struct btrfs_inode *ei;
4431
4432         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4433         if (!ei)
4434                 return NULL;
4435         ei->last_trans = 0;
4436         ei->logged_trans = 0;
4437         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4438         ei->i_acl = BTRFS_ACL_NOT_CACHED;
4439         ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4440         INIT_LIST_HEAD(&ei->i_orphan);
4441         return &ei->vfs_inode;
4442 }
4443
4444 void btrfs_destroy_inode(struct inode *inode)
4445 {
4446         struct btrfs_ordered_extent *ordered;
4447         WARN_ON(!list_empty(&inode->i_dentry));
4448         WARN_ON(inode->i_data.nrpages);
4449
4450         if (BTRFS_I(inode)->i_acl &&
4451             BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4452                 posix_acl_release(BTRFS_I(inode)->i_acl);
4453         if (BTRFS_I(inode)->i_default_acl &&
4454             BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4455                 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4456
4457         spin_lock(&BTRFS_I(inode)->root->list_lock);
4458         if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4459                 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4460                        " list\n", inode->i_ino);
4461                 dump_stack();
4462         }
4463         spin_unlock(&BTRFS_I(inode)->root->list_lock);
4464
4465         while(1) {
4466                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4467                 if (!ordered)
4468                         break;
4469                 else {
4470                         printk("found ordered extent %Lu %Lu\n",
4471                                ordered->file_offset, ordered->len);
4472                         btrfs_remove_ordered_extent(inode, ordered);
4473                         btrfs_put_ordered_extent(ordered);
4474                         btrfs_put_ordered_extent(ordered);
4475                 }
4476         }
4477         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4478         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4479 }
4480
4481 static void init_once(void *foo)
4482 {
4483         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4484
4485         inode_init_once(&ei->vfs_inode);
4486 }
4487
4488 void btrfs_destroy_cachep(void)
4489 {
4490         if (btrfs_inode_cachep)
4491                 kmem_cache_destroy(btrfs_inode_cachep);
4492         if (btrfs_trans_handle_cachep)
4493                 kmem_cache_destroy(btrfs_trans_handle_cachep);
4494         if (btrfs_transaction_cachep)
4495                 kmem_cache_destroy(btrfs_transaction_cachep);
4496         if (btrfs_bit_radix_cachep)
4497                 kmem_cache_destroy(btrfs_bit_radix_cachep);
4498         if (btrfs_path_cachep)
4499                 kmem_cache_destroy(btrfs_path_cachep);
4500 }
4501
4502 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4503                                        unsigned long extra_flags,
4504                                        void (*ctor)(void *))
4505 {
4506         return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4507                                  SLAB_MEM_SPREAD | extra_flags), ctor);
4508 }
4509
4510 int btrfs_init_cachep(void)
4511 {
4512         btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4513                                           sizeof(struct btrfs_inode),
4514                                           0, init_once);
4515         if (!btrfs_inode_cachep)
4516                 goto fail;
4517         btrfs_trans_handle_cachep =
4518                         btrfs_cache_create("btrfs_trans_handle_cache",
4519                                            sizeof(struct btrfs_trans_handle),
4520                                            0, NULL);
4521         if (!btrfs_trans_handle_cachep)
4522                 goto fail;
4523         btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4524                                              sizeof(struct btrfs_transaction),
4525                                              0, NULL);
4526         if (!btrfs_transaction_cachep)
4527                 goto fail;
4528         btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4529                                          sizeof(struct btrfs_path),
4530                                          0, NULL);
4531         if (!btrfs_path_cachep)
4532                 goto fail;
4533         btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4534                                               SLAB_DESTROY_BY_RCU, NULL);
4535         if (!btrfs_bit_radix_cachep)
4536                 goto fail;
4537         return 0;
4538 fail:
4539         btrfs_destroy_cachep();
4540         return -ENOMEM;
4541 }
4542
4543 static int btrfs_getattr(struct vfsmount *mnt,
4544                          struct dentry *dentry, struct kstat *stat)
4545 {
4546         struct inode *inode = dentry->d_inode;
4547         generic_fillattr(inode, stat);
4548         stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4549         stat->blksize = PAGE_CACHE_SIZE;
4550         stat->blocks = (inode_get_bytes(inode) +
4551                         BTRFS_I(inode)->delalloc_bytes) >> 9;
4552         return 0;
4553 }
4554
4555 static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
4556                            struct inode * new_dir,struct dentry *new_dentry)
4557 {
4558         struct btrfs_trans_handle *trans;
4559         struct btrfs_root *root = BTRFS_I(old_dir)->root;
4560         struct inode *new_inode = new_dentry->d_inode;
4561         struct inode *old_inode = old_dentry->d_inode;
4562         struct timespec ctime = CURRENT_TIME;
4563         u64 index = 0;
4564         int ret;
4565
4566         /* we're not allowed to rename between subvolumes */
4567         if (BTRFS_I(old_inode)->root->root_key.objectid !=
4568             BTRFS_I(new_dir)->root->root_key.objectid)
4569                 return -EXDEV;
4570
4571         if (S_ISDIR(old_inode->i_mode) && new_inode &&
4572             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4573                 return -ENOTEMPTY;
4574         }
4575
4576         /* to rename a snapshot or subvolume, we need to juggle the
4577          * backrefs.  This isn't coded yet
4578          */
4579         if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4580                 return -EXDEV;
4581
4582         ret = btrfs_check_free_space(root, 1, 0);
4583         if (ret)
4584                 goto out_unlock;
4585
4586         trans = btrfs_start_transaction(root, 1);
4587
4588         btrfs_set_trans_block_group(trans, new_dir);
4589
4590         btrfs_inc_nlink(old_dentry->d_inode);
4591         old_dir->i_ctime = old_dir->i_mtime = ctime;
4592         new_dir->i_ctime = new_dir->i_mtime = ctime;
4593         old_inode->i_ctime = ctime;
4594
4595         ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4596                                  old_dentry->d_name.name,
4597                                  old_dentry->d_name.len);
4598         if (ret)
4599                 goto out_fail;
4600
4601         if (new_inode) {
4602                 new_inode->i_ctime = CURRENT_TIME;
4603                 ret = btrfs_unlink_inode(trans, root, new_dir,
4604                                          new_dentry->d_inode,
4605                                          new_dentry->d_name.name,
4606                                          new_dentry->d_name.len);
4607                 if (ret)
4608                         goto out_fail;
4609                 if (new_inode->i_nlink == 0) {
4610                         ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4611                         if (ret)
4612                                 goto out_fail;
4613                 }
4614
4615         }
4616         ret = btrfs_set_inode_index(new_dir, &index);
4617         if (ret)
4618                 goto out_fail;
4619
4620         ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4621                              old_inode, new_dentry->d_name.name,
4622                              new_dentry->d_name.len, 1, index);
4623         if (ret)
4624                 goto out_fail;
4625
4626 out_fail:
4627         btrfs_end_transaction_throttle(trans, root);
4628 out_unlock:
4629         return ret;
4630 }
4631
4632 /*
4633  * some fairly slow code that needs optimization. This walks the list
4634  * of all the inodes with pending delalloc and forces them to disk.
4635  */
4636 int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4637 {
4638         struct list_head *head = &root->fs_info->delalloc_inodes;
4639         struct btrfs_inode *binode;
4640         struct inode *inode;
4641         unsigned long flags;
4642
4643         if (root->fs_info->sb->s_flags & MS_RDONLY)
4644                 return -EROFS;
4645
4646         spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
4647         while(!list_empty(head)) {
4648                 binode = list_entry(head->next, struct btrfs_inode,
4649                                     delalloc_inodes);
4650                 inode = igrab(&binode->vfs_inode);
4651                 if (!inode)
4652                         list_del_init(&binode->delalloc_inodes);
4653                 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
4654                 if (inode) {
4655                         filemap_flush(inode->i_mapping);
4656                         iput(inode);
4657                 }
4658                 cond_resched();
4659                 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
4660         }
4661         spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
4662
4663         /* the filemap_flush will queue IO into the worker threads, but
4664          * we have to make sure the IO is actually started and that
4665          * ordered extents get created before we return
4666          */
4667         atomic_inc(&root->fs_info->async_submit_draining);
4668         while(atomic_read(&root->fs_info->nr_async_submits) ||
4669               atomic_read(&root->fs_info->async_delalloc_pages)) {
4670                 wait_event(root->fs_info->async_submit_wait,
4671                    (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4672                     atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4673         }
4674         atomic_dec(&root->fs_info->async_submit_draining);
4675         return 0;
4676 }
4677
4678 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4679                          const char *symname)
4680 {
4681         struct btrfs_trans_handle *trans;
4682         struct btrfs_root *root = BTRFS_I(dir)->root;
4683         struct btrfs_path *path;
4684         struct btrfs_key key;
4685         struct inode *inode = NULL;
4686         int err;
4687         int drop_inode = 0;
4688         u64 objectid;
4689         u64 index = 0 ;
4690         int name_len;
4691         int datasize;
4692         unsigned long ptr;
4693         struct btrfs_file_extent_item *ei;
4694         struct extent_buffer *leaf;
4695         unsigned long nr = 0;
4696
4697         name_len = strlen(symname) + 1;
4698         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4699                 return -ENAMETOOLONG;
4700
4701         err = btrfs_check_free_space(root, 1, 0);
4702         if (err)
4703                 goto out_fail;
4704
4705         trans = btrfs_start_transaction(root, 1);
4706         btrfs_set_trans_block_group(trans, dir);
4707
4708         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4709         if (err) {
4710                 err = -ENOSPC;
4711                 goto out_unlock;
4712         }
4713
4714         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4715                                 dentry->d_name.len,
4716                                 dentry->d_parent->d_inode->i_ino, objectid,
4717                                 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4718                                 &index);
4719         err = PTR_ERR(inode);
4720         if (IS_ERR(inode))
4721                 goto out_unlock;
4722
4723         err = btrfs_init_acl(inode, dir);
4724         if (err) {
4725                 drop_inode = 1;
4726                 goto out_unlock;
4727         }
4728
4729         btrfs_set_trans_block_group(trans, inode);
4730         err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4731         if (err)
4732                 drop_inode = 1;
4733         else {
4734                 inode->i_mapping->a_ops = &btrfs_aops;
4735                 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4736                 inode->i_fop = &btrfs_file_operations;
4737                 inode->i_op = &btrfs_file_inode_operations;
4738                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4739         }
4740         dir->i_sb->s_dirt = 1;
4741         btrfs_update_inode_block_group(trans, inode);
4742         btrfs_update_inode_block_group(trans, dir);
4743         if (drop_inode)
4744                 goto out_unlock;
4745
4746         path = btrfs_alloc_path();
4747         BUG_ON(!path);
4748         key.objectid = inode->i_ino;
4749         key.offset = 0;
4750         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4751         datasize = btrfs_file_extent_calc_inline_size(name_len);
4752         err = btrfs_insert_empty_item(trans, root, path, &key,
4753                                       datasize);
4754         if (err) {
4755                 drop_inode = 1;
4756                 goto out_unlock;
4757         }
4758         leaf = path->nodes[0];
4759         ei = btrfs_item_ptr(leaf, path->slots[0],
4760                             struct btrfs_file_extent_item);
4761         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4762         btrfs_set_file_extent_type(leaf, ei,
4763                                    BTRFS_FILE_EXTENT_INLINE);
4764         btrfs_set_file_extent_encryption(leaf, ei, 0);
4765         btrfs_set_file_extent_compression(leaf, ei, 0);
4766         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4767         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4768
4769         ptr = btrfs_file_extent_inline_start(ei);
4770         write_extent_buffer(leaf, symname, ptr, name_len);
4771         btrfs_mark_buffer_dirty(leaf);
4772         btrfs_free_path(path);
4773
4774         inode->i_op = &btrfs_symlink_inode_operations;
4775         inode->i_mapping->a_ops = &btrfs_symlink_aops;
4776         inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4777         inode_set_bytes(inode, name_len);
4778         btrfs_i_size_write(inode, name_len - 1);
4779         err = btrfs_update_inode(trans, root, inode);
4780         if (err)
4781                 drop_inode = 1;
4782
4783 out_unlock:
4784         nr = trans->blocks_used;
4785         btrfs_end_transaction_throttle(trans, root);
4786 out_fail:
4787         if (drop_inode) {
4788                 inode_dec_link_count(inode);
4789                 iput(inode);
4790         }
4791         btrfs_btree_balance_dirty(root, nr);
4792         return err;
4793 }
4794
4795 static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4796                                u64 alloc_hint, int mode)
4797 {
4798         struct btrfs_trans_handle *trans;
4799         struct btrfs_root *root = BTRFS_I(inode)->root;
4800         struct btrfs_key ins;
4801         u64 alloc_size;
4802         u64 cur_offset = start;
4803         u64 num_bytes = end - start;
4804         int ret = 0;
4805
4806         trans = btrfs_join_transaction(root, 1);
4807         BUG_ON(!trans);
4808         btrfs_set_trans_block_group(trans, inode);
4809
4810         while (num_bytes > 0) {
4811                 alloc_size = min(num_bytes, root->fs_info->max_extent);
4812                 ret = btrfs_reserve_extent(trans, root, alloc_size,
4813                                            root->sectorsize, 0, alloc_hint,
4814                                            (u64)-1, &ins, 1);
4815                 if (ret) {
4816                         WARN_ON(1);
4817                         goto out;
4818                 }
4819                 ret = insert_reserved_file_extent(trans, inode,
4820                                                   cur_offset, ins.objectid,
4821                                                   ins.offset, ins.offset,
4822                                                   ins.offset, 0, 0, 0,
4823                                                   BTRFS_FILE_EXTENT_PREALLOC);
4824                 BUG_ON(ret);
4825                 num_bytes -= ins.offset;
4826                 cur_offset += ins.offset;
4827                 alloc_hint = ins.objectid + ins.offset;
4828         }
4829 out:
4830         if (cur_offset > start) {
4831                 inode->i_ctime = CURRENT_TIME;
4832                 btrfs_set_flag(inode, PREALLOC);
4833                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4834                     cur_offset > i_size_read(inode))
4835                         btrfs_i_size_write(inode, cur_offset);
4836                 ret = btrfs_update_inode(trans, root, inode);
4837                 BUG_ON(ret);
4838         }
4839
4840         btrfs_end_transaction(trans, root);
4841         return ret;
4842 }
4843
4844 static long btrfs_fallocate(struct inode *inode, int mode,
4845                             loff_t offset, loff_t len)
4846 {
4847         u64 cur_offset;
4848         u64 last_byte;
4849         u64 alloc_start;
4850         u64 alloc_end;
4851         u64 alloc_hint = 0;
4852         u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4853         struct extent_map *em;
4854         int ret;
4855
4856         alloc_start = offset & ~mask;
4857         alloc_end =  (offset + len + mask) & ~mask;
4858
4859         mutex_lock(&inode->i_mutex);
4860         if (alloc_start > inode->i_size) {
4861                 ret = btrfs_cont_expand(inode, alloc_start);
4862                 if (ret)
4863                         goto out;
4864         }
4865
4866         while (1) {
4867                 struct btrfs_ordered_extent *ordered;
4868                 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4869                             alloc_end - 1, GFP_NOFS);
4870                 ordered = btrfs_lookup_first_ordered_extent(inode,
4871                                                             alloc_end - 1);
4872                 if (ordered &&
4873                     ordered->file_offset + ordered->len > alloc_start &&
4874                     ordered->file_offset < alloc_end) {
4875                         btrfs_put_ordered_extent(ordered);
4876                         unlock_extent(&BTRFS_I(inode)->io_tree,
4877                                       alloc_start, alloc_end - 1, GFP_NOFS);
4878                         btrfs_wait_ordered_range(inode, alloc_start,
4879                                                  alloc_end - alloc_start);
4880                 } else {
4881                         if (ordered)
4882                                 btrfs_put_ordered_extent(ordered);
4883                         break;
4884                 }
4885         }
4886
4887         cur_offset = alloc_start;
4888         while (1) {
4889                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4890                                       alloc_end - cur_offset, 0);
4891                 BUG_ON(IS_ERR(em) || !em);
4892                 last_byte = min(extent_map_end(em), alloc_end);
4893                 last_byte = (last_byte + mask) & ~mask;
4894                 if (em->block_start == EXTENT_MAP_HOLE) {
4895                         ret = prealloc_file_range(inode, cur_offset,
4896                                         last_byte, alloc_hint, mode);
4897                         if (ret < 0) {
4898                                 free_extent_map(em);
4899                                 break;
4900                         }
4901                 }
4902                 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4903                         alloc_hint = em->block_start;
4904                 free_extent_map(em);
4905
4906                 cur_offset = last_byte;
4907                 if (cur_offset >= alloc_end) {
4908                         ret = 0;
4909                         break;
4910                 }
4911         }
4912         unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4913                       GFP_NOFS);
4914 out:
4915         mutex_unlock(&inode->i_mutex);
4916         return ret;
4917 }
4918
/*
 * Mark a page dirty by delegating to __set_page_dirty_nobuffers() and
 * passing its result back unchanged.
 */
static int btrfs_set_page_dirty(struct page *page)
{
	int ret = __set_page_dirty_nobuffers(page);

	return ret;
}
4923
4924 static int btrfs_permission(struct inode *inode, int mask)
4925 {
4926         if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4927                 return -EACCES;
4928         return generic_permission(inode, mask, btrfs_check_acl);
4929 }
4930
4931 static struct inode_operations btrfs_dir_inode_operations = {
4932         .getattr        = btrfs_getattr,
4933         .lookup         = btrfs_lookup,
4934         .create         = btrfs_create,
4935         .unlink         = btrfs_unlink,
4936         .link           = btrfs_link,
4937         .mkdir          = btrfs_mkdir,
4938         .rmdir          = btrfs_rmdir,
4939         .rename         = btrfs_rename,
4940         .symlink        = btrfs_symlink,
4941         .setattr        = btrfs_setattr,
4942         .mknod          = btrfs_mknod,
4943         .setxattr       = btrfs_setxattr,
4944         .getxattr       = btrfs_getxattr,
4945         .listxattr      = btrfs_listxattr,
4946         .removexattr    = btrfs_removexattr,
4947         .permission     = btrfs_permission,
4948 };
4949 static struct inode_operations btrfs_dir_ro_inode_operations = {
4950         .lookup         = btrfs_lookup,
4951         .permission     = btrfs_permission,
4952 };
4953 static struct file_operations btrfs_dir_file_operations = {
4954         .llseek         = generic_file_llseek,
4955         .read           = generic_read_dir,
4956         .readdir        = btrfs_real_readdir,
4957         .unlocked_ioctl = btrfs_ioctl,
4958 #ifdef CONFIG_COMPAT
4959         .compat_ioctl   = btrfs_ioctl,
4960 #endif
4961         .release        = btrfs_release_file,
4962         .fsync          = btrfs_sync_file,
4963 };
4964
4965 static struct extent_io_ops btrfs_extent_io_ops = {
4966         .fill_delalloc = run_delalloc_range,
4967         .submit_bio_hook = btrfs_submit_bio_hook,
4968         .merge_bio_hook = btrfs_merge_bio_hook,
4969         .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4970         .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4971         .writepage_start_hook = btrfs_writepage_start_hook,
4972         .readpage_io_failed_hook = btrfs_io_failed_hook,
4973         .set_bit_hook = btrfs_set_bit_hook,
4974         .clear_bit_hook = btrfs_clear_bit_hook,
4975 };
4976
4977 static struct address_space_operations btrfs_aops = {
4978         .readpage       = btrfs_readpage,
4979         .writepage      = btrfs_writepage,
4980         .writepages     = btrfs_writepages,
4981         .readpages      = btrfs_readpages,
4982         .sync_page      = block_sync_page,
4983         .bmap           = btrfs_bmap,
4984         .direct_IO      = btrfs_direct_IO,
4985         .invalidatepage = btrfs_invalidatepage,
4986         .releasepage    = btrfs_releasepage,
4987         .set_page_dirty = btrfs_set_page_dirty,
4988 };
4989
4990 static struct address_space_operations btrfs_symlink_aops = {
4991         .readpage       = btrfs_readpage,
4992         .writepage      = btrfs_writepage,
4993         .invalidatepage = btrfs_invalidatepage,
4994         .releasepage    = btrfs_releasepage,
4995 };
4996
4997 static struct inode_operations btrfs_file_inode_operations = {
4998         .truncate       = btrfs_truncate,
4999         .getattr        = btrfs_getattr,
5000         .setattr        = btrfs_setattr,
5001         .setxattr       = btrfs_setxattr,
5002         .getxattr       = btrfs_getxattr,
5003         .listxattr      = btrfs_listxattr,
5004         .removexattr    = btrfs_removexattr,
5005         .permission     = btrfs_permission,
5006         .fallocate      = btrfs_fallocate,
5007 };
5008 static struct inode_operations btrfs_special_inode_operations = {
5009         .getattr        = btrfs_getattr,
5010         .setattr        = btrfs_setattr,
5011         .permission     = btrfs_permission,
5012         .setxattr       = btrfs_setxattr,
5013         .getxattr       = btrfs_getxattr,
5014         .listxattr      = btrfs_listxattr,
5015         .removexattr    = btrfs_removexattr,
5016 };
5017 static struct inode_operations btrfs_symlink_inode_operations = {
5018         .readlink       = generic_readlink,
5019         .follow_link    = page_follow_link_light,
5020         .put_link       = page_put_link,
5021         .permission     = btrfs_permission,
5022 };