]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Apr 2009 17:20:44 +0000 (10:20 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Apr 2009 17:20:44 +0000 (10:20 -0700)
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: try to free metadata pages when we free btree blocks
  Btrfs: add extra flushing for renames and truncates
  Btrfs: make sure btrfs_update_delayed_ref doesn't increase ref_mod
  Btrfs: optimize fsyncs on old files
  Btrfs: tree logging unlink/rename fixes
  Btrfs: Make sure i_nlink doesn't hit zero too soon during log replay
  Btrfs: limit balancing work while flushing delayed refs
  Btrfs: readahead checksums during btrfs_finish_ordered_io
  Btrfs: leave btree locks spinning more often
  Btrfs: Only let very young transactions grow during commit
  Btrfs: Check for a blocking lock before taking the spin
  Btrfs: reduce stack in cow_file_range
  Btrfs: reduce stalls during transaction commit
  Btrfs: process the delayed reference queue in clusters
  Btrfs: try to cleanup delayed refs while freeing extents
  Btrfs: reduce stack usage in some crucial tree balancing functions
  Btrfs: do extent allocation and reference count updates in the background
  Btrfs: don't preallocate metadata blocks during btrfs_search_slot

24 files changed:
fs/btrfs/Makefile
fs/btrfs/btrfs_inode.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-ref.c [new file with mode: 0644]
fs/btrfs/delayed-ref.h [new file with mode: 0644]
fs/btrfs/dir-item.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/inode-item.c
fs/btrfs/inode.c
fs/btrfs/locking.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-defrag.c
fs/btrfs/tree-log.c
fs/btrfs/tree-log.h

index d2cf5a54a4b816d308ed7c452892fae27e009db1..9adf5e4f7e967b09045e8444c80dd819d72f3031 100644 (file)
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
-          compression.o
+          compression.o delayed-ref.o
 else
 
 # Normal Makefile
index 72677ce2b74fc87b1ff6a277906492389085f687..b30986f00b9d7eebb9bfd50c93e86a9a34f329be 100644 (file)
@@ -66,6 +66,12 @@ struct btrfs_inode {
         */
        struct list_head delalloc_inodes;
 
+       /*
+        * list for tracking inodes that must be sent to disk before a
+        * rename or truncate commit
+        */
+       struct list_head ordered_operations;
+
        /* the space_info for where this inode's data allocations are done */
        struct btrfs_space_info *space_info;
 
@@ -86,12 +92,6 @@ struct btrfs_inode {
         */
        u64 logged_trans;
 
-       /*
-        * trans that last made a change that should be fully fsync'd.  This
-        * gets reset to zero each time the inode is logged
-        */
-       u64 log_dirty_trans;
-
        /* total number of bytes pending delalloc, used by stat to calc the
         * real block usage of the file
         */
@@ -121,6 +121,25 @@ struct btrfs_inode {
        /* the start of block group preferred for allocations. */
        u64 block_group;
 
+       /* the fsync log has some corner cases that mean we have to check
+        * directories to see if any unlinks have been done before
+        * the directory was logged.  See tree-log.c for all the
+        * details
+        */
+       u64 last_unlink_trans;
+
+       /*
+        * ordered_data_close is set by truncate when a file that used
+        * to have good data has been truncated to zero.  When it is set
+        * the btrfs file release call will add this inode to the
+        * ordered operations list so that we make sure to flush out any
+        * new data the application may have written before commit.
+        *
+        * yes, its silly to have a single bitflag, but we might grow more
+        * of these.
+        */
+       unsigned ordered_data_close:1;
+
        struct inode vfs_inode;
 };
 
index 37f31b5529aa0abe62a4e30ec7170c69c443eb0a..dbb7241246335373dde1fa8597744faec6748942 100644 (file)
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
  * empty_size -- a hint that you plan on doing more cow.  This is the size in
  * bytes the allocator should try to find free next to the block it returns.
  * This is just a hint and may be ignored by the allocator.
- *
- * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.
- * btrfs_alloc_reserved_extent is used to finish the allocation.
  */
 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
-                            u64 search_start, u64 empty_size,
-                            u64 prealloc_dest)
+                            u64 search_start, u64 empty_size)
 {
        u64 parent_start;
        struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        level = btrfs_header_level(buf);
        nritems = btrfs_header_nritems(buf);
 
-       if (prealloc_dest) {
-               struct btrfs_key ins;
-
-               ins.objectid = prealloc_dest;
-               ins.offset = buf->len;
-               ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-               ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
-                                                 root->root_key.objectid,
-                                                 trans->transid, level, &ins);
-               BUG_ON(ret);
-               cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-                                           buf->len, level);
-       } else {
-               cow = btrfs_alloc_free_block(trans, root, buf->len,
-                                            parent_start,
-                                            root->root_key.objectid,
-                                            trans->transid, level,
-                                            search_start, empty_size);
-       }
+       cow = btrfs_alloc_free_block(trans, root, buf->len,
+                                    parent_start, root->root_key.objectid,
+                                    trans->transid, level,
+                                    search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
-                   struct extent_buffer **cow_ret, u64 prealloc_dest)
+                   struct extent_buffer **cow_ret)
 {
        u64 search_start;
        int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
            btrfs_header_owner(buf) == root->root_key.objectid &&
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
                *cow_ret = buf;
-               WARN_ON(prealloc_dest);
                return 0;
        }
 
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
        btrfs_set_lock_blocking(buf);
 
        ret = __btrfs_cow_block(trans, root, buf, parent,
-                                parent_slot, cow_ret, search_start, 0,
-                                prealloc_dest);
+                                parent_slot, cow_ret, search_start, 0);
        return ret;
 }
 
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                err = __btrfs_cow_block(trans, root, cur, parent, i,
                                        &cur, search_start,
                                        min(16 * blocksize,
-                                           (end_slot - i) * blocksize), 0);
+                                           (end_slot - i) * blocksize));
                if (err) {
                        btrfs_tree_unlock(cur);
                        free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                BUG_ON(!child);
                btrfs_tree_lock(child);
                btrfs_set_lock_blocking(child);
-               ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+               ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
                BUG_ON(ret);
 
                spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                spin_unlock(&root->node_lock);
 
                ret = btrfs_update_extent_ref(trans, root, child->start,
+                                             child->len,
                                              mid->start, child->start,
                                              root->root_key.objectid,
                                              trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
                return 0;
 
+       if (trans->transaction->delayed_refs.flushing &&
+           btrfs_header_nritems(mid) > 2)
+               return 0;
+
        if (btrfs_header_nritems(mid) < 2)
                err_on_enospc = 1;
 
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                btrfs_tree_lock(left);
                btrfs_set_lock_blocking(left);
                wret = btrfs_cow_block(trans, root, left,
-                                      parent, pslot - 1, &left, 0);
+                                      parent, pslot - 1, &left);
                if (wret) {
                        ret = wret;
                        goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                btrfs_tree_lock(right);
                btrfs_set_lock_blocking(right);
                wret = btrfs_cow_block(trans, root, right,
-                                      parent, pslot + 1, &right, 0);
+                                      parent, pslot + 1, &right);
                if (wret) {
                        ret = wret;
                        goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, left, parent,
-                                             pslot - 1, &left, 0);
+                                             pslot - 1, &left);
                        if (ret)
                                wret = 1;
                        else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                } else {
                        ret = btrfs_cow_block(trans, root, right,
                                              parent, pslot + 1,
-                                             &right, 0);
+                                             &right);
                        if (ret)
                                wret = 1;
                        else {
@@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        u8 lowest_level = 0;
        u64 blocknr;
        u64 gen;
-       struct btrfs_key prealloc_block;
 
        lowest_level = p->lowest_level;
        WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        if (ins_len < 0)
                lowest_unlock = 2;
 
-       prealloc_block.objectid = 0;
-
 again:
        if (p->skip_locking)
                b = btrfs_root_node(root);
@@ -1529,44 +1508,11 @@ again:
                            !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
                                goto cow_done;
                        }
-
-                       /* ok, we have to cow, is our old prealloc the right
-                        * size?
-                        */
-                       if (prealloc_block.objectid &&
-                           prealloc_block.offset != b->len) {
-                               btrfs_release_path(root, p);
-                               btrfs_free_reserved_extent(root,
-                                          prealloc_block.objectid,
-                                          prealloc_block.offset);
-                               prealloc_block.objectid = 0;
-                               goto again;
-                       }
-
-                       /*
-                        * for higher level blocks, try not to allocate blocks
-                        * with the block and the parent locks held.
-                        */
-                       if (level > 0 && !prealloc_block.objectid) {
-                               u32 size = b->len;
-                               u64 hint = b->start;
-
-                               btrfs_release_path(root, p);
-                               ret = btrfs_reserve_extent(trans, root,
-                                                          size, size, 0,
-                                                          hint, (u64)-1,
-                                                          &prealloc_block, 0);
-                               BUG_ON(ret);
-                               goto again;
-                       }
-
                        btrfs_set_path_blocking(p);
 
                        wret = btrfs_cow_block(trans, root, b,
                                               p->nodes[level + 1],
-                                              p->slots[level + 1],
-                                              &b, prealloc_block.objectid);
-                       prealloc_block.objectid = 0;
+                                              p->slots[level + 1], &b);
                        if (wret) {
                                free_extent_buffer(b);
                                ret = wret;
@@ -1742,12 +1688,8 @@ done:
         * we don't really know what they plan on doing with the path
         * from here on, so for now just mark it as blocking
         */
-       btrfs_set_path_blocking(p);
-       if (prealloc_block.objectid) {
-               btrfs_free_reserved_extent(root,
-                          prealloc_block.objectid,
-                          prealloc_block.offset);
-       }
+       if (!p->leave_spinning)
+               btrfs_set_path_blocking(p);
        return ret;
 }
 
@@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
        int ret;
 
        eb = btrfs_lock_root_node(root);
-       ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+       ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
        BUG_ON(ret);
 
        btrfs_set_lock_blocking(eb);
@@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
                        }
 
                        ret = btrfs_cow_block(trans, root, eb, parent, slot,
-                                             &eb, 0);
+                                             &eb);
                        BUG_ON(ret);
 
                        if (root->root_key.objectid ==
@@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        spin_unlock(&root->node_lock);
 
        ret = btrfs_update_extent_ref(trans, root, lower->start,
-                                     lower->start, c->start,
+                                     lower->len, lower->start, c->start,
                                      root->root_key.objectid,
                                      trans->transid, level - 1);
        BUG_ON(ret);
@@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
                ret = insert_new_root(trans, root, path, level + 1);
                if (ret)
                        return ret;
-       } else {
+       } else if (!trans->transaction->delayed_refs.flushing) {
                ret = push_nodes_for_insert(trans, root, path, level);
                c = path->nodes[level];
                if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
        return ret;
 }
 
-/*
- * push some data in the path leaf to the right, trying to free up at
- * least data_size bytes.  returns zero if the push worked, nonzero otherwise
- *
- * returns 1 if the push failed because the other node didn't have enough
- * room, 0 if everything worked out and < 0 if there were major errors.
- */
-static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-                          *root, struct btrfs_path *path, int data_size,
-                          int empty)
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root,
+                                     struct btrfs_path *path,
+                                     int data_size, int empty,
+                                     struct extent_buffer *right,
+                                     int free_space, u32 left_nritems)
 {
        struct extent_buffer *left = path->nodes[0];
-       struct extent_buffer *right;
-       struct extent_buffer *upper;
+       struct extent_buffer *upper = path->nodes[1];
        struct btrfs_disk_key disk_key;
        int slot;
        u32 i;
-       int free_space;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
-       u32 left_nritems;
        u32 nr;
        u32 right_nritems;
        u32 data_end;
        u32 this_item_size;
        int ret;
 
-       slot = path->slots[1];
-       if (!path->nodes[1])
-               return 1;
-
-       upper = path->nodes[1];
-       if (slot >= btrfs_header_nritems(upper) - 1)
-               return 1;
-
-       btrfs_assert_tree_locked(path->nodes[1]);
-
-       right = read_node_slot(root, upper, slot + 1);
-       btrfs_tree_lock(right);
-       btrfs_set_lock_blocking(right);
-
-       free_space = btrfs_leaf_free_space(root, right);
-       if (free_space < data_size)
-               goto out_unlock;
-
-       /* cow and double check */
-       ret = btrfs_cow_block(trans, root, right, upper,
-                             slot + 1, &right, 0);
-       if (ret)
-               goto out_unlock;
-
-       free_space = btrfs_leaf_free_space(root, right);
-       if (free_space < data_size)
-               goto out_unlock;
-
-       left_nritems = btrfs_header_nritems(left);
-       if (left_nritems == 0)
-               goto out_unlock;
-
        if (empty)
                nr = 0;
        else
@@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        if (path->slots[0] >= left_nritems)
                push_space += data_size;
 
+       slot = path->slots[1];
        i = left_nritems - 1;
        while (i >= nr) {
                item = btrfs_item_nr(left, i);
@@ -2527,25 +2431,83 @@ out_unlock:
        return 1;
 }
 
+/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+                          *root, struct btrfs_path *path, int data_size,
+                          int empty)
+{
+       struct extent_buffer *left = path->nodes[0];
+       struct extent_buffer *right;
+       struct extent_buffer *upper;
+       int slot;
+       int free_space;
+       u32 left_nritems;
+       int ret;
+
+       if (!path->nodes[1])
+               return 1;
+
+       slot = path->slots[1];
+       upper = path->nodes[1];
+       if (slot >= btrfs_header_nritems(upper) - 1)
+               return 1;
+
+       btrfs_assert_tree_locked(path->nodes[1]);
+
+       right = read_node_slot(root, upper, slot + 1);
+       btrfs_tree_lock(right);
+       btrfs_set_lock_blocking(right);
+
+       free_space = btrfs_leaf_free_space(root, right);
+       if (free_space < data_size)
+               goto out_unlock;
+
+       /* cow and double check */
+       ret = btrfs_cow_block(trans, root, right, upper,
+                             slot + 1, &right);
+       if (ret)
+               goto out_unlock;
+
+       free_space = btrfs_leaf_free_space(root, right);
+       if (free_space < data_size)
+               goto out_unlock;
+
+       left_nritems = btrfs_header_nritems(left);
+       if (left_nritems == 0)
+               goto out_unlock;
+
+       return __push_leaf_right(trans, root, path, data_size, empty,
+                               right, free_space, left_nritems);
+out_unlock:
+       btrfs_tree_unlock(right);
+       free_extent_buffer(right);
+       return 1;
+}
+
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
  */
-static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-                         *root, struct btrfs_path *path, int data_size,
-                         int empty)
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_path *path, int data_size,
+                                    int empty, struct extent_buffer *left,
+                                    int free_space, int right_nritems)
 {
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
-       struct extent_buffer *left;
        int slot;
        int i;
-       int free_space;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
        u32 old_left_nritems;
-       u32 right_nritems;
        u32 nr;
        int ret = 0;
        int wret;
@@ -2553,41 +2515,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
        u32 old_left_item_size;
 
        slot = path->slots[1];
-       if (slot == 0)
-               return 1;
-       if (!path->nodes[1])
-               return 1;
-
-       right_nritems = btrfs_header_nritems(right);
-       if (right_nritems == 0)
-               return 1;
-
-       btrfs_assert_tree_locked(path->nodes[1]);
-
-       left = read_node_slot(root, path->nodes[1], slot - 1);
-       btrfs_tree_lock(left);
-       btrfs_set_lock_blocking(left);
-
-       free_space = btrfs_leaf_free_space(root, left);
-       if (free_space < data_size) {
-               ret = 1;
-               goto out;
-       }
-
-       /* cow and double check */
-       ret = btrfs_cow_block(trans, root, left,
-                             path->nodes[1], slot - 1, &left, 0);
-       if (ret) {
-               /* we hit -ENOSPC, but it isn't fatal here */
-               ret = 1;
-               goto out;
-       }
-
-       free_space = btrfs_leaf_free_space(root, left);
-       if (free_space < data_size) {
-               ret = 1;
-               goto out;
-       }
 
        if (empty)
                nr = right_nritems;
@@ -2754,6 +2681,154 @@ out:
        return ret;
 }
 
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+                         *root, struct btrfs_path *path, int data_size,
+                         int empty)
+{
+       struct extent_buffer *right = path->nodes[0];
+       struct extent_buffer *left;
+       int slot;
+       int free_space;
+       u32 right_nritems;
+       int ret = 0;
+
+       slot = path->slots[1];
+       if (slot == 0)
+               return 1;
+       if (!path->nodes[1])
+               return 1;
+
+       right_nritems = btrfs_header_nritems(right);
+       if (right_nritems == 0)
+               return 1;
+
+       btrfs_assert_tree_locked(path->nodes[1]);
+
+       left = read_node_slot(root, path->nodes[1], slot - 1);
+       btrfs_tree_lock(left);
+       btrfs_set_lock_blocking(left);
+
+       free_space = btrfs_leaf_free_space(root, left);
+       if (free_space < data_size) {
+               ret = 1;
+               goto out;
+       }
+
+       /* cow and double check */
+       ret = btrfs_cow_block(trans, root, left,
+                             path->nodes[1], slot - 1, &left);
+       if (ret) {
+               /* we hit -ENOSPC, but it isn't fatal here */
+               ret = 1;
+               goto out;
+       }
+
+       free_space = btrfs_leaf_free_space(root, left);
+       if (free_space < data_size) {
+               ret = 1;
+               goto out;
+       }
+
+       return __push_leaf_left(trans, root, path, data_size,
+                              empty, left, free_space, right_nritems);
+out:
+       btrfs_tree_unlock(left);
+       free_extent_buffer(left);
+       return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct btrfs_path *path,
+                              struct extent_buffer *l,
+                              struct extent_buffer *right,
+                              int slot, int mid, int nritems)
+{
+       int data_copy_size;
+       int rt_data_off;
+       int i;
+       int ret = 0;
+       int wret;
+       struct btrfs_disk_key disk_key;
+
+       nritems = nritems - mid;
+       btrfs_set_header_nritems(right, nritems);
+       data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+       copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+                          btrfs_item_nr_offset(mid),
+                          nritems * sizeof(struct btrfs_item));
+
+       copy_extent_buffer(right, l,
+                    btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+                    data_copy_size, btrfs_leaf_data(l) +
+                    leaf_data_end(root, l), data_copy_size);
+
+       rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+                     btrfs_item_end_nr(l, mid);
+
+       for (i = 0; i < nritems; i++) {
+               struct btrfs_item *item = btrfs_item_nr(right, i);
+               u32 ioff;
+
+               if (!right->map_token) {
+                       map_extent_buffer(right, (unsigned long)item,
+                                       sizeof(struct btrfs_item),
+                                       &right->map_token, &right->kaddr,
+                                       &right->map_start, &right->map_len,
+                                       KM_USER1);
+               }
+
+               ioff = btrfs_item_offset(right, item);
+               btrfs_set_item_offset(right, item, ioff + rt_data_off);
+       }
+
+       if (right->map_token) {
+               unmap_extent_buffer(right, right->map_token, KM_USER1);
+               right->map_token = NULL;
+       }
+
+       btrfs_set_header_nritems(l, mid);
+       ret = 0;
+       btrfs_item_key(right, &disk_key, 0);
+       wret = insert_ptr(trans, root, path, &disk_key, right->start,
+                         path->slots[1] + 1, 1);
+       if (wret)
+               ret = wret;
+
+       btrfs_mark_buffer_dirty(right);
+       btrfs_mark_buffer_dirty(l);
+       BUG_ON(path->slots[0] != slot);
+
+       ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+       BUG_ON(ret);
+
+       if (mid <= slot) {
+               btrfs_tree_unlock(path->nodes[0]);
+               free_extent_buffer(path->nodes[0]);
+               path->nodes[0] = right;
+               path->slots[0] -= mid;
+               path->slots[1] += 1;
+       } else {
+               btrfs_tree_unlock(right);
+               free_extent_buffer(right);
+       }
+
+       BUG_ON(path->slots[0] < 0);
+
+       return ret;
+}
+
 /*
  * split the path's leaf in two, making sure there is at least data_size
  * available for the resulting leaf level of the path.
@@ -2771,17 +2846,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        int mid;
        int slot;
        struct extent_buffer *right;
-       int data_copy_size;
-       int rt_data_off;
-       int i;
        int ret = 0;
        int wret;
        int double_split;
        int num_doubles = 0;
-       struct btrfs_disk_key disk_key;
 
        /* first try to make some room by pushing left and right */
-       if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+       if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
+           !trans->transaction->delayed_refs.flushing) {
                wret = push_leaf_right(trans, root, path, data_size, 0);
                if (wret < 0)
                        return wret;
@@ -2830,11 +2902,14 @@ again:
        write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
                            (unsigned long)btrfs_header_chunk_tree_uuid(right),
                            BTRFS_UUID_SIZE);
+
        if (mid <= slot) {
                if (nritems == 1 ||
                    leaf_space_used(l, mid, nritems - mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(root)) {
                        if (slot >= nritems) {
+                               struct btrfs_disk_key disk_key;
+
                                btrfs_cpu_key_to_disk(&disk_key, ins_key);
                                btrfs_set_header_nritems(right, 0);
                                wret = insert_ptr(trans, root, path,
@@ -2862,6 +2937,8 @@ again:
                if (leaf_space_used(l, 0, mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(root)) {
                        if (!extend && data_size && slot == 0) {
+                               struct btrfs_disk_key disk_key;
+
                                btrfs_cpu_key_to_disk(&disk_key, ins_key);
                                btrfs_set_header_nritems(right, 0);
                                wret = insert_ptr(trans, root, path,
@@ -2894,76 +2971,16 @@ again:
                        }
                }
        }
-       nritems = nritems - mid;
-       btrfs_set_header_nritems(right, nritems);
-       data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
-
-       copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
-                          btrfs_item_nr_offset(mid),
-                          nritems * sizeof(struct btrfs_item));
-
-       copy_extent_buffer(right, l,
-                    btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
-                    data_copy_size, btrfs_leaf_data(l) +
-                    leaf_data_end(root, l), data_copy_size);
-
-       rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
-                     btrfs_item_end_nr(l, mid);
-
-       for (i = 0; i < nritems; i++) {
-               struct btrfs_item *item = btrfs_item_nr(right, i);
-               u32 ioff;
-
-               if (!right->map_token) {
-                       map_extent_buffer(right, (unsigned long)item,
-                                       sizeof(struct btrfs_item),
-                                       &right->map_token, &right->kaddr,
-                                       &right->map_start, &right->map_len,
-                                       KM_USER1);
-               }
-
-               ioff = btrfs_item_offset(right, item);
-               btrfs_set_item_offset(right, item, ioff + rt_data_off);
-       }
-
-       if (right->map_token) {
-               unmap_extent_buffer(right, right->map_token, KM_USER1);
-               right->map_token = NULL;
-       }
-
-       btrfs_set_header_nritems(l, mid);
-       ret = 0;
-       btrfs_item_key(right, &disk_key, 0);
-       wret = insert_ptr(trans, root, path, &disk_key, right->start,
-                         path->slots[1] + 1, 1);
-       if (wret)
-               ret = wret;
-
-       btrfs_mark_buffer_dirty(right);
-       btrfs_mark_buffer_dirty(l);
-       BUG_ON(path->slots[0] != slot);
 
-       ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+       ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
        BUG_ON(ret);
 
-       if (mid <= slot) {
-               btrfs_tree_unlock(path->nodes[0]);
-               free_extent_buffer(path->nodes[0]);
-               path->nodes[0] = right;
-               path->slots[0] -= mid;
-               path->slots[1] += 1;
-       } else {
-               btrfs_tree_unlock(right);
-               free_extent_buffer(right);
-       }
-
-       BUG_ON(path->slots[0] < 0);
-
        if (double_split) {
                BUG_ON(num_doubles != 0);
                num_doubles++;
                goto again;
        }
+
        return ret;
 }
 
@@ -3021,26 +3038,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
                return -EAGAIN;
        }
 
+       btrfs_set_path_blocking(path);
        ret = split_leaf(trans, root, &orig_key, path,
                         sizeof(struct btrfs_item), 1);
        path->keep_locks = 0;
        BUG_ON(ret);
 
+       btrfs_unlock_up_safe(path, 1);
+       leaf = path->nodes[0];
+       BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+split:
        /*
         * make sure any changes to the path from split_leaf leave it
         * in a blocking state
         */
        btrfs_set_path_blocking(path);
 
-       leaf = path->nodes[0];
-       BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-
-split:
        item = btrfs_item_nr(leaf, path->slots[0]);
        orig_offset = btrfs_item_offset(leaf, item);
        item_size = btrfs_item_size(leaf, item);
 
-
        buf = kmalloc(item_size, GFP_NOFS);
        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
                            path->slots[0]), item_size);
@@ -3445,39 +3463,27 @@ out:
 }
 
 /*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
  */
-int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root,
-                           struct btrfs_path *path,
-                           struct btrfs_key *cpu_key, u32 *data_size,
-                           int nr)
+static noinline_for_stack int
+setup_items_for_insert(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root, struct btrfs_path *path,
+                     struct btrfs_key *cpu_key, u32 *data_size,
+                     u32 total_data, u32 total_size, int nr)
 {
-       struct extent_buffer *leaf;
        struct btrfs_item *item;
-       int ret = 0;
-       int slot;
-       int slot_orig;
        int i;
        u32 nritems;
-       u32 total_size = 0;
-       u32 total_data = 0;
        unsigned int data_end;
        struct btrfs_disk_key disk_key;
+       int ret;
+       struct extent_buffer *leaf;
+       int slot;
 
-       for (i = 0; i < nr; i++)
-               total_data += data_size[i];
-
-       total_size = total_data + (nr * sizeof(struct btrfs_item));
-       ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-       if (ret == 0)
-               return -EEXIST;
-       if (ret < 0)
-               goto out;
-
-       slot_orig = path->slots[0];
        leaf = path->nodes[0];
+       slot = path->slots[0];
 
        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3495,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                BUG();
        }
 
-       slot = path->slots[0];
-       BUG_ON(slot < 0);
-
        if (slot != nritems) {
                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
 
@@ -3547,21 +3550,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                data_end -= data_size[i];
                btrfs_set_item_size(leaf, item, data_size[i]);
        }
+
        btrfs_set_header_nritems(leaf, nritems + nr);
-       btrfs_mark_buffer_dirty(leaf);
 
        ret = 0;
        if (slot == 0) {
+               struct btrfs_disk_key disk_key;
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
        }
+       btrfs_unlock_up_safe(path, 1);
+       btrfs_mark_buffer_dirty(leaf);
 
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
+       return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           struct btrfs_path *path,
+                           struct btrfs_key *cpu_key, u32 *data_size,
+                           int nr)
+{
+       struct extent_buffer *leaf;
+       int ret = 0;
+       int slot;
+       int i;
+       u32 total_size = 0;
+       u32 total_data = 0;
+
+       for (i = 0; i < nr; i++)
+               total_data += data_size[i];
+
+       total_size = total_data + (nr * sizeof(struct btrfs_item));
+       ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+       if (ret == 0)
+               return -EEXIST;
+       if (ret < 0)
+               goto out;
+
+       leaf = path->nodes[0];
+       slot = path->slots[0];
+       BUG_ON(slot < 0);
+
+       ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+                              total_data, total_size, nr);
+
 out:
-       btrfs_unlock_up_safe(path, 1);
        return ret;
 }
 
@@ -3749,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                }
 
                /* delete the leaf if it is mostly empty */
-               if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+               if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
+                   !trans->transaction->delayed_refs.flushing) {
                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to del_ptr below
@@ -3757,6 +3800,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                        slot = path->slots[1];
                        extent_buffer_get(leaf);
 
+                       btrfs_set_path_blocking(path);
                        wret = push_leaf_left(trans, root, path, 1, 1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;
index 7dd1b6d0bf323cf7e006fd309e76fc61bccfa58f..9417713542a2a5867490d0308241f610832400ee 100644 (file)
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAX_LEVEL 8
 
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list.  That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 
@@ -401,15 +408,16 @@ struct btrfs_path {
        int locks[BTRFS_MAX_LEVEL];
        int reada;
        /* keep some upper locks as we walk down */
-       int keep_locks;
-       int skip_locking;
        int lowest_level;
 
        /*
         * set by btrfs_split_item, tells search_slot to keep all locks
         * and to force calls to keep space in the nodes
         */
-       int search_for_split;
+       unsigned int search_for_split:1;
+       unsigned int keep_locks:1;
+       unsigned int skip_locking:1;
+       unsigned int leave_spinning:1;
 };
 
 /*
@@ -688,15 +696,18 @@ struct btrfs_fs_info {
        struct rb_root block_group_cache_tree;
 
        struct extent_io_tree pinned_extents;
-       struct extent_io_tree pending_del;
-       struct extent_io_tree extent_ins;
 
        /* logical->physical extent mapping */
        struct btrfs_mapping_tree mapping_tree;
 
        u64 generation;
        u64 last_trans_committed;
-       u64 last_trans_new_blockgroup;
+
+       /*
+        * this is updated to the current trans every time a full commit
+        * is required instead of the faster short fsync log commits
+        */
+       u64 last_trans_log_full_commit;
        u64 open_ioctl_trans;
        unsigned long mount_opt;
        u64 max_extent;
@@ -717,12 +728,21 @@ struct btrfs_fs_info {
        struct mutex tree_log_mutex;
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
-       struct mutex extent_ins_mutex;
        struct mutex pinned_mutex;
        struct mutex chunk_mutex;
        struct mutex drop_mutex;
        struct mutex volume_mutex;
        struct mutex tree_reloc_mutex;
+
+       /*
+        * this protects the ordered operations list only while we are
+        * processing all of the entries on it.  This way we make
+        * sure the commit code doesn't find the list temporarily empty
+        * because another function happens to be doing non-waiting preflush
+        * before jumping into the main commit.
+        */
+       struct mutex ordered_operations_mutex;
+
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
@@ -737,9 +757,28 @@ struct btrfs_fs_info {
         * ordered extents
         */
        spinlock_t ordered_extent_lock;
+
+       /*
+        * all of the data=ordered extents pending writeback
+        * these can span multiple transactions and basically include
+        * every dirty data page that isn't from nodatacow
+        */
        struct list_head ordered_extents;
+
+       /*
+        * all of the inodes that have delalloc bytes.  It is possible for
+        * this list to be empty even when there is still dirty data=ordered
+        * extents waiting to finish IO.
+        */
        struct list_head delalloc_inodes;
 
+       /*
+        * special rename and truncate targets that must be on disk before
+        * we're allowed to commit.  This is basically the ext3 style
+        * data=ordered list.
+        */
+       struct list_head ordered_operations;
+
        /*
         * there is a pool of worker threads for checksumming during writes
         * and a pool for checksumming after reads.  This is because readers
@@ -781,6 +820,11 @@ struct btrfs_fs_info {
        atomic_t throttle_gen;
 
        u64 total_pinned;
+
+       /* protected by the delalloc lock, used to keep from writing
+        * metadata until there is a nice batch
+        */
+       u64 dirty_metadata_bytes;
        struct list_head dirty_cowonly_roots;
 
        struct btrfs_fs_devices *fs_devices;
@@ -1704,18 +1748,15 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, u64 bytenr,
-                           u64 num_bytes, u32 *refs);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
                                u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-                        struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
@@ -1777,7 +1818,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         u64 root_objectid, u64 ref_generation,
                         u64 owner_objectid);
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, u64 bytenr,
+                           struct btrfs_root *root, u64 bytenr, u64 num_bytes,
                            u64 orig_parent, u64 parent,
                            u64 root_objectid, u64 ref_generation,
                            u64 owner_objectid);
@@ -1838,7 +1879,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
-                   struct extent_buffer **cow_ret, u64 prealloc_dest);
+                   struct extent_buffer **cow_ret);
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644 (file)
index 0000000..cbf7dc8
--- /dev/null
@@ -0,0 +1,669 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include <linux/ftrace.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking.  For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing.   This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
+ */
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+                     u64 bytenr, u64 parent)
+{
+       if (bytenr < ref->bytenr)
+               return -1;
+       if (bytenr > ref->bytenr)
+               return 1;
+       if (parent < ref->parent)
+               return -1;
+       if (parent > ref->parent)
+               return 1;
+       return 0;
+}
+
+/*
+ * insert a new ref into the rbtree.  This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+                                                 u64 bytenr, u64 parent,
+                                                 struct rb_node *node)
+{
+       struct rb_node **p = &root->rb_node;
+       struct rb_node *parent_node = NULL;
+       struct btrfs_delayed_ref_node *entry;
+       int cmp;
+
+       while (*p) {
+               parent_node = *p;
+               entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+                                rb_node);
+
+               cmp = comp_entry(entry, bytenr, parent);
+               if (cmp < 0)
+                       p = &(*p)->rb_left;
+               else if (cmp > 0)
+                       p = &(*p)->rb_right;
+               else
+                       return entry;
+       }
+
+       entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+       rb_link_node(node, parent_node, p);
+       rb_insert_color(node, root);
+       return NULL;
+}
+
+/*
+ * find an entry based on (bytenr,parent).  This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
+ */
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+                                 u64 bytenr, u64 parent,
+                                 struct btrfs_delayed_ref_node **last)
+{
+       struct rb_node *n = root->rb_node;
+       struct btrfs_delayed_ref_node *entry;
+       int cmp;
+
+       while (n) {
+               entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+               WARN_ON(!entry->in_tree);
+               if (last)
+                       *last = entry;
+
+               cmp = comp_entry(entry, bytenr, parent);
+               if (cmp < 0)
+                       n = n->rb_left;
+               else if (cmp > 0)
+                       n = n->rb_right;
+               else
+                       return entry;
+       }
+       return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+                          struct btrfs_delayed_ref_head *head)
+{
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       assert_spin_locked(&delayed_refs->lock);
+       if (mutex_trylock(&head->mutex))
+               return 0;
+
+       atomic_inc(&head->node.refs);
+       spin_unlock(&delayed_refs->lock);
+
+       mutex_lock(&head->mutex);
+       spin_lock(&delayed_refs->lock);
+       if (!head->node.in_tree) {
+               mutex_unlock(&head->mutex);
+               btrfs_put_delayed_ref(&head->node);
+               return -EAGAIN;
+       }
+       btrfs_put_delayed_ref(&head->node);
+       return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+                          struct list_head *cluster, u64 start)
+{
+       int count = 0;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct rb_node *node;
+       struct btrfs_delayed_ref_node *ref;
+       struct btrfs_delayed_ref_head *head;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       if (start == 0) {
+               node = rb_first(&delayed_refs->root);
+       } else {
+               ref = NULL;
+               tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+               if (ref) {
+                       struct btrfs_delayed_ref_node *tmp;
+
+                       node = rb_prev(&ref->rb_node);
+                       while (node) {
+                               tmp = rb_entry(node,
+                                              struct btrfs_delayed_ref_node,
+                                              rb_node);
+                               if (tmp->bytenr < start)
+                                       break;
+                               ref = tmp;
+                               node = rb_prev(&ref->rb_node);
+                       }
+                       node = &ref->rb_node;
+               } else
+                       node = rb_first(&delayed_refs->root);
+       }
+again:
+       while (node && count < 32) {
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+               if (btrfs_delayed_ref_is_head(ref)) {
+                       head = btrfs_delayed_node_to_head(ref);
+                       if (list_empty(&head->cluster)) {
+                               list_add_tail(&head->cluster, cluster);
+                               delayed_refs->run_delayed_start =
+                                       head->node.bytenr;
+                               count++;
+
+                               WARN_ON(delayed_refs->num_heads_ready == 0);
+                               delayed_refs->num_heads_ready--;
+                       } else if (count) {
+                               /* the goal of the clustering is to find extents
+                                * that are likely to end up in the same extent
+                                * leaf on disk.  So, we don't want them spread
+                                * all over the tree.  Stop now if we've hit
+                                * a head that was already in use
+                                */
+                               break;
+                       }
+               }
+               node = rb_next(node);
+       }
+       if (count) {
+               return 0;
+       } else if (start) {
+               /*
+                * we've gone to the end of the rbtree without finding any
+                * clusters.  start from the beginning and try again
+                */
+               start = 0;
+               node = rb_first(&delayed_refs->root);
+               goto again;
+       }
+       return 1;
+}
+
+/*
+ * This checks to see if there are any delayed refs in the
+ * btree for a given bytenr.  It returns one if it finds any
+ * and zero otherwise.
+ *
+ * If it only finds a head node, it returns 0.
+ *
+ * The idea is to use this when deciding if you can safely delete an
+ * extent from the extent allocation tree.  There may be a pending
+ * ref in the rbtree that adds or removes references, so as long as this
+ * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
+ * allocation tree.
+ */
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+       struct btrfs_delayed_ref_node *ref;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct rb_node *prev_node;
+       int ret = 0;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+       if (ref) {
+               prev_node = rb_prev(&ref->rb_node);
+               if (!prev_node)
+                       goto out;
+               ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+                              rb_node);
+               if (ref->bytenr == bytenr)
+                       ret = 1;
+       }
+out:
+       spin_unlock(&delayed_refs->lock);
+       return ret;
+}
+
+/*
+ * helper function to lookup reference count
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree.  This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
+ */
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, u64 bytenr,
+                           u64 num_bytes, u32 *refs)
+{
+       struct btrfs_delayed_ref_node *ref;
+       struct btrfs_delayed_ref_head *head;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_extent_item *ei;
+       struct btrfs_key key;
+       u32 num_refs;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = bytenr;
+       key.type = BTRFS_EXTENT_ITEM_KEY;
+       key.offset = num_bytes;
+       delayed_refs = &trans->transaction->delayed_refs;
+again:
+       ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+                               &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+
+       if (ret == 0) {
+               leaf = path->nodes[0];
+               ei = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_extent_item);
+               num_refs = btrfs_extent_refs(leaf, ei);
+       } else {
+               num_refs = 0;
+               ret = 0;
+       }
+
+       spin_lock(&delayed_refs->lock);
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+       if (ref) {
+               head = btrfs_delayed_node_to_head(ref);
+               if (mutex_trylock(&head->mutex)) {
+                       num_refs += ref->ref_mod;
+                       mutex_unlock(&head->mutex);
+                       *refs = num_refs;
+                       goto out;
+               }
+
+               atomic_inc(&ref->refs);
+               spin_unlock(&delayed_refs->lock);
+
+               btrfs_release_path(root->fs_info->extent_root, path);
+
+               mutex_lock(&head->mutex);
+               mutex_unlock(&head->mutex);
+               btrfs_put_delayed_ref(ref);
+               goto again;
+       } else {
+               *refs = num_refs;
+       }
+out:
+       spin_unlock(&delayed_refs->lock);
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree.  existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+                   struct btrfs_delayed_ref_root *delayed_refs,
+                   struct btrfs_delayed_ref_node *existing,
+                   struct btrfs_delayed_ref_node *update)
+{
+       struct btrfs_delayed_ref *existing_ref;
+       struct btrfs_delayed_ref *ref;
+
+       existing_ref = btrfs_delayed_node_to_ref(existing);
+       ref = btrfs_delayed_node_to_ref(update);
+
+       if (ref->pin)
+               existing_ref->pin = 1;
+
+       if (ref->action != existing_ref->action) {
+               /*
+                * this is effectively undoing either an add or a
+                * drop.  We decrement the ref_mod, and if it goes
+                * down to zero we just delete the entry without
+                * every changing the extent allocation tree.
+                */
+               existing->ref_mod--;
+               if (existing->ref_mod == 0) {
+                       rb_erase(&existing->rb_node,
+                                &delayed_refs->root);
+                       existing->in_tree = 0;
+                       btrfs_put_delayed_ref(existing);
+                       delayed_refs->num_entries--;
+                       if (trans->delayed_ref_updates)
+                               trans->delayed_ref_updates--;
+               }
+       } else {
+               if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
+                       /* if we're adding refs, make sure all the
+                        * details match up.  The extent could
+                        * have been totally freed and reallocated
+                        * by a different owner before the delayed
+                        * ref entries were removed.
+                        */
+                       existing_ref->owner_objectid = ref->owner_objectid;
+                       existing_ref->generation = ref->generation;
+                       existing_ref->root = ref->root;
+                       existing->num_bytes = update->num_bytes;
+               }
+               /*
+                * the action on the existing ref matches
+                * the action on the ref we're trying to add.
+                * Bump the ref_mod by one so the backref that
+                * is eventually added/removed has the correct
+                * reference count
+                */
+               existing->ref_mod += update->ref_mod;
+       }
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+                        struct btrfs_delayed_ref_node *update)
+{
+       struct btrfs_delayed_ref_head *existing_ref;
+       struct btrfs_delayed_ref_head *ref;
+
+       existing_ref = btrfs_delayed_node_to_head(existing);
+       ref = btrfs_delayed_node_to_head(update);
+
+       if (ref->must_insert_reserved) {
+               /* if the extent was freed and then
+                * reallocated before the delayed ref
+                * entries were processed, we can end up
+                * with an existing head ref without
+                * the must_insert_reserved flag set.
+                * Set it again here
+                */
+               existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+               /*
+                * update the num_bytes so we make sure the accounting
+                * is done correctly
+                */
+               existing->num_bytes = update->num_bytes;
+
+       }
+
+       /*
+        * update the reference mod on the head to reflect this new operation
+        */
+       existing->ref_mod += update->ref_mod;
+}
+
+/*
+ * helper function to actually insert a delayed ref into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count in the head node and properly dealing
+ * with updating existing nodes as new modifications are queued.
+ */
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+                         struct btrfs_delayed_ref_node *ref,
+                         u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+                         u64 ref_generation, u64 owner_objectid, int action,
+                         int pin)
+{
+       struct btrfs_delayed_ref_node *existing;
+       struct btrfs_delayed_ref *full_ref;
+       struct btrfs_delayed_ref_head *head_ref = NULL;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       int count_mod = 1;
+       int must_insert_reserved = 0;
+
+       /*
+        * the head node stores the sum of all the mods, so dropping a ref
+        * should drop the sum in the head node by one.
+        */
+       if (parent == (u64)-1) {
+               if (action == BTRFS_DROP_DELAYED_REF)
+                       count_mod = -1;
+               else if (action == BTRFS_UPDATE_DELAYED_HEAD)
+                       count_mod = 0;
+       }
+
+       /*
+        * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+        * the reserved accounting when the extent is finally added, or
+        * if a later modification deletes the delayed ref without ever
+        * inserting the extent into the extent allocation tree.
+        * ref->must_insert_reserved is the flag used to record
+        * that accounting mods are required.
+        *
+        * Once we record must_insert_reserved, switch the action to
+        * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+        */
+       if (action == BTRFS_ADD_DELAYED_EXTENT) {
+               must_insert_reserved = 1;
+               action = BTRFS_ADD_DELAYED_REF;
+       } else {
+               must_insert_reserved = 0;
+       }
+
+
+       delayed_refs = &trans->transaction->delayed_refs;
+
+       /* first set the basic ref node struct up */
+       atomic_set(&ref->refs, 1);
+       ref->bytenr = bytenr;
+       ref->parent = parent;
+       ref->ref_mod = count_mod;
+       ref->in_tree = 1;
+       ref->num_bytes = num_bytes;
+
+       if (btrfs_delayed_ref_is_head(ref)) {
+               head_ref = btrfs_delayed_node_to_head(ref);
+               head_ref->must_insert_reserved = must_insert_reserved;
+               INIT_LIST_HEAD(&head_ref->cluster);
+               mutex_init(&head_ref->mutex);
+       } else {
+               full_ref = btrfs_delayed_node_to_ref(ref);
+               full_ref->root = ref_root;
+               full_ref->generation = ref_generation;
+               full_ref->owner_objectid = owner_objectid;
+               full_ref->pin = pin;
+               full_ref->action = action;
+       }
+
+       existing = tree_insert(&delayed_refs->root, bytenr,
+                              parent, &ref->rb_node);
+
+       if (existing) {
+               if (btrfs_delayed_ref_is_head(ref))
+                       update_existing_head_ref(existing, ref);
+               else
+                       update_existing_ref(trans, delayed_refs, existing, ref);
+
+               /*
+                * we've updated the existing ref, free the newly
+                * allocated ref
+                */
+               kfree(ref);
+       } else {
+               if (btrfs_delayed_ref_is_head(ref)) {
+                       delayed_refs->num_heads++;
+                       delayed_refs->num_heads_ready++;
+               }
+               delayed_refs->num_entries++;
+               trans->delayed_ref_updates++;
+       }
+       return 0;
+}
+
+/*
+ * add a delayed ref to the tree.  This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+                         u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+                         u64 ref_generation, u64 owner_objectid, int action,
+                         int pin)
+{
+       struct btrfs_delayed_ref *ref;
+       struct btrfs_delayed_ref_head *head_ref;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       int ret;
+
+       ref = kmalloc(sizeof(*ref), GFP_NOFS);
+       if (!ref)
+               return -ENOMEM;
+
+       /*
+        * the parent = 0 case comes from cases where we don't actually
+        * know the parent yet.  It will get updated later via a add/drop
+        * pair.
+        */
+       if (parent == 0)
+               parent = bytenr;
+
+       head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+       if (!head_ref) {
+               kfree(ref);
+               return -ENOMEM;
+       }
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+
+       /*
+        * insert both the head node and the new ref without dropping
+        * the spin lock
+        */
+       ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+                                     (u64)-1, 0, 0, 0, action, pin);
+       BUG_ON(ret);
+
+       ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+                                     parent, ref_root, ref_generation,
+                                     owner_objectid, action, pin);
+       BUG_ON(ret);
+       spin_unlock(&delayed_refs->lock);
+       return 0;
+}
+
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+       struct btrfs_delayed_ref_node *ref;
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+       if (ref)
+               return btrfs_delayed_node_to_head(ref);
+       return NULL;
+}
+
+/*
+ * add a delayed ref to the tree.  This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ *
+ * The main point of this call is to add and remove a backreference in a single
+ * shot, taking the lock only once, and only searching for the head node once.
+ *
+ * It is the same as doing a ref add and delete in two separate calls.
+ */
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+                         u64 bytenr, u64 num_bytes, u64 orig_parent,
+                         u64 parent, u64 orig_ref_root, u64 ref_root,
+                         u64 orig_ref_generation, u64 ref_generation,
+                         u64 owner_objectid, int pin)
+{
+       struct btrfs_delayed_ref *ref;
+       struct btrfs_delayed_ref *old_ref;
+       struct btrfs_delayed_ref_head *head_ref;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       int ret;
+
+       ref = kmalloc(sizeof(*ref), GFP_NOFS);
+       if (!ref)
+               return -ENOMEM;
+
+       old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
+       if (!old_ref) {
+               kfree(ref);
+               return -ENOMEM;
+       }
+
+       /*
+        * the parent = 0 case comes from cases where we don't actually
+        * know the parent yet.  It will get updated later via a add/drop
+        * pair.
+        */
+       if (parent == 0)
+               parent = bytenr;
+       if (orig_parent == 0)
+               orig_parent = bytenr;
+
+       head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+       if (!head_ref) {
+               kfree(ref);
+               kfree(old_ref);
+               return -ENOMEM;
+       }
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+
+       /*
+        * insert both the head node and the new ref without dropping
+        * the spin lock
+        */
+       ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+                                     (u64)-1, 0, 0, 0,
+                                     BTRFS_UPDATE_DELAYED_HEAD, 0);
+       BUG_ON(ret);
+
+       ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+                                     parent, ref_root, ref_generation,
+                                     owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
+       BUG_ON(ret);
+
+       ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
+                                     orig_parent, orig_ref_root,
+                                     orig_ref_generation, owner_objectid,
+                                     BTRFS_DROP_DELAYED_REF, pin);
+       BUG_ON(ret);
+       spin_unlock(&delayed_refs->lock);
+       return 0;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644 (file)
index 0000000..3bec2ff
--- /dev/null
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref->action */
+#define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+       struct rb_node rb_node;
+
+       /* the starting bytenr of the extent */
+       u64 bytenr;
+
+       /* the parent our backref will point to */
+       u64 parent;
+
+       /* the size of the extent */
+       u64 num_bytes;
+
+       /* ref count on this data structure */
+       atomic_t refs;
+
+       /*
+        * how many refs is this entry adding or deleting.  For
+        * head refs, this may be a negative number because it is keeping
+        * track of the total mods done to the reference count.
+        * For individual refs, this will always be a positive number
+        *
+        * It may be more than one, since it is possible for a single
+        * parent to have more than one ref on an extent
+        */
+       int ref_mod;
+
+       /* is this node still in the rbtree? */
+       unsigned int in_tree:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent.  They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+       struct btrfs_delayed_ref_node node;
+
+       /*
+        * the mutex is held while running the refs, and it is also
+        * held when checking the sum of reference modifications.
+        */
+       struct mutex mutex;
+
+       struct list_head cluster;
+
+       /*
+        * when a new extent is allocated, it is just reserved in memory
+        * The actual extent isn't inserted into the extent allocation tree
+        * until the delayed ref is processed.  must_insert_reserved is
+        * used to flag a delayed ref so the accounting can be updated
+        * when a full insert is done.
+        *
+        * It is possible the extent will be freed before it is ever
+        * inserted into the extent allocation tree.  In this case
+        * we need to update the in ram accounting to properly reflect
+        * the free has happened.
+        */
+       unsigned int must_insert_reserved:1;
+};
+
+struct btrfs_delayed_ref {
+       struct btrfs_delayed_ref_node node;
+
+       /* the root objectid our ref will point to */
+       u64 root;
+
+       /* the generation for the backref */
+       u64 generation;
+
+       /* owner_objectid of the backref  */
+       u64 owner_objectid;
+
+       /* operation done by this entry in the rbtree */
+       u8 action;
+
+       /* if pin == 1, when the extent is freed it will be pinned until
+        * transaction commit
+        */
+       unsigned int pin:1;
+};
+
+struct btrfs_delayed_ref_root {
+       struct rb_root root;
+
+       /* this spin lock protects the rbtree and the entries inside */
+       spinlock_t lock;
+
+       /* how many delayed ref updates we've queued, used by the
+        * throttling code
+        */
+       unsigned long num_entries;
+
+       /* total number of head nodes in tree */
+       unsigned long num_heads;
+
+       /* total number of head nodes ready for processing */
+       unsigned long num_heads_ready;
+
+       /*
+        * set when the tree is flushing before a transaction commit,
+        * used by the throttling code to decide if new updates need
+        * to be run right away
+        */
+       int flushing;
+
+       u64 run_delayed_start;
+};
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+       WARN_ON(atomic_read(&ref->refs) == 0);
+       if (atomic_dec_and_test(&ref->refs)) {
+               WARN_ON(ref->in_tree);
+               kfree(ref);
+       }
+}
+
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+                         u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+                         u64 ref_generation, u64 owner_objectid, int action,
+                         int pin);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, u64 bytenr,
+                           u64 num_bytes, u32 *refs);
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+                         u64 bytenr, u64 num_bytes, u64 orig_parent,
+                         u64 parent, u64 orig_ref_root, u64 ref_root,
+                         u64 orig_ref_generation, u64 ref_generation,
+                         u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+                          struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+                          struct list_head *cluster, u64 search_start);
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+       return node->parent == (u64)-1;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+{
+       WARN_ON(btrfs_delayed_ref_is_head(node));
+       return container_of(node, struct btrfs_delayed_ref, node);
+
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+       WARN_ON(!btrfs_delayed_ref_is_head(node));
+       return container_of(node, struct btrfs_delayed_ref_head, node);
+
+}
+#endif
index 926a0b287a7d752994fc5a11894f21e91d5561f8..1d70236ba00c7bf4abf5181d2efd3b20d5867e2e 100644 (file)
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
        key.objectid = dir;
        btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
        key.offset = btrfs_name_hash(name, name_len);
+
        path = btrfs_alloc_path();
+       path->leave_spinning = 1;
+
        data_size = sizeof(*dir_item) + name_len;
        dir_item = insert_with_overflow(trans, root, path, &key, data_size,
                                        name, name_len);
index 6ec80c0fc8697a0a9171e72a8dd4e63905c7993e..92d73929d3814b50024196eb5e8688d2ee330222 100644 (file)
@@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
+       struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+       struct extent_buffer *eb;
+       int was_dirty;
+
        tree = &BTRFS_I(page->mapping->host)->io_tree;
+       if (!(current->flags & PF_MEMALLOC)) {
+               return extent_write_full_page(tree, page,
+                                             btree_get_extent, wbc);
+       }
 
-       if (current->flags & PF_MEMALLOC) {
-               redirty_page_for_writepage(wbc, page);
-               unlock_page(page);
-               return 0;
+       redirty_page_for_writepage(wbc, page);
+       eb = btrfs_find_tree_block(root, page_offset(page),
+                                     PAGE_CACHE_SIZE);
+       WARN_ON(!eb);
+
+       was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+       if (!was_dirty) {
+               spin_lock(&root->fs_info->delalloc_lock);
+               root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+               spin_unlock(&root->fs_info->delalloc_lock);
        }
-       return extent_write_full_page(tree, page, btree_get_extent, wbc);
+       free_extent_buffer(eb);
+
+       unlock_page(page);
+       return 0;
 }
 
 static int btree_writepages(struct address_space *mapping,
@@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping,
        struct extent_io_tree *tree;
        tree = &BTRFS_I(mapping->host)->io_tree;
        if (wbc->sync_mode == WB_SYNC_NONE) {
+               struct btrfs_root *root = BTRFS_I(mapping->host)->root;
                u64 num_dirty;
-               u64 start = 0;
                unsigned long thresh = 32 * 1024 * 1024;
 
                if (wbc->for_kupdate)
                        return 0;
 
-               num_dirty = count_range_bits(tree, &start, (u64)-1,
-                                            thresh, EXTENT_DIRTY);
+               /* this is a bit racy, but that's ok */
+               num_dirty = root->fs_info->dirty_metadata_bytes;
                if (num_dirty < thresh)
                        return 0;
        }
@@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
            root->fs_info->running_transaction->transid) {
                btrfs_assert_tree_locked(buf);
 
-               /* ugh, clear_extent_buffer_dirty can be expensive */
-               btrfs_set_lock_blocking(buf);
+               if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+                       spin_lock(&root->fs_info->delalloc_lock);
+                       if (root->fs_info->dirty_metadata_bytes >= buf->len)
+                               root->fs_info->dirty_metadata_bytes -= buf->len;
+                       else
+                               WARN_ON(1);
+                       spin_unlock(&root->fs_info->delalloc_lock);
+               }
 
+               /* ugh, clear_extent_buffer_dirty needs to lock the page */
+               btrfs_set_lock_blocking(buf);
                clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
                                          buf);
        }
@@ -1471,12 +1496,6 @@ static int transaction_kthread(void *arg)
                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-               if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-                       printk(KERN_INFO "btrfs: total reference cache "
-                              "size %llu\n",
-                              root->fs_info->total_ref_cache_size);
-               }
-
                mutex_lock(&root->fs_info->trans_mutex);
                cur = root->fs_info->running_transaction;
                if (!cur) {
@@ -1493,6 +1512,7 @@ static int transaction_kthread(void *arg)
                mutex_unlock(&root->fs_info->trans_mutex);
                trans = btrfs_start_transaction(root, 1);
                ret = btrfs_commit_transaction(trans, root);
+
 sleep:
                wake_up_process(root->fs_info->cleaner_kthread);
                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->hashers);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+       INIT_LIST_HEAD(&fs_info->ordered_operations);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->new_trans_lock);
        spin_lock_init(&fs_info->ref_cache_lock);
@@ -1611,10 +1632,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
        extent_io_tree_init(&fs_info->pinned_extents,
                             fs_info->btree_inode->i_mapping, GFP_NOFS);
-       extent_io_tree_init(&fs_info->pending_del,
-                            fs_info->btree_inode->i_mapping, GFP_NOFS);
-       extent_io_tree_init(&fs_info->extent_ins,
-                            fs_info->btree_inode->i_mapping, GFP_NOFS);
        fs_info->do_barriers = 1;
 
        INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,9 +1644,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        insert_inode_hash(fs_info->btree_inode);
 
        mutex_init(&fs_info->trans_mutex);
+       mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->drop_mutex);
-       mutex_init(&fs_info->extent_ins_mutex);
        mutex_init(&fs_info->pinned_mutex);
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2358,8 +2375,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
        u64 transid = btrfs_header_generation(buf);
        struct inode *btree_inode = root->fs_info->btree_inode;
-
-       btrfs_set_lock_blocking(buf);
+       int was_dirty;
 
        btrfs_assert_tree_locked(buf);
        if (transid != root->fs_info->generation) {
@@ -2370,7 +2386,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
                        (unsigned long long)root->fs_info->generation);
                WARN_ON(1);
        }
-       set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+       was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+                                           buf);
+       if (!was_dirty) {
+               spin_lock(&root->fs_info->delalloc_lock);
+               root->fs_info->dirty_metadata_bytes += buf->len;
+               spin_unlock(&root->fs_info->delalloc_lock);
+       }
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2432,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 int btree_lock_page_hook(struct page *page)
 {
        struct inode *inode = page->mapping->host;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_buffer *eb;
        unsigned long len;
@@ -2425,6 +2448,16 @@ int btree_lock_page_hook(struct page *page)
 
        btrfs_tree_lock(eb);
        btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+               spin_lock(&root->fs_info->delalloc_lock);
+               if (root->fs_info->dirty_metadata_bytes >= eb->len)
+                       root->fs_info->dirty_metadata_bytes -= eb->len;
+               else
+                       WARN_ON(1);
+               spin_unlock(&root->fs_info->delalloc_lock);
+       }
+
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);
 out:
index 95029db227be5767d17eeab2a11cb36b63709849..c958ecbc19168ef0e7ef8e7bdd123f5f527b55f7 100644 (file)
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
index fefe83ad20595eba078f6480781eefb0fe81c659..f5e7cae63d80f8828f711833b58600a8bce71df9 100644 (file)
@@ -49,17 +49,23 @@ struct pending_extent_op {
        int del;
 };
 
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *extent_root, int all);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root,
-                         u64 bytenr, u64 num_bytes, int is_data);
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root, u64 parent,
+                                        u64 root_objectid, u64 ref_generation,
+                                        u64 owner, struct btrfs_key *ins,
+                                        int ref_mod);
+static int update_reserved_extents(struct btrfs_root *root,
+                                  u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc,
                              int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       u64 bytenr, u64 num_bytes, u64 parent,
+                                       u64 root_objectid, u64 ref_generation,
+                                       u64 owner_objectid, int pin,
+                                       int ref_to_drop);
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -554,262 +560,13 @@ out:
        return ret;
 }
 
-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
-                                   struct btrfs_root *extent_root,
-                                   struct btrfs_path *path,
-                                   struct list_head *update_list)
-{
-       struct btrfs_key key;
-       struct btrfs_extent_ref *ref;
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct pending_extent_op *op;
-       struct extent_buffer *leaf;
-       int ret = 0;
-       struct list_head *cur = update_list->next;
-       u64 ref_objectid;
-       u64 ref_root = extent_root->root_key.objectid;
-
-       op = list_entry(cur, struct pending_extent_op, list);
-
-search:
-       key.objectid = op->bytenr;
-       key.type = BTRFS_EXTENT_REF_KEY;
-       key.offset = op->orig_parent;
-
-       ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
-       BUG_ON(ret);
-
-       leaf = path->nodes[0];
-
-loop:
-       ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-
-       ref_objectid = btrfs_ref_objectid(leaf, ref);
-
-       if (btrfs_ref_root(leaf, ref) != ref_root ||
-           btrfs_ref_generation(leaf, ref) != op->orig_generation ||
-           (ref_objectid != op->level &&
-            ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-               printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
-                      "root %llu, owner %u\n",
-                      (unsigned long long)op->bytenr,
-                      (unsigned long long)op->orig_parent,
-                      (unsigned long long)ref_root, op->level);
-               btrfs_print_leaf(extent_root, leaf);
-               BUG();
-       }
-
-       key.objectid = op->bytenr;
-       key.offset = op->parent;
-       key.type = BTRFS_EXTENT_REF_KEY;
-       ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
-       BUG_ON(ret);
-       ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-       btrfs_set_ref_generation(leaf, ref, op->generation);
-
-       cur = cur->next;
-
-       list_del_init(&op->list);
-       unlock_extent(&info->extent_ins, op->bytenr,
-                     op->bytenr + op->num_bytes - 1, GFP_NOFS);
-       kfree(op);
-
-       if (cur == update_list) {
-               btrfs_mark_buffer_dirty(path->nodes[0]);
-               btrfs_release_path(extent_root, path);
-               goto out;
-       }
-
-       op = list_entry(cur, struct pending_extent_op, list);
-
-       path->slots[0]++;
-       while (path->slots[0] < btrfs_header_nritems(leaf)) {
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               if (key.objectid == op->bytenr &&
-                   key.type == BTRFS_EXTENT_REF_KEY)
-                       goto loop;
-               path->slots[0]++;
-       }
-
-       btrfs_mark_buffer_dirty(path->nodes[0]);
-       btrfs_release_path(extent_root, path);
-       goto search;
-
-out:
-       return 0;
-}
-
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
-                                  struct btrfs_root *extent_root,
-                                  struct btrfs_path *path,
-                                  struct list_head *insert_list, int nr)
-{
-       struct btrfs_key *keys;
-       u32 *data_size;
-       struct pending_extent_op *op;
-       struct extent_buffer *leaf;
-       struct list_head *cur = insert_list->next;
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       u64 ref_root = extent_root->root_key.objectid;
-       int i = 0, last = 0, ret;
-       int total = nr * 2;
-
-       if (!nr)
-               return 0;
-
-       keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
-       if (!keys)
-               return -ENOMEM;
-
-       data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
-       if (!data_size) {
-               kfree(keys);
-               return -ENOMEM;
-       }
-
-       list_for_each_entry(op, insert_list, list) {
-               keys[i].objectid = op->bytenr;
-               keys[i].offset = op->num_bytes;
-               keys[i].type = BTRFS_EXTENT_ITEM_KEY;
-               data_size[i] = sizeof(struct btrfs_extent_item);
-               i++;
-
-               keys[i].objectid = op->bytenr;
-               keys[i].offset = op->parent;
-               keys[i].type = BTRFS_EXTENT_REF_KEY;
-               data_size[i] = sizeof(struct btrfs_extent_ref);
-               i++;
-       }
-
-       op = list_entry(cur, struct pending_extent_op, list);
-       i = 0;
-       while (i < total) {
-               int c;
-               ret = btrfs_insert_some_items(trans, extent_root, path,
-                                             keys+i, data_size+i, total-i);
-               BUG_ON(ret < 0);
-
-               if (last && ret > 1)
-                       BUG();
-
-               leaf = path->nodes[0];
-               for (c = 0; c < ret; c++) {
-                       int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-
-                       /*
-                        * if the first item we inserted was a backref, then
-                        * the EXTENT_ITEM will be the odd c's, else it will
-                        * be the even c's
-                        */
-                       if ((ref_first && (c % 2)) ||
-                           (!ref_first && !(c % 2))) {
-                               struct btrfs_extent_item *itm;
-
-                               itm = btrfs_item_ptr(leaf, path->slots[0] + c,
-                                                    struct btrfs_extent_item);
-                               btrfs_set_extent_refs(path->nodes[0], itm, 1);
-                               op->del++;
-                       } else {
-                               struct btrfs_extent_ref *ref;
-
-                               ref = btrfs_item_ptr(leaf, path->slots[0] + c,
-                                                    struct btrfs_extent_ref);
-                               btrfs_set_ref_root(leaf, ref, ref_root);
-                               btrfs_set_ref_generation(leaf, ref,
-                                                        op->generation);
-                               btrfs_set_ref_objectid(leaf, ref, op->level);
-                               btrfs_set_ref_num_refs(leaf, ref, 1);
-                               op->del++;
-                       }
-
-                       /*
-                        * using del to see when its ok to free up the
-                        * pending_extent_op.  In the case where we insert the
-                        * last item on the list in order to help do batching
-                        * we need to not free the extent op until we actually
-                        * insert the extent_item
-                        */
-                       if (op->del == 2) {
-                               unlock_extent(&info->extent_ins, op->bytenr,
-                                             op->bytenr + op->num_bytes - 1,
-                                             GFP_NOFS);
-                               cur = cur->next;
-                               list_del_init(&op->list);
-                               kfree(op);
-                               if (cur != insert_list)
-                                       op = list_entry(cur,
-                                               struct pending_extent_op,
-                                               list);
-                       }
-               }
-               btrfs_mark_buffer_dirty(leaf);
-               btrfs_release_path(extent_root, path);
-
-               /*
-                * Ok backref's and items usually go right next to eachother,
-                * but if we could only insert 1 item that means that we
-                * inserted on the end of a leaf, and we have no idea what may
-                * be on the next leaf so we just play it safe.  In order to
-                * try and help this case we insert the last thing on our
-                * insert list so hopefully it will end up being the last
-                * thing on the leaf and everything else will be before it,
-                * which will let us insert a whole bunch of items at the same
-                * time.
-                */
-               if (ret == 1 && !last && (i + ret < total)) {
-                       /*
-                        * last: where we will pick up the next time around
-                        * i: our current key to insert, will be total - 1
-                        * cur: the current op we are screwing with
-                        * op: duh
-                        */
-                       last = i + ret;
-                       i = total - 1;
-                       cur = insert_list->prev;
-                       op = list_entry(cur, struct pending_extent_op, list);
-               } else if (last) {
-                       /*
-                        * ok we successfully inserted the last item on the
-                        * list, lets reset everything
-                        *
-                        * i: our current key to insert, so where we left off
-                        *    last time
-                        * last: done with this
-                        * cur: the op we are messing with
-                        * op: duh
-                        * total: since we inserted the last key, we need to
-                        *        decrement total so we dont overflow
-                        */
-                       i = last;
-                       last = 0;
-                       total--;
-                       if (i < total) {
-                               cur = insert_list->next;
-                               op = list_entry(cur, struct pending_extent_op,
-                                               list);
-                       }
-               } else {
-                       i += ret;
-               }
-
-               cond_resched();
-       }
-       ret = 0;
-       kfree(keys);
-       kfree(data_size);
-       return ret;
-}
-
 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 ref_root, u64 ref_generation,
-                                         u64 owner_objectid)
+                                         u64 owner_objectid,
+                                         int refs_to_add)
 {
        struct btrfs_key key;
        struct extent_buffer *leaf;
@@ -829,9 +586,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
                btrfs_set_ref_root(leaf, ref, ref_root);
                btrfs_set_ref_generation(leaf, ref, ref_generation);
                btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-               btrfs_set_ref_num_refs(leaf, ref, 1);
+               btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
        } else if (ret == -EEXIST) {
                u64 existing_owner;
+
                BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
                leaf = path->nodes[0];
                ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +603,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 
                num_refs = btrfs_ref_num_refs(leaf, ref);
                BUG_ON(num_refs == 0);
-               btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+               btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
 
                existing_owner = btrfs_ref_objectid(leaf, ref);
                if (existing_owner != owner_objectid &&
@@ -857,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
        } else {
                goto out;
        }
+       btrfs_unlock_up_safe(path, 1);
        btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
        btrfs_release_path(root, path);
@@ -865,7 +624,8 @@ out:
 
 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
-                                         struct btrfs_path *path)
+                                         struct btrfs_path *path,
+                                         int refs_to_drop)
 {
        struct extent_buffer *leaf;
        struct btrfs_extent_ref *ref;
@@ -875,8 +635,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
        leaf = path->nodes[0];
        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
        num_refs = btrfs_ref_num_refs(leaf, ref);
-       BUG_ON(num_refs == 0);
-       num_refs -= 1;
+       BUG_ON(num_refs < refs_to_drop);
+       num_refs -= refs_to_drop;
        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
@@ -927,332 +687,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }
 
-static noinline int free_extents(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *extent_root,
-                                struct list_head *del_list)
-{
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct btrfs_path *path;
-       struct btrfs_key key, found_key;
-       struct extent_buffer *leaf;
-       struct list_head *cur;
-       struct pending_extent_op *op;
-       struct btrfs_extent_item *ei;
-       int ret, num_to_del, extent_slot = 0, found_extent = 0;
-       u32 refs;
-       u64 bytes_freed = 0;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       path->reada = 1;
-
-search:
-       /* search for the backref for the current ref we want to delete */
-       cur = del_list->next;
-       op = list_entry(cur, struct pending_extent_op, list);
-       ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
-                                   op->orig_parent,
-                                   extent_root->root_key.objectid,
-                                   op->orig_generation, op->level, 1);
-       if (ret) {
-               printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
-                      "root %llu gen %llu owner %u\n",
-                      (unsigned long long)op->bytenr,
-                      (unsigned long long)extent_root->root_key.objectid,
-                      (unsigned long long)op->orig_generation, op->level);
-               btrfs_print_leaf(extent_root, path->nodes[0]);
-               WARN_ON(1);
-               goto out;
-       }
-
-       extent_slot = path->slots[0];
-       num_to_del = 1;
-       found_extent = 0;
-
-       /*
-        * if we aren't the first item on the leaf we can move back one and see
-        * if our ref is right next to our extent item
-        */
-       if (likely(extent_slot)) {
-               extent_slot--;
-               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                     extent_slot);
-               if (found_key.objectid == op->bytenr &&
-                   found_key.type == BTRFS_EXTENT_ITEM_KEY &&
-                   found_key.offset == op->num_bytes) {
-                       num_to_del++;
-                       found_extent = 1;
-               }
-       }
-
-       /*
-        * if we didn't find the extent we need to delete the backref and then
-        * search for the extent item key so we can update its ref count
-        */
-       if (!found_extent) {
-               key.objectid = op->bytenr;
-               key.type = BTRFS_EXTENT_ITEM_KEY;
-               key.offset = op->num_bytes;
-
-               ret = remove_extent_backref(trans, extent_root, path);
-               BUG_ON(ret);
-               btrfs_release_path(extent_root, path);
-               ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-               BUG_ON(ret);
-               extent_slot = path->slots[0];
-       }
-
-       /* this is where we update the ref count for the extent */
-       leaf = path->nodes[0];
-       ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
-       refs = btrfs_extent_refs(leaf, ei);
-       BUG_ON(refs == 0);
-       refs--;
-       btrfs_set_extent_refs(leaf, ei, refs);
-
-       btrfs_mark_buffer_dirty(leaf);
-
-       /*
-        * This extent needs deleting.  The reason cur_slot is extent_slot +
-        * num_to_del is because extent_slot points to the slot where the extent
-        * is, and if the backref was not right next to the extent we will be
-        * deleting at least 1 item, and will want to start searching at the
-        * slot directly next to extent_slot.  However if we did find the
-        * backref next to the extent item them we will be deleting at least 2
-        * items and will want to start searching directly after the ref slot
-        */
-       if (!refs) {
-               struct list_head *pos, *n, *end;
-               int cur_slot = extent_slot+num_to_del;
-               u64 super_used;
-               u64 root_used;
-
-               path->slots[0] = extent_slot;
-               bytes_freed = op->num_bytes;
-
-               mutex_lock(&info->pinned_mutex);
-               ret = pin_down_bytes(trans, extent_root, op->bytenr,
-                                    op->num_bytes, op->level >=
-                                    BTRFS_FIRST_FREE_OBJECTID);
-               mutex_unlock(&info->pinned_mutex);
-               BUG_ON(ret < 0);
-               op->del = ret;
-
-               /*
-                * we need to see if we can delete multiple things at once, so
-                * start looping through the list of extents we are wanting to
-                * delete and see if their extent/backref's are right next to
-                * eachother and the extents only have 1 ref
-                */
-               for (pos = cur->next; pos != del_list; pos = pos->next) {
-                       struct pending_extent_op *tmp;
-
-                       tmp = list_entry(pos, struct pending_extent_op, list);
-
-                       /* we only want to delete extent+ref at this stage */
-                       if (cur_slot >= btrfs_header_nritems(leaf) - 1)
-                               break;
-
-                       btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
-                       if (found_key.objectid != tmp->bytenr ||
-                           found_key.type != BTRFS_EXTENT_ITEM_KEY ||
-                           found_key.offset != tmp->num_bytes)
-                               break;
-
-                       /* check to make sure this extent only has one ref */
-                       ei = btrfs_item_ptr(leaf, cur_slot,
-                                           struct btrfs_extent_item);
-                       if (btrfs_extent_refs(leaf, ei) != 1)
-                               break;
-
-                       btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
-                       if (found_key.objectid != tmp->bytenr ||
-                           found_key.type != BTRFS_EXTENT_REF_KEY ||
-                           found_key.offset != tmp->orig_parent)
-                               break;
-
-                       /*
-                        * the ref is right next to the extent, we can set the
-                        * ref count to 0 since we will delete them both now
-                        */
-                       btrfs_set_extent_refs(leaf, ei, 0);
-
-                       /* pin down the bytes for this extent */
-                       mutex_lock(&info->pinned_mutex);
-                       ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
-                                            tmp->num_bytes, tmp->level >=
-                                            BTRFS_FIRST_FREE_OBJECTID);
-                       mutex_unlock(&info->pinned_mutex);
-                       BUG_ON(ret < 0);
-
-                       /*
-                        * use the del field to tell if we need to go ahead and
-                        * free up the extent when we delete the item or not.
-                        */
-                       tmp->del = ret;
-                       bytes_freed += tmp->num_bytes;
-
-                       num_to_del += 2;
-                       cur_slot += 2;
-               }
-               end = pos;
-
-               /* update the free space counters */
-               spin_lock(&info->delalloc_lock);
-               super_used = btrfs_super_bytes_used(&info->super_copy);
-               btrfs_set_super_bytes_used(&info->super_copy,
-                                          super_used - bytes_freed);
-
-               root_used = btrfs_root_used(&extent_root->root_item);
-               btrfs_set_root_used(&extent_root->root_item,
-                                   root_used - bytes_freed);
-               spin_unlock(&info->delalloc_lock);
-
-               /* delete the items */
-               ret = btrfs_del_items(trans, extent_root, path,
-                                     path->slots[0], num_to_del);
-               BUG_ON(ret);
-
-               /*
-                * loop through the extents we deleted and do the cleanup work
-                * on them
-                */
-               for (pos = cur, n = pos->next; pos != end;
-                    pos = n, n = pos->next) {
-                       struct pending_extent_op *tmp;
-                       tmp = list_entry(pos, struct pending_extent_op, list);
-
-                       /*
-                        * remember tmp->del tells us wether or not we pinned
-                        * down the extent
-                        */
-                       ret = update_block_group(trans, extent_root,
-                                                tmp->bytenr, tmp->num_bytes, 0,
-                                                tmp->del);
-                       BUG_ON(ret);
-
-                       list_del_init(&tmp->list);
-                       unlock_extent(&info->extent_ins, tmp->bytenr,
-                                     tmp->bytenr + tmp->num_bytes - 1,
-                                     GFP_NOFS);
-                       kfree(tmp);
-               }
-       } else if (refs && found_extent) {
-               /*
-                * the ref and extent were right next to eachother, but the
-                * extent still has a ref, so just free the backref and keep
-                * going
-                */
-               ret = remove_extent_backref(trans, extent_root, path);
-               BUG_ON(ret);
-
-               list_del_init(&op->list);
-               unlock_extent(&info->extent_ins, op->bytenr,
-                             op->bytenr + op->num_bytes - 1, GFP_NOFS);
-               kfree(op);
-       } else {
-               /*
-                * the extent has multiple refs and the backref we were looking
-                * for was not right next to it, so just unlock and go next,
-                * we're good to go
-                */
-               list_del_init(&op->list);
-               unlock_extent(&info->extent_ins, op->bytenr,
-                             op->bytenr + op->num_bytes - 1, GFP_NOFS);
-               kfree(op);
-       }
-
-       btrfs_release_path(extent_root, path);
-       if (!list_empty(del_list))
-               goto search;
-
-out:
-       btrfs_free_path(path);
-       return ret;
-}
-
 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root, u64 bytenr,
+                                    u64 num_bytes,
                                     u64 orig_parent, u64 parent,
                                     u64 orig_root, u64 ref_root,
                                     u64 orig_generation, u64 ref_generation,
                                     u64 owner_objectid)
 {
        int ret;
-       struct btrfs_root *extent_root = root->fs_info->extent_root;
-       struct btrfs_path *path;
-
-       if (root == root->fs_info->extent_root) {
-               struct pending_extent_op *extent_op;
-               u64 num_bytes;
-
-               BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
-               num_bytes = btrfs_level_size(root, (int)owner_objectid);
-               mutex_lock(&root->fs_info->extent_ins_mutex);
-               if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-                               bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-                       u64 priv;
-                       ret = get_state_private(&root->fs_info->extent_ins,
-                                               bytenr, &priv);
-                       BUG_ON(ret);
-                       extent_op = (struct pending_extent_op *)
-                                                       (unsigned long)priv;
-                       BUG_ON(extent_op->parent != orig_parent);
-                       BUG_ON(extent_op->generation != orig_generation);
+       int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
 
-                       extent_op->parent = parent;
-                       extent_op->generation = ref_generation;
-               } else {
-                       extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-                       BUG_ON(!extent_op);
-
-                       extent_op->type = PENDING_BACKREF_UPDATE;
-                       extent_op->bytenr = bytenr;
-                       extent_op->num_bytes = num_bytes;
-                       extent_op->parent = parent;
-                       extent_op->orig_parent = orig_parent;
-                       extent_op->generation = ref_generation;
-                       extent_op->orig_generation = orig_generation;
-                       extent_op->level = (int)owner_objectid;
-                       INIT_LIST_HEAD(&extent_op->list);
-                       extent_op->del = 0;
-
-                       set_extent_bits(&root->fs_info->extent_ins,
-                                       bytenr, bytenr + num_bytes - 1,
-                                       EXTENT_WRITEBACK, GFP_NOFS);
-                       set_state_private(&root->fs_info->extent_ins,
-                                         bytenr, (unsigned long)extent_op);
-               }
-               mutex_unlock(&root->fs_info->extent_ins_mutex);
-               return 0;
-       }
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       ret = lookup_extent_backref(trans, extent_root, path,
-                                   bytenr, orig_parent, orig_root,
-                                   orig_generation, owner_objectid, 1);
-       if (ret)
-               goto out;
-       ret = remove_extent_backref(trans, extent_root, path);
-       if (ret)
-               goto out;
-       ret = insert_extent_backref(trans, extent_root, path, bytenr,
-                                   parent, ref_root, ref_generation,
-                                   owner_objectid);
+       ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
+                                      orig_parent, parent, orig_root,
+                                      ref_root, orig_generation,
+                                      ref_generation, owner_objectid, pin);
        BUG_ON(ret);
-       finish_current_insert(trans, extent_root, 0);
-       del_pending_extents(trans, extent_root, 0);
-out:
-       btrfs_free_path(path);
        return ret;
 }
 
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 bytenr,
-                           u64 orig_parent, u64 parent,
+                           u64 num_bytes, u64 orig_parent, u64 parent,
                            u64 ref_root, u64 ref_generation,
                            u64 owner_objectid)
 {
@@ -1260,19 +716,35 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
        if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
            owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
                return 0;
-       ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
-                                       parent, ref_root, ref_root,
-                                       ref_generation, ref_generation,
-                                       owner_objectid);
+
+       ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
+                                       orig_parent, parent, ref_root,
+                                       ref_root, ref_generation,
+                                       ref_generation, owner_objectid);
        return ret;
 }
-
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root, u64 bytenr,
+                                 u64 num_bytes,
                                  u64 orig_parent, u64 parent,
                                  u64 orig_root, u64 ref_root,
                                  u64 orig_generation, u64 ref_generation,
                                  u64 owner_objectid)
+{
+       int ret;
+
+       ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+                                   ref_generation, owner_objectid,
+                                   BTRFS_ADD_DELAYED_REF, 0);
+       BUG_ON(ret);
+       return ret;
+}
+
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root, u64 bytenr,
+                         u64 num_bytes, u64 parent, u64 ref_root,
+                         u64 ref_generation, u64 owner_objectid,
+                         int refs_to_add)
 {
        struct btrfs_path *path;
        int ret;
@@ -1286,17 +758,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        path->reada = 1;
+       path->leave_spinning = 1;
        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
-       key.offset = (u64)-1;
+       key.offset = num_bytes;
 
-       ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-                               0, 1);
-       if (ret < 0)
+       /* first find the extent item and update its reference count */
+       ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+                               path, 0, 1);
+       if (ret < 0) {
+               btrfs_set_path_blocking(path);
                return ret;
-       BUG_ON(ret == 0 || path->slots[0] == 0);
+       }
 
-       path->slots[0]--;
+       if (ret > 0) {
+               WARN_ON(1);
+               btrfs_free_path(path);
+               return -EIO;
+       }
        l = path->nodes[0];
 
        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +789,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
 
        item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+
        refs = btrfs_extent_refs(l, item);
-       btrfs_set_extent_refs(l, item, refs + 1);
+       btrfs_set_extent_refs(l, item, refs + refs_to_add);
+       btrfs_unlock_up_safe(path, 1);
+
        btrfs_mark_buffer_dirty(path->nodes[0]);
 
        btrfs_release_path(root->fs_info->extent_root, path);
 
        path->reada = 1;
+       path->leave_spinning = 1;
+
+       /* now insert the actual backref */
        ret = insert_extent_backref(trans, root->fs_info->extent_root,
                                    path, bytenr, parent,
                                    ref_root, ref_generation,
-                                   owner_objectid);
+                                   owner_objectid, refs_to_add);
        BUG_ON(ret);
-       finish_current_insert(trans, root->fs_info->extent_root, 0);
-       del_pending_extents(trans, root->fs_info->extent_root, 0);
-
        btrfs_free_path(path);
        return 0;
 }
@@ -1339,68 +821,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
            owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
                return 0;
-       ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+
+       ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
                                     0, ref_root, 0, ref_generation,
                                     owner_objectid);
        return ret;
 }
 
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-                        struct btrfs_root *root)
-{
-       u64 start;
-       u64 end;
-       int ret;
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_delayed_ref_node *node)
+{
+       int ret = 0;
+       struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+
+       BUG_ON(node->ref_mod == 0);
+       ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+                                 node->parent, ref->root, ref->generation,
+                                 ref->owner_objectid, ref->pin, node->ref_mod);
+
+       return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_delayed_ref_node *node,
+                                       int insert_reserved)
+{
+       int ret;
+       struct btrfs_delayed_ref *ref;
+
+       if (node->parent == (u64)-1) {
+               struct btrfs_delayed_ref_head *head;
+               /*
+                * we've hit the end of the chain and we were supposed
+                * to insert this extent into the tree.  But, it got
+                * deleted before we ever needed to insert it, so all
+                * we have to do is clean up the accounting
+                */
+               if (insert_reserved) {
+                       update_reserved_extents(root, node->bytenr,
+                                               node->num_bytes, 0);
+               }
+               head = btrfs_delayed_node_to_head(node);
+               mutex_unlock(&head->mutex);
+               return 0;
+       }
+
+       ref = btrfs_delayed_node_to_ref(node);
+       if (ref->action == BTRFS_ADD_DELAYED_REF) {
+               if (insert_reserved) {
+                       struct btrfs_key ins;
+
+                       ins.objectid = node->bytenr;
+                       ins.offset = node->num_bytes;
+                       ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+                       /* record the full extent allocation */
+                       ret = __btrfs_alloc_reserved_extent(trans, root,
+                                       node->parent, ref->root,
+                                       ref->generation, ref->owner_objectid,
+                                       &ins, node->ref_mod);
+                       update_reserved_extents(root, node->bytenr,
+                                               node->num_bytes, 0);
+               } else {
+                       /* just add one backref */
+                       ret = add_extent_ref(trans, root, node->bytenr,
+                                    node->num_bytes,
+                                    node->parent, ref->root, ref->generation,
+                                    ref->owner_objectid, node->ref_mod);
+               }
+               BUG_ON(ret);
+       } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+               WARN_ON(insert_reserved);
+               ret = drop_delayed_ref(trans, root, node);
+       }
+       return 0;
+}
+
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
+{
+       struct rb_node *node;
+       struct btrfs_delayed_ref_node *ref;
+       int action = BTRFS_ADD_DELAYED_REF;
+again:
+       /*
+        * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+        * this prevents ref count from going down to zero when
+        * there still are pending delayed ref.
+        */
+       node = rb_prev(&head->node.rb_node);
+       while (1) {
+               if (!node)
+                       break;
+               ref = rb_entry(node, struct btrfs_delayed_ref_node,
+                               rb_node);
+               if (ref->bytenr != head->node.bytenr)
+                       break;
+               if (btrfs_delayed_node_to_ref(ref)->action == action)
+                       return ref;
+               node = rb_prev(node);
+       }
+       if (action == BTRFS_ADD_DELAYED_REF) {
+               action = BTRFS_DROP_DELAYED_REF;
+               goto again;
+       }
+       return NULL;
+}
+
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct list_head *cluster)
+{
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct btrfs_delayed_ref_head *locked_ref = NULL;
+       int ret;
+       int count = 0;
+       int must_insert_reserved = 0;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       while (1) {
+               if (!locked_ref) {
+                       /* pick a new head ref from the cluster list */
+                       if (list_empty(cluster))
+                               break;
+
+                       locked_ref = list_entry(cluster->next,
+                                    struct btrfs_delayed_ref_head, cluster);
+
+                       /* grab the lock that says we are going to process
+                        * all the refs for this head */
+                       ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+                       /*
+                        * we may have dropped the spin lock to get the head
+                        * mutex lock, and that might have given someone else
+                        * time to free the head.  If that's true, it has been
+                        * removed from our list and we can move on.
+                        */
+                       if (ret == -EAGAIN) {
+                               locked_ref = NULL;
+                               count++;
+                               continue;
+                       }
+               }
+
+               /*
+                * record the must insert reserved flag before we
+                * drop the spin lock.
+                */
+               must_insert_reserved = locked_ref->must_insert_reserved;
+               locked_ref->must_insert_reserved = 0;
+
+               /*
+                * locked_ref is the head node, so we have to go one
+                * node back for any delayed ref updates
+                */
+               ref = select_delayed_ref(locked_ref);
+               if (!ref) {
+                       /* All delayed refs have been processed, Go ahead
+                        * and send the head node to run_one_delayed_ref,
+                        * so that any accounting fixes can happen
+                        */
+                       ref = &locked_ref->node;
+                       list_del_init(&locked_ref->cluster);
+                       locked_ref = NULL;
+               }
+
+               ref->in_tree = 0;
+               rb_erase(&ref->rb_node, &delayed_refs->root);
+               delayed_refs->num_entries--;
+               spin_unlock(&delayed_refs->lock);
+
+               ret = run_one_delayed_ref(trans, root, ref,
+                                         must_insert_reserved);
+               BUG_ON(ret);
+               btrfs_put_delayed_ref(ref);
+
+               count++;
+               cond_resched();
+               spin_lock(&delayed_refs->lock);
+       }
+       return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, unsigned long count)
+{
+       struct rb_node *node;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct list_head cluster;
+       int ret;
+       int run_all = count == (unsigned long)-1;
+       int run_most = 0;
+
+       if (root == root->fs_info->extent_root)
+               root = root->fs_info->tree_root;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       INIT_LIST_HEAD(&cluster);
+again:
+       spin_lock(&delayed_refs->lock);
+       if (count == 0) {
+               count = delayed_refs->num_entries * 2;
+               run_most = 1;
+       }
+       while (1) {
+               if (!(run_all || run_most) &&
+                   delayed_refs->num_heads_ready < 64)
+                       break;
+
+               /*
+                * go find something we can process in the rbtree.  We start at
+                * the beginning of the tree, and then build a cluster
+                * of refs to process starting at the first one we are able to
+                * lock
+                */
+               ret = btrfs_find_ref_cluster(trans, &cluster,
+                                            delayed_refs->run_delayed_start);
+               if (ret)
+                       break;
+
+               ret = run_clustered_refs(trans, root, &cluster);
+               BUG_ON(ret < 0);
 
-       while(1) {
-               finish_current_insert(trans, root->fs_info->extent_root, 1);
-               del_pending_extents(trans, root->fs_info->extent_root, 1);
+               count -= min_t(unsigned long, ret, count);
 
-               /* is there more work to do? */
-               ret = find_first_extent_bit(&root->fs_info->pending_del,
-                                           0, &start, &end, EXTENT_WRITEBACK);
-               if (!ret)
-                       continue;
-               ret = find_first_extent_bit(&root->fs_info->extent_ins,
-                                           0, &start, &end, EXTENT_WRITEBACK);
-               if (!ret)
-                       continue;
-               break;
+               if (count == 0)
+                       break;
        }
-       return 0;
-}
 
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, u64 bytenr,
-                           u64 num_bytes, u32 *refs)
-{
-       struct btrfs_path *path;
-       int ret;
-       struct btrfs_key key;
-       struct extent_buffer *l;
-       struct btrfs_extent_item *item;
+       if (run_all) {
+               node = rb_first(&delayed_refs->root);
+               if (!node)
+                       goto out;
+               count = (unsigned long)-1;
 
-       WARN_ON(num_bytes < root->sectorsize);
-       path = btrfs_alloc_path();
-       path->reada = 1;
-       key.objectid = bytenr;
-       key.offset = num_bytes;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-       ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-                               0, 0);
-       if (ret < 0)
-               goto out;
-       if (ret != 0) {
-               btrfs_print_leaf(root, path->nodes[0]);
-               printk(KERN_INFO "btrfs failed to find block number %llu\n",
-                      (unsigned long long)bytenr);
-               BUG();
+               while (node) {
+                       ref = rb_entry(node, struct btrfs_delayed_ref_node,
+                                      rb_node);
+                       if (btrfs_delayed_ref_is_head(ref)) {
+                               struct btrfs_delayed_ref_head *head;
+
+                               head = btrfs_delayed_node_to_head(ref);
+                               atomic_inc(&ref->refs);
+
+                               spin_unlock(&delayed_refs->lock);
+                               mutex_lock(&head->mutex);
+                               mutex_unlock(&head->mutex);
+
+                               btrfs_put_delayed_ref(ref);
+                               cond_resched();
+                               goto again;
+                       }
+                       node = rb_next(node);
+               }
+               spin_unlock(&delayed_refs->lock);
+               schedule_timeout(1);
+               goto again;
        }
-       l = path->nodes[0];
-       item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-       *refs = btrfs_extent_refs(l, item);
 out:
-       btrfs_free_path(path);
+       spin_unlock(&delayed_refs->lock);
        return 0;
 }
 
@@ -1624,7 +1316,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
        int refi = 0;
        int slot;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64, u64, u64);
+                           u64, u64, u64, u64, u64, u64, u64, u64, u64);
 
        ref_root = btrfs_header_owner(buf);
        ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1388,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
 
                if (level == 0) {
                        btrfs_item_key_to_cpu(buf, &key, slot);
+                       fi = btrfs_item_ptr(buf, slot,
+                                           struct btrfs_file_extent_item);
+
+                       bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+                       if (bytenr == 0)
+                               continue;
 
                        ret = process_func(trans, root, bytenr,
-                                          orig_buf->start, buf->start,
-                                          orig_root, ref_root,
-                                          orig_generation, ref_generation,
-                                          key.objectid);
+                                  btrfs_file_extent_disk_num_bytes(buf, fi),
+                                  orig_buf->start, buf->start,
+                                  orig_root, ref_root,
+                                  orig_generation, ref_generation,
+                                  key.objectid);
 
                        if (ret) {
                                faili = slot;
@@ -1709,7 +1408,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
                                goto fail;
                        }
                } else {
-                       ret = process_func(trans, root, bytenr,
+                       ret = process_func(trans, root, bytenr, buf->len,
                                           orig_buf->start, buf->start,
                                           orig_root, ref_root,
                                           orig_generation, ref_generation,
@@ -1786,17 +1485,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
                        if (bytenr == 0)
                                continue;
                        ret = __btrfs_update_extent_ref(trans, root, bytenr,
-                                           orig_buf->start, buf->start,
-                                           orig_root, ref_root,
-                                           orig_generation, ref_generation,
-                                           key.objectid);
+                                   btrfs_file_extent_disk_num_bytes(buf, fi),
+                                   orig_buf->start, buf->start,
+                                   orig_root, ref_root, orig_generation,
+                                   ref_generation, key.objectid);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, slot);
                        ret = __btrfs_update_extent_ref(trans, root, bytenr,
-                                           orig_buf->start, buf->start,
-                                           orig_root, ref_root,
+                                           buf->len, orig_buf->start,
+                                           buf->start, orig_root, ref_root,
                                            orig_generation, ref_generation,
                                            level - 1);
                        if (ret)
@@ -1815,7 +1514,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
                                 struct btrfs_block_group_cache *cache)
 {
        int ret;
-       int pending_ret;
        struct btrfs_root *extent_root = root->fs_info->extent_root;
        unsigned long bi;
        struct extent_buffer *leaf;
@@ -1831,12 +1529,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(extent_root, path);
 fail:
-       finish_current_insert(trans, extent_root, 0);
-       pending_ret = del_pending_extents(trans, extent_root, 0);
        if (ret)
                return ret;
-       if (pending_ret)
-               return pending_ret;
        return 0;
 
 }
@@ -2361,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                clear_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
        }
+       mutex_unlock(&root->fs_info->pinned_mutex);
+
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
                BUG_ON(!cache);
@@ -2452,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
        u64 end;
        int ret;
 
-       mutex_lock(&root->fs_info->pinned_mutex);
        while (1) {
+               mutex_lock(&root->fs_info->pinned_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
@@ -2461,209 +2157,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
                ret = btrfs_discard_extent(root, start, end + 1 - start);
 
+               /* unlocks the pinned mutex */
                btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
 
-               if (need_resched()) {
-                       mutex_unlock(&root->fs_info->pinned_mutex);
-                       cond_resched();
-                       mutex_lock(&root->fs_info->pinned_mutex);
-               }
+               cond_resched();
        }
        mutex_unlock(&root->fs_info->pinned_mutex);
        return ret;
 }
 
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *extent_root, int all)
-{
-       u64 start;
-       u64 end;
-       u64 priv;
-       u64 search = 0;
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct btrfs_path *path;
-       struct pending_extent_op *extent_op, *tmp;
-       struct list_head insert_list, update_list;
-       int ret;
-       int num_inserts = 0, max_inserts, restart = 0;
-
-       path = btrfs_alloc_path();
-       INIT_LIST_HEAD(&insert_list);
-       INIT_LIST_HEAD(&update_list);
-
-       max_inserts = extent_root->leafsize /
-               (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
-                sizeof(struct btrfs_extent_ref) +
-                sizeof(struct btrfs_extent_item));
-again:
-       mutex_lock(&info->extent_ins_mutex);
-       while (1) {
-               ret = find_first_extent_bit(&info->extent_ins, search, &start,
-                                           &end, EXTENT_WRITEBACK);
-               if (ret) {
-                       if (restart && !num_inserts &&
-                           list_empty(&update_list)) {
-                               restart = 0;
-                               search = 0;
-                               continue;
-                       }
-                       break;
-               }
-
-               ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
-               if (!ret) {
-                       if (all)
-                               restart = 1;
-                       search = end + 1;
-                       if (need_resched()) {
-                               mutex_unlock(&info->extent_ins_mutex);
-                               cond_resched();
-                               mutex_lock(&info->extent_ins_mutex);
-                       }
-                       continue;
-               }
-
-               ret = get_state_private(&info->extent_ins, start, &priv);
-               BUG_ON(ret);
-               extent_op = (struct pending_extent_op *)(unsigned long) priv;
-
-               if (extent_op->type == PENDING_EXTENT_INSERT) {
-                       num_inserts++;
-                       list_add_tail(&extent_op->list, &insert_list);
-                       search = end + 1;
-                       if (num_inserts == max_inserts) {
-                               restart = 1;
-                               break;
-                       }
-               } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
-                       list_add_tail(&extent_op->list, &update_list);
-                       search = end + 1;
-               } else {
-                       BUG();
-               }
-       }
-
-       /*
-        * process the update list, clear the writeback bit for it, and if
-        * somebody marked this thing for deletion then just unlock it and be
-        * done, the free_extents will handle it
-        */
-       list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
-               clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-                                 extent_op->bytenr + extent_op->num_bytes - 1,
-                                 EXTENT_WRITEBACK, GFP_NOFS);
-               if (extent_op->del) {
-                       list_del_init(&extent_op->list);
-                       unlock_extent(&info->extent_ins, extent_op->bytenr,
-                                     extent_op->bytenr + extent_op->num_bytes
-                                     - 1, GFP_NOFS);
-                       kfree(extent_op);
-               }
-       }
-       mutex_unlock(&info->extent_ins_mutex);
-
-       /*
-        * still have things left on the update list, go ahead an update
-        * everything
-        */
-       if (!list_empty(&update_list)) {
-               ret = update_backrefs(trans, extent_root, path, &update_list);
-               BUG_ON(ret);
-
-               /* we may have COW'ed new blocks, so lets start over */
-               if (all)
-                       restart = 1;
-       }
-
-       /*
-        * if no inserts need to be done, but we skipped some extents and we
-        * need to make sure everything is cleaned then reset everything and
-        * go back to the beginning
-        */
-       if (!num_inserts && restart) {
-               search = 0;
-               restart = 0;
-               INIT_LIST_HEAD(&update_list);
-               INIT_LIST_HEAD(&insert_list);
-               goto again;
-       } else if (!num_inserts) {
-               goto out;
-       }
-
-       /*
-        * process the insert extents list.  Again if we are deleting this
-        * extent, then just unlock it, pin down the bytes if need be, and be
-        * done with it.  Saves us from having to actually insert the extent
-        * into the tree and then subsequently come along and delete it
-        */
-       mutex_lock(&info->extent_ins_mutex);
-       list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
-               clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-                                 extent_op->bytenr + extent_op->num_bytes - 1,
-                                 EXTENT_WRITEBACK, GFP_NOFS);
-               if (extent_op->del) {
-                       u64 used;
-                       list_del_init(&extent_op->list);
-                       unlock_extent(&info->extent_ins, extent_op->bytenr,
-                                     extent_op->bytenr + extent_op->num_bytes
-                                     - 1, GFP_NOFS);
-
-                       mutex_lock(&extent_root->fs_info->pinned_mutex);
-                       ret = pin_down_bytes(trans, extent_root,
-                                            extent_op->bytenr,
-                                            extent_op->num_bytes, 0);
-                       mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-                       spin_lock(&info->delalloc_lock);
-                       used = btrfs_super_bytes_used(&info->super_copy);
-                       btrfs_set_super_bytes_used(&info->super_copy,
-                                       used - extent_op->num_bytes);
-                       used = btrfs_root_used(&extent_root->root_item);
-                       btrfs_set_root_used(&extent_root->root_item,
-                                       used - extent_op->num_bytes);
-                       spin_unlock(&info->delalloc_lock);
-
-                       ret = update_block_group(trans, extent_root,
-                                                extent_op->bytenr,
-                                                extent_op->num_bytes,
-                                                0, ret > 0);
-                       BUG_ON(ret);
-                       kfree(extent_op);
-                       num_inserts--;
-               }
-       }
-       mutex_unlock(&info->extent_ins_mutex);
-
-       ret = insert_extents(trans, extent_root, path, &insert_list,
-                            num_inserts);
-       BUG_ON(ret);
-
-       /*
-        * if restart is set for whatever reason we need to go back and start
-        * searching through the pending list again.
-        *
-        * We just inserted some extents, which could have resulted in new
-        * blocks being allocated, which would result in new blocks needing
-        * updates, so if all is set we _must_ restart to get the updated
-        * blocks.
-        */
-       if (restart || all) {
-               INIT_LIST_HEAD(&insert_list);
-               INIT_LIST_HEAD(&update_list);
-               search = 0;
-               restart = 0;
-               num_inserts = 0;
-               goto again;
-       }
-out:
-       btrfs_free_path(path);
-       return 0;
-}
-
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
-                         u64 bytenr, u64 num_bytes, int is_data)
+                         struct btrfs_path *path,
+                         u64 bytenr, u64 num_bytes, int is_data,
+                         struct extent_buffer **must_clean)
 {
        int err = 0;
        struct extent_buffer *buf;
@@ -2686,17 +2194,19 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
                u64 header_transid = btrfs_header_generation(buf);
                if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
                    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+                   header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
                    header_transid == trans->transid &&
                    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-                       clean_tree_block(NULL, root, buf);
-                       btrfs_tree_unlock(buf);
-                       free_extent_buffer(buf);
+                       *must_clean = buf;
                        return 1;
                }
                btrfs_tree_unlock(buf);
        }
        free_extent_buffer(buf);
 pinit:
+       btrfs_set_path_blocking(path);
+       mutex_lock(&root->fs_info->pinned_mutex);
+       /* unlocks the pinned mutex */
        btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
        BUG_ON(err < 0);
@@ -2710,7 +2220,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 ref_generation,
-                        u64 owner_objectid, int pin, int mark_free)
+                        u64 owner_objectid, int pin, int mark_free,
+                        int refs_to_drop)
 {
        struct btrfs_path *path;
        struct btrfs_key key;
@@ -2732,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        path->reada = 1;
+       path->leave_spinning = 1;
        ret = lookup_extent_backref(trans, extent_root, path,
                                    bytenr, parent, root_objectid,
                                    ref_generation, owner_objectid, 1);
@@ -2753,9 +2265,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                                break;
                }
                if (!found_extent) {
-                       ret = remove_extent_backref(trans, extent_root, path);
+                       ret = remove_extent_backref(trans, extent_root, path,
+                                                   refs_to_drop);
                        BUG_ON(ret);
                        btrfs_release_path(extent_root, path);
+                       path->leave_spinning = 1;
                        ret = btrfs_search_slot(trans, extent_root,
                                                &key, path, -1, 1);
                        if (ret) {
@@ -2771,8 +2285,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                btrfs_print_leaf(extent_root, path->nodes[0]);
                WARN_ON(1);
                printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-                      "root %llu gen %llu owner %llu\n",
+                      "parent %llu root %llu gen %llu owner %llu\n",
                       (unsigned long long)bytenr,
+                      (unsigned long long)parent,
                       (unsigned long long)root_objectid,
                       (unsigned long long)ref_generation,
                       (unsigned long long)owner_objectid);
@@ -2782,17 +2297,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
        ei = btrfs_item_ptr(leaf, extent_slot,
                            struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
-       BUG_ON(refs == 0);
-       refs -= 1;
-       btrfs_set_extent_refs(leaf, ei, refs);
 
+       /*
+        * we're not allowed to delete the extent item if there
+        * are other delayed ref updates pending
+        */
+
+       BUG_ON(refs < refs_to_drop);
+       refs -= refs_to_drop;
+       btrfs_set_extent_refs(leaf, ei, refs);
        btrfs_mark_buffer_dirty(leaf);
 
-       if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+       if (refs == 0 && found_extent &&
+           path->slots[0] == extent_slot + 1) {
                struct btrfs_extent_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_ref);
-               BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+               BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
                /* if the back ref and the extent are next to each other
                 * they get deleted below in one shot
                 */
@@ -2800,11 +2321,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                num_to_del = 2;
        } else if (found_extent) {
                /* otherwise delete the extent back ref */
-               ret = remove_extent_backref(trans, extent_root, path);
+               ret = remove_extent_backref(trans, extent_root, path,
+                                           refs_to_drop);
                BUG_ON(ret);
                /* if refs are 0, we need to setup the path for deletion */
                if (refs == 0) {
                        btrfs_release_path(extent_root, path);
+                       path->leave_spinning = 1;
                        ret = btrfs_search_slot(trans, extent_root, &key, path,
                                                -1, 1);
                        BUG_ON(ret);
@@ -2814,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
        if (refs == 0) {
                u64 super_used;
                u64 root_used;
+               struct extent_buffer *must_clean = NULL;
 
                if (pin) {
-                       mutex_lock(&root->fs_info->pinned_mutex);
-                       ret = pin_down_bytes(trans, root, bytenr, num_bytes,
-                               owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
-                       mutex_unlock(&root->fs_info->pinned_mutex);
+                       ret = pin_down_bytes(trans, root, path,
+                               bytenr, num_bytes,
+                               owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+                               &must_clean);
                        if (ret > 0)
                                mark_free = 1;
                        BUG_ON(ret < 0);
                }
+
                /* block accounting for super block */
                spin_lock(&info->delalloc_lock);
                super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2360,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                btrfs_set_root_used(&root->root_item,
                                           root_used - num_bytes);
                spin_unlock(&info->delalloc_lock);
+
+               /*
+                * it is going to be very rare for someone to be waiting
+                * on the block we're freeing.  del_items might need to
+                * schedule, so rather than get fancy, just force it
+                * to blocking here
+                */
+               if (must_clean)
+                       btrfs_set_lock_blocking(must_clean);
+
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                BUG_ON(ret);
                btrfs_release_path(extent_root, path);
 
+               if (must_clean) {
+                       clean_tree_block(NULL, root, must_clean);
+                       btrfs_tree_unlock(must_clean);
+                       free_extent_buffer(must_clean);
+               }
+
                if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                        BUG_ON(ret);
+               } else {
+                       invalidate_mapping_pages(info->btree_inode->i_mapping,
+                            bytenr >> PAGE_CACHE_SHIFT,
+                            (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
                }
 
                ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2395,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                BUG_ON(ret);
        }
        btrfs_free_path(path);
-       finish_current_insert(trans, extent_root, 0);
        return ret;
 }
 
 /*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
+ * remove an extent from the root, returns 0 on success
  */
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *extent_root, int all)
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       u64 bytenr, u64 num_bytes, u64 parent,
+                                       u64 root_objectid, u64 ref_generation,
+                                       u64 owner_objectid, int pin,
+                                       int refs_to_drop)
 {
-       int ret;
-       int err = 0;
-       u64 start;
-       u64 end;
-       u64 priv;
-       u64 search = 0;
-       int nr = 0, skipped = 0;
-       struct extent_io_tree *pending_del;
-       struct extent_io_tree *extent_ins;
-       struct pending_extent_op *extent_op;
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct list_head delete_list;
-
-       INIT_LIST_HEAD(&delete_list);
-       extent_ins = &extent_root->fs_info->extent_ins;
-       pending_del = &extent_root->fs_info->pending_del;
-
-again:
-       mutex_lock(&info->extent_ins_mutex);
-       while (1) {
-               ret = find_first_extent_bit(pending_del, search, &start, &end,
-                                           EXTENT_WRITEBACK);
-               if (ret) {
-                       if (all && skipped && !nr) {
-                               search = 0;
-                               skipped = 0;
-                               continue;
-                       }
-                       mutex_unlock(&info->extent_ins_mutex);
-                       break;
-               }
-
-               ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
-               if (!ret) {
-                       search = end+1;
-                       skipped = 1;
-
-                       if (need_resched()) {
-                               mutex_unlock(&info->extent_ins_mutex);
-                               cond_resched();
-                               mutex_lock(&info->extent_ins_mutex);
-                       }
-
-                       continue;
-               }
-               BUG_ON(ret < 0);
-
-               ret = get_state_private(pending_del, start, &priv);
-               BUG_ON(ret);
-               extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
-               clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
-                                 GFP_NOFS);
-               if (!test_range_bit(extent_ins, start, end,
-                                   EXTENT_WRITEBACK, 0)) {
-                       list_add_tail(&extent_op->list, &delete_list);
-                       nr++;
-               } else {
-                       kfree(extent_op);
-
-                       ret = get_state_private(&info->extent_ins, start,
-                                               &priv);
-                       BUG_ON(ret);
-                       extent_op = (struct pending_extent_op *)
-                                               (unsigned long)priv;
-
-                       clear_extent_bits(&info->extent_ins, start, end,
-                                         EXTENT_WRITEBACK, GFP_NOFS);
-
-                       if (extent_op->type == PENDING_BACKREF_UPDATE) {
-                               list_add_tail(&extent_op->list, &delete_list);
-                               search = end + 1;
-                               nr++;
-                               continue;
-                       }
-
-                       mutex_lock(&extent_root->fs_info->pinned_mutex);
-                       ret = pin_down_bytes(trans, extent_root, start,
-                                            end + 1 - start, 0);
-                       mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-                       ret = update_block_group(trans, extent_root, start,
-                                               end + 1 - start, 0, ret > 0);
-
-                       unlock_extent(extent_ins, start, end, GFP_NOFS);
-                       BUG_ON(ret);
-                       kfree(extent_op);
-               }
-               if (ret)
-                       err = ret;
-
-               search = end + 1;
-
-               if (need_resched()) {
-                       mutex_unlock(&info->extent_ins_mutex);
-                       cond_resched();
-                       mutex_lock(&info->extent_ins_mutex);
-               }
-       }
+       WARN_ON(num_bytes < root->sectorsize);
 
-       if (nr) {
-               ret = free_extents(trans, extent_root, &delete_list);
-               BUG_ON(ret);
-       }
+       /*
+        * if metadata always pin
+        * if data pin when any transaction has committed this
+        */
+       if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+           ref_generation != trans->transid)
+               pin = 1;
 
-       if (all && skipped) {
-               INIT_LIST_HEAD(&delete_list);
-               search = 0;
-               nr = 0;
-               goto again;
-       }
+       if (ref_generation != trans->transid)
+               pin = 1;
 
-       if (!err)
-               finish_current_insert(trans, extent_root, 0);
-       return err;
+       return __free_extent(trans, root, bytenr, num_bytes, parent,
+                           root_objectid, ref_generation,
+                           owner_objectid, pin, pin == 0, refs_to_drop);
 }
 
 /*
- * remove an extent from the root, returns 0 on success
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well.  This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
  */
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              u64 bytenr, u64 num_bytes, u64 parent,
-                              u64 root_objectid, u64 ref_generation,
-                              u64 owner_objectid, int pin)
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root, u64 bytenr)
 {
-       struct btrfs_root *extent_root = root->fs_info->extent_root;
-       int pending_ret;
+       struct btrfs_delayed_ref_head *head;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct rb_node *node;
        int ret;
 
-       WARN_ON(num_bytes < root->sectorsize);
-       if (root == extent_root) {
-               struct pending_extent_op *extent_op = NULL;
-
-               mutex_lock(&root->fs_info->extent_ins_mutex);
-               if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-                               bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-                       u64 priv;
-                       ret = get_state_private(&root->fs_info->extent_ins,
-                                               bytenr, &priv);
-                       BUG_ON(ret);
-                       extent_op = (struct pending_extent_op *)
-                                               (unsigned long)priv;
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       head = btrfs_find_delayed_ref_head(trans, bytenr);
+       if (!head)
+               goto out;
 
-                       extent_op->del = 1;
-                       if (extent_op->type == PENDING_EXTENT_INSERT) {
-                               mutex_unlock(&root->fs_info->extent_ins_mutex);
-                               return 0;
-                       }
-               }
+       node = rb_prev(&head->node.rb_node);
+       if (!node)
+               goto out;
 
-               if (extent_op) {
-                       ref_generation = extent_op->orig_generation;
-                       parent = extent_op->orig_parent;
-               }
+       ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 
-               extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-               BUG_ON(!extent_op);
-
-               extent_op->type = PENDING_EXTENT_DELETE;
-               extent_op->bytenr = bytenr;
-               extent_op->num_bytes = num_bytes;
-               extent_op->parent = parent;
-               extent_op->orig_parent = parent;
-               extent_op->generation = ref_generation;
-               extent_op->orig_generation = ref_generation;
-               extent_op->level = (int)owner_objectid;
-               INIT_LIST_HEAD(&extent_op->list);
-               extent_op->del = 0;
-
-               set_extent_bits(&root->fs_info->pending_del,
-                               bytenr, bytenr + num_bytes - 1,
-                               EXTENT_WRITEBACK, GFP_NOFS);
-               set_state_private(&root->fs_info->pending_del,
-                                 bytenr, (unsigned long)extent_op);
-               mutex_unlock(&root->fs_info->extent_ins_mutex);
-               return 0;
-       }
-       /* if metadata always pin */
-       if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-               if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-                       mutex_lock(&root->fs_info->pinned_mutex);
-                       btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-                       mutex_unlock(&root->fs_info->pinned_mutex);
-                       update_reserved_extents(root, bytenr, num_bytes, 0);
-                       return 0;
-               }
-               pin = 1;
-       }
+       /* there are still entries for this ref, we can't drop it */
+       if (ref->bytenr == bytenr)
+               goto out;
 
-       /* if data pin when any transaction has committed this */
-       if (ref_generation != trans->transid)
-               pin = 1;
+       /*
+        * waiting for the lock here would deadlock.  If someone else has it
+        * locked they are already in the process of dropping it anyway
+        */
+       if (!mutex_trylock(&head->mutex))
+               goto out;
 
-       ret = __free_extent(trans, root, bytenr, num_bytes, parent,
-                           root_objectid, ref_generation,
-                           owner_objectid, pin, pin == 0);
+       /*
+        * at this point we have a head with no other entries.  Go
+        * ahead and process it.
+        */
+       head->node.in_tree = 0;
+       rb_erase(&head->node.rb_node, &delayed_refs->root);
+
+       delayed_refs->num_entries--;
 
-       finish_current_insert(trans, root->fs_info->extent_root, 0);
-       pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
-       return ret ? ret : pending_ret;
+       /*
+        * we don't take a ref on the node because we're removing it from the
+        * tree, so we just steal the ref the tree was holding.
+        */
+       delayed_refs->num_heads--;
+       if (list_empty(&head->cluster))
+               delayed_refs->num_heads_ready--;
+
+       list_del_init(&head->cluster);
+       spin_unlock(&delayed_refs->lock);
+
+       ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+                                 &head->node, head->must_insert_reserved);
+       BUG_ON(ret);
+       btrfs_put_delayed_ref(&head->node);
+       return 0;
+out:
+       spin_unlock(&delayed_refs->lock);
+       return 0;
 }
 
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2502,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
 
-       ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
-                                 root_objectid, ref_generation,
-                                 owner_objectid, pin);
+       /*
+        * tree log blocks never actually go into the extent allocation
+        * tree, just update pinning info and exit early.
+        *
+        * data extents referenced by the tree log do need to have
+        * their reference counts bumped.
+        */
+       if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+           owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+               mutex_lock(&root->fs_info->pinned_mutex);
+
+               /* unlocks the pinned mutex */
+               btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+               update_reserved_extents(root, bytenr, num_bytes, 0);
+               ret = 0;
+       } else {
+               ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+                                      root_objectid, ref_generation,
+                                      owner_objectid,
+                                      BTRFS_DROP_DELAYED_REF, 1);
+               BUG_ON(ret);
+               ret = check_ref_cleanup(trans, root, bytenr);
+               BUG_ON(ret);
+       }
        return ret;
 }
 
@@ -3475,10 +2926,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root, u64 parent,
                                         u64 root_objectid, u64 ref_generation,
-                                        u64 owner, struct btrfs_key *ins)
+                                        u64 owner, struct btrfs_key *ins,
+                                        int ref_mod)
 {
        int ret;
-       int pending_ret;
        u64 super_used;
        u64 root_used;
        u64 num_bytes = ins->offset;
@@ -3503,33 +2954,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
        btrfs_set_root_used(&root->root_item, root_used + num_bytes);
        spin_unlock(&info->delalloc_lock);
 
-       if (root == extent_root) {
-               struct pending_extent_op *extent_op;
-
-               extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-               BUG_ON(!extent_op);
-
-               extent_op->type = PENDING_EXTENT_INSERT;
-               extent_op->bytenr = ins->objectid;
-               extent_op->num_bytes = ins->offset;
-               extent_op->parent = parent;
-               extent_op->orig_parent = 0;
-               extent_op->generation = ref_generation;
-               extent_op->orig_generation = 0;
-               extent_op->level = (int)owner;
-               INIT_LIST_HEAD(&extent_op->list);
-               extent_op->del = 0;
-
-               mutex_lock(&root->fs_info->extent_ins_mutex);
-               set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
-                               ins->objectid + ins->offset - 1,
-                               EXTENT_WRITEBACK, GFP_NOFS);
-               set_state_private(&root->fs_info->extent_ins,
-                                 ins->objectid, (unsigned long)extent_op);
-               mutex_unlock(&root->fs_info->extent_ins_mutex);
-               goto update_block;
-       }
-
        memcpy(&keys[0], ins, sizeof(*ins));
        keys[1].objectid = ins->objectid;
        keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +2964,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
                                       sizes, 2);
        BUG_ON(ret);
 
        extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                     struct btrfs_extent_item);
-       btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+       btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
        ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
                             struct btrfs_extent_ref);
 
        btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
        btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
        btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-       btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+       btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
 
        btrfs_mark_buffer_dirty(path->nodes[0]);
 
        trans->alloc_exclude_start = 0;
        trans->alloc_exclude_nr = 0;
        btrfs_free_path(path);
-       finish_current_insert(trans, extent_root, 0);
-       pending_ret = del_pending_extents(trans, extent_root, 0);
 
        if (ret)
                goto out;
-       if (pending_ret) {
-               ret = pending_ret;
-               goto out;
-       }
 
-update_block:
        ret = update_block_group(trans, root, ins->objectid,
                                 ins->offset, 1, 0);
        if (ret) {
@@ -3592,9 +3010,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
        if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
                return 0;
-       ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-                                           ref_generation, owner, ins);
-       update_reserved_extents(root, ins->objectid, ins->offset, 0);
+
+       ret = btrfs_add_delayed_ref(trans, ins->objectid,
+                                   ins->offset, parent, root_objectid,
+                                   ref_generation, owner,
+                                   BTRFS_ADD_DELAYED_EXTENT, 0);
+       BUG_ON(ret);
        return ret;
 }
 
@@ -3621,7 +3042,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        put_block_group(block_group);
        ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-                                           ref_generation, owner, ins);
+                                           ref_generation, owner, ins, 1);
        return ret;
 }
 
@@ -3640,20 +3061,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       u64 search_end, struct btrfs_key *ins, u64 data)
 {
        int ret;
-
        ret = __btrfs_reserve_extent(trans, root, num_bytes,
                                     min_alloc_size, empty_size, hint_byte,
                                     search_end, ins, data);
        BUG_ON(ret);
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-               ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-                                       root_objectid, ref_generation,
-                                       owner_objectid, ins);
+               ret = btrfs_add_delayed_ref(trans, ins->objectid,
+                                           ins->offset, parent, root_objectid,
+                                           ref_generation, owner_objectid,
+                                           BTRFS_ADD_DELAYED_EXTENT, 0);
                BUG_ON(ret);
-
-       } else {
-               update_reserved_extents(root, ins->objectid, ins->offset, 1);
        }
+       update_reserved_extents(root, ins->objectid, ins->offset, 1);
        return ret;
 }
 
@@ -3789,7 +3208,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 
                fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 
-               ret = __btrfs_free_extent(trans, root, disk_bytenr,
+               ret = btrfs_free_extent(trans, root, disk_bytenr,
                                btrfs_file_extent_disk_num_bytes(leaf, fi),
                                leaf->start, leaf_owner, leaf_generation,
                                key.objectid, 0);
@@ -3829,7 +3248,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
         */
        for (i = 0; i < ref->nritems; i++) {
                info = ref->extents + sorted[i].slot;
-               ret = __btrfs_free_extent(trans, root, info->bytenr,
+               ret = btrfs_free_extent(trans, root, info->bytenr,
                                          info->num_bytes, ref->bytenr,
                                          ref->owner, ref->generation,
                                          info->objectid, 0);
@@ -3846,12 +3265,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root, u64 start,
                                     u64 len, u32 *refs)
 {
        int ret;
 
-       ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+       ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
        BUG_ON(ret);
 
 #if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3379,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
                 * we just decrement it below and don't update any
                 * of the refs the leaf points to.
                 */
-               ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+               ret = drop_snap_lookup_refcount(trans, root, bytenr,
+                                               blocksize, &refs);
                BUG_ON(ret);
                if (refs != 1)
                        continue;
@@ -4010,7 +3431,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
         */
        for (i = 0; i < refi; i++) {
                bytenr = sorted[i].bytenr;
-               ret = __btrfs_free_extent(trans, root, bytenr,
+               ret = btrfs_free_extent(trans, root, bytenr,
                                        blocksize, eb->start,
                                        root_owner, root_gen, 0, 1);
                BUG_ON(ret);
@@ -4053,7 +3474,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
        WARN_ON(*level < 0);
        WARN_ON(*level >= BTRFS_MAX_LEVEL);
-       ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+       ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
                                path->nodes[*level]->len, &refs);
        BUG_ON(ret);
        if (refs > 1)
@@ -4104,7 +3525,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
                blocksize = btrfs_level_size(root, *level - 1);
 
-               ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+               ret = drop_snap_lookup_refcount(trans, root, bytenr,
+                                               blocksize, &refs);
                BUG_ON(ret);
 
                /*
@@ -4119,7 +3541,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                        root_gen = btrfs_header_generation(parent);
                        path->slots[*level]++;
 
-                       ret = __btrfs_free_extent(trans, root, bytenr,
+                       ret = btrfs_free_extent(trans, root, bytenr,
                                                blocksize, parent->start,
                                                root_owner, root_gen,
                                                *level - 1, 1);
@@ -4165,7 +3587,7 @@ out:
         * cleanup and free the reference on the last node
         * we processed
         */
-       ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+       ret = btrfs_free_extent(trans, root, bytenr, blocksize,
                                  parent->start, root_owner, root_gen,
                                  *level, 1);
        free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3776,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
        struct btrfs_path *path;
        int i;
        int orig_level;
+       int update_count;
        struct btrfs_root_item *root_item = &root->root_item;
 
        WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3818,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                }
        }
        while (1) {
+               unsigned long update;
                wret = walk_down_tree(trans, root, path, &level);
                if (wret > 0)
                        break;
@@ -4407,12 +3831,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                        break;
                if (wret < 0)
                        ret = wret;
-               if (trans->transaction->in_commit) {
+               if (trans->transaction->in_commit ||
+                   trans->transaction->delayed_refs.flushing) {
                        ret = -EAGAIN;
                        break;
                }
                atomic_inc(&root->fs_info->throttle_gen);
                wake_up(&root->fs_info->transaction_throttle);
+               for (update_count = 0; update_count < 16; update_count++) {
+                       update = trans->delayed_ref_updates;
+                       trans->delayed_ref_updates = 0;
+                       if (update)
+                               btrfs_run_delayed_refs(trans, root, update);
+                       else
+                               break;
+               }
        }
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
@@ -5457,6 +4890,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
                                        root->root_key.objectid,
                                        trans->transid, key.objectid);
                BUG_ON(ret);
+
                ret = btrfs_free_extent(trans, root,
                                        bytenr, num_bytes, leaf->start,
                                        btrfs_header_owner(leaf),
@@ -5768,9 +5202,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
                                ref_path, NULL, NULL);
        BUG_ON(ret);
 
-       if (root == root->fs_info->extent_root)
-               btrfs_extent_post_op(trans, root);
-
        return 0;
 }
 
@@ -6038,6 +5469,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
        if (ret)
                goto out;
@@ -6208,6 +5640,9 @@ again:
        btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
        mutex_unlock(&root->fs_info->cleaner_mutex);
 
+       trans = btrfs_start_transaction(info->tree_root, 1);
+       btrfs_commit_transaction(trans, info->tree_root);
+
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
@@ -6466,7 +5901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        extent_root = root->fs_info->extent_root;
 
-       root->fs_info->last_trans_new_blockgroup = trans->transid;
+       root->fs_info->last_trans_log_full_commit = trans->transid;
 
        cache = kzalloc(sizeof(*cache), GFP_NOFS);
        if (!cache)
@@ -6500,9 +5935,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                                sizeof(cache->item));
        BUG_ON(ret);
 
-       finish_current_insert(trans, extent_root, 0);
-       ret = del_pending_extents(trans, extent_root, 0);
-       BUG_ON(ret);
        set_avail_alloc_bits(extent_root->fs_info, type);
 
        return 0;
index ebe6b29e60698156250708766c70bb74b903bfb6..08085af089e27827eba49898146ebe63c6623b07 100644 (file)
@@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb)
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                              struct extent_buffer *eb)
 {
-       int set;
        unsigned long i;
        unsigned long num_pages;
        struct page *page;
 
-       u64 start = eb->start;
-       u64 end = start + eb->len - 1;
-
-       set = clear_extent_dirty(tree, start, end, GFP_NOFS);
        num_pages = num_extent_pages(eb->start, eb->len);
 
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               if (!set && !PageDirty(page))
+               if (!PageDirty(page))
                        continue;
 
                lock_page(page);
@@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                else
                        set_page_private(page, EXTENT_PAGE_PRIVATE);
 
-               /*
-                * if we're on the last page or the first page and the
-                * block isn't aligned on a page boundary, do extra checks
-                * to make sure we don't clean page that is partially dirty
-                */
-               if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-                   ((i == num_pages - 1) &&
-                    ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-                       start = (u64)page->index << PAGE_CACHE_SHIFT;
-                       end  = start + PAGE_CACHE_SIZE - 1;
-                       if (test_range_bit(tree, start, end,
-                                          EXTENT_DIRTY, 0)) {
-                               unlock_page(page);
-                               continue;
-                       }
-               }
                clear_page_dirty_for_io(page);
                spin_lock_irq(&page->mapping->tree_lock);
                if (!PageDirty(page)) {
@@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 {
        unsigned long i;
        unsigned long num_pages;
+       int was_dirty = 0;
 
+       was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
-       for (i = 0; i < num_pages; i++) {
-               struct page *page = extent_buffer_page(eb, i);
-               /* writepage may need to do something special for the
-                * first page, we have to make sure page->private is
-                * properly set.  releasepage may drop page->private
-                * on us if the page isn't already dirty.
-                */
-               lock_page(page);
-               if (i == 0) {
-                       set_page_extent_head(page, eb->len);
-               } else if (PagePrivate(page) &&
-                          page->private != EXTENT_PAGE_PRIVATE) {
-                       set_page_extent_mapped(page);
-               }
+       for (i = 0; i < num_pages; i++)
                __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-               set_extent_dirty(tree, page_offset(page),
-                                page_offset(page) + PAGE_CACHE_SIZE - 1,
-                                GFP_NOFS);
-               unlock_page(page);
-       }
-       return 0;
+       return was_dirty;
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
                ret = 0;
                goto out;
        }
+       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+               ret = 0;
+               goto out;
+       }
        /* at this point we can safely release the extent buffer */
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++)
index 1f9df88afbf6922e9f4af87972f05521ed15e862..5bc20abf3f3d340b22248802e4c8d7d72f4dc4bb 100644 (file)
@@ -25,6 +25,7 @@
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
 #define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
 
 /*
  * page->private values.  Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                              struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
                             struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+                            struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
                               struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
index 964652435fd1a5f40e061cfe1c3a7dd1d1198248..9b99886562d0b2c8ea9e883f3f2623b0ec6d8674 100644 (file)
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
        file_key.offset = pos;
        btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, root, path, &file_key,
                                      sizeof(*item));
        if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
                key.offset = end_byte - 1;
                key.type = BTRFS_EXTENT_CSUM_KEY;
 
+               path->leave_spinning = 1;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0) {
                        if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
        } else {
                ins_size = csum_size;
        }
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, root, path, &file_key,
                                      ins_size);
+       path->leave_spinning = 0;
        if (ret < 0)
                goto fail_unlock;
        if (ret != 0) {
@@ -776,7 +780,6 @@ found:
        item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
                                      btrfs_item_size_nr(leaf, path->slots[0]));
        eb_token = NULL;
-       cond_resched();
 next_sector:
 
        if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
                eb_token = NULL;
        }
        btrfs_mark_buffer_dirty(path->nodes[0]);
-       cond_resched();
        if (total_bytes < sums->len) {
                btrfs_release_path(root, path);
+               cond_resched();
                goto again;
        }
 out:
index dc78954861b333d75e831e27c6bbe7760f6e1de9..9c9fb46ccd08f9fe7eaf9d22991738f9231810c2 100644 (file)
@@ -606,6 +606,7 @@ next_slot:
                        btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
 
                        btrfs_release_path(root, path);
+                       path->leave_spinning = 1;
                        ret = btrfs_insert_empty_item(trans, root, path, &ins,
                                                      sizeof(*extent));
                        BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
                                                        ram_bytes);
                        btrfs_set_file_extent_type(leaf, extent, found_type);
 
+                       btrfs_unlock_up_safe(path, 1);
                        btrfs_mark_buffer_dirty(path->nodes[0]);
+                       btrfs_set_lock_blocking(path->nodes[0]);
 
                        if (disk_bytenr != 0) {
                                ret = btrfs_update_extent_ref(trans, root,
-                                               disk_bytenr, orig_parent,
+                                               disk_bytenr,
+                                               le64_to_cpu(old.disk_num_bytes),
+                                               orig_parent,
                                                leaf->start,
                                                root->root_key.objectid,
                                                trans->transid, ins.objectid);
 
                                BUG_ON(ret);
                        }
+                       path->leave_spinning = 0;
                        btrfs_release_path(root, path);
                        if (disk_bytenr != 0)
                                inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
        btrfs_set_file_extent_other_encoding(leaf, fi, 0);
 
        if (orig_parent != leaf->start) {
-               ret = btrfs_update_extent_ref(trans, root, bytenr,
+               ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
                                              orig_parent, leaf->start,
                                              root->root_key.objectid,
                                              trans->transid, inode->i_ino);
@@ -1155,6 +1161,20 @@ out_nolock:
                page_cache_release(pinned[1]);
        *ppos = pos;
 
+       /*
+        * we want to make sure fsync finds this change
+        * but we haven't joined a transaction running right now.
+        *
+        * Later on, someone is sure to update the inode and get the
+        * real transid recorded.
+        *
+        * We set last_trans now to the fs_info generation + 1,
+        * this will either be one more than the running transaction
+        * or the generation used for the next transaction if there isn't
+        * one running right now.
+        */
+       BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
        if (num_written > 0 && will_write) {
                struct btrfs_trans_handle *trans;
 
@@ -1167,8 +1187,11 @@ out_nolock:
                        ret = btrfs_log_dentry_safe(trans, root,
                                                    file->f_dentry);
                        if (ret == 0) {
-                               btrfs_sync_log(trans, root);
-                               btrfs_end_transaction(trans, root);
+                               ret = btrfs_sync_log(trans, root);
+                               if (ret == 0)
+                                       btrfs_end_transaction(trans, root);
+                               else
+                                       btrfs_commit_transaction(trans, root);
                        } else {
                                btrfs_commit_transaction(trans, root);
                        }
@@ -1185,6 +1208,18 @@ out_nolock:
 
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+       /*
+        * ordered_data_close is set by settattr when we are about to truncate
+        * a file from a non-zero size to a zero size.  This tries to
+        * flush down new bytes that may have been written if the
+        * application were using truncate to replace a file in place.
+        */
+       if (BTRFS_I(inode)->ordered_data_close) {
+               BTRFS_I(inode)->ordered_data_close = 0;
+               btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+               if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+                       filemap_flush(inode->i_mapping);
+       }
        if (filp->private_data)
                btrfs_ioctl_trans_end(filp);
        return 0;
@@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
        if (ret > 0) {
                ret = btrfs_commit_transaction(trans, root);
        } else {
-               btrfs_sync_log(trans, root);
-               ret = btrfs_end_transaction(trans, root);
+               ret = btrfs_sync_log(trans, root);
+               if (ret == 0)
+                       ret = btrfs_end_transaction(trans, root);
+               else
+                       ret = btrfs_commit_transaction(trans, root);
        }
        mutex_lock(&dentry->d_inode->i_mutex);
 out:
index 3d46fa1f29a4d75be94f116e148a0f50321973f6..6b627c6118081ccea87d1acaf92000f995dda946 100644 (file)
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
+       path->leave_spinning = 1;
+
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      ins_len);
        if (ret == -EEXIST) {
index 17e608c4dc702466900a99c1ec49dc993da2592b..06d8db5afb08ba8800dc776a64f421c1e9322780 100644 (file)
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
+       path->leave_spinning = 1;
        btrfs_set_trans_block_group(trans, inode);
 
        key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_CACHE_SIZE);
 
-                       kaddr = kmap(cpage);
+                       kaddr = kmap_atomic(cpage, KM_USER0);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
-                       kunmap(cpage);
+                       kunmap_atomic(kaddr, KM_USER0);
 
                        i++;
                        ptr += cur_size;
@@ -204,7 +205,7 @@ fail:
  * does the checks required to make sure the data is small enough
  * to fit as an inline extent.
  */
-static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode, u64 start, u64 end,
                                 size_t compressed_size,
@@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
        u64 cur_end;
        int limit = 10 * 1024 * 1042;
 
-       if (!btrfs_test_opt(root, COMPRESS)) {
-               return cow_file_range(inode, locked_page, start, end,
-                                     page_started, nr_written, 1);
-       }
-
        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
                         EXTENT_DELALLOC, 1, 0, GFP_NOFS);
        while (start < end) {
@@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+static noinline int run_delalloc_nocow(struct inode *inode,
+                                      struct page *locked_page,
                              u64 start, u64 end, int *page_started, int force,
                              unsigned long *nr_written)
 {
@@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                              unsigned long *nr_written)
 {
        int ret;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
 
        if (btrfs_test_flag(inode, NODATACOW))
                ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
        else if (btrfs_test_flag(inode, PREALLOC))
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 0, nr_written);
+       else if (!btrfs_test_opt(root, COMPRESS))
+               ret = cow_file_range(inode, locked_page, start, end,
+                                     page_started, nr_written, 1);
        else
                ret = cow_file_range_async(inode, locked_page, start, end,
                                           page_started, nr_written);
-
        return ret;
 }
 
@@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
+       path->leave_spinning = 1;
        ret = btrfs_drop_extents(trans, root, inode, file_pos,
                                 file_pos + num_bytes, file_pos, &hint);
        BUG_ON(ret);
@@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_set_file_extent_compression(leaf, fi, compression);
        btrfs_set_file_extent_encryption(leaf, fi, encryption);
        btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+       btrfs_unlock_up_safe(path, 1);
+       btrfs_set_lock_blocking(leaf);
+
        btrfs_mark_buffer_dirty(leaf);
 
        inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
                                          root->root_key.objectid,
                                          trans->transid, inode->i_ino, &ins);
        BUG_ON(ret);
-
        btrfs_free_path(path);
+
        return 0;
 }
 
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction.  It limits the amount of btree
+ * reads required while inside the transaction.
+ */
+static noinline void reada_csum(struct btrfs_root *root,
+                               struct btrfs_path *path,
+                               struct btrfs_ordered_extent *ordered_extent)
+{
+       struct btrfs_ordered_sum *sum;
+       u64 bytenr;
+
+       sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+                        list);
+       bytenr = sum->sums[0].bytenr;
+
+       /*
+        * we don't care about the results, the point of this search is
+        * just to get the btree leaves into ram
+        */
+       btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+}
+
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
-       struct btrfs_ordered_extent *ordered_extent;
+       struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       struct btrfs_path *path;
        int compressed = 0;
        int ret;
 
@@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        if (!ret)
                return 0;
 
+       /*
+        * before we join the transaction, try to do some of our IO.
+        * This will limit the amount of IO that we have to do with
+        * the transaction running.  We're unlikely to need to do any
+        * IO if the file extents are new, the disk_i_size checks
+        * covers the most common case.
+        */
+       if (start < BTRFS_I(inode)->disk_i_size) {
+               path = btrfs_alloc_path();
+               if (path) {
+                       ret = btrfs_lookup_file_extent(NULL, root, path,
+                                                      inode->i_ino,
+                                                      start, 0);
+                       ordered_extent = btrfs_lookup_ordered_extent(inode,
+                                                                    start);
+                       if (!list_empty(&ordered_extent->list)) {
+                               btrfs_release_path(root, path);
+                               reada_csum(root, path, ordered_extent);
+                       }
+                       btrfs_free_path(path);
+               }
+       }
+
        trans = btrfs_join_transaction(root, 1);
 
-       ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+       if (!ordered_extent)
+               ordered_extent = btrfs_lookup_ordered_extent(inode, start);
        BUG_ON(!ordered_extent);
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
                goto nocow;
@@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
        path = btrfs_alloc_path();
        BUG_ON(!path);
+       path->leave_spinning = 1;
        ret = btrfs_lookup_inode(trans, root, path,
                                 &BTRFS_I(inode)->location, 1);
        if (ret) {
@@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                goto err;
        }
 
+       path->leave_spinning = 1;
        di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
                                    name, name_len, -1);
        if (IS_ERR(di)) {
@@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
                                         inode, dir->i_ino);
        BUG_ON(ret != 0 && ret != -ENOENT);
-       if (ret != -ENOENT)
-               BTRFS_I(dir)->log_dirty_trans = trans->transid;
 
        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
                                           dir, index);
@@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        trans = btrfs_start_transaction(root, 1);
 
        btrfs_set_trans_block_group(trans, dir);
+
+       btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+
        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
                                 dentry->d_name.name, dentry->d_name.len);
 
@@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        key.type = (u8)-1;
 
 search_again:
+       path->leave_spinning = 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto error;
@@ -2644,6 +2702,7 @@ delete:
                        break;
                }
                if (found_extent) {
+                       btrfs_set_path_blocking(path);
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes,
                                                leaf->start, root_owner,
@@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
        if (err)
                return err;
 
-       if (S_ISREG(inode->i_mode) &&
-           attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
-               err = btrfs_cont_expand(inode, attr->ia_size);
-               if (err)
-                       return err;
+       if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+               if (attr->ia_size > inode->i_size) {
+                       err = btrfs_cont_expand(inode, attr->ia_size);
+                       if (err)
+                               return err;
+               } else if (inode->i_size > 0 &&
+                          attr->ia_size == 0) {
+
+                       /* we're truncating a file that used to have good
+                        * data down to zero.  Make sure it gets into
+                        * the ordered flush list so that any new writes
+                        * get down to disk quickly.
+                        */
+                       BTRFS_I(inode)->ordered_data_close = 1;
+               }
        }
 
        err = inode_setattr(inode, attr);
@@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode)
        bi->disk_i_size = 0;
        bi->flags = 0;
        bi->index_cnt = (u64)-1;
-       bi->log_dirty_trans = 0;
+       bi->last_unlink_trans = 0;
        extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
        extent_io_tree_init(&BTRFS_I(inode)->io_tree,
                             inode->i_mapping, GFP_NOFS);
        extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                             inode->i_mapping, GFP_NOFS);
        INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+       INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
        btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
        mutex_init(&BTRFS_I(inode)->extent_mutex);
        mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3449,6 +3519,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        sizes[0] = sizeof(struct btrfs_inode_item);
        sizes[1] = name_len + sizeof(*ref);
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
        if (ret != 0)
                goto fail;
@@ -3727,6 +3798,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                drop_inode = 1;
 
        nr = trans->blocks_used;
+
+       btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
        btrfs_end_transaction_throttle(trans, root);
 fail:
        if (drop_inode) {
@@ -4363,6 +4436,8 @@ again:
        }
        ClearPageChecked(page);
        set_page_dirty(page);
+
+       BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
@@ -4388,6 +4463,27 @@ static void btrfs_truncate(struct inode *inode)
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 
        trans = btrfs_start_transaction(root, 1);
+
+       /*
+        * setattr is responsible for setting the ordered_data_close flag,
+        * but that is only tested during the last file release.  That
+        * could happen well after the next commit, leaving a great big
+        * window where new writes may get lost if someone chooses to write
+        * to this file after truncating to zero
+        *
+        * The inode doesn't have any dirty data here, and so if we commit
+        * this is a noop.  If someone immediately starts writing to the inode
+        * it is very likely we'll catch some of their writes in this
+        * transaction, and the commit will find this file on the ordered
+        * data list with good things to send down.
+        *
+        * This is a best effort solution, there is still a window where
+        * using truncate to replace the contents of the file will
+        * end up with a zero length file after a crash.
+        */
+       if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+               btrfs_add_ordered_operation(trans, root, inode);
+
        btrfs_set_trans_block_group(trans, inode);
        btrfs_i_size_write(inode, inode->i_size);
 
@@ -4464,12 +4560,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->i_acl = BTRFS_ACL_NOT_CACHED;
        ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
        INIT_LIST_HEAD(&ei->i_orphan);
+       INIT_LIST_HEAD(&ei->ordered_operations);
        return &ei->vfs_inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
 {
        struct btrfs_ordered_extent *ordered;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
 
@@ -4480,13 +4579,24 @@ void btrfs_destroy_inode(struct inode *inode)
            BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
                posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
-       spin_lock(&BTRFS_I(inode)->root->list_lock);
+       /*
+        * Make sure we're properly removed from the ordered operation
+        * lists.
+        */
+       smp_mb();
+       if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+               spin_lock(&root->fs_info->ordered_extent_lock);
+               list_del_init(&BTRFS_I(inode)->ordered_operations);
+               spin_unlock(&root->fs_info->ordered_extent_lock);
+       }
+
+       spin_lock(&root->list_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
                       " list\n", inode->i_ino);
                dump_stack();
        }
-       spin_unlock(&BTRFS_I(inode)->root->list_lock);
+       spin_unlock(&root->list_lock);
 
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4611,8 +4721,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (ret)
                goto out_unlock;
 
+       /*
+        * we're using rename to replace one file with another.
+        * and the replacement file is large.  Start IO on it now so
+        * we don't add too much work to the end of the transaction
+        */
+       if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+           new_inode->i_size &&
+           old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+               filemap_flush(old_inode->i_mapping);
+
        trans = btrfs_start_transaction(root, 1);
 
+       /*
+        * make sure the inode gets flushed if it is replacing
+        * something.
+        */
+       if (new_inode && new_inode->i_size &&
+           old_inode && S_ISREG(old_inode->i_mode)) {
+               btrfs_add_ordered_operation(trans, root, old_inode);
+       }
+
+       /*
+        * this is an ugly little race, but the rename is required to make
+        * sure that if we crash, the inode is either at the old name
+        * or the new one.  pinning the log transaction lets us make sure
+        * we don't allow a log commit to come in after we unlink the
+        * name but before we add the new name back in.
+        */
+       btrfs_pin_log_trans(root);
+
        btrfs_set_trans_block_group(trans, new_dir);
 
        btrfs_inc_nlink(old_dentry->d_inode);
@@ -4620,6 +4758,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        new_dir->i_ctime = new_dir->i_mtime = ctime;
        old_inode->i_ctime = ctime;
 
+       if (old_dentry->d_parent != new_dentry->d_parent)
+               btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
        ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
                                 old_dentry->d_name.name,
                                 old_dentry->d_name.len);
@@ -4651,7 +4792,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (ret)
                goto out_fail;
 
+       btrfs_log_new_name(trans, old_inode, old_dir,
+                                      new_dentry->d_parent);
 out_fail:
+
+       /* this btrfs_end_log_trans just allows the current
+        * log-sub transaction to complete
+        */
+       btrfs_end_log_trans(root);
        btrfs_end_transaction_throttle(trans, root);
 out_unlock:
        return ret;
index 47b0a88c12a23a6d983eff4c3ff0e4bdf1347d2e..a5310c0f41e24f089bfd78e3a2f0939d3957f717 100644 (file)
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
 static int btrfs_spin_on_block(struct extent_buffer *eb)
 {
        int i;
+
        for (i = 0; i < 512; i++) {
-               cpu_relax();
                if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
                        return 1;
                if (need_resched())
                        break;
+               cpu_relax();
        }
        return 0;
 }
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
 {
        int i;
 
-       spin_nested(eb);
-       if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-               return 1;
-       spin_unlock(&eb->lock);
-
+       if (btrfs_spin_on_block(eb)) {
+               spin_nested(eb);
+               if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                       return 1;
+               spin_unlock(&eb->lock);
+       }
        /* spin for a bit on the BLOCKING flag */
        for (i = 0; i < 2; i++) {
+               cpu_relax();
                if (!btrfs_spin_on_block(eb))
                        break;
 
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
        DEFINE_WAIT(wait);
        wait.func = btrfs_wake_function;
 
+       if (!btrfs_spin_on_block(eb))
+               goto sleep;
+
        while(1) {
                spin_nested(eb);
 
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
                 * spin for a bit, and if the blocking flag goes away,
                 * loop around
                 */
+               cpu_relax();
                if (btrfs_spin_on_block(eb))
                        continue;
-
+sleep:
                prepare_to_wait_exclusive(&eb->lock_wq, &wait,
                                          TASK_UNINTERRUPTIBLE);
 
index 77c2411a5f0f0c59b6ab97e78b2828d94a1c7401..53c87b197d701671afde64fd7f8f00b59fdb8cd2 100644 (file)
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 
        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
+
+       /*
+        * we have no more ordered extents for this inode and
+        * no dirty pages.  We can safely remove it from the
+        * list of ordered extents
+        */
+       if (RB_EMPTY_ROOT(&tree->tree) &&
+           !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+               list_del_init(&BTRFS_I(inode)->ordered_operations);
+       }
        spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 
        mutex_unlock(&tree->mutex);
@@ -369,6 +379,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
        return 0;
 }
 
+/*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list.  These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io.  When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+       struct btrfs_inode *btrfs_inode;
+       struct inode *inode;
+       struct list_head splice;
+
+       INIT_LIST_HEAD(&splice);
+
+       mutex_lock(&root->fs_info->ordered_operations_mutex);
+       spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+       list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+       while (!list_empty(&splice)) {
+               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                  ordered_operations);
+
+               inode = &btrfs_inode->vfs_inode;
+
+               list_del_init(&btrfs_inode->ordered_operations);
+
+               /*
+                * the inode may be getting freed (in sys_unlink path).
+                */
+               inode = igrab(inode);
+
+               if (!wait && inode) {
+                       list_add_tail(&BTRFS_I(inode)->ordered_operations,
+                             &root->fs_info->ordered_operations);
+               }
+               spin_unlock(&root->fs_info->ordered_extent_lock);
+
+               if (inode) {
+                       if (wait)
+                               btrfs_wait_ordered_range(inode, 0, (u64)-1);
+                       else
+                               filemap_flush(inode->i_mapping);
+                       iput(inode);
+               }
+
+               cond_resched();
+               spin_lock(&root->fs_info->ordered_extent_lock);
+       }
+       if (wait && !list_empty(&root->fs_info->ordered_operations))
+               goto again;
+
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+       mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+       return 0;
+}
+
 /*
  * Used to start IO or wait for a given ordered extent to finish.
  *
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 
        return ret;
 }
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *inode)
+{
+       u64 last_mod;
+
+       last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+       /*
+        * if this file hasn't been changed since the last transaction
+        * commit, we can safely return without doing anything
+        */
+       if (last_mod < root->fs_info->last_trans_committed)
+               return 0;
+
+       /*
+        * the transaction is already committing.  Just start the IO and
+        * don't bother with all of this list nonsense
+        */
+       if (trans && root->fs_info->running_transaction->blocked) {
+               btrfs_wait_ordered_range(inode, 0, (u64)-1);
+               return 0;
+       }
+
+       spin_lock(&root->fs_info->ordered_extent_lock);
+       if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+               list_add_tail(&BTRFS_I(inode)->ordered_operations,
+                             &root->fs_info->ordered_operations);
+       }
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+
+       return 0;
+}
index ab66d5e8d6d697eac5110401d5e37b0f2afa8f9e..3d31c8827b013407d6f4b14796896a6aac8ba53b 100644 (file)
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
                           loff_t end, int sync_mode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *inode);
 #endif
index 4112d53d4f4dad195636c6ffa93deaded831af6e..664782c6a2dfbf6e65628f09fe2ff8552335f016 100644 (file)
@@ -65,6 +65,15 @@ static noinline int join_transaction(struct btrfs_root *root)
                cur_trans->use_count = 1;
                cur_trans->commit_done = 0;
                cur_trans->start_time = get_seconds();
+
+               cur_trans->delayed_refs.root.rb_node = NULL;
+               cur_trans->delayed_refs.num_entries = 0;
+               cur_trans->delayed_refs.num_heads_ready = 0;
+               cur_trans->delayed_refs.num_heads = 0;
+               cur_trans->delayed_refs.flushing = 0;
+               cur_trans->delayed_refs.run_delayed_start = 0;
+               spin_lock_init(&cur_trans->delayed_refs.lock);
+
                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
                extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +191,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
        h->block_group = 0;
        h->alloc_exclude_nr = 0;
        h->alloc_exclude_start = 0;
+       h->delayed_ref_updates = 0;
+
        root->fs_info->running_transaction->use_count++;
        mutex_unlock(&root->fs_info->trans_mutex);
        return h;
@@ -271,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root)
        if (!root->fs_info->open_ioctl_trans)
                wait_current_trans(root);
        mutex_unlock(&root->fs_info->trans_mutex);
-
        throttle_on_drops(root);
 }
 
@@ -280,6 +290,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *info = root->fs_info;
+       int count = 0;
+
+       while (count < 4) {
+               unsigned long cur = trans->delayed_ref_updates;
+               trans->delayed_ref_updates = 0;
+               if (cur &&
+                   trans->transaction->delayed_refs.num_heads_ready > 64) {
+                       trans->delayed_ref_updates = 0;
+
+                       /*
+                        * do a full flush if the transaction is trying
+                        * to close
+                        */
+                       if (trans->transaction->delayed_refs.flushing)
+                               cur = 0;
+                       btrfs_run_delayed_refs(trans, root, cur);
+               } else {
+                       break;
+               }
+               count++;
+       }
 
        mutex_lock(&info->trans_mutex);
        cur_trans = info->running_transaction;
@@ -424,9 +455,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
        u64 old_root_bytenr;
        struct btrfs_root *tree_root = root->fs_info->tree_root;
 
-       btrfs_extent_post_op(trans, root);
        btrfs_write_dirty_block_groups(trans, root);
-       btrfs_extent_post_op(trans, root);
+
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       BUG_ON(ret);
 
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +470,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                                     btrfs_header_level(root->node));
                btrfs_set_root_generation(&root->root_item, trans->transid);
 
-               btrfs_extent_post_op(trans, root);
-
                ret = btrfs_update_root(trans, tree_root,
                                        &root->root_key,
                                        &root->root_item);
                BUG_ON(ret);
                btrfs_write_dirty_block_groups(trans, root);
-               btrfs_extent_post_op(trans, root);
+
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               BUG_ON(ret);
        }
        return 0;
 }
@@ -459,15 +491,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct list_head *next;
        struct extent_buffer *eb;
+       int ret;
 
-       btrfs_extent_post_op(trans, fs_info->tree_root);
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       BUG_ON(ret);
 
        eb = btrfs_lock_root_node(fs_info->tree_root);
-       btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+       btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);
 
-       btrfs_extent_post_op(trans, fs_info->tree_root);
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       BUG_ON(ret);
 
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +510,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
                root = list_entry(next, struct btrfs_root, dirty_list);
 
                update_cowonly_root(trans, root);
+
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               BUG_ON(ret);
        }
        return 0;
 }
@@ -634,6 +672,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
        return 0;
 }
 
+/*
+ * when dropping snapshots, we generate a ton of delayed refs, and it makes
+ * sense not to join the transaction while it is trying to flush the current
+ * queue of delayed refs out.
+ *
+ * This is used by the drop snapshot code only
+ */
+static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
+{
+       DEFINE_WAIT(wait);
+
+       mutex_lock(&info->trans_mutex);
+       while (info->running_transaction &&
+              info->running_transaction->delayed_refs.flushing) {
+               prepare_to_wait(&info->transaction_wait, &wait,
+                               TASK_UNINTERRUPTIBLE);
+               mutex_unlock(&info->trans_mutex);
+               schedule();
+               mutex_lock(&info->trans_mutex);
+               finish_wait(&info->transaction_wait, &wait);
+       }
+       mutex_unlock(&info->trans_mutex);
+       return 0;
+}
+
 /*
  * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
  * all of them
@@ -661,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                atomic_inc(&root->fs_info->throttles);
 
                while (1) {
+                       /*
+                        * we don't want to jump in and create a bunch of
+                        * delayed refs if the transaction is starting to close
+                        */
+                       wait_transaction_pre_flush(tree_root->fs_info);
                        trans = btrfs_start_transaction(tree_root, 1);
+
+                       /*
+                        * we've joined a transaction, make sure it isn't
+                        * closing right now
+                        */
+                       if (trans->transaction->delayed_refs.flushing) {
+                               btrfs_end_transaction(trans, tree_root);
+                               continue;
+                       }
+
                        mutex_lock(&root->fs_info->drop_mutex);
                        ret = btrfs_drop_snapshot(trans, dirty->root);
                        if (ret != -EAGAIN)
@@ -766,7 +844,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
        old = btrfs_lock_root_node(root);
-       btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+       btrfs_cow_block(trans, root, old, NULL, 0, &old);
 
        btrfs_copy_root(trans, root, old, &tmp, objectid);
        btrfs_tree_unlock(old);
@@ -894,12 +972,31 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        struct extent_io_tree *pinned_copy;
        DEFINE_WAIT(wait);
        int ret;
+       int should_grow = 0;
+       unsigned long now = get_seconds();
+
+       btrfs_run_ordered_operations(root, 0);
+
+       /* make a pass through all the delayed refs we have so far
+        * any runnings procs may add more while we are here
+        */
+       ret = btrfs_run_delayed_refs(trans, root, 0);
+       BUG_ON(ret);
+
+       cur_trans = trans->transaction;
+       /*
+        * set the flushing flag so procs in this transaction have to
+        * start sending their work down.
+        */
+       cur_trans->delayed_refs.flushing = 1;
+
+       ret = btrfs_run_delayed_refs(trans, root, 0);
+       BUG_ON(ret);
 
-       INIT_LIST_HEAD(&dirty_fs_roots);
        mutex_lock(&root->fs_info->trans_mutex);
-       if (trans->transaction->in_commit) {
-               cur_trans = trans->transaction;
-               trans->transaction->use_count++;
+       INIT_LIST_HEAD(&dirty_fs_roots);
+       if (cur_trans->in_commit) {
+               cur_trans->use_count++;
                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);
 
@@ -922,7 +1019,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
-       cur_trans = trans->transaction;
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
@@ -937,6 +1033,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                }
        }
 
+       if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
+               should_grow = 1;
+
        do {
                int snap_pending = 0;
                joined = cur_trans->num_joined;
@@ -949,7 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
                if (cur_trans->num_writers > 1)
                        timeout = MAX_SCHEDULE_TIMEOUT;
-               else
+               else if (should_grow)
                        timeout = 1;
 
                mutex_unlock(&root->fs_info->trans_mutex);
@@ -959,16 +1058,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        BUG_ON(ret);
                }
 
-               schedule_timeout(timeout);
+               /*
+                * rename don't use btrfs_join_transaction, so, once we
+                * set the transaction to blocked above, we aren't going
+                * to get any new ordered operations.  We can safely run
+                * it here and no for sure that nothing new will be added
+                * to the list
+                */
+               btrfs_run_ordered_operations(root, 1);
+
+               smp_mb();
+               if (cur_trans->num_writers > 1 || should_grow)
+                       schedule_timeout(timeout);
 
                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
        } while (cur_trans->num_writers > 1 ||
-                (cur_trans->num_joined != joined));
+                (should_grow && cur_trans->num_joined != joined));
 
        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);
 
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       BUG_ON(ret);
+
        WARN_ON(cur_trans != trans->transaction);
 
        /* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1145,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_copy_pinned(root, pinned_copy);
 
        trans->transaction->blocked = 0;
+
        wake_up(&root->fs_info->transaction_throttle);
        wake_up(&root->fs_info->transaction_wait);
 
@@ -1058,6 +1172,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        mutex_lock(&root->fs_info->trans_mutex);
 
        cur_trans->commit_done = 1;
+
        root->fs_info->last_trans_committed = cur_trans->transid;
        wake_up(&cur_trans->commit_wait);
 
index ea292117f88252f2f6c14ccde2cce4eac50d9509..94f5bde2b58d40144a5d5ddf3a11607cef4b0e82 100644 (file)
 #ifndef __BTRFS_TRANSACTION__
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
+#include "delayed-ref.h"
 
 struct btrfs_transaction {
        u64 transid;
+       /*
+        * total writers in this transaction, it must be zero before the
+        * transaction can end
+        */
        unsigned long num_writers;
+
        unsigned long num_joined;
        int in_commit;
        int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
        struct list_head pending_snapshots;
+       struct btrfs_delayed_ref_root delayed_refs;
 };
 
 struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
        u64 block_group;
        u64 alloc_exclude_start;
        u64 alloc_exclude_nr;
+       unsigned long delayed_ref_updates;
 };
 
 struct btrfs_pending_snapshot {
index 98d25fa4570ea902abc4c8345ecf29f62af9013c..b10eacdb16200e686b3519a46747b706b927761f 100644 (file)
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
        }
 
        btrfs_release_path(root, path);
-       if (is_extent)
-               btrfs_extent_post_op(trans, root);
 out:
        if (path)
                btrfs_free_path(path);
index 9c462fbd60fac14ad17f192afb08c37794295d36..fc9b87a7975bd38f76d7e22c84cfe010e3aafa42 100644 (file)
 #define LOG_INODE_ALL 0
 #define LOG_INODE_EXISTS 1
 
+/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2).  After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ *  2a is actually the more important variant.  With the extra logging
+ *  a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir.  After a crash the rm -rf must
+ * be replayed.  This must be able to recurse down the entire
+ * directory tree.  The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
 /*
  * stages for the tree walking.  The first
  * stage (0) is to only pin down the blocks we find
 #define LOG_WALK_REPLAY_INODES 1
 #define LOG_WALK_REPLAY_ALL 2
 
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, struct inode *inode,
                             int inode_only);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_root *log,
+                                      struct btrfs_path *path,
+                                      u64 dirid, int del_all);
 
 /*
  * tree logging is a special write ahead log used to make sure that
@@ -132,11 +180,26 @@ static int join_running_log_trans(struct btrfs_root *root)
        return ret;
 }
 
+/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+       int ret = -ENOENT;
+
+       mutex_lock(&root->log_mutex);
+       atomic_inc(&root->log_writers);
+       mutex_unlock(&root->log_mutex);
+       return ret;
+}
+
 /*
  * indicate we're done making changes to the log tree
  * and wake up anyone waiting to do a sync
  */
-static int end_log_trans(struct btrfs_root *root)
+int btrfs_end_log_trans(struct btrfs_root *root)
 {
        if (atomic_dec_and_test(&root->log_writers)) {
                smp_mb();
@@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log,
                mutex_lock(&log->fs_info->pinned_mutex);
                btrfs_update_pinned_extents(log->fs_info->extent_root,
                                            eb->start, eb->len, 1);
-               mutex_unlock(&log->fs_info->pinned_mutex);
        }
 
        if (btrfs_buffer_uptodate(eb, gen)) {
@@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 
        ret = link_to_fixup_dir(trans, root, path, location.objectid);
        BUG_ON(ret);
+
        ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
        BUG_ON(ret);
        kfree(name);
@@ -804,6 +867,7 @@ conflict_again:
                                            victim_name_len)) {
                                btrfs_inc_nlink(inode);
                                btrfs_release_path(root, path);
+
                                ret = btrfs_unlink_inode(trans, root, dir,
                                                         inode, victim_name,
                                                         victim_name_len);
@@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
                key.offset--;
                btrfs_release_path(root, path);
        }
-       btrfs_free_path(path);
+       btrfs_release_path(root, path);
        if (nlink != inode->i_nlink) {
                inode->i_nlink = nlink;
                btrfs_update_inode(trans, root, inode);
        }
        BTRFS_I(inode)->index_cnt = (u64)-1;
 
+       if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+               ret = replay_dir_deletes(trans, root, NULL, path,
+                                        inode->i_ino, 1);
+               BUG_ON(ret);
+       }
+       btrfs_free_path(path);
+
        return 0;
 }
 
@@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
 
                iput(inode);
 
-               if (key.offset == 0)
-                       break;
-               key.offset--;
+               /*
+                * fixup on a directory may create new entries,
+                * make sure we always look for the highset possible
+                * offset
+                */
+               key.offset = (u64)-1;
        }
        btrfs_release_path(root, path);
        return 0;
@@ -1313,11 +1387,11 @@ again:
                read_extent_buffer(eb, name, (unsigned long)(di + 1),
                                  name_len);
                log_di = NULL;
-               if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+               if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
                        log_di = btrfs_lookup_dir_item(trans, log, log_path,
                                                       dir_key->objectid,
                                                       name, name_len, 0);
-               } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+               } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
                        log_di = btrfs_lookup_dir_index_item(trans, log,
                                                     log_path,
                                                     dir_key->objectid,
@@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct btrfs_root *log,
                                       struct btrfs_path *path,
-                                      u64 dirid)
+                                      u64 dirid, int del_all)
 {
        u64 range_start;
        u64 range_end;
@@ -1408,10 +1482,14 @@ again:
        range_start = 0;
        range_end = 0;
        while (1) {
-               ret = find_dir_range(log, path, dirid, key_type,
-                                    &range_start, &range_end);
-               if (ret != 0)
-                       break;
+               if (del_all)
+                       range_end = (u64)-1;
+               else {
+                       ret = find_dir_range(log, path, dirid, key_type,
+                                            &range_start, &range_end);
+                       if (ret != 0)
+                               break;
+               }
 
                dir_key.offset = range_start;
                while (1) {
@@ -1437,7 +1515,8 @@ again:
                                break;
 
                        ret = check_item_in_log(trans, root, log, path,
-                                               log_path, dir, &found_key);
+                                               log_path, dir,
+                                               &found_key);
                        BUG_ON(ret);
                        if (found_key.offset == (u64)-1)
                                break;
@@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                        mode = btrfs_inode_mode(eb, inode_item);
                        if (S_ISDIR(mode)) {
                                ret = replay_dir_deletes(wc->trans,
-                                        root, log, path, key.objectid);
+                                        root, log, path, key.objectid, 0);
                                BUG_ON(ret);
                        }
                        ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                                        root, inode, inode->i_size,
                                        BTRFS_EXTENT_DATA_KEY);
                                BUG_ON(ret);
+
+                               /* if the nlink count is zero here, the iput
+                                * will free the inode.  We bump it to make
+                                * sure it doesn't get freed until the link
+                                * count fixup is done
+                                */
+                               if (inode->i_nlink == 0) {
+                                       btrfs_inc_nlink(inode);
+                                       btrfs_update_inode(wc->trans,
+                                                          root, inode);
+                               }
                                iput(inode);
                        }
                        ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+static int wait_log_commit(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, unsigned long transid)
 {
        DEFINE_WAIT(wait);
        int index = transid % 2;
@@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
                prepare_to_wait(&root->log_commit_wait[index],
                                &wait, TASK_UNINTERRUPTIBLE);
                mutex_unlock(&root->log_mutex);
-               if (root->log_transid < transid + 2 &&
+
+               if (root->fs_info->last_trans_log_full_commit !=
+                   trans->transid && root->log_transid < transid + 2 &&
                    atomic_read(&root->log_commit[index]))
                        schedule();
+
                finish_wait(&root->log_commit_wait[index], &wait);
                mutex_lock(&root->log_mutex);
        } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
        return 0;
 }
 
-static int wait_for_writer(struct btrfs_root *root)
+static int wait_for_writer(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root)
 {
        DEFINE_WAIT(wait);
        while (atomic_read(&root->log_writers)) {
                prepare_to_wait(&root->log_writer_wait,
                                &wait, TASK_UNINTERRUPTIBLE);
                mutex_unlock(&root->log_mutex);
-               if (atomic_read(&root->log_writers))
+               if (root->fs_info->last_trans_log_full_commit !=
+                   trans->transid && atomic_read(&root->log_writers))
                        schedule();
                mutex_lock(&root->log_mutex);
                finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
 /*
  * btrfs_sync_log does sends a given tree log down to the disk and
  * updates the super blocks to record it.  When this call is done,
- * you know that any inodes previously logged are safely on disk
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
  */
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
                   struct btrfs_root *root)
@@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        mutex_lock(&root->log_mutex);
        index1 = root->log_transid % 2;
        if (atomic_read(&root->log_commit[index1])) {
-               wait_log_commit(root, root->log_transid);
+               wait_log_commit(trans, root, root->log_transid);
                mutex_unlock(&root->log_mutex);
                return 0;
        }
@@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        /* wait for previous tree log sync to complete */
        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-               wait_log_commit(root, root->log_transid - 1);
+               wait_log_commit(trans, root, root->log_transid - 1);
 
        while (1) {
                unsigned long batch = root->log_batch;
                mutex_unlock(&root->log_mutex);
                schedule_timeout_uninterruptible(1);
                mutex_lock(&root->log_mutex);
-               wait_for_writer(root);
+
+               wait_for_writer(trans, root);
                if (batch == root->log_batch)
                        break;
        }
 
+       /* bail out if we need to do a full commit */
+       if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+               ret = -EAGAIN;
+               mutex_unlock(&root->log_mutex);
+               goto out;
+       }
+
        ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
        BUG_ON(ret);
 
@@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
        index2 = log_root_tree->log_transid % 2;
        if (atomic_read(&log_root_tree->log_commit[index2])) {
-               wait_log_commit(log_root_tree, log_root_tree->log_transid);
+               wait_log_commit(trans, log_root_tree,
+                               log_root_tree->log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                goto out;
        }
        atomic_set(&log_root_tree->log_commit[index2], 1);
 
-       if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
-               wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+       if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+               wait_log_commit(trans, log_root_tree,
+                               log_root_tree->log_transid - 1);
+       }
+
+       wait_for_writer(trans, log_root_tree);
 
-       wait_for_writer(log_root_tree);
+       /*
+        * now that we've moved on to the tree of log tree roots,
+        * check the full commit flag again
+        */
+       if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+               mutex_unlock(&log_root_tree->log_mutex);
+               ret = -EAGAIN;
+               goto out_wake_log_root;
+       }
 
        ret = btrfs_write_and_wait_marked_extents(log_root_tree,
                                &log_root_tree->dirty_log_pages);
@@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * in and cause problems either.
         */
        write_ctree_super(trans, root->fs_info->tree_root, 2);
+       ret = 0;
 
+out_wake_log_root:
        atomic_set(&log_root_tree->log_commit[index2], 0);
        smp_mb();
        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2124,8 @@ out:
        return 0;
 }
 
-/* * free all the extents used by the tree log.  This should be called
+/*
+ * free all the extents used by the tree log.  This should be called
  * at commit time of the full transaction
  */
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 
        btrfs_free_path(path);
        mutex_unlock(&BTRFS_I(dir)->log_mutex);
-       end_log_trans(root);
+       btrfs_end_log_trans(root);
 
        return 0;
 }
@@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
        ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
                                  dirid, &index);
        mutex_unlock(&BTRFS_I(inode)->log_mutex);
-       end_log_trans(root);
+       btrfs_end_log_trans(root);
 
        return ret;
 }
@@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
  *
  * This handles both files and directories.
  */
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, struct inode *inode,
                             int inode_only)
 {
@@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
        min_key.offset = 0;
 
        max_key.objectid = inode->i_ino;
+
+       /* today the code can only do partial logging of directories */
+       if (!S_ISDIR(inode->i_mode))
+           inode_only = LOG_INODE_ALL;
+
        if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
                max_key.type = BTRFS_XATTR_ITEM_KEY;
        else
                max_key.type = (u8)-1;
        max_key.offset = (u64)-1;
 
-       /*
-        * if this inode has already been logged and we're in inode_only
-        * mode, we don't want to delete the things that have already
-        * been written to the log.
-        *
-        * But, if the inode has been through an inode_only log,
-        * the logged_trans field is not set.  This allows us to catch
-        * any new names for this inode in the backrefs by logging it
-        * again
-        */
-       if (inode_only == LOG_INODE_EXISTS &&
-           BTRFS_I(inode)->logged_trans == trans->transid) {
-               btrfs_free_path(path);
-               btrfs_free_path(dst_path);
-               goto out;
-       }
        mutex_lock(&BTRFS_I(inode)->log_mutex);
 
        /*
@@ -2693,7 +2809,6 @@ next_slot:
        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
                btrfs_release_path(root, path);
                btrfs_release_path(log, dst_path);
-               BTRFS_I(inode)->log_dirty_trans = 0;
                ret = log_directory_changes(trans, root, inode, path, dst_path);
                BUG_ON(ret);
        }
@@ -2702,19 +2817,69 @@ next_slot:
 
        btrfs_free_path(path);
        btrfs_free_path(dst_path);
-out:
        return 0;
 }
 
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                   struct btrfs_root *root, struct inode *inode,
-                   int inode_only)
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged.  Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+                                              struct inode *inode,
+                                              struct dentry *parent,
+                                              struct super_block *sb,
+                                              u64 last_committed)
 {
-       int ret;
+       int ret = 0;
+       struct btrfs_root *root;
 
-       start_log_trans(trans, root);
-       ret = __btrfs_log_inode(trans, root, inode, inode_only);
-       end_log_trans(root);
+       /*
+        * for regular files, if its inode is already on disk, we don't
+        * have to worry about the parents at all.  This is because
+        * we can use the last_unlink_trans field to record renames
+        * and other fun in this file.
+        */
+       if (S_ISREG(inode->i_mode) &&
+           BTRFS_I(inode)->generation <= last_committed &&
+           BTRFS_I(inode)->last_unlink_trans <= last_committed)
+                       goto out;
+
+       if (!S_ISDIR(inode->i_mode)) {
+               if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+                       goto out;
+               inode = parent->d_inode;
+       }
+
+       while (1) {
+               BTRFS_I(inode)->logged_trans = trans->transid;
+               smp_mb();
+
+               if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+                       root = BTRFS_I(inode)->root;
+
+                       /*
+                        * make sure any commits to the log are forced
+                        * to be full commits
+                        */
+                       root->fs_info->last_trans_log_full_commit =
+                               trans->transid;
+                       ret = 1;
+                       break;
+               }
+
+               if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+                       break;
+
+               if (parent == sb->s_root)
+                       break;
+
+               parent = parent->d_parent;
+               inode = parent->d_inode;
+
+       }
+out:
        return ret;
 }
 
@@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
  * only logging is done of any parent directories that are older than
  * the last committed transaction
  */
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
-                   struct btrfs_root *root, struct dentry *dentry)
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+                   struct btrfs_root *root, struct inode *inode,
+                   struct dentry *parent, int exists_only)
 {
-       int inode_only = LOG_INODE_ALL;
+       int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
        struct super_block *sb;
-       int ret;
+       int ret = 0;
+       u64 last_committed = root->fs_info->last_trans_committed;
+
+       sb = inode->i_sb;
+
+       if (root->fs_info->last_trans_log_full_commit >
+           root->fs_info->last_trans_committed) {
+               ret = 1;
+               goto end_no_trans;
+       }
+
+       ret = check_parent_dirs_for_sync(trans, inode, parent,
+                                        sb, last_committed);
+       if (ret)
+               goto end_no_trans;
 
        start_log_trans(trans, root);
-       sb = dentry->d_inode->i_sb;
-       while (1) {
-               ret = __btrfs_log_inode(trans, root, dentry->d_inode,
-                                       inode_only);
-               BUG_ON(ret);
-               inode_only = LOG_INODE_EXISTS;
 
-               dentry = dentry->d_parent;
-               if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+       ret = btrfs_log_inode(trans, root, inode, inode_only);
+       BUG_ON(ret);
+
+       /*
+        * for regular files, if its inode is already on disk, we don't
+        * have to worry about the parents at all.  This is because
+        * we can use the last_unlink_trans field to record renames
+        * and other fun in this file.
+        */
+       if (S_ISREG(inode->i_mode) &&
+           BTRFS_I(inode)->generation <= last_committed &&
+           BTRFS_I(inode)->last_unlink_trans <= last_committed)
+                       goto no_parent;
+
+       inode_only = LOG_INODE_EXISTS;
+       while (1) {
+               if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
                        break;
 
-               if (BTRFS_I(dentry->d_inode)->generation <=
-                   root->fs_info->last_trans_committed)
+               inode = parent->d_inode;
+               if (BTRFS_I(inode)->generation >
+                   root->fs_info->last_trans_committed) {
+                       ret = btrfs_log_inode(trans, root, inode, inode_only);
+                       BUG_ON(ret);
+               }
+               if (parent == sb->s_root)
                        break;
+
+               parent = parent->d_parent;
        }
-       end_log_trans(root);
-       return 0;
+no_parent:
+       ret = 0;
+       btrfs_end_log_trans(root);
+end_no_trans:
+       return ret;
 }
 
 /*
@@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry)
 {
-       u64 gen;
-       gen = root->fs_info->last_trans_new_blockgroup;
-       if (gen > root->fs_info->last_trans_committed)
-               return 1;
-       else
-               return btrfs_log_dentry(trans, root, dentry);
+       return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+                                     dentry->d_parent, 0);
 }
 
 /*
@@ -2884,3 +3079,94 @@ again:
        kfree(log_root_tree);
        return 0;
 }
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+                            struct inode *dir, struct inode *inode,
+                            int for_rename)
+{
+       /*
+        * when we're logging a file, if it hasn't been renamed
+        * or unlinked, and its inode is fully committed on disk,
+        * we don't have to worry about walking up the directory chain
+        * to log its parents.
+        *
+        * So, we use the last_unlink_trans field to put this transid
+        * into the file.  When the file is logged we check it and
+        * don't log the parents if the file is fully on disk.
+        */
+       if (S_ISREG(inode->i_mode))
+               BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+       /*
+        * if this directory was already logged any new
+        * names for this file/dir will get recorded
+        */
+       smp_mb();
+       if (BTRFS_I(dir)->logged_trans == trans->transid)
+               return;
+
+       /*
+        * if the inode we're about to unlink was logged,
+        * the log will be properly updated for any new names
+        */
+       if (BTRFS_I(inode)->logged_trans == trans->transid)
+               return;
+
+       /*
+        * when renaming files across directories, if the directory
+        * there we're unlinking from gets fsync'd later on, there's
+        * no way to find the destination directory later and fsync it
+        * properly.  So, we have to be conservative and force commits
+        * so the new name gets discovered.
+        */
+       if (for_rename)
+               goto record;
+
+       /* we can safely do the unlink without any special recording */
+       return;
+
+record:
+       BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+                       struct inode *inode, struct inode *old_dir,
+                       struct dentry *parent)
+{
+       struct btrfs_root * root = BTRFS_I(inode)->root;
+
+       /*
+        * this will force the logging code to walk the dentry chain
+        * up for the file
+        */
+       if (S_ISREG(inode->i_mode))
+               BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+       /*
+        * if this inode hasn't been logged and directory we're renaming it
+        * from hasn't been logged, we don't need to log it
+        */
+       if (BTRFS_I(inode)->logged_trans <=
+           root->fs_info->last_trans_committed &&
+           (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+                   root->fs_info->last_trans_committed))
+               return 0;
+
+       return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+}
+
index b9409b32ed0299a43f0b04b4598ec8b3d9b266b8..d09c7609e16b3281a0b7658083888c277fb1ccf8 100644 (file)
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
                   struct btrfs_root *root);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
-                   struct btrfs_root *root, struct dentry *dentry);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry);
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                   struct btrfs_root *root, struct inode *inode,
-                   int inode_only);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               const char *name, int name_len,
                               struct inode *inode, u64 dirid);
+int btrfs_join_running_log_trans(struct btrfs_root *root);
+int btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+                   struct btrfs_root *root, struct inode *inode,
+                   struct dentry *parent, int exists_only);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+                            struct inode *dir, struct inode *inode,
+                            int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+                       struct inode *inode, struct inode *old_dir,
+                       struct dentry *parent);
 #endif