        struct mutex drop_mutex;
        struct mutex volume_mutex;
        struct mutex tree_reloc_mutex;
+
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
 
  * ref if it was able to find one, or NULL if nothing was in that spot
  */
 static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
-                                                 u64 bytenr, u64 parent)
+                                 u64 bytenr, u64 parent,
+                                 struct btrfs_delayed_ref_node **last)
 {
        struct rb_node *n = root->rb_node;
        struct btrfs_delayed_ref_node *entry;
        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);
+               if (last)
+                       *last = entry;
 
                cmp = comp_entry(entry, bytenr, parent);
                if (cmp < 0)
        return NULL;
 }
 
-/*
- * Locking on delayed refs is done by taking a lock on the head node,
- * which has the (impossible) parent id of (u64)-1.  Once a lock is held
- * on the head node, you're allowed (and required) to process all the
- * delayed refs for a given byte number in the tree.
- *
- * This will walk forward in the rbtree until it finds a head node it
- * is able to lock.  It might not lock the delayed ref you asked for,
- * and so it will return the one it did lock in next_ret and return 0.
- *
- * If no locks are taken, next_ret is set to null and 1 is returned.  This
- * means there are no more unlocked head nodes in the rbtree.
- */
-int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
-                          struct btrfs_delayed_ref_node *ref,
-                          struct btrfs_delayed_ref_head **next_ret)
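+/*
+ * Lock the mutex on a delayed ref head.  The caller must already hold
+ * delayed_refs->lock.  If the trylock fails, we take a reference on the
+ * head, drop the spinlock while blocking on the mutex and then retake it.
+ * Returns 0 with head->mutex held, or -EAGAIN if the head was run and
+ * removed from the tree while we were waiting for the mutex.
+ */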
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+                          struct btrfs_delayed_ref_head *head)
 {
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       assert_spin_locked(&delayed_refs->lock);
+       if (mutex_trylock(&head->mutex))
+               return 0;
+
+       atomic_inc(&head->node.refs);
+       spin_unlock(&delayed_refs->lock);
+
+       mutex_lock(&head->mutex);
+       spin_lock(&delayed_refs->lock);
+       if (!head->node.in_tree) {
+               mutex_unlock(&head->mutex);
+               btrfs_put_delayed_ref(&head->node);
+               return -EAGAIN;
+       }
+       btrfs_put_delayed_ref(&head->node);
+       return 0;
+}
+
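+/*
+ * Build a cluster of delayed ref heads for the caller to process.  Starting
+ * at the head nearest 'start' (or the first head in the rbtree when start
+ * is 0), collect up to 32 heads that are not already on a cluster list,
+ * link them onto 'cluster' and advance run_delayed_start as we go.  Once
+ * something has been collected, stop at the first head that is already in
+ * use so the refs in a cluster stay close together on disk.  The caller
+ * must hold delayed_refs->lock.  Returns 0 if a cluster was built, 1 if
+ * there was nothing left to find.
+ */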
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+                          struct list_head *cluster, u64 start)
+{
+       int count = 0;
+       struct btrfs_delayed_ref_root *delayed_refs;
        struct rb_node *node;
+       struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *head;
-       int ret = 0;
 
-       while (1) {
+       delayed_refs = &trans->transaction->delayed_refs;
+       if (start == 0) {
+               node = rb_first(&delayed_refs->root);
+       } else {
+               ref = NULL;
+               tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+               if (ref) {
+                       struct btrfs_delayed_ref_node *tmp;
+
+                       node = rb_prev(&ref->rb_node);
+                       while (node) {
+                               tmp = rb_entry(node,
+                                              struct btrfs_delayed_ref_node,
+                                              rb_node);
+                               if (tmp->bytenr < start)
+                                       break;
+                               ref = tmp;
+                               node = rb_prev(&ref->rb_node);
+                       }
+                       node = &ref->rb_node;
+               } else
+                       node = rb_first(&delayed_refs->root);
+       }
+again:
+       while (node && count < 32) {
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
                if (btrfs_delayed_ref_is_head(ref)) {
                        head = btrfs_delayed_node_to_head(ref);
-                       if (mutex_trylock(&head->mutex)) {
-                               *next_ret = head;
-                               ret = 0;
+                       if (list_empty(&head->cluster)) {
+                               list_add_tail(&head->cluster, cluster);
+                               delayed_refs->run_delayed_start =
+                                       head->node.bytenr;
+                               count++;
+
+                               WARN_ON(delayed_refs->num_heads_ready == 0);
+                               delayed_refs->num_heads_ready--;
+                       } else if (count) {
+                               /*
+                                * the goal of the clustering is to find extents
+                                * that are likely to end up in the same extent
+                                * leaf on disk.  So, we don't want them spread
+                                * all over the tree.  Stop now if we've hit
+                                * a head that was already in use.
+                                */
                                break;
                        }
                }
-               node = rb_next(&ref->rb_node);
-               if (!node) {
-                       ret = 1;
-                       *next_ret = NULL;
-                       break;
-               }
-               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+               node = rb_next(node);
        }
-       return ret;
+       if (count) {
+               return 0;
+       } else if (start) {
+               /*
+                * we've gone to the end of the rbtree without finding any
+                * clusters.  start from the beginning and try again
+                */
+               start = 0;
+               node = rb_first(&delayed_refs->root);
+               goto again;
+       }
+       return 1;
 }
 
 /*
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
 
-       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref) {
                prev_node = rb_prev(&ref->rb_node);
                if (!prev_node)
        }
 
        spin_lock(&delayed_refs->lock);
-       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref) {
                head = btrfs_delayed_node_to_head(ref);
                if (mutex_trylock(&head->mutex)) {
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_ref *full_ref;
-       struct btrfs_delayed_ref_head *head_ref;
+       struct btrfs_delayed_ref_head *head_ref = NULL;
        struct btrfs_delayed_ref_root *delayed_refs;
        int count_mod = 1;
        int must_insert_reserved = 0;
        if (btrfs_delayed_ref_is_head(ref)) {
                head_ref = btrfs_delayed_node_to_head(ref);
                head_ref->must_insert_reserved = must_insert_reserved;
+               INIT_LIST_HEAD(&head_ref->cluster);
                mutex_init(&head_ref->mutex);
        } else {
                full_ref = btrfs_delayed_node_to_ref(ref);
                 */
                kfree(ref);
        } else {
+               if (btrfs_delayed_ref_is_head(ref)) {
+                       delayed_refs->num_heads++;
+                       delayed_refs->num_heads_ready++;
+               }
                delayed_refs->num_entries++;
                trans->delayed_ref_updates++;
        }
        struct btrfs_delayed_ref_root *delayed_refs;
 
        delayed_refs = &trans->transaction->delayed_refs;
-       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1);
+       ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
        if (ref)
                return btrfs_delayed_node_to_head(ref);
        return NULL;
 
         */
        struct mutex mutex;
 
+       /* links this head into a cluster built by btrfs_find_ref_cluster */
+       struct list_head cluster;
+
        /*
         * when a new extent is allocated, it is just reserved in memory
         * The actual extent isn't inserted into the extent allocation tree
         */
        unsigned long num_entries;
 
+       /* total number of head nodes in tree */
+       unsigned long num_heads;
+
+       /* total number of head nodes ready for processing */
+       unsigned long num_heads_ready;
+
        /*
         * set when the tree is flushing before a transaction commit,
         * used by the throttling code to decide if new updates need
         * to be run right away
         */
        int flushing;
+
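+       /*
+        * bytenr of the head most recently added to a cluster, used as the
+        * starting point for the next call to btrfs_find_ref_cluster
+        */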
+       u64 run_delayed_start;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lock_delayed_ref(struct btrfs_trans_handle *trans,
-                          struct btrfs_delayed_ref_node *ref,
-                          struct btrfs_delayed_ref_head **next_ret);
 int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 bytenr,
                            u64 num_bytes, u32 *refs);
                          u64 parent, u64 orig_ref_root, u64 ref_root,
                          u64 orig_ref_generation, u64 ref_generation,
                          u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+                          struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+                          struct list_head *cluster, u64 search_start);
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
 
        struct btrfs_root *root = arg;
        struct btrfs_trans_handle *trans;
        struct btrfs_transaction *cur;
-       struct btrfs_fs_info *info = root->fs_info;
        unsigned long now;
        unsigned long delay;
        int ret;
 
                now = get_seconds();
                if (now < cur->start_time || now - cur->start_time < 30) {
-                       unsigned long num_delayed;
-                       num_delayed = cur->delayed_refs.num_entries;
                        mutex_unlock(&root->fs_info->trans_mutex);
                        delay = HZ * 5;
-
-                       /*
-                        * we may have been woken up early to start
-                        * processing the delayed extent ref updates
-                        * If so, run some of them and then loop around again
-                        * to see if we need to force a commit
-                        */
-                       if (num_delayed > 64) {
-                               mutex_unlock(&info->transaction_kthread_mutex);
-                               trans = btrfs_start_transaction(root, 1);
-                               btrfs_run_delayed_refs(trans, root, 256);
-                               btrfs_end_transaction(trans, root);
-                               continue;
-                       }
                        goto sleep;
                }
                mutex_unlock(&root->fs_info->trans_mutex);
 
        return NULL;
 }
 
-/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far.  count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
- */
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, unsigned long count)
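+/*
+ * Run the delayed refs for each head on the cluster list built by
+ * btrfs_find_ref_cluster.  Called with delayed_refs->lock held; the lock
+ * is dropped while each ref is actually processed and is held again on
+ * return.  Returns the number of entries that were handled.
+ */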
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct list_head *cluster)
 {
-       struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        int ret;
+       int count = 0;
        int must_insert_reserved = 0;
-       int run_all = count == (unsigned long)-1;
-
-       if (root == root->fs_info->extent_root)
-               root = root->fs_info->tree_root;
 
        delayed_refs = &trans->transaction->delayed_refs;
-again:
-       spin_lock(&delayed_refs->lock);
-       if (count == 0)
-               count = delayed_refs->num_entries;
        while (1) {
                if (!locked_ref) {
-                       /*
-                        * no locked ref, go find something we can
-                        * process in the rbtree.  We start at
-                        * the beginning of the tree, there may be less
-                        * lock contention if we do something smarter here.
-                        */
-                       node = rb_first(&delayed_refs->root);
-                       if (!node) {
-                               spin_unlock(&delayed_refs->lock);
+                       /* pick a new head ref from the cluster list */
+                       if (list_empty(cluster))
                                break;
-                       }
 
-                       ref = rb_entry(node, struct btrfs_delayed_ref_node,
-                                      rb_node);
-                       ret = btrfs_lock_delayed_ref(trans, ref, &locked_ref);
-                       if (ret) {
-                               spin_unlock(&delayed_refs->lock);
-                               break;
+                       locked_ref = list_entry(cluster->next,
+                                    struct btrfs_delayed_ref_head, cluster);
+
+                       /*
+                        * grab the lock that says we are going to process
+                        * all the refs for this head
+                        */
+                       ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+                       /*
+                        * we may have dropped the spin lock to get the head
+                        * mutex lock, and that might have given someone else
+                        * time to free the head.  If that's true, it has been
+                        * removed from our list and we can move on.
+                        */
+                       if (ret == -EAGAIN) {
+                               locked_ref = NULL;
+                               count++;
+                               continue;
                        }
                }
 
                 * locked_ref is the head node, so we have to go one
                 * node back for any delayed ref updates
                 */
-
                ref = select_delayed_ref(locked_ref);
                if (!ref) {
                        /* All delayed refs have been processed, Go ahead
                         * so that any accounting fixes can happen
                         */
                        ref = &locked_ref->node;
+                       list_del_init(&locked_ref->cluster);
                        locked_ref = NULL;
                }
 
                BUG_ON(ret);
                btrfs_put_delayed_ref(ref);
 
-               /* once we lock the head ref, we have to process all the
-                * entries for it.  So, we might end up doing more entries
-                * that count was asking us to do.
-                */
-               if (count > 0)
-                       count--;
+               count++;
+               cond_resched();
+               spin_lock(&delayed_refs->lock);
+       }
+       return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, unsigned long count)
+{
+       struct rb_node *node;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct list_head cluster;
+       int ret;
+       int run_all = count == (unsigned long)-1;
+       int run_most = 0;
+
+       if (root == root->fs_info->extent_root)
+               root = root->fs_info->tree_root;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       INIT_LIST_HEAD(&cluster);
+again:
+       spin_lock(&delayed_refs->lock);
+       if (count == 0) {
+               count = delayed_refs->num_entries * 2;
+               run_most = 1;
+       }
+       while (1) {
+               if (!(run_all || run_most) &&
+                   delayed_refs->num_heads_ready < 64)
+                       break;
 
                /*
-                * we set locked_ref to null above if we're all done
-                * with this bytenr
+                * go find something we can process in the rbtree.  We start
+                * where the last run left off (run_delayed_start) and build
+                * a cluster of refs to process, beginning with the first
+                * head we are able to lock
                 */
-               if (!locked_ref && count == 0)
+               ret = btrfs_find_ref_cluster(trans, &cluster,
+                                            delayed_refs->run_delayed_start);
+               if (ret)
                        break;
 
-               cond_resched();
-               spin_lock(&delayed_refs->lock);
+               ret = run_clustered_refs(trans, root, &cluster);
+               BUG_ON(ret < 0);
+
+               count -= min_t(unsigned long, ret, count);
+
+               if (count == 0)
+                       break;
        }
+
        if (run_all) {
-               spin_lock(&delayed_refs->lock);
                node = rb_first(&delayed_refs->root);
-               if (!node) {
-                       spin_unlock(&delayed_refs->lock);
+               if (!node)
                        goto out;
-               }
+               count = (unsigned long)-1;
 
                while (node) {
                        ref = rb_entry(node, struct btrfs_delayed_ref_node,
                        node = rb_next(node);
                }
                spin_unlock(&delayed_refs->lock);
-               count = (unsigned long)-1;
                schedule_timeout(1);
                goto again;
        }
 out:
+       spin_unlock(&delayed_refs->lock);
        return 0;
 }
 
         */
        head->node.in_tree = 0;
        rb_erase(&head->node.rb_node, &delayed_refs->root);
+
        delayed_refs->num_entries--;
 
        /*
         * we don't take a ref on the node because we're removing it from the
         * tree, so we just steal the ref the tree was holding.
         */
+       delayed_refs->num_heads--;
+       if (list_empty(&head->cluster))
+               delayed_refs->num_heads_ready--;
+
+       list_del_init(&head->cluster);
        spin_unlock(&delayed_refs->lock);
 
        ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
        struct btrfs_path *path;
        int i;
        int orig_level;
+       int update_count;
        struct btrfs_root_item *root_item = &root->root_item;
 
        WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
                }
        }
        while (1) {
+               unsigned long update;
                wret = walk_down_tree(trans, root, path, &level);
                if (wret > 0)
                        break;
                }
                atomic_inc(&root->fs_info->throttle_gen);
                wake_up(&root->fs_info->transaction_throttle);
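+               /*
+                * run the delayed ref updates we have accumulated so far, a
+                * batch at a time, at most 16 batches each time around this
+                * loop
+                */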
+               for (update_count = 0; update_count < 16; update_count++) {
+                       update = trans->delayed_ref_updates;
+                       trans->delayed_ref_updates = 0;
+                       if (update)
+                               btrfs_run_delayed_refs(trans, root, update);
+                       else
+                               break;
+               }
        }
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
 
 
                cur_trans->delayed_refs.root.rb_node = NULL;
                cur_trans->delayed_refs.num_entries = 0;
+               cur_trans->delayed_refs.num_heads_ready = 0;
+               cur_trans->delayed_refs.num_heads = 0;
                cur_trans->delayed_refs.flushing = 0;
+               cur_trans->delayed_refs.run_delayed_start = 0;
                spin_lock_init(&cur_trans->delayed_refs.lock);
 
                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 {
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *info = root->fs_info;
-
-       if (trans->delayed_ref_updates &&
-           (trans->transaction->delayed_refs.flushing ||
-           trans->transaction->delayed_refs.num_entries > 16384)) {
-               btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates);
-       } else if (trans->transaction->delayed_refs.num_entries > 64) {
-               wake_up_process(root->fs_info->transaction_kthread);
+       int count = 0;
+
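+       /*
+        * if this handle queued delayed ref updates and more than 64 heads
+        * are ready, run some of them now, making at most four passes
+        */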
+       while (count < 4) {
+               unsigned long cur = trans->delayed_ref_updates;
+               trans->delayed_ref_updates = 0;
+               if (cur &&
+                   trans->transaction->delayed_refs.num_heads_ready > 64) {
+                       trans->delayed_ref_updates = 0;
+                       btrfs_run_delayed_refs(trans, root, cur);
+               } else {
+                       break;
+               }
+               count++;
        }
 
        mutex_lock(&info->trans_mutex);
         */
        trans->transaction->delayed_refs.flushing = 1;
 
-       ret = btrfs_run_delayed_refs(trans, root, (u64)-1);
+       ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);
 
        INIT_LIST_HEAD(&dirty_fs_roots);