Update bcachefs sources to 09a5465430 bcachefs: Don't need to walk inodes on clean...
index 922a48635a6a872aa9ba2e2a1086d10281677721..33b5cf40a5f48377f2a5fe3a6357fc77cba02eba 100644
@@ -1,6 +1,6 @@
 
 #include "bcachefs.h"
-#include "alloc.h"
+#include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
@@ -12,7 +12,9 @@
 #include "buckets.h"
 #include "extents.h"
 #include "journal.h"
+#include "journal_reclaim.h"
 #include "keylist.h"
+#include "replicas.h"
 #include "super-io.h"
 
 #include <linux/random.h>
@@ -21,7 +23,7 @@
 static void btree_node_will_make_reachable(struct btree_update *,
                                           struct btree *);
 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *);
+static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
 
 /* Debug code: */
 
@@ -32,7 +34,7 @@ static void btree_node_interior_verify(struct btree *b)
 
        BUG_ON(!b->level);
 
-       bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
+       bch2_btree_node_iter_init(&iter, b, &b->key.k.p);
 #if 1
        BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
               bkey_cmp_left_packed(b, k, &b->key.k.p));
@@ -129,13 +131,15 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
 /* Btree node freeing/allocation: */
 
 static bool btree_key_matches(struct bch_fs *c,
-                             struct bkey_s_c_extent l,
-                             struct bkey_s_c_extent r)
+                             struct bkey_s_c l,
+                             struct bkey_s_c r)
 {
+       struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
+       struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
        const struct bch_extent_ptr *ptr1, *ptr2;
 
-       extent_for_each_ptr(l, ptr1)
-               extent_for_each_ptr(r, ptr2)
+       bkey_for_each_ptr(ptrs1, ptr1)
+               bkey_for_each_ptr(ptrs2, ptr2)
                        if (ptr1->dev == ptr2->dev &&
                            ptr1->gen == ptr2->gen &&
                            ptr1->offset == ptr2->offset)
@@ -157,32 +161,17 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
 {
        struct bch_fs *c = as->c;
        struct pending_btree_node_free *d;
-       unsigned replicas;
-
-       /*
-        * btree_update lock is only needed here to avoid racing with
-        * gc:
-        */
-       mutex_lock(&c->btree_interior_update_lock);
+       struct gc_pos pos = { 0 };
 
        for (d = as->pending; d < as->pending + as->nr_pending; d++)
                if (!bkey_cmp(k.k->p, d->key.k.p) &&
-                   btree_key_matches(c, bkey_s_c_to_extent(k),
-                                     bkey_i_to_s_c_extent(&d->key)))
+                   btree_key_matches(c, k, bkey_i_to_s_c(&d->key)))
                        goto found;
        BUG();
 found:
        BUG_ON(d->index_update_done);
        d->index_update_done = true;
 
-       /*
-        * Btree nodes are accounted as freed in bch_alloc_stats when they're
-        * freed from the index:
-        */
-       replicas = bch2_extent_nr_dirty_ptrs(k);
-       if (replicas)
-               stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size;
-
        /*
         * We're dropping @k from the btree, but it's still live until the
         * index update is persistent so we need to keep a reference around for
@@ -204,100 +193,87 @@ found:
         * bch2_mark_key() compares the current gc pos to the pos we're
         * moving this reference from, hence one comparison here:
         */
-       if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
-               struct bch_fs_usage tmp = { 0 };
-
-               bch2_mark_key(c, bkey_i_to_s_c(&d->key),
-                            -c->opts.btree_node_size, true, b
-                            ? gc_pos_btree_node(b)
-                            : gc_pos_btree_root(as->btree_id),
-                            &tmp, 0);
-               /*
-                * Don't apply tmp - pending deletes aren't tracked in
-                * bch_alloc_stats:
-                */
-       }
-
-       mutex_unlock(&c->btree_interior_update_lock);
+       if (gc_pos_cmp(c->gc_pos, b
+                      ? gc_pos_btree_node(b)
+                      : gc_pos_btree_root(as->btree_id)) >= 0 &&
+           gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
+               bch2_mark_key_locked(c,
+                             bkey_i_to_s_c(&d->key),
+                             false, 0, pos,
+                             NULL, 0, BCH_BUCKET_MARK_GC);
 }
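
The two gc_pos_cmp() calls above bound a window: GC has already walked the position the reference is moving from, but has not yet reached the pending-delete phase where it would account for the key again. That predicate pulled out on its own, as a sketch (hypothetical helper, built from the gc_pos_cmp()/gc_phase() calls used above):

        static inline bool sketch_gc_needs_remark(struct bch_fs *c,
                                                  struct gc_pos moved_from)
        {
                /* GC already passed the position the key is moving from... */
                return gc_pos_cmp(c->gc_pos, moved_from) >= 0 &&
                       /* ...but hasn't reached the pending-delete phase yet: */
                       gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0;
        }
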
 
-static void __btree_node_free(struct bch_fs *c, struct btree *b,
-                             struct btree_iter *iter)
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
 {
        trace_btree_node_free(c, b);
 
        BUG_ON(btree_node_dirty(b));
        BUG_ON(btree_node_need_write(b));
        BUG_ON(b == btree_node_root(c, b));
-       BUG_ON(b->ob);
+       BUG_ON(b->ob.nr);
        BUG_ON(!list_empty(&b->write_blocked));
        BUG_ON(b->will_make_reachable);
 
        clear_btree_node_noevict(b);
 
-       six_lock_write(&b->lock);
-
-       bch2_btree_node_hash_remove(c, b);
-
-       mutex_lock(&c->btree_cache_lock);
-       list_move(&b->list, &c->btree_cache_freeable);
-       mutex_unlock(&c->btree_cache_lock);
+       bch2_btree_node_hash_remove(&c->btree_cache, b);
 
-       /*
-        * By using six_unlock_write() directly instead of
-        * bch2_btree_node_unlock_write(), we don't update the iterator's
-        * sequence numbers and cause future bch2_btree_node_relock() calls to
-        * fail:
-        */
-       six_unlock_write(&b->lock);
+       mutex_lock(&c->btree_cache.lock);
+       list_move(&b->list, &c->btree_cache.freeable);
+       mutex_unlock(&c->btree_cache.lock);
 }
 
 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 {
-       struct open_bucket *ob = b->ob;
+       struct open_buckets ob = b->ob;
 
        btree_update_drop_new_node(c, b);
 
-       b->ob = NULL;
+       b->ob.nr = 0;
 
        clear_btree_node_dirty(b);
 
-       __btree_node_free(c, b, NULL);
+       btree_node_lock_type(c, b, SIX_LOCK_write);
+       __btree_node_free(c, b);
+       six_unlock_write(&b->lock);
 
-       bch2_open_bucket_put(c, ob);
+       bch2_open_buckets_put(c, &ob);
 }
 
 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
                                struct btree_iter *iter)
 {
-       bch2_btree_iter_node_drop_linked(iter, b);
+       struct btree_iter *linked;
 
-       __btree_node_free(c, b, iter);
+       for_each_btree_iter(iter, linked)
+               BUG_ON(linked->l[b->level].b == b);
 
-       bch2_btree_iter_node_drop(iter, b);
+       /*
+        * Is this a node that isn't reachable on disk yet?
+        *
+        * Nodes that aren't reachable yet have writes blocked until they're
+        * reachable - now that we've cancelled any pending writes and moved
+        * things waiting on that write to wait on this update, we can drop this
+        * node from the list of nodes that the other update is making
+        * reachable, prior to freeing it:
+        */
+       btree_update_drop_new_node(c, b);
+
+       six_lock_write(&b->lock);
+       __btree_node_free(c, b);
+       six_unlock_write(&b->lock);
+       six_unlock_intent(&b->lock);
 }
 
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
                                        struct pending_btree_node_free *pending)
 {
-       struct bch_fs_usage stats = { 0 };
-
        BUG_ON(!pending->index_update_done);
 
        bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-                    -c->opts.btree_node_size, true,
-                    gc_phase(GC_PHASE_PENDING_DELETE),
-                    &stats, 0);
-       /*
-        * Don't apply stats - pending deletes aren't tracked in
-        * bch_alloc_stats:
-        */
-}
-
-void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
-{
-       bch2_open_bucket_put(c, b->ob);
-       b->ob = NULL;
+                     false, 0,
+                     gc_phase(GC_PHASE_PENDING_DELETE),
+                     NULL, 0, 0);
 }
 
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@@ -305,9 +281,11 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
                                             struct closure *cl,
                                             unsigned flags)
 {
-       BKEY_PADDED(k) tmp;
-       struct open_bucket *ob;
+       struct write_point *wp;
        struct btree *b;
+       BKEY_PADDED(k) tmp;
+       struct open_buckets ob = { .nr = 0 };
+       struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
        unsigned nr_reserve;
        enum alloc_reserve alloc_reserve;
 
@@ -335,31 +313,40 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-       /* alloc_sectors is weird, I suppose */
-       bkey_extent_init(&tmp.k);
-       tmp.k.k.size = c->opts.btree_node_size,
-
-       ob = bch2_alloc_sectors(c, &c->btree_write_point,
-                              bkey_i_to_extent(&tmp.k),
-                              res->nr_replicas,
-                              c->opts.metadata_replicas_required,
-                              alloc_reserve, cl);
-       if (IS_ERR(ob))
-               return ERR_CAST(ob);
-
-       if (tmp.k.k.size < c->opts.btree_node_size) {
-               bch2_open_bucket_put(c, ob);
+       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
+                                     writepoint_ptr(&c->btree_write_point),
+                                     &devs_have,
+                                     res->nr_replicas,
+                                     c->opts.metadata_replicas_required,
+                                     alloc_reserve, 0, cl);
+       if (IS_ERR(wp))
+               return ERR_CAST(wp);
+
+       if (wp->sectors_free < c->opts.btree_node_size) {
+               struct open_bucket *ob;
+               unsigned i;
+
+               open_bucket_for_each(c, &wp->ptrs, ob, i)
+                       if (ob->sectors_free < c->opts.btree_node_size)
+                               ob->sectors_free = 0;
+
+               bch2_alloc_sectors_done(c, wp);
                goto retry;
        }
+
+       bkey_btree_ptr_init(&tmp.k);
+       bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
+
+       bch2_open_bucket_get(c, wp, &ob);
+       bch2_alloc_sectors_done(c, wp);
 mem_alloc:
        b = bch2_btree_node_mem_alloc(c);
 
        /* we hold cannibalize_lock: */
        BUG_ON(IS_ERR(b));
-       BUG_ON(b->ob);
+       BUG_ON(b->ob.nr);
 
        bkey_copy(&b->key, &tmp.k);
-       b->key.k.size = 0;
        b->ob = ob;
 
        return b;
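
The retry loop above enforces that a btree node comes from a single contiguous allocation: if the write point's open buckets can't supply a whole node, their remaining space is zeroed so bch2_alloc_sectors_done() drops them, and allocation starts over with fresh buckets. The invariant as a sketch (hypothetical helper; both quantities are in sectors, as the comparison above implies):

        static inline bool sketch_wp_fits_btree_node(struct bch_fs *c,
                                                     struct write_point *wp)
        {
                /* a btree node is never split across allocations: */
                return wp->sectors_free >= c->opts.btree_node_size;
        }
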
@@ -370,14 +357,16 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        struct bch_fs *c = as->c;
        struct btree *b;
 
+       BUG_ON(level >= BTREE_MAX_DEPTH);
        BUG_ON(!as->reserve->nr);
 
        b = as->reserve->b[--as->reserve->nr];
 
-       BUG_ON(bch2_btree_node_hash_insert(c, b, level, as->btree_id));
+       BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
 
        set_btree_node_accessed(b);
        set_btree_node_dirty(b);
+       set_btree_node_need_write(b);
 
        bch2_bset_init_first(b, &b->data->keys);
        memset(&b->nr, 0, sizeof(b->nr));
@@ -385,7 +374,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        b->data->flags = 0;
        SET_BTREE_NODE_ID(b->data, as->btree_id);
        SET_BTREE_NODE_LEVEL(b->data, level);
-       b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
+       b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
 
        bch2_btree_build_aux_trees(b);
 
@@ -406,6 +395,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
        n->data->min_key        = b->data->min_key;
        n->data->max_key        = b->data->max_key;
        n->data->format         = format;
+       SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
 
        btree_node_set_format(n, format);
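
The new SET_BTREE_NODE_SEQ() line stamps the replacement node with its predecessor's sequence number plus one. A sketch of how that makes a rewrite detectable (hypothetical helper; BTREE_NODE_SEQ() is the accessor used above, and this assumes the two nodes cover the same keyspace):

        static inline bool sketch_is_rewrite_of(struct btree_node *new,
                                                struct btree_node *old)
        {
                /* every rewrite bumps the seq, so the replacement compares newer: */
                return BTREE_NODE_SEQ(new) > BTREE_NODE_SEQ(old);
        }
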
 
@@ -466,14 +456,15 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
                                &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
 
                        a->ob = b->ob;
-                       b->ob = NULL;
+                       b->ob.nr = 0;
                        bkey_copy(&a->k, &b->key);
                } else {
-                       bch2_open_bucket_put(c, b->ob);
-                       b->ob = NULL;
+                       bch2_open_buckets_put(c, &b->ob);
                }
 
-               __btree_node_free(c, b, NULL);
+               btree_node_lock_type(c, b, SIX_LOCK_write);
+               __btree_node_free(c, b);
+               six_unlock_write(&b->lock);
 
                six_unlock_intent(&b->lock);
        }
@@ -492,8 +483,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
        struct btree *b;
        struct disk_reservation disk_res = { 0, 0 };
        unsigned sectors = nr_nodes * c->opts.btree_node_size;
-       int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD|
-               BCH_DISK_RESERVATION_METADATA;
+       int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
 
        if (flags & BTREE_INSERT_NOFAIL)
                disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
@@ -506,7 +496,9 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
        if (ret)
                return ERR_PTR(ret);
 
-       if (bch2_disk_reservation_get(c, &disk_res, sectors, disk_res_flags))
+       if (bch2_disk_reservation_get(c, &disk_res, sectors,
+                                     c->opts.metadata_replicas,
+                                     disk_res_flags))
                return ERR_PTR(-ENOSPC);
 
        BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
@@ -515,7 +507,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
         * Protects reaping from the btree node cache and using the btree node
         * open bucket reserve:
         */
-       ret = bch2_btree_node_cannibalize_lock(c, cl);
+       ret = bch2_btree_cache_cannibalize_lock(c, cl);
        if (ret) {
                bch2_disk_reservation_put(c, &disk_res);
                return ERR_PTR(ret);
@@ -535,19 +527,18 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
                        goto err_free;
                }
 
-               ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
-                                           BCH_DATA_BTREE);
+               ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
                if (ret)
                        goto err_free;
 
                reserve->b[reserve->nr++] = b;
        }
 
-       bch2_btree_node_cannibalize_unlock(c);
+       bch2_btree_cache_cannibalize_unlock(c);
        return reserve;
 err_free:
        bch2_btree_reserve_put(c, reserve);
-       bch2_btree_node_cannibalize_unlock(c);
+       bch2_btree_cache_cannibalize_unlock(c);
        trace_btree_reserve_get_fail(c, nr_nodes, cl);
        return ERR_PTR(ret);
 }
@@ -558,6 +549,8 @@ static void bch2_btree_update_free(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
 
+       bch2_journal_pin_flush(&c->journal, &as->journal);
+
        BUG_ON(as->nr_new_nodes);
        BUG_ON(as->nr_pending);
 
@@ -566,17 +559,17 @@ static void bch2_btree_update_free(struct btree_update *as)
 
        mutex_lock(&c->btree_interior_update_lock);
        list_del(&as->list);
-       mutex_unlock(&c->btree_interior_update_lock);
 
        closure_debug_destroy(&as->cl);
        mempool_free(as, &c->btree_interior_update_pool);
-       percpu_ref_put(&c->writes);
+
+       closure_wake_up(&c->btree_interior_update_wait);
+       mutex_unlock(&c->btree_interior_update_lock);
 }
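
closure_wake_up() on btree_interior_update_wait is now issued before btree_interior_update_lock is dropped, so a waiter that checks the update list under the same lock cannot miss the wake-up. A sketch of the matching waiter side, assuming the closure_wait_event() primitive from the closure library (the helper itself is hypothetical):

        static void sketch_wait_for_interior_updates(struct bch_fs *c)
        {
                /* re-evaluates the condition each time the waitlist is woken: */
                closure_wait_event(&c->btree_interior_update_wait,
                                   list_empty(&c->btree_interior_update_list));
        }
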
 
 static void btree_update_nodes_reachable(struct closure *cl)
 {
-       struct btree_update *as =
-               container_of(cl, struct btree_update, cl);
+       struct btree_update *as = container_of(cl, struct btree_update, cl);
        struct bch_fs *c = as->c;
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -586,12 +579,16 @@ static void btree_update_nodes_reachable(struct closure *cl)
        while (as->nr_new_nodes) {
                struct btree *b = as->new_nodes[--as->nr_new_nodes];
 
-               BUG_ON(b->will_make_reachable != as);
-               b->will_make_reachable = NULL;
+               BUG_ON(b->will_make_reachable != (unsigned long) as);
+               b->will_make_reachable = 0;
                mutex_unlock(&c->btree_interior_update_lock);
 
-               six_lock_read(&b->lock);
-               bch2_btree_node_write_dirty(c, b, NULL, btree_node_need_write(b));
+               /*
+                * b->will_make_reachable prevented it from being written, so
+                * write it now if it needs to be written:
+                */
+               btree_node_lock_type(c, b, SIX_LOCK_read);
+               bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
                six_unlock_read(&b->lock);
                mutex_lock(&c->btree_interior_update_lock);
        }
@@ -606,10 +603,28 @@ static void btree_update_nodes_reachable(struct closure *cl)
        bch2_btree_update_free(as);
 }
 
+static void btree_update_wait_on_journal(struct closure *cl)
+{
+       struct btree_update *as = container_of(cl, struct btree_update, cl);
+       struct bch_fs *c = as->c;
+       int ret;
+
+       ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
+       if (ret == -EAGAIN) {
+               continue_at(cl, btree_update_wait_on_journal, system_wq);
+               return;
+       }
+       if (ret < 0)
+               goto err;
+
+       bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
+err:
+       continue_at(cl, btree_update_nodes_reachable, system_wq);
+}
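
btree_update_wait_on_journal() is a small closure state machine: continue_at() re-arms the closure to run the named function once its refs are released, and -EAGAIN simply loops back to the same stage. The shape of that pattern in isolation, with hypothetical stage names (sketch_kick_async() stands in for any async call that takes the closure to signal completion):

        static void sketch_stage_done(struct closure *cl)
        {
                /* runs once the async work from the previous stage completes */
        }

        static void sketch_stage_start(struct closure *cl)
        {
                int ret = sketch_kick_async(cl);        /* hypothetical async call */

                if (ret == -EAGAIN) {
                        /* not ready yet: requeue this same stage and bail out */
                        continue_at(cl, sketch_stage_start, system_wq);
                        return;
                }

                continue_at(cl, sketch_stage_done, system_wq);
        }
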
+
 static void btree_update_nodes_written(struct closure *cl)
 {
-       struct btree_update *as =
-               container_of(cl, struct btree_update, cl);
+       struct btree_update *as = container_of(cl, struct btree_update, cl);
        struct bch_fs *c = as->c;
        struct btree *b;
 
@@ -620,6 +635,8 @@ static void btree_update_nodes_written(struct closure *cl)
         */
 retry:
        mutex_lock(&c->btree_interior_update_lock);
+       as->nodes_written = true;
+
        switch (as->mode) {
        case BTREE_INTERIOR_NO_UPDATE:
                BUG();
@@ -629,7 +646,7 @@ retry:
 
                if (!six_trylock_read(&b->lock)) {
                        mutex_unlock(&c->btree_interior_update_lock);
-                       six_lock_read(&b->lock);
+                       btree_node_lock_type(c, b, SIX_LOCK_read);
                        six_unlock_read(&b->lock);
                        goto retry;
                }
@@ -638,10 +655,19 @@ retry:
                closure_wait(&btree_current_write(b)->wait, cl);
 
                list_del(&as->write_blocked_list);
+
+               /*
+                * for flush_held_btree_writes() waiting on updates to flush or
+                * nodes to be writeable:
+                */
+               closure_wake_up(&c->btree_interior_update_wait);
                mutex_unlock(&c->btree_interior_update_lock);
 
-               bch2_btree_node_write_dirty(c, b, NULL,
-                                           btree_node_need_write(b));
+               /*
+                * b->write_blocked prevented it from being written, so
+                * write it now if it needs to be written:
+                */
+               bch2_btree_node_write_cond(c, b, true);
                six_unlock_read(&b->lock);
                break;
 
@@ -667,7 +693,7 @@ retry:
 
                if (!six_trylock_read(&b->lock)) {
                        mutex_unlock(&c->btree_interior_update_lock);
-                       six_lock_read(&b->lock);
+                       btree_node_lock_type(c, b, SIX_LOCK_read);
                        six_unlock_read(&b->lock);
                        goto retry;
                }
@@ -675,7 +701,7 @@ retry:
                BUG_ON(c->btree_roots[b->btree_id].as != as);
                c->btree_roots[b->btree_id].as = NULL;
 
-               bch2_btree_set_root_ondisk(c, b);
+               bch2_btree_set_root_ondisk(c, b, WRITE);
 
                /*
                 * We don't have to wait on anything here (before
@@ -701,13 +727,10 @@ retry:
                 */
                bch2_journal_pin_drop(&c->journal, &as->journal);
 
-               /*
-                * And, do a journal write to write the pointer to the new root,
-                * then wait for it to complete before freeing the nodes we
-                * replaced:
-                */
-               bch2_journal_meta_async(&c->journal, cl);
-               break;
+               as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
+
+               btree_update_wait_on_journal(cl);
+               return;
        }
 
        continue_at(cl, btree_update_nodes_reachable, system_wq);
@@ -840,16 +863,26 @@ static void btree_node_will_make_reachable(struct btree_update *as,
        BUG_ON(b->will_make_reachable);
 
        as->new_nodes[as->nr_new_nodes++] = b;
-       b->will_make_reachable = as;
+       b->will_make_reachable = 1UL|(unsigned long) as;
+
+       closure_get(&as->cl);
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void __btree_interior_update_drop_new_node(struct btree *b)
+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
 {
-       struct btree_update *as = b->will_make_reachable;
+       struct btree_update *as;
+       unsigned long v;
        unsigned i;
 
-       BUG_ON(!as);
+       mutex_lock(&c->btree_interior_update_lock);
+       v = xchg(&b->will_make_reachable, 0);
+       as = (struct btree_update *) (v & ~1UL);
+
+       if (!as) {
+               mutex_unlock(&c->btree_interior_update_lock);
+               return;
+       }
 
        for (i = 0; i < as->nr_new_nodes; i++)
                if (as->new_nodes[i] == b)
@@ -857,18 +890,11 @@ static void __btree_interior_update_drop_new_node(struct btree *b)
 
        BUG();
 found:
-       as->nr_new_nodes--;
-       memmove(&as->new_nodes[i],
-               &as->new_nodes[i + 1],
-               sizeof(struct btree *) * (as->nr_new_nodes - i));
-       b->will_make_reachable = NULL;
-}
-
-static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
-{
-       mutex_lock(&c->btree_interior_update_lock);
-       __btree_interior_update_drop_new_node(b);
+       array_remove_item(as->new_nodes, as->nr_new_nodes, i);
        mutex_unlock(&c->btree_interior_update_lock);
+
+       if (v & 1)
+               closure_put(&as->cl);
 }
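
b->will_make_reachable is now a tagged pointer: bit 0 records whether the node holds a closure ref on the btree_update (taken via closure_get() above), and the xchg() claims both the pointer and the ref atomically. A sketch of the encoding, assuming only that struct btree_update is at least 2-byte aligned (helper names hypothetical):

        static inline unsigned long sketch_wmr_encode(struct btree_update *as,
                                                      bool has_ref)
        {
                /* pointer alignment leaves bit 0 free for the ref flag: */
                return (unsigned long) as | (has_ref ? 1UL : 0UL);
        }

        static inline struct btree_update *sketch_wmr_decode(unsigned long v,
                                                             bool *has_ref)
        {
                *has_ref = v & 1UL;
                return (struct btree_update *) (v & ~1UL);
        }
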
 
 static void btree_interior_update_add_node_reference(struct btree_update *as,
@@ -906,6 +932,11 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        struct btree_write *w;
        struct bset_tree *t;
 
+       set_btree_node_dying(b);
+
+       if (btree_node_fake(b))
+               return;
+
        btree_interior_update_add_node_reference(as, b);
 
        /*
@@ -917,7 +948,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
         * in with keys that aren't in the journal anymore:
         */
        for_each_bset(b, t)
-               as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
+               as->journal_seq = max(as->journal_seq,
+                                     le64_to_cpu(bset(b, t)->journal_seq));
 
        mutex_lock(&c->btree_interior_update_lock);
 
@@ -932,12 +964,24 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
                list_del(&p->write_blocked_list);
                btree_update_reparent(as, p);
+
+               /*
+                * for flush_held_btree_writes() waiting on updates to flush or
+                * nodes to be writeable:
+                */
+               closure_wake_up(&c->btree_interior_update_wait);
        }
 
        clear_btree_node_dirty(b);
        clear_btree_node_need_write(b);
        w = btree_current_write(b);
 
+       /*
+        * Does this node have any btree_update operations waiting on this node
+        * to be written?
+        *
+        * If so, wake them up when this btree_update operation is reachable:
+        */
        llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
                llist_add(&cl->list, &as->wait.list);
 
@@ -958,9 +1002,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                                      &as->journal, interior_update_flush);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
-       if (b->will_make_reachable)
-               __btree_interior_update_drop_new_node(b);
-
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
@@ -982,14 +1023,9 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
        struct btree_reserve *reserve;
        struct btree_update *as;
 
-       if (unlikely(!percpu_ref_tryget(&c->writes)))
-               return ERR_PTR(-EROFS);
-
        reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
-       if (IS_ERR(reserve)) {
-               percpu_ref_put(&c->writes);
+       if (IS_ERR(reserve))
                return ERR_CAST(reserve);
-       }
 
        as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
        memset(as, 0, sizeof(*as));
@@ -1000,11 +1036,10 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
        as->reserve     = reserve;
        INIT_LIST_HEAD(&as->write_blocked_list);
 
-       bch2_keylist_init(&as->parent_keys, as->inline_keys,
-                        ARRAY_SIZE(as->inline_keys));
+       bch2_keylist_init(&as->parent_keys, as->inline_keys);
 
        mutex_lock(&c->btree_interior_update_lock);
-       list_add(&as->list, &c->btree_interior_update_list);
+       list_add_tail(&as->list, &c->btree_interior_update_list);
        mutex_unlock(&c->btree_interior_update_lock);
 
        return as;
@@ -1015,11 +1050,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
 static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 {
        /* Root nodes cannot be reaped */
-       mutex_lock(&c->btree_cache_lock);
+       mutex_lock(&c->btree_cache.lock);
        list_del_init(&b->list);
-       mutex_unlock(&c->btree_cache_lock);
+       mutex_unlock(&c->btree_cache.lock);
 
        mutex_lock(&c->btree_root_lock);
+       BUG_ON(btree_node_root(c, b) &&
+              (b->level < btree_node_root(c, b)->level ||
+               !btree_node_dying(btree_node_root(c, b))));
+
        btree_node_root(c, b) = b;
        mutex_unlock(&c->btree_root_lock);
 
@@ -1030,24 +1069,31 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 {
        struct bch_fs *c = as->c;
        struct btree *old = btree_node_root(c, b);
-       struct bch_fs_usage stats = { 0 };
+       struct bch_fs_usage *fs_usage;
 
        __bch2_btree_set_root_inmem(c, b);
 
-       bch2_mark_key(c, bkey_i_to_s_c(&b->key),
-                     c->opts.btree_node_size, true,
+       mutex_lock(&c->btree_interior_update_lock);
+       percpu_down_read_preempt_disable(&c->mark_lock);
+       fs_usage = bch2_fs_usage_get_scratch(c);
+
+       bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
+                     true, 0,
                      gc_pos_btree_root(b->btree_id),
-                     &stats, 0);
+                     fs_usage, 0, 0);
 
-       if (old)
+       if (old && !btree_node_fake(old))
                bch2_btree_node_free_index(as, NULL,
                                           bkey_i_to_s_c(&old->key),
-                                          &stats);
-       bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                                          fs_usage);
+       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                            gc_pos_btree_root(b->btree_id));
+
+       percpu_up_read_preempt_enable(&c->mark_lock);
+       mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
+static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
 {
        struct btree_root *r = &c->btree_roots[b->btree_id];
 
@@ -1057,6 +1103,8 @@ static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
        bkey_copy(&r->key, &b->key);
        r->level = b->level;
        r->alive = true;
+       if (rw == WRITE)
+               c->btree_roots_dirty = true;
 
        mutex_unlock(&c->btree_root_lock);
 }
@@ -1080,7 +1128,8 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
        struct btree *old;
 
        trace_btree_set_root(c, b);
-       BUG_ON(!b->written);
+       BUG_ON(!b->written &&
+              !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
 
        old = btree_node_root(c, b);
 
@@ -1112,19 +1161,22 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
                                        struct btree_node_iter *node_iter)
 {
        struct bch_fs *c = as->c;
-       struct bch_fs_usage stats = { 0 };
+       struct bch_fs_usage *fs_usage;
        struct bkey_packed *k;
        struct bkey tmp;
 
        BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
 
-       if (bkey_extent_is_data(&insert->k))
-               bch2_mark_key(c, bkey_i_to_s_c(insert),
-                            c->opts.btree_node_size, true,
-                            gc_pos_btree_node(b), &stats, 0);
+       mutex_lock(&c->btree_interior_update_lock);
+       percpu_down_read_preempt_disable(&c->mark_lock);
+       fs_usage = bch2_fs_usage_get_scratch(c);
+
+       bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
+                            true, 0,
+                            gc_pos_btree_node(b), fs_usage, 0, 0);
 
        while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
-              !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
+              bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
                bch2_btree_node_iter_advance(node_iter, b);
 
        /*
@@ -1134,11 +1186,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
        if (k && !bkey_cmp_packed(b, k, &insert->k))
                bch2_btree_node_free_index(as, b,
                                           bkey_disassemble(b, k, &tmp),
-                                          &stats);
+                                          fs_usage);
 
-       bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                            gc_pos_btree_node(b));
 
+       percpu_up_read_preempt_enable(&c->mark_lock);
+       mutex_unlock(&c->btree_interior_update_lock);
+
        bch2_btree_bset_insert_key(iter, b, node_iter, insert);
        set_btree_node_dirty(b);
        set_btree_node_need_write(b);
@@ -1161,6 +1216,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
 
        n2->data->max_key       = n1->data->max_key;
        n2->data->format        = n1->format;
+       SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
        n2->key.k.p = n1->key.k.p;
 
        btree_node_set_format(n2, n2->data->format);
@@ -1254,7 +1310,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 
        BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
 
-       bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
+       bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
 
        while (!bch2_keylist_empty(keys)) {
                k = bch2_keylist_front(keys);
@@ -1292,10 +1348,11 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 }
 
 static void btree_split(struct btree_update *as, struct btree *b,
-                       struct btree_iter *iter, struct keylist *keys)
+                       struct btree_iter *iter, struct keylist *keys,
+                       unsigned flags)
 {
        struct bch_fs *c = as->c;
-       struct btree *parent = iter->nodes[b->level + 1];
+       struct btree *parent = btree_node_parent(iter, b);
        struct btree *n1, *n2 = NULL, *n3 = NULL;
        u64 start_time = local_clock();
 
@@ -1319,7 +1376,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
                six_unlock_write(&n2->lock);
                six_unlock_write(&n1->lock);
 
-               bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent);
+               bch2_btree_node_write(c, n2, SIX_LOCK_intent);
 
                /*
                 * Note that on recursive parent_keys == keys, so we
@@ -1337,7 +1394,8 @@ static void btree_split(struct btree_update *as, struct btree *b,
                        n3->sib_u64s[1] = U16_MAX;
 
                        btree_split_insert_keys(as, n3, iter, &as->parent_keys);
-                       bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent);
+
+                       bch2_btree_node_write(c, n3, SIX_LOCK_intent);
                }
        } else {
                trace_btree_compact(c, b);
@@ -1348,13 +1406,13 @@ static void btree_split(struct btree_update *as, struct btree *b,
                bch2_keylist_add(&as->parent_keys, &n1->key);
        }
 
-       bch2_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent);
+       bch2_btree_node_write(c, n1, SIX_LOCK_intent);
 
        /* New nodes all written, now make them visible: */
 
        if (parent) {
                /* Split a non root node */
-               bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
        } else if (n3) {
                bch2_btree_set_root(as, n3, iter);
        } else {
@@ -1362,58 +1420,39 @@ static void btree_split(struct btree_update *as, struct btree *b,
                bch2_btree_set_root(as, n1, iter);
        }
 
-       bch2_btree_open_bucket_put(c, n1);
+       bch2_open_buckets_put(c, &n1->ob);
        if (n2)
-               bch2_btree_open_bucket_put(c, n2);
+               bch2_open_buckets_put(c, &n2->ob);
        if (n3)
-               bch2_btree_open_bucket_put(c, n3);
-
-       /*
-        * Note - at this point other linked iterators could still have @b read
-        * locked; we're depending on the bch2_btree_iter_node_replace() calls
-        * below removing all references to @b so we don't return with other
-        * iterators pointing to a node they have locked that's been freed.
-        *
-        * We have to free the node first because the bch2_iter_node_replace()
-        * calls will drop _our_ iterator's reference - and intent lock - to @b.
-        */
-       bch2_btree_node_free_inmem(c, b, iter);
+               bch2_open_buckets_put(c, &n3->ob);
 
        /* Successful split, update the iterator to point to the new nodes: */
 
+       bch2_btree_iter_node_drop(iter, b);
        if (n3)
                bch2_btree_iter_node_replace(iter, n3);
        if (n2)
                bch2_btree_iter_node_replace(iter, n2);
        bch2_btree_iter_node_replace(iter, n1);
 
-       bch2_time_stats_update(&c->btree_split_time, start_time);
+       bch2_btree_node_free_inmem(c, b, iter);
+
+       bch2_btree_iter_verify_locks(iter);
+
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
 }
 
-static int
+static void
 bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
                                struct btree_iter *iter, struct keylist *keys)
 {
-       struct bch_fs *c = as->c;
        struct btree_iter *linked;
        struct btree_node_iter node_iter;
        struct bkey_i *insert = bch2_keylist_front(keys);
        struct bkey_packed *k;
 
-       BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
-       BUG_ON(!b->level);
-       BUG_ON(!as || as->b);
-       bch2_verify_keylist_sorted(keys);
-
-       bch2_btree_node_lock_for_insert(c, b, iter);
-
-       if (bch_keylist_u64s(keys) > bch_btree_keys_u64s_remaining(c, b)) {
-               bch2_btree_node_unlock_write(b, iter);
-               return -1;
-       }
-
        /* Don't screw up @iter's position: */
-       node_iter = iter->node_iters[b->level];
+       node_iter = iter->l[b->level].iter;
 
        /*
         * btree_split(), btree_gc_coalesce() will insert keys before
@@ -1433,20 +1472,10 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
 
        btree_update_updated_node(as, b);
 
-       for_each_linked_btree_node(iter, b, linked)
-               bch2_btree_node_iter_peek(&linked->node_iters[b->level],
-                                        b);
-       bch2_btree_node_iter_peek(&iter->node_iters[b->level], b);
+       for_each_btree_iter_with_node(iter, b, linked)
+               bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
 
        bch2_btree_iter_verify(iter, b);
-
-       if (bch2_maybe_compact_whiteouts(c, b))
-               bch2_btree_iter_reinit_node(iter, b);
-
-       bch2_btree_node_unlock_write(b, iter);
-
-       btree_node_interior_verify(b);
-       return 0;
 }
 
 /**
@@ -1462,27 +1491,83 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
  * for leaf nodes -- inserts into interior nodes have to be atomic.
  */
 void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
-                           struct btree_iter *iter, struct keylist *keys)
+                           struct btree_iter *iter, struct keylist *keys,
+                           unsigned flags)
 {
+       struct bch_fs *c = as->c;
+       int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+       int old_live_u64s = b->nr.live_u64s;
+       int live_u64s_added, u64s_added;
+
+       BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
        BUG_ON(!b->level);
+       BUG_ON(!as || as->b);
+       bch2_verify_keylist_sorted(keys);
+
+       if (as->must_rewrite)
+               goto split;
 
-       if ((as->flags & BTREE_INTERIOR_UPDATE_MUST_REWRITE) ||
-           bch2_btree_insert_keys_interior(as, b, iter, keys))
-               btree_split(as, b, iter, keys);
+       bch2_btree_node_lock_for_insert(c, b, iter);
+
+       if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
+               bch2_btree_node_unlock_write(b, iter);
+               goto split;
+       }
+
+       bch2_btree_insert_keys_interior(as, b, iter, keys);
+
+       live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+       u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+       if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+               b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+       if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+               b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+       if (u64s_added > live_u64s_added &&
+           bch2_maybe_compact_whiteouts(c, b))
+               bch2_btree_iter_reinit_node(iter, b);
+
+       bch2_btree_node_unlock_write(b, iter);
+
+       btree_node_interior_verify(b);
+
+       /*
+        * when called from the btree_split path the new nodes aren't added to
+        * the btree iterator yet, so the merge path's unlock/wait/relock dance
+        * won't work:
+        */
+       bch2_foreground_maybe_merge(c, iter, b->level,
+                                   flags|BTREE_INSERT_NOUNLOCK);
+       return;
+split:
+       btree_split(as, b, iter, keys, flags);
 }
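
The u64s bookkeeping above drives two heuristics: sibling merge estimates shrink when live keys were deleted, and whiteout compaction runs when the bset grew faster than its live content. The sibling clamp in isolation (hypothetical helper; U16_MAX means "unknown, don't merge on this side"):

        static inline void sketch_sib_u64s_shrink(u16 *sib_u64s,
                                                  int live_u64s_added)
        {
                /* estimates only ever shrink here, clamped at zero: */
                if (*sib_u64s != U16_MAX && live_u64s_added < 0)
                        *sib_u64s = max(0, (int) *sib_u64s + live_u64s_added);
        }
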
 
 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
-                         unsigned btree_reserve_flags)
+                         unsigned flags)
 {
-       struct btree *b = iter->nodes[0];
+       struct btree *b = iter->l[0].b;
        struct btree_update *as;
        struct closure cl;
        int ret = 0;
+       struct btree_iter *linked;
+
+       /*
+        * We already have a disk reservation and open buckets pinned; this
+        * allocation must not block:
+        */
+       for_each_btree_iter(iter, linked)
+               if (linked->btree_id == BTREE_ID_EXTENTS)
+                       flags |= BTREE_INSERT_USE_RESERVE;
 
        closure_init_stack(&cl);
 
        /* Hack, because gc and splitting nodes doesn't mix yet: */
        if (!down_read_trylock(&c->gc_lock)) {
+               if (flags & BTREE_INSERT_NOUNLOCK)
+                       return -EINTR;
+
                bch2_btree_iter_unlock(iter);
                down_read(&c->gc_lock);
 
@@ -1494,37 +1579,44 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
         * XXX: figure out how far we might need to split,
         * instead of locking/reserving all the way to the root:
         */
-       if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX,
+                       !(flags & BTREE_INSERT_NOUNLOCK))) {
                ret = -EINTR;
                goto out;
        }
 
        as = bch2_btree_update_start(c, iter->btree_id,
-                                    btree_update_reserve_required(c, b),
-                                    btree_reserve_flags, &cl);
+               btree_update_reserve_required(c, b), flags,
+               !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
        if (IS_ERR(as)) {
                ret = PTR_ERR(as);
                if (ret == -EAGAIN) {
+                       BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
                        bch2_btree_iter_unlock(iter);
-                       up_read(&c->gc_lock);
-                       closure_sync(&cl);
-                       return -EINTR;
+                       ret = -EINTR;
                }
                goto out;
        }
 
-       btree_split(as, b, iter, NULL);
+       btree_split(as, b, iter, NULL, flags);
        bch2_btree_update_done(as);
 
-       bch2_btree_iter_set_locks_want(iter, 1);
+       /*
+        * We haven't successfully inserted yet, so don't downgrade all the way
+        * back to read locks;
+        */
+       __bch2_btree_iter_downgrade(iter, 1);
 out:
        up_read(&c->gc_lock);
+       closure_sync(&cl);
        return ret;
 }
 
-int bch2_foreground_maybe_merge(struct bch_fs *c,
-                               struct btree_iter *iter,
-                               enum btree_node_sibling sib)
+void __bch2_foreground_maybe_merge(struct bch_fs *c,
+                                  struct btree_iter *iter,
+                                  unsigned level,
+                                  unsigned flags,
+                                  enum btree_node_sibling sib)
 {
        struct btree_update *as;
        struct bkey_format_state new_s;
@@ -1537,29 +1629,29 @@ int bch2_foreground_maybe_merge(struct bch_fs *c,
 
        closure_init_stack(&cl);
 retry:
-       if (!bch2_btree_node_relock(iter, iter->level))
-               return 0;
+       BUG_ON(!btree_node_locked(iter, level));
 
-       b = iter->nodes[iter->level];
+       b = iter->l[level].b;
 
-       parent = iter->nodes[b->level + 1];
+       parent = btree_node_parent(iter, b);
        if (!parent)
-               return 0;
+               goto out;
 
        if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
-               return 0;
+               goto out;
 
        /* XXX: can't be holding read locks */
-       m = bch2_btree_node_get_sibling(c, iter, b, sib);
+       m = bch2_btree_node_get_sibling(c, iter, b,
+                       !(flags & BTREE_INSERT_NOUNLOCK), sib);
        if (IS_ERR(m)) {
                ret = PTR_ERR(m);
-               goto out;
+               goto err;
        }
 
        /* NULL means no sibling: */
        if (!m) {
                b->sib_u64s[sib] = U16_MAX;
-               return 0;
+               goto out;
        }
 
        if (sib == btree_prev_sib) {
@@ -1589,35 +1681,31 @@ retry:
 
        if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
                six_unlock_intent(&m->lock);
-               return 0;
+               goto out;
        }
 
        /* We're changing btree topology, doesn't mix with gc: */
-       if (!down_read_trylock(&c->gc_lock)) {
-               six_unlock_intent(&m->lock);
-               bch2_btree_iter_unlock(iter);
+       if (!down_read_trylock(&c->gc_lock))
+               goto err_cycle_gc_lock;
 
-               down_read(&c->gc_lock);
-               up_read(&c->gc_lock);
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX,
+                       !(flags & BTREE_INSERT_NOUNLOCK))) {
                ret = -EINTR;
-               goto out;
-       }
-
-       if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
-               ret = -EINTR;
-               goto out_unlock;
+               goto err_unlock;
        }
 
        as = bch2_btree_update_start(c, iter->btree_id,
-                                    btree_update_reserve_required(c, b),
-                                    BTREE_INSERT_NOFAIL|
-                                    BTREE_INSERT_USE_RESERVE,
-                                    &cl);
+                        btree_update_reserve_required(c, parent) + 1,
+                        BTREE_INSERT_NOFAIL|
+                        BTREE_INSERT_USE_RESERVE,
+                        !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
        if (IS_ERR(as)) {
                ret = PTR_ERR(as);
-               goto out_unlock;
+               goto err_unlock;
        }
 
+       trace_btree_merge(c, b);
+
        bch2_btree_interior_update_will_free_node(as, b);
        bch2_btree_interior_update_will_free_node(as, m);
 
@@ -1641,50 +1729,83 @@ retry:
        bch2_keylist_add(&as->parent_keys, &delete);
        bch2_keylist_add(&as->parent_keys, &n->key);
 
-       bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
+       bch2_btree_node_write(c, n, SIX_LOCK_intent);
 
-       bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+       bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
 
-       bch2_btree_open_bucket_put(c, n);
-       bch2_btree_node_free_inmem(c, b, iter);
-       bch2_btree_node_free_inmem(c, m, iter);
+       bch2_open_buckets_put(c, &n->ob);
+
+       bch2_btree_iter_node_drop(iter, b);
        bch2_btree_iter_node_replace(iter, n);
 
        bch2_btree_iter_verify(iter, n);
 
+       bch2_btree_node_free_inmem(c, b, iter);
+       bch2_btree_node_free_inmem(c, m, iter);
+
        bch2_btree_update_done(as);
-out_unlock:
-       if (ret != -EINTR && ret != -EAGAIN)
-               bch2_btree_iter_set_locks_want(iter, 1);
-       six_unlock_intent(&m->lock);
+
        up_read(&c->gc_lock);
 out:
-       if (ret == -EAGAIN || ret == -EINTR) {
-               bch2_btree_iter_unlock(iter);
-               ret = -EINTR;
-       }
+       bch2_btree_iter_verify_locks(iter);
 
+       /*
+        * Don't downgrade locks here: we're called after successful insert,
+        * and the caller will downgrade locks after a successful insert
+        * anyways (in case e.g. a split was required first)
+        *
+        * And we're also called when inserting into interior nodes in the
+        * split path, and downgrading to read locks in there is potentially
+        * confusing:
+        */
        closure_sync(&cl);
+       return;
+
+err_cycle_gc_lock:
+       six_unlock_intent(&m->lock);
+
+       if (flags & BTREE_INSERT_NOUNLOCK)
+               goto out;
+
+       bch2_btree_iter_unlock(iter);
+
+       down_read(&c->gc_lock);
+       up_read(&c->gc_lock);
+       ret = -EINTR;
+       goto err;
 
-       if (ret == -EINTR) {
+err_unlock:
+       six_unlock_intent(&m->lock);
+       up_read(&c->gc_lock);
+err:
+       BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
+
+       if ((ret == -EAGAIN || ret == -EINTR) &&
+           !(flags & BTREE_INSERT_NOUNLOCK)) {
+               bch2_btree_iter_unlock(iter);
+               closure_sync(&cl);
                ret = bch2_btree_iter_traverse(iter);
-               if (!ret)
-                       goto retry;
+               if (ret)
+                       goto out;
+
+               goto retry;
        }
 
-       return ret;
+       goto out;
 }
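
The error labels above encode one contract: transient failures (-EAGAIN, -EINTR) may only be retried when we're allowed to drop locks; under BTREE_INSERT_NOUNLOCK the error propagates instead (and -EAGAIN is a bug). That test as a sketch (hypothetical helper):

        static inline bool sketch_may_retry(int ret, unsigned flags)
        {
                /* retrying means unlocking the iterator, which NOUNLOCK forbids: */
                return (ret == -EAGAIN || ret == -EINTR) &&
                       !(flags & BTREE_INSERT_NOUNLOCK);
        }
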
 
 static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
                                struct btree *b, unsigned flags,
                                struct closure *cl)
 {
-       struct btree *n, *parent = iter->nodes[b->level + 1];
+       struct btree *n, *parent = btree_node_parent(iter, b);
        struct btree_update *as;
 
        as = bch2_btree_update_start(c, iter->btree_id,
-                                    btree_update_reserve_required(c, b),
-                                    flags, cl);
+               (parent
+                ? btree_update_reserve_required(c, parent)
+                : 0) + 1,
+               flags, cl);
        if (IS_ERR(as)) {
                trace_btree_gc_rewrite_node_fail(c, b);
                return PTR_ERR(as);
@@ -1699,21 +1820,21 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 
        trace_btree_gc_rewrite_node(c, b);
 
-       bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
+       bch2_btree_node_write(c, n, SIX_LOCK_intent);
 
        if (parent) {
-               bch2_btree_insert_node(as, parent, iter,
-                                      &keylist_single(&n->key));
+               bch2_keylist_add(&as->parent_keys, &n->key);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
        } else {
                bch2_btree_set_root(as, n, iter);
        }
 
-       bch2_btree_open_bucket_put(c, n);
+       bch2_open_buckets_put(c, &n->ob);
 
+       bch2_btree_iter_node_drop(iter, b);
+       bch2_btree_iter_node_replace(iter, n);
        bch2_btree_node_free_inmem(c, b, iter);
 
-       BUG_ON(!bch2_btree_iter_node_replace(iter, n));
-
        bch2_btree_update_done(as);
        return 0;
 }
@@ -1727,7 +1848,6 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
                            __le64 seq, unsigned flags)
 {
-       unsigned locks_want = iter->locks_want;
        struct closure cl;
        struct btree *b;
        int ret;
@@ -1736,7 +1856,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 
        closure_init_stack(&cl);
 
-       bch2_btree_iter_set_locks_want(iter, U8_MAX);
+       bch2_btree_iter_upgrade(iter, U8_MAX, true);
 
        if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
                if (!down_read_trylock(&c->gc_lock)) {
@@ -1763,7 +1883,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
                closure_sync(&cl);
        }
 
-       bch2_btree_iter_set_locks_want(iter, locks_want);
+       bch2_btree_iter_downgrade(iter);
 
        if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
                up_read(&c->gc_lock);
@@ -1772,64 +1892,15 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
        return ret;
 }
 
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
-                              struct bkey_i_extent *new_key)
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+                                        struct btree_update *as,
+                                        struct btree_iter *iter,
+                                        struct btree *b, struct btree *new_hash,
+                                        struct bkey_i_btree_ptr *new_key)
 {
-       struct btree_update *as = NULL;
-       struct btree *parent, *new_hash = NULL;
-       struct btree_iter iter;
-       struct closure cl;
-       bool must_rewrite_parent = false;
+       struct btree *parent;
        int ret;
 
-       __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
-                              BTREE_MAX_DEPTH,
-                              b->level, 0);
-       closure_init_stack(&cl);
-
-       ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
-       if (ret)
-               return ret;
-
-retry:
-       down_read(&c->gc_lock);
-       ret = bch2_btree_iter_traverse(&iter);
-       if (ret)
-               goto err;
-
-       /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
-       if (!new_hash &&
-           PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
-               /* bch2_btree_reserve_get will unlock */
-               do {
-                       ret = bch2_btree_node_cannibalize_lock(c, &cl);
-                       closure_sync(&cl);
-               } while (ret == -EAGAIN);
-
-               BUG_ON(ret);
-
-               new_hash = bch2_btree_node_mem_alloc(c);
-       }
-
-       as = bch2_btree_update_start(c, iter.btree_id,
-                                    btree_update_reserve_required(c, b),
-                                    BTREE_INSERT_NOFAIL|
-                                    BTREE_INSERT_USE_RESERVE|
-                                    BTREE_INSERT_USE_ALLOC_RESERVE,
-                                    &cl);
-       if (IS_ERR(as)) {
-               ret = PTR_ERR(as);
-               if (ret == -EAGAIN || ret == -EINTR) {
-                       bch2_btree_iter_unlock(&iter);
-                       up_read(&c->gc_lock);
-                       closure_sync(&cl);
-                       goto retry;
-               }
-               goto err;
-       }
-
-       mutex_lock(&c->btree_interior_update_lock);
-
        /*
         * Two corner cases that need to be thought about here:
         *
@@ -1852,86 +1923,187 @@ retry:
         */
 
        if (b->will_make_reachable)
-               must_rewrite_parent = true;
-
-       /* other case: btree node being freed */
-       if (iter.nodes[b->level] != b) {
-               /* node has been freed: */
-               BUG_ON(btree_node_hashed(b));
-               mutex_unlock(&c->btree_interior_update_lock);
-               goto err;
-       }
-
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       if (must_rewrite_parent)
-               as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
+               as->must_rewrite = true;
 
        btree_interior_update_add_node_reference(as, b);
 
-       parent = iter.nodes[b->level + 1];
+       /*
+        * XXX: the rest of the update path treats this like we're actually
+        * inserting a new node and deleting the existing node, so the
+        * reservation needs to include enough space for @b
+        *
+        * that is actually pretty sketchy though, and I am surprised the code
+        * seems to work like that; definitely need to go back and rework it
+        * into something saner.
+        *
+        * (I think @b is just getting double counted until the btree update
+        * finishes and "deletes" @b on disk)
+        */
+       ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
+                       c->opts.btree_node_size *
+                       bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
+                       BCH_DISK_RESERVATION_NOFAIL|
+                       BCH_DISK_RESERVATION_GC_LOCK_HELD);
+       BUG_ON(ret);
+
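
The reservation added above sizes one full node per replica pointer in the new key, which is why the preceding comment notes @b is effectively double counted until the update completes and deletes the old node on disk. The sizing as a sketch (hypothetical helper mirroring the call above):

        static inline unsigned sketch_update_key_sectors(struct bch_fs *c,
                                                         struct bkey_i_btree_ptr *new_key)
        {
                /* one btree_node_size worth of sectors per pointer: */
                return c->opts.btree_node_size *
                       bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i));
        }
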
+       parent = btree_node_parent(iter, b);
        if (parent) {
                if (new_hash) {
                        bkey_copy(&new_hash->key, &new_key->k_i);
-                       BUG_ON(bch2_btree_node_hash_insert(c, new_hash,
-                                                       b->level, b->btree_id));
+                       ret = bch2_btree_node_hash_insert(&c->btree_cache,
+                                       new_hash, b->level, b->btree_id);
+                       BUG_ON(ret);
                }
 
-               bch2_btree_insert_node(as, parent, &iter,
-                                      &keylist_single(&new_key->k_i));
+               bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
 
                if (new_hash) {
-                       mutex_lock(&c->btree_cache_lock);
-                       bch2_btree_node_hash_remove(c, new_hash);
+                       mutex_lock(&c->btree_cache.lock);
+                       bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
 
-                       bch2_btree_node_hash_remove(c, b);
+                       bch2_btree_node_hash_remove(&c->btree_cache, b);
 
                        bkey_copy(&b->key, &new_key->k_i);
-                       ret = __bch2_btree_node_hash_insert(c, b);
+                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
                        BUG_ON(ret);
-                       mutex_unlock(&c->btree_cache_lock);
+                       mutex_unlock(&c->btree_cache.lock);
                } else {
                        bkey_copy(&b->key, &new_key->k_i);
                }
        } else {
-               struct bch_fs_usage stats = { 0 };
+               struct bch_fs_usage *fs_usage;
 
                BUG_ON(btree_node_root(c, b) != b);
 
-               bch2_btree_node_lock_write(b, &iter);
+               bch2_btree_node_lock_write(b, iter);
 
-               bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
-                             c->opts.btree_node_size, true,
+               mutex_lock(&c->btree_interior_update_lock);
+               percpu_down_read_preempt_disable(&c->mark_lock);
+               fs_usage = bch2_fs_usage_get_scratch(c);
+
+               bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
+                             true, 0,
                              gc_pos_btree_root(b->btree_id),
-                             &stats, 0);
+                             fs_usage, 0, 0);
                bch2_btree_node_free_index(as, NULL,
                                           bkey_i_to_s_c(&b->key),
-                                          &stats);
-               bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                                          fs_usage);
+               bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                                    gc_pos_btree_root(b->btree_id));
-               bkey_copy(&b->key, &new_key->k_i);
+
+               percpu_up_read_preempt_enable(&c->mark_lock);
+               mutex_unlock(&c->btree_interior_update_lock);
+
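+               /*
+                * If the new key hashes differently, @b has to be removed
+                * from the node hash table and reinserted under its new key:
+                */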
+               if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+                       mutex_lock(&c->btree_cache.lock);
+                       bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+                       bkey_copy(&b->key, &new_key->k_i);
+                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+                       BUG_ON(ret);
+                       mutex_unlock(&c->btree_cache.lock);
+               } else {
+                       bkey_copy(&b->key, &new_key->k_i);
+               }
 
                btree_update_updated_root(as);
-               bch2_btree_node_unlock_write(b, &iter);
+               bch2_btree_node_unlock_write(b, iter);
        }
 
        bch2_btree_update_done(as);
-out:
+}
+
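+/*
+ * Update a btree node's key: takes the gc lock, upgrades @iter to intent
+ * locks on every level, and (if the new key hashes differently)
+ * preallocates a node for rehashing before doing the actual update in
+ * __bch2_btree_node_update_key().
+ */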
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+                              struct btree *b,
+                              struct bkey_i_btree_ptr *new_key)
+{
+       struct btree *parent = btree_node_parent(iter, b);
+       struct btree_update *as = NULL;
+       struct btree *new_hash = NULL;
+       struct closure cl;
+       int ret;
+
+       closure_init_stack(&cl);
+
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX, true))
+               return -EINTR;
+
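+       /*
+        * Blocking on gc_lock while holding node locks could deadlock with
+        * gc, so if the trylock fails drop the iterator's locks first, then
+        * revalidate them afterwards:
+        */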
+       if (!down_read_trylock(&c->gc_lock)) {
+               bch2_btree_iter_unlock(iter);
+               down_read(&c->gc_lock);
+
+               if (!bch2_btree_iter_relock(iter)) {
+                       ret = -EINTR;
+                       goto err;
+               }
+       }
+
+       /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+       if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+               /* bch2_btree_reserve_get will unlock */
+               ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+               if (ret) {
+                       ret = -EINTR;
+
+                       bch2_btree_iter_unlock(iter);
+                       up_read(&c->gc_lock);
+                       closure_sync(&cl);
+                       down_read(&c->gc_lock);
+
+                       if (!bch2_btree_iter_relock(iter))
+                               goto err;
+               }
+
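+               /*
+                * Preallocate a node that can carry the new key in the hash
+                * table while @b is still indexed under the old one:
+                */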
+               new_hash = bch2_btree_node_mem_alloc(c);
+       }
+
+       as = bch2_btree_update_start(c, iter->btree_id,
+               parent ? btree_update_reserve_required(c, parent) : 0,
+               BTREE_INSERT_NOFAIL|
+               BTREE_INSERT_USE_RESERVE|
+               BTREE_INSERT_USE_ALLOC_RESERVE,
+               &cl);
+
+       if (IS_ERR(as)) {
+               ret = PTR_ERR(as);
+               if (ret == -EAGAIN)
+                       ret = -EINTR;
+
+               if (ret != -EINTR)
+                       goto err;
+
+               bch2_btree_iter_unlock(iter);
+               up_read(&c->gc_lock);
+               closure_sync(&cl);
+               down_read(&c->gc_lock);
+
+               if (!bch2_btree_iter_relock(iter))
+                       goto err;
+       }
+
+       ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i));
+       if (ret)
+               goto err_free_update;
+
+       __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+
+       bch2_btree_iter_downgrade(iter);
+err:
        if (new_hash) {
-               mutex_lock(&c->btree_cache_lock);
-               list_move(&new_hash->list, &c->btree_cache_freeable);
-               mutex_unlock(&c->btree_cache_lock);
+               mutex_lock(&c->btree_cache.lock);
+               list_move(&new_hash->list, &c->btree_cache.freeable);
+               mutex_unlock(&c->btree_cache.lock);
 
                six_unlock_write(&new_hash->lock);
                six_unlock_intent(&new_hash->lock);
        }
-       bch2_btree_iter_unlock(&iter);
        up_read(&c->gc_lock);
+       closure_sync(&cl);
        return ret;
-err:
-       if (as)
-               bch2_btree_update_free(as);
-       goto out;
+err_free_update:
+       bch2_btree_update_free(as);
+       goto err;
 }
 
 /* Init code: */
@@ -1945,48 +2117,76 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
        BUG_ON(btree_node_root(c, b));
 
        __bch2_btree_set_root_inmem(c, b);
-       bch2_btree_set_root_ondisk(c, b);
 }
 
-int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
-                         struct closure *writes)
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 {
-       struct btree_update *as;
        struct closure cl;
        struct btree *b;
+       int ret;
 
-       memset(&as, 0, sizeof(as));
        closure_init_stack(&cl);
 
-       while (1) {
-               /* XXX haven't calculated capacity yet :/ */
-               as = bch2_btree_update_start(c, id, 1,
-                                            BTREE_INSERT_USE_RESERVE|
-                                            BTREE_INSERT_USE_ALLOC_RESERVE,
-                                            &cl);
-               if (!IS_ERR(as))
-                       break;
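+       /*
+        * Take the cannibalize lock so the node allocation below is
+        * guaranteed to succeed:
+        */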
+       do {
+               ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+               closure_sync(&cl);
+       } while (ret);
 
-               if (PTR_ERR(as) == -ENOSPC)
-                       return PTR_ERR(as);
+       b = bch2_btree_node_mem_alloc(c);
+       bch2_btree_cache_cannibalize_unlock(c);
 
-               closure_sync(&cl);
-       }
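+       /*
+        * The new root is marked fake: it has no on-disk allocation backing
+        * it yet, so it must not be written out as is:
+        */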
+       set_btree_node_fake(b);
+       b->level        = 0;
+       b->btree_id     = id;
 
-       b = __btree_root_alloc(as, 0);
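+       /*
+        * Give the fake root an obviously bogus key, with a per-btree hash
+        * so each fake root gets its own hash table slot:
+        */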
+       bkey_btree_ptr_init(&b->key);
+       b->key.k.p = POS_MAX;
+       PTR_HASH(&b->key) = U64_MAX - id;
 
-       bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
-       btree_update_drop_new_node(c, b);
+       bch2_bset_init_first(b, &b->data->keys);
+       bch2_btree_build_aux_trees(b);
 
-       BUG_ON(btree_node_root(c, b));
+       b->data->min_key = POS_MIN;
+       b->data->max_key = POS_MAX;
+       b->data->format = bch2_btree_calc_format(b);
+       btree_node_set_format(b, b->data->format);
 
-       bch2_btree_set_root_inmem(as, b);
-       bch2_btree_set_root_ondisk(c, b);
+       ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
+                                         b->level, b->btree_id);
+       BUG_ON(ret);
+
+       __bch2_btree_set_root_inmem(c, b);
 
-       bch2_btree_open_bucket_put(c, b);
+       six_unlock_write(&b->lock);
        six_unlock_intent(&b->lock);
+}
 
-       bch2_btree_update_free(as);
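+/*
+ * One line per in-flight interior update: address, mode, nodes written,
+ * outstanding closure refs, and the journal seq being pinned. Hypothetical
+ * example output: "00000000deadbeef m 2 w 1 r 3 j 1234".
+ */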
+ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
+{
+       struct printbuf out = _PBUF(buf, PAGE_SIZE);
+       struct btree_update *as;
 
-       return 0;
+       mutex_lock(&c->btree_interior_update_lock);
+       list_for_each_entry(as, &c->btree_interior_update_list, list)
+               pr_buf(&out, "%p m %u w %u r %u j %llu\n",
+                      as,
+                      as->mode,
+                      as->nodes_written,
+                      atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
+                      as->journal.seq);
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       return out.pos - buf;
+}
+
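+/*
+ * Number of interior updates still in flight, for callers that need to wait
+ * for them to drain (e.g. when going read-only):
+ */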
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+{
+       size_t ret = 0;
+       struct list_head *i;
+
+       mutex_lock(&c->btree_interior_update_lock);
+       list_for_each(i, &c->btree_interior_update_list)
+               ret++;
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       return ret;
 }