Update bcachefs sources to 09a5465430 bcachefs: Don't need to walk inodes on clean...

diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 0a9d6919ccd334f615f952a490ee255ee5804dad..33b5cf40a5f48377f2a5fe3a6357fc77cba02eba 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -131,13 +131,15 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
 /* Btree node freeing/allocation: */
 
 static bool btree_key_matches(struct bch_fs *c,
-                             struct bkey_s_c_extent l,
-                             struct bkey_s_c_extent r)
+                             struct bkey_s_c l,
+                             struct bkey_s_c r)
 {
+       struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
+       struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
        const struct bch_extent_ptr *ptr1, *ptr2;
 
-       extent_for_each_ptr(l, ptr1)
-               extent_for_each_ptr(r, ptr2)
+       bkey_for_each_ptr(ptrs1, ptr1)
+               bkey_for_each_ptr(ptrs2, ptr2)
                        if (ptr1->dev == ptr2->dev &&
                            ptr1->gen == ptr2->gen &&
                            ptr1->offset == ptr2->offset)
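
This hunk generalizes pointer iteration from extents to any key type that
carries pointers. A minimal sketch of the new iterator pair, using only what
the hunk itself shows (bch2_bkey_ptrs_c() plus bkey_for_each_ptr()); the
helper name nr_shared_ptrs() is hypothetical:

static unsigned nr_shared_ptrs(struct bkey_s_c l, struct bkey_s_c r)
{
	struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
	struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
	const struct bch_extent_ptr *ptr1, *ptr2;
	unsigned nr = 0;

	/* O(n*m) comparison, same shape as btree_key_matches() above: */
	bkey_for_each_ptr(ptrs1, ptr1)
		bkey_for_each_ptr(ptrs2, ptr2)
			nr += ptr1->dev == ptr2->dev &&
			      ptr1->gen == ptr2->gen &&
			      ptr1->offset == ptr2->offset;

	return nr;
}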
@@ -159,33 +161,17 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
 {
        struct bch_fs *c = as->c;
        struct pending_btree_node_free *d;
-       unsigned replicas;
-
-       /*
-        * btree_update lock is only needed here to avoid racing with
-        * gc:
-        */
-       mutex_lock(&c->btree_interior_update_lock);
+       struct gc_pos pos = { 0 };
 
        for (d = as->pending; d < as->pending + as->nr_pending; d++)
                if (!bkey_cmp(k.k->p, d->key.k.p) &&
-                   btree_key_matches(c, bkey_s_c_to_extent(k),
-                                     bkey_i_to_s_c_extent(&d->key)))
+                   btree_key_matches(c, k, bkey_i_to_s_c(&d->key)))
                        goto found;
        BUG();
 found:
        BUG_ON(d->index_update_done);
        d->index_update_done = true;
 
-       /*
-        * Btree nodes are accounted as freed in bch_alloc_stats when they're
-        * freed from the index:
-        */
-       replicas = bch2_extent_nr_dirty_ptrs(k);
-       if (replicas)
-               stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
-                       c->opts.btree_node_size * replicas;
-
        /*
         * We're dropping @k from the btree, but it's still live until the
         * index update is persistent so we need to keep a reference around for
@@ -207,22 +193,14 @@ found:
         * bch2_mark_key() compares the current gc pos to the pos we're
         * moving this reference from, hence one comparison here:
         */
-       if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
-               struct bch_fs_usage tmp = { 0 };
-
-               bch2_mark_key(c, BKEY_TYPE_BTREE,
+       if (gc_pos_cmp(c->gc_pos, b
+                      ? gc_pos_btree_node(b)
+                      : gc_pos_btree_root(as->btree_id)) >= 0 &&
+           gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
+               bch2_mark_key_locked(c,
                              bkey_i_to_s_c(&d->key),
-                             false, 0, b
-                             ? gc_pos_btree_node(b)
-                             : gc_pos_btree_root(as->btree_id),
-                             &tmp, 0, 0);
-               /*
-                * Don't apply tmp - pending deletes aren't tracked in
-                * bch_alloc_stats:
-                */
-       }
-
-       mutex_unlock(&c->btree_interior_update_lock);
+                             false, 0, pos,
+                             NULL, 0, BCH_BUCKET_MARK_GC);
 }
 
 static void __btree_node_free(struct bch_fs *c, struct btree *b)
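
The comparison the comment above describes can be read as a window check. A
hypothetical helper (not in the patch) restating the new condition: the
pending delete only needs explicit GC marking if GC has already walked past
the position @k is being moved from, but has not yet reached the
pending-delete phase where it would see the free itself:

static bool gc_will_miss_key(struct bch_fs *c, struct btree *b,
			     enum btree_id btree_id)
{
	struct gc_pos key_pos = b
		? gc_pos_btree_node(b)
		: gc_pos_btree_root(btree_id);

	return gc_pos_cmp(c->gc_pos, key_pos) >= 0 &&
	       gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0;
}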
@@ -265,6 +243,11 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
                                struct btree_iter *iter)
 {
+       struct btree_iter *linked;
+
+       for_each_btree_iter(iter, linked)
+               BUG_ON(linked->l[b->level].b == b);
+
        /*
         * Is this a node that isn't reachable on disk yet?
         *
@@ -276,29 +259,21 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
         */
        btree_update_drop_new_node(c, b);
 
-       __bch2_btree_node_lock_write(b, iter);
+       six_lock_write(&b->lock);
        __btree_node_free(c, b);
        six_unlock_write(&b->lock);
-
-       bch2_btree_iter_node_drop(iter, b);
+       six_unlock_intent(&b->lock);
 }
 
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
                                        struct pending_btree_node_free *pending)
 {
-       struct bch_fs_usage stats = { 0 };
-
        BUG_ON(!pending->index_update_done);
 
-       bch2_mark_key(c, BKEY_TYPE_BTREE,
-                     bkey_i_to_s_c(&pending->key),
+       bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
                      false, 0,
                      gc_phase(GC_PHASE_PENDING_DELETE),
-                     &stats, 0, 0);
-       /*
-        * Don't apply stats - pending deletes aren't tracked in
-        * bch_alloc_stats:
-        */
+                     NULL, 0, 0);
 }
 
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
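
The free_inmem change relies on the six-lock discipline: write locks are
taken while already holding an intent lock, and since every iterator has
already dropped @b (per the new assertion above), both locks can be released
outright after the free. A commented restatement of the sequence, under that
assumption:

six_lock_write(&b->lock);	/* caller already holds intent */
__btree_node_free(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);	/* no iterator references @b anymore */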
@@ -309,7 +284,6 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        struct write_point *wp;
        struct btree *b;
        BKEY_PADDED(k) tmp;
-       struct bkey_i_extent *e;
        struct open_buckets ob = { .nr = 0 };
        struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
        unsigned nr_reserve;
@@ -339,7 +313,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
+       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
                                      writepoint_ptr(&c->btree_write_point),
                                      &devs_have,
                                      res->nr_replicas,
@@ -360,8 +334,8 @@ retry:
                goto retry;
        }
 
-       e = bkey_extent_init(&tmp.k);
-       bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
+       bkey_btree_ptr_init(&tmp.k);
+       bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
 
        bch2_open_bucket_get(c, wp, &ob);
        bch2_alloc_sectors_done(c, wp);
@@ -392,6 +366,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 
        set_btree_node_accessed(b);
        set_btree_node_dirty(b);
+       set_btree_node_need_write(b);
 
        bch2_bset_init_first(b, &b->data->keys);
        memset(&b->nr, 0, sizeof(b->nr));
@@ -399,7 +374,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        b->data->flags = 0;
        SET_BTREE_NODE_ID(b->data, as->btree_id);
        SET_BTREE_NODE_LEVEL(b->data, level);
-       b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
+       b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
 
        bch2_btree_build_aux_trees(b);
 
@@ -552,8 +527,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
                        goto err_free;
                }
 
-               ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE,
-                                             bkey_i_to_s_c(&b->key));
+               ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
                if (ret)
                        goto err_free;
 
@@ -588,7 +562,6 @@ static void bch2_btree_update_free(struct btree_update *as)
 
        closure_debug_destroy(&as->cl);
        mempool_free(as, &c->btree_interior_update_pool);
-       percpu_ref_put(&c->writes);
 
        closure_wake_up(&c->btree_interior_update_wait);
        mutex_unlock(&c->btree_interior_update_lock);
@@ -637,12 +610,12 @@ static void btree_update_wait_on_journal(struct closure *cl)
        int ret;
 
        ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
-       if (ret < 0)
-               goto err;
-       if (!ret) {
+       if (ret == -EAGAIN) {
                continue_at(cl, btree_update_wait_on_journal, system_wq);
                return;
        }
+       if (ret < 0)
+               goto err;
 
        bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
 err:
@@ -682,6 +655,12 @@ retry:
                closure_wait(&btree_current_write(b)->wait, cl);
 
                list_del(&as->write_blocked_list);
+
+               /*
+                * for flush_held_btree_writes() waiting on updates to flush or
+                * nodes to be writeable:
+                */
+               closure_wake_up(&c->btree_interior_update_wait);
                mutex_unlock(&c->btree_interior_update_lock);
 
                /*
@@ -985,6 +964,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
                list_del(&p->write_blocked_list);
                btree_update_reparent(as, p);
+
+               /*
+                * for flush_held_btree_writes() waiting on updates to flush or
+                * nodes to be writeable:
+                */
+               closure_wake_up(&c->btree_interior_update_wait);
        }
 
        clear_btree_node_dirty(b);
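
Both new closure_wake_up() calls pair with a waiter in
flush_held_btree_writes() (not shown in this diff). A sketch of what the
waiter side might look like, assuming the standard closure_wait_event()
primitive and the nr_pending helper visible at the end of this file:

closure_wait_event(&c->btree_interior_update_wait,
		   !bch2_btree_interior_updates_nr_pending(c));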
@@ -1038,14 +1023,9 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
        struct btree_reserve *reserve;
        struct btree_update *as;
 
-       if (unlikely(!percpu_ref_tryget(&c->writes)))
-               return ERR_PTR(-EROFS);
-
        reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
-       if (IS_ERR(reserve)) {
-               percpu_ref_put(&c->writes);
+       if (IS_ERR(reserve))
                return ERR_CAST(reserve);
-       }
 
        as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
        memset(as, 0, sizeof(*as));
@@ -1089,22 +1069,28 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 {
        struct bch_fs *c = as->c;
        struct btree *old = btree_node_root(c, b);
-       struct bch_fs_usage stats = { 0 };
+       struct bch_fs_usage *fs_usage;
 
        __bch2_btree_set_root_inmem(c, b);
 
-       bch2_mark_key(c, BKEY_TYPE_BTREE,
-                     bkey_i_to_s_c(&b->key),
+       mutex_lock(&c->btree_interior_update_lock);
+       percpu_down_read_preempt_disable(&c->mark_lock);
+       fs_usage = bch2_fs_usage_get_scratch(c);
+
+       bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
                      true, 0,
                      gc_pos_btree_root(b->btree_id),
-                     &stats, 0, 0);
+                     fs_usage, 0, 0);
 
        if (old && !btree_node_fake(old))
                bch2_btree_node_free_index(as, NULL,
                                           bkey_i_to_s_c(&old->key),
-                                          &stats);
-       bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                                          fs_usage);
+       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                            gc_pos_btree_root(b->btree_id));
+
+       percpu_up_read_preempt_enable(&c->mark_lock);
+       mutex_unlock(&c->btree_interior_update_lock);
 }
 
 static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
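
This is the first of three sites converted to the same marking pattern (see
also bch2_insert_fixup_btree_ptr() and __bch2_btree_node_update_key() below).
The shape, abstracted from the hunks themselves; @new, @old and @pos stand in
for the site-specific key pair and gc position:

mutex_lock(&c->btree_interior_update_lock);
percpu_down_read_preempt_disable(&c->mark_lock);
fs_usage = bch2_fs_usage_get_scratch(c);

/* mark the key being added, then drop the one it replaces: */
bch2_mark_key_locked(c, bkey_i_to_s_c(new), true, 0, pos,
		     fs_usage, 0, 0);
bch2_btree_node_free_index(as, b, bkey_i_to_s_c(old), fs_usage);

/* apply the net delta against this update's reservation: */
bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, pos);

percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);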
@@ -1175,17 +1161,19 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
                                        struct btree_node_iter *node_iter)
 {
        struct bch_fs *c = as->c;
-       struct bch_fs_usage stats = { 0 };
+       struct bch_fs_usage *fs_usage;
        struct bkey_packed *k;
        struct bkey tmp;
 
        BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
 
-       if (bkey_extent_is_data(&insert->k))
-               bch2_mark_key(c, BKEY_TYPE_BTREE,
-                             bkey_i_to_s_c(insert),
-                             true, 0,
-                             gc_pos_btree_node(b), &stats, 0, 0);
+       mutex_lock(&c->btree_interior_update_lock);
+       percpu_down_read_preempt_disable(&c->mark_lock);
+       fs_usage = bch2_fs_usage_get_scratch(c);
+
+       bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
+                            true, 0,
+                            gc_pos_btree_node(b), fs_usage, 0, 0);
 
        while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
               bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
@@ -1198,11 +1186,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
        if (k && !bkey_cmp_packed(b, k, &insert->k))
                bch2_btree_node_free_index(as, b,
                                           bkey_disassemble(b, k, &tmp),
-                                          &stats);
+                                          fs_usage);
 
-       bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                            gc_pos_btree_node(b));
 
+       percpu_up_read_preempt_enable(&c->mark_lock);
+       mutex_unlock(&c->btree_interior_update_lock);
+
        bch2_btree_bset_insert_key(iter, b, node_iter, insert);
        set_btree_node_dirty(b);
        set_btree_node_need_write(b);
@@ -1435,25 +1426,19 @@ static void btree_split(struct btree_update *as, struct btree *b,
        if (n3)
                bch2_open_buckets_put(c, &n3->ob);
 
-       /*
-        * Note - at this point other linked iterators could still have @b read
-        * locked; we're depending on the bch2_btree_iter_node_replace() calls
-        * below removing all references to @b so we don't return with other
-        * iterators pointing to a node they have locked that's been freed.
-        *
-        * We have to free the node first because the bch2_iter_node_replace()
-        * calls will drop _our_ iterator's reference - and intent lock - to @b.
-        */
-       bch2_btree_node_free_inmem(c, b, iter);
-
        /* Successful split, update the iterator to point to the new nodes: */
 
+       bch2_btree_iter_node_drop(iter, b);
        if (n3)
                bch2_btree_iter_node_replace(iter, n3);
        if (n2)
                bch2_btree_iter_node_replace(iter, n2);
        bch2_btree_iter_node_replace(iter, n1);
 
+       bch2_btree_node_free_inmem(c, b, iter);
+
+       bch2_btree_iter_verify_locks(iter);
+
        bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
 }
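
Note the reordering relative to the removed comment: the iterator now drops
@b and installs the new nodes before bch2_btree_node_free_inmem() runs, so
the assertion added to free_inmem earlier in this patch can verify that no
linked iterator still references @b. The same pattern, abstracted (names
mirror the btree_split hunk above; merge and rewrite below follow suit):

bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_replace(iter, n);	/* iterators now point at @n */
bch2_btree_node_free_inmem(c, b, iter);	/* asserts @b is unreferenced */
bch2_btree_iter_verify_locks(iter);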
 
@@ -1749,17 +1734,21 @@ retry:
        bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
 
        bch2_open_buckets_put(c, &n->ob);
-       bch2_btree_node_free_inmem(c, b, iter);
-       bch2_btree_node_free_inmem(c, m, iter);
+
+       bch2_btree_iter_node_drop(iter, b);
        bch2_btree_iter_node_replace(iter, n);
 
        bch2_btree_iter_verify(iter, n);
 
+       bch2_btree_node_free_inmem(c, b, iter);
+       bch2_btree_node_free_inmem(c, m, iter);
+
        bch2_btree_update_done(as);
 
-       six_unlock_intent(&m->lock);
        up_read(&c->gc_lock);
 out:
+       bch2_btree_iter_verify_locks(iter);
+
        /*
         * Don't downgrade locks here: we're called after successful insert,
         * and the caller will downgrade locks after a successful insert
@@ -1842,9 +1831,9 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 
        bch2_open_buckets_put(c, &n->ob);
 
-       bch2_btree_node_free_inmem(c, b, iter);
-
+       bch2_btree_iter_node_drop(iter, b);
        bch2_btree_iter_node_replace(iter, n);
+       bch2_btree_node_free_inmem(c, b, iter);
 
        bch2_btree_update_done(as);
        return 0;
@@ -1907,7 +1896,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                                         struct btree_update *as,
                                         struct btree_iter *iter,
                                         struct btree *b, struct btree *new_hash,
-                                        struct bkey_i_extent *new_key)
+                                        struct bkey_i_btree_ptr *new_key)
 {
        struct btree *parent;
        int ret;
@@ -1938,6 +1927,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 
        btree_interior_update_add_node_reference(as, b);
 
+       /*
+        * XXX: the rest of the update path treats this like we're actually
+        * inserting a new node and deleting the existing node, so the
+        * reservation needs to include enough space for @b
+        *
+        * that is actually sketch as fuck though and I am surprised the code
+        * seems to work like that, definitely need to go back and rework it
+        * into something saner.
+        *
+        * (I think @b is just getting double counted until the btree update
+        * finishes and "deletes" @b on disk)
+        */
+       ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
+                       c->opts.btree_node_size *
+                       bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
+                       BCH_DISK_RESERVATION_NOFAIL|
+                       BCH_DISK_RESERVATION_GC_LOCK_HELD);
+       BUG_ON(ret);
+
        parent = btree_node_parent(iter, b);
        if (parent) {
                if (new_hash) {
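
A worked example of the reservation sizing above (numbers hypothetical): with
c->opts.btree_node_size = 512 sectors and a new key carrying two pointers,
bch2_bkey_nr_ptrs() returns 2, so 512 * 2 = 1024 sectors are added to the
update's reservation. Per the XXX comment, @b is effectively counted twice
until the completed update deletes the old key on disk and the pending-free
accounting releases it.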
@@ -1964,23 +1972,29 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                        bkey_copy(&b->key, &new_key->k_i);
                }
        } else {
-               struct bch_fs_usage stats = { 0 };
+               struct bch_fs_usage *fs_usage;
 
                BUG_ON(btree_node_root(c, b) != b);
 
                bch2_btree_node_lock_write(b, iter);
 
-               bch2_mark_key(c, BKEY_TYPE_BTREE,
-                             bkey_i_to_s_c(&new_key->k_i),
+               mutex_lock(&c->btree_interior_update_lock);
+               percpu_down_read_preempt_disable(&c->mark_lock);
+               fs_usage = bch2_fs_usage_get_scratch(c);
+
+               bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
                              true, 0,
                              gc_pos_btree_root(b->btree_id),
-                             &stats, 0, 0);
+                             fs_usage, 0, 0);
                bch2_btree_node_free_index(as, NULL,
                                           bkey_i_to_s_c(&b->key),
-                                          &stats);
-               bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+                                          fs_usage);
+               bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
                                    gc_pos_btree_root(b->btree_id));
 
+               percpu_up_read_preempt_enable(&c->mark_lock);
+               mutex_unlock(&c->btree_interior_update_lock);
+
                if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
                        mutex_lock(&c->btree_cache.lock);
                        bch2_btree_node_hash_remove(&c->btree_cache, b);
@@ -2001,7 +2015,8 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 }
 
 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
-                              struct btree *b, struct bkey_i_extent *new_key)
+                              struct btree *b,
+                              struct bkey_i_btree_ptr *new_key)
 {
        struct btree *parent = btree_node_parent(iter, b);
        struct btree_update *as = NULL;
@@ -2067,8 +2082,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
                        goto err;
        }
 
-       ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE,
-                                     extent_i_to_s_c(new_key).s_c);
+       ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i));
        if (ret)
                goto err_free_update;
 
@@ -2103,7 +2117,6 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
        BUG_ON(btree_node_root(c, b));
 
        __bch2_btree_set_root_inmem(c, b);
-       bch2_btree_set_root_ondisk(c, b, READ);
 }
 
 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
@@ -2126,9 +2139,9 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
        b->level        = 0;
        b->btree_id     = id;
 
-       bkey_extent_init(&b->key);
+       bkey_btree_ptr_init(&b->key);
        b->key.k.p = POS_MAX;
-       bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id;
+       PTR_HASH(&b->key) = U64_MAX - id;
 
        bch2_bset_init_first(b, &b->data->keys);
        bch2_btree_build_aux_trees(b);
@@ -2149,20 +2162,20 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 
 ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
 {
-       char *out = buf, *end = buf + PAGE_SIZE;
+       struct printbuf out = _PBUF(buf, PAGE_SIZE);
        struct btree_update *as;
 
        mutex_lock(&c->btree_interior_update_lock);
        list_for_each_entry(as, &c->btree_interior_update_list, list)
-               out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n",
-                                as,
-                                as->mode,
-                                as->nodes_written,
-                                atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
-                                as->journal.seq);
+               pr_buf(&out, "%p m %u w %u r %u j %llu\n",
+                      as,
+                      as->mode,
+                      as->nodes_written,
+                      atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
+                      as->journal.seq);
        mutex_unlock(&c->btree_interior_update_lock);
 
-       return out - buf;
+       return out.pos - buf;
 }
 
 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
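
The printbuf conversion above replaces manual scnprintf() pointer
bookkeeping. A minimal usage sketch built only from what the hunk shows
(_PBUF(), pr_buf(), and the pos member); the surrounding function is
hypothetical:

static ssize_t print_example(struct bch_fs *c, char *buf)
{
	struct printbuf out = _PBUF(buf, PAGE_SIZE);

	/* pr_buf() appends formatted text, clamped to the buffer size: */
	pr_buf(&out, "pending interior updates: %zu\n",
	       bch2_btree_interior_updates_nr_pending(c));

	/* bytes written so far: */
	return out.pos - buf;
}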