Update bcachefs sources to bf340e68c7 bcachefs: Ignore cached data when calculating...
author Kent Overstreet <kent.overstreet@gmail.com>
Tue, 11 Jan 2022 02:41:51 +0000 (21:41 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Tue, 11 Jan 2022 02:41:51 +0000 (21:41 -0500)
28 files changed:
.bcachefs_revision
include/trace/events/bcachefs.h
libbcachefs/alloc_background.c
libbcachefs/bcachefs.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/buckets_waiting_for_journal.c [new file with mode: 0644]
libbcachefs/buckets_waiting_for_journal.h [new file with mode: 0644]
libbcachefs/buckets_waiting_for_journal_types.h [new file with mode: 0644]
libbcachefs/inode.c
libbcachefs/journal_io.c
libbcachefs/move.c
libbcachefs/movinggc.c
libbcachefs/subvolume.c
libbcachefs/subvolume.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/tests.c

index 8226b3a6351d572c63a49990c2d0e07e5a63d9e8..71e83e28f05d0f5e6b1c582cfcaa88165682d826 100644 (file)
@@ -1 +1 @@
-5242db9aec10220b6ee7162ba7bec173417348cf
+bf340e68c74cdb70c692698ef7367b9dc6f6e61f
index 295dcd60e70470d54b88872815738e22665134f5..8f10d13b27d565dff4209e17b8726fbba82d1d43 100644 (file)
@@ -346,6 +346,52 @@ TRACE_EVENT(btree_cache_scan,
                  __entry->ret)
 );
 
+TRACE_EVENT(btree_node_relock_fail,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos,
+                unsigned long node,
+                u32 iter_lock_seq,
+                u32 node_lock_seq),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq),
+
+       TP_STRUCT__entry(
+               __array(char,                   trans_fn, 24    )
+               __array(char,                   caller, 32      )
+               __field(u8,                     btree_id        )
+               __field(u64,                    pos_inode       )
+               __field(u64,                    pos_offset      )
+               __field(u32,                    pos_snapshot    )
+               __field(unsigned long,          node            )
+               __field(u32,                    iter_lock_seq   )
+               __field(u32,                    node_lock_seq   )
+       ),
+
+       TP_fast_assign(
+               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
+               __entry->btree_id               = btree_id;
+               __entry->pos_inode              = pos->inode;
+               __entry->pos_offset             = pos->offset;
+               __entry->pos_snapshot           = pos->snapshot;
+               __entry->node                   = node;
+               __entry->iter_lock_seq          = iter_lock_seq;
+               __entry->node_lock_seq          = node_lock_seq;
+       ),
+
+       TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+                 __entry->trans_fn,
+                 __entry->caller,
+                 __entry->btree_id,
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot,
+                 __entry->node,
+                 __entry->iter_lock_seq,
+                 __entry->node_lock_seq)
+);
+
 /* Garbage collection */
 
 DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
@@ -621,7 +667,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
 
        TP_STRUCT__entry(
                __array(char,                   trans_fn, 24    )
-               __field(unsigned long,          caller_ip       )
+               __array(char,                   caller, 32      )
                __field(u8,                     btree_id        )
                __field(u64,                    pos_inode       )
                __field(u64,                    pos_offset      )
@@ -630,16 +676,16 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
 
        TP_fast_assign(
                strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
-               __entry->caller_ip              = caller_ip;
+               snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
                __entry->btree_id               = btree_id;
                __entry->pos_inode              = pos->inode;
                __entry->pos_offset             = pos->offset;
                __entry->pos_snapshot           = pos->snapshot;
        ),
 
-       TP_printk("%s %pS btree %u pos %llu:%llu:%u",
+       TP_printk("%s %s btree %u pos %llu:%llu:%u",
                  __entry->trans_fn,
-                 (void *) __entry->caller_ip,
+                 __entry->caller,
                  __entry->btree_id,
                  __entry->pos_inode,
                  __entry->pos_offset,
@@ -694,6 +740,54 @@ DEFINE_EVENT(transaction_restart_iter,     trans_restart_relock,
        TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
 DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
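
The hunks above switch these tracepoints from recording the caller as a raw
unsigned long (resolved with %pS at print time) to formatting the symbol into
a fixed char array at record time, in TP_fast_assign. A minimal userspace
sketch of the same record-time resolution, with dladdr() standing in for the
kernel's %pS; the names are illustrative, not bcachefs API (build with
gcc -rdynamic sketch.c -ldl):

    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <stdio.h>

    struct event {
            char caller[32];        /* resolved at record time, like __entry->caller */
    };

    static void record_event(struct event *e, void *caller_ip)
    {
            Dl_info info;

            /* Resolve the symbol now, while the address can still be mapped: */
            if (dladdr(caller_ip, &info) && info.dli_sname)
                    snprintf(e->caller, sizeof(e->caller), "%s", info.dli_sname);
            else
                    snprintf(e->caller, sizeof(e->caller), "%p", caller_ip);
    }

    void traced_operation(struct event *e)
    {
            record_event(e, __builtin_return_address(0));
    }

    int main(void)
    {
            struct event e;

            traced_operation(&e);
            printf("caller: %s\n", e.caller);   /* prints "main" with -rdynamic */
            return 0;
    }
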
index 688a53b4ca580f97b584fc4da0b3d338731a171b..7ad16c21eb08cf2fd8d85e98b77cc907aecb59fd 100644 (file)
@@ -9,6 +9,7 @@
 #include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "ec.h"
@@ -463,19 +464,20 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
+       struct bkey_s_c k;
        struct bkey_alloc_unpacked u;
        u64 *time, now;
        int ret = 0;
 
        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
                             BTREE_ITER_CACHED|
-                            BTREE_ITER_CACHED_NOFILL|
                             BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(&iter);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
        if (ret)
                goto out;
 
-       u = alloc_mem_to_key(c, &iter);
+       u = bch2_alloc_unpack(k);
 
        time = rw == READ ? &u.read_time : &u.write_time;
        now = atomic64_read(&c->io_clock[rw].now);
@@ -542,7 +544,7 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
 static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
                                u64 now, u64 last_seq_ondisk)
 {
-       unsigned used = bucket_sectors_used(m);
+       unsigned used = m.cached_sectors;
 
        if (used) {
                /*
@@ -561,8 +563,7 @@ static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
                 * keys when there's only a small difference, so that we can
                 * keep sequential buckets together:
                 */
-               return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
-                       (bucket_gc_gen(g) >> 4);
+               return bucket_gc_gen(g) >> 4;
        }
 }
 
@@ -611,6 +612,13 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
                if (!bch2_can_invalidate_bucket(ca, b, m))
                        continue;
 
+               if (!m.data_type &&
+                   bch2_bucket_needs_journal_commit(c, last_seq_ondisk,
+                                                    ca->dev_idx, b)) {
+                       ca->buckets_waiting_on_journal++;
+                       continue;
+               }
+
                if (e.nr && e.bucket + e.nr == b && e.key == key) {
                        e.nr++;
                } else {
@@ -647,6 +655,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 
        ca->inc_gen_needs_gc                    = 0;
        ca->inc_gen_really_needs_gc             = 0;
+       ca->buckets_waiting_on_journal          = 0;
 
        find_reclaimable_buckets_lru(c, ca);
 
@@ -658,56 +667,34 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
        return nr;
 }
 
-/*
- * returns sequence number of most recent journal entry that updated this
- * bucket:
- */
-static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
-{
-       if (m.journal_seq_valid) {
-               u64 journal_seq = atomic64_read(&c->journal.seq);
-               u64 bucket_seq  = journal_seq;
-
-               bucket_seq &= ~((u64) U16_MAX);
-               bucket_seq |= m.journal_seq;
-
-               if (bucket_seq > journal_seq)
-                       bucket_seq -= 1 << 16;
-
-               return bucket_seq;
-       } else {
-               return 0;
-       }
-}
-
 static int bucket_invalidate_btree(struct btree_trans *trans,
-                                  struct bch_dev *ca, u64 b)
+                                  struct bch_dev *ca, u64 b,
+                                  struct bkey_alloc_unpacked *u)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_alloc_unpacked u;
        struct btree_iter iter;
+       struct bkey_s_c k;
        int ret;
 
        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             POS(ca->dev_idx, b),
                             BTREE_ITER_CACHED|
-                            BTREE_ITER_CACHED_NOFILL|
                             BTREE_ITER_INTENT);
 
-       ret = bch2_btree_iter_traverse(&iter);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
        if (ret)
                goto err;
 
-       u = alloc_mem_to_key(c, &iter);
-
-       u.gen++;
-       u.data_type     = 0;
-       u.dirty_sectors = 0;
-       u.cached_sectors = 0;
-       u.read_time     = atomic64_read(&c->io_clock[READ].now);
-       u.write_time    = atomic64_read(&c->io_clock[WRITE].now);
+       *u = bch2_alloc_unpack(k);
+       u->gen++;
+       u->data_type            = 0;
+       u->dirty_sectors        = 0;
+       u->cached_sectors       = 0;
+       u->read_time            = atomic64_read(&c->io_clock[READ].now);
+       u->write_time           = atomic64_read(&c->io_clock[WRITE].now);
 
-       ret = bch2_alloc_write(trans, &iter, &u,
+       ret = bch2_alloc_write(trans, &iter, u,
                               BTREE_TRIGGER_BUCKET_INVALIDATE);
 err:
        bch2_trans_iter_exit(trans, &iter);
@@ -717,21 +704,24 @@ err:
 static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
                                      u64 *journal_seq, unsigned flags)
 {
-       struct bucket *g;
-       struct bucket_mark m;
+       struct bkey_alloc_unpacked u;
        size_t b;
+       u64 commit_seq = 0;
        int ret = 0;
 
+       /*
+        * If the read-only path is trying to shut down, we can't be generating
+        * new btree updates:
+        */
+       if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
+               return 1;
+
        BUG_ON(!ca->alloc_heap.used ||
               !ca->alloc_heap.data[0].nr);
        b = ca->alloc_heap.data[0].bucket;
 
        /* first, put on free_inc and mark as owned by allocator: */
        percpu_down_read(&c->mark_lock);
-       g = bucket(ca, b);
-       m = READ_ONCE(g->mark);
-
-       BUG_ON(m.dirty_sectors);
 
        bch2_mark_alloc_bucket(c, ca, b, true);
 
@@ -740,38 +730,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
        BUG_ON(!fifo_push(&ca->free_inc, b));
        spin_unlock(&c->freelist_lock);
 
-       /*
-        * If we're not invalidating cached data, we only increment the bucket
-        * gen in memory here, the incremented gen will be updated in the btree
-        * by bch2_trans_mark_pointer():
-        */
-       if (!m.cached_sectors &&
-           !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
-               BUG_ON(m.data_type);
-               bucket_cmpxchg(g, m, m.gen++);
-               *bucket_gen(ca, b) = m.gen;
-               percpu_up_read(&c->mark_lock);
-               goto out;
-       }
-
        percpu_up_read(&c->mark_lock);
 
-       /*
-        * If the read-only path is trying to shut down, we can't be generating
-        * new btree updates:
-        */
-       if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
-               ret = 1;
-               goto out;
-       }
-
-       ret = bch2_trans_do(c, NULL, journal_seq,
+       ret = bch2_trans_do(c, NULL, &commit_seq,
                            BTREE_INSERT_NOCHECK_RW|
                            BTREE_INSERT_NOFAIL|
                            BTREE_INSERT_JOURNAL_RESERVED|
                            flags,
-                           bucket_invalidate_btree(&trans, ca, b));
-out:
+                           bucket_invalidate_btree(&trans, ca, b, &u));
+
        if (!ret) {
                /* remove from alloc_heap: */
                struct alloc_heap_entry e, *top = ca->alloc_heap.data;
@@ -783,11 +750,17 @@ out:
                        heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
 
                /*
-                * Make sure we flush the last journal entry that updated this
-                * bucket (i.e. deleting the last reference) before writing to
-                * this bucket again:
+                * If we're invalidating cached data, we need to wait on the
+                * journal commit:
+                */
+               if (u.data_type)
+                       *journal_seq = max(*journal_seq, commit_seq);
+
+               /*
+                * We already waited on u.journal_seq when we filtered out
+                * buckets that need a journal commit:
                 */
-               *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+               BUG_ON(*journal_seq > u.journal_seq);
        } else {
                size_t b2;
 
@@ -954,8 +927,14 @@ static int bch2_allocator_thread(void *arg)
                        gc_count = c->gc_count;
                        nr = find_reclaimable_buckets(c, ca);
 
-                       trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
-                                        ca->inc_gen_really_needs_gc);
+                       if (!nr && ca->buckets_waiting_on_journal) {
+                               ret = bch2_journal_flush(&c->journal);
+                               if (ret)
+                                       goto stop;
+                       } else if (nr < (ca->mi.nbuckets >> 6) &&
+                                  ca->buckets_waiting_on_journal >= nr / 2) {
+                               bch2_journal_flush_async(&c->journal, NULL);
+                       }
 
                        if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
                             ca->inc_gen_really_needs_gc) &&
@@ -963,6 +942,9 @@ static int bch2_allocator_thread(void *arg)
                                atomic_inc(&c->kick_gc);
                                wake_up_process(c->gc_thread);
                        }
+
+                       trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+                                        ca->inc_gen_really_needs_gc);
                }
 
                ret = bch2_invalidate_buckets(c, ca);
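
The deleted bucket_journal_seq() reconstructed a full 64-bit journal sequence
from the 16 bits stored in the bucket mark; this commit retires that scheme in
favor of the explicit buckets_waiting_for_journal table queried above. A
standalone sketch of the retired low-bits reconstruction, assuming the stored
value is less than 2^16 entries behind the current sequence:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t widen_seq16(uint64_t current_seq, uint16_t stored_lo16)
    {
            /* Splice the stored low bits into the current sequence: */
            uint64_t seq = (current_seq & ~(uint64_t) UINT16_MAX) | stored_lo16;

            /* A result ahead of the current sequence means the low bits
             * wrapped since the store; step back one period: */
            if (seq > current_seq)
                    seq -= 1 << 16;

            return seq;
    }

    int main(void)
    {
            assert(widen_seq16(0x12345, 0x2345) == 0x12345);
            assert(widen_seq16(0x20010, 0xfff0) == 0x1fff0);  /* wrapped case */
            printf("ok\n");
            return 0;
    }
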
index c64db2bfd2a5532f94affef96b53e384001a58aa..a28ddcd5d7b727ef9b2d34b1219059ddda5a4639 100644 (file)
@@ -355,6 +355,7 @@ enum bch_time_stats {
 #include "alloc_types.h"
 #include "btree_types.h"
 #include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
 #include "clock_types.h"
 #include "ec_types.h"
 #include "journal_types.h"
@@ -482,6 +483,7 @@ struct bch_dev {
 
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
+       size_t                  buckets_waiting_on_journal;
 
        enum allocator_states   allocator_state;
 
@@ -777,6 +779,8 @@ struct bch_fs {
        struct mutex            write_points_hash_lock;
        unsigned                write_points_nr;
 
+       struct buckets_waiting_for_journal buckets_waiting_for_journal;
+
        /* GARBAGE COLLECTION */
        struct task_struct      *gc_thread;
        atomic_t                kick_gc;
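
The new buckets_waiting_for_journal member ties into the three files this
commit adds. Judging only from its use in alloc_background.c above, it
plausibly maps (device, bucket) to the journal sequence that must reach disk
before the bucket can be invalidated. A toy fixed-size table sketching that
interface; the names are hypothetical, and the real implementation lives in
libbcachefs/buckets_waiting_for_journal.c:

    #include <stdbool.h>
    #include <stdint.h>

    #define TABLE_SIZE 1024

    struct wait_entry {
            uint64_t dev_bucket;    /* (device << 56) | bucket */
            uint64_t journal_seq;   /* last journal entry that touched it */
    };

    struct buckets_waiting {
            struct wait_entry t[TABLE_SIZE];
    };

    static uint64_t bucket_key(unsigned dev, uint64_t bucket)
    {
            return ((uint64_t) dev << 56) | bucket;
    }

    static void mark_waiting(struct buckets_waiting *w, unsigned dev,
                             uint64_t bucket, uint64_t seq)
    {
            struct wait_entry *e = &w->t[bucket_key(dev, bucket) % TABLE_SIZE];

            e->dev_bucket  = bucket_key(dev, bucket);
            e->journal_seq = seq;
    }

    /* Same question bch2_bucket_needs_journal_commit() answers: was this
     * bucket touched by a journal entry that hasn't reached disk yet? */
    static bool needs_journal_commit(struct buckets_waiting *w,
                                     uint64_t last_seq_ondisk,
                                     unsigned dev, uint64_t bucket)
    {
            struct wait_entry *e = &w->t[bucket_key(dev, bucket) % TABLE_SIZE];

            return e->dev_bucket == bucket_key(dev, bucket) &&
                   e->journal_seq > last_seq_ondisk;
    }

    int main(void)
    {
            static struct buckets_waiting w;

            mark_waiting(&w, 0, 42, 100);
            return !needs_journal_commit(&w, 90, 0, 42);  /* exit 0: waiting */
    }
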
index fc6c4d4cd02fbd56ec266207783cda73eda68444..986d08d708cc9593238e482b226c16fb4d01fe2f 100644 (file)
@@ -666,6 +666,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         * been freed:
         */
        if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
+               trace_trans_restart_relock_parent_for_fill(trans->fn,
+                                       _THIS_IP_, btree_id, &path->pos);
                btree_trans_restart(trans);
                return ERR_PTR(-EINTR);
        }
@@ -713,6 +715,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
        }
 
        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
+               trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
+                                          btree_id, &path->pos);
                btree_trans_restart(trans);
                return ERR_PTR(-EINTR);
        }
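
Both new tracepoints here fire just before btree_trans_restart(), whose
-EINTR return tells the outer transaction loop to retry from the top. A
compilable toy of that restart discipline; the transaction body is a stub,
and in bcachefs the loop lives in bch2_trans_do() and friends:

    #include <errno.h>
    #include <stdio.h>

    static int attempts;

    /* Stand-in transaction body that fails to relock twice before
     * succeeding, the way a contended node fill might: */
    static int do_transaction(void)
    {
            return ++attempts < 3 ? -EINTR : 0;
    }

    int main(void)
    {
            int ret;

            do {
                    ret = do_transaction();   /* -EINTR means "restart me" */
            } while (ret == -EINTR);

            printf("committed after %d attempts\n", attempts);
            return ret;
    }
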
index a201052e8259191012d5761c1f18ee8dca2839b6..809c9a76230302b046dde303c0eb530dfacd0d46 100644 (file)
@@ -604,8 +604,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
                        if (data_type == BCH_DATA_btree) {
                                g2->_mark.data_type     = g->_mark.data_type    = data_type;
-                               g2->gen_valid           = g->gen_valid          = true;
                                set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        } else {
                                do_update = true;
                        }
@@ -1327,12 +1327,6 @@ static int bch2_gc_start(struct bch_fs *c,
 
        percpu_down_write(&c->mark_lock);
 
-       /*
-        * indicate to stripe code that we need to allocate for the gc stripes
-        * radix tree, too
-        */
-       gc_pos_set(c, gc_phase(GC_PHASE_START));
-
        for_each_member_device(ca, c, i) {
                struct bucket_array *dst = __bucket_array(ca, 1);
                struct bucket_array *src = __bucket_array(ca, 0);
@@ -1360,6 +1354,27 @@ static int bch2_gc_start(struct bch_fs *c,
        return 0;
 }
 
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *buckets = __bucket_array(ca, true);
+               struct bucket *g;
+
+               for_each_bucket(g, buckets) {
+                       if (metadata_only &&
+                           (g->mark.data_type == BCH_DATA_user ||
+                            g->mark.data_type == BCH_DATA_cached ||
+                            g->mark.data_type == BCH_DATA_parity))
+                               continue;
+                       g->_mark.dirty_sectors = 0;
+                       g->_mark.cached_sectors = 0;
+               }
+       }
+}
+
 static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                                bool metadata_only)
 {
@@ -1430,6 +1445,55 @@ fsck_err:
        return ret;
 }
 
+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+                                bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct reflink_gc *r;
+       int ret = 0;
+
+       if (metadata_only)
+               return 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+       c->reflink_gc_nr = 0;
+
+       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               const __le64 *refcount = bkey_refcount_c(k);
+
+               if (!refcount)
+                       continue;
+
+               r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+                                      GFP_KERNEL);
+               if (!r) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               r->offset       = k.k->p.offset;
+               r->size         = k.k->size;
+               r->refcount     = 0;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
+                                 bool metadata_only)
+{
+       struct genradix_iter iter;
+       struct reflink_gc *r;
+
+       genradix_for_each(&c->reflink_gc_table, iter, r)
+               r->refcount = 0;
+}
+
 static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
                                bool metadata_only)
 {
@@ -1493,43 +1557,10 @@ fsck_err:
        return ret;
 }
 
-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
-                                bool metadata_only)
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
+                               bool metadata_only)
 {
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct reflink_gc *r;
-       int ret = 0;
-
-       if (metadata_only)
-               return 0;
-
-       bch2_trans_init(&trans, c, 0, 0);
-       c->reflink_gc_nr = 0;
-
-       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
-                          BTREE_ITER_PREFETCH, k, ret) {
-               const __le64 *refcount = bkey_refcount_c(k);
-
-               if (!refcount)
-                       continue;
-
-               r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
-                                      GFP_KERNEL);
-               if (!r) {
-                       ret = -ENOMEM;
-                       break;
-               }
-
-               r->offset       = k.k->p.offset;
-               r->size         = k.k->size;
-               r->refcount     = 0;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-
-       bch2_trans_exit(&trans);
-       return ret;
+       genradix_free(&c->gc_stripes);
 }
 
 /**
@@ -1565,11 +1596,13 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
        /* flush interior btree updates: */
        closure_wait_event(&c->btree_interior_update_wait,
                           !bch2_btree_interior_updates_nr_pending(c));
-again:
+
        ret   = bch2_gc_start(c, metadata_only) ?:
                bch2_gc_reflink_start(c, initial, metadata_only);
        if (ret)
                goto out;
+again:
+       gc_pos_set(c, gc_phase(GC_PHASE_START));
 
        bch2_mark_superblocks(c);
 
@@ -1607,25 +1640,26 @@ again:
 
        if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
            (!iter && bch2_test_restart_gc)) {
+               if (iter++ > 2) {
+                       bch_info(c, "Unable to fix bucket gens, looping");
+                       ret = -EINVAL;
+                       goto out;
+               }
+
                /*
                 * XXX: make sure gens we fixed got saved
                 */
-               if (iter++ <= 2) {
-                       bch_info(c, "Second GC pass needed, restarting:");
-                       clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
-                       __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
-
-                       percpu_down_write(&c->mark_lock);
-                       bch2_gc_free(c);
-                       percpu_up_write(&c->mark_lock);
-                       /* flush fsck errors, reset counters */
-                       bch2_flush_fsck_errs(c);
+               bch_info(c, "Second GC pass needed, restarting:");
+               clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+               __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
-                       goto again;
-               }
+               bch2_gc_stripes_reset(c, initial, metadata_only);
+               bch2_gc_alloc_reset(c, initial, metadata_only);
+               bch2_gc_reflink_reset(c, initial, metadata_only);
 
-               bch_info(c, "Unable to fix bucket gens, looping");
-               ret = -EINVAL;
+               /* flush fsck errors, reset counters */
+               bch2_flush_fsck_errs(c);
+               goto again;
        }
 out:
        if (!ret) {
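
The restructured retry above moves bch2_gc_start()/bch2_gc_reflink_start()
out of the loop: allocations happen once, repeat passes only reset state in
place via the new *_reset() helpers, and the pass counter now bails out
before resetting rather than after. The control flow, reduced to a
compilable sketch:

    #include <stdbool.h>
    #include <stdio.h>

    static int passes_needed = 2;   /* pretend the first pass finds bad gens */

    static bool need_another_pass(void) { return passes_needed-- > 1; }
    static void reset_gc_state(void) { /* clear counters in place */ }

    static int run_gc(void)
    {
            int iter = 0;

            /* allocate GC tables once, outside the retry loop */
    again:
            /* ... mark superblocks, btrees, allocator state ... */
            if (need_another_pass()) {
                    if (iter++ > 2) {
                            fprintf(stderr, "unable to fix bucket gens, looping\n");
                            return -1;
                    }
                    reset_gc_state();   /* cheaper and safer than free + realloc */
                    goto again;
            }
            return 0;
    }

    int main(void)
    {
            return run_gc();
    }
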
index 2ae4e523ff3b20192cc0d07992d5cc22c0a567ad..efe9b8cb9f1cde340ab7f481cfc9497b86f2c567 100644 (file)
@@ -178,19 +178,25 @@ bool __bch2_btree_node_relock(struct btree_trans *trans,
        int want = __btree_lock_want(path, level);
 
        if (!is_btree_node(path, level))
-               return false;
+               goto fail;
 
        if (race_fault())
-               return false;
+               goto fail;
 
        if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
            (btree_node_lock_seq_matches(path, b, level) &&
             btree_node_lock_increment(trans, b, level, want))) {
                mark_btree_node_locked(path, level, want);
                return true;
-       } else {
-               return false;
        }
+fail:
+       trace_btree_node_relock_fail(trans->fn, _RET_IP_,
+                                    path->btree_id,
+                                    &path->pos,
+                                    (unsigned long) b,
+                                    path->l[level].lock_seq,
+                                    is_btree_node(path, level) ? b->c.lock.state.seq : 0);
+       return false;
 }
 
 bool bch2_btree_node_upgrade(struct btree_trans *trans,
@@ -237,7 +243,7 @@ success:
 
 static inline bool btree_path_get_locks(struct btree_trans *trans,
                                        struct btree_path *path,
-                                       bool upgrade, unsigned long trace_ip)
+                                       bool upgrade)
 {
        unsigned l = path->level;
        int fail_idx = -1;
@@ -440,6 +446,8 @@ bool bch2_btree_path_relock_intent(struct btree_trans *trans,
                if (!bch2_btree_node_relock(trans, path, l)) {
                        __bch2_btree_path_unlock(path);
                        btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+                       trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
+                                                  path->btree_id, &path->pos);
                        btree_trans_restart(trans);
                        return false;
                }
@@ -452,10 +460,13 @@ __flatten
 static bool bch2_btree_path_relock(struct btree_trans *trans,
                        struct btree_path *path, unsigned long trace_ip)
 {
-       bool ret = btree_path_get_locks(trans, path, false, trace_ip);
+       bool ret = btree_path_get_locks(trans, path, false);
 
-       if (!ret)
+       if (!ret) {
+               trace_trans_restart_relock_path(trans->fn, trace_ip,
+                                               path->btree_id, &path->pos);
                btree_trans_restart(trans);
+       }
        return ret;
 }
 
@@ -469,7 +480,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
 
        path->locks_want = new_locks_want;
 
-       if (btree_path_get_locks(trans, path, true, _THIS_IP_))
+       if (btree_path_get_locks(trans, path, true))
                return true;
 
        /*
@@ -497,7 +508,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
                    linked->btree_id == path->btree_id &&
                    linked->locks_want < new_locks_want) {
                        linked->locks_want = new_locks_want;
-                       btree_path_get_locks(trans, linked, true, _THIS_IP_);
+                       btree_path_get_locks(trans, linked, true);
                }
 
        return false;
@@ -701,9 +712,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
 
        BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
 
-       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
-              iter->pos.snapshot != iter->snapshot);
-
        BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
               (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
 
@@ -711,6 +719,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
               (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
               !btree_type_has_snapshots(iter->btree_id));
 
+       if (iter->update_path)
+               bch2_btree_path_verify(trans, iter->update_path);
        bch2_btree_path_verify(trans, iter->path);
 }
 
@@ -1962,7 +1972,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
        locks_want = min(locks_want, BTREE_MAX_DEPTH);
        if (locks_want > path->locks_want) {
                path->locks_want = locks_want;
-               btree_path_get_locks(trans, path, true, _THIS_IP_);
+               btree_path_get_locks(trans, path, true);
        }
 
        return path;
@@ -2099,6 +2109,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
                __bch2_btree_path_unlock(path);
                path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
                path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+               trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_,
+                                          path->btree_id, &path->pos);
                btree_trans_restart(trans);
                ret = -EINTR;
                goto err;
@@ -2182,6 +2194,23 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
        return ret;
 }
 
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+                                                     enum btree_id btree_id,
+                                                     struct bpos pos)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_update(trans, i)
+               if ((cmp_int(btree_id,  i->btree_id) ?:
+                    bpos_cmp(pos,      i->k->k.p)) <= 0) {
+                       if (btree_id == i->btree_id)
+                               return i->k;
+                       break;
+               }
+
+       return NULL;
+}
+
 static noinline
 struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
                                          struct btree_path *path)
@@ -2218,21 +2247,15 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
        return k;
 }
 
-/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
- */
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
 {
        struct btree_trans *trans = iter->trans;
-       struct bpos search_key = btree_iter_search_key(iter);
        struct bkey_i *next_update;
        struct bkey_s_c k;
        int ret;
 
        EBUG_ON(iter->path->cached || iter->path->level);
        bch2_btree_iter_verify(iter);
-       bch2_btree_iter_verify_entry_exit(iter);
 
        while (1) {
                iter->path = btree_path_set_pos(trans, iter->path, search_key,
@@ -2277,24 +2300,6 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                }
 
                if (likely(k.k)) {
-                       /*
-                        * We can never have a key in a leaf node at POS_MAX, so
-                        * we don't have to check these successor() calls:
-                        */
-                       if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
-                           !bch2_snapshot_is_ancestor(trans->c,
-                                                      iter->snapshot,
-                                                      k.k->p.snapshot)) {
-                               search_key = bpos_successor(k.k->p);
-                               continue;
-                       }
-
-                       if (bkey_whiteout(k.k) &&
-                           !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
-                               search_key = bkey_successor(iter, k.k->p);
-                               continue;
-                       }
-
                        break;
                } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
                        /* Advance to next leaf node: */
@@ -2306,6 +2311,92 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                        goto out;
                }
        }
+out:
+       bch2_btree_iter_verify(iter);
+
+       return k;
+}
+
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ */
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+       struct btree_trans *trans = iter->trans;
+       struct bpos search_key = btree_iter_search_key(iter);
+       struct bkey_s_c k;
+       int ret;
+
+       if (iter->update_path) {
+               bch2_path_put(trans, iter->update_path,
+                             iter->flags & BTREE_ITER_INTENT);
+               iter->update_path = NULL;
+       }
+
+       bch2_btree_iter_verify_entry_exit(iter);
+
+       while (1) {
+               k = __bch2_btree_iter_peek(iter, search_key);
+               if (!k.k || bkey_err(k))
+                       goto out;
+
+               if (iter->update_path &&
+                   bkey_cmp(iter->update_path->pos, k.k->p)) {
+                       bch2_path_put(trans, iter->update_path,
+                                     iter->flags & BTREE_ITER_INTENT);
+                       iter->update_path = NULL;
+               }
+
+               if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+                   (iter->flags & BTREE_ITER_INTENT) &&
+                   !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+                   !iter->update_path) {
+                       struct bpos pos = k.k->p;
+
+                       if (pos.snapshot < iter->snapshot) {
+                               search_key = bpos_successor(k.k->p);
+                               continue;
+                       }
+
+                       pos.snapshot = iter->snapshot;
+
+                       /*
+                        * advance, same as on exit for iter->path, but only up
+                        * to snapshot
+                        */
+                       __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+                       iter->update_path = iter->path;
+
+                       iter->update_path = btree_path_set_pos(trans,
+                                               iter->update_path, pos,
+                                               iter->flags & BTREE_ITER_INTENT,
+                                               btree_iter_ip_allocated(iter));
+
+                       BUG_ON(!(iter->update_path->nodes_locked & 1));
+                       iter->update_path->should_be_locked = true;
+               }
+
+               /*
+                * We can never have a key in a leaf node at POS_MAX, so
+                * we don't have to check these successor() calls:
+                */
+               if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+                   !bch2_snapshot_is_ancestor(trans->c,
+                                              iter->snapshot,
+                                              k.k->p.snapshot)) {
+                       search_key = bpos_successor(k.k->p);
+                       continue;
+               }
+
+               if (bkey_whiteout(k.k) &&
+                   !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+                       search_key = bkey_successor(iter, k.k->p);
+                       continue;
+               }
+
+               break;
+       }
 
        /*
        * iter->pos should be monotonically increasing, and always be equal to
@@ -2316,21 +2407,27 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
        else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
                iter->pos = bkey_start_pos(k.k);
 
-       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
-               iter->pos.snapshot = iter->snapshot;
-
        iter->path = btree_path_set_pos(trans, iter->path, k.k->p,
                                iter->flags & BTREE_ITER_INTENT,
                                btree_iter_ip_allocated(iter));
        BUG_ON(!iter->path->nodes_locked);
 out:
+       if (iter->update_path) {
+               BUG_ON(!(iter->update_path->nodes_locked & 1));
+               iter->update_path->should_be_locked = true;
+       }
        iter->path->should_be_locked = true;
 
-       bch2_btree_iter_verify_entry_exit(iter);
-       bch2_btree_iter_verify(iter);
+       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+               iter->pos.snapshot = iter->snapshot;
+
        ret = bch2_btree_iter_verify_ret(iter, k);
-       if (unlikely(ret))
-               return bkey_s_c_err(ret);
+       if (unlikely(ret)) {
+               bch2_btree_iter_set_pos(iter, iter->pos);
+               k = bkey_s_c_err(ret);
+       }
+
+       bch2_btree_iter_verify_entry_exit(iter);
 
        return k;
 }
@@ -2720,7 +2817,11 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
        if (iter->path)
                bch2_path_put(trans, iter->path,
                              iter->flags & BTREE_ITER_INTENT);
+       if (iter->update_path)
+               bch2_path_put(trans, iter->update_path,
+                             iter->flags & BTREE_ITER_INTENT);
        iter->path = NULL;
+       iter->update_path = NULL;
 }
 
 static void __bch2_trans_iter_init(struct btree_trans *trans,
@@ -2750,6 +2851,7 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
 
        iter->trans     = trans;
        iter->path      = NULL;
+       iter->update_path = NULL;
        iter->btree_id  = btree_id;
        iter->min_depth = depth;
        iter->flags     = flags;
@@ -2798,6 +2900,8 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
        *dst = *src;
        if (src->path)
                __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+       if (src->update_path)
+               __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
 }
 
 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
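
btree_trans_peek_updates(), which this commit moves into btree_iter.c from
btree_update.h, walks the transaction's pending updates, kept sorted by
(btree_id, key position), and returns the first update at or after pos in the
given btree. A standalone model of that lookup, with simplified types;
gcc/clang only, for the kernel-style a ?: b shorthand:

    #include <stdint.h>
    #include <stdio.h>

    struct update {
            uint32_t btree_id;
            uint64_t pos;
    };

    static int cmp_u64(uint64_t a, uint64_t b)
    {
            return (a > b) - (a < b);
    }

    static struct update *peek_updates(struct update *u, unsigned nr,
                                       uint32_t btree_id, uint64_t pos)
    {
            for (unsigned i = 0; i < nr; i++)
                    /* First entry >= (btree_id, pos) in the sort order: */
                    if ((cmp_u64(btree_id, u[i].btree_id) ?:
                         cmp_u64(pos, u[i].pos)) <= 0)
                            return btree_id == u[i].btree_id ? &u[i] : NULL;

            return NULL;
    }

    int main(void)
    {
            struct update updates[] = { { 1, 10 }, { 1, 20 }, { 2, 5 } };
            struct update *u = peek_updates(updates, 3, 1, 15);

            printf("%s\n", u && u->pos == 20 ? "next update at pos 20" : "none");
            return 0;
    }
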
index eceec5d55f9be000c85404430a910e7265a94b6f..5205d53ce8dc15bff99af001a8e716ad3bd623ca 100644 (file)
@@ -222,11 +222,8 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
 bool bch2_btree_iter_advance(struct btree_iter *);
 bool bch2_btree_iter_rewind(struct btree_iter *);
 
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
 {
-       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
-               new_pos.snapshot = iter->snapshot;
-
        iter->k.type = KEY_TYPE_deleted;
        iter->k.p.inode         = iter->pos.inode       = new_pos.inode;
        iter->k.p.offset        = iter->pos.offset      = new_pos.offset;
@@ -234,6 +231,19 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
        iter->k.size = 0;
 }
 
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+       if (unlikely(iter->update_path))
+               bch2_path_put(iter->trans, iter->update_path,
+                             iter->flags & BTREE_ITER_INTENT);
+       iter->update_path = NULL;
+
+       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+               new_pos.snapshot = iter->snapshot;
+
+       __bch2_btree_iter_set_pos(iter, new_pos);
+}
+
 static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
 {
        BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
@@ -295,7 +305,7 @@ static inline int bkey_err(struct bkey_s_c k)
        return PTR_ERR_OR_ZERO(k.k);
 }
 
-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
                                                     unsigned flags)
 {
        return flags & BTREE_ITER_SLOTS
@@ -316,7 +326,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
        struct bkey_s_c k;
 
        while (btree_trans_too_many_iters(trans) ||
-              (k = __bch2_btree_iter_peek(iter, flags),
+              (k = bch2_btree_iter_peek_type(iter, flags),
                bkey_err(k) == -EINTR))
                bch2_trans_begin(trans);
 
@@ -335,7 +345,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
                           _start, _flags, _k, _ret)                    \
        for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),      \
                                  (_start), (_flags));                  \
-            (_k) = __bch2_btree_iter_peek(&(_iter), _flags),           \
+            (_k) = bch2_btree_iter_peek_type(&(_iter), _flags),        \
             !((_ret) = bkey_err(_k)) && (_k).k;                        \
             bch2_btree_iter_advance(&(_iter)))
 
@@ -347,7 +357,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
 
 #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
        for (;                                                          \
-            (_k) = __bch2_btree_iter_peek(&(_iter), _flags),           \
+            (_k) = bch2_btree_iter_peek_type(&(_iter), _flags),        \
             !((_ret) = bkey_err(_k)) && (_k).k;                        \
             bch2_btree_iter_advance(&(_iter)))
 
index 1d7b101224f1927a9a15d93b4a95fcfc6df822f7..faed51e7f4b86227f4f2ab8c70bfa12155fd047d 100644 (file)
@@ -222,7 +222,8 @@ static int btree_key_cache_fill(struct btree_trans *trans,
                goto err;
 
        if (!bch2_btree_node_relock(trans, ck_path, 0)) {
-               trace_transaction_restart_ip(trans->fn, _THIS_IP_);
+               trace_trans_restart_relock_key_cache_fill(trans->fn,
+                               _THIS_IP_, ck_path->btree_id, &ck_path->pos);
                ret = btree_trans_restart(trans);
                goto err;
        }
index 914d536cd29e75e3bbee43e74adcac94712aa2a4..65f460e3c567241cc68148d9820c3a5ba187a461 100644 (file)
@@ -276,6 +276,7 @@ static inline struct btree_path_level *path_l(struct btree_path *path)
 struct btree_iter {
        struct btree_trans      *trans;
        struct btree_path       *path;
+       struct btree_path       *update_path;
 
        enum btree_id           btree_id:4;
        unsigned                min_depth:4;
index 16ebf1a2b1f9977455491a303219b5eafbab7530..5e5a1b5e750eb1d75552c27c31743a5326072896 100644 (file)
@@ -73,8 +73,14 @@ int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
 int bch2_btree_node_update_key_get_iter(struct btree_trans *,
                                struct btree *, struct bkey_i *, bool);
 
+int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
+                            struct bkey_i *, enum btree_update_flags);
+
+int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+                                  struct bkey_i *, enum btree_update_flags);
 int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
                                   struct bkey_i *, enum btree_update_flags);
+
 void bch2_trans_commit_hook(struct btree_trans *,
                            struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *);
@@ -135,21 +141,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
             (_i) < (_trans)->updates + (_trans)->nr_updates;           \
             (_i)++)
 
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
-                                                     enum btree_id btree_id,
-                                                     struct bpos pos)
-{
-       struct btree_insert_entry *i;
-
-       trans_for_each_update(trans, i)
-               if ((cmp_int(btree_id,  i->btree_id) ?:
-                    bpos_cmp(pos,      i->k->k.p)) <= 0) {
-                       if (btree_id == i->btree_id)
-                               return i->k;
-                       break;
-               }
-
-       return NULL;
-}
-
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
index 47568a0bc5f18c456612b5cdd04ded6a3d06b632..7b8ca1153efebd25f6d55c9c5df6f82802f871b9 100644 (file)
@@ -1938,6 +1938,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
        ret = bch2_trans_commit(trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL|
                                BTREE_INSERT_NOCHECK_RW|
+                               BTREE_INSERT_USE_RESERVE|
                                BTREE_INSERT_JOURNAL_RECLAIM|
                                BTREE_INSERT_JOURNAL_RESERVED);
        if (ret)
index ca98e68551951f27833190093f3c135d96944206..7186457d198b0f005668e365c4eec0edfb0209b2 100644 (file)
@@ -828,7 +828,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
        struct bch_fs *c = trans->c;
        int ret;
 
-       if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+       if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) ||
+           test_bit(BCH_FS_STARTED, &c->flags))
                return -EROFS;
 
        bch2_trans_unlock(trans);
@@ -844,28 +845,63 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
        return 0;
 }
 
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+                          bool overwrite)
 {
        struct bkey             _deleted = KEY(0, 0, 0);
        struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
        struct bkey_s_c         old;
        struct bkey             unpacked;
-       struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
-       bool trans_trigger_run;
-       unsigned btree_id = 0;
        int ret = 0;
 
-       /*
-        *
-        * For a given btree, this algorithm runs insert triggers before
-        * overwrite triggers: this is so that when extents are being moved
-        * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
-        * they are re-added.
-        */
-       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
-               while (btree_id_start < trans->updates + trans->nr_updates &&
-                      btree_id_start->btree_id < btree_id)
-                       btree_id_start++;
+       if ((i->flags & BTREE_TRIGGER_NORUN) ||
+           !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+               return 0;
+
+       if (!overwrite) {
+               if (i->insert_trigger_run)
+                       return 0;
+
+               BUG_ON(i->overwrite_trigger_run);
+               i->insert_trigger_run = true;
+       } else {
+               if (i->overwrite_trigger_run)
+                       return 0;
+
+               BUG_ON(!i->insert_trigger_run);
+               i->overwrite_trigger_run = true;
+       }
+
+       old = bch2_btree_path_peek_slot(i->path, &unpacked);
+       _deleted.p = i->path->pos;
+
+       if (overwrite) {
+               ret = bch2_trans_mark_key(trans, old, deleted,
+                               BTREE_TRIGGER_OVERWRITE|i->flags);
+       } else if (old.k->type == i->k->k.type &&
+           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+               i->overwrite_trigger_run = true;
+               ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
+                               BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
+       } else {
+               ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
+                               BTREE_TRIGGER_INSERT|i->flags);
+       }
+
+       if (ret == -EINTR)
+               trace_trans_restart_mark(trans->fn, _RET_IP_,
+                                        i->btree_id, &i->path->pos);
+       return ret ?: 1;
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+                             struct btree_insert_entry *btree_id_start)
+{
+       struct btree_insert_entry *i;
+       bool trans_trigger_run;
+       int ret, overwrite;
+
+       for (overwrite = 0; overwrite < 2; overwrite++) {
 
                /*
                 * Running triggers will append more updates to the list of updates as
@@ -877,66 +913,39 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
                        for (i = btree_id_start;
                             i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
                             i++) {
-                               if (i->insert_trigger_run ||
-                                   (i->flags & BTREE_TRIGGER_NORUN) ||
-                                   !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
-                                       continue;
-
-                               BUG_ON(i->overwrite_trigger_run);
-
-                               i->insert_trigger_run = true;
-                               trans_trigger_run = true;
-
-                               old = bch2_btree_path_peek_slot(i->path, &unpacked);
-                               _deleted.p = i->path->pos;
-
-                               if (old.k->type == i->k->k.type &&
-                                   ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
-                                       i->overwrite_trigger_run = true;
-                                       ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
-                                                       BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
-                               } else {
-                                       ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
-                                                       BTREE_TRIGGER_INSERT|i->flags);
-                               }
-
-                               if (ret == -EINTR)
-                                       trace_trans_restart_mark(trans->fn, _RET_IP_,
-                                                       i->btree_id, &i->path->pos);
-                               if (ret)
+                               ret = run_one_trigger(trans, i, overwrite);
+                               if (ret < 0)
                                        return ret;
+                               if (ret)
+                                       trans_trigger_run = true;
                        }
                } while (trans_trigger_run);
+       }
 
-               do {
-                       trans_trigger_run = false;
-
-                       for (i = btree_id_start;
-                            i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
-                            i++) {
-                               if (i->overwrite_trigger_run ||
-                                   (i->flags & BTREE_TRIGGER_NORUN) ||
-                                   !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
-                                       continue;
-
-                               BUG_ON(!i->insert_trigger_run);
-
-                               i->overwrite_trigger_run = true;
-                               trans_trigger_run = true;
+       return 0;
+}
 
-                               old = bch2_btree_path_peek_slot(i->path, &unpacked);
-                               _deleted.p = i->path->pos;
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+       unsigned btree_id = 0;
+       int ret = 0;
 
-                               ret = bch2_trans_mark_key(trans, old, deleted,
-                                               BTREE_TRIGGER_OVERWRITE|i->flags);
+       /*
+        *
+        * For a given btree, this algorithm runs insert triggers before
+        * overwrite triggers: this is so that when extents are being moved
+        * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+        * they are re-added.
+        */
+       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+               while (btree_id_start < trans->updates + trans->nr_updates &&
+                      btree_id_start->btree_id < btree_id)
+                       btree_id_start++;
 
-                               if (ret == -EINTR)
-                                       trace_trans_restart_mark(trans->fn, _RET_IP_,
-                                                       i->btree_id, &i->path->pos);
-                               if (ret)
-                                       return ret;
-                       }
-               } while (trans_trigger_run);
+               ret = run_btree_triggers(trans, btree_id, btree_id_start);
+               if (ret)
+                       return ret;
        }
 
        trans_for_each_update(trans, i)
@@ -1072,6 +1081,9 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
        struct bkey_s_c k;
        int ret;
 
+       if (!btree_type_has_snapshots(id))
+               return 0;
+
        if (!snapshot_t(c, pos.snapshot)->children[0])
                return 0;
 
@@ -1100,10 +1112,10 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
        return ret;
 }
 
-static int bch2_trans_update_extent(struct btree_trans *trans,
-                                   struct btree_iter *orig_iter,
-                                   struct bkey_i *insert,
-                                   enum btree_update_flags flags)
+int bch2_trans_update_extent(struct btree_trans *trans,
+                            struct btree_iter *orig_iter,
+                            struct bkey_i *insert,
+                            enum btree_update_flags flags)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter, update_iter;
@@ -1261,13 +1273,9 @@ nomerge1:
                        bkey_reassemble(update, k);
                        bch2_cut_front(insert->k.p, update);
 
-                       bch2_trans_copy_iter(&update_iter, &iter);
-                       update_iter.pos = update->k.p;
-                       ret   = bch2_trans_update(trans, &update_iter, update,
+                       ret = bch2_trans_update_by_path(trans, iter.path, update,
                                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
                                                  flags);
-                       bch2_trans_iter_exit(trans, &update_iter);
-
                        if (ret)
                                goto err;
                        goto out;
@@ -1350,26 +1358,23 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
        return ret;
 }
 
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
                                   struct bkey_i *k, enum btree_update_flags flags)
 {
        struct btree_insert_entry *i, n;
 
-       BUG_ON(!iter->path->should_be_locked);
-
-       if (iter->flags & BTREE_ITER_IS_EXTENTS)
-               return bch2_trans_update_extent(trans, iter, k, flags);
+       BUG_ON(!path->should_be_locked);
 
        BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
-       BUG_ON(bpos_cmp(k->k.p, iter->path->pos));
+       BUG_ON(bpos_cmp(k->k.p, path->pos));
 
        n = (struct btree_insert_entry) {
                .flags          = flags,
-               .bkey_type      = __btree_node_type(iter->path->level, iter->btree_id),
-               .btree_id       = iter->btree_id,
-               .level          = iter->path->level,
-               .cached         = iter->flags & BTREE_ITER_CACHED,
-               .path           = iter->path,
+               .bkey_type      = __btree_node_type(path->level, path->btree_id),
+               .btree_id       = path->btree_id,
+               .level          = path->level,
+               .cached         = path->cached,
+               .path           = path,
                .k              = k,
                .ip_allocated   = _RET_IP_,
        };
@@ -1380,16 +1385,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
                       btree_insert_entry_cmp(i - 1, i) >= 0);
 #endif
 
-       if (bkey_deleted(&n.k->k) &&
-           (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
-               int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
-               if (unlikely(ret < 0))
-                       return ret;
-
-               if (ret)
-                       n.k->k.type = KEY_TYPE_whiteout;
-       }
-
        /*
         * Pending updates are kept sorted: first, find position of new update,
         * then delete/trim any updates the new update overwrites:
@@ -1420,10 +1415,29 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
                                  i - trans->updates, n);
 
        __btree_path_get(n.path, true);
-
        return 0;
 }
 
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+                                  struct bkey_i *k, enum btree_update_flags flags)
+{
+       if (iter->flags & BTREE_ITER_IS_EXTENTS)
+               return bch2_trans_update_extent(trans, iter, k, flags);
+
+       if (bkey_deleted(&k->k) &&
+           (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+               int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+               if (unlikely(ret < 0))
+                       return ret;
+
+               if (ret)
+                       k->k.type = KEY_TYPE_whiteout;
+       }
+
+       return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path,
+                                        k, flags);
+}
+
 void bch2_trans_commit_hook(struct btree_trans *trans,
                            struct btree_trans_commit_hook *h)
 {
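
The per-btree loop above enforces the ordering spelled out in the comment: all insert triggers for a btree run before any of its overwrite triggers. A minimal userspace sketch of why that ordering matters, assuming a toy reference-count model (illustrative only, not the actual bcachefs trigger machinery):

#include <assert.h>

/* Toy model: moving an extent (e.g. FALLOC_FL_INSERT_RANGE) generates one
 * insert trigger and one overwrite trigger against the same backing data. */
static int refcount = 1;		/* one existing reference to the data */

static void insert_trigger(void)    { refcount++; }
static void overwrite_trigger(void) { assert(refcount > 0); refcount--; }

int main(void)
{
	/* Insert first, overwrite second: 1 -> 2 -> 1.  The count never
	 * reaches zero, so the data is never eligible for reuse mid-move. */
	insert_trigger();
	overwrite_trigger();
	assert(refcount == 1);

	/* The opposite order would go 1 -> 0 -> 1: at zero, the data could
	 * be dropped before the re-add runs. */
	return 0;
}
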
index 895ff2555662a4ae90626d246b4e6315e4284c53..bf5ad436057afae2f747ffa19e9471d7a50e0925 100644 (file)
@@ -11,6 +11,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "ec.h"
 #include "error.h"
 #include "inode.h"
@@ -43,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
        }
 }
 
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch2_bucket_seq_cleanup(struct bch_fs *c)
-{
-       u64 journal_seq = atomic64_read(&c->journal.seq);
-       u16 last_seq_ondisk = c->journal.flushed_seq_ondisk;
-       struct bch_dev *ca;
-       struct bucket_array *buckets;
-       struct bucket *g;
-       struct bucket_mark m;
-       unsigned i;
-
-       if (journal_seq - c->last_bucket_seq_cleanup <
-           (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
-               return;
-
-       c->last_bucket_seq_cleanup = journal_seq;
-
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets) {
-                       bucket_cmpxchg(g, m, ({
-                               if (!m.journal_seq_valid ||
-                                   bucket_needs_journal_commit(m, last_seq_ondisk))
-                                       break;
-
-                               m.journal_seq_valid = 0;
-                       }));
-               }
-               up_read(&ca->bucket_lock);
-       }
-}
-
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
        struct bch_fs_usage *usage;
@@ -323,8 +287,8 @@ static inline int is_unavailable_bucket(struct bucket_mark m)
 static inline int bucket_sectors_fragmented(struct bch_dev *ca,
                                            struct bucket_mark m)
 {
-       return bucket_sectors_used(m)
-               ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+       return m.dirty_sectors
+               ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors)
                : 0;
 }
 
@@ -570,16 +534,24 @@ static int bch2_mark_alloc(struct btree_trans *trans,
                v->journal_seq = cpu_to_le64(new_u.journal_seq);
        }
 
-       ca = bch_dev_bkey_exists(c, new.k->p.inode);
+       if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
+               ret = bch2_set_bucket_needs_journal_commit(c,
+                               new_u.dev, new_u.bucket,
+                               new_u.journal_seq);
+               if (ret)
+                       return ret;
+       }
+
+       ca = bch_dev_bkey_exists(c, new_u.dev);
 
-       if (new.k->p.offset >= ca->mi.nbuckets)
+       if (new_u.bucket >= ca->mi.nbuckets)
                return 0;
 
        percpu_down_read(&c->mark_lock);
        if (!gc && new_u.gen != old_u.gen)
-               *bucket_gen(ca, new.k->p.offset) = new_u.gen;
+               *bucket_gen(ca, new_u.bucket) = new_u.gen;
 
-       g = __bucket(ca, new.k->p.offset, gc);
+       g = __bucket(ca, new_u.bucket, gc);
 
        old_m = bucket_cmpxchg(g, m, ({
                m.gen                   = new_u.gen;
@@ -587,11 +559,6 @@ static int bch2_mark_alloc(struct btree_trans *trans,
                m.dirty_sectors         = new_u.dirty_sectors;
                m.cached_sectors        = new_u.cached_sectors;
                m.stripe                = new_u.stripe != 0;
-
-               if (journal_seq) {
-                       m.journal_seq_valid     = 1;
-                       m.journal_seq           = journal_seq;
-               }
        }));
 
        bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
@@ -619,7 +586,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
                        return ret;
                }
 
-               trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+               trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
                                 old_m.cached_sectors);
        }
 
@@ -767,9 +734,10 @@ static int check_bucket_ref(struct bch_fs *c,
 static int mark_stripe_bucket(struct btree_trans *trans,
                              struct bkey_s_c k,
                              unsigned ptr_idx,
-                             u64 journal_seq, unsigned flags)
+                             unsigned flags)
 {
        struct bch_fs *c = trans->c;
+       u64 journal_seq = trans->journal_res.seq;
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned nr_data = s->nr_blocks - s->nr_redundant;
        bool parity = ptr_idx >= nr_data;
@@ -810,11 +778,6 @@ static int mark_stripe_bucket(struct btree_trans *trans,
                if (data_type)
                        new.data_type           = data_type;
 
-               if (journal_seq) {
-                       new.journal_seq_valid   = 1;
-                       new.journal_seq         = journal_seq;
-               }
-
                new.stripe = true;
        }));
 
@@ -886,11 +849,6 @@ static int bch2_mark_pointer(struct btree_trans *trans,
 
                new.data_type = bucket_data_type;
 
-               if (journal_seq) {
-                       new.journal_seq_valid = 1;
-                       new.journal_seq = journal_seq;
-               }
-
                if (flags & BTREE_TRIGGER_NOATOMIC) {
                        g->_mark = new;
                        break;
@@ -1111,7 +1069,7 @@ static int bch2_mark_stripe(struct btree_trans *trans,
                memset(m->block_sectors, 0, sizeof(m->block_sectors));
 
                for (i = 0; i < new_s->nr_blocks; i++) {
-                       ret = mark_stripe_bucket(trans, new, i, journal_seq, flags);
+                       ret = mark_stripe_bucket(trans, new, i, flags);
                        if (ret)
                                return ret;
                }
@@ -1459,24 +1417,22 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
 {
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-       struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
-       struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
+       struct bkey_s_c k;
        int ret;
 
-       bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+       bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
+                            POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
+                            BTREE_ITER_WITH_UPDATES|
                             BTREE_ITER_CACHED|
-                            BTREE_ITER_CACHED_NOFILL|
                             BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(iter);
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
        if (ret) {
                bch2_trans_iter_exit(trans, iter);
                return ret;
        }
 
-       *u = update && !bpos_cmp(update->k.p, pos)
-               ? bch2_alloc_unpack(bkey_i_to_s_c(update))
-               : alloc_mem_to_key(c, iter);
-
+       *u = bch2_alloc_unpack(k);
        return 0;
 }
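
The rewritten bch2_trans_start_alloc_update() above relies on BTREE_ITER_WITH_UPDATES: rather than manually peeking the transaction's pending updates and falling back to in-memory bucket state, the iterator itself hands back the newest value at the position. A rough model of that overlay lookup, with hypothetical types (not the kernel API):

#include <stddef.h>
#include <stdint.h>

struct kv { uint64_t pos; int64_t val; };

/* Hypothetical overlay semantics: uncommitted updates shadow committed
 * keys at the same position, which is what an iterator opened with
 * BTREE_ITER_WITH_UPDATES arranges internally. */
static const struct kv *peek_slot(const struct kv *pending, size_t nr_pending,
				  const struct kv *committed, size_t nr_committed,
				  uint64_t pos)
{
	size_t i;

	for (i = 0; i < nr_pending; i++)
		if (pending[i].pos == pos)
			return &pending[i];	/* in-transaction update wins */

	for (i = 0; i < nr_committed; i++)
		if (committed[i].pos == pos)
			return &committed[i];	/* committed btree key */

	return NULL;				/* empty slot */
}
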
 
index 45c6d230f24264e5d6ec10048fcbf34a65ca962d..d35c96bcf3a167c6e47131b9cef72fb7241ae813 100644 (file)
@@ -149,23 +149,11 @@ static inline u8 ptr_stale(struct bch_dev *ca,
 
 /* bucket gc marks */
 
-static inline unsigned bucket_sectors_used(struct bucket_mark mark)
-{
-       return mark.dirty_sectors + mark.cached_sectors;
-}
-
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
        return !mark.dirty_sectors && !mark.stripe;
 }
 
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
-                                              u16 last_seq_ondisk)
-{
-       return m.journal_seq_valid &&
-               ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
-}
-
 /* Device usage: */
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
@@ -240,7 +228,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
 
 /* key/bucket marking: */
 
-void bch2_bucket_seq_cleanup(struct bch_fs *);
 void bch2_fs_usage_initialize(struct bch_fs *);
 
 void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
index 18bca269b7503f2a7d589d6842aee118d31a7e7e..24139831226d49f7912854a5fcf90f1550423231 100644 (file)
@@ -15,18 +15,9 @@ struct bucket_mark {
        u8              gen;
        u8              data_type:3,
                        owned_by_allocator:1,
-                       journal_seq_valid:1,
                        stripe:1;
        u16             dirty_sectors;
        u16             cached_sectors;
-
-       /*
-        * low bits of journal sequence number when this bucket was most
-        * recently modified: if journal_seq_valid is set, this bucket can't be
-        * reused until the journal sequence number written to disk is >= the
-        * bucket's journal sequence number:
-        */
-       u16             journal_seq;
        };
        };
 };
diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c
new file mode 100644 (file)
index 0000000..33ae637
--- /dev/null
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/jhash.h>
+
+static const u32 hash_seeds[] = {
+       2168153708,
+       1262039142,
+       1183479835,
+};
+
+static inline unsigned bucket_hash(u64 dev_bucket, unsigned hash_seed_idx)
+{
+       return jhash_2words(dev_bucket >> 32, dev_bucket, hash_seeds[hash_seed_idx]);
+}
+
+bool bch2_bucket_needs_journal_commit(struct bch_fs *c,
+                                     u64 flushed_seq,
+                                     unsigned dev, u64 bucket)
+{
+       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+       u64 dev_bucket = (u64) dev << 56 | bucket;
+       bool ret = false;
+       unsigned i;
+
+       mutex_lock(&b->lock);
+       BUG_ON(!is_power_of_2(b->nr));
+
+       for (i = 0; i < ARRAY_SIZE(hash_seeds); i++) {
+               u32 h = bucket_hash(dev_bucket, i) & (b->nr - 1);
+
+               if (b->d[h].dev_bucket == dev_bucket) {
+                       ret = b->d[h].journal_seq > flushed_seq;
+                       break;
+               }
+       }
+
+       mutex_unlock(&b->lock);
+
+       return ret;
+}
+
+static int bch2_buckets_waiting_for_journal_rehash(struct bch_fs *c)
+{
+       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+       u64 flushed_seq = c->journal.flushed_seq_ondisk;
+       unsigned i, j, h, new_nr = b->nr * 2, elements = 0;
+       struct bucket_hashed *new_table;
+
+       new_table = kvmalloc_array(new_nr, sizeof(*new_table), GFP_KERNEL|__GFP_ZERO);
+       if (!new_table)
+               return -ENOMEM;
+
+       for (i = 0; i < b->nr; i++) {
+               if (b->d[i].journal_seq < flushed_seq)
+                       continue;
+
+               for (j = 0; j < ARRAY_SIZE(hash_seeds); j++) {
+                       h = bucket_hash(b->d[i].dev_bucket, j);
+                       if ((h & (b->nr - 1)) == i)
+                               break;
+               }
+
+               BUG_ON(j == ARRAY_SIZE(hash_seeds));
+               BUG_ON(new_table[h & (new_nr - 1)].dev_bucket);
+
+               new_table[h & (new_nr - 1)] = b->d[i];
+
+               elements++;
+       }
+
+       kvfree(b->d);
+       b->nr   = new_nr;
+       b->d    = new_table;
+       return 0;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct bch_fs *c, unsigned dev, u64 bucket,
+                                        u64 journal_seq)
+{
+       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+       struct bucket_hashed new = {
+               .dev_bucket     = (u64) dev << 56 | bucket,
+               .journal_seq    = journal_seq,
+       }, *last_evicted = NULL;
+       u64 flushed_seq = c->journal.flushed_seq_ondisk;
+       unsigned tries, i;
+       int ret = 0;
+
+       mutex_lock(&b->lock);
+       BUG_ON(!is_power_of_2(b->nr));
+retry:
+       for (tries = 0; tries < 5; tries++) {
+               struct bucket_hashed *old, *victim = NULL;
+
+               for (i = 0; i < ARRAY_SIZE(hash_seeds); i++) {
+                       old = b->d + (bucket_hash(new.dev_bucket, i) & (b->nr - 1));
+
+                       if (old->dev_bucket == new.dev_bucket ||
+                           old->journal_seq <= flushed_seq) {
+                               *old = new;
+                               goto out;
+                       }
+
+                       if (last_evicted != old)
+                               victim = old;
+               }
+
+               /* Failed to find an empty slot: */
+               swap(new, *victim);
+               last_evicted = victim;
+       }
+
+       ret = bch2_buckets_waiting_for_journal_rehash(c);
+       if (!ret)
+               goto retry;
+out:
+       mutex_unlock(&b->lock);
+
+       return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+       kvfree(b->d);
+}
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+       mutex_init(&b->lock);
+
+       b->nr = 8;
+       b->d = kvmalloc_array(b->nr, sizeof(*b->d), GFP_KERNEL|__GFP_ZERO);
+       if (!b->d)
+               return -ENOMEM;
+
+       return 0;
+}
diff --git a/libbcachefs/buckets_waiting_for_journal.h b/libbcachefs/buckets_waiting_for_journal.h
new file mode 100644 (file)
index 0000000..079a591
--- /dev/null
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct bch_fs *, u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct bch_fs *, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/libbcachefs/buckets_waiting_for_journal_types.h b/libbcachefs/buckets_waiting_for_journal_types.h
new file mode 100644 (file)
index 0000000..99d17ff
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+struct bucket_hashed {
+       u64                     dev_bucket;
+       u64                     journal_seq;
+};
+
+struct buckets_waiting_for_journal {
+       struct mutex            lock;
+       size_t                  nr;
+       struct bucket_hashed    *d;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
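
buckets_waiting_for_journal replaces the old per-bucket journal_seq fields with one small open-addressed table: each key (dev and bucket packed into one u64, device in the top byte) has one candidate slot per hash seed, stale entries whose sequence number has already been flushed are overwritten in place, and a live collision evicts a victim cuckoo-style until a rehash is needed. A compact userspace sketch of the insert path, with an illustrative mixer standing in for jhash:

#include <stdbool.h>
#include <stdint.h>

#define NR_SEEDS	3
#define TABLE_SLOTS	8		/* power of two, like b->nr above */

struct entry {
	uint64_t	dev_bucket;	/* (u64) dev << 56 | bucket; 0 = empty */
	uint64_t	journal_seq;
};

static struct entry table[TABLE_SLOTS];

/* Illustrative stand-in for jhash_2words() over the two 32-bit halves. */
static uint32_t slot_hash(uint64_t dev_bucket, unsigned seed_idx)
{
	static const uint64_t seeds[NR_SEEDS] = { 2168153708, 1262039142, 1183479835 };
	uint64_t h = (dev_bucket ^ seeds[seed_idx]) * 0x9e3779b97f4a7c15ULL;

	return (uint32_t) (h >> 32);
}

/* Insert with bounded eviction; returns false when the caller should
 * grow and rehash the table, mirroring the retry/rehash loop above. */
static bool set_needs_commit(struct entry new, uint64_t flushed_seq)
{
	struct entry *last_evicted = NULL;
	unsigned tries, i;

	for (tries = 0; tries < 5; tries++) {
		struct entry *victim = NULL;

		for (i = 0; i < NR_SEEDS; i++) {
			struct entry *old =
				&table[slot_hash(new.dev_bucket, i) & (TABLE_SLOTS - 1)];

			/* Reuse our own slot, an empty slot, or a stale
			 * (already flushed) entry: */
			if (old->dev_bucket == new.dev_bucket ||
			    old->journal_seq <= flushed_seq) {
				*old = new;
				return true;
			}

			if (last_evicted != old)
				victim = old;
		}

		if (!victim)		/* all candidate slots coincide */
			break;

		/* Every candidate slot is live: evict one, re-place it. */
		struct entry tmp = *victim;
		*victim = new;
		new = tmp;
		last_evicted = victim;
	}

	return false;
}

Lookup is the symmetric probe over the same NR_SEEDS slots, comparing dev_bucket and then journal_seq against the flushed sequence number, as bch2_bucket_needs_journal_commit() does above.
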
index ef6da53567b8489a8d8c1a50c6b26c6b5ee48fc2..3a7c1468410210b72395263527d9b327732ddfa7 100644 (file)
@@ -585,62 +585,49 @@ found_slot:
 static int bch2_inode_delete_keys(struct btree_trans *trans,
                                  subvol_inum inum, enum btree_id id)
 {
-       u64 offset = 0;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i delete;
+       u32 snapshot;
        int ret = 0;
 
-       while (!ret || ret == -EINTR) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(trans->c, 0);
-               struct btree_iter iter;
-               struct bkey_s_c k;
-               struct bkey_i delete;
-               u32 snapshot;
+       /*
+        * We're never going to be deleting extents, no need to use an extent
+        * iterator:
+        */
+       bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_INTENT);
 
+       while (1) {
                bch2_trans_begin(trans);
 
                ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
                if (ret)
-                       continue;
+                       goto err;
 
-               bch2_trans_iter_init(trans, &iter, id,
-                                    SPOS(inum.inum, offset, snapshot),
-                                    BTREE_ITER_INTENT);
-               k = bch2_btree_iter_peek(&iter);
-
-               if (!k.k || iter.pos.inode != inum.inum) {
-                       bch2_trans_iter_exit(trans, &iter);
-                       break;
-               }
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
 
+               k = bch2_btree_iter_peek(&iter);
                ret = bkey_err(k);
                if (ret)
                        goto err;
 
+               if (!k.k || iter.pos.inode != inum.inum)
+                       break;
+
                bkey_init(&delete.k);
                delete.k.p = iter.pos;
 
-               if (btree_node_type_is_extents(iter.btree_id)) {
-                       unsigned max_sectors =
-                               min_t(u64, U64_MAX - iter.pos.offset,
-                                     KEY_SIZE_MAX & (~0 << trans->c->block_bits));
-
-                       /* create the biggest key we can */
-                       bch2_key_resize(&delete.k, max_sectors);
-
-                       ret = bch2_extent_trim_atomic(trans, &iter, &delete);
-                       if (ret)
-                               goto err;
-               }
-
                ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
-                     bch2_trans_commit(trans, &disk_res, NULL,
+                     bch2_trans_commit(trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
-               bch2_disk_reservation_put(trans->c, &disk_res);
 err:
-               offset = iter.pos.offset;
-               bch2_trans_iter_exit(trans, &iter);
+               if (ret && ret != -EINTR)
+                       break;
        }
 
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
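
The bch2_inode_delete_keys() rewrite above is the canonical btree transaction retry shape: one iterator created before the loop and destroyed after it, a transaction re-begin at the top of each pass, and -EINTR (a transaction restart) looping back instead of unwinding. Schematically, with hypothetical stand-ins for the per-pass work:

#include <errno.h>
#include <stdbool.h>

/* Hypothetical stand-ins: one lookup+delete+commit pass, returning
 * -EINTR to model a transaction restart, and an end-of-keys check. */
extern int  delete_one_key(void);
extern bool no_more_keys(void);

int delete_all_keys(void)
{
	int ret = 0;

	/* iterator initialized once, before the loop */
	while (1) {
		/* each pass re-begins the transaction, then does one unit
		 * of work small enough to commit atomically */
		ret = delete_one_key();

		if (ret == -EINTR)
			continue;	/* restart: retry from the same position */
		if (ret)
			break;		/* hard error */
		if (no_more_keys())
			break;
	}

	/* iterator torn down once, after the loop */
	return ret;
}
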
 
index df4d1a7ad533bc3b0181e064a3ed1d35d4eb9dad..e566f8516052ee43d50c041b4615020690d12bbf 100644 (file)
@@ -1671,13 +1671,9 @@ retry_alloc:
                }
        }
 
-       bch2_bucket_seq_cleanup(c);
-
        continue_at(cl, do_journal_write, c->io_complete_wq);
        return;
 no_io:
-       bch2_bucket_seq_cleanup(c);
-
        continue_at(cl, journal_write_done, c->io_complete_wq);
        return;
 err:
index f73be9cb7ac3ca46a2d202f3fe4c5d8f105868d5..3e3dcec327a0ac1e14076504a582e4849df5add2 100644 (file)
@@ -700,17 +700,20 @@ static int __bch2_move_data(struct bch_fs *c,
                bch2_trans_begin(&trans);
 
                k = bch2_btree_iter_peek(&iter);
-
-               stats->pos = iter.pos;
-
                if (!k.k)
                        break;
+
                ret = bkey_err(k);
+               if (ret == -EINTR)
+                       continue;
                if (ret)
                        break;
+
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
+               stats->pos = iter.pos;
+
                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;
 
@@ -753,10 +756,8 @@ static int __bch2_move_data(struct bch_fs *c,
                ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
                                        data_cmd, data_opts);
                if (ret2) {
-                       if (ret2 == -EINTR) {
-                               bch2_trans_begin(&trans);
+                       if (ret2 == -EINTR)
                                continue;
-                       }
 
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
index 7cd1b0cf27e4118a6ae4432f4627720fc05471e6..92f78907bcb6fa6e412ba5d3d88a3dff7ba1b855 100644 (file)
@@ -69,10 +69,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
                        .dev    = p.ptr.dev,
                        .offset = p.ptr.offset,
                };
+               ssize_t i;
 
-               ssize_t i = eytzinger0_find_le(h->data, h->used,
-                                              sizeof(h->data[0]),
-                                              bucket_offset_cmp, &search);
+               if (p.ptr.cached)
+                       continue;
+
+               i = eytzinger0_find_le(h->data, h->used,
+                                      sizeof(h->data[0]),
+                                      bucket_offset_cmp, &search);
 #if 0
                /* eytzinger search verify code: */
                ssize_t j = -1, k;
@@ -185,8 +189,7 @@ static int bch2_copygc(struct bch_fs *c)
 
                        if (m.owned_by_allocator ||
                            m.data_type != BCH_DATA_user ||
-                           !bucket_sectors_used(m) ||
-                           bucket_sectors_used(m) >= ca->mi.bucket_size)
+                           m.dirty_sectors >= ca->mi.bucket_size)
                                continue;
 
                        WARN_ON(m.stripe && !g->stripe_redundancy);
@@ -195,9 +198,9 @@ static int bch2_copygc(struct bch_fs *c)
                                .dev            = dev_idx,
                                .gen            = m.gen,
                                .replicas       = 1 + g->stripe_redundancy,
-                               .fragmentation  = bucket_sectors_used(m) * (1U << 15)
+                               .fragmentation  = m.dirty_sectors * (1U << 15)
                                        / ca->mi.bucket_size,
-                               .sectors        = bucket_sectors_used(m),
+                               .sectors        = m.dirty_sectors,
                                .offset         = bucket_to_sector(ca, b),
                        };
                        heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
@@ -231,8 +234,11 @@ static int bch2_copygc(struct bch_fs *c)
 
        buckets_to_move = h->used;
 
-       if (!buckets_to_move)
+       if (!buckets_to_move) {
+               bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
+                                   sectors_reserved);
                return 0;
+       }
 
        eytzinger0_sort(h->data, h->used,
                        sizeof(h->data[0]),
@@ -260,8 +266,8 @@ static int bch2_copygc(struct bch_fs *c)
                        m = READ_ONCE(buckets->b[b].mark);
 
                        if (i->gen == m.gen &&
-                           bucket_sectors_used(m)) {
-                               sectors_not_moved += bucket_sectors_used(m);
+                           m.dirty_sectors) {
+                               sectors_not_moved += m.dirty_sectors;
                                buckets_not_moved++;
                        }
                }
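
This is the change behind the commit subject: copygc's bucket score is now derived from dirty_sectors alone, so buckets full of cached (clean, reconstructible) data no longer look fragmented. The score is the occupied fraction of the bucket scaled by 2^15; a worked example with made-up sizes:

#include <assert.h>

/* Fixed-point bucket score as computed in the copygc heap scan above:
 * dirty fraction of the bucket, scaled by 2^15.  Cached sectors are
 * deliberately excluded from the numerator. */
static unsigned fragmentation(unsigned dirty_sectors, unsigned bucket_size)
{
	return dirty_sectors * (1U << 15) / bucket_size;
}

int main(void)
{
	/* Illustrative numbers: a 512-sector bucket holding 128 dirty
	 * sectors scores 0.25 * 2^15, whether or not it also holds
	 * hundreds of cached sectors. */
	assert(fragmentation(128, 512) == 8192);
	assert(fragmentation(0,   512) == 0);	/* cached-only bucket */
	return 0;
}
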
index 8aeb2e417a157fd87874d94787ef5b83827919c8..69603327d93df6587f4e8713d249c240c8bc1fde 100644 (file)
@@ -456,10 +456,10 @@ err:
        return ret;
 }
 
-static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
-                                    u32 *new_snapids,
-                                    u32 *snapshot_subvols,
-                                    unsigned nr_snapids)
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+                             u32 *new_snapids,
+                             u32 *snapshot_subvols,
+                             unsigned nr_snapids)
 {
        struct btree_iter iter;
        struct bkey_i_snapshot *n;
@@ -522,7 +522,7 @@ static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
                n = bch2_trans_kmalloc(trans, sizeof(*n));
                ret = PTR_ERR_OR_ZERO(n);
                if (ret)
-                       return ret;
+                       goto err;
 
                bkey_reassemble(&n->k_i, k);
 
index e4c3fdcdf22f959e61c96180f9822b1d6564ddfe..4abe53df2788466f58fe347f4611a2370fcf1f60 100644 (file)
@@ -122,6 +122,10 @@ int bch2_snapshot_get_subvol(struct btree_trans *, u32,
                             struct bch_subvolume *);
 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
 
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+                             u32 *, u32 *, unsigned);
+
 int bch2_subvolume_delete(struct btree_trans *, u32);
 int bch2_subvolume_unlink(struct btree_trans *, u32);
 int bch2_subvolume_create(struct btree_trans *, u64, u32,
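
bch2_snapshot_node_create() is exported here so the new unit test can build a two-leaf snapshot tree; key visibility across that tree follows the ancestor rule the test exercises. A minimal model, assuming (as the test's use of U32_MAX suggests) that snapshot ids are allocated downward, so a node's ancestors always have larger ids:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical parent table: parent[id] is the parent snapshot id,
 * 0 above the root.  Ancestors have larger ids than descendants. */
static bool is_ancestor(const uint32_t *parent, uint32_t id, uint32_t ancestor)
{
	while (id && id < ancestor)
		id = parent[id];
	return id == ancestor;
}

/*
 * In the snapshot test below: a key written in snapid_hi (a sibling leaf)
 * is not an ancestor of snapid_lo, so iterating from snapid_lo skips it;
 * the key written in U32_MAX (the root, ancestor of both leaves) is the
 * first one visible.
 */
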
index 577b58e43b059f0be04fdf887069a413f7386dd5..586ba60d03ea7a2e0cb5b4ae9abeeb0c4bff0638 100644 (file)
@@ -16,6 +16,7 @@
 #include "btree_key_cache.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
+#include "buckets_waiting_for_journal.h"
 #include "chardev.h"
 #include "checksum.h"
 #include "clock.h"
@@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_fs_ec_exit(c);
        bch2_fs_encryption_exit(c);
        bch2_fs_io_exit(c);
+       bch2_fs_buckets_waiting_for_journal_exit(c);
        bch2_fs_btree_interior_update_exit(c);
        bch2_fs_btree_iter_exit(c);
        bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
@@ -810,6 +812,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
            bch2_fs_btree_iter_init(c) ?:
            bch2_fs_btree_interior_update_init(c) ?:
+           bch2_fs_buckets_waiting_for_journal_init(c) ?:
            bch2_fs_subvolumes_init(c) ?:
            bch2_fs_io_init(c) ?:
            bch2_fs_encryption_init(c) ?:
index 6d1596322ee2d9105df7188e8018b3e0eb856502..ed9a095063e8f684098f5a12d5e1541236a00bba 100644 (file)
@@ -192,7 +192,7 @@ read_attribute(new_stripes);
 read_attribute(io_timers_read);
 read_attribute(io_timers_write);
 
-read_attribute(data_op_data_progress);
+read_attribute(data_jobs);
 
 #ifdef CONFIG_BCACHEFS_TESTS
 write_attribute(perf_test);
@@ -230,32 +230,20 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c)
        return nr ? div64_u64(sectors, nr) : 0;
 }
 
-static long stats_to_text(struct printbuf *out, struct bch_fs *c,
-                         struct bch_move_stats *stats)
-{
-       pr_buf(out, "%s: data type %s btree_id %s position: ",
-               stats->name,
-               bch2_data_types[stats->data_type],
-               bch2_btree_ids[stats->btree_id]);
-       bch2_bpos_to_text(out, stats->pos);
-       pr_buf(out, "%s", "\n");
-
-       return 0;
-}
-
 static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
 {
        long ret = 0;
-       struct bch_move_stats *iter;
+       struct bch_move_stats *stats;
 
        mutex_lock(&c->data_progress_lock);
-
-       if (list_empty(&c->data_progress_list))
-               pr_buf(out, "%s", "no progress to report\n");
-       else
-               list_for_each_entry(iter, &c->data_progress_list, list) {
-                       stats_to_text(out, c, iter);
-               }
+       list_for_each_entry(stats, &c->data_progress_list, list) {
+               pr_buf(out, "%s: data type %s btree_id %s position: ",
+                      stats->name,
+                      bch2_data_types[stats->data_type],
+                      bch2_btree_ids[stats->btree_id]);
+               bch2_bpos_to_text(out, stats->pos);
+               pr_buf(out, "%s", "\n");
+       }
 
        mutex_unlock(&c->data_progress_lock);
        return ret;
@@ -463,7 +451,7 @@ SHOW(bch2_fs)
                return out.pos - buf;
        }
 
-       if (attr == &sysfs_data_op_data_progress) {
+       if (attr == &sysfs_data_jobs) {
                data_progress_to_text(&out, c);
                return out.pos - buf;
        }
@@ -616,7 +604,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_rebalance_work,
        sysfs_pd_controller_files(rebalance),
 
-       &sysfs_data_op_data_progress,
+       &sysfs_data_jobs,
 
        &sysfs_internal_uuid,
        NULL
index 16d67eb6d1c2caac49aa6986751f5fe47b5d89a5..de84ce83497598a867cdaa9cb737ef5743f8ca59 100644 (file)
@@ -4,6 +4,7 @@
 #include "bcachefs.h"
 #include "btree_update.h"
 #include "journal_reclaim.h"
+#include "subvolume.h"
 #include "tests.h"
 
 #include "linux/kthread.h"
@@ -461,6 +462,70 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
                __test_extent_overwrite(c, 32, 64, 32, 128);
 }
 
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_cookie cookie;
+       int ret;
+
+       bkey_cookie_init(&cookie.k_i);
+       cookie.k.p.snapshot = snapid_hi;
+       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+                               NULL, NULL, 0);
+       if (ret)
+               return ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+                            SPOS(0, 0, snapid_lo), 0);
+       k = bch2_btree_iter_peek(&iter);
+       ret = bkey_err(k);
+
+       BUG_ON(!ret && (!k.k || k.k->p.snapshot != U32_MAX));
+
+       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+       struct bkey_i_cookie cookie;
+       u32 snapids[2];
+       u32 snapid_subvols[2] = { 1, 1 };
+       int ret;
+
+       bkey_cookie_init(&cookie.k_i);
+       cookie.k.p.snapshot = U32_MAX;
+       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+                               NULL, NULL, 0);
+       if (ret)
+               return ret;
+
+       ret = bch2_trans_do(c, NULL, NULL, 0,
+                     bch2_snapshot_node_create(&trans, U32_MAX,
+                                               snapids,
+                                               snapid_subvols,
+                                               2));
+       if (ret)
+               return ret;
+
+       if (snapids[0] > snapids[1])
+               swap(snapids[0], snapids[1]);
+
+       ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+       if (ret) {
+               bch_err(c, "err %i from test_snapshot_filter", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
 /* perf tests */
 
 static u64 test_rand(void)
@@ -789,8 +854,10 @@ static int btree_perf_test_thread(void *data)
        }
 
        ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
-       if (ret)
+       if (ret) {
+               bch_err(j->c, "%ps: error %i", j->fn, ret);
                j->ret = ret;
+       }
 
        if (atomic_dec_and_test(&j->done)) {
                j->finish = sched_clock();
@@ -843,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
        perf_test(test_extent_overwrite_middle);
        perf_test(test_extent_overwrite_all);
 
+       perf_test(test_snapshots);
+
        if (!j.fn) {
                pr_err("unknown test %s", testname);
                return -EINVAL;