git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 1b14994029 bcachefs: Fragmentation LRU
author Kent Overstreet <kent.overstreet@linux.dev>
Fri, 17 Feb 2023 22:51:22 +0000 (17:51 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Fri, 17 Feb 2023 22:51:22 +0000 (17:51 -0500)
29 files changed:
.bcachefs_revision
include/trace/events/bcachefs.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/backpointers.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_leaf.c
libbcachefs/btree_write_buffer.c
libbcachefs/buckets_types.h
libbcachefs/data_update.c
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/fsck.c
libbcachefs/lru.c
libbcachefs/lru.h
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/recovery.c
libbcachefs/subvolume.c
libbcachefs/subvolume.h
libbcachefs/super.c
libbcachefs/util.c
linux/six.c

index bfcbcf58204593cf8c08f392b560b4c70cd07390..7b2b6b45a1187a64834c001343c7c90b13f06a2c 100644 (file)
@@ -1 +1 @@
-8dbfede1d9e6483c682956c7c8a4900a65f98dde
+1b149940290c0ef39070b4afaadab84a65bba034
index 10e51bb37557ce990c610fa4580ebba66538bf88..b9dca1d2aebb9c5f4671062c47ebac10e0590332 100644 (file)
@@ -723,8 +723,8 @@ TRACE_EVENT(move_data,
 TRACE_EVENT(evacuate_bucket,
        TP_PROTO(struct bch_fs *c, struct bpos *bucket,
                 unsigned sectors, unsigned bucket_size,
-                int ret),
-       TP_ARGS(c, bucket, sectors, bucket_size, ret),
+                u64 fragmentation, int ret),
+       TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
 
        TP_STRUCT__entry(
                __field(dev_t,          dev             )
@@ -732,6 +732,7 @@ TRACE_EVENT(evacuate_bucket,
                __field(u64,            bucket          )
                __field(u32,            sectors         )
                __field(u32,            bucket_size     )
+               __field(u64,            fragmentation   )
                __field(int,            ret             )
        ),
 
@@ -741,14 +742,15 @@ TRACE_EVENT(evacuate_bucket,
                __entry->bucket                 = bucket->offset;
                __entry->sectors                = sectors;
                __entry->bucket_size            = bucket_size;
+               __entry->fragmentation          = fragmentation;
                __entry->ret                    = ret;
        ),
 
-       TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
+       TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->member, __entry->bucket,
                  __entry->sectors, __entry->bucket_size,
-                 __entry->ret)
+                 __entry->fragmentation, __entry->ret)
 );
 
 TRACE_EVENT(copygc,
index 2db44365a4ec4afe819a1783cd20fd8704b9d971..755faa34e0f32daa7755bdeb03b1cddf687eaaa3 100644 (file)
@@ -9,6 +9,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_gc.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "buckets_waiting_for_journal.h"
 #include "clock.h"
@@ -414,6 +415,8 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
        prt_newline(out);
        prt_printf(out, "io_time[WRITE]    %llu",       a->io_time[WRITE]);
        prt_newline(out);
+       prt_printf(out, "fragmentation     %llu",       a->fragmentation_lru);
+       prt_newline(out);
        prt_printf(out, "bp_start          %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
        prt_newline(out);
 
@@ -909,8 +912,8 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
            !new_a->io_time[READ])
                new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
 
-       old_lru = alloc_lru_idx(*old_a);
-       new_lru = alloc_lru_idx(*new_a);
+       old_lru = alloc_lru_idx_read(*old_a);
+       new_lru = alloc_lru_idx_read(*new_a);
 
        if (old_lru != new_lru) {
                ret = bch2_lru_change(trans, new->k.p.inode,
@@ -920,6 +923,18 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
                        return ret;
        }
 
+       new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+                                       bch_dev_bkey_exists(c, new->k.p.inode));
+
+       if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+               ret = bch2_lru_change(trans,
+                               BCH_LRU_FRAGMENTATION_START,
+                               bucket_to_u64(new->k.p),
+                               old_a->fragmentation_lru, new_a->fragmentation_lru);
+               if (ret)
+                       return ret;
+       }
+
        if (old_a->gen != new_a->gen) {
                ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
                if (ret)
@@ -1775,15 +1790,11 @@ static int invalidate_one_bucket(struct btree_trans *trans,
        if (ret)
                goto out;
 
-       if (lru_pos_time(lru_iter->pos) != alloc_lru_idx(a->v)) {
-               prt_str(&buf, "alloc key does not point back to lru entry when invalidating bucket:");
-               goto err;
-       }
+       /* We expect harmless races here due to the btree write buffer: */
+       if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+               goto out;
 
-       if (a->v.data_type != BCH_DATA_cached) {
-               prt_str(&buf, "lru entry points to non cached bucket:");
-               goto err;
-       }
+       BUG_ON(a->v.data_type != BCH_DATA_cached);
 
        if (!a->v.cached_sectors)
                bch_err(c, "invalidating empty bucket, confused");
@@ -1845,6 +1856,10 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 
        bch2_trans_init(&trans, c, 0, 0);
 
+       ret = bch2_btree_write_buffer_flush(&trans);
+       if (ret)
+               goto err;
+
        for_each_member_device(ca, c, i) {
                s64 nr_to_invalidate =
                        should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
@@ -1860,7 +1875,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
                        break;
                }
        }
-
+err:
        bch2_trans_exit(&trans);
        bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
index b3c2f1e0deb695b747a97792d273372c08f73d38..96ac8f396d4669966e017904406e49f41ca97271 100644 (file)
@@ -64,11 +64,24 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
                                 a.stripe, a, data_type);
 }
 
-static inline u64 alloc_lru_idx(struct bch_alloc_v4 a)
+static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
 {
        return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
 }
 
+static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
+                                             struct bch_dev *ca)
+{
+       if (a.data_type != BCH_DATA_btree &&
+           a.data_type != BCH_DATA_user)
+               return 0;
+
+       if (a.dirty_sectors >= ca->mi.bucket_size)
+               return 0;
+
+       return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
+}
+
 static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
 {
        return ((u64) alloc_gc_gen(a) >> 4) << 56;
index c269fc73a41dd038e97e665bf5736950b6bda01a..e001f41916713d44df03e944c1dc0ab23e4d2ed5 100644 (file)
@@ -738,7 +738,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
 
        si_meminfo(&i);
        mem_bytes = i.totalram * i.mem_unit;
-       return (mem_bytes >> 1) / btree_bytes(c);
+       return div_u64(mem_bytes >> 1, btree_bytes(c));
 }
 
 int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
index c8620127f1aef8f3ceb081ad634837d885a2b677..b10e39d8c4a20017a3fc013af3f822438dd3345f 100644 (file)
@@ -927,7 +927,6 @@ struct bch_fs {
 
        /* COPYGC */
        struct task_struct      *copygc_thread;
-       copygc_heap             copygc_heap;
        struct write_point      copygc_write_point;
        s64                     copygc_wait;
        bool                    copygc_running;
index 7574eccd25d4d447ee9500e9f042595fcd2db920..fdd0050c2707a608652207a0fd7060d138945568 100644 (file)
@@ -988,6 +988,7 @@ struct bch_alloc_v4 {
        __u64                   io_time[2];
        __u32                   stripe;
        __u32                   nr_external_backpointers;
+       __u64                   fragmentation_lru;
 } __packed __aligned(8);
 
 #define BCH_ALLOC_V4_U64s_V0   6
@@ -1559,7 +1560,8 @@ struct bch_sb_field_journal_seq_blacklist {
        x(inode_v3,                     23)             \
        x(unwritten_extents,            24)             \
        x(bucket_gens,                  25)             \
-       x(lru_v2,                       26)
+       x(lru_v2,                       26)             \
+       x(fragmentation_lru,            27)
 
 enum bcachefs_metadata_version {
        bcachefs_metadata_version_min = 9,
index 3977bb1fd8345bb8b9c0b970e56f15e3f9fcb853..dc2b2a0819bd886330b6a910dcc74448c6c2b870 100644 (file)
@@ -1174,17 +1174,10 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
 
        path->uptodate = BTREE_ITER_UPTODATE;
 out:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) {
-               struct printbuf buf = PRINTBUF;
-
-               prt_printf(&buf, "ret %s (%i) trans->restarted %s (%i)\n",
-                          bch2_err_str(ret), ret,
-                          bch2_err_str(trans->restarted), trans->restarted);
-#ifdef CONFIG_BCACHEFS_DEBUG
-               bch2_prt_backtrace(&buf, &trans->last_restarted);
-#endif
-               panic("%s", buf.buf);
-       }
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
+               panic("ret %s (%i) trans->restarted %s (%i)\n",
+                     bch2_err_str(ret), ret,
+                     bch2_err_str(trans->restarted), trans->restarted);
        bch2_btree_path_verify(trans, path);
        return ret;
 }
@@ -1367,14 +1360,14 @@ void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
 {
        panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
              trans->restart_count, restart_count,
-             (void *) trans->last_begin_ip);
+             (void *) trans->last_restarted_ip);
 }
 
 void bch2_trans_in_restart_error(struct btree_trans *trans)
 {
        panic("in transaction restart: %s, last restarted by %pS\n",
              bch2_err_str(trans->restarted),
-             (void *) trans->last_begin_ip);
+             (void *) trans->last_restarted_ip);
 }
 
 noinline __cold
@@ -2872,7 +2865,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
        if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
                bch2_trans_reset_srcu_lock(trans);
 
-       trans->last_begin_ip = _RET_IP_;
+       trans->last_restarted_ip = _RET_IP_;
        if (trans->restarted) {
                bch2_btree_path_traverse_all(trans);
                trans->notrace_relock_fail = false;
@@ -3053,10 +3046,6 @@ void bch2_trans_exit(struct btree_trans *trans)
        if (trans->paths)
                mempool_free(trans->paths, &c->btree_paths_pool);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       darray_exit(&trans->last_restarted);
-#endif
-
        trans->mem      = (void *) 0x1;
        trans->paths    = (void *) 0x1;
 }
index bbbbe52be83942ebc986eae30d688eb1acf2dbbc..0ede02c34eac59ce84828f860ee91a3711fdefa4 100644 (file)
@@ -251,10 +251,6 @@ static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int er
        BUG_ON(err <= 0);
        BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart));
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       bch2_save_backtrace(&trans->last_restarted, current);
-#endif
-
        trans->restarted = err;
        return -err;
 }
index 97ff267c3aff069ff54d2f2bc0512506a6c80c30..ad73cd2e81475dea677a62f0617ab74c996a223d 100644 (file)
@@ -442,10 +442,7 @@ struct btree_trans {
        bool                    notrace_relock_fail:1;
        enum bch_errcode        restarted:16;
        u32                     restart_count;
-#ifdef CONFIG_BCACHEFS_DEBUG
-       bch_stacktrace          last_restarted;
-#endif
-       unsigned long           last_begin_ip;
+       unsigned long           last_restarted_ip;
        unsigned long           srcu_lock_time;
 
        /*
index 771e4b239c66b234883363c1d222662fdbc2948b..ee1d15931022f42a4331d48e70d0f1ffaffee501 100644 (file)
@@ -58,6 +58,9 @@ int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
                                unsigned, unsigned);
 int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
 
+int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
+                               struct bkey_i *, enum btree_update_flags);
+
 int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *,
                        enum btree_update_flags);
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
index 0195b13de1523ade6f91b528ad77a0b88ca013c8..20ad79891bfdae58cdf408699ca99a4fa07020d8 100644 (file)
@@ -56,9 +56,10 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
                        k = bkey_i_to_s_c(j_k);
        }
 
-       i->old_k.needs_whiteout = k.k->needs_whiteout;
+       u = *k.k;
+       u.needs_whiteout = i->old_k.needs_whiteout;
 
-       BUG_ON(memcmp(&i->old_k, k.k, sizeof(struct bkey)));
+       BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
        BUG_ON(i->old_v != k.v);
 #endif
 }
@@ -1306,12 +1307,45 @@ static noinline int extent_back_merge(struct btree_trans *trans,
        return 0;
 }
 
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+                                     enum btree_id btree_id, struct bpos pos)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u32 snapshot = pos.snapshot;
+       int ret;
+
+       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+               return 0;
+
+       pos.snapshot++;
+
+       for_each_btree_key_norestart(trans, iter, btree_id, pos,
+                          BTREE_ITER_ALL_SNAPSHOTS|
+                          BTREE_ITER_NOPRESERVE, k, ret) {
+               if (!bkey_eq(k.k->p, pos))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+                                             k.k->p.snapshot)) {
+                       ret = !bkey_whiteout(k.k);
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
 int bch2_trans_update_extent(struct btree_trans *trans,
                             struct btree_iter *orig_iter,
                             struct bkey_i *insert,
                             enum btree_update_flags flags)
 {
-       struct btree_iter iter, update_iter;
+       struct btree_iter iter;
        struct bpos start = bkey_start_pos(&insert->k);
        struct bkey_i *update;
        struct bkey_s_c k;
@@ -1359,16 +1393,8 @@ int bch2_trans_update_extent(struct btree_trans *trans,
 
                        bch2_cut_back(start, update);
 
-                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
-                                            BTREE_ITER_NOT_EXTENTS|
-                                            BTREE_ITER_ALL_SNAPSHOTS|
-                                            BTREE_ITER_INTENT);
-                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
-                               bch2_trans_update(trans, &update_iter, update,
-                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
-                                                 flags);
-                       bch2_trans_iter_exit(trans, &update_iter);
-
+                       ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
                        if (ret)
                                goto err;
                }
@@ -1382,15 +1408,8 @@ int bch2_trans_update_extent(struct btree_trans *trans,
                        bch2_cut_front(start, update);
                        bch2_cut_back(insert->k.p, update);
 
-                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
-                                            BTREE_ITER_NOT_EXTENTS|
-                                            BTREE_ITER_ALL_SNAPSHOTS|
-                                            BTREE_ITER_INTENT);
-                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
-                               bch2_trans_update(trans, &update_iter, update,
-                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
-                                                 flags);
-                       bch2_trans_iter_exit(trans, &update_iter);
+                       ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
                        if (ret)
                                goto err;
                }
@@ -1402,21 +1421,15 @@ int bch2_trans_update_extent(struct btree_trans *trans,
 
                        bkey_init(&update->k);
                        update->k.p = k.k->p;
+                       update->k.p.snapshot = insert->k.p.snapshot;
 
-                       if (insert->k.p.snapshot != k.k->p.snapshot) {
-                               update->k.p.snapshot = insert->k.p.snapshot;
+                       if (insert->k.p.snapshot != k.k->p.snapshot ||
+                           (btree_type_has_snapshots(btree_id) &&
+                            need_whiteout_for_snapshot(trans, btree_id, update->k.p)))
                                update->k.type = KEY_TYPE_whiteout;
-                       }
-
-                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
-                                            BTREE_ITER_NOT_EXTENTS|
-                                            BTREE_ITER_INTENT);
-                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
-                               bch2_trans_update(trans, &update_iter, update,
-                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
-                                                 flags);
-                       bch2_trans_iter_exit(trans, &update_iter);
 
+                       ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
                        if (ret)
                                goto err;
                }
@@ -1468,40 +1481,6 @@ err:
        return ret;
 }
 
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
-                                     enum btree_id btree_id, struct bpos pos)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u32 snapshot = pos.snapshot;
-       int ret;
-
-       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
-               return 0;
-
-       pos.snapshot++;
-
-       for_each_btree_key_norestart(trans, iter, btree_id, pos,
-                          BTREE_ITER_ALL_SNAPSHOTS|
-                          BTREE_ITER_NOPRESERVE, k, ret) {
-               if (!bkey_eq(k.k->p, pos))
-                       break;
-
-               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
-                                             k.k->p.snapshot)) {
-                       ret = !bkey_whiteout(k.k);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
 static int __must_check
 bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
                                struct bkey_i *k, enum btree_update_flags flags,
@@ -1747,8 +1726,23 @@ void bch2_trans_commit_hook(struct btree_trans *trans,
        trans->hooks = h;
 }
 
-int __bch2_btree_insert(struct btree_trans *trans,
-                       enum btree_id id,
+int bch2_btree_insert_nonextent(struct btree_trans *trans,
+                               enum btree_id btree, struct bkey_i *k,
+                               enum btree_update_flags flags)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, flags);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
                        struct bkey_i *k, enum btree_update_flags flags)
 {
        struct btree_iter iter;
index 05b755a0e79ca21edf751778fc4c10e1b6472424..6285532e77904f3e834afe317d0b6fac98b36aa5 100644 (file)
@@ -88,6 +88,8 @@ static union btree_write_buffer_state btree_write_buffer_switch(struct btree_wri
        while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
                cpu_relax();
 
+       smp_mb();
+
        return old;
 }
 
index 1dbba7d906dd883a12db432766a02fb527e56207..2a9dab9006efa68ca64648d866fec47010b58ad2 100644 (file)
@@ -89,15 +89,4 @@ struct disk_reservation {
        unsigned                nr_replicas;
 };
 
-struct copygc_heap_entry {
-       u8                      dev;
-       u8                      gen;
-       u8                      replicas;
-       u32                     fragmentation;
-       u32                     sectors;
-       u64                     bucket;
-};
-
-typedef HEAP(struct copygc_heap_entry) copygc_heap;
-
 #endif /* _BUCKETS_TYPES_H */
index 199e894e0f8b31bb4bc8b3b77419149ccc2cba2d..de0575f61cfbcb7920086cca2690c9f46298eab4 100644 (file)
@@ -22,9 +22,10 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans,
                                     struct bpos new_pos)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter iter, update_iter;
-       struct bkey_s_c k;
+       struct btree_iter iter, iter2;
+       struct bkey_s_c k, k2;
        snapshot_id_list s;
+       struct bkey_i *update;
        int ret;
 
        if (!btree_type_has_snapshots(id))
@@ -32,10 +33,7 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans,
 
        darray_init(&s);
 
-       if (bkey_eq(old_pos, new_pos))
-               return 0;
-
-       if (!snapshot_t(c, old_pos.snapshot)->children[0])
+       if (!bch2_snapshot_has_children(c, old_pos.snapshot))
                return 0;
 
        bch2_trans_iter_init(trans, &iter, id, old_pos,
@@ -47,33 +45,39 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans,
                if (ret)
                        break;
 
+               if (!k.k)
+                       break;
+
                if (!bkey_eq(old_pos, k.k->p))
                        break;
 
-               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
-                       struct bkey_i *update;
+               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot) &&
+                   !snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) {
+                       struct bpos whiteout_pos = new_pos;
 
-                       if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot))
-                               continue;
+                       whiteout_pos.snapshot = k.k->p.snapshot;
 
-                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+                       bch2_trans_iter_init(trans, &iter2, id, whiteout_pos,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_INTENT);
+                       k2 = bch2_btree_iter_peek_slot(&iter2);
+                       ret = bkey_err(k2);
 
-                       ret = PTR_ERR_OR_ZERO(update);
-                       if (ret)
-                               break;
+                       if (!ret && k2.k->type == KEY_TYPE_deleted) {
+                               update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+                               ret = PTR_ERR_OR_ZERO(update);
+                               if (ret)
+                                       break;
 
-                       bkey_init(&update->k);
-                       update->k.p = new_pos;
-                       update->k.p.snapshot = k.k->p.snapshot;
+                               bkey_init(&update->k);
+                               update->k.p             = whiteout_pos;
+                               update->k.type          = KEY_TYPE_whiteout;
+
+                               ret = bch2_trans_update(trans, &iter2, update,
+                                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                       }
+                       bch2_trans_iter_exit(trans, &iter2);
 
-                       bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
-                                            BTREE_ITER_NOT_EXTENTS|
-                                            BTREE_ITER_ALL_SNAPSHOTS|
-                                            BTREE_ITER_INTENT);
-                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
-                               bch2_trans_update(trans, &update_iter, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-                       bch2_trans_iter_exit(trans, &update_iter);
                        if (ret)
                                break;
 
@@ -229,9 +233,21 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 
                next_pos = insert->k.p;
 
-               ret   = insert_snapshot_whiteouts(trans, m->btree_id,
-                                                 k.k->p, insert->k.p) ?:
-                       bch2_trans_update(trans, &iter, insert,
+               if (!bkey_eq(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
+                       ret = insert_snapshot_whiteouts(trans, m->btree_id, k.k->p,
+                                                       bkey_start_pos(&insert->k));
+                       if (ret)
+                               goto err;
+               }
+
+               if (!bkey_eq(insert->k.p, k.k->p)) {
+                       ret = insert_snapshot_whiteouts(trans, m->btree_id,
+                                                       k.k->p, insert->k.p);
+                       if (ret)
+                               goto err;
+               }
+
+               ret   = bch2_trans_update(trans, &iter, insert,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(trans, &op->res,
                                NULL,
index c2882c599896cc694c9e0180bf4abf4e90f05e0a..1dae649ff0e223386eefe0e1992ddd564f020197 100644 (file)
@@ -98,7 +98,6 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
 
        INIT_LIST_HEAD(&s->list);
        s->fmt = fmt;
-       s->buf = PRINTBUF;
        list_add(&s->list, &c->fsck_errors);
        return s;
 }
@@ -111,9 +110,23 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
        struct printbuf buf = PRINTBUF, *out = &buf;
        int ret = -BCH_ERR_fsck_ignore;
 
+       va_start(args, fmt);
+       prt_vprintf(out, fmt, args);
+       va_end(args);
+
        mutex_lock(&c->fsck_error_lock);
        s = fsck_err_get(c, fmt);
        if (s) {
+               if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
+                       ret = s->ret;
+                       mutex_unlock(&c->fsck_error_lock);
+                       printbuf_exit(&buf);
+                       return ret;
+               }
+
+               kfree(s->last_msg);
+               s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+
                if (c->opts.ratelimit_errors &&
                    !(flags & FSCK_NO_RATELIMIT) &&
                    s->nr >= FSCK_ERR_RATELIMIT_NR) {
@@ -123,8 +136,6 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
                                print = false;
                }
 
-               printbuf_reset(&s->buf);
-               out = &s->buf;
                s->nr++;
        }
 
@@ -133,10 +144,6 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
                prt_printf(out, bch2_log_msg(c, ""));
 #endif
 
-       va_start(args, fmt);
-       prt_vprintf(out, fmt, args);
-       va_end(args);
-
        if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
                if (c->opts.errors != BCH_ON_ERROR_continue ||
                    !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
@@ -190,6 +197,9 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
        else if (suppressing)
                bch_err(c, "Ratelimiting new instances of previous error");
 
+       if (s)
+               s->ret = ret;
+
        mutex_unlock(&c->fsck_error_lock);
 
        printbuf_exit(&buf);
@@ -214,11 +224,11 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
        mutex_lock(&c->fsck_error_lock);
 
        list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
-               if (s->ratelimited)
-                       bch_err(c, "Saw %llu errors like:\n    %s", s->nr, s->buf.buf);
+               if (s->ratelimited && s->last_msg)
+                       bch_err(c, "Saw %llu errors like:\n    %s", s->nr, s->last_msg);
 
                list_del(&s->list);
-               printbuf_exit(&s->buf);
+               kfree(s->last_msg);
                kfree(s);
        }
 
index 9991879dfbff12bb36119646d4222423f09bfd2b..91c7e4ee8f7266ea193414461a145236e7dc5277 100644 (file)
@@ -103,7 +103,8 @@ struct fsck_err_state {
        const char              *fmt;
        u64                     nr;
        bool                    ratelimited;
-       struct printbuf         buf;
+       int                     ret;
+       char                    *last_msg;
 };
 
 #define FSCK_CAN_FIX           (1 << 0)
index 52bb00b52b900bedcf35faf338ce037953fd5f46..f2768a7437e873da942736f0d1968d8e254bb84a 100644 (file)
@@ -605,6 +605,17 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
                : bch2_snapshot_is_ancestor(c, src, dst);
 }
 
+static int ref_visible2(struct bch_fs *c,
+                      u32 src, struct snapshots_seen *src_seen,
+                      u32 dst, struct snapshots_seen *dst_seen)
+{
+       if (dst > src) {
+               swap(dst, src);
+               swap(dst_seen, src_seen);
+       }
+       return key_visible_in_snapshot(c, src_seen, dst, src);
+}
+
 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                              \
        for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
             (_i)->snapshot <= (_snapshot); _i++)                                       \
@@ -1158,10 +1169,102 @@ fsck_err:
        return ret;
 }
 
+struct extent_end {
+       u32                     snapshot;
+       u64                     offset;
+       struct snapshots_seen   seen;
+};
+
+typedef DARRAY(struct extent_end) extent_ends;
+
+static int check_overlapping_extents(struct btree_trans *trans,
+                             struct snapshots_seen *seen,
+                             extent_ends *extent_ends,
+                             struct bkey_s_c k,
+                             struct btree_iter *iter)
+{
+       struct bch_fs *c = trans->c;
+       struct extent_end *i;
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
+
+       darray_for_each(*extent_ends, i) {
+               /* duplicate, due to transaction restart: */
+               if (i->offset   == k.k->p.offset &&
+                   i->snapshot == k.k->p.snapshot)
+                       continue;
+
+               if (!ref_visible2(c,
+                                 k.k->p.snapshot, seen,
+                                 i->snapshot, &i->seen))
+                       continue;
+
+               if (fsck_err_on(i->offset > bkey_start_offset(k.k), c,
+                               "overlapping extents: extent in snapshot %u ends at %llu overlaps with\n%s",
+                               i->snapshot,
+                               i->offset,
+                               (printbuf_reset(&buf),
+                                bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+                       struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+                       bkey_reassemble(update, k);
+                       ret = bch2_trans_update_extent(trans, iter, update, 0);
+                       if (!ret)
+                               goto err;
+               }
+       }
+err:
+fsck_err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+static int extent_ends_at(extent_ends *extent_ends,
+                         struct snapshots_seen *seen,
+                         struct bkey_s_c k)
+{
+       struct extent_end *i, n = (struct extent_end) {
+               .snapshot       = k.k->p.snapshot,
+               .offset         = k.k->p.offset,
+               .seen           = *seen,
+       };
+
+       n.seen.ids.data = kmemdup(seen->ids.data,
+                             sizeof(seen->ids.data[0]) * seen->ids.size,
+                             GFP_KERNEL);
+       if (!n.seen.ids.data)
+               return -ENOMEM;
+
+       darray_for_each(*extent_ends, i) {
+               if (i->snapshot == k.k->p.snapshot) {
+                       snapshots_seen_exit(&i->seen);
+                       *i = n;
+                       return 0;
+               }
+
+               if (i->snapshot >= k.k->p.snapshot)
+                       break;
+       }
+
+       return darray_insert_item(extent_ends, i - extent_ends->data, n);
+}
+
+static void extent_ends_reset(extent_ends *extent_ends)
+{
+       struct extent_end *i;
+
+       darray_for_each(*extent_ends, i)
+               snapshots_seen_exit(&i->seen);
+
+       extent_ends->nr = 0;
+}
+
 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                        struct bkey_s_c k,
                        struct inode_walker *inode,
-                       struct snapshots_seen *s)
+                       struct snapshots_seen *s,
+                       extent_ends *extent_ends)
 {
        struct bch_fs *c = trans->c;
        struct inode_walker_entry *i;
@@ -1189,24 +1292,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                ret = check_i_sectors(trans, inode);
                if (ret)
                        goto err;
+
+               extent_ends_reset(extent_ends);
        }
 
        BUG_ON(!iter->path->should_be_locked);
-#if 0
-       if (bkey_gt(prev.k->k.p, bkey_start_pos(k.k))) {
-               char buf1[200];
-               char buf2[200];
 
-               bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
-               bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+       ret = check_overlapping_extents(trans, s, extent_ends, k, iter);
+       if (ret)
+               goto err;
+
+       ret = extent_ends_at(extent_ends, s, k);
+       if (ret)
+               goto err;
 
-               if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
-                       ret = fix_overlapping_extent(trans, k, prev.k->k.p)
-                               ?: -BCH_ERR_transaction_restart_nested;
-                       goto out;
-               }
-       }
-#endif
        ret = __walk_inode(trans, inode, equiv);
        if (ret < 0)
                goto err;
@@ -1304,13 +1403,9 @@ static int check_extents(struct bch_fs *c)
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
+       extent_ends extent_ends = { 0 };
        int ret = 0;
 
-#if 0
-       struct bkey_buf prev;
-       bch2_bkey_buf_init(&prev);
-       prev.k->k = KEY(0, 0, 0);
-#endif
        snapshots_seen_init(&s);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
@@ -1321,10 +1416,10 @@ static int check_extents(struct bch_fs *c)
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
                        NULL, NULL,
                        BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_extent(&trans, &iter, k, &w, &s));
-#if 0
-       bch2_bkey_buf_exit(&prev, c);
-#endif
+               check_extent(&trans, &iter, k, &w, &s, &extent_ends));
+
+       extent_ends_reset(&extent_ends);
+       darray_exit(&extent_ends);
        inode_walker_exit(&w);
        bch2_trans_exit(&trans);
        snapshots_seen_exit(&s);
index 07fb41ca8c6b4e2e7d12d8cb4948ba499039fc77..9eec12a99535e84675e5ad3d5bd63b3348611f1b 100644 (file)
@@ -49,7 +49,6 @@ void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
 static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
                        u64 dev_bucket, u64 time, unsigned key_type)
 {
-       struct btree_iter iter;
        struct bkey_i *k;
        int ret = 0;
 
@@ -69,13 +68,7 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
        EBUG_ON(lru_pos_time(k->k.p) != time);
        EBUG_ON(k->k.p.offset != dev_bucket);
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
-                            k->k.p, BTREE_ITER_INTENT);
-
-       ret = bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(trans, &iter, k, 0);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
+       return bch2_trans_update_buffered(trans, BTREE_ID_lru, k);
 }
 
 int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
@@ -99,6 +92,13 @@ int bch2_lru_change(struct btree_trans *trans,
                bch2_lru_set(trans, lru_id, dev_bucket, new_time);
 }
 
+static const char * const bch2_lru_types[] = {
+#define x(n) #n,
+       BCH_LRU_TYPES()
+#undef x
+       NULL
+};
+
 static int bch2_check_lru_key(struct btree_trans *trans,
                              struct btree_iter *lru_iter,
                              struct bkey_s_c lru_k)
@@ -110,7 +110,9 @@ static int bch2_check_lru_key(struct btree_trans *trans,
        const struct bch_alloc_v4 *a;
        struct printbuf buf1 = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
+       enum bch_lru_type type = lru_type(lru_k);
        struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
+       u64 idx;
        int ret;
 
        if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
@@ -126,11 +128,21 @@ static int bch2_check_lru_key(struct btree_trans *trans,
 
        a = bch2_alloc_to_v4(k, &a_convert);
 
+       switch (type) {
+       case BCH_LRU_read:
+               idx = alloc_lru_idx_read(*a);
+               break;
+       case BCH_LRU_fragmentation:
+               idx = a->fragmentation_lru;
+               break;
+       }
+
        if (fsck_err_on(lru_k.k->type != KEY_TYPE_set ||
-                       a->data_type != BCH_DATA_cached ||
-                       a->io_time[READ] != lru_pos_time(lru_k.k->p), c,
-                       "incorrect lru entry (time %llu) %s\n"
+                       lru_pos_time(lru_k.k->p) != idx, c,
+                       "incorrect lru entry: lru %s time %llu\n"
+                       "  %s\n"
                        "  for %s",
+                       bch2_lru_types[type],
                        lru_pos_time(lru_k.k->p),
                        (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
                        (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
index b8d9848cdb1acc030a84cc950b92f5504354eac1..78a6076999ed11383a029e007440c7c61d0f22b5 100644 (file)
@@ -22,6 +22,27 @@ static inline u64 lru_pos_time(struct bpos pos)
        return pos.inode & ~(~0ULL << LRU_TIME_BITS);
 }
 
+#define BCH_LRU_TYPES()                \
+       x(read)                 \
+       x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+       BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START    ((1U << 16) - 1)
+
+static inline enum bch_lru_type lru_type(struct bkey_s_c l)
+{
+       u16 lru_id = l.k->p.inode >> 48;
+
+       if (lru_id == BCH_LRU_FRAGMENTATION_START)
+               return BCH_LRU_fragmentation;
+       return BCH_LRU_read;
+}
+
 int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
 void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
index e7eb55bd8a251cf40d7ded91e66012e45cbd7b46..7dac9264304e4188f43d591790900e4fa3bed518 100644 (file)
@@ -653,13 +653,13 @@ failed_to_evacuate:
        printbuf_exit(&buf);
 }
 
-int __bch2_evacuate_bucket(struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct btree_trans *trans,
+                          struct moving_context *ctxt,
                           struct bpos bucket, int gen,
                           struct data_update_opts _data_opts)
 {
        struct bch_fs *c = ctxt->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_buf sk;
        struct bch_backpointer bp;
@@ -668,17 +668,17 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
+       u64 fragmentation;
        u64 bp_offset = 0, cur_inum = U64_MAX;
        int ret = 0;
 
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             bucket, BTREE_ITER_CACHED);
-       ret = lockrestart_do(&trans,
+       ret = lockrestart_do(trans,
                        bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret) {
                bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret));
@@ -688,17 +688,18 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
        a = bch2_alloc_to_v4(k, &a_convert);
        dirty_sectors = a->dirty_sectors;
        bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+       fragmentation = a->fragmentation_lru;
 
-       ret = bch2_btree_write_buffer_flush(&trans);
+       ret = bch2_btree_write_buffer_flush(trans);
        if (ret) {
                bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret));
                goto err;
        }
 
-       while (!(ret = move_ratelimit(&trans, ctxt))) {
-               bch2_trans_begin(&trans);
+       while (!(ret = move_ratelimit(trans, ctxt))) {
+               bch2_trans_begin(trans);
 
-               ret = bch2_get_next_backpointer(&trans, bucket, gen,
+               ret = bch2_get_next_backpointer(trans, bucket, gen,
                                                &bp_offset, &bp,
                                                BTREE_ITER_CACHED);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -713,7 +714,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                        struct bkey_s_c k;
                        unsigned i = 0;
 
-                       k = bch2_backpointer_get_key(&trans, &iter,
+                       k = bch2_backpointer_get_key(trans, &iter,
                                                bucket, bp_offset, bp);
                        ret = bkey_err(k);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -726,9 +727,9 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);
 
-                       ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+                       ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
                        if (ret) {
-                               bch2_trans_iter_exit(&trans, &iter);
+                               bch2_trans_iter_exit(trans, &iter);
                                continue;
                        }
 
@@ -742,15 +743,15 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                                i++;
                        }
 
-                       ret = bch2_move_extent(&trans, &iter, ctxt, io_opts,
+                       ret = bch2_move_extent(trans, &iter, ctxt, io_opts,
                                               bp.btree_id, k, data_opts);
-                       bch2_trans_iter_exit(&trans, &iter);
+                       bch2_trans_iter_exit(trans, &iter);
 
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt, &trans);
+                               bch2_move_ctxt_wait_for_io(ctxt, trans);
                                continue;
                        }
                        if (ret)
@@ -762,7 +763,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                } else {
                        struct btree *b;
 
-                       b = bch2_backpointer_get_node(&trans, &iter,
+                       b = bch2_backpointer_get_node(trans, &iter,
                                                bucket, bp_offset, bp);
                        ret = PTR_ERR_OR_ZERO(b);
                        if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
@@ -774,8 +775,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
                        if (!b)
                                goto next;
 
-                       ret = bch2_btree_node_rewrite(&trans, &iter, b, 0);
-                       bch2_trans_iter_exit(&trans, &iter);
+                       ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+                       bch2_trans_iter_exit(trans, &iter);
 
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
@@ -792,17 +793,16 @@ next:
                bp_offset++;
        }
 
-       trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, ret);
+       trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
                move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
                closure_sync(&ctxt->cl);
                if (!ctxt->write_error)
-                       verify_bucket_evacuated(&trans, bucket, gen);
+                       verify_bucket_evacuated(trans, bucket, gen);
        }
 err:
-       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
        return ret;
 }
@@ -815,12 +815,15 @@ int bch2_evacuate_bucket(struct bch_fs *c,
                         struct write_point_specifier wp,
                         bool wait_on_copygc)
 {
+       struct btree_trans trans;
        struct moving_context ctxt;
        int ret;
 
+       bch2_trans_init(&trans, c, 0, 0);
        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-       ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts);
+       ret = __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts);
        bch2_moving_ctxt_exit(&ctxt);
+       bch2_trans_exit(&trans);
 
        return ret;
 }
index aef613802935e90958a9e3d4659b7d94e57773df..c5a7c0add1d695939c6b0b52123c5356d9a07228 100644 (file)
@@ -66,7 +66,8 @@ int bch2_move_data(struct bch_fs *,
                   bool,
                   move_pred_fn, void *);
 
-int __bch2_evacuate_bucket(struct moving_context *,
+int __bch2_evacuate_bucket(struct btree_trans *,
+                          struct moving_context *,
                           struct bpos, int,
                           struct data_update_opts);
 int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
index f0ab65ffab7373dca455b4bec9528c31d6d0fbe3..1a64643313cf188c4d197a3cfd11988250c28bd0 100644 (file)
@@ -10,6 +10,7 @@
 #include "alloc_foreground.h"
 #include "btree_iter.h"
 #include "btree_update.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
 #include "disk_groups.h"
@@ -19,6 +20,7 @@
 #include "eytzinger.h"
 #include "io.h"
 #include "keylist.h"
+#include "lru.h"
 #include "move.h"
 #include "movinggc.h"
 #include "super-io.h"
 #include <linux/sort.h>
 #include <linux/wait.h>
 
-static inline int fragmentation_cmp(copygc_heap *heap,
-                                  struct copygc_heap_entry l,
-                                  struct copygc_heap_entry r)
+static int bch2_bucket_is_movable(struct btree_trans *trans,
+                                 struct bpos bucket, u64 time, u8 *gen)
 {
-       return cmp_int(l.fragmentation, r.fragmentation);
-}
-
-static int find_buckets_to_copygc(struct bch_fs *c)
-{
-       copygc_heap *h = &c->copygc_heap;
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
+       struct bch_alloc_v4 _a;
+       const struct bch_alloc_v4 *a;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset))
+               return 0;
 
-       /*
-        * Find buckets with lowest sector counts, skipping completely
-        * empty buckets, by building a maxheap sorted by sector count,
-        * and repeatedly replacing the maximum element until all
-        * buckets have been visited.
-        */
-       h->used = 0;
-
-       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
-                          BTREE_ITER_PREFETCH, k, ret) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
-               struct copygc_heap_entry e;
-               struct bch_alloc_v4 a_convert;
-               const struct bch_alloc_v4 *a;
-
-               a = bch2_alloc_to_v4(k, &a_convert);
-
-               if ((a->data_type != BCH_DATA_btree &&
-                    a->data_type != BCH_DATA_user) ||
-                   a->dirty_sectors >= ca->mi.bucket_size ||
-                   bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
-                       continue;
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (ret)
+               return ret;
 
-               e = (struct copygc_heap_entry) {
-                       .dev            = iter.pos.inode,
-                       .gen            = a->gen,
-                       .replicas       = 1 + a->stripe_redundancy,
-                       .fragmentation  = div_u64((u64) a->dirty_sectors * (1ULL << 31),
-                                                 ca->mi.bucket_size),
-                       .sectors        = a->dirty_sectors,
-                       .bucket         = iter.pos.offset,
-               };
-               heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
+       a = bch2_alloc_to_v4(k, &_a);
+       *gen = a->gen;
+       ret = (a->data_type == BCH_DATA_btree ||
+              a->data_type == BCH_DATA_user) &&
+               a->fragmentation_lru &&
+               a->fragmentation_lru <= time;
 
+       if (ret) {
+               struct printbuf buf = PRINTBUF;
+
+               bch2_bkey_val_to_text(&buf, trans->c, k);
+               pr_debug("%s", buf.buf);
+               printbuf_exit(&buf);
        }
-       bch2_trans_iter_exit(&trans, &iter);
 
-       bch2_trans_exit(&trans);
        return ret;
 }
 
+static int bch2_copygc_next_bucket(struct btree_trans *trans,
+                                  struct bpos *bucket, u8 *gen, struct bpos *pos)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+                                 bpos_max(*pos, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0)),
+                                 lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+                                 0, k, ({
+               *bucket = u64_to_bucket(k.k->p.offset);
+
+               bch2_bucket_is_movable(trans, *bucket, lru_pos_time(k.k->p), gen);
+       }));
+
+       *pos = iter.pos;
+       if (ret < 0)
+               return ret;
+       return ret ? 0 : -ENOENT;
+}
+
 static int bch2_copygc(struct bch_fs *c)
 {
-       copygc_heap *h = &c->copygc_heap;
-       struct copygc_heap_entry e;
        struct bch_move_stats move_stats;
-       struct bch_dev *ca;
-       unsigned dev_idx;
-       size_t heap_size = 0;
+       struct btree_trans trans;
        struct moving_context ctxt;
        struct data_update_opts data_opts = {
                .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
        };
+       struct bpos bucket;
+       struct bpos pos;
+       u8 gen = 0;
+       unsigned nr_evacuated;
        int ret = 0;
 
        bch2_move_stats_init(&move_stats, "copygc");
-
-       for_each_rw_member(ca, c, dev_idx)
-               heap_size += ca->mi.nbuckets >> 7;
-
-       if (h->size < heap_size) {
-               free_heap(&c->copygc_heap);
-               if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
-                       bch_err(c, "error allocating copygc heap");
-                       return 0;
-               }
-       }
-
-       ret = find_buckets_to_copygc(c);
-       if (ret) {
-               bch2_fs_fatal_error(c, "error walking buckets to copygc!");
-               return ret;
-       }
-
-       if (!h->used) {
-               s64 wait = S64_MAX, dev_wait;
-               u64 dev_min_wait_fragmented = 0;
-               u64 dev_min_wait_allowed = 0;
-               int dev_min_wait = -1;
-
-               for_each_rw_member(ca, c, dev_idx) {
-                       struct bch_dev_usage usage = bch2_dev_usage_read(ca);
-                       s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
-                                              ca->mi.bucket_size) >> 1);
-                       s64 fragmented = usage.d[BCH_DATA_user].fragmented;
-
-                       dev_wait = max(0LL, allowed - fragmented);
-
-                       if (dev_min_wait < 0 || dev_wait < wait) {
-                               dev_min_wait = dev_idx;
-                               dev_min_wait_fragmented = fragmented;
-                               dev_min_wait_allowed    = allowed;
-                       }
-               }
-
-               bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu",
-                                   dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed);
-               return 0;
-       }
-
-       heap_resort(h, fragmentation_cmp, NULL);
-
        bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
                              writepoint_ptr(&c->copygc_write_point),
                              false);
+       bch2_trans_init(&trans, c, 0, 0);
+
+       ret = bch2_btree_write_buffer_flush(&trans);
+       BUG_ON(ret);
 
-       /* not correct w.r.t. device removal */
-       while (h->used && !ret) {
-               BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
-               ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen,
-                                            data_opts);
+       for (nr_evacuated = 0, pos = POS_MIN;
+            nr_evacuated < 32 && !ret;
+            nr_evacuated++, pos = bpos_nosnap_successor(pos)) {
+               ret = bch2_copygc_next_bucket(&trans, &bucket, &gen, &pos) ?:
+                       __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts);
+               if (bkey_eq(pos, POS_MAX))
+                       break;
        }
 
+       bch2_trans_exit(&trans);
        bch2_moving_ctxt_exit(&ctxt);
 
+       /* no entries in LRU btree found, or got to end: */
+       if (ret == -ENOENT)
+               ret = 0;
+
        if (ret < 0 && !bch2_err_matches(ret, EROFS))
                bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
 
index 178f064244601ec302521213f8bac5d80b4a83a8..1976d5fa3427ff0d4f1036e85c2e9ab42e914591 100644 (file)
@@ -1105,6 +1105,9 @@ int bch2_fs_recovery(struct bch_fs *c)
                        c->opts.version_upgrade = true;
                        c->opts.fsck            = true;
                        c->opts.fix_errors      = FSCK_OPT_YES;
+               } else if (c->sb.version < bcachefs_metadata_version_fragmentation_lru) {
+                       bch_info(c, "version prior to backpointers, upgrade required");
+                       c->opts.version_upgrade = true;
                }
        }
 
index 1805c8542d65381605a5506a5c82554102524587..ba281104eb302bb0621aa026003b14ee67334ea3 100644 (file)
@@ -512,8 +512,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
                n->v.pad        = 0;
                SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
 
-               ret   = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
-                       bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+               ret   = bch2_trans_update(trans, &iter, &n->k_i, 0);
                if (ret)
                        goto err;
 
index b6740eab78d3d588f0813e9a2433d45196d7271c..7c488c3d78e0e31d0dfa26bf525ae9e04e71c85f 100644 (file)
@@ -8,15 +8,15 @@
 void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
                          unsigned, struct printbuf *);
+int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c,
+                      struct bkey_s_c, unsigned);
 
 #define bch2_bkey_ops_snapshot ((struct bkey_ops) {            \
        .key_invalid    = bch2_snapshot_invalid,                \
        .val_to_text    = bch2_snapshot_to_text,                \
+       .atomic_trigger = bch2_mark_snapshot,                   \
 })
 
-int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c,
-                      struct bkey_s_c, unsigned);
-
 static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
 {
        return genradix_ptr(&c->snapshots, U32_MAX - id);
@@ -68,6 +68,13 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
        return id == ancestor;
 }
 
+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
+{
+       struct snapshot_t *t = snapshot_t(c, id);
+
+       return (t->children[0]|t->children[1]) != 0;
+}
+
 static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
 {
        u32 *i;
index 8bed11857bacfb824d98cffd3486d1e3ea75731b..3b7cf9e8e5d4a78a9cd06e8bdae2ba54064bdb51 100644 (file)
@@ -488,7 +488,6 @@ static void __bch2_fs_free(struct bch_fs *c)
        kfree(rcu_dereference_protected(c->disk_groups, 1));
        kfree(c->journal_seq_blacklist_table);
        kfree(c->unused_inode_hints);
-       free_heap(&c->copygc_heap);
 
        if (c->io_complete_wq)
                destroy_workqueue(c->io_complete_wq);
index a9790954aa5ad5a3c8c6c6b78e9c1ae4cd249cb5..bf5ffb47ea7d3f93684700ae7dbedd3c2f758bfe 100644 (file)
@@ -433,8 +433,8 @@ static const struct time_unit {
        { "us",         NSEC_PER_USEC    },
        { "ms",         NSEC_PER_MSEC    },
        { "s",          NSEC_PER_SEC     },
-       { "m",          NSEC_PER_SEC * 60},
-       { "h",          NSEC_PER_SEC * 3600},
+       { "m",          (u64) NSEC_PER_SEC * 60},
+       { "h",          (u64) NSEC_PER_SEC * 3600},
        { "eon",        U64_MAX          },
 };
 
index 41337a7faeb9710b95cac7a97a9805fac278b9c6..5a6eadc0e8405459ce10bf9b72cbd0db42b07eaa 100644 (file)
@@ -833,19 +833,12 @@ struct six_lock_count six_lock_counts(struct six_lock *lock)
 {
        struct six_lock_count ret;
 
-       ret.n[SIX_LOCK_read]    = 0;
+       ret.n[SIX_LOCK_read]    = !lock->readers
+               ? lock->state.read_lock
+               : pcpu_read_count(lock);
        ret.n[SIX_LOCK_intent]  = lock->state.intent_lock + lock->intent_lock_recurse;
        ret.n[SIX_LOCK_write]   = lock->state.seq & 1;
 
-       if (!lock->readers)
-               ret.n[SIX_LOCK_read] += lock->state.read_lock;
-       else {
-               int cpu;
-
-               for_each_possible_cpu(cpu)
-                       ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu);
-       }
-
        return ret;
 }
 EXPORT_SYMBOL_GPL(six_lock_counts);