Update bcachefs sources to 1d669389f7 bcachefs: use a radix tree for inum bitmap...
author	Kent Overstreet <kent.overstreet@gmail.com>
	Sat, 7 Nov 2020 16:26:00 +0000 (11:26 -0500)
committer	Kent Overstreet <kent.overstreet@gmail.com>
	Sat, 7 Nov 2020 18:09:23 +0000 (13:09 -0500)
48 files changed:
.bcachefs_revision
cmd_migrate.c
include/linux/cpumask.h
include/linux/page.h
include/trace/events/bcachefs.h
libbcachefs/alloc_background.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_sort.c
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/compress.c
libbcachefs/debug.c
libbcachefs/debug.h
libbcachefs/ec.c
libbcachefs/extents.c
libbcachefs/fs-common.c
libbcachefs/fs-io.c
libbcachefs/fs-io.h
libbcachefs/fs.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/journal_reclaim.c
libbcachefs/recovery.c
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/varint.c [new file with mode: 0644]
libbcachefs/varint.h [new file with mode: 0644]
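
The headline change, fsck's inum bitmap moving to a radix tree, lives in
fsck.c, whose hunks fall beyond the portion shown here. The idea is to key
a bitmap by inode number through the kernel's generic radix tree
(generic-radix-tree.h), so sparse 64-bit inums no longer force one huge
flat allocation. A minimal sketch using the genradix API; the helper names
are illustrative, not the actual fsck.c code:

	typedef GENRADIX(unsigned long) inum_bitmap;

	/* NULL from genradix_ptr() just means "never set": */
	static bool inum_bitmap_test(inum_bitmap *b, u64 inum)
	{
		unsigned long *w = genradix_ptr(b, inum / BITS_PER_LONG);

		return w && test_bit(inum % BITS_PER_LONG, w);
	}

	/* allocates only the region of the bitmap actually touched: */
	static int inum_bitmap_set(inum_bitmap *b, u64 inum)
	{
		unsigned long *w = genradix_ptr_alloc(b, inum / BITS_PER_LONG,
						      GFP_KERNEL);
		if (!w)
			return -ENOMEM;
		__set_bit(inum % BITS_PER_LONG, w);
		return 0;
	}

As with any genradix, genradix_init() and genradix_free() bracket the
bitmap's lifetime.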

diff --git a/.bcachefs_revision b/.bcachefs_revision
index dc226f849e9c05cf2f76fe7f8c92f9e2dadd5167..dc58304780a4458127051429c47f5053788c3d7a 100644 (file)
@@ -1 +1 @@
-8436db7aac9ced2118bf19b8f1bf3682f479d17e
+1d669389f79de8571732c13fdf4d23039e2308fd
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 797c51e0ef540988947e7ba82bde3eb022a06a65..42fbc2bc5feabb60b4907bff80941600c4dbee2b 100644 (file)
@@ -122,7 +122,7 @@ static void update_inode(struct bch_fs *c,
        struct bkey_inode_buf packed;
        int ret;
 
-       bch2_inode_pack(&packed, inode);
+       bch2_inode_pack(c, &packed, inode);
        ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
                                NULL, NULL, 0);
        if (ret)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 024d645c5579914b386b8dcdd10cb4a7eff5d5e8..bfab7ea70eb56cab400de1a57f46d08c1210207c 100644 (file)
@@ -10,6 +10,8 @@
 #define cpu_present(cpu)       ((cpu) == 0)
 #define cpu_active(cpu)                ((cpu) == 0)
 
+#define raw_smp_processor_id() 0U
+
 #define for_each_cpu(cpu, mask)                        \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)            \
diff --git a/include/linux/page.h b/include/linux/page.h
index 87be064f7dbc3df218409ac580ff7ddc823f00a3..310b3eda6de0f63aeb7162bbeb85dc2ddb7ec684 100644 (file)
@@ -21,6 +21,8 @@ struct page;
 #define kmap_atomic(page)              page_address(page)
 #define kunmap_atomic(addr)            do {} while (0)
 
+#define PageHighMem(page)              false
+
 static const char zero_page[PAGE_SIZE];
 
 #define ZERO_PAGE(o)                   ((struct page *) &zero_page[0])
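
These stubs are part of bcachefs-tools' userspace shim headers: in
userspace every page is addressable, so with PageHighMem() pinned to false
any highmem handling folds away at compile time and kmap_atomic() is just
page_address(). A hypothetical caller:

	void *p = kmap_atomic(page);	/* page_address(page) here */
	memcpy(buf, p, PAGE_SIZE);	/* buf: some destination buffer */
	kunmap_atomic(p);		/* expands to a no-op */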
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 9b4e8295ed75a55b9e9150198e693a758f808f9d..ba2c55559796d1d3d4ee5edab07023766a1018dd 100644 (file)
@@ -536,9 +536,46 @@ DEFINE_EVENT(transaction_restart,  trans_restart_btree_node_reused,
        TP_ARGS(ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_would_deadlock,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
+TRACE_EVENT(trans_restart_would_deadlock,
+       TP_PROTO(unsigned long  trans_ip,
+                unsigned long  caller_ip,
+                unsigned       reason,
+                enum btree_id  have_btree_id,
+                unsigned       have_iter_type,
+                enum btree_id  want_btree_id,
+                unsigned       want_iter_type),
+       TP_ARGS(trans_ip, caller_ip, reason,
+               have_btree_id, have_iter_type,
+               want_btree_id, want_iter_type),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,          trans_ip        )
+               __field(unsigned long,          caller_ip       )
+               __field(u8,                     reason          )
+               __field(u8,                     have_btree_id   )
+               __field(u8,                     have_iter_type  )
+               __field(u8,                     want_btree_id   )
+               __field(u8,                     want_iter_type  )
+       ),
+
+       TP_fast_assign(
+               __entry->trans_ip               = trans_ip;
+               __entry->caller_ip              = caller_ip;
+               __entry->reason                 = reason;
+               __entry->have_btree_id          = have_btree_id;
+               __entry->have_iter_type         = have_iter_type;
+               __entry->want_btree_id          = want_btree_id;
+               __entry->want_iter_type         = want_iter_type;
+       ),
+
+       TP_printk("%pF %pF because %u have %u:%u want %u:%u",
+                 (void *) __entry->trans_ip,
+                 (void *) __entry->caller_ip,
+                 __entry->reason,
+                 __entry->have_btree_id,
+                 __entry->have_iter_type,
+                 __entry->want_btree_id,
+                 __entry->want_iter_type)
 );
 
 TRACE_EVENT(trans_restart_iters_realloced,
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index cbaff56f7473f4f77edb5da718312ee7f29c598b..d10ff56e4de169fc5bbb63823f4a5c49cfa8fe1d 100644 (file)
@@ -76,7 +76,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
 static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
                                          size_t bucket)
 {
-       if (expensive_debug_checks(c)) {
+       if (bch2_expensive_debug_checks) {
                size_t iter;
                long i;
                unsigned j;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 29f411635f29968e9fae422d9fbc5ad67318b265..35311dbb189caca3ddad7fa6f3a7b222e5300c6b 100644 (file)
@@ -265,6 +265,8 @@ do {                                                                        \
        BCH_DEBUG_PARAM(debug_check_bkeys,                              \
                "Run bkey_debugcheck (primarily checking GC/allocation "\
                "information) when iterating over keys")                \
+       BCH_DEBUG_PARAM(debug_check_btree_accounting,                   \
+               "Verify btree accounting for keys within a node")       \
        BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
                "Reread btree nodes at various points to verify the "   \
                "mergesort in the read path against modifications "     \
@@ -295,6 +297,16 @@ do {                                                                       \
 #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
 #endif
 
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif
+
 #define BCH_TIME_STATS()                       \
        x(btree_node_mem_alloc)                 \
        x(btree_node_split)                     \
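
The debug knobs move from per-filesystem members of struct bch_fs (dropped
in a later hunk) to single global booleans, declared here by stamping the
X-macro list with an extern declaration. The definition site isn't in this
view; presumably (super.c is among the changed files) the same list is
stamped again, something like:

	/* illustrative, not the literal definition site: */
	#define BCH_DEBUG_PARAM(name, description) bool bch2_##name;
	BCH_DEBUG_PARAMS()
	#undef BCH_DEBUG_PARAM

With !CONFIG_BCACHEFS_DEBUG the debug-only params become static const
false, so branches like "if (bch2_expensive_debug_checks)" are dead code
the compiler can delete outright.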
@@ -529,6 +541,10 @@ struct journal_keys {
        u64                     journal_seq_base;
 };
 
+struct btree_iter_buf {
+       struct btree_iter       *iter;
+};
+
 struct bch_fs {
        struct closure          cl;
 
@@ -624,6 +640,7 @@ struct bch_fs {
        struct mutex            btree_trans_lock;
        struct list_head        btree_trans_list;
        mempool_t               btree_iters_pool;
+       struct btree_iter_buf  __percpu *btree_iters_bufs;
 
        struct btree_key_cache  btree_key_cache;
 
@@ -801,7 +818,8 @@ struct bch_fs {
        struct mutex            verify_lock;
 #endif
 
-       u64                     unused_inode_hint;
+       u64                     *unused_inode_hints;
+       unsigned                inode_shard_bits;
 
        /*
         * A btree node on disk could have too many bsets for an iterator to fit
@@ -826,10 +844,6 @@ struct bch_fs {
        unsigned                copy_gc_enabled:1;
        bool                    promote_whole_extents;
 
-#define BCH_DEBUG_PARAM(name, description) bool name;
-       BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
        struct time_stats       times[BCH_TIME_STAT_NR];
 };
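
Note the change above: the single unused_inode_hint cursor becomes an
array of hints plus inode_shard_bits, spreading inode creation across
shards instead of serializing every create behind one position. The
consumer side isn't shown here, but the raw_smp_processor_id() stub added
to include/linux/cpumask.h earlier suggests shards are picked by CPU; a
hedged sketch, with invented indexing:

	/* sketch only: the real selection logic in inode.c may differ */
	u64 *hint = c->unused_inode_hints +
		(raw_smp_processor_id() &
		 ((1U << c->inode_shard_bits) - 1));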
 
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 2926c648a17f2761fe77df869dd7106994539866..94b5418587e364591b29643fea73b3eb53a5103a 100644 (file)
@@ -669,10 +669,10 @@ struct bch_inode_generation {
 } __attribute__((packed, aligned(8)));
 
 #define BCH_INODE_FIELDS()                     \
-       x(bi_atime,                     64)     \
-       x(bi_ctime,                     64)     \
-       x(bi_mtime,                     64)     \
-       x(bi_otime,                     64)     \
+       x(bi_atime,                     96)     \
+       x(bi_ctime,                     96)     \
+       x(bi_mtime,                     96)     \
+       x(bi_otime,                     96)     \
        x(bi_size,                      64)     \
        x(bi_sectors,                   64)     \
        x(bi_uid,                       32)     \
@@ -739,7 +739,8 @@ enum {
 #define BCH_INODE_UNLINKED     (1 << __BCH_INODE_UNLINKED)
 
 LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 32);
+LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
 
 /* Dirents */
 
@@ -1330,13 +1331,15 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,       struct bch_sb, flags[3],  0, 16);
        x(btree_ptr_v2,                 11)     \
        x(extents_above_btree_updates,  12)     \
        x(btree_updates_journalled,     13)     \
-       x(reflink_inline_data,          14)
+       x(reflink_inline_data,          14)     \
+       x(new_varint,                   15)
 
 #define BCH_SB_FEATURES_ALL                            \
        ((1ULL << BCH_FEATURE_new_siphash)|             \
         (1ULL << BCH_FEATURE_new_extent_overwrite)|    \
         (1ULL << BCH_FEATURE_btree_ptr_v2)|            \
-        (1ULL << BCH_FEATURE_extents_above_btree_updates))
+        (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+        (1ULL << BCH_FEATURE_new_varint))\
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
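
BCH_FEATURE_new_varint pairs with the new libbcachefs/varint.{c,h} in the
file list: inode fields are now packed as variable-length integers (note
the timestamp fields widening from 64 to 96 bits above), INODE_NEW_VARINT
marks inodes encoded the new way, and the superblock feature bit gates the
format change. The actual bch2 varint routines aren't in this section;
purely for illustration, a generic 7-bits-per-byte (LEB128-style) varint
works like:

	/* not necessarily the on-disk format varint.c implements: */
	static unsigned varint_encode(u8 *out, u64 v)
	{
		unsigned bytes = 0;

		do {
			out[bytes++] = (v & 0x7f) | (v > 0x7f ? 0x80 : 0);
			v >>= 7;
		} while (v);

		return bytes;
	}

	static unsigned varint_decode(const u8 *in, u64 *v)
	{
		unsigned bytes = 0, shift = 0;

		*v = 0;
		do {
			*v |= (u64) (in[bytes] & 0x7f) << shift;
			shift += 7;
		} while (in[bytes++] & 0x80);

		return bytes;
	}

Small values then cost one or two bytes instead of a fixed eight.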
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 4d0c9129cd4abcb9cef2dc9543f124f990d872cb..c06d0a965be1acec16cc3e856cb6e121002cb236 100644 (file)
@@ -411,7 +411,7 @@ static bool bkey_packed_successor(struct bkey_packed *out,
 
                if ((*p & mask) != mask) {
                        *p += 1ULL << offset;
-                       EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+                       EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
                        return true;
                }
 
@@ -1054,9 +1054,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
 }
 
 __pure __flatten
-int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
-                          const struct bkey_packed *r,
-                          const struct btree *b)
+int bch2_bkey_cmp_packed(const struct btree *b,
+                        const struct bkey_packed *l,
+                        const struct bkey_packed *r)
 {
        struct bkey unpacked;
 
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 80ea488d57b0c8f806eb6ae1a32d4f1909a30ce6..2d2c640305e21e0580be6588a0026944423615a5 100644 (file)
@@ -67,13 +67,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
 #define bkey_whiteout(_k)                              \
        ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
 
-#define bkey_packed_typecheck(_k)                                      \
-({                                                                     \
-       BUILD_BUG_ON(!type_is(_k, struct bkey *) &&                     \
-                    !type_is(_k, struct bkey_packed *));               \
-       type_is(_k, struct bkey_packed *);                              \
-})
-
 enum bkey_lr_packed {
        BKEY_PACKED_BOTH,
        BKEY_PACKED_RIGHT,
@@ -81,9 +74,6 @@ enum bkey_lr_packed {
        BKEY_PACKED_NONE,
 };
 
-#define bkey_lr_packed_typecheck(_l, _r)                               \
-       (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
-
 #define bkey_lr_packed(_l, _r)                                         \
        ((_l)->format + ((_r)->format << 1))
 
@@ -132,9 +122,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
                                          const struct bpos *);
 
 __pure
-int __bch2_bkey_cmp_packed(const struct bkey_packed *,
-                          const struct bkey_packed *,
-                          const struct btree *);
+int bch2_bkey_cmp_packed(const struct btree *,
+                        const struct bkey_packed *,
+                        const struct bkey_packed *);
 
 __pure
 int __bch2_bkey_cmp_left_packed(const struct btree *,
@@ -160,37 +150,6 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b,
        return bkey_cmp_left_packed(b, l, &r);
 }
 
-/*
- * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
- * skip dispatching on k->format:
- */
-#define bkey_cmp_packed(_b, _l, _r)                                    \
-({                                                                     \
-       int _cmp;                                                       \
-                                                                       \
-       switch (bkey_lr_packed_typecheck(_l, _r)) {                     \
-       case BKEY_PACKED_NONE:                                          \
-               _cmp = bkey_cmp(((struct bkey *) (_l))->p,              \
-                               ((struct bkey *) (_r))->p);             \
-               break;                                                  \
-       case BKEY_PACKED_LEFT:                                          \
-               _cmp = bkey_cmp_left_packed((_b),                       \
-                                 (struct bkey_packed *) (_l),          \
-                                 &((struct bkey *) (_r))->p);          \
-               break;                                                  \
-       case BKEY_PACKED_RIGHT:                                         \
-               _cmp = -bkey_cmp_left_packed((_b),                      \
-                                 (struct bkey_packed *) (_r),          \
-                                 &((struct bkey *) (_l))->p);          \
-               break;                                                  \
-       case BKEY_PACKED_BOTH:                                          \
-               _cmp = __bch2_bkey_cmp_packed((void *) (_l),            \
-                                        (void *) (_r), (_b));          \
-               break;                                                  \
-       }                                                               \
-       _cmp;                                                           \
-})
-
 #if 1
 static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
 {
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 32849229801dbbf195f3baa4421c96e034971c21..99b7fce2bfd30716fc53e3a6c678fe1683f93319 100644 (file)
@@ -236,7 +236,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c,
        const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
        enum merge_result ret;
 
-       if (key_merging_disabled(c) ||
+       if (bch2_key_merging_disabled ||
            !ops->key_merge ||
            l.k->type != r.k->type ||
            bversion_cmp(l.k->version, r.k->version) ||
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index 839e78d1dc35fb3e71fdaff3407a9a50d58cd50d..99e0a4011faeeefc497d204a00362111676b5366 100644 (file)
@@ -86,7 +86,7 @@ static inline int key_sort_fix_overlapping_cmp(struct btree *b,
                                               struct bkey_packed *l,
                                               struct bkey_packed *r)
 {
-       return bkey_cmp_packed(b, l, r) ?:
+       return bch2_bkey_cmp_packed(b, l, r) ?:
                cmp_int((unsigned long) l, (unsigned long) r);
 }
 
@@ -98,7 +98,7 @@ static inline bool should_drop_next_key(struct sort_iter *iter)
         * and should be dropped.
         */
        return iter->used >= 2 &&
-               !bkey_cmp_packed(iter->b,
+               !bch2_bkey_cmp_packed(iter->b,
                                 iter->data[0].k,
                                 iter->data[1].k);
 }
@@ -223,7 +223,7 @@ static inline int sort_keys_cmp(struct btree *b,
                                struct bkey_packed *l,
                                struct bkey_packed *r)
 {
-       return bkey_cmp_packed(b, l, r) ?:
+       return bch2_bkey_cmp_packed(b, l, r) ?:
                (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
                (int) l->needs_whiteout - (int) r->needs_whiteout;
 }
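
All of these comparators share one idiom: compare position first, then let
?: fall through to a tiebreaker only when the result is zero (cmp_int() is
bcachefs's three-way compare, essentially ((l > r) - (l < r))). E.g. the
chain in key_sort_fix_overlapping_cmp() above expands to:

	int cmp = bch2_bkey_cmp_packed(b, l, r);
	if (!cmp)	/* equal positions: break the tie by address */
		cmp = cmp_int((unsigned long) l, (unsigned long) r);

which makes the ordering total even for duplicate keys.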
@@ -245,7 +245,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
                        continue;
 
                while ((next = sort_iter_peek(iter)) &&
-                      !bkey_cmp_packed(iter->b, in, next)) {
+                      !bch2_bkey_cmp_packed(iter->b, in, next)) {
                        BUG_ON(in->needs_whiteout &&
                               next->needs_whiteout);
                        needs_whiteout |= in->needs_whiteout;
@@ -406,7 +406,7 @@ static inline int sort_extents_cmp(struct btree *b,
                                   struct bkey_packed *l,
                                   struct bkey_packed *r)
 {
-       return bkey_cmp_packed(b, l, r) ?:
+       return bch2_bkey_cmp_packed(b, l, r) ?:
                (int) bkey_deleted(l) - (int) bkey_deleted(r);
 }
 
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index f7c2841ed8a79908c602e7d6ab470a66aecc6247..26716657453f45c4684259ce1eb92674776eec3e 100644 (file)
@@ -369,10 +369,10 @@ static struct bkey_float *bkey_float(const struct btree *b,
        return ro_aux_tree_base(b, t)->f + idx;
 }
 
-static void bset_aux_tree_verify(struct btree *b)
+static void bset_aux_tree_verify(const struct btree *b)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
-       struct bset_tree *t;
+       const struct bset_tree *t;
 
        for_each_bset(b, t) {
                if (t->aux_data_offset == U16_MAX)
@@ -388,15 +388,13 @@ static void bset_aux_tree_verify(struct btree *b)
 #endif
 }
 
-void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+void bch2_btree_keys_init(struct btree *b)
 {
        unsigned i;
 
        b->nsets                = 0;
        memset(&b->nr, 0, sizeof(b->nr));
-#ifdef CONFIG_BCACHEFS_DEBUG
-       b->expensive_debug_checks = expensive_debug_checks;
-#endif
+
        for (i = 0; i < MAX_BSETS; i++)
                b->set[i].data_offset = U16_MAX;
 
@@ -522,7 +520,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
        struct bkey_packed *k = btree_bkey_first(b, t);
        unsigned j = 0;
 
-       if (!btree_keys_expensive_checks(b))
+       if (!bch2_expensive_debug_checks)
                return;
 
        BUG_ON(bset_has_ro_aux_tree(t));
@@ -710,20 +708,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
 }
 
 /* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
        bset_aux_tree_verify(b);
 
        return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
 }
 
-static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
        return __bset_tree_capacity(b, t) /
                (sizeof(struct bkey_float) + sizeof(u8));
 }
 
-static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
        return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
 }
@@ -922,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
                k = p;
        }
 
-       if (btree_keys_expensive_checks(b)) {
+       if (bch2_expensive_debug_checks) {
                BUG_ON(ret >= orig_k);
 
                for (i = ret
@@ -1227,8 +1225,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
 
 __flatten
 static struct bkey_packed *bset_search_tree(const struct btree *b,
-                               struct bset_tree *t,
-                               struct bpos *search,
+                               const struct bset_tree *t,
+                               const struct bpos *search,
                                const struct bkey_packed *packed_search)
 {
        struct ro_aux_tree *base = ro_aux_tree_base(b, t);
@@ -1345,7 +1343,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
                       bkey_iter_pos_cmp(b, m, search) < 0)
                        m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
 
-       if (btree_keys_expensive_checks(b)) {
+       if (bch2_expensive_debug_checks) {
                struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
 
                BUG_ON(prev &&
@@ -1601,7 +1599,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
                                  struct btree *b)
 {
-       if (btree_keys_expensive_checks(b)) {
+       if (bch2_expensive_debug_checks) {
                bch2_btree_node_iter_verify(iter, b);
                bch2_btree_node_iter_next_check(iter, b);
        }
@@ -1620,7 +1618,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
        struct bset_tree *t;
        unsigned end = 0;
 
-       if (btree_keys_expensive_checks(b))
+       if (bch2_expensive_debug_checks)
                bch2_btree_node_iter_verify(iter, b);
 
        for_each_bset(b, t) {
@@ -1656,7 +1654,7 @@ found:
        iter->data[0].k = __btree_node_key_to_offset(b, prev);
        iter->data[0].end = end;
 
-       if (btree_keys_expensive_checks(b))
+       if (bch2_expensive_debug_checks)
                bch2_btree_node_iter_verify(iter, b);
        return prev;
 }
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 5921cf68910578c94d6db0b07c69dcdcbe3fef8a..469294cc716c581697ef904b4a05348e2e87ff41 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 
-#include "bcachefs_format.h"
+#include "bcachefs.h"
 #include "bkey.h"
 #include "bkey_methods.h"
 #include "btree_types.h"
@@ ... @@
  * first key in that range of bytes again.
  */
 
-extern bool bch2_expensive_debug_checks;
-
-static inline bool btree_keys_expensive_checks(const struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       return bch2_expensive_debug_checks || *b->expensive_debug_checks;
-#else
-       return false;
-#endif
-}
-
 enum bset_aux_tree_type {
        BSET_NO_AUX_TREE,
        BSET_RO_AUX_TREE,
@@ -201,17 +190,17 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree
 
 #define BSET_CACHELINE         128
 
-static inline size_t btree_keys_cachelines(struct btree *b)
+static inline size_t btree_keys_cachelines(const struct btree *b)
 {
        return (1U << b->byte_order) / BSET_CACHELINE;
 }
 
-static inline size_t btree_aux_data_bytes(struct btree *b)
+static inline size_t btree_aux_data_bytes(const struct btree *b)
 {
        return btree_keys_cachelines(b) * 8;
 }
 
-static inline size_t btree_aux_data_u64s(struct btree *b)
+static inline size_t btree_aux_data_u64s(const struct btree *b)
 {
        return btree_aux_data_bytes(b) / sizeof(u64);
 }
@@ -228,7 +217,7 @@ __bkey_unpack_key_format_checked(const struct btree *b,
                compiled_unpack_fn unpack_fn = b->aux_data;
                unpack_fn(dst, src);
 
-               if (btree_keys_expensive_checks(b)) {
+               if (bch2_expensive_debug_checks) {
                        struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
 
                        BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
@@ -366,7 +355,7 @@ static inline struct bset *bset_next_set(struct btree *b,
        return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
 }
 
-void bch2_btree_keys_init(struct btree *, bool *);
+void bch2_btree_keys_init(struct btree *);
 
 void bch2_bset_init_first(struct btree *, struct bset *);
 void bch2_bset_init_next(struct bch_fs *, struct btree *,
@@ -477,7 +466,7 @@ static inline int bkey_iter_cmp(const struct btree *b,
                                const struct bkey_packed *l,
                                const struct bkey_packed *r)
 {
-       return bkey_cmp_packed(b, l, r)
+       return bch2_bkey_cmp_packed(b, l, r)
                ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
                ?: cmp_int(l, r);
 }
@@ -654,7 +643,7 @@ static inline void bch2_verify_insert_pos(struct btree *b,
 
 static inline void bch2_verify_btree_nr_keys(struct btree *b)
 {
-       if (btree_keys_expensive_checks(b))
+       if (bch2_debug_check_btree_accounting)
                __bch2_verify_btree_nr_keys(b);
 }
 
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index bb94fa2341eea839eca31128f1245b190d8244f8..325a16615a068ae149b22f880cebdc085a7add60 100644 (file)
@@ -211,7 +211,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
                 * - unless btree verify mode is enabled, since it runs out of
                 * the post write cleanup:
                 */
-               if (verify_btree_ondisk(c))
+               if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent);
                else
                        __bch2_btree_node_write(c, b, SIX_LOCK_read);
@@ -254,7 +254,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
        unsigned long freed = 0;
        unsigned i, flags;
 
-       if (btree_shrinker_disabled(c))
+       if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;
 
        /* Return -1 if we can't do anything right now */
@@ -341,7 +341,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;
 
-       if (btree_shrinker_disabled(c))
+       if (bch2_btree_shrinker_disabled)
                return 0;
 
        return btree_cache_can_free(bc) * btree_pages(c);
@@ -590,7 +590,7 @@ out:
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
-       bch2_btree_keys_init(b, &c->expensive_debug_checks);
+       bch2_btree_keys_init(b);
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);
@@ -705,7 +705,8 @@ static int lock_node_check_fn(struct six_lock *lock, void *p)
  */
 struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
                                  const struct bkey_i *k, unsigned level,
-                                 enum six_lock_type lock_type)
+                                 enum six_lock_type lock_type,
+                                 unsigned long trace_ip)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
@@ -767,7 +768,7 @@ lock_node:
                        btree_node_unlock(iter, level + 1);
 
                if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
-                                    lock_node_check_fn, (void *) k)) {
+                                    lock_node_check_fn, (void *) k, trace_ip)) {
                        if (b->hash_val != btree_ptr_hash_val(k))
                                goto retry;
                        return ERR_PTR(-EINTR);
@@ -935,7 +936,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
        bch2_bkey_unpack(parent, &tmp.k, k);
 
        ret = bch2_btree_node_get(c, iter, &tmp.k, level,
-                                 SIX_LOCK_intent);
+                                 SIX_LOCK_intent, _THIS_IP_);
 
        if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
                struct btree_iter *linked;
@@ -948,14 +949,14 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
                 * holding other locks that would cause us to deadlock:
                 */
                trans_for_each_iter(trans, linked)
-                       if (btree_iter_cmp(iter, linked) < 0)
+                       if (btree_iter_lock_cmp(iter, linked) < 0)
                                __bch2_btree_iter_unlock(linked);
 
                if (sib == btree_prev_sib)
                        btree_node_unlock(iter, level);
 
                ret = bch2_btree_node_get(c, iter, &tmp.k, level,
-                                         SIX_LOCK_intent);
+                                         SIX_LOCK_intent, _THIS_IP_);
 
                /*
                 * before btree_iter_relock() calls btree_iter_verify_locks():
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index d0d3a85bb8be6354c5cb6218e1e3a69116586aa4..8a19e60e9258014816b6d5a8c4ac54e00587a7f2 100644 (file)
@@ -23,7 +23,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
 
 struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
                                  const struct bkey_i *, unsigned,
-                                 enum six_lock_type);
+                                 enum six_lock_type, unsigned long);
 
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
                                         enum btree_id, unsigned);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index e8c1e752a25d63ec7fc32c5982d3ad3e32bc1f9c..ba4acc112ed34ef22e6ad7526381966026b277fa 100644 (file)
@@ -101,7 +101,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
        int ret = 0;
 
        if (initial) {
-               BUG_ON(journal_seq_verify(c) &&
+               BUG_ON(bch2_journal_seq_verify &&
                       k.k->version.lo > journal_cur_seq(&c->journal));
 
                /* XXX change to fsck check */
@@ -209,7 +209,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        struct btree_iter *iter;
        struct btree *b;
        unsigned depth = metadata_only                  ? 1
-               : expensive_debug_checks(c)             ? 0
+               : bch2_expensive_debug_checks           ? 0
                : !btree_node_type_needs_gc(btree_id)   ? 1
                : 0;
        u8 max_stale = 0;
@@ -236,8 +236,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
                                                BTREE_INSERT_USE_RESERVE|
                                                BTREE_INSERT_NOWAIT|
                                                BTREE_INSERT_GC_LOCK_HELD);
-                       else if (!btree_gc_rewrite_disabled(c) &&
-                                (btree_gc_always_rewrite(c) || max_stale > 16))
+                       else if (!bch2_btree_gc_rewrite_disabled &&
+                                (bch2_btree_gc_always_rewrite || max_stale > 16))
                                bch2_btree_node_rewrite(c, iter,
                                                b->data->keys.seq,
                                                BTREE_INSERT_NOWAIT|
@@ -328,7 +328,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
 {
        struct btree *b;
        unsigned target_depth = metadata_only           ? 1
-               : expensive_debug_checks(c)             ? 0
+               : bch2_expensive_debug_checks           ? 0
                : !btree_node_type_needs_gc(btree_id)   ? 1
                : 0;
        u8 max_stale = 0;
@@ -835,7 +835,7 @@ again:
 out:
        if (!ret &&
            (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
-            (!iter && test_restart_gc(c)))) {
+            (!iter && bch2_test_restart_gc))) {
                /*
                 * XXX: make sure gens we fixed got saved
                 */
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 682f599cbef588d6a912cceaf7627b73a12d48d1..10a00085cdd6f951e628a9d0052f80cb1c968349 100644 (file)
@@ -42,7 +42,7 @@ static void verify_no_dups(struct btree *b,
                BUG_ON(extents
                       ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
                       : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
-               //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
+               //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0);
        }
 #endif
 }
@@ -102,14 +102,14 @@ static void sort_bkey_ptrs(const struct btree *bt,
                        break;
 
                for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
-                       b = bkey_cmp_packed(bt,
+                       b = bch2_bkey_cmp_packed(bt,
                                            ptrs[c],
                                            ptrs[d]) >= 0 ? c : d;
                if (d == n)
                        b = c;
 
                while (b != a &&
-                      bkey_cmp_packed(bt,
+                      bch2_bkey_cmp_packed(bt,
                                       ptrs[a],
                                       ptrs[b]) >= 0)
                        b = (b - 1) / 2;
@@ -1044,7 +1044,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
 
                if (invalid ||
-                   (inject_invalid_keys(c) &&
+                   (bch2_inject_invalid_keys &&
                     !bversion_cmp(u.k->version, MAX_VERSION))) {
                        char buf[160];
 
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 6fab76c3220c55f64687d93299ac942d941cb84d..58f1a3dd97d30591f9cc936f39c31ee386edb8eb 100644 (file)
@@ -197,13 +197,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                            unsigned level, struct btree_iter *iter,
                            enum six_lock_type type,
-                           six_lock_should_sleep_fn should_sleep_fn,
-                           void *p)
+                           six_lock_should_sleep_fn should_sleep_fn, void *p,
+                           unsigned long ip)
 {
        struct btree_trans *trans = iter->trans;
-       struct btree_iter *linked;
+       struct btree_iter *linked, *deadlock_iter = NULL;
        u64 start_time = local_clock();
-       bool ret = true;
+       unsigned reason = 9;
 
        /* Check if it's safe to block: */
        trans_for_each_iter(trans, linked) {
@@ -228,11 +228,34 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                                linked->locks_want = max_t(unsigned,
                                                linked->locks_want,
                                                __fls(linked->nodes_locked) + 1);
-                               if (!btree_iter_get_locks(linked, true, false))
-                                       ret = false;
+                               if (!btree_iter_get_locks(linked, true, false)) {
+                                       deadlock_iter = linked;
+                                       reason = 1;
+                               }
                        } else {
-                               ret = false;
+                               deadlock_iter = linked;
+                               reason = 2;
+                       }
+               }
+
+               if (linked->btree_id != iter->btree_id) {
+                       if (linked->btree_id > iter->btree_id) {
+                               deadlock_iter = linked;
+                               reason = 3;
+                       }
+                       continue;
+               }
+
+               /*
+                * Within the same btree, cached iterators come before non
+                * cached iterators:
+                */
+               if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
+                       if (btree_iter_is_cached(iter)) {
+                               deadlock_iter = linked;
+                               reason = 4;
                        }
+                       continue;
                }
 
                /*
@@ -240,30 +263,29 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 * another iterator has possible descendants locked of the node
                 * we're about to lock, it must have the ancestors locked too:
                 */
-               if (linked->btree_id == iter->btree_id &&
-                   level > __fls(linked->nodes_locked)) {
+               if (level > __fls(linked->nodes_locked)) {
                        if (!(trans->nounlock)) {
                                linked->locks_want =
                                        max(level + 1, max_t(unsigned,
                                            linked->locks_want,
                                            iter->locks_want));
-                               if (!btree_iter_get_locks(linked, true, false))
-                                       ret = false;
+                               if (!btree_iter_get_locks(linked, true, false)) {
+                                       deadlock_iter = linked;
+                                       reason = 5;
+                               }
                        } else {
-                               ret = false;
+                               deadlock_iter = linked;
+                               reason = 6;
                        }
                }
 
                /* Must lock btree nodes in key order: */
-               if ((cmp_int(iter->btree_id, linked->btree_id) ?:
-                    -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0)
-                       ret = false;
-
-               if (iter->btree_id == linked->btree_id &&
-                   btree_node_locked(linked, level) &&
+               if (btree_node_locked(linked, level) &&
                    bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
-                                                btree_iter_type(linked))) <= 0)
-                       ret = false;
+                                                btree_iter_type(linked))) <= 0) {
+                       deadlock_iter = linked;
+                       reason = 7;
+               }
 
                /*
                 * Recheck if this is a node we already have locked - since one
@@ -277,8 +299,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                }
        }
 
-       if (unlikely(!ret)) {
-               trace_trans_restart_would_deadlock(iter->trans->ip);
+       if (unlikely(deadlock_iter)) {
+               trace_trans_restart_would_deadlock(iter->trans->ip, ip,
+                               reason,
+                               deadlock_iter->btree_id,
+                               btree_iter_type(deadlock_iter),
+                               iter->btree_id,
+                               btree_iter_type(iter));
                return false;
        }
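
Where the old tracepoint recorded only the transaction ip, the reworked
trans_restart_would_deadlock event (see the TRACE_EVENT hunk near the top)
also captures which iterator pair conflicted and why. Reading the
assignments off this function, the reason codes decode as:

	/* reason codes, per the assignments in this diff:
	 * 1 - relocking a linked iterator that holds this node failed
	 *     after raising its locks_want
	 * 2 - same situation, but trans->nounlock forbids relocking
	 * 3 - a linked iterator is on a later btree_id (lock-order
	 *     violation across btrees)
	 * 4 - wanting a cached iterator's lock while a non-cached one
	 *     on the same btree is already held
	 * 5 - taking ancestor locks on a linked iterator failed
	 * 6 - same situation, but trans->nounlock forbids relocking
	 * 7 - the node lock would violate key order within the level
	 * 9 - initializer, never reported
	 */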
 
@@ -471,7 +498,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
        char buf1[100], buf2[100];
        const char *msg;
 
-       if (!debug_check_iterators(iter->trans->c))
+       if (!bch2_debug_check_iterators)
                return;
 
        if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
@@ -567,7 +594,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
 {
        struct btree_iter *iter;
 
-       if (!debug_check_iterators(trans->c))
+       if (!bch2_debug_check_iterators)
                return;
 
        trans_for_each_iter_with_node(trans, b, iter)
@@ -739,7 +766,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
                __bch2_btree_node_iter_fix(iter, b, node_iter, t,
                                           where, clobber_u64s, new_u64s);
 
-               if (debug_check_iterators(iter->trans->c))
+               if (bch2_debug_check_iterators)
                        bch2_btree_node_iter_verify(node_iter, b);
        }
 
@@ -769,7 +796,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
 
        ret = bkey_disassemble(l->b, k, u);
 
-       if (debug_check_bkeys(iter->trans->c))
+       if (bch2_debug_check_bkeys)
                bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
 
        return ret;
@@ -945,7 +972,8 @@ static int lock_root_check_fn(struct six_lock *lock, void *p)
 }
 
 static inline int btree_iter_lock_root(struct btree_iter *iter,
-                                      unsigned depth_want)
+                                      unsigned depth_want,
+                                      unsigned long trace_ip)
 {
        struct bch_fs *c = iter->trans->c;
        struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
@@ -974,7 +1002,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
                lock_type = __btree_lock_want(iter, iter->level);
                if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
                                              iter, lock_type,
-                                             lock_root_check_fn, rootp)))
+                                             lock_root_check_fn, rootp,
+                                             trace_ip)))
                        return -EINTR;
 
                if (likely(b == READ_ONCE(*rootp) &&
@@ -1046,7 +1075,8 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
                btree_node_unlock(iter, plevel);
 }
 
-static __always_inline int btree_iter_down(struct btree_iter *iter)
+static __always_inline int btree_iter_down(struct btree_iter *iter,
+                                          unsigned long trace_ip)
 {
        struct bch_fs *c = iter->trans->c;
        struct btree_iter_level *l = &iter->l[iter->level];
@@ -1060,7 +1090,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter)
        bch2_bkey_unpack(l->b, &tmp.k,
                         bch2_btree_node_iter_peek(&l->iter, l->b));
 
-       b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
+       b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip);
        if (unlikely(IS_ERR(b)))
                return PTR_ERR(b);
 
@@ -1084,7 +1114,7 @@ static void btree_iter_up(struct btree_iter *iter)
        btree_node_unlock(iter, iter->level++);
 }
 
-static int btree_iter_traverse_one(struct btree_iter *);
+static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
 
 static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
 {
@@ -1104,11 +1134,12 @@ retry_all:
                sorted[nr_sorted++] = iter->idx;
 
 #define btree_iter_cmp_by_idx(_l, _r)                          \
-               btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
+               btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
 
        bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
 #undef btree_iter_cmp_by_idx
        bch2_trans_unlock(trans);
+       cond_resched();
 
        if (unlikely(ret == -ENOMEM)) {
                struct closure cl;
@@ -1139,7 +1170,7 @@ retry_all:
                if (!(trans->iters_linked & (1ULL << idx)))
                        continue;
 
-               ret = btree_iter_traverse_one(&trans->iters[idx]);
+               ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
                if (ret)
                        goto retry_all;
        }
@@ -1202,7 +1233,8 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
  * stashed in the iterator and returned from bch2_trans_exit().
  */
-static int btree_iter_traverse_one(struct btree_iter *iter)
+static int btree_iter_traverse_one(struct btree_iter *iter,
+                                  unsigned long trace_ip)
 {
        unsigned depth_want = iter->level;
 
@@ -1249,8 +1281,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
         */
        while (iter->level > depth_want) {
                int ret = btree_iter_node(iter, iter->level)
-                       ? btree_iter_down(iter)
-                       : btree_iter_lock_root(iter, depth_want);
+                       ? btree_iter_down(iter, trace_ip)
+                       : btree_iter_lock_root(iter, depth_want, trace_ip);
                if (unlikely(ret)) {
                        if (ret == 1)
                                return 0;
@@ -1281,7 +1313,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
        int ret;
 
        ret =   bch2_trans_cond_resched(trans) ?:
-               btree_iter_traverse_one(iter);
+               btree_iter_traverse_one(iter, _RET_IP_);
        if (unlikely(ret))
                ret = __btree_iter_traverse_all(trans, ret);
 
@@ -1545,13 +1577,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
 
                ret.v = bkeyp_val(&l->b->format, _k);
 
-               if (debug_check_iterators(iter->trans->c)) {
+               if (bch2_debug_check_iterators) {
                        struct bkey k = bkey_unpack_key(l->b, _k);
 
                        BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
                }
 
-               if (debug_check_bkeys(iter->trans->c))
+               if (bch2_debug_check_bkeys)
                        bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
        }
 
@@ -1970,6 +2002,7 @@ int bch2_trans_iter_free(struct btree_trans *trans,
        return bch2_trans_iter_put(trans, iter);
 }
 
+#if 0
 static int bch2_trans_realloc_iters(struct btree_trans *trans,
                                    unsigned new_size)
 {
@@ -2018,8 +2051,7 @@ success:
                       sizeof(struct btree_iter) * trans->nr_iters +
                       sizeof(struct btree_insert_entry) * trans->nr_iters);
 
-       if (trans->iters != trans->iters_onstack)
-               kfree(trans->iters);
+       kfree(trans->iters);
 
        trans->iters            = new_iters;
        trans->updates          = new_updates;
@@ -2033,6 +2065,7 @@ success:
 
        return 0;
 }
+#endif
 
 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
 {
@@ -2042,28 +2075,27 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
                goto got_slot;
 
        if (trans->nr_iters == trans->size) {
-               int ret;
-
-               if (trans->nr_iters >= BTREE_ITER_MAX) {
-                       struct btree_iter *iter;
-
-                       trans_for_each_iter(trans, iter) {
-                               pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
-                                      bch2_btree_ids[iter->btree_id],
-                                      iter->pos.inode,
-                                      iter->pos.offset,
-                                      (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
-                                      (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
-                                      iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
-                                      (void *) iter->ip_allocated);
-                       }
+               struct btree_iter *iter;
 
-                       panic("trans iter oveflow\n");
+               BUG_ON(trans->size < BTREE_ITER_MAX);
+
+               trans_for_each_iter(trans, iter) {
+                       pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
+                              bch2_btree_ids[iter->btree_id],
+                              iter->pos.inode,
+                              iter->pos.offset,
+                              (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
+                              (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+                              iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+                              (void *) iter->ip_allocated);
                }
 
+               panic("trans iter oveflow\n");
+#if 0
                ret = bch2_trans_realloc_iters(trans, trans->size * 2);
                if (ret)
                        return ERR_PTR(ret);
+#endif
        }
 
        idx = trans->nr_iters++;
@@ -2305,28 +2337,37 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
                bch2_btree_iter_traverse_all(trans);
 }
 
+static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+{
+       unsigned new_size = BTREE_ITER_MAX;
+       size_t iters_bytes      = sizeof(struct btree_iter) * new_size;
+       size_t updates_bytes    = sizeof(struct btree_insert_entry) * new_size;
+       void *p;
+
+       BUG_ON(trans->used_mempool);
+
+       p =     this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
+               mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+
+       trans->iters            = p; p += iters_bytes;
+       trans->updates          = p; p += updates_bytes;
+       trans->updates2         = p; p += updates_bytes;
+       trans->size             = new_size;
+}
+
 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
                     unsigned expected_nr_iters,
                     size_t expected_mem_bytes)
 {
-       memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
+       memset(trans, 0, sizeof(*trans));
+       trans->c                = c;
+       trans->ip               = _RET_IP_;
 
        /*
         * reallocating iterators currently completely breaks
-        * bch2_trans_iter_put():
+        * bch2_trans_iter_put(), we always allocate the max:
         */
-       expected_nr_iters = BTREE_ITER_MAX;
-
-       trans->c                = c;
-       trans->ip               = _RET_IP_;
-       trans->size             = ARRAY_SIZE(trans->iters_onstack);
-       trans->iters            = trans->iters_onstack;
-       trans->updates          = trans->updates_onstack;
-       trans->updates2         = trans->updates2_onstack;
-       trans->fs_usage_deltas  = NULL;
-
-       if (expected_nr_iters > trans->size)
-               bch2_trans_realloc_iters(trans, expected_nr_iters);
+       bch2_trans_alloc_iters(trans, c);
 
        if (expected_mem_bytes)
                bch2_trans_preload_mem(trans, expected_mem_bytes);
@@ -2341,6 +2382,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
 
 int bch2_trans_exit(struct btree_trans *trans)
 {
+       struct bch_fs *c = trans->c;
+
        bch2_trans_unlock(trans);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -2353,19 +2396,21 @@ int bch2_trans_exit(struct btree_trans *trans)
 
        kfree(trans->fs_usage_deltas);
        kfree(trans->mem);
-       if (trans->used_mempool)
+
+       trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+       if (trans->iters)
                mempool_free(trans->iters, &trans->c->btree_iters_pool);
-       else if (trans->iters != trans->iters_onstack)
-               kfree(trans->iters);
+
        trans->mem      = (void *) 0x1;
        trans->iters    = (void *) 0x1;
 
        return trans->error ? -EIO : 0;
 }
 
-static void bch2_btree_iter_node_to_text(struct printbuf *out,
-                                struct btree_bkey_cached_common *_b,
-                                enum btree_iter_type type)
+static void __maybe_unused
+bch2_btree_iter_node_to_text(struct printbuf *out,
+                            struct btree_bkey_cached_common *_b,
+                            enum btree_iter_type type)
 {
        pr_buf(out, "    %px l=%u %s:",
               _b, _b->level, bch2_btree_ids[_b->btree_id]);
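
With the on-stack iterator arrays gone (see the btree_types.h hunk below),
every transaction allocates the full BTREE_ITER_MAX-sized buffer, so
bch2_trans_init()/bch2_trans_exit() now recycle it through a one-slot
per-CPU cache and touch the mempool only when the slot is empty (alloc) or
already occupied (free). The pattern in isolation, as a sketch with
invented wrapper names:

	static struct btree_iter *trans_iters_get(struct bch_fs *c)
	{
		/* take the cached buffer, or fall back to the mempool */
		return this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
			mempool_alloc(&c->btree_iters_pool, GFP_NOFS);
	}

	static void trans_iters_put(struct bch_fs *c, struct btree_iter *iters)
	{
		/* park our buffer; free whatever was parked before */
		iters = this_cpu_xchg(c->btree_iters_bufs->iter, iters);
		if (iters)
			mempool_free(iters, &c->btree_iters_pool);
	}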
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index bd9ec3ec9a92a2809128f6ae799dfeae44bc9ca9..f7a73619c85b2ff00680e10c95a952bcfebb7c7c 100644 (file)
@@ -177,11 +177,12 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
 void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
 void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
 
-static inline int btree_iter_cmp(const struct btree_iter *l,
-                                const struct btree_iter *r)
+/* Sort order for locking btree iterators: */
+static inline int btree_iter_lock_cmp(const struct btree_iter *l,
+                                     const struct btree_iter *r)
 {
        return   cmp_int(l->btree_id, r->btree_id) ?:
-               -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?:
+               -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
                 bkey_cmp(l->pos, r->pos);
 }
 
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 61662750dfc046583d2d4c6092854e6a2c948335..0ee4f78ce67a1b5ea4acd767973ef68b2db4e098 100644 (file)
@@ -29,8 +29,8 @@ static const struct rhashtable_params bch2_btree_key_cache_params = {
 };
 
 __flatten
-static inline struct bkey_cached *
-btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
 {
        struct bkey_cached_key key = {
                .btree_id       = btree_id,
@@ -204,6 +204,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p)
                !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
 }
 
+__flatten
 int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
 {
        struct btree_trans *trans = iter->trans;
@@ -218,7 +219,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
                goto fill;
        }
 retry:
-       ck = btree_key_cache_find(c, iter->btree_id, iter->pos);
+       ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
        if (!ck) {
                if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
                        iter->l[0].b = NULL;
@@ -242,7 +243,7 @@ retry:
                enum six_lock_type lock_want = __btree_lock_want(iter, 0);
 
                if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
-                                    bkey_cached_check_fn, iter)) {
+                                    bkey_cached_check_fn, iter, _THIS_IP_)) {
                        if (ck->key.btree_id != iter->btree_id ||
                            bkey_cmp(ck->key.pos, iter->pos)) {
                                goto retry;
@@ -415,7 +416,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
        struct bkey_cached_key key = { id, pos };
 
        /* Fastpath - assume it won't be found: */
-       if (!btree_key_cache_find(c, id, pos))
+       if (!bch2_btree_key_cache_find(c, id, pos))
                return 0;
 
        return btree_key_cache_flush_pos(trans, key, 0, true);
@@ -462,7 +463,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
                               enum btree_id id, struct bpos pos)
 {
-       BUG_ON(btree_key_cache_find(trans->c, id, pos));
+       BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
 }
 #endif
 
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index b1756c6c622cec53fbad64f08b2600982ec38167..d448264abcc89db5382ff6fe99f539c28d6a4326 100644 (file)
@@ -1,6 +1,9 @@
 #ifndef _BCACHEFS_BTREE_KEY_CACHE_H
 #define _BCACHEFS_BTREE_KEY_CACHE_H
 
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
 int bch2_btree_iter_traverse_cached(struct btree_iter *);
 
 bool bch2_btree_insert_key_cached(struct btree_trans *,
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 81fbf3e186473da3ef818ffcfb2d07df2141e5fb..38323e32731fcc5833ef4ca099dede86bffc576c 100644 (file)
@@ -176,13 +176,15 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
 
 bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
                            struct btree_iter *, enum six_lock_type,
-                           six_lock_should_sleep_fn, void *);
+                           six_lock_should_sleep_fn, void *,
+                           unsigned long);
 
 static inline bool btree_node_lock(struct btree *b,
                        struct bpos pos, unsigned level,
                        struct btree_iter *iter,
                        enum six_lock_type type,
-                       six_lock_should_sleep_fn should_sleep_fn, void *p)
+                       six_lock_should_sleep_fn should_sleep_fn, void *p,
+                       unsigned long ip)
 {
        struct btree_trans *trans = iter->trans;
        bool ret;
@@ -200,7 +202,7 @@ static inline bool btree_node_lock(struct btree *b,
        ret   = likely(six_trylock_type(&b->c.lock, type)) ||
                btree_node_lock_increment(trans, b, level, type) ||
                __bch2_btree_node_lock(b, pos, level, iter, type,
-                                      should_sleep_fn, p);
+                                      should_sleep_fn, p, ip);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
        trans->locking = NULL;
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index cc01baeec138daaf5e06c3e030e095e4e20b7443..93721fbc77949f858431679f9eb0dd2d9150b684 100644 (file)
@@ -130,10 +130,6 @@ struct btree {
 
        struct btree_write      writes[2];
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       bool                    *expensive_debug_checks;
-#endif
-
        /* Key/pointer for this btree node */
        __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 };
@@ -283,6 +279,11 @@ btree_iter_type(const struct btree_iter *iter)
        return iter->flags & BTREE_ITER_TYPE;
 }
 
+static inline bool btree_iter_is_cached(const struct btree_iter *iter)
+{
+       return btree_iter_type(iter) == BTREE_ITER_CACHED;
+}
+
 static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
 {
        return iter->l + iter->level;
@@ -380,10 +381,6 @@ struct btree_trans {
        unsigned                journal_u64s;
        unsigned                journal_preres_u64s;
        struct replicas_delta_list *fs_usage_deltas;
-
-       struct btree_iter       iters_onstack[2];
-       struct btree_insert_entry updates_onstack[2];
-       struct btree_insert_entry updates2_onstack[2];
 };
 
 #define BTREE_FLAG(flag)                                               \
index a2604b0ce2d83eb91bba44b78f3350703e112ea0..4ddd1697ffdec6fe000fa82a8097a1f06c0cad4b 100644 (file)
@@ -1313,7 +1313,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
         * the node the iterator points to:
         */
        while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
-              (bkey_cmp_packed(b, k, &insert->k) >= 0))
+              (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
                ;
 
        for_each_keylist_key(keys, insert)
index 49995cd00c16c26c1f4f77c9d9b5eb4da83d3634..e386f8ed39222071592432400d325f7db2e57aa3 100644 (file)
@@ -72,7 +72,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
        EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
 
        k = bch2_btree_node_iter_peek_all(node_iter, b);
-       if (k && bkey_cmp_packed(b, k, &insert->k))
+       if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
                k = NULL;
 
        /* @k is the key being overwritten/deleted, if any: */
@@ -220,7 +220,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
 
        BUG_ON(bkey_cmp(insert->k.p, iter->pos));
-       BUG_ON(debug_check_bkeys(c) &&
+       BUG_ON(bch2_debug_check_bkeys &&
               bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
                                 __btree_node_type(iter->level, iter->btree_id)));
 }
@@ -440,10 +440,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
         */
 
        if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
-               if (journal_seq_verify(c))
+               if (bch2_journal_seq_verify)
                        trans_for_each_update2(trans, i)
                                i->k->k.version.lo = trans->journal_res.seq;
-               else if (inject_invalid_keys(c))
+               else if (bch2_inject_invalid_keys)
                        trans_for_each_update2(trans, i)
                                i->k->k.version = MAX_VERSION;
        }
@@ -680,6 +680,13 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
        return 0;
 }
 
+static inline int btree_iter_pos_cmp(const struct btree_iter *l,
+                                    const struct btree_iter *r)
+{
+       return   cmp_int(l->btree_id, r->btree_id) ?:
+                bkey_cmp(l->pos, r->pos);
+}
+
 static void bch2_trans_update2(struct btree_trans *trans,
                               struct btree_iter *iter,
                               struct bkey_i *insert)
@@ -697,12 +704,12 @@ static void bch2_trans_update2(struct btree_trans *trans,
        iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
 
        trans_for_each_update2(trans, i) {
-               if (btree_iter_cmp(n.iter, i->iter) == 0) {
+               if (btree_iter_pos_cmp(n.iter, i->iter) == 0) {
                        *i = n;
                        return;
                }
 
-               if (btree_iter_cmp(n.iter, i->iter) <= 0)
+               if (btree_iter_pos_cmp(n.iter, i->iter) <= 0)
                        break;
        }
 
@@ -986,7 +993,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
         * Pending updates are kept sorted: first, find position of new update:
         */
        trans_for_each_update(trans, i)
-               if (btree_iter_cmp(iter, i->iter) <= 0)
+               if (btree_iter_pos_cmp(iter, i->iter) <= 0)
                        break;
 
        /*
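
`btree_iter_pos_cmp()` above keeps the pending-update lists sorted by (btree_id, pos) alone, presumably because `btree_iter_cmp()` now also distinguishes cached from non-cached iterators (see the new `btree_iter_is_cached()` helper earlier in this diff), while the update lists only care about which key is touched. It chains `cmp_int()` with the GNU `a ?: b` extension, which returns `a` whenever it is nonzero, so later comparisons run only on ties. A self-contained sketch of the idiom (hypothetical type, not from this diff):

#define cmp_int(l, r)	((l > r) - (l < r))	/* as in linux/kernel.h */

struct key { u32 major; u64 minor; };

static int key_cmp(const struct key *l, const struct key *r)
{
	/* major decides; minor only breaks ties: */
	return cmp_int(l->major, r->major) ?:
	       cmp_int(l->minor, r->minor);
}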
index b50d2b0d5fd33f3f37dd5c874f12e95709cf5749..aebf46bb1d21e5fd269e94f3193c73a23f56cc2a 100644 (file)
@@ -70,7 +70,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
 
        BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
 
-       if (!IS_ENABLED(CONFIG_HIGHMEM) &&
+       if (!PageHighMem(bio_iter_page(bio, start)) &&
            bio_phys_contig(bio, start))
                return (struct bbuf) {
                        .b = page_address(bio_iter_page(bio, start)) +
index aa10591a3b1a8f3b84322080b989f00547282dc3..bbe3fefa2651079ec44f9b3b6c42ef0d0da7d026 100644 (file)
@@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        v->written      = 0;
        v->c.level      = b->c.level;
        v->c.btree_id   = b->c.btree_id;
-       bch2_btree_keys_init(v, &c->expensive_debug_checks);
+       bch2_btree_keys_init(v);
 
        if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
                                       NULL, &pick) <= 0)
index 56c2d1ab5f630de8ce1b2484f934bc90dceb12eb..7ac1615e9447db326533ac6db5c1b129f02481a2 100644 (file)
@@ -8,44 +8,15 @@ struct bio;
 struct btree;
 struct bch_fs;
 
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_DEBUG_PARAM(name, description)                             \
-       static inline bool name(struct bch_fs *c)                       \
-       { return bch2_##name || c->name;        }
-BCH_DEBUG_PARAMS_ALWAYS()
-#undef BCH_DEBUG_PARAM
-
 #ifdef CONFIG_BCACHEFS_DEBUG
-
-#define BCH_DEBUG_PARAM(name, description)                             \
-       static inline bool name(struct bch_fs *c)                       \
-       { return bch2_##name || c->name;        }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
 void __bch2_btree_verify(struct bch_fs *, struct btree *);
-
-#define bypass_torture_test(d)         ((d)->bypass_torture_test)
-
-#else /* DEBUG */
-
-#define BCH_DEBUG_PARAM(name, description)                             \
-       static inline bool name(struct bch_fs *c) { return false; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
+#else
 static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
-
-#define bypass_torture_test(d)         0
-
 #endif
 
 static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
 {
-       if (verify_btree_ondisk(c))
+       if (bch2_verify_btree_ondisk)
                __bch2_btree_verify(c, b);
 }
 
index e4a4805ef218d4973d15933c34a30c8a1e3c45af..d7ba0e7fc3b3825e60416cae62acdd708125e96c 100644 (file)
@@ -1586,7 +1586,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
        size_t i;
 
        spin_lock(&c->ec_stripes_heap_lock);
-       for (i = 0; i < min(h->used, 20UL); i++) {
+       for (i = 0; i < min_t(size_t, h->used, 20); i++) {
                m = genradix_ptr(&c->stripes[0], h->data[i].idx);
 
                pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
index 88297b30f6221850ac15b46197c7de908520477a..7fae6a4ba26f0e1e0209f2d86c8cf74cc1537121 100644 (file)
@@ -89,7 +89,7 @@ static inline bool ptr_better(struct bch_fs *c,
                return bch2_rand_range(l1 + l2) > l1;
        }
 
-       if (force_reconstruct_read(c))
+       if (bch2_force_reconstruct_read)
                return p1.idx > p2.idx;
 
        return p1.idx < p2.idx;
@@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
                    !bch2_dev_is_readable(ca))
                        p.idx++;
 
-               if (force_reconstruct_read(c) &&
+               if (bch2_force_reconstruct_read &&
                    !p.idx && p.has_ec)
                        p.idx++;
 
index 878419d409927c7d33902993d0f37a4b5357e362..503ce1920f395f935a7f3ba57f2cad2afee70bc2 100644 (file)
@@ -34,9 +34,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
        if (!name)
                new_inode->bi_flags |= BCH_INODE_UNLINKED;
 
-       ret = bch2_inode_create(trans, new_inode,
-                               BLOCKDEV_INODE_MAX, 0,
-                               &c->unused_inode_hint);
+       ret = bch2_inode_create(trans, new_inode);
        if (ret)
                goto err;
 
index 3aed2ca4dcedbef12bb94da817886cf17443605d..1eb69ed38b10bd2cbd56695f2f106bc40d91d976 100644 (file)
@@ -265,28 +265,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page)
 /* for newly allocated pages: */
 static void __bch2_page_state_release(struct page *page)
 {
-       struct bch_page_state *s = __bch2_page_state(page);
-
-       if (!s)
-               return;
-
-       ClearPagePrivate(page);
-       set_page_private(page, 0);
-       put_page(page);
-       kfree(s);
+       kfree(detach_page_private(page));
 }
 
 static void bch2_page_state_release(struct page *page)
 {
-       struct bch_page_state *s = bch2_page_state(page);
-
-       if (!s)
-               return;
-
-       ClearPagePrivate(page);
-       set_page_private(page, 0);
-       put_page(page);
-       kfree(s);
+       EBUG_ON(!PageLocked(page));
+       __bch2_page_state_release(page);
 }
 
 /* for newly allocated pages: */
@@ -300,13 +285,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page,
                return NULL;
 
        spin_lock_init(&s->lock);
-       /*
-        * migrate_page_move_mapping() assumes that pages with private data
-        * have their count elevated by 1.
-        */
-       get_page(page);
-       set_page_private(page, (unsigned long) s);
-       SetPagePrivate(page);
+       attach_page_private(page, s);
        return s;
 }
 
@@ -608,14 +587,8 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
        if (ret != MIGRATEPAGE_SUCCESS)
                return ret;
 
-       if (PagePrivate(page)) {
-               ClearPagePrivate(page);
-               get_page(newpage);
-               set_page_private(newpage, page_private(page));
-               set_page_private(page, 0);
-               put_page(page);
-               SetPagePrivate(newpage);
-       }
+       if (PagePrivate(page))
+               attach_page_private(newpage, detach_page_private(page));
 
        if (mode != MIGRATE_SYNC_NO_COPY)
                migrate_page_copy(newpage, page);
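
Both deletions above collapse the same open-coded sequence into `attach_page_private()`/`detach_page_private()`, the helpers added to linux/pagemap.h around v5.8. Roughly (a from-memory sketch of the upstream helpers, not part of this diff), they preserve exactly what the removed lines did, including the extra page reference that `migrate_page_move_mapping()` expects pages with private data to hold:

static inline void attach_page_private(struct page *page, void *data)
{
	get_page(page);			/* private data => elevated refcount */
	set_page_private(page, (unsigned long) data);
	SetPagePrivate(page);
}

static inline void *detach_page_private(struct page *page)
{
	void *data = (void *) page_private(page);

	if (!PagePrivate(page))
		return NULL;		/* and kfree(NULL) is a no-op above */
	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);

	return data;
}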
@@ -647,41 +620,33 @@ static void bch2_readpages_end_io(struct bio *bio)
        bio_put(bio);
 }
 
-static inline void page_state_init_for_read(struct page *page)
-{
-       SetPagePrivate(page);
-       page->private = 0;
-}
-
 struct readpages_iter {
        struct address_space    *mapping;
        struct page             **pages;
        unsigned                nr_pages;
-       unsigned                nr_added;
        unsigned                idx;
        pgoff_t                 offset;
 };
 
 static int readpages_iter_init(struct readpages_iter *iter,
-                              struct address_space *mapping,
-                              struct list_head *pages, unsigned nr_pages)
+                              struct readahead_control *ractl)
 {
+       unsigned i, nr_pages = readahead_count(ractl);
+
        memset(iter, 0, sizeof(*iter));
 
-       iter->mapping   = mapping;
-       iter->offset    = list_last_entry(pages, struct page, lru)->index;
+       iter->mapping   = ractl->mapping;
+       iter->offset    = readahead_index(ractl);
+       iter->nr_pages  = nr_pages;
 
        iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
        if (!iter->pages)
                return -ENOMEM;
 
-       while (!list_empty(pages)) {
-               struct page *page = list_last_entry(pages, struct page, lru);
-
-               __bch2_page_state_create(page, __GFP_NOFAIL);
-
-               iter->pages[iter->nr_pages++] = page;
-               list_del(&page->lru);
+       __readahead_batch(ractl, iter->pages, nr_pages);
+       for (i = 0; i < nr_pages; i++) {
+               __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+               put_page(iter->pages[i]);
        }
 
        return 0;
@@ -689,41 +654,9 @@ static int readpages_iter_init(struct readpages_iter *iter,
 
 static inline struct page *readpage_iter_next(struct readpages_iter *iter)
 {
-       struct page *page;
-       unsigned i;
-       int ret;
-
-       BUG_ON(iter->idx > iter->nr_added);
-       BUG_ON(iter->nr_added > iter->nr_pages);
-
-       if (iter->idx < iter->nr_added)
-               goto out;
-
-       while (1) {
-               if (iter->idx == iter->nr_pages)
-                       return NULL;
-
-               ret = add_to_page_cache_lru_vec(iter->mapping,
-                               iter->pages     + iter->nr_added,
-                               iter->nr_pages  - iter->nr_added,
-                               iter->offset    + iter->nr_added,
-                               GFP_NOFS);
-               if (ret > 0)
-                       break;
-
-               page = iter->pages[iter->nr_added];
-               iter->idx++;
-               iter->nr_added++;
-
-               __bch2_page_state_release(page);
-               put_page(page);
-       }
-
-       iter->nr_added += ret;
+       if (iter->idx >= iter->nr_pages)
+               return NULL;
 
-       for (i = iter->idx; i < iter->nr_added; i++)
-               put_page(iter->pages[i]);
-out:
        EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
 
        return iter->pages[iter->idx];
@@ -889,10 +822,9 @@ retry:
        bkey_on_stack_exit(&sk, c);
 }
 
-int bch2_readpages(struct file *file, struct address_space *mapping,
-                  struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
 {
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
        struct btree_trans trans;
@@ -901,7 +833,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
        struct readpages_iter readpages_iter;
        int ret;
 
-       ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+       ret = readpages_iter_init(&readpages_iter, ractl);
        BUG_ON(ret);
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -936,8 +868,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
 
        bch2_trans_exit(&trans);
        kfree(readpages_iter.pages);
-
-       return 0;
 }
 
 static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
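
With `struct readahead_control`, the core VM has already inserted the pages into the page cache, locked them, and taken references before `->readahead` is called; `readpages_iter_init()` above therefore drains the whole window with `__readahead_batch()` and immediately drops the batch's extra references, relying on the page-cache reference while the pages remain locked. For comparison, the one-page-at-a-time shape of the same API (a sketch, not bcachefs code):

static void example_readahead(struct readahead_control *ractl)
{
	struct page *page;

	/* each page comes back locked, in-cache, with an elevated refcount */
	while ((page = readahead_page(ractl))) {
		/* ... submit async read into @page, unlock on completion ... */
		put_page(page);
	}
}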
index 7063556d289b7446716d674457b2dd0c32d9387f..2537a3d25ede1dd585f284353c8e2d58d1d21a78 100644 (file)
@@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *);
 int bch2_readpage(struct file *, struct page *);
 
 int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
-                  struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
 
 int bch2_write_begin(struct file *, struct address_space *, loff_t,
                     unsigned, unsigned, struct page **, void **);
index 1d66acaca33cf9d35a9ebdff026cbe25819f198e..3ac57ba29e9f6285eb16dd47dd5ef7a3d1307e63 100644 (file)
@@ -42,6 +42,11 @@ static void journal_seq_copy(struct bch_fs *c,
                             struct bch_inode_info *dst,
                             u64 journal_seq)
 {
+       /*
+        * atomic64_cmpxchg has a fallback for archs that don't support it,
+        * cmpxchg does not:
+        */
+       atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
        u64 old, v = READ_ONCE(dst->ei_journal_seq);
 
        do {
@@ -49,7 +54,7 @@ static void journal_seq_copy(struct bch_fs *c,
 
                if (old >= journal_seq)
                        break;
-       } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+       } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
 
        bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
 }
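
The loop is the usual lock-free "advance to at least this value" pattern; the change is only in issuing it through `atomic64_cmpxchg()`, which falls back to a spinlock-protected implementation (lib/atomic64.c) on architectures without a native 64-bit cmpxchg, where a bare `cmpxchg()` on a u64 would not work. The pattern in isolation (a sketch):

/* Advance *seq to at least @new, tolerating concurrent updaters: */
static void seq_advance_to(atomic64_t *seq, u64 new)
{
	u64 old, v = atomic64_read(seq);

	do {
		old = v;
		if (old >= new)		/* someone already advanced further */
			break;
	} while ((v = atomic64_cmpxchg(seq, old, new)) != old);
}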
@@ -225,6 +230,13 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
        return &inode->v;
 }
 
+static int inum_test(struct inode *inode, void *p)
+{
+       unsigned long *ino = p;
+
+       return *ino == inode->i_ino;
+}
+
 static struct bch_inode_info *
 __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
              umode_t mode, dev_t rdev, bool tmpfile)
@@ -304,8 +316,12 @@ err_before_quota:
         * thread pulling the inode in and modifying it:
         */
 
-       old = to_bch_ei(insert_inode_locked2(&inode->v));
-       if (unlikely(old)) {
+       inode->v.i_state |= I_CREATING;
+       old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
+                                     inum_test, NULL, &inode->v.i_ino));
+       BUG_ON(!old);
+
+       if (unlikely(old != inode)) {
                /*
                 * We raced, another process pulled the new inode into cache
                 * before us:
@@ -807,7 +823,7 @@ static int bch2_fill_extent(struct bch_fs *c,
                            struct fiemap_extent_info *info,
                            struct bkey_s_c k, unsigned flags)
 {
-       if (bkey_extent_is_data(k.k)) {
+       if (bkey_extent_is_direct_data(k.k)) {
                struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
@@ -838,6 +854,12 @@ static int bch2_fill_extent(struct bch_fs *c,
                }
 
                return 0;
+       } else if (bkey_extent_is_inline_data(k.k)) {
+               return fiemap_fill_next_extent(info,
+                                              bkey_start_offset(k.k) << 9,
+                                              0, k.k->size << 9,
+                                              flags|
+                                              FIEMAP_EXTENT_DATA_INLINE);
        } else if (k.k->type == KEY_TYPE_reservation) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
@@ -891,9 +913,7 @@ retry:
                        bkey_start_offset(k.k);
                sectors                 = k.k->size - offset_into_extent;
 
-               bkey_on_stack_realloc(&cur, c, k.k->u64s);
-               bkey_on_stack_realloc(&prev, c, k.k->u64s);
-               bkey_reassemble(cur.k, k);
+               bkey_on_stack_reassemble(&cur, c, k);
 
                ret = bch2_read_indirect_extent(&trans,
                                        &offset_into_extent, &cur);
@@ -901,14 +921,14 @@ retry:
                        break;
 
                k = bkey_i_to_s_c(cur.k);
+               bkey_on_stack_realloc(&prev, c, k.k->u64s);
 
                sectors = min(sectors, k.k->size - offset_into_extent);
 
-               if (offset_into_extent)
-                       bch2_cut_front(POS(k.k->p.inode,
-                                          bkey_start_offset(k.k) +
-                                          offset_into_extent),
-                                      cur.k);
+               bch2_cut_front(POS(k.k->p.inode,
+                                  bkey_start_offset(k.k) +
+                                  offset_into_extent),
+                              cur.k);
                bch2_key_resize(&cur.k->k, sectors);
                cur.k->k.p = iter->pos;
                cur.k->k.p.offset += cur.k->k.size;
@@ -923,10 +943,8 @@ retry:
                bkey_copy(prev.k, cur.k);
                have_extent = true;
 
-               if (k.k->type == KEY_TYPE_reflink_v)
-                       bch2_btree_iter_set_pos(iter, k.k->p);
-               else
-                       bch2_btree_iter_next(iter);
+               bch2_btree_iter_set_pos(iter,
+                       POS(iter->pos.inode, iter->pos.offset + sectors));
        }
 
        if (ret == -EINTR)
@@ -1062,7 +1080,7 @@ static const struct address_space_operations bch_address_space_operations = {
        .writepage      = bch2_writepage,
        .readpage       = bch2_readpage,
        .writepages     = bch2_writepages,
-       .readpages      = bch2_readpages,
+       .readahead      = bch2_readahead,
        .set_page_dirty = __set_page_dirty_nobuffers,
        .write_begin    = bch2_write_begin,
        .write_end      = bch2_write_end,
@@ -1238,6 +1256,11 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct bch_fs *c = sb->s_fs_info;
        struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
        unsigned shift = sb->s_blocksize_bits - 9;
+       /*
+        * this assumes inodes take up 64 bytes, which is a decent average
+        * number:
+        */
+       u64 avail_inodes = ((usage.capacity - usage.used) << 3);
        u64 fsid;
 
        buf->f_type     = BCACHEFS_STATFS_MAGIC;
@@ -1245,8 +1268,9 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_blocks   = usage.capacity >> shift;
        buf->f_bfree    = (usage.capacity - usage.used) >> shift;
        buf->f_bavail   = buf->f_bfree;
-       buf->f_files    = 0;
-       buf->f_ffree    = 0;
+
+       buf->f_files    = usage.nr_inodes + avail_inodes;
+       buf->f_ffree    = avail_inodes;
 
        fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
               le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
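
`usage.capacity` and `usage.used` are counted in 512-byte sectors, so at the assumed 64 bytes per inode every free sector holds 512/64 = 8 inodes; that is the `<< 3`. Worked example: 1 GiB of free space is 2097152 sectors, so statfs() now reports f_ffree = 16777216 and f_files = nr_inodes + 16777216, where both were previously hard-coded to 0.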
index 5a6df3d1973a9dedbfab51e1a5b667a7da3751df..0c503527084656ad62f189134e904176c0e8e2d6 100644 (file)
@@ -537,7 +537,7 @@ retry:
 
                        bch2_trans_unlock(&trans);
 
-                       bch2_inode_pack(&p, &w.inode);
+                       bch2_inode_pack(c, &p, &w.inode);
 
                        ret = bch2_btree_insert(c, BTREE_ID_INODES,
                                                &p.inode.k_i, NULL, NULL,
@@ -808,7 +808,7 @@ create_root:
                        0, NULL);
        root_inode->bi_inum = BCACHEFS_ROOT_INO;
 
-       bch2_inode_pack(&packed, root_inode);
+       bch2_inode_pack(c, &packed, root_inode);
 
        return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
                                 NULL, NULL,
@@ -866,36 +866,22 @@ create_lostfound:
        return ret;
 }
 
-struct inode_bitmap {
-       unsigned long   *bits;
-       size_t          size;
-};
+typedef GENRADIX(unsigned long) inode_bitmap;
 
-static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
+static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr)
 {
-       return nr < b->size ? test_bit(nr, b->bits) : false;
+       unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG);
+       return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false;
 }
 
-static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
+static inline int inode_bitmap_set(inode_bitmap *b, size_t nr)
 {
-       if (nr >= b->size) {
-               size_t new_size = max_t(size_t, max_t(size_t,
-                                       PAGE_SIZE * 8,
-                                       b->size * 2),
-                                       nr + 1);
-               void *n;
-
-               new_size = roundup_pow_of_two(new_size);
-               n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO);
-               if (!n) {
-                       return -ENOMEM;
-               }
+       unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL);
 
-               b->bits = n;
-               b->size = new_size;
-       }
+       if (!w)
+               return -ENOMEM;
 
-       __set_bit(nr, b->bits);
+       *w |= 1UL << (nr & (BITS_PER_LONG - 1));
        return 0;
 }
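
The genradix-backed bitmap splits an inode number into a word index (`nr / BITS_PER_LONG`) selecting an `unsigned long` in the radix tree, and a bit index within that word; backing pages are allocated on demand, so a sparse inum space no longer pays for the old flat bitmap's krealloc-and-double growth. Usage sketch (hypothetical caller):

	inode_bitmap seen;

	genradix_init(&seen);

	if (inode_bitmap_set(&seen, 1000003))	/* word 15625, bit 3 on 64-bit */
		return -ENOMEM;			/* genradix_ptr_alloc() failed */

	BUG_ON(!inode_bitmap_test(&seen, 1000003));
	genradix_free(&seen);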
 
@@ -934,7 +920,7 @@ noinline_for_stack
 static int check_directory_structure(struct bch_fs *c,
                                     struct bch_inode_unpacked *lostfound_inode)
 {
-       struct inode_bitmap dirs_done = { NULL, 0 };
+       inode_bitmap dirs_done;
        struct pathbuf path = { 0, 0, NULL };
        struct pathbuf_entry *e;
        struct btree_trans trans;
@@ -951,6 +937,7 @@ static int check_directory_structure(struct bch_fs *c,
 
        /* DFS: */
 restart_dfs:
+       genradix_init(&dirs_done);
        had_unreachable = false;
 
        ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
@@ -1057,7 +1044,7 @@ retry:
 
        if (had_unreachable) {
                bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
-               kfree(dirs_done.bits);
+               genradix_free(&dirs_done);
                kfree(path.entries);
                memset(&dirs_done, 0, sizeof(dirs_done));
                memset(&path, 0, sizeof(path));
@@ -1066,7 +1053,7 @@ retry:
 err:
 fsck_err:
        ret = bch2_trans_exit(&trans) ?: ret;
-       kfree(dirs_done.bits);
+       genradix_free(&dirs_done);
        kfree(path.entries);
        return ret;
 }
@@ -1326,7 +1313,7 @@ static int check_inode(struct btree_trans *trans,
        if (do_update) {
                struct bkey_inode_buf p;
 
-               bch2_inode_pack(&p, &u);
+               bch2_inode_pack(c, &p, &u);
 
                ret = __bch2_trans_do(trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|
index 7d20f082ad45a48d67fded8629aaad613cddbdee..42371de7f72a87be4ab91b174202b36f0124a33f 100644 (file)
@@ -1,12 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "btree_key_cache.h"
 #include "bkey_methods.h"
 #include "btree_update.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
 #include "str_hash.h"
+#include "varint.h"
 
 #include <linux/random.h>
 
@@ -88,22 +90,17 @@ static int inode_decode_field(const u8 *in, const u8 *end,
        return bytes;
 }
 
-void bch2_inode_pack(struct bkey_inode_buf *packed,
-                    const struct bch_inode_unpacked *inode)
+static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
+                                       const struct bch_inode_unpacked *inode)
 {
-       u8 *out = packed->inode.v.fields;
+       struct bkey_i_inode *k = &packed->inode;
+       u8 *out = k->v.fields;
        u8 *end = (void *) &packed[1];
        u8 *last_nonzero_field = out;
        unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
        unsigned bytes;
 
-       bkey_inode_init(&packed->inode.k_i);
-       packed->inode.k.p.offset        = inode->bi_inum;
-       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
-       packed->inode.v.bi_flags        = cpu_to_le32(inode->bi_flags);
-       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
-
-#define x(_name, _bits)                                        \
+#define x(_name, _bits)                                                        \
        out += inode_encode_field(out, end, 0, inode->_name);           \
        nr_fields++;                                                    \
                                                                        \
@@ -122,7 +119,69 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
        set_bkey_val_bytes(&packed->inode.k, bytes);
        memset_u64s_tail(&packed->inode.v, 0, bytes);
 
-       SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+       SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
+                              const struct bch_inode_unpacked *inode)
+{
+       struct bkey_i_inode *k = &packed->inode;
+       u8 *out = k->v.fields;
+       u8 *end = (void *) &packed[1];
+       u8 *last_nonzero_field = out;
+       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+       unsigned bytes;
+       int ret;
+
+#define x(_name, _bits)                                                        \
+       nr_fields++;                                                    \
+                                                                       \
+       if (inode->_name) {                                             \
+               ret = bch2_varint_encode(out, inode->_name);            \
+               out += ret;                                             \
+                                                                       \
+               if (_bits > 64)                                         \
+                       *out++ = 0;                                     \
+                                                                       \
+               last_nonzero_field = out;                               \
+               last_nonzero_fieldnr = nr_fields;                       \
+       } else {                                                        \
+               *out++ = 0;                                             \
+                                                                       \
+               if (_bits > 64)                                         \
+                       *out++ = 0;                                     \
+       }
+
+       BCH_INODE_FIELDS()
+#undef  x
+       BUG_ON(out > end);
+
+       out = last_nonzero_field;
+       nr_fields = last_nonzero_fieldnr;
+
+       bytes = out - (u8 *) &packed->inode.v;
+       set_bkey_val_bytes(&packed->inode.k, bytes);
+       memset_u64s_tail(&packed->inode.v, 0, bytes);
+
+       SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+void bch2_inode_pack(struct bch_fs *c,
+                    struct bkey_inode_buf *packed,
+                    const struct bch_inode_unpacked *inode)
+{
+       bkey_inode_init(&packed->inode.k_i);
+       packed->inode.k.p.offset        = inode->bi_inum;
+       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
+       packed->inode.v.bi_flags        = cpu_to_le32(inode->bi_flags);
+       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
+
+       if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
+               SET_INODE_NEW_VARINT(&packed->inode.v, true);
+               bch2_inode_pack_v2(packed, inode);
+       } else {
+               bch2_inode_pack_v1(packed, inode);
+       }
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                struct bch_inode_unpacked unpacked;
@@ -134,26 +193,23 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
                BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
                BUG_ON(unpacked.bi_mode         != inode->bi_mode);
 
-#define x(_name, _bits)        BUG_ON(unpacked._name != inode->_name);
+#define x(_name, _bits)        if (unpacked._name != inode->_name)             \
+                       panic("unpacked %llu should be %llu",           \
+                             (u64) unpacked._name, (u64) inode->_name);
                BCH_INODE_FIELDS()
 #undef  x
        }
 }
 
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
-                     struct bch_inode_unpacked *unpacked)
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+                               struct bch_inode_unpacked *unpacked)
 {
        const u8 *in = inode.v->fields;
-       const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+       const u8 *end = bkey_val_end(inode);
        u64 field[2];
        unsigned fieldnr = 0, field_bits;
        int ret;
 
-       unpacked->bi_inum       = inode.k->p.offset;
-       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
-       unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
-       unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
-
 #define x(_name, _bits)                                        \
        if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {                    \
                memset(&unpacked->_name, 0,                             \
@@ -176,6 +232,62 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
 #undef  x
 
        /* XXX: signal if there were more fields than expected? */
+       return 0;
+}
+
+static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
+                               struct bch_inode_unpacked *unpacked)
+{
+       const u8 *in = inode.v->fields;
+       const u8 *end = bkey_val_end(inode);
+       unsigned fieldnr = 0;
+       int ret;
+       u64 v[2];
+
+#define x(_name, _bits)                                                        \
+       if (fieldnr < INODE_NR_FIELDS(inode.v)) {                       \
+               ret = bch2_varint_decode(in, end, &v[0]);               \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               in += ret;                                              \
+                                                                       \
+               if (_bits > 64) {                                       \
+                       ret = bch2_varint_decode(in, end, &v[1]);       \
+                       if (ret < 0)                                    \
+                               return ret;                             \
+                       in += ret;                                      \
+               } else {                                                \
+                       v[1] = 0;                                       \
+               }                                                       \
+       } else {                                                        \
+               v[0] = v[1] = 0;                                        \
+       }                                                               \
+                                                                       \
+       unpacked->_name = v[0];                                         \
+       if (v[1] || v[0] != unpacked->_name)                            \
+               return -1;                                              \
+       fieldnr++;
+
+       BCH_INODE_FIELDS()
+#undef  x
+
+       /* XXX: signal if there were more fields than expected? */
+       return 0;
+}
+
+int bch2_inode_unpack(struct bkey_s_c_inode inode,
+                     struct bch_inode_unpacked *unpacked)
+{
+       unpacked->bi_inum       = inode.k->p.offset;
+       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
+       unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
+       unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
+
+       if (INODE_NEW_VARINT(inode.v)) {
+               return bch2_inode_unpack_v2(inode, unpacked);
+       } else {
+               return bch2_inode_unpack_v1(inode, unpacked);
+       }
 
        return 0;
 }
@@ -189,11 +301,11 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
        int ret;
 
        iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum),
-                                  BTREE_ITER_SLOTS|flags);
+                                  BTREE_ITER_CACHED|flags);
        if (IS_ERR(iter))
                return iter;
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_cached(iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -222,7 +334,7 @@ int bch2_inode_write(struct btree_trans *trans,
        if (IS_ERR(inode_p))
                return PTR_ERR(inode_p);
 
-       bch2_inode_pack(inode_p, inode);
+       bch2_inode_pack(trans->c, inode_p, inode);
        bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
        return 0;
 }
@@ -271,6 +383,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
                return;
        }
 
+       pr_buf(out, "mode: %o ", unpacked.bi_mode);
+
 #define x(_name, _bits)                                                \
        pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
        BCH_INODE_FIELDS()
@@ -359,20 +473,24 @@ static inline u32 bkey_generation(struct bkey_s_c k)
 }
 
 int bch2_inode_create(struct btree_trans *trans,
-                     struct bch_inode_unpacked *inode_u,
-                     u64 min, u64 max, u64 *hint)
+                     struct bch_inode_unpacked *inode_u)
 {
+       struct bch_fs *c = trans->c;
        struct bkey_inode_buf *inode_p;
        struct btree_iter *iter = NULL;
        struct bkey_s_c k;
-       u64 start;
+       u64 min, max, start, *hint;
        int ret;
 
-       if (!max)
-               max = ULLONG_MAX;
+       unsigned cpu = raw_smp_processor_id();
+       unsigned bits = (c->opts.inodes_32bit
+               ? 31 : 63) - c->inode_shard_bits;
 
-       if (trans->c->opts.inodes_32bit)
-               max = min_t(u64, max, U32_MAX);
+       min = (cpu << bits);
+       max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+       min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+       hint = c->unused_inode_hints + cpu;
 
        start = READ_ONCE(*hint);
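
Inode allocation is now sharded: each possible CPU owns a disjoint slice of the inum space, selected by the CPU index in the bits just below the 31- or 63-bit limit, with a per-CPU allocation hint (`c->unused_inode_hints`, allocated `1U << c->inode_shard_bits` entries in super.c below). Worked example: 8 possible CPUs give `inode_shard_bits` = 3; with 32-bit inode numbers, `bits` = 31 - 3 = 28, so CPU 2 allocates from [2 << 28, (2 << 28) | 0x0fffffff] = [0x20000000, 0x2fffffff], clamped below by BLOCKDEV_INODE_MAX. (One caution, an observation rather than a change made here: `cpu << bits` shifts an `unsigned`, which is only safe while `bits` fits a 32-bit shift as in this example; a 64-bit-inum variant would want `(u64) cpu << bits`.)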
 
@@ -388,7 +506,17 @@ again:
                if (bkey_cmp(iter->pos, POS(0, max)) > 0)
                        break;
 
-               if (k.k->type != KEY_TYPE_inode)
+               /*
+                * There's a potential cache coherency issue with the btree key
+                * cache code here - we're iterating over the btree, skipping
+                * that cache. We should never see an empty slot that isn't
+                * actually empty due to a pending update in the key cache
+                * because the update that creates the inode isn't done with a
+                * cached iterator, but - better safe than sorry, check the
+                * cache before using a slot:
+                */
+               if (k.k->type != KEY_TYPE_inode &&
+                   !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos))
                        goto found_slot;
        }
 
@@ -409,10 +537,7 @@ found_slot:
        inode_u->bi_inum        = k.k->p.offset;
        inode_u->bi_generation  = bkey_generation(k);
 
-       bch2_inode_pack(inode_p, inode_u);
-       bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
-       bch2_trans_iter_put(trans, iter);
-       return 0;
+       return bch2_inode_write(trans, iter, inode_u);
 }
 
 int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
@@ -422,6 +547,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
        struct bkey_i_inode_generation delete;
        struct bpos start = POS(inode_nr, 0);
        struct bpos end = POS(inode_nr + 1, 0);
+       struct bkey_s_c k;
+       u64 bi_generation;
        int ret;
 
        /*
@@ -442,51 +569,62 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
                return ret;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       bi_generation = 0;
+
+       ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr));
+       if (ret) {
+               if (ret != -EINTR)
+                       bch_err(c, "error flushing btree key cache: %i", ret);
+               goto err;
+       }
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       do {
-               struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-               u32 bi_generation = 0;
+       k = bch2_btree_iter_peek_slot(iter);
 
-               ret = bkey_err(k);
-               if (ret)
-                       break;
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
 
-               bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
-                                       "inode %llu not found when deleting",
-                                       inode_nr);
+       bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
+                               "inode %llu not found when deleting",
+                               inode_nr);
 
-               switch (k.k->type) {
-               case KEY_TYPE_inode: {
-                       struct bch_inode_unpacked inode_u;
+       switch (k.k->type) {
+       case KEY_TYPE_inode: {
+               struct bch_inode_unpacked inode_u;
 
-                       if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
-                               bi_generation = inode_u.bi_generation + 1;
-                       break;
-               }
-               case KEY_TYPE_inode_generation: {
-                       struct bkey_s_c_inode_generation g =
-                               bkey_s_c_to_inode_generation(k);
-                       bi_generation = le32_to_cpu(g.v->bi_generation);
-                       break;
-               }
-               }
+               if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
+                       bi_generation = inode_u.bi_generation + 1;
+               break;
+       }
+       case KEY_TYPE_inode_generation: {
+               struct bkey_s_c_inode_generation g =
+                       bkey_s_c_to_inode_generation(k);
+               bi_generation = le32_to_cpu(g.v->bi_generation);
+               break;
+       }
+       }
 
-               if (!bi_generation) {
-                       bkey_init(&delete.k);
-                       delete.k.p.offset = inode_nr;
-               } else {
-                       bkey_inode_generation_init(&delete.k_i);
-                       delete.k.p.offset = inode_nr;
-                       delete.v.bi_generation = cpu_to_le32(bi_generation);
-               }
+       if (!bi_generation) {
+               bkey_init(&delete.k);
+               delete.k.p.offset = inode_nr;
+       } else {
+               bkey_inode_generation_init(&delete.k_i);
+               delete.k.p.offset = inode_nr;
+               delete.v.bi_generation = cpu_to_le32(bi_generation);
+       }
 
-               bch2_trans_update(&trans, iter, &delete.k_i, 0);
+       bch2_trans_update(&trans, iter, &delete.k_i, 0);
 
-               ret = bch2_trans_commit(&trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL);
-       } while (ret == -EINTR);
+       ret = bch2_trans_commit(&trans, NULL, NULL,
+                               BTREE_INSERT_NOFAIL);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
        bch2_trans_exit(&trans);
        return ret;
@@ -500,11 +638,11 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
        int ret;
 
        iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
-                       POS(0, inode_nr), BTREE_ITER_SLOTS);
+                       POS(0, inode_nr), BTREE_ITER_CACHED);
        if (IS_ERR(iter))
                return PTR_ERR(iter);
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_cached(iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -523,32 +661,3 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
        return bch2_trans_do(c, NULL, NULL, 0,
                bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
 }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void)
-{
-       struct bch_inode_unpacked *u, test_inodes[] = {
-               {
-                       .bi_atime       = U64_MAX,
-                       .bi_ctime       = U64_MAX,
-                       .bi_mtime       = U64_MAX,
-                       .bi_otime       = U64_MAX,
-                       .bi_size        = U64_MAX,
-                       .bi_sectors     = U64_MAX,
-                       .bi_uid         = U32_MAX,
-                       .bi_gid         = U32_MAX,
-                       .bi_nlink       = U32_MAX,
-                       .bi_generation  = U32_MAX,
-                       .bi_dev         = U32_MAX,
-               },
-       };
-
-       for (u = test_inodes;
-            u < test_inodes + ARRAY_SIZE(test_inodes);
-            u++) {
-               struct bkey_inode_buf p;
-
-               bch2_inode_pack(&p, u);
-       }
-}
-#endif
index bb759a46dc415a57a4f76119036cb245fd2c645b..ef7e885dce0c87e671248f30f9debbaaf13b2dc9 100644 (file)
@@ -24,6 +24,14 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
        .val_to_text    = bch2_inode_generation_to_text,        \
 }
 
+#if 0
+typedef struct {
+       u64                     lo;
+       u32                     hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
 struct bch_inode_unpacked {
        u64                     bi_inum;
        __le64                  bi_hash_seed;
@@ -43,7 +51,8 @@ struct bkey_inode_buf {
 #undef  x
 } __attribute__((packed, aligned(8)));
 
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
+                    const struct bch_inode_unpacked *);
 int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
 
 struct btree_iter *bch2_inode_peek(struct btree_trans *,
@@ -60,9 +69,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
                     uid_t, gid_t, umode_t, dev_t,
                     struct bch_inode_unpacked *);
 
-int bch2_inode_create(struct btree_trans *,
-                     struct bch_inode_unpacked *,
-                     u64, u64, u64 *);
+int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
 
 int bch2_inode_rm(struct bch_fs *, u64);
 
@@ -168,10 +175,4 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
        }
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void);
-#else
-static inline void bch2_inode_pack_test(void) {}
-#endif
-
 #endif /* _BCACHEFS_INODE_H */
index 8add8ccd129dade398cffe98d053cb983e97e0fd..21087d1193dcf9143be1fa7f947b16d1073d9bc0 100644 (file)
@@ -171,7 +171,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 
        while (size) {
                struct page *page = __bio_alloc_page_pool(c, &using_mempool);
-               unsigned len = min(PAGE_SIZE, size);
+               unsigned len = min_t(size_t, PAGE_SIZE, size);
 
                BUG_ON(!bio_add_page(bio, page, len, 0));
                size -= len;
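
The kernel's `min()` type-checks its operands; `PAGE_SIZE` is an `unsigned long` while `size` here is a `size_t`, and on 32-bit builds of bcachefs-tools those are distinct types, so `min_t(size_t, ...)` casts both sides explicitly (util.c below gets the same fix). Illustration (the 32-bit build motivation is an assumption, not stated in this diff):

	size_t size = 3 * PAGE_SIZE + 100;
	unsigned len;

	/* len = min(PAGE_SIZE, size);   breaks where size_t != unsigned long */
	len = min_t(size_t, PAGE_SIZE, size);	/* both operands cast to size_t */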
@@ -301,7 +301,7 @@ int bch2_extent_update(struct btree_trans *trans,
                inode_u.bi_sectors += delta;
 
                if (delta || new_i_size) {
-                       bch2_inode_pack(&inode_p, &inode_u);
+                       bch2_inode_pack(trans->c, &inode_p, &inode_u);
                        bch2_trans_update(trans, inode_iter,
                                          &inode_p.inode.k_i, 0);
                }
index b8b719902c637ffd739d7b43f1432beb8820f327..c2cafd3892a4cf1d88404e9cc6764a58ffea2451 100644 (file)
@@ -980,9 +980,11 @@ void bch2_fs_journal_stop(struct journal *j)
 
        wait_event(j->wait, journal_entry_close(j));
 
-       /* do we need to write another journal entry? */
-       if (test_bit(JOURNAL_NOT_EMPTY, &j->flags))
-               bch2_journal_meta(j);
+       /*
+        * Always write a new journal entry, to make sure the clock hands are up
+        * to date (and match the superblock)
+        */
+       bch2_journal_meta(j);
 
        journal_quiesce(j);
 
index 57591983eebd420c604453e9f4db8d130c46aae3..18e45296e7def4d2636246c504fb77966574bba0 100644 (file)
@@ -465,34 +465,12 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
        return ret;
 }
 
-/**
- * bch2_journal_reclaim - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-void bch2_journal_reclaim(struct journal *j)
+static u64 journal_seq_to_flush(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;
-
-       lockdep_assert_held(&j->reclaim_lock);
-
-       bch2_journal_do_discards(j);
+       unsigned iter;
 
        spin_lock(&j->lock);
 
@@ -524,20 +502,52 @@ void bch2_journal_reclaim(struct journal *j)
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);
 
-       /*
-        * If it's been longer than j->reclaim_delay_ms since we last flushed,
-        * make sure to flush at least one journal pin:
-        */
-       if (time_after(jiffies, j->last_flushed +
-                      msecs_to_jiffies(j->reclaim_delay_ms)))
-               min_nr = 1;
+       return seq_to_flush;
+}
 
-       if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
-               seq_to_flush = max(seq_to_flush, journal_last_seq(j));
-               min_nr = 1;
-       }
+/**
+ * bch2_journal_reclaim - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+void bch2_journal_reclaim(struct journal *j)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       unsigned min_nr = 0;
+       u64 seq_to_flush = 0;
+
+       lockdep_assert_held(&j->reclaim_lock);
+
+       do {
+               bch2_journal_do_discards(j);
+
+               seq_to_flush = journal_seq_to_flush(j);
+               min_nr = 0;
+
+               /*
+                * If it's been longer than j->reclaim_delay_ms since we last flushed,
+                * make sure to flush at least one journal pin:
+                */
+               if (time_after(jiffies, j->last_flushed +
+                              msecs_to_jiffies(j->reclaim_delay_ms)))
+                       min_nr = 1;
 
-       journal_flush_pins(j, seq_to_flush, min_nr);
+               if (j->prereserved.reserved * 2 > j->prereserved.remaining)
+                       min_nr = 1;
+       } while (journal_flush_pins(j, seq_to_flush, min_nr));
 
        if (!bch2_journal_error(j))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
index 32fed6b81a526a6f83549bc05cf56ab5907fb7a9..1745cfac6b26aef400984100b5d82193dec1ad88 100644 (file)
@@ -1320,7 +1320,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch2_inode_init(c, &root_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
        root_inode.bi_inum = BCACHEFS_ROOT_INO;
-       bch2_inode_pack(&packed_inode, &root_inode);
+       bch2_inode_pack(c, &packed_inode, &root_inode);
 
        err = "error creating root directory";
        ret = bch2_btree_insert(c, BTREE_ID_INODES,
index 015bbd9f21fd933774c0982b285a42014da3ba35..8673e9744ce18d27ccf3e41d83d6d81fcb1ebee6 100644 (file)
@@ -451,6 +451,7 @@ int bch2_fs_read_write_early(struct bch_fs *c)
 static void __bch2_fs_free(struct bch_fs *c)
 {
        unsigned i;
+       int cpu;
 
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);
@@ -475,6 +476,12 @@ static void __bch2_fs_free(struct bch_fs *c)
        free_percpu(c->usage[1]);
        free_percpu(c->usage[0]);
        kfree(c->usage_base);
+
+       if (c->btree_iters_bufs)
+               for_each_possible_cpu(cpu)
+                       kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+
+       free_percpu(c->btree_iters_bufs);
        free_percpu(c->pcpu);
        mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
@@ -485,6 +492,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        kfree(c->replicas_gc.entries);
        kfree(rcu_dereference_protected(c->disk_groups, 1));
        kfree(c->journal_seq_blacklist_table);
+       kfree(c->unused_inode_hints);
        free_heap(&c->copygc_heap);
 
        if (c->journal_reclaim_wq)
@@ -736,11 +744,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                (btree_blocks(c) + 1) * 2 *
                sizeof(struct sort_iter_set);
 
+       c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
        if (!(c->wq = alloc_workqueue("bcachefs",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
-           !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+           !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
-           !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
+           !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
            percpu_ref_init(&c->writes, bch2_writes_disabled,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
@@ -750,9 +760,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+           !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+           !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+                                             sizeof(u64), GFP_KERNEL)) ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
            bch2_fs_journal_init(&c->journal) ||
@@ -2012,7 +2025,6 @@ static void bcachefs_exit(void)
 static int __init bcachefs_init(void)
 {
        bch2_bkey_pack_test();
-       bch2_inode_pack_test();
 
        if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
            bch2_chardev_init() ||
index 0cb29f43d99d9a28c6e70139bc637d45b1ba0d6f..d7ad293aff4dfd7cb77c3dd228a44a17639f6790 100644 (file)
@@ -208,12 +208,6 @@ read_attribute(io_timers_write);
 write_attribute(perf_test);
 #endif /* CONFIG_BCACHEFS_TESTS */
 
-#define BCH_DEBUG_PARAM(name, description)                             \
-       rw_attribute(name);
-
-       BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
 #define x(_name)                                               \
        static struct attribute sysfs_time_stat_##_name =               \
                { .name = #_name, .mode = S_IRUGO };
@@ -414,10 +408,6 @@ SHOW(bch2_fs)
                return out.pos - buf;
        }
 
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
-       BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
        return 0;
 }
 
@@ -462,10 +452,6 @@ STORE(bch2_fs)
 
        /* Debugging: */
 
-#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
-       BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
        if (!test_bit(BCH_FS_STARTED, &c->flags))
                return -EPERM;
 
@@ -590,11 +576,6 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_io_timers_write,
 
        &sysfs_internal_uuid,
-
-#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
-       BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
        NULL
 };
 
index fd4044a6a08fbafd487e4cd80dfb507dff61a264..2709163e02b538b0b0a6df075a56ba48b4ed8e6c 100644 (file)
@@ -520,7 +520,7 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
 {
        while (size) {
                struct page *page = alloc_page(gfp_mask);
-               unsigned len = min(PAGE_SIZE, size);
+               unsigned len = min_t(size_t, PAGE_SIZE, size);
 
                if (!page)
                        return -ENOMEM;
index f48c6380684f67ec5f6052507c07906148ee66f6..6e5335440b4b5696b1764bba740225bd46ef039f 100644 (file)
@@ -37,17 +37,6 @@ struct closure;
 #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
 #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
 
-#define memcpy(dst, src, len)                                          \
-({                                                                     \
-       void *_dst = (dst);                                             \
-       const void *_src = (src);                                       \
-       size_t _len = (len);                                            \
-                                                                       \
-       BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) ||         \
-                (void *) (_dst) + (_len) <= (void *) (_src)));         \
-       memcpy(_dst, _src, _len);                                       \
-})
-
 #else /* DEBUG */
 
 #define EBUG_ON(cond)
diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c
new file mode 100644 (file)
index 0000000..a3d252c
--- /dev/null
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#include "varint.h"
+
+int bch2_varint_encode(u8 *out, u64 v)
+{
+       unsigned bits = fls64(v|1);
+       unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+       if (likely(bytes < 9)) {
+               v <<= bytes;
+               v |= ~(~0 << (bytes - 1));
+       } else {
+               *out++ = 255;
+               bytes = 9;
+       }
+
+       put_unaligned_le64(v, out);
+       return bytes;
+}
+
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+       u64 v = get_unaligned_le64(in);
+       unsigned bytes = ffz(v & 255) + 1;
+
+       if (unlikely(in + bytes > end))
+               return -1;
+
+       if (likely(bytes < 9)) {
+               v >>= bytes;
+               v &= ~(~0ULL << (7 * bytes));
+       } else {
+               v = get_unaligned_le64(++in);
+       }
+
+       *out = v;
+       return bytes;
+}
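
The format keeps the length in the low bits of the first byte: an n-byte encoding has n-1 one-bits followed by a zero bit at the bottom (recovered with `ffz()`), with the value shifted above them; values needing more than 56 bits get a 0xff prefix byte followed by the raw little-endian u64, nine bytes total. Worked example: v = 300 needs 9 significant bits, so bytes = 2; 300 << 2 = 0x4b0, OR 0b01 = 0x4b1, stored as b1 04. Decode reads the low byte 0xb1 = 10110001b, finds its first zero at bit 1 (so 2 bytes), shifts right by 2 and masks to 7*2 = 14 bits, recovering 300. Note both sides go through put/get_unaligned_le64(), i.e. they always touch a full 8 bytes at the cursor, so buffers need slack beyond the encoded length. Round-trip sketch (hypothetical test):

	u8 buf[16] = { 0 };	/* slack: encode/decode access 8 bytes at a time */
	u64 v;

	BUG_ON(bch2_varint_encode(buf, 300) != 2);	/* buf[0] = 0xb1, buf[1] = 0x04 */
	BUG_ON(bch2_varint_decode(buf, buf + sizeof(buf), &v) != 2);
	BUG_ON(v != 300);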
diff --git a/libbcachefs/varint.h b/libbcachefs/varint.h
new file mode 100644 (file)
index 0000000..8daf813
--- /dev/null
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */