git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 04036b4910 bcachefs: Fix a memory leak
author: Kent Overstreet <kent.overstreet@gmail.com>
Sun, 27 Feb 2022 17:01:32 +0000 (12:01 -0500)
committer: Kent Overstreet <kent.overstreet@gmail.com>
Sun, 27 Feb 2022 17:01:32 +0000 (12:01 -0500)
19 files changed:
.bcachefs_revision
include/trace/events/bcachefs.h
libbcachefs/bcachefs.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/debug.c
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/super-io.c
libbcachefs/sysfs.c
libbcachefs/util.h

index 52682fdc4b33382c3fa0822ca82b290bf0743327..6f4750b7de12f086c79956fe4facaf4885b1022c 100644 (file)
@@ -1 +1 @@
-31718a290491ef933e0bfc5fb666a197b08a4d10
+04036b491089aeb4bac5d796ae1716d019564f7a
index 8cf6669e28306cc22d3ca7e634a58675858f7dab..0596887959d3ef6be198e58c1c19dd73caef2ea3 100644 (file)
@@ -918,6 +918,14 @@ TRACE_EVENT(trans_restart_mem_realloced,
                  __entry->bytes)
 );
 
+DEFINE_EVENT(transaction_restart_iter, trans_restart_key_cache_key_realloced,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
+
 #endif /* _TRACE_BCACHE_H */
 
 /* This part must be outside protection */
index 45a43f716c44b2683eb70072a4a13b53fa37da89..211fd5adf9e3031a0cb94e98dea4615091656024 100644 (file)
@@ -534,14 +534,10 @@ enum {
        BCH_FS_NEED_ANOTHER_GC,
        BCH_FS_DELETED_NODES,
        BCH_FS_REBUILD_REPLICAS,
-       BCH_FS_HOLD_BTREE_WRITES,
 };
 
 struct btree_debug {
        unsigned                id;
-       struct dentry           *btree;
-       struct dentry           *btree_format;
-       struct dentry           *failed;
 };
 
 struct bch_fs_pcpu {
@@ -886,7 +882,8 @@ struct bch_fs {
        struct bch_memquota_type quotas[QTYP_NR];
 
        /* DEBUG JUNK */
-       struct dentry           *debug;
+       struct dentry           *fs_debug_dir;
+       struct dentry           *btree_debug_dir;
        struct btree_debug      btree_debug[BTREE_ID_NR];
        struct btree            *verify_data;
        struct btree_node       *verify_ondisk;
index 00d4b18292aec9d5120ab93285b8334000fb5c3d..1347b1fc1166eed01d02e71cda8fbdabf1ec7814 100644 (file)
 
 struct lock_class_key bch2_btree_node_lock_key;
 
+const char * const bch2_btree_node_flags[] = {
+#define x(f)   #f,
+       BTREE_FLAGS()
+#undef x
+       NULL
+};
+
 void bch2_recalc_btree_reserve(struct bch_fs *c)
 {
        unsigned i, reserve = 16;
@@ -217,15 +224,13 @@ wait_on_io:
                goto wait_on_io;
        }
 
-       if (btree_node_noevict(b))
-               goto out_unlock;
-
-       if (!btree_node_may_write(b))
+       if (btree_node_noevict(b) ||
+           btree_node_write_blocked(b) ||
+           btree_node_will_make_reachable(b))
                goto out_unlock;
 
        if (btree_node_dirty(b)) {
-               if (!flush ||
-                   test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+               if (!flush)
                        goto out_unlock;
                /*
                 * Using the underscore version because we don't want to compact
@@ -234,9 +239,9 @@ wait_on_io:
                 * the post write cleanup:
                 */
                if (bch2_verify_btree_ondisk)
-                       bch2_btree_node_write(c, b, SIX_LOCK_intent);
+                       bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
                else
-                       __bch2_btree_node_write(c, b, false);
+                       __bch2_btree_node_write(c, b, 0);
 
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
@@ -415,7 +420,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 
                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
-               clear_btree_node_dirty(c, b);
+               clear_btree_node_dirty_acct(c, b);
 
                btree_node_data_free(c, b);
        }
@@ -1059,7 +1064,7 @@ wait_on_io:
        six_lock_write(&b->c.lock, NULL, NULL);
 
        if (btree_node_dirty(b)) {
-               __bch2_btree_node_write(c, b, false);
+               __bch2_btree_node_write(c, b, 0);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
index f7e10986f317cc2036abcb143648bc721c0c2eb0..2901f0dc925b2e74b059ceaaf1d11027b2ad2815 100644 (file)
@@ -7,6 +7,8 @@
 
 extern struct lock_class_key bch2_btree_node_lock_key;
 
+extern const char * const bch2_btree_node_flags[];
+
 struct btree_iter;
 
 void bch2_recalc_btree_reserve(struct bch_fs *);
index 88b234f58ef54d04baeba7672c09fc4cf366f9a0..cd9016541d9c53d2add243ba30f821e4eaed52ca 100644 (file)
@@ -1059,6 +1059,9 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 
        bch2_trans_init(&trans, c, 0, 0);
 
+       if (initial)
+               trans.is_initial_gc = true;
+
        for (i = 0; i < BTREE_ID_NR; i++)
                ids[i] = i;
        bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
index 2b16b656c9beadd31a5f36f44f9ccd6b8b72b29b..08f5f6b865c6ed17bc7fe69d504a10b3e1e0699f 100644 (file)
@@ -477,7 +477,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
                };
 
                if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
-                       bch2_btree_node_write(c, b, SIX_LOCK_write);
+                       bch2_btree_node_write(c, b, SIX_LOCK_write, 0);
                        reinit_iter = true;
                }
        }
@@ -1596,29 +1596,13 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
        bch2_journal_pin_drop(&c->journal, &w->journal);
 }
 
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
        struct btree_write *w = btree_prev_write(b);
        unsigned long old, new, v;
 
        bch2_btree_complete_write(c, b, w);
 
-       v = READ_ONCE(b->flags);
-       do {
-               old = new = v;
-
-               if (old & (1U << BTREE_NODE_need_write))
-                       goto do_write;
-
-               new &= ~(1U << BTREE_NODE_write_in_flight);
-               new &= ~(1U << BTREE_NODE_write_in_flight_inner);
-       } while ((v = cmpxchg(&b->flags, old, new)) != old);
-
-       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-       return;
-
-do_write:
-       six_lock_read(&b->c.lock, NULL, NULL);
        v = READ_ONCE(b->flags);
        do {
                old = new = v;
@@ -1626,7 +1610,8 @@ do_write:
                if ((old & (1U << BTREE_NODE_dirty)) &&
                    (old & (1U << BTREE_NODE_need_write)) &&
                    !(old & (1U << BTREE_NODE_never_write)) &&
-                   btree_node_may_write(b)) {
+                   !(old & (1U << BTREE_NODE_write_blocked)) &&
+                   !(old & (1U << BTREE_NODE_will_make_reachable))) {
                        new &= ~(1U << BTREE_NODE_dirty);
                        new &= ~(1U << BTREE_NODE_need_write);
                        new |=  (1U << BTREE_NODE_write_in_flight);
@@ -1640,8 +1625,13 @@ do_write:
        } while ((v = cmpxchg(&b->flags, old, new)) != old);
 
        if (new & (1U << BTREE_NODE_write_in_flight))
-               __bch2_btree_node_write(c, b, true);
+               __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
+}
 
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+       six_lock_read(&b->c.lock, NULL, NULL);
+       __btree_node_write_done(c, b);
        six_unlock_read(&b->c.lock);
 }
 
@@ -1756,7 +1746,7 @@ static void btree_write_submit(struct work_struct *work)
        bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
 }
 
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
 {
        struct btree_write_bio *wbio;
        struct bset_tree *t;
@@ -1773,12 +1763,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
        void *data;
        int ret;
 
-       if (already_started)
+       if (flags & BTREE_WRITE_ALREADY_STARTED)
                goto do_write;
 
-       if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
-               return;
-
        /*
         * We may only have a read lock on the btree node - the dirty bit is our
         * "lock" against racing with other threads that may be trying to start
@@ -1792,13 +1779,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
                if (!(old & (1 << BTREE_NODE_dirty)))
                        return;
 
-               if (!btree_node_may_write(b))
+               if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+                   !(old & (1 << BTREE_NODE_need_write)))
                        return;
 
-               if (old & (1 << BTREE_NODE_never_write))
+               if (old &
+                   ((1 << BTREE_NODE_never_write)|
+                    (1 << BTREE_NODE_write_blocked)))
                        return;
 
-               BUG_ON(old & (1 << BTREE_NODE_write_in_flight));
+               if (b->written &&
+                   (old & (1 << BTREE_NODE_will_make_reachable)))
+                       return;
+
+               if (old & (1 << BTREE_NODE_write_in_flight))
+                       return;
 
                new &= ~(1 << BTREE_NODE_dirty);
                new &= ~(1 << BTREE_NODE_need_write);
@@ -1998,7 +1993,7 @@ err:
        b->written += sectors_to_write;
 nowrite:
        btree_bounce_free(c, bytes, used_mempool, data);
-       btree_node_write_done(c, b);
+       __btree_node_write_done(c, b);
 }
 
 /*
@@ -2061,12 +2056,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
  * Use this one if the node is intent locked:
  */
 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
-                          enum six_lock_type lock_type_held)
+                          enum six_lock_type lock_type_held,
+                          unsigned flags)
 {
        if (lock_type_held == SIX_LOCK_intent ||
            (lock_type_held == SIX_LOCK_read &&
             six_lock_tryupgrade(&b->c.lock))) {
-               __bch2_btree_node_write(c, b, false);
+               __bch2_btree_node_write(c, b, flags);
 
                /* don't cycle lock unnecessarily: */
                if (btree_node_just_written(b) &&
@@ -2078,7 +2074,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (lock_type_held == SIX_LOCK_read)
                        six_lock_downgrade(&b->c.lock);
        } else {
-               __bch2_btree_node_write(c, b, false);
+               __bch2_btree_node_write(c, b, flags);
                if (lock_type_held == SIX_LOCK_write &&
                    btree_node_just_written(b))
                        bch2_btree_post_write_cleanup(c, b);
@@ -2112,30 +2108,3 @@ void bch2_btree_flush_all_writes(struct bch_fs *c)
 {
        __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
 }
-
-void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct bucket_table *tbl;
-       struct rhash_head *pos;
-       struct btree *b;
-       unsigned i;
-
-       rcu_read_lock();
-       for_each_cached_btree(b, c, tbl, i, pos) {
-               unsigned long flags = READ_ONCE(b->flags);
-
-               if (!(flags & (1 << BTREE_NODE_dirty)))
-                       continue;
-
-               pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
-                      b,
-                      (flags & (1 << BTREE_NODE_dirty)) != 0,
-                      (flags & (1 << BTREE_NODE_need_write)) != 0,
-                      b->c.level,
-                      b->written,
-                      !list_empty_careful(&b->write_blocked),
-                      b->will_make_reachable != 0,
-                      b->will_make_reachable & 1);
-       }
-       rcu_read_unlock();
-}
index 095ad505338d36e1bac1916d5ee9a3f606ed29bc..d818d87661e863a78b19b047276094e9a84696a1 100644 (file)
@@ -15,18 +15,13 @@ struct btree;
 struct btree_iter;
 struct btree_node_read_all;
 
-static inline bool btree_node_dirty(struct btree *b)
-{
-       return test_bit(BTREE_NODE_dirty, &b->flags);
-}
-
-static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
 {
        if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
                atomic_inc(&c->btree_cache.dirty);
 }
 
-static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
 {
        if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
                atomic_dec(&c->btree_cache.dirty);
@@ -67,12 +62,6 @@ void __bch2_btree_node_wait_on_write(struct btree *);
 void bch2_btree_node_wait_on_read(struct btree *);
 void bch2_btree_node_wait_on_write(struct btree *);
 
-static inline bool btree_node_may_write(struct btree *b)
-{
-       return list_empty_careful(&b->write_blocked) &&
-               (!b->written || !b->will_make_reachable);
-}
-
 enum compact_mode {
        COMPACT_LAZY,
        COMPACT_ALL,
@@ -148,41 +137,23 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
 void bch2_btree_complete_write(struct bch_fs *, struct btree *,
                              struct btree_write *);
 
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 
+#define BTREE_WRITE_ONLY_IF_NEED       (1U << 0)
+#define BTREE_WRITE_ALREADY_STARTED    (1U << 1)
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
-                         enum six_lock_type);
+                          enum six_lock_type, unsigned);
 
 static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
                                            enum six_lock_type lock_held)
 {
-       if (b->written &&
-           btree_node_need_write(b) &&
-           btree_node_may_write(b) &&
-           !btree_node_write_in_flight(b))
-               bch2_btree_node_write(c, b, lock_held);
+       bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
 }
 
-#define bch2_btree_node_write_cond(_c, _b, cond)                       \
-do {                                                                   \
-       unsigned long old, new, v = READ_ONCE((_b)->flags);             \
-                                                                       \
-       do {                                                            \
-               old = new = v;                                          \
-                                                                       \
-               if (!(old & (1 << BTREE_NODE_dirty)) || !(cond))        \
-                       break;                                          \
-                                                                       \
-               new |= (1 << BTREE_NODE_need_write);                    \
-       } while ((v = cmpxchg(&(_b)->flags, old, new)) != old);         \
-                                                                       \
-       btree_node_write_if_need(_c, _b, SIX_LOCK_read);                \
-} while (0)
-
 void bch2_btree_flush_all_reads(struct bch_fs *);
 void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
 
 static inline void compat_bformat(unsigned level, enum btree_id btree_id,
                                  unsigned version, unsigned big_endian,
index 8ff6a8d03dc444fb117342ce3e0e34bb2305caf5..c0357ee9cfb78a102ce1a50848b58e818b66f5a9 100644 (file)
@@ -558,7 +558,12 @@ void bch2_trans_unlock(struct btree_trans *trans)
        trans_for_each_path(trans, path)
                __bch2_btree_path_unlock(path);
 
-       BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+       /*
+        * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
+        * btree nodes, it implements its own walking:
+        */
+       BUG_ON(!trans->is_initial_gc &&
+              lock_class_is_held(&bch2_btree_node_lock_key));
 }
 
 /* Btree iterator: */
index 7a1555c2b0975111e891bf3c33f627687baafeee..d87069c5654fc77fe308051c2898e3099667326b 100644 (file)
@@ -392,6 +392,7 @@ struct btree_trans {
        bool                    restarted:1;
        bool                    memory_allocation_failure:1;
        bool                    journal_transaction_names:1;
+       bool                    is_initial_gc:1;
        /*
         * For when bch2_trans_update notices we'll be splitting a compressed
         * extent:
@@ -424,7 +425,31 @@ struct btree_trans {
        struct replicas_delta_list *fs_usage_deltas;
 };
 
-#define BTREE_FLAG(flag)                                               \
+#define BTREE_FLAGS()                                                  \
+       x(read_in_flight)                                               \
+       x(read_error)                                                   \
+       x(dirty)                                                        \
+       x(need_write)                                                   \
+       x(write_blocked)                                                \
+       x(will_make_reachable)                                          \
+       x(noevict)                                                      \
+       x(write_idx)                                                    \
+       x(accessed)                                                     \
+       x(write_in_flight)                                              \
+       x(write_in_flight_inner)                                        \
+       x(just_written)                                                 \
+       x(dying)                                                        \
+       x(fake)                                                         \
+       x(need_rewrite)                                                 \
+       x(never_write)
+
+enum btree_flags {
+#define x(flag)        BTREE_NODE_##flag,
+       BTREE_FLAGS()
+#undef x
+};
+
+#define x(flag)                                                                \
 static inline bool btree_node_ ## flag(struct btree *b)                        \
 {      return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
                                                                        \
@@ -434,36 +459,8 @@ static inline void set_btree_node_ ## flag(struct btree *b)                \
 static inline void clear_btree_node_ ## flag(struct btree *b)          \
 {      clear_bit(BTREE_NODE_ ## flag, &b->flags); }
 
-enum btree_flags {
-       BTREE_NODE_read_in_flight,
-       BTREE_NODE_read_error,
-       BTREE_NODE_dirty,
-       BTREE_NODE_need_write,
-       BTREE_NODE_noevict,
-       BTREE_NODE_write_idx,
-       BTREE_NODE_accessed,
-       BTREE_NODE_write_in_flight,
-       BTREE_NODE_write_in_flight_inner,
-       BTREE_NODE_just_written,
-       BTREE_NODE_dying,
-       BTREE_NODE_fake,
-       BTREE_NODE_need_rewrite,
-       BTREE_NODE_never_write,
-};
-
-BTREE_FLAG(read_in_flight);
-BTREE_FLAG(read_error);
-BTREE_FLAG(need_write);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(write_in_flight_inner);
-BTREE_FLAG(just_written);
-BTREE_FLAG(dying);
-BTREE_FLAG(fake);
-BTREE_FLAG(need_rewrite);
-BTREE_FLAG(never_write);
+BTREE_FLAGS()
+#undef x
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
index ba76a86ac10d30a5b71afdcfbdf29f09b0d71bbb..63832fb9a4072080d048ecb4764af4787b77f0d6 100644 (file)
@@ -271,7 +271,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        six_lock_write(&b->c.lock, NULL, NULL);
 
        set_btree_node_accessed(b);
-       set_btree_node_dirty(c, b);
+       set_btree_node_dirty_acct(c, b);
        set_btree_node_need_write(b);
 
        bch2_bset_init_first(b, &b->data->keys);
@@ -619,6 +619,8 @@ err:
                mutex_lock(&c->btree_interior_update_lock);
 
                list_del(&as->write_blocked_list);
+               if (list_empty(&b->write_blocked))
+                       clear_btree_node_write_blocked(b);
 
                /*
                 * Node might have been freed, recheck under
@@ -663,6 +665,7 @@ err:
 
                BUG_ON(b->will_make_reachable != (unsigned long) as);
                b->will_make_reachable = 0;
+               clear_btree_node_will_make_reachable(b);
        }
        mutex_unlock(&c->btree_interior_update_lock);
 
@@ -729,6 +732,8 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
 
        as->mode        = BTREE_INTERIOR_UPDATING_NODE;
        as->b           = b;
+
+       set_btree_node_write_blocked(b);
        list_add(&as->write_blocked_list, &b->write_blocked);
 
        mutex_unlock(&c->btree_interior_update_lock);
@@ -794,6 +799,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree
 
        as->new_nodes[as->nr_new_nodes++] = b;
        b->will_make_reachable = 1UL|(unsigned long) as;
+       set_btree_node_will_make_reachable(b);
 
        mutex_unlock(&c->btree_interior_update_lock);
 
@@ -816,6 +822,7 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
         * xchg() is for synchronization with bch2_btree_complete_write:
         */
        v = xchg(&b->will_make_reachable, 0);
+       clear_btree_node_will_make_reachable(b);
        as = (struct btree_update *) (v & ~1UL);
 
        if (!as) {
@@ -881,7 +888,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                closure_wake_up(&c->btree_interior_update_wait);
        }
 
-       clear_btree_node_dirty(c, b);
+       clear_btree_node_dirty_acct(c, b);
        clear_btree_node_need_write(b);
 
        /*
@@ -1096,8 +1103,7 @@ static void bch2_btree_set_root(struct btree_update *as,
        struct btree *old;
 
        trace_btree_set_root(c, b);
-       BUG_ON(!b->written &&
-              !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
+       BUG_ON(!b->written);
 
        old = btree_node_root(c, b);
 
@@ -1165,7 +1171,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
                bch2_btree_node_iter_advance(node_iter, b);
 
        bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
-       set_btree_node_dirty(c, b);
+       set_btree_node_dirty_acct(c, b);
        set_btree_node_need_write(b);
 }
 
@@ -1386,8 +1392,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
                six_unlock_write(&n2->c.lock);
                six_unlock_write(&n1->c.lock);
 
-               bch2_btree_node_write(c, n1, SIX_LOCK_intent);
-               bch2_btree_node_write(c, n2, SIX_LOCK_intent);
+               bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
+               bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
 
                /*
                 * Note that on recursive parent_keys == keys, so we
@@ -1406,7 +1412,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
 
                        btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
 
-                       bch2_btree_node_write(c, n3, SIX_LOCK_intent);
+                       bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
                }
        } else {
                trace_btree_compact(c, b);
@@ -1414,7 +1420,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
                bch2_btree_build_aux_trees(n1);
                six_unlock_write(&n1->c.lock);
 
-               bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+               bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
 
                if (parent)
                        bch2_keylist_add(&as->parent_keys, &n1->key);
@@ -1702,7 +1708,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
        bch2_btree_build_aux_trees(n);
        six_unlock_write(&n->c.lock);
 
-       bch2_btree_node_write(c, n, SIX_LOCK_intent);
+       bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
 
        bkey_init(&delete.k);
        delete.k.p = prev->key.k.p;
@@ -1776,7 +1782,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 
        trace_btree_gc_rewrite_node(c, b);
 
-       bch2_btree_node_write(c, n, SIX_LOCK_intent);
+       bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
 
        if (parent) {
                bch2_keylist_add(&as->parent_keys, &n->key);
index 334df6382817c8a41db8fbd2a13501b08e61d8f2..19cb6e1e9722a3ab2679cb0a02126cbce800dbaf 100644 (file)
@@ -167,10 +167,24 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct btree_write *w = container_of(pin, struct btree_write, journal);
        struct btree *b = container_of(w, struct btree, writes[i]);
+       unsigned long old, new, v;
+       unsigned idx = w - b->writes;
 
        six_lock_read(&b->c.lock, NULL, NULL);
-       bch2_btree_node_write_cond(c, b,
-               (btree_current_write(b) == w && w->journal.seq == seq));
+       v = READ_ONCE(b->flags);
+
+       do {
+               old = new = v;
+
+               if (!(old & (1 << BTREE_NODE_dirty)) ||
+                   !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+                   w->journal.seq != seq)
+                       break;
+
+               new |= 1 << BTREE_NODE_need_write;
+       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+       btree_node_write_if_need(c, b, SIX_LOCK_read);
        six_unlock_read(&b->c.lock);
        return 0;
 }
@@ -220,7 +234,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
        bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
 
        if (unlikely(!btree_node_dirty(b)))
-               set_btree_node_dirty(c, b);
+               set_btree_node_dirty_acct(c, b);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
        u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -367,7 +381,13 @@ btree_key_can_insert_cached(struct btree_trans *trans,
 
        ck->u64s        = new_u64s;
        ck->k           = new_k;
-       return BTREE_INSERT_OK;
+       /*
+        * Keys returned by peek() are no longer valid pointers, so we need a
+        * transaction restart:
+        */
+       trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_,
+                                            path->btree_id, &path->pos);
+       return btree_trans_restart(trans);
 }
 
 static inline void do_btree_insert_one(struct btree_trans *trans,
index ee22ed31ce37673078354233c90c507190b38aa4..2d65ae370931cb3ac5fe0641a7dc5ab67d405155 100644 (file)
@@ -185,9 +185,10 @@ out:
 /* XXX: bch_fs refcounting */
 
 struct dump_iter {
-       struct bpos             from;
-       struct bch_fs   *c;
+       struct bch_fs           *c;
        enum btree_id           id;
+       struct bpos             from;
+       u64                     iter;
 
        struct printbuf         buf;
 
@@ -226,6 +227,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file)
 
        file->private_data = i;
        i->from = POS_MIN;
+       i->iter = 0;
        i->c    = container_of(bd, struct bch_fs, btree_debug[bd->id]);
        i->id   = bd->id;
        i->buf  = PRINTBUF;
@@ -420,10 +422,148 @@ static const struct file_operations bfloat_failed_debug_ops = {
        .read           = bch2_read_bfloat_failed,
 };
 
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+                                          struct btree *b)
+{
+       out->tabstops[0] = 32;
+
+       pr_buf(out, "%px btree=%s l=%u ",
+              b,
+              bch2_btree_ids[b->c.btree_id],
+              b->c.level);
+       pr_newline(out);
+
+       pr_indent_push(out, 2);
+
+       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+       pr_newline(out);
+
+       pr_buf(out, "flags: ");
+       pr_tab(out);
+       bch2_flags_to_text(out, bch2_btree_node_flags, b->flags);
+       pr_newline(out);
+
+       pr_buf(out, "written:");
+       pr_tab(out);
+       pr_buf(out, "%u", b->written);
+       pr_newline(out);
+
+       pr_buf(out, "writes blocked:");
+       pr_tab(out);
+       pr_buf(out, "%u", !list_empty_careful(&b->write_blocked));
+       pr_newline(out);
+
+       pr_buf(out, "will make reachable:");
+       pr_tab(out);
+       pr_buf(out, "%lx", b->will_make_reachable);
+       pr_newline(out);
+
+       pr_buf(out, "journal pin %px:", &b->writes[0].journal);
+       pr_tab(out);
+       pr_buf(out, "%llu", b->writes[0].journal.seq);
+       pr_newline(out);
+
+       pr_buf(out, "journal pin %px:", &b->writes[1].journal);
+       pr_tab(out);
+       pr_buf(out, "%llu", b->writes[1].journal.seq);
+       pr_newline(out);
+
+       pr_indent_pop(out, 2);
+}
+
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+                                           size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct bch_fs *c = i->c;
+       bool done = false;
+       int err;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       do {
+               struct bucket_table *tbl;
+               struct rhash_head *pos;
+               struct btree *b;
+
+               err = flush_buf(i);
+               if (err)
+                       return err;
+
+               if (!i->size)
+                       break;
+
+               rcu_read_lock();
+               i->buf.atomic++;
+               tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+                                         &c->btree_cache.table);
+               if (i->iter < tbl->size) {
+                       rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+                               bch2_cached_btree_node_to_text(&i->buf, c, b);
+                       i->iter++;
+               } else {
+                       done = true;
+               }
+               --i->buf.atomic;
+               rcu_read_unlock();
+       } while (!done);
+
+       if (i->buf.allocation_failure)
+               return -ENOMEM;
+
+       return i->ret;
+}
+
+static const struct file_operations cached_btree_nodes_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_cached_btree_nodes_read,
+};
+
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+                                     size_t size, loff_t *ppos)
+{
+       struct dump_iter *i = file->private_data;
+       struct bch_fs *c = i->c;
+       bool done = false;
+       int err;
+
+       i->ubuf = buf;
+       i->size = size;
+       i->ret  = 0;
+
+       do {
+               err = flush_buf(i);
+               if (err)
+                       return err;
+
+               if (!i->size)
+                       break;
+
+               done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
+               i->iter++;
+       } while (!done);
+
+       if (i->buf.allocation_failure)
+               return -ENOMEM;
+
+       return i->ret;
+}
+
+static const struct file_operations journal_pins_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch2_dump_open,
+       .release        = bch2_dump_release,
+       .read           = bch2_journal_pins_read,
+};
+
 void bch2_fs_debug_exit(struct bch_fs *c)
 {
-       if (!IS_ERR_OR_NULL(c->debug))
-               debugfs_remove_recursive(c->debug);
+       if (!IS_ERR_OR_NULL(c->fs_debug_dir))
+               debugfs_remove_recursive(c->fs_debug_dir);
 }
 
 void bch2_fs_debug_init(struct bch_fs *c)
@@ -435,29 +575,39 @@ void bch2_fs_debug_init(struct bch_fs *c)
                return;
 
        snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
-       c->debug = debugfs_create_dir(name, bch_debug);
-       if (IS_ERR_OR_NULL(c->debug))
+       c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+       if (IS_ERR_OR_NULL(c->fs_debug_dir))
+               return;
+
+       debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+                           c->btree_debug, &cached_btree_nodes_ops);
+
+       debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+                           c->btree_debug, &journal_pins_ops);
+
+       c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+       if (IS_ERR_OR_NULL(c->btree_debug_dir))
                return;
 
        for (bd = c->btree_debug;
             bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
             bd++) {
                bd->id = bd - c->btree_debug;
-               bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
-                                               0400, c->debug, bd,
-                                               &btree_debug_ops);
+               debugfs_create_file(bch2_btree_ids[bd->id],
+                                   0400, c->btree_debug_dir, bd,
+                                   &btree_debug_ops);
 
                snprintf(name, sizeof(name), "%s-formats",
                         bch2_btree_ids[bd->id]);
 
-               bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
-                                                      &btree_format_debug_ops);
+               debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+                                   &btree_format_debug_ops);
 
                snprintf(name, sizeof(name), "%s-bfloat-failed",
                         bch2_btree_ids[bd->id]);
 
-               bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
-                                                &bfloat_failed_debug_ops);
+               debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+                                   &bfloat_failed_debug_ops);
        }
 }
 
index 3bea9986908c8ffe9a753669597b008d9e2014de..cf97594b7c6fc1f7edc0bc5a662cab1a76503590 100644 (file)
@@ -2041,7 +2041,14 @@ retry_pick:
 
        ca = bch_dev_bkey_exists(c, pick.ptr.dev);
 
-       if (!pick.ptr.cached &&
+       /*
+        * Stale dirty pointers are treated as IO errors, but @failed isn't
+        * allocated unless we're in the retry path - so if we're not in the
+        * retry path, don't check here, it'll be caught in bch2_read_endio()
+        * and we'll end up in the retry path:
+        */
+       if ((flags & BCH_READ_IN_RETRY) &&
+           !pick.ptr.cached &&
            unlikely(ptr_stale(ca, &pick.ptr))) {
                read_from_stale_dirty_pointer(trans, k, pick.ptr);
                bch2_mark_io_failure(failed, &pick);
index ffaf589564509e4074f129ae0df6bef4036ff381..9cd1e11ad1b53be581e4f9396b1a83913764138f 100644 (file)
@@ -1281,35 +1281,59 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        spin_unlock(&j->lock);
 }
 
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *pin;
-       u64 i;
 
        spin_lock(&j->lock);
+       *seq = max(*seq, j->pin.front);
+
+       if (*seq >= j->pin.back) {
+               spin_unlock(&j->lock);
+               return true;
+       }
+
        out->atomic++;
 
-       fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
-               pr_buf(out, "%llu: count %u\n",
-                      i, atomic_read(&pin_list->count));
+       pin_list = journal_seq_pin(j, *seq);
 
-               list_for_each_entry(pin, &pin_list->key_cache_list, list)
-                       pr_buf(out, "\t%px %ps\n",
-                              pin, pin->flush);
+       pr_buf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
+       pr_newline(out);
+       pr_indent_push(out, 2);
 
-               list_for_each_entry(pin, &pin_list->list, list)
-                       pr_buf(out, "\t%px %ps\n",
-                              pin, pin->flush);
+       list_for_each_entry(pin, &pin_list->list, list) {
+               pr_buf(out, "\t%px %ps", pin, pin->flush);
+               pr_newline(out);
+       }
+
+       list_for_each_entry(pin, &pin_list->key_cache_list, list) {
+               pr_buf(out, "\t%px %ps", pin, pin->flush);
+               pr_newline(out);
+       }
 
-               if (!list_empty(&pin_list->flushed))
-                       pr_buf(out, "flushed:\n");
+       if (!list_empty(&pin_list->flushed)) {
+               pr_buf(out, "flushed:");
+               pr_newline(out);
+       }
 
-               list_for_each_entry(pin, &pin_list->flushed, list)
-                       pr_buf(out, "\t%px %ps\n",
-                              pin, pin->flush);
+       list_for_each_entry(pin, &pin_list->flushed, list) {
+               pr_buf(out, "\t%px %ps", pin, pin->flush);
+               pr_newline(out);
        }
 
+       pr_indent_pop(out, 2);
+
        --out->atomic;
        spin_unlock(&j->lock);
+
+       return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+       u64 seq = 0;
+
+       while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+               seq++;
 }
index 296981740cc365d33734cc951c72ee979a48ebfc..0a3fb8a061c27c5f80e62389e01f36bc6497b21b 100644 (file)
@@ -501,6 +501,7 @@ void bch2_journal_block(struct journal *);
 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
 
 int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
                                unsigned nr);
index 08966f4004fbfd3d39ea2abdd1ecc0afd2849c93..8580b6fd580adb3dda32ef4bb07501d3dce32532 100644 (file)
@@ -1420,24 +1420,25 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
 };
 
 static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
-                                 struct printbuf *orig_err)
+                                 struct printbuf *err)
 {
        unsigned type = le32_to_cpu(f->type);
-       struct printbuf err = *orig_err;
+       struct printbuf field_err = PRINTBUF;
        int ret;
 
        if (type >= BCH_SB_FIELD_NR)
                return 0;
 
-       pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]);
-
-       ret = bch2_sb_field_ops[type]->validate(sb, f, &err);
+       ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err);
        if (ret) {
-               pr_newline(&err);
-               bch2_sb_field_to_text(&err, sb, f);
-               *orig_err = err;
+               pr_buf(err, "Invalid superblock section %s: %s",
+                      bch2_sb_fields[type],
+                      field_err.buf);
+               pr_newline(err);
+               bch2_sb_field_to_text(err, sb, f);
        }
 
+       printbuf_exit(&field_err);
        return ret;
 }
 
index ce32b9068518d7cde1133924389ffefe1493e393..3018250d421b8794e839d9cff4d570adc61ffc8e 100644 (file)
@@ -174,9 +174,7 @@ read_attribute(reserve_stats);
 read_attribute(btree_cache_size);
 read_attribute(compression_stats);
 read_attribute(journal_debug);
-read_attribute(journal_pins);
 read_attribute(btree_updates);
-read_attribute(dirty_btree_nodes);
 read_attribute(btree_cache);
 read_attribute(btree_key_cache);
 read_attribute(btree_transactions);
@@ -402,15 +400,9 @@ SHOW(bch2_fs)
        if (attr == &sysfs_journal_debug)
                bch2_journal_debug_to_text(out, &c->journal);
 
-       if (attr == &sysfs_journal_pins)
-               bch2_journal_pins_to_text(out, &c->journal);
-
        if (attr == &sysfs_btree_updates)
                bch2_btree_updates_to_text(out, c);
 
-       if (attr == &sysfs_dirty_btree_nodes)
-               bch2_dirty_btree_nodes_to_text(out, c);
-
        if (attr == &sysfs_btree_cache)
                bch2_btree_cache_to_text(out, c);
 
@@ -564,9 +556,7 @@ SYSFS_OPS(bch2_fs_internal);
 
 struct attribute *bch2_fs_internal_files[] = {
        &sysfs_journal_debug,
-       &sysfs_journal_pins,
        &sysfs_btree_updates,
-       &sysfs_dirty_btree_nodes,
        &sysfs_btree_cache,
        &sysfs_btree_key_cache,
        &sysfs_btree_transactions,
index 25ae98cc5a6db3e362c77028ad31fcc782b3b601..4095df2fcded750170317f47926ef6bd65de3327 100644 (file)
@@ -300,6 +300,10 @@ static inline void pr_indent_push(struct printbuf *buf, unsigned spaces)
 
 static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces)
 {
+       if (buf->last_newline + buf->indent == buf->pos) {
+               buf->pos -= spaces;
+               buf->buf[buf->pos] = 0;
+       }
        buf->indent -= spaces;
 }