Update bcachefs sources to 021e62a098 bcachefs: Fix error in filesystem initialization
author     Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 30 Nov 2020 04:55:51 +0000 (23:55 -0500)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 30 Nov 2020 05:06:46 +0000 (00:06 -0500)
34 files changed:
.bcachefs_revision
include/linux/sched/mm.h
include/linux/slab.h
include/trace/events/bcachefs.h
libbcachefs/alloc_background.c
libbcachefs/bcachefs.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/chardev.c
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_types.h
libbcachefs/movinggc.c
libbcachefs/rebalance.c
libbcachefs/recovery.c
libbcachefs/super.c
libbcachefs/sysfs.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index ec3e6587dca9fb63f704d9a0361f5e71d228fcbf..6ba1c9af6484e66345dc5b54826080a4ca8be948 100644
@@ -1 +1 @@
-b1107114caf6aa6f725170f3d75b072badcfa573
+021e62a098d9fa7e558ae935180e2fb16bb50a3a
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 347105c6e0caa83b7637459bd73de71bfe5bfb47..03feda7ab1defe0d03c9dce745df7a52f311d3c7 100644
@@ -1,7 +1,8 @@
 #ifndef _LINUX_SCHED_MM_H
 #define _LINUX_SCHED_MM_H
 
-#define PF_MEMALLOC_NOFS 0
+#define PF_MEMALLOC            0x00000800      /* Allocating memory */
+#define PF_MEMALLOC_NOFS       0x00040000      /* All allocation requests will inherit GFP_NOFS */
 
 static inline unsigned int memalloc_nofs_save(void)
 {
@@ -15,4 +16,16 @@ static inline void memalloc_nofs_restore(unsigned int flags)
        current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
 }
 
+static inline unsigned int memalloc_noreclaim_save(void)
+{
+       unsigned int flags = current->flags & PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
+       return flags;
+}
+
+static inline void memalloc_noreclaim_restore(unsigned int flags)
+{
+       current->flags = (current->flags & ~PF_MEMALLOC) | flags;
+}
+
 #endif /* _LINUX_SCHED_MM_H */
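Usage sketch (not part of the patch): the new helpers follow the kernel's
save/restore convention, and __bch2_journal_reclaim() in
libbcachefs/journal_reclaim.c below uses them in exactly this way, so that
allocations made while reclaiming the journal cannot recurse back into
memory reclaim:

	unsigned flags = memalloc_noreclaim_save();	/* sets PF_MEMALLOC */
	/* ... allocation-heavy reclaim work here ... */
	memalloc_noreclaim_restore(flags);		/* restores previous PF_MEMALLOC state */
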
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ff342b6508f682de5053c1450225128803a5d2aa..b8a1235b105a064a0fa71968e01120620ef10492 100644
@@ -132,4 +132,35 @@ static inline void *kmemdup(const void *src, size_t len, gfp_t gfp)
        return p;
 }
 
+struct kmem_cache {
+       size_t              obj_size;
+};
+
+static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp)
+{
+       return kmalloc(c->obj_size, gfp);
+}
+
+static inline void kmem_cache_free(struct kmem_cache *c, void *p)
+{
+       kfree(p);
+}
+
+static inline void kmem_cache_destroy(struct kmem_cache *p)
+{
+       kfree(p);
+}
+
+static inline struct kmem_cache *kmem_cache_create(size_t obj_size)
+{
+       struct kmem_cache *p = kmalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return NULL;
+
+       p->obj_size = obj_size;
+       return p;
+}
+
+#define KMEM_CACHE(_struct, _flags)    kmem_cache_create(sizeof(struct _struct))
+
 #endif /* __TOOLS_LINUX_SLAB_H */
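Usage sketch (illustrative only, with a hypothetical struct): the shim backs
every cache with plain kmalloc()/kfree(), which is all the new bch2_key_cache
slab in libbcachefs/btree_key_cache.c needs to compile in userspace:

	struct foo { int a, b; };				/* hypothetical struct */
	struct kmem_cache *cache = KMEM_CACHE(foo, 0);		/* kmem_cache_create(sizeof(struct foo)) */
	struct foo *p = kmem_cache_alloc(cache, GFP_KERNEL);	/* backed by kmalloc(obj_size) */
	kmem_cache_free(cache, p);				/* backed by kfree() */
	kmem_cache_destroy(cache);				/* frees the struct kmem_cache itself */
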
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index a8b8c5b677ccc80febae23df7665d2ccfc00623e..d4cb7a298cc2481a73c3de4c1c5ddd08f117d42e 100644
@@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write,
        TP_ARGS(bio)
 );
 
+TRACE_EVENT(journal_reclaim_start,
+       TP_PROTO(struct bch_fs *c, u64 min_nr,
+                u64 prereserved, u64 prereserved_total,
+                u64 btree_cache_dirty, u64 btree_cache_total,
+                u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+       TP_ARGS(c, min_nr, prereserved, prereserved_total,
+               btree_cache_dirty, btree_cache_total,
+               btree_key_cache_dirty, btree_key_cache_total),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16              )
+               __field(u64,            min_nr                  )
+               __field(u64,            prereserved             )
+               __field(u64,            prereserved_total       )
+               __field(u64,            btree_cache_dirty       )
+               __field(u64,            btree_cache_total       )
+               __field(u64,            btree_key_cache_dirty   )
+               __field(u64,            btree_key_cache_total   )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->min_nr                 = min_nr;
+               __entry->prereserved            = prereserved;
+               __entry->prereserved_total      = prereserved_total;
+               __entry->btree_cache_dirty      = btree_cache_dirty;
+               __entry->btree_cache_total      = btree_cache_total;
+               __entry->btree_key_cache_dirty  = btree_key_cache_dirty;
+               __entry->btree_key_cache_total  = btree_key_cache_total;
+       ),
+
+       TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+                 __entry->uuid,
+                 __entry->min_nr,
+                 __entry->prereserved,
+                 __entry->prereserved_total,
+                 __entry->btree_cache_dirty,
+                 __entry->btree_cache_total,
+                 __entry->btree_key_cache_dirty,
+                 __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+       TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+       TP_ARGS(c, nr_flushed),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16 )
+               __field(u64,            nr_flushed )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->nr_flushed = nr_flushed;
+       ),
+
+       TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+);
+
 /* bset.c: */
 
 DEFINE_EVENT(bpos, bkey_pack_pos_fail,
@@ -622,6 +681,11 @@ DEFINE_EVENT(transaction_restart,  trans_restart_journal_preres_get,
        TP_ARGS(ip)
 );
 
+DEFINE_EVENT(transaction_restart,      trans_restart_journal_reclaim,
+       TP_PROTO(unsigned long ip),
+       TP_ARGS(ip)
+);
+
 DEFINE_EVENT(transaction_restart,      trans_restart_mark_replicas,
        TP_PROTO(unsigned long ip),
        TP_ARGS(ip)
@@ -657,11 +721,6 @@ DEFINE_EVENT(transaction_restart,  trans_restart_traverse,
        TP_ARGS(ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_atomic,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
 DECLARE_EVENT_CLASS(node_lock_fail,
        TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
        TP_ARGS(level, iter_seq, node, node_seq),
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 97508de9f7214204f8f6297b0d0d6d6be901732e..2dd8a37f29e78e15e9edeaa9e8b2e3327fd03bf4 100644
@@ -1456,7 +1456,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
                return 0;
 
        p = kthread_create(bch2_allocator_thread, ca,
-                          "bch_alloc[%s]", ca->name);
+                          "bch-alloc/%s", ca->name);
        if (IS_ERR(p))
                return PTR_ERR(p);
 
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b20895a42d99e797d28ebf5e7882cf1d9ae0a77e..6d54defcee58b5e9c341de2b4568e52aa6bbfe7a 100644
@@ -650,7 +650,6 @@ struct bch_fs {
        struct workqueue_struct *wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
-       struct workqueue_struct *journal_reclaim_wq;
 
        /* ALLOCATION */
        struct delayed_work     pd_controllers_update;
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 5bceff48078e27b8428945bab573105c722be485..09774f56f11c38c9aa36c22518567cc1b909baba 100644
@@ -1064,3 +1064,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
               stats.floats,
               stats.failed);
 }
+
+void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used);
+       pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty));
+}
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 8a19e60e9258014816b6d5a8c4ac54e00587a7f2..e766ef552ce7470f4fb83fbfa915bd044963fafa 100644
@@ -100,5 +100,6 @@ static inline unsigned btree_blocks(struct bch_fs *c)
 
 void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
                             struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
 
 #endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index ba4acc112ed34ef22e6ad7526381966026b277fa..ac81c9b9a06abba73f5411536600fdbe0df7f865 100644
@@ -1427,7 +1427,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
        BUG_ON(c->gc_thread);
 
-       p = kthread_create(bch2_gc_thread, c, "bch_gc");
+       p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
        if (IS_ERR(p))
                return PTR_ERR(p);
 
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index d605ff181d2e4c9caf514ae6b000bc145955e167..a21dc485c677556a5072fc8ef9599bb5b8dbfe18 100644
@@ -12,6 +12,8 @@
 #include <linux/sched/mm.h>
 #include <trace/events/bcachefs.h>
 
+static struct kmem_cache *bch2_key_cache;
+
 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                       const void *obj)
 {
@@ -76,10 +78,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
 {
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 
+       BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
        ck->btree_trans_barrier_seq =
                start_poll_synchronize_srcu(&c->btree_trans_barrier);
 
-       list_move(&ck->list, &bc->freed);
+       list_move_tail(&ck->list, &bc->freed);
+       bc->nr_freed++;
 
        kfree(ck->k);
        ck->k           = NULL;
@@ -94,9 +99,20 @@ bkey_cached_alloc(struct btree_key_cache *c)
 {
        struct bkey_cached *ck;
 
-       list_for_each_entry(ck, &c->freed, list)
-               if (bkey_cached_lock_for_evict(ck))
+       list_for_each_entry_reverse(ck, &c->freed, list)
+               if (bkey_cached_lock_for_evict(ck)) {
+                       c->nr_freed--;
                        return ck;
+               }
+
+       ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
+       if (likely(ck)) {
+               INIT_LIST_HEAD(&ck->list);
+               six_lock_init(&ck->c.lock);
+               BUG_ON(!six_trylock_intent(&ck->c.lock));
+               BUG_ON(!six_trylock_write(&ck->c.lock));
+               return ck;
+       }
 
        list_for_each_entry(ck, &c->clean, list)
                if (bkey_cached_lock_for_evict(ck)) {
@@ -104,16 +120,7 @@ bkey_cached_alloc(struct btree_key_cache *c)
                        return ck;
                }
 
-       ck = kzalloc(sizeof(*ck), GFP_NOFS);
-       if (!ck)
-               return NULL;
-
-       INIT_LIST_HEAD(&ck->list);
-       six_lock_init(&ck->c.lock);
-       BUG_ON(!six_trylock_intent(&ck->c.lock));
-       BUG_ON(!six_trylock_write(&ck->c.lock));
-
-       return ck;
+       return NULL;
 }
 
 static struct bkey_cached *
@@ -132,8 +139,7 @@ btree_key_cache_create(struct btree_key_cache *c,
        ck->key.btree_id        = btree_id;
        ck->key.pos             = pos;
        ck->valid               = false;
-
-       BUG_ON(ck->flags);
+       ck->flags               = 1U << BKEY_CACHED_ACCESSED;
 
        if (rhashtable_lookup_insert_fast(&c->table,
                                          &ck->hash,
@@ -290,6 +296,9 @@ fill:
                        goto err;
        }
 
+       if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+               set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
        iter->uptodate = BTREE_ITER_NEED_PEEK;
        bch2_btree_iter_downgrade(iter);
        return ret;
@@ -451,6 +460,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck = (void *) iter->l[0].b;
+       bool kick_reclaim = false;
 
        BUG_ON(insert->u64s > ck->u64s);
 
@@ -475,11 +485,18 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 
                set_bit(BKEY_CACHED_DIRTY, &ck->flags);
                c->btree_key_cache.nr_dirty++;
+
+               if (bch2_nr_btree_keys_need_flush(c))
+                       kick_reclaim = true;
+
                mutex_unlock(&c->btree_key_cache.lock);
        }
 
        bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
                                &ck->journal, btree_key_cache_journal_flush);
+
+       if (kick_reclaim)
+               journal_reclaim_kick(&c->journal);
        return true;
 }
 
@@ -509,28 +526,34 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 
        flags = memalloc_nofs_save();
 
+       /*
+        * Newest freed entries are at the end of the list - once we hit one
+        * that's too new to be freed, we can bail out:
+        */
        list_for_each_entry_safe(ck, t, &bc->freed, list) {
-               scanned++;
+               if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+                                                ck->btree_trans_barrier_seq))
+                       break;
 
-               if (poll_state_synchronize_srcu(&c->btree_trans_barrier,
-                                               ck->btree_trans_barrier_seq)) {
-                       list_del(&ck->list);
-                       kfree(ck);
-                       freed++;
-               }
-
-               if (scanned >= nr)
-                       goto out;
+               list_del(&ck->list);
+               kmem_cache_free(bch2_key_cache, ck);
+               bc->nr_freed--;
+               scanned++;
+               freed++;
        }
 
-       list_for_each_entry_safe(ck, t, &bc->clean, list) {
-               scanned++;
+       if (scanned >= nr)
+               goto out;
 
-               if (bkey_cached_lock_for_evict(ck)) {
+       list_for_each_entry_safe(ck, t, &bc->clean, list) {
+               if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+                       clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+               else if (bkey_cached_lock_for_evict(ck)) {
                        bkey_cached_evict(bc, ck);
                        bkey_cached_free(bc, ck);
                }
 
+               scanned++;
                if (scanned >= nr) {
                        if (&t->list != &bc->clean)
                                list_move_tail(&bc->clean, &t->list);
@@ -570,18 +593,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
                bch2_journal_preres_put(&c->journal, &ck->res);
 
                kfree(ck->k);
-               kfree(ck);
+               list_del(&ck->list);
+               kmem_cache_free(bch2_key_cache, ck);
                bc->nr_keys--;
        }
 
        BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
        BUG_ON(bc->nr_keys);
 
-       list_for_each_entry_safe(ck, n, &bc->freed, list)
-               kfree(ck);
+       list_for_each_entry_safe(ck, n, &bc->freed, list) {
+               list_del(&ck->list);
+               kmem_cache_free(bch2_key_cache, ck);
+       }
        mutex_unlock(&bc->lock);
 
-       rhashtable_destroy(&bc->table);
+       if (bc->table_init_done)
+               rhashtable_destroy(&bc->table);
 }
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
@@ -594,33 +621,42 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
 {
+       int ret;
+
+       c->shrink.seeks                 = 1;
        c->shrink.count_objects         = bch2_btree_key_cache_count;
        c->shrink.scan_objects          = bch2_btree_key_cache_scan;
 
-       return  register_shrinker(&c->shrink) ?:
-               rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+       ret = register_shrinker(&c->shrink);
+       if (ret)
+               return ret;
+
+       ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+       if (ret)
+               return ret;
+
+       c->table_init_done = true;
+       return 0;
 }
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
-       struct bucket_table *tbl;
-       struct bkey_cached *ck;
-       struct rhash_head *pos;
-       size_t i;
+       pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
+       pr_buf(out, "nr_keys:\t%zu\n",  c->nr_keys);
+       pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty);
+}
 
-       mutex_lock(&c->lock);
-       tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+void bch2_btree_key_cache_exit(void)
+{
+       if (bch2_key_cache)
+               kmem_cache_destroy(bch2_key_cache);
+}
 
-       for (i = 0; i < tbl->size; i++) {
-               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
-                       pr_buf(out, "%s:",
-                              bch2_btree_ids[ck->key.btree_id]);
-                       bch2_bpos_to_text(out, ck->key.pos);
+int __init bch2_btree_key_cache_init(void)
+{
+       bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
+       if (!bch2_key_cache)
+               return -ENOMEM;
 
-                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
-                               pr_buf(out, " journal seq %llu", ck->journal.seq);
-                       pr_buf(out, "\n");
-               }
-       }
-       mutex_unlock(&c->lock);
+       return 0;
 }
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index d448264abcc89db5382ff6fe99f539c28d6a4326..d7d31a0662c366dafb431d8706c916f17098d51a 100644
@@ -1,6 +1,24 @@
 #ifndef _BCACHEFS_BTREE_KEY_CACHE_H
 #define _BCACHEFS_BTREE_KEY_CACHE_H
 
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+       size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
+       size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+       size_t max_dirty = 4096 + nr_keys / 2;
+
+       return max_t(ssize_t, 0, nr_dirty - max_dirty);
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+       size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
+       size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+       size_t max_dirty = 4096 + (nr_keys * 3) / 4;
+
+       return nr_dirty > max_dirty;
+}
+
 struct bkey_cached *
 bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
 
@@ -25,4 +43,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
 
 void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
 
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
 #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
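Worked example with illustrative numbers: with nr_keys = 100000,
bch2_nr_btree_keys_need_flush() goes positive once nr_dirty exceeds
4096 + 100000/2 = 54096, while bch2_btree_key_cache_must_wait() only starts
throttling committers once nr_dirty exceeds 4096 + (100000 * 3)/4 = 79096;
the gap between the two thresholds gives background reclaim room to catch up
before writers are made to block.
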
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 6013c9164f69edd4718af59febc576cfb85e4b99..cf59f12247413aca9c3a4d788d77d2110bf420c0 100644
@@ -293,11 +293,13 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
 struct btree_key_cache {
        struct mutex            lock;
        struct rhashtable       table;
+       bool                    table_init_done;
        struct list_head        freed;
        struct list_head        clean;
        struct list_head        dirty;
        struct shrinker         shrink;
 
+       size_t                  nr_freed;
        size_t                  nr_keys;
        size_t                  nr_dirty;
 };
@@ -307,7 +309,8 @@ struct bkey_cached_key {
        struct bpos             pos;
 } __attribute__((packed, aligned(4)));
 
-#define BKEY_CACHED_DIRTY              0
+#define BKEY_CACHED_ACCESSED           0
+#define BKEY_CACHED_DIRTY              1
 
 struct bkey_cached {
        struct btree_bkey_cached_common c;
@@ -647,6 +650,7 @@ enum btree_insert_ret {
        BTREE_INSERT_ENOSPC,
        BTREE_INSERT_NEED_MARK_REPLICAS,
        BTREE_INSERT_NEED_JOURNAL_RES,
+       BTREE_INSERT_NEED_JOURNAL_RECLAIM,
 };
 
 enum btree_gc_coalesce_fail_reason {
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index e0b1bde37484d990528aa9f78237d5b76bf2d440..adb07043cbb3d7f232711c08bc721e5e6c57bc7a 100644
@@ -67,8 +67,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
                     struct disk_reservation *, u64 *, int flags);
 
-int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *,
-                              struct bpos, u64 *);
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+                                 struct bpos, struct bpos, u64 *);
 int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
                            struct bpos, struct bpos, u64 *);
 
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index d4f3dd7addcf4434163b4c06a9b6dec0d22c3c10..5143896e1b29e1f07e71d106789ad69587ba887b 100644
@@ -49,12 +49,27 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
                        break;
                bp = bkey_s_c_to_btree_ptr_v2(k);
 
-               BUG_ON(bkey_cmp(next_node, bp.v->min_key));
+               if (bkey_cmp(next_node, bp.v->min_key)) {
+                       bch2_dump_btree_node(c, b);
+                       panic("expected next min_key %llu:%llu got %llu:%llu\n",
+                             next_node.inode,
+                             next_node.offset,
+                             bp.v->min_key.inode,
+                             bp.v->min_key.offset);
+               }
 
                bch2_btree_node_iter_advance(&iter, b);
 
                if (bch2_btree_node_iter_end(&iter)) {
-                       BUG_ON(bkey_cmp(k.k->p, b->key.k.p));
+
+                       if (bkey_cmp(k.k->p, b->key.k.p)) {
+                               bch2_dump_btree_node(c, b);
+                               panic("expected end %llu:%llu got %llu:%llu\n",
+                                     b->key.k.p.inode,
+                                     b->key.k.p.offset,
+                                     k.k->p.inode,
+                                     k.k->p.offset);
+                       }
                        break;
                }
 
@@ -1026,7 +1041,8 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
        struct bkey_packed *k;
        const char *invalid;
 
-       invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b));
+       invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
+               bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
        if (invalid) {
                char buf[160];
 
@@ -1368,9 +1384,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
        BUG_ON(!as || as->b);
        bch2_verify_keylist_sorted(keys);
 
-       if (as->must_rewrite)
-               goto split;
-
        bch2_btree_node_lock_for_insert(c, b, iter);
 
        if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
@@ -1378,6 +1391,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
                goto split;
        }
 
+       btree_node_interior_verify(c, b);
+
        bch2_btree_insert_keys_interior(as, b, iter, keys);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 41854fc345d2ba21fb830b424aaa97d94e209bad..45d212730fd781e22176839bb7cc2943e3a08a6a 100644
@@ -47,7 +47,6 @@ struct btree_update {
                BTREE_INTERIOR_UPDATING_AS,
        } mode;
 
-       unsigned                        must_rewrite:1;
        unsigned                        nodes_written:1;
 
        enum btree_id                   btree_id;
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index a2ca31e75a7e28d51dc292257606f2b6e8c8a796..bbc6d51242751ebcf0615c9f522544a19157b4a0 100644
@@ -286,6 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans,
 
        BUG_ON(iter->level);
 
+       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+           bch2_btree_key_cache_must_wait(trans->c))
+               return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
+
        if (u64s <= ck->u64s)
                return BTREE_INSERT_OK;
 
@@ -642,20 +646,24 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                trace_trans_restart_journal_res_get(trans->ip);
                ret = -EINTR;
                break;
-       default:
-               BUG_ON(ret >= 0);
-               break;
-       }
-
-       if (ret == -EINTR) {
-               int ret2 = bch2_btree_iter_traverse_all(trans);
+       case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
+               bch2_trans_unlock(trans);
 
-               if (ret2) {
-                       trace_trans_restart_traverse(trans->ip);
-                       return ret2;
+               while (bch2_btree_key_cache_must_wait(c)) {
+                       mutex_lock(&c->journal.reclaim_lock);
+                       bch2_journal_reclaim(&c->journal);
+                       mutex_unlock(&c->journal.reclaim_lock);
                }
 
-               trace_trans_restart_atomic(trans->ip);
+               if (bch2_trans_relock(trans))
+                       return 0;
+
+               trace_trans_restart_journal_reclaim(trans->ip);
+               ret = -EINTR;
+               break;
+       default:
+               BUG_ON(ret >= 0);
+               break;
        }
 
        return ret;
@@ -1076,13 +1084,32 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
                             __bch2_btree_insert(&trans, id, k));
 }
 
-int bch2_btree_delete_at_range(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct bpos end,
-                              u64 *journal_seq)
+int bch2_btree_delete_at(struct btree_trans *trans,
+                        struct btree_iter *iter, unsigned flags)
+{
+       struct bkey_i k;
+
+       bkey_init(&k.k);
+       k.k.p = iter->pos;
+
+       bch2_trans_update(trans, iter, &k, 0);
+       return bch2_trans_commit(trans, NULL, NULL,
+                                BTREE_INSERT_NOFAIL|
+                                BTREE_INSERT_USE_RESERVE|flags);
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+                                 struct bpos start, struct bpos end,
+                                 u64 *journal_seq)
 {
+       struct btree_iter *iter;
        struct bkey_s_c k;
        int ret = 0;
+
+       iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
+       ret = PTR_ERR_OR_ZERO(iter);
+       if (ret)
+               return ret;
 retry:
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
@@ -1093,6 +1120,10 @@ retry:
 
                bkey_init(&delete.k);
 
+               /*
+                * This could probably be more efficient for extents:
+                */
+
                /*
                 * For extents, iter.pos won't necessarily be the same as
                 * bkey_start_pos(k.k) (for non extents they always will be the
@@ -1132,22 +1163,8 @@ retry:
                goto retry;
        }
 
+       bch2_trans_iter_put(trans, iter);
        return ret;
-
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
-                        struct btree_iter *iter, unsigned flags)
-{
-       struct bkey_i k;
-
-       bkey_init(&k.k);
-       k.k.p = iter->pos;
-
-       bch2_trans_update(trans, iter, &k, 0);
-       return bch2_trans_commit(trans, NULL, NULL,
-                                BTREE_INSERT_NOFAIL|
-                                BTREE_INSERT_USE_RESERVE|flags);
 }
 
 /*
@@ -1159,21 +1176,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                            struct bpos start, struct bpos end,
                            u64 *journal_seq)
 {
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       int ret = 0;
-
-       /*
-        * XXX: whether we need mem/more iters depends on whether this btree id
-        * has triggers
-        */
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
-
-       iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);
-
-       ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq);
-       ret = bch2_trans_exit(&trans) ?: ret;
-
-       BUG_ON(ret == -EINTR);
-       return ret;
+       return bch2_trans_do(c, NULL, journal_seq, 0,
+                            bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
 }
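For reference, a sketch of the pattern bch2_trans_do() wraps (details elided;
the real macro also threads the disk reservation and commit flags through to
bch2_trans_commit()):

	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	do {
		ret = bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq);
	} while (ret == -EINTR);		/* transaction restarts are retried */
	ret = bch2_trans_exit(&trans) ?: ret;
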
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index be65f2e78a62a7b37821d1c94cfeeb885563d86e..f7bdb14372f81bdbeb549fe6942bb6d0df7bea19 100644
@@ -2044,16 +2044,6 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c)
        return avail_factor(__bch2_fs_usage_read_short(c).free);
 }
 
-void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
-{
-       percpu_down_read(&c->mark_lock);
-       this_cpu_sub(c->usage[0]->online_reserved,
-                    res->sectors);
-       percpu_up_read(&c->mark_lock);
-
-       res->sectors = 0;
-}
-
 #define SECTORS_CACHE  1024
 
 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index a3873becbb70111b173b6c42e369e2bc5012027f..856dc5a8c8a3f19a5f197456c8b1bcf0a9a6b52a 100644
@@ -272,13 +272,11 @@ void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
 
 /* disk reservations: */
 
-void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
-
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
                                             struct disk_reservation *res)
 {
-       if (res->sectors)
-               __bch2_disk_reservation_put(c, res);
+       this_cpu_sub(c->usage[0]->online_reserved, res->sectors);
+       res->sectors = 0;
 }
 
 #define BCH_DISK_RESERVATION_NOFAIL            (1 << 0)
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 4663784d2f28bd1bb1d953ac4f3c7d74760d5a42..e7c8969aaad176c0c2ac33b3fe02edb859081854 100644
@@ -341,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c,
        ctx->c = c;
        ctx->arg = arg;
 
-       ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+       ctx->thread = kthread_create(bch2_data_thread, ctx,
+                                    "bch-data/%s", c->name);
        if (IS_ERR(ctx->thread)) {
                ret = PTR_ERR(ctx->thread);
                goto err;
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 389f23ee6f918fc720bbb7e0401ffeaabb828e71..7d193ce4780ef78aab3fdebcf1ecfaa6967d67dd 100644
@@ -684,7 +684,7 @@ static int readpages_iter_init(struct readpages_iter *iter,
        if (!iter->pages)
                return -ENOMEM;
 
-       __readahead_batch(ractl, iter->pages, nr_pages);
+       nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
        for (i = 0; i < nr_pages; i++) {
                __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
                put_page(iter->pages[i]);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 6e3d4bea81b885bde9e478bb3d2559cb9c057494..f3f6fe6c776a11a03020b64fe571f913d9f5004c 100644
@@ -1252,7 +1252,7 @@ static void bch2_evict_inode(struct inode *vinode)
                                KEY_TYPE_QUOTA_WARN);
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
-               bch2_inode_rm(c, inode->v.i_ino);
+               bch2_inode_rm(c, inode->v.i_ino, true);
        }
 }
 
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 0c503527084656ad62f189134e904176c0e8e2d6..09ce6c29b88cf91e85a57d6c9abce4b7bf56fa5d 100644
@@ -1254,7 +1254,7 @@ static int check_inode(struct btree_trans *trans,
 
                bch2_fs_lazy_rw(c);
 
-               ret = bch2_inode_rm(c, u.bi_inum);
+               ret = bch2_inode_rm(c, u.bi_inum, false);
                if (ret)
                        bch_err(c, "error in fsck: error %i while deleting inode", ret);
                return ret;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 823a1ddec5aca57983b123fc13df6be2ba940aaa..82099e5a48d8f0f98009cd327a67b1f802f4f289 100644
@@ -542,7 +542,7 @@ found_slot:
        return ret;
 }
 
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -553,6 +553,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
        u64 bi_generation;
        int ret;
 
+       bch2_trans_init(&trans, c, 0, 0);
+
        /*
         * If this was a directory, there shouldn't be any real dirents left -
         * but there could be whiteouts (from hash collisions) that we should
@@ -561,30 +563,34 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
         * XXX: the dirent code ideally would delete whiteouts when they're no
         * longer needed
         */
-       ret   = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
-                                       start, end, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_XATTRS,
-                                       start, end, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
-                                       start, end, NULL);
+       ret   = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS,
+                                             start, end, NULL) ?:
+               bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS,
+                                             start, end, NULL) ?:
+               bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS,
+                                             start, end, NULL);
        if (ret)
-               return ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
+               goto err;
 retry:
        bch2_trans_begin(&trans);
 
        bi_generation = 0;
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
-                                  BTREE_ITER_CACHED|BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_cached(iter);
+       if (cached) {
+               iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
+                                          BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek_cached(iter);
+       } else {
+               iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
+                                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek_slot(iter);
+       }
 
        ret = bkey_err(k);
        if (ret)
                goto err;
 
-       bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
+       bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c,
                                "inode %llu not found when deleting",
                                inode_nr);
 
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index ef7e885dce0c87e671248f30f9debbaaf13b2dc9..dbdfcf63d07992fc2e0a5a070fd593f3c0e3b404 100644
@@ -71,7 +71,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
 
 int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
 
-int bch2_inode_rm(struct bch_fs *, u64);
+int bch2_inode_rm(struct bch_fs *, u64, bool);
 
 int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
                                  struct bch_inode_unpacked *);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 1b3f249bc3e4211019996ac38538b02fcbaa6534..5874a9ff2204fc0244fa108825ad1a19d3e8d788 100644
@@ -226,16 +226,19 @@ static bool journal_entry_close(struct journal *j)
  */
 static int journal_entry_open(struct journal *j)
 {
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *buf = journal_cur_buf(j);
        union journal_res_state old, new;
        int u64s;
        u64 v;
 
+       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
        lockdep_assert_held(&j->lock);
        BUG_ON(journal_entry_is_open(j));
 
        if (j->blocked)
-               return -EAGAIN;
+               return cur_entry_blocked;
 
        if (j->cur_entry_error)
                return j->cur_entry_error;
@@ -251,7 +254,7 @@ static int journal_entry_open(struct journal *j)
        u64s  = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
 
        if (u64s <= le32_to_cpu(buf->data->u64s))
-               return -ENOSPC;
+               return cur_entry_journal_full;
 
        /*
         * Must be set before marking the journal entry as open:
@@ -263,7 +266,7 @@ static int journal_entry_open(struct journal *j)
                old.v = new.v = v;
 
                if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
-                       return -EROFS;
+                       return cur_entry_insufficient_devices;
 
                /* Handle any already added entries */
                new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
@@ -376,7 +379,7 @@ retry:
                 * Don't want to close current journal entry, just need to
                 * invoke reclaim:
                 */
-               ret = -ENOSPC;
+               ret = cur_entry_journal_full;
                goto unlock;
        }
 
@@ -399,14 +402,16 @@ retry:
                 * there's still a previous one in flight:
                 */
                trace_journal_entry_full(c);
-               ret = -EAGAIN;
+               ret = cur_entry_blocked;
        } else {
                ret = journal_entry_open(j);
        }
 unlock:
-       if ((ret == -EAGAIN || ret == -ENOSPC) &&
-           !j->res_get_blocked_start)
+       if ((ret && ret != cur_entry_insufficient_devices) &&
+           !j->res_get_blocked_start) {
                j->res_get_blocked_start = local_clock() ?: 1;
+               trace_journal_full(c);
+       }
 
        can_discard = j->can_discard;
        spin_unlock(&j->lock);
@@ -414,41 +419,39 @@ unlock:
        if (!ret)
                goto retry;
 
-       if (ret == -ENOSPC) {
-               if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
-                             "JOURNAL_RES_GET_RESERVED set but journal full")) {
-                       char *buf;
-
-                       buf = kmalloc(4096, GFP_NOFS);
-                       if (buf) {
-                               bch2_journal_debug_to_text(&PBUF(buf), j);
-                               pr_err("\n%s", buf);
-                               kfree(buf);
-                       }
+       if (WARN_ONCE(ret == cur_entry_journal_full &&
+                     !can_discard &&
+                     (flags & JOURNAL_RES_GET_RESERVED),
+                     "JOURNAL_RES_GET_RESERVED set but journal full")) {
+               char *buf;
+
+               buf = kmalloc(4096, GFP_NOFS);
+               if (buf) {
+                       bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
+                       pr_err("\n%s", buf);
+                       kfree(buf);
                }
+       }
 
-               /*
-                * Journal is full - can't rely on reclaim from work item due to
-                * freezing:
-                */
-               trace_journal_full(c);
-
-               if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
-                       if (can_discard) {
-                               bch2_journal_do_discards(j);
-                               goto retry;
-                       }
-
-                       if (mutex_trylock(&j->reclaim_lock)) {
-                               bch2_journal_reclaim(j);
-                               mutex_unlock(&j->reclaim_lock);
-                       }
+       /*
+        * Journal is full - can't rely on reclaim from work item due to
+        * freezing:
+        */
+       if ((ret == cur_entry_journal_full ||
+            ret == cur_entry_journal_pin_full) &&
+           !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+               if (can_discard) {
+                       bch2_journal_do_discards(j);
+                       goto retry;
                }
 
-               ret = -EAGAIN;
+               if (mutex_trylock(&j->reclaim_lock)) {
+                       bch2_journal_reclaim(j);
+                       mutex_unlock(&j->reclaim_lock);
+               }
        }
 
-       return ret;
+       return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
 }
 
 /*
@@ -481,8 +484,10 @@ static bool journal_preres_available(struct journal *j,
 {
        bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags);
 
-       if (!ret)
-               bch2_journal_reclaim_work(&j->reclaim_work.work);
+       if (!ret && mutex_trylock(&j->reclaim_lock)) {
+               bch2_journal_reclaim(j);
+               mutex_unlock(&j->reclaim_lock);
+       }
 
        return ret;
 }
@@ -543,12 +548,20 @@ out:
  * necessary
  */
 int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
-                                 struct closure *parent)
+                                struct closure *parent)
 {
        struct journal_buf *buf;
        int ret = 0;
 
+       if (seq <= j->err_seq)
+               return -EIO;
+
+       if (seq <= j->seq_ondisk)
+               return 1;
+
        spin_lock(&j->lock);
+
+       /* Recheck under lock: */
        if (seq <= j->err_seq) {
                ret = -EIO;
                goto out;
@@ -678,16 +691,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        if (nr <= ja->nr)
                return 0;
 
-       ret = -ENOMEM;
        new_buckets     = kzalloc(nr * sizeof(u64), GFP_KERNEL);
        new_bucket_seq  = kzalloc(nr * sizeof(u64), GFP_KERNEL);
-       if (!new_buckets || !new_bucket_seq)
+       if (!new_buckets || !new_bucket_seq) {
+               ret = -ENOMEM;
                goto err;
+       }
 
        journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
                                        nr + sizeof(*journal_buckets) / sizeof(u64));
-       if (!journal_buckets)
+       if (!journal_buckets) {
+               ret = -ENOSPC;
                goto err;
+       }
 
        /*
         * We may be called from the device add path, before the new device has
@@ -716,8 +732,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                goto err;
                        }
                } else {
+                       rcu_read_lock();
                        ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
                                               false, cl);
+                       rcu_read_unlock();
                        if (IS_ERR(ob)) {
                                ret = cl ? -EAGAIN : -ENOSPC;
                                goto err;
@@ -769,8 +787,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                if (!new_fs)
                        bch2_open_bucket_put(c, ob);
        }
-
-       ret = 0;
 err:
        bch2_sb_resize_journal(&ca->disk_sb,
                ja->nr + sizeof(*journal_buckets) / sizeof(u64));
@@ -889,7 +905,7 @@ void bch2_fs_journal_stop(struct journal *j)
                j->last_empty_seq + 1 != journal_cur_seq(j)));
 
        cancel_delayed_work_sync(&j->write_work);
-       cancel_delayed_work_sync(&j->reclaim_work);
+       bch2_journal_reclaim_stop(j);
 }
 
 int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
@@ -1017,7 +1033,6 @@ int bch2_fs_journal_init(struct journal *j)
        spin_lock_init(&j->err_lock);
        init_waitqueue_head(&j->wait);
        INIT_DELAYED_WORK(&j->write_work, journal_write_work);
-       INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
        init_waitqueue_head(&j->pin_flush_wait);
        mutex_init(&j->reclaim_lock);
        mutex_init(&j->discard_lock);
@@ -1069,7 +1084,10 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               "last_seq:\t\t%llu\n"
               "last_seq_ondisk:\t%llu\n"
               "prereserved:\t\t%u/%u\n"
+              "nr direct reclaim:\t%llu\n"
+              "nr background reclaim:\t%llu\n"
               "current entry sectors:\t%u\n"
+              "current entry error:\t%u\n"
               "current entry:\t\t",
               fifo_used(&j->pin),
               journal_cur_seq(j),
@@ -1077,7 +1095,10 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               j->last_seq_ondisk,
               j->prereserved.reserved,
               j->prereserved.remaining,
-              j->cur_entry_sectors);
+              j->nr_direct_reclaim,
+              j->nr_background_reclaim,
+              j->cur_entry_sectors,
+              j->cur_entry_error);
 
        switch (s.cur_entry_offset) {
        case JOURNAL_ENTRY_ERROR_VAL:
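Caller-side sketch for the new early returns in bch2_journal_flush_seq_async()
above (per the diff: 1 = seq already on disk, 0 = flush issued with the
closure pending, -EIO = seq is past err_seq; the helper names here are
hypothetical):

	int ret = bch2_journal_flush_seq_async(j, seq, &cl);

	if (ret == 1)
		on_already_flushed();		/* hypothetical: nothing to wait for */
	else if (ret < 0)
		on_journal_error(ret);		/* hypothetical: journal hit an error */
	/* ret == 0: the closure cl is signalled when the write completes */
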
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 7c157bc50268ceaec3972396bcbcd7a8b524878f..d1367cf067d3036b02072771ef3e2402ed8d532d 100644
@@ -994,7 +994,7 @@ static void journal_write_done(struct closure *cl)
         * Must come before signaling write completion, for
         * bch2_fs_journal_stop():
         */
-       mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
+       journal_reclaim_kick(&c->journal);
 
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);
@@ -1045,6 +1045,8 @@ void bch2_journal_write(struct closure *cl)
        unsigned i, sectors, bytes, u64s;
        int ret;
 
+       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
        bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
 
        journal_buf_realloc(j, w);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 7a04d06bb3426e2d53dfac40c481017b95920b13..66f5dcce8889771bb113fa8c93863406a1528576 100644
@@ -1,12 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "btree_key_cache.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "replicas.h"
 #include "super.h"
 
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+#include <trace/events/bcachefs.h>
+
 /* Free space calculations: */
 
 static unsigned journal_space_from(struct journal_device *ja,
@@ -164,12 +169,12 @@ void bch2_journal_space_available(struct journal *j)
        j->can_discard = can_discard;
 
        if (nr_online < c->opts.metadata_replicas_required) {
-               ret = -EROFS;
+               ret = cur_entry_insufficient_devices;
                goto out;
        }
 
        if (!fifo_free(&j->pin)) {
-               ret = -ENOSPC;
+               ret = cur_entry_journal_pin_full;
                goto out;
        }
 
@@ -180,7 +185,7 @@ void bch2_journal_space_available(struct journal *j)
        clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);
 
        if (!discarded.next_entry)
-               ret = -ENOSPC;
+               ret = cur_entry_journal_full;
 
        overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
                journal_entry_overhead(j);
@@ -432,7 +437,6 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
-               j->last_flushed = jiffies;
        }
 
        spin_unlock(&j->lock);
@@ -441,17 +445,24 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
 }
 
 /* returns true if we did work */
-static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
-                              unsigned min_nr)
+static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
+                             unsigned min_nr)
 {
        struct journal_entry_pin *pin;
-       bool ret = false;
-       u64 seq;
+       u64 seq, ret = 0;
 
        lockdep_assert_held(&j->reclaim_lock);
 
-       while ((pin = journal_get_next_pin(j, min_nr
-                               ? U64_MAX : seq_to_flush, &seq))) {
+       while (1) {
+               cond_resched();
+
+               j->last_flushed = jiffies;
+
+               pin = journal_get_next_pin(j, min_nr
+                               ? U64_MAX : seq_to_flush, &seq);
+               if (!pin)
+                       break;
+
                if (min_nr)
                        min_nr--;
 
@@ -460,7 +471,7 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
-               ret = true;
+               ret++;
        }
 
        return ret;
@@ -524,15 +535,27 @@ static u64 journal_seq_to_flush(struct journal *j)
  * 512 journal entries or 25% of all journal buckets, then
  * journal_next_bucket() should not stall.
  */
-void bch2_journal_reclaim(struct journal *j)
+static void __bch2_journal_reclaim(struct journal *j, bool direct)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       unsigned min_nr = 0;
-       u64 seq_to_flush = 0;
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
+       u64 seq_to_flush, nr_flushed = 0;
+       size_t min_nr;
+       unsigned flags;
 
+       /*
+        * We can't invoke memory reclaim while holding the reclaim_lock -
+        * journal reclaim is required to make progress for memory reclaim
+        * (cleaning the caches), so we can't get stuck in memory reclaim while
+        * we're holding the reclaim lock:
+        */
        lockdep_assert_held(&j->reclaim_lock);
+       flags = memalloc_noreclaim_save();
 
        do {
+               if (kthread && kthread_should_stop())
+                       break;
+
                bch2_journal_do_discards(j);
 
                seq_to_flush = journal_seq_to_flush(j);
@@ -549,26 +572,103 @@ void bch2_journal_reclaim(struct journal *j)
                if (j->prereserved.reserved * 2 > j->prereserved.remaining)
                        min_nr = 1;
 
-               if ((atomic_read(&c->btree_cache.dirty) * 4 >
-                    c->btree_cache.used  * 3) ||
-                   (c->btree_key_cache.nr_dirty * 4 >
-                    c->btree_key_cache.nr_keys))
+               if (atomic_read(&c->btree_cache.dirty) * 4 >
+                   c->btree_cache.used  * 3)
                        min_nr = 1;
-       } while (journal_flush_pins(j, seq_to_flush, min_nr));
 
-       if (!bch2_journal_error(j))
-               queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
-                                  msecs_to_jiffies(j->reclaim_delay_ms));
+               min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c));
+
+               trace_journal_reclaim_start(c,
+                               min_nr,
+                               j->prereserved.reserved,
+                               j->prereserved.remaining,
+                               atomic_read(&c->btree_cache.dirty),
+                               c->btree_cache.used,
+                               c->btree_key_cache.nr_dirty,
+                               c->btree_key_cache.nr_keys);
+
+               nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);
+
+               if (direct)
+                       j->nr_direct_reclaim += nr_flushed;
+               else
+                       j->nr_background_reclaim += nr_flushed;
+               trace_journal_reclaim_finish(c, nr_flushed);
+       } while (min_nr);
+
+       memalloc_noreclaim_restore(flags);
 }
 
-void bch2_journal_reclaim_work(struct work_struct *work)
+void bch2_journal_reclaim(struct journal *j)
 {
-       struct journal *j = container_of(to_delayed_work(work),
-                               struct journal, reclaim_work);
+       __bch2_journal_reclaim(j, true);
+}
 
-       mutex_lock(&j->reclaim_lock);
-       bch2_journal_reclaim(j);
-       mutex_unlock(&j->reclaim_lock);
+static int bch2_journal_reclaim_thread(void *arg)
+{
+       struct journal *j = arg;
+       unsigned long next;
+
+       set_freezable();
+
+       kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
+
+       while (!kthread_should_stop()) {
+               j->reclaim_kicked = false;
+
+               mutex_lock(&j->reclaim_lock);
+               __bch2_journal_reclaim(j, false);
+               mutex_unlock(&j->reclaim_lock);
+
+               next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+
+               while (1) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       if (kthread_should_stop())
+                               break;
+                       if (j->reclaim_kicked)
+                               break;
+                       if (time_after_eq(jiffies, next))
+                               break;
+                       schedule_timeout(next - jiffies);
+                       try_to_freeze();
+
+               }
+               __set_current_state(TASK_RUNNING);
+       }
+
+       return 0;
+}
+
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+       struct task_struct *p = j->reclaim_thread;
+
+       j->reclaim_thread = NULL;
+
+       if (p) {
+               kthread_stop(p);
+               put_task_struct(p);
+       }
+}
+
+int bch2_journal_reclaim_start(struct journal *j)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct task_struct *p;
+
+       if (j->reclaim_thread)
+               return 0;
+
+       p = kthread_create(bch2_journal_reclaim_thread, j,
+                          "bch-reclaim/%s", c->name);
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       get_task_struct(p);
+       j->reclaim_thread = p;
+       wake_up_process(p);
+       return 0;
 }
 
 static int journal_flush_done(struct journal *j, u64 seq_to_flush,
@@ -582,7 +682,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
        mutex_lock(&j->reclaim_lock);
 
-       *did_work = journal_flush_pins(j, seq_to_flush, 0);
+       *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0;
 
        spin_lock(&j->lock);
        /*
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index 8128907a7623cb223718c55ed2e63c380b0ce796..bae2c9210db8612ffb0cd47e731d5143c2b7a1e0 100644
@@ -10,6 +10,17 @@ enum journal_space_from {
        journal_space_clean,
 };
 
+static inline void journal_reclaim_kick(struct journal *j)
+{
+       struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+       if (p && !j->reclaim_kicked) {
+               j->reclaim_kicked = true;
+               if (p)
+                       wake_up_process(p);
+       }
+}
+
 unsigned bch2_journal_dev_buckets_available(struct journal *,
                                            struct journal_device *,
                                            enum journal_space_from);
@@ -55,7 +66,9 @@ void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
 
 void bch2_journal_do_discards(struct journal *);
 void bch2_journal_reclaim(struct journal *);
-void bch2_journal_reclaim_work(struct work_struct *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
 
 bool bch2_journal_flush_pins(struct journal *, u64);
 
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 9757e3d55991ed8b29b2ca7cb9c745326ef4c6cc..4640bb8687cc18414eb0ba9017d78a9949602f81 100644
@@ -146,7 +146,13 @@ struct journal {
         * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
         * insufficient devices:
         */
-       int                     cur_entry_error;
+       enum {
+               cur_entry_ok,
+               cur_entry_blocked,
+               cur_entry_journal_full,
+               cur_entry_journal_pin_full,
+               cur_entry_insufficient_devices,
+       }                       cur_entry_error;
 
        union journal_preres_state prereserved;
 
@@ -210,8 +216,12 @@ struct journal {
        struct write_point      wp;
        spinlock_t              err_lock;
 
-       struct delayed_work     reclaim_work;
        struct mutex            reclaim_lock;
+       struct task_struct      *reclaim_thread;
+       bool                    reclaim_kicked;
+       u64                     nr_direct_reclaim;
+       u64                     nr_background_reclaim;
+
        unsigned long           last_flushed;
        struct journal_entry_pin *flush_in_progress;
        wait_queue_head_t       pin_flush_wait;
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index ddfda1ef8a799a369006fb53f3836aabbaa059e3..4834f41f48edd4d5ba54a43e573155bcc4bed379 100644
@@ -345,7 +345,7 @@ int bch2_copygc_start(struct bch_fs *c)
        if (bch2_fs_init_fault("copygc_start"))
                return -ENOMEM;
 
-       t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
+       t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
        if (IS_ERR(t))
                return PTR_ERR(t);
 
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 44d2651be9700590177844f29e14b0938efc8235..c3373c48fa8136c611213529adaf2fd6d606bf83 100644
@@ -314,7 +314,7 @@ int bch2_rebalance_start(struct bch_fs *c)
        if (c->opts.nochanges)
                return 0;
 
-       p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+       p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
        if (IS_ERR(p))
                return PTR_ERR(p);
 
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 6750063663b5f888e39b77f3b514ce3ef12b0975..0b3521c9cc19ef3ba285805a05de94bcc8bfd717 100644
@@ -616,6 +616,7 @@ static int bch2_journal_replay(struct bch_fs *c,
         */
        set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
        set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
+       journal_reclaim_kick(j);
 
        j->replay_journal_seq = seq;
 
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 8673e9744ce18d27ccf3e41d83d6d81fcb1ebee6..e3bbd0b0d6989deba07e110ad6bf746fe57301a5 100644
@@ -49,7 +49,6 @@
 #include <linux/device.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
-#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/random.h>
@@ -259,7 +258,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
 void bch2_fs_read_only(struct bch_fs *c)
 {
        if (!test_bit(BCH_FS_RW, &c->flags)) {
-               cancel_delayed_work_sync(&c->journal.reclaim_work);
+               BUG_ON(c->journal.reclaim_thread);
                return;
        }
 
@@ -417,6 +416,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 
        set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
 
+       ret = bch2_journal_reclaim_start(&c->journal);
+       if (ret) {
+               bch_err(c, "error starting journal reclaim: %i", ret);
+               return ret;
+       }
+
        if (!early) {
                ret = bch2_fs_read_write_late(c);
                if (ret)
@@ -425,9 +430,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 
        percpu_ref_reinit(&c->writes);
        set_bit(BCH_FS_RW, &c->flags);
-
-       queue_delayed_work(c->journal_reclaim_wq,
-                          &c->journal.reclaim_work, 0);
        return 0;
 err:
        __bch2_fs_read_only(c);
@@ -495,8 +497,6 @@ static void __bch2_fs_free(struct bch_fs *c)
        kfree(c->unused_inode_hints);
        free_heap(&c->copygc_heap);
 
-       if (c->journal_reclaim_wq)
-               destroy_workqueue(c->journal_reclaim_wq);
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
        if (c->wq)
@@ -750,8 +750,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
-           !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim",
-                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
            percpu_ref_init(&c->writes, bch2_writes_disabled,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
            mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
@@ -2018,6 +2016,7 @@ static void bcachefs_exit(void)
        bch2_debug_exit();
        bch2_vfs_exit();
        bch2_chardev_exit();
+       bch2_btree_key_cache_exit();
        if (bcachefs_kset)
                kset_unregister(bcachefs_kset);
 }
@@ -2027,6 +2026,7 @@ static int __init bcachefs_init(void)
        bch2_bkey_pack_test();
 
        if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+           bch2_btree_key_cache_init() ||
            bch2_chardev_init() ||
            bch2_vfs_init() ||
            bch2_debug_init())
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 58c00e26ebe8a62fb5aff3a40354336826db1bc9..900eda88a5dc00e581b8d058b3998d0207266821 100644
@@ -165,6 +165,7 @@ read_attribute(journal_debug);
 read_attribute(journal_pins);
 read_attribute(btree_updates);
 read_attribute(dirty_btree_nodes);
+read_attribute(btree_cache);
 read_attribute(btree_key_cache);
 read_attribute(btree_transactions);
 read_attribute(stripes_heap);
@@ -374,6 +375,11 @@ SHOW(bch2_fs)
                return out.pos - buf;
        }
 
+       if (attr == &sysfs_btree_cache) {
+               bch2_btree_cache_to_text(&out, c);
+               return out.pos - buf;
+       }
+
        if (attr == &sysfs_btree_key_cache) {
                bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
                return out.pos - buf;
@@ -550,6 +556,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_journal_pins,
        &sysfs_btree_updates,
        &sysfs_dirty_btree_nodes,
+       &sysfs_btree_cache,
        &sysfs_btree_key_cache,
        &sysfs_btree_transactions,
        &sysfs_stripes_heap,
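
With the attribute wired up, the counters printed by bch2_btree_cache_to_text()
become readable from userspace; assuming the usual bcachefs sysfs layout, that
is /sys/fs/bcachefs/<uuid>/internal/btree_cache, which reports the
"nr nodes" / "nr dirty" pair formatted above.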