git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to d1fd471830 bcachefs: Add more debug checks
author Kent Overstreet <kent.overstreet@gmail.com>
Fri, 13 Nov 2020 19:41:06 +0000 (14:41 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Mon, 16 Nov 2020 23:23:54 +0000 (18:23 -0500)
30 files changed:
.bcachefs_revision
include/linux/bitops.h
include/linux/kernel.h
include/linux/srcu.h [new file with mode: 0644]
include/linux/types.h
include/trace/events/bcachefs.h
libbcachefs/bcachefs.h
libbcachefs/bkey_methods.c
libbcachefs/bset.c
libbcachefs/btree_cache.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_key_cache.c
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/inode.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_types.h
libbcachefs/recovery.c
libbcachefs/sysfs.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index dc58304780a4458127051429c47f5053788c3d7a..9c20ba85dcace21114f7c3dbcab1676997b7dc7b 100644
@@ -1 +1 @@
-1d669389f79de8571732c13fdf4d23039e2308fd
+d1fd47183051729471bce1c9f84fa63cb84dc557
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index f2183d5430ba49affa395b20b0a6acf3c444b2c4..2fe736e95b86cc333b50019e049c0490891ddfbb 100644
@@ -85,6 +85,17 @@ static inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
        return (old & mask) != 0;
 }
 
+static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+       unsigned long old;
+
+       old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);
+
+       return (old & mask) != 0;
+}
+
 static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
        unsigned long mask = BIT_MASK(nr);
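
The new test_and_clear_bit() mirrors test_and_set_bit() above: __atomic_fetch_and() with the inverted mask atomically clears the bit and returns the prior word, so the caller learns whether it performed the 1->0 transition. That return value is what makes the dirty-counter helpers added in btree_io.h below idempotent. A standalone sketch of the same pattern, compilable with GCC or clang (BIT_MASK()/BIT_WORD() restated with their usual definitions so the example is self-contained; note the kernel's test_and_clear_bit() implies a full memory barrier, while this userspace shim is relaxed):

    #include <stdbool.h>
    #include <stdio.h>

    #define BITS_PER_LONG  (8 * sizeof(unsigned long))
    #define BIT_MASK(nr)   (1UL << ((nr) % BITS_PER_LONG))
    #define BIT_WORD(nr)   ((nr) / BITS_PER_LONG)

    static bool test_and_clear_bit(long nr, volatile unsigned long *addr)
    {
            unsigned long mask = BIT_MASK(nr);
            unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
            unsigned long old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);

            return (old & mask) != 0;
    }

    int main(void)
    {
            unsigned long bitmap[2] = { 0, BIT_MASK(7) }; /* bit 71 set on 64-bit */

            printf("%d\n", test_and_clear_bit(71, bitmap)); /* 1: was set, now cleared */
            printf("%d\n", test_and_clear_bit(71, bitmap)); /* 0: second call is a no-op */
            return 0;
    }
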
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 10d94c5eca5f453303ae368f78f9f7622ffaa490..4b45306d0ba1b517bc492c91031e04633664687a 100644
@@ -219,4 +219,6 @@ struct qstr {
 
 #define POISON_FREE 0x6b
 
+static inline void dump_stack(void) {}
+
 #endif
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
new file mode 100644
index 0000000..75823cf
--- /dev/null
@@ -0,0 +1,31 @@
+#ifndef __TOOLS_LINUX_SRCU_H
+#define __TOOLS_LINUX_SRCU_H
+
+struct srcu_struct {
+};
+
+static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) {}
+
+static inline int srcu_read_lock(struct srcu_struct *ssp)
+{
+       return 0;
+}
+
+static inline bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+       return false;
+}
+
+static inline unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+       return 0;
+}
+
+static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {}
+
+static inline int init_srcu_struct(struct srcu_struct *ssp)
+{
+       return 0;
+}
+
+#endif /* __TOOLS_LINUX_SRCU_H */
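
These stubs exist so the shared code below compiles in userspace, where there are no concurrent readers to wait out: the read lock is free, and poll_state_synchronize_srcu() never reports an elapsed grace period, which only means deferred frees stay queued until teardown. A sketch of the retire/reap pattern the btree key cache builds on this cookie API (struct defer_free and both helpers are hypothetical, shown only to illustrate the protocol):

    #include <linux/srcu.h>         /* the stub above; real SRCU in the kernel */
    #include <stdlib.h>

    struct defer_free {
            struct defer_free       *next;
            unsigned long           seq;    /* grace-period cookie */
    };

    /* Retire an object: stamp it with the current grace-period cookie. */
    static void retire(struct srcu_struct *ssp, struct defer_free **list,
                       struct defer_free *d)
    {
            d->seq  = start_poll_synchronize_srcu(ssp);
            d->next = *list;
            *list   = d;
    }

    /* Free only the entries whose grace period has elapsed; keep the rest. */
    static void reap(struct srcu_struct *ssp, struct defer_free **list)
    {
            struct defer_free **p = list;

            while (*p) {
                    if (poll_state_synchronize_srcu(ssp, (*p)->seq)) {
                            struct defer_free *d = *p;

                            *p = d->next;
                            free(d);
                    } else {
                            p = &(*p)->next;
                    }
            }
    }
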
diff --git a/include/linux/types.h b/include/linux/types.h
index 387c38314f0535b41cbba148d85929427086dcfe..1e1255508cd96a680b7a2f642b271662481aecf5 100644
@@ -31,6 +31,7 @@ typedef unsigned gfp_t;
 #define __GFP_IO       0
 #define __GFP_NOWARN   0
 #define __GFP_NORETRY  0
+#define __GFP_NOFAIL   0
 #define __GFP_ZERO     1
 
 #define PAGE_ALLOC_COSTLY_ORDER        6
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index ba2c55559796d1d3d4ee5edab07023766a1018dd..a8b8c5b677ccc80febae23df7665d2ccfc00623e 100644
@@ -513,7 +513,7 @@ TRACE_EVENT(transaction_restart_ip,
                __entry->ip     = ip;
        ),
 
-       TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip)
+       TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
 );
 
 DECLARE_EVENT_CLASS(transaction_restart,
@@ -528,7 +528,7 @@ DECLARE_EVENT_CLASS(transaction_restart,
                __entry->ip = ip;
        ),
 
-       TP_printk("%pf", (void *) __entry->ip)
+       TP_printk("%ps", (void *) __entry->ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_btree_node_reused,
@@ -568,7 +568,7 @@ TRACE_EVENT(trans_restart_would_deadlock,
                __entry->want_iter_type         = want_iter_type;
        ),
 
-       TP_printk("%pF %pF because %u have %u:%u want %u:%u",
+       TP_printk("%ps %pS because %u have %u:%u want %u:%u",
                  (void *) __entry->trans_ip,
                  (void *) __entry->caller_ip,
                  __entry->reason,
@@ -592,7 +592,7 @@ TRACE_EVENT(trans_restart_iters_realloced,
                __entry->nr     = nr;
        ),
 
-       TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr)
+       TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr)
 );
 
 TRACE_EVENT(trans_restart_mem_realloced,
@@ -609,7 +609,7 @@ TRACE_EVENT(trans_restart_mem_realloced,
                __entry->bytes  = bytes;
        ),
 
-       TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes)
+       TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_journal_res_get,
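
%pF/%pf dereferenced function-descriptor pointers (an ia64/ppc64 ABI detail) and were deprecated upstream; these tracepoints log raw instruction pointers, for which %pS/%ps are the correct symbol-printing specifiers. The practical difference between the two (sketch; symbol name and offsets illustrative, assumes CONFIG_KALLSYMS):

    printk("%ps\n", (void *) ip);  /* "bch2_trans_get_iter"            - symbol only */
    printk("%pS\n", (void *) ip);  /* "bch2_trans_get_iter+0x34/0x1a0" - with offset */
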
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 35311dbb189caca3ddad7fa6f3a7b222e5300c6b..b20895a42d99e797d28ebf5e7882cf1d9ae0a77e 100644
 #include <linux/semaphore.h>
 #include <linux/seqlock.h>
 #include <linux/shrinker.h>
+#include <linux/srcu.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
@@ -642,6 +643,8 @@ struct bch_fs {
        mempool_t               btree_iters_pool;
        struct btree_iter_buf  __percpu *btree_iters_bufs;
 
+       struct srcu_struct      btree_trans_barrier;
+
        struct btree_key_cache  btree_key_cache;
 
        struct workqueue_struct *wq;
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 99b7fce2bfd30716fc53e3a6c678fe1683f93319..f5779795a4b24e32f137c41d024f82d5791a4183 100644
@@ -181,8 +181,12 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
 void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 {
        if (k) {
-               pr_buf(out, "u64s %u type %s ", k->u64s,
-                      bch2_bkey_types[k->type]);
+               pr_buf(out, "u64s %u type ", k->u64s);
+
+               if (k->type < KEY_TYPE_MAX)
+                       pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+               else
+                       pr_buf(out, "%u ", k->type);
 
                bch2_bpos_to_text(out, k->p);
 
@@ -196,10 +200,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
                      struct bkey_s_c k)
 {
-       const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+       if (k.k->type < KEY_TYPE_MAX) {
+               const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
 
-       if (likely(ops->val_to_text))
-               ops->val_to_text(out, c, k);
+               if (likely(ops->val_to_text))
+                       ops->val_to_text(out, c, k);
+       } else {
+               pr_buf(out, "(invalid type %u)", k.k->type);
+       }
 }
 
 void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
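
Part of this commit's debug hardening: with a corrupt key, k->type can be any byte value, and the printers previously indexed bch2_bkey_types[]/bch2_bkey_ops[] with it unchecked, so printing a bad key could itself fault. Both paths now bounds-check against KEY_TYPE_MAX and fall back to printing the raw type number. The same guard as a standalone hypothetical model (KEY_TYPE_MAX and the name table stand in for bcachefs's real ones):

    #include <stdio.h>

    #define KEY_TYPE_MAX 3
    static const char *type_names[KEY_TYPE_MAX] = { "deleted", "whiteout", "error" };

    /* Hypothetical; buf must outlive the returned pointer. */
    static const char *bkey_type_name(unsigned type, char *buf, size_t len)
    {
            if (type < KEY_TYPE_MAX)
                    return type_names[type];

            snprintf(buf, len, "(invalid type %u)", type);
            return buf;
    }

    int main(void)
    {
            char buf[32];

            printf("%s\n", bkey_type_name(2, buf, sizeof(buf)));   /* error */
            printf("%s\n", bkey_type_name(200, buf, sizeof(buf))); /* (invalid type 200) */
            return 0;
    }
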
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 26716657453f45c4684259ce1eb92674776eec3e..1c7318c6e46f7040327b0e44148b3ce875a80396 100644
@@ -604,53 +604,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
        return (u16) v;
 }
 
-static void make_bfloat(struct btree *b, struct bset_tree *t,
-                       unsigned j,
-                       struct bkey_packed *min_key,
-                       struct bkey_packed *max_key)
+__always_inline
+static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
+                                unsigned j,
+                                struct bkey_packed *min_key,
+                                struct bkey_packed *max_key)
 {
        struct bkey_float *f = bkey_float(b, t, j);
        struct bkey_packed *m = tree_to_bkey(b, t, j);
-       struct bkey_packed *l, *r;
+       struct bkey_packed *l = is_power_of_2(j)
+               ? min_key
+               : tree_to_prev_bkey(b, t, j >> ffs(j));
+       struct bkey_packed *r = is_power_of_2(j + 1)
+               ? max_key
+               : tree_to_bkey(b, t, j >> (ffz(j) + 1));
        unsigned mantissa;
        int shift, exponent, high_bit;
 
-       if (is_power_of_2(j)) {
-               l = min_key;
-
-               if (!l->u64s) {
-                       if (!bkey_pack_pos(l, b->data->min_key, b)) {
-                               struct bkey_i tmp;
-
-                               bkey_init(&tmp.k);
-                               tmp.k.p = b->data->min_key;
-                               bkey_copy(l, &tmp);
-                       }
-               }
-       } else {
-               l = tree_to_prev_bkey(b, t, j >> ffs(j));
-
-               EBUG_ON(m < l);
-       }
-
-       if (is_power_of_2(j + 1)) {
-               r = max_key;
-
-               if (!r->u64s) {
-                       if (!bkey_pack_pos(r, t->max_key, b)) {
-                               struct bkey_i tmp;
-
-                               bkey_init(&tmp.k);
-                               tmp.k.p = t->max_key;
-                               bkey_copy(r, &tmp);
-                       }
-               }
-       } else {
-               r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
-               EBUG_ON(m > r);
-       }
-
        /*
         * for failed bfloats, the lookup code falls back to comparing against
         * the original key.
@@ -707,6 +677,30 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
        f->mantissa = mantissa;
 }
 
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+                       unsigned j,
+                       struct bkey_packed *min_key,
+                       struct bkey_packed *max_key)
+{
+       struct bkey_i *k;
+
+       if (is_power_of_2(j) &&
+           !min_key->u64s) {
+               k = (void *) min_key;
+               bkey_init(&k->k);
+               k->k.p = b->data->min_key;
+       }
+
+       if (is_power_of_2(j + 1) &&
+           !max_key->u64s) {
+               k = (void *) max_key;
+               bkey_init(&k->k);
+               k->k.p = t->max_key;
+       }
+
+       __make_bfloat(b, t, j, min_key, max_key);
+}
+
 /* bytes remaining - only valid for last bset: */
 static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
@@ -726,7 +720,7 @@ static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_t
        return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
 }
 
-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
 {
        struct bkey_packed *k;
 
@@ -745,15 +739,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
        }
 }
 
-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
 {
        struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
-       struct bkey_packed min_key, max_key;
+       struct bkey_i min_key, max_key;
        unsigned j, cacheline = 1;
 
-       /* signal to make_bfloat() that they're uninitialized: */
-       min_key.u64s = max_key.u64s = 0;
-
        t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
                      bset_ro_tree_capacity(b, t));
 retry:
@@ -789,9 +780,16 @@ retry:
 
        t->max_key = bkey_unpack_pos(b, prev);
 
+       bkey_init(&min_key.k);
+       min_key.k.p = b->data->min_key;
+       bkey_init(&max_key.k);
+       max_key.k.p = t->max_key;
+
        /* Then we build the tree */
        eytzinger1_for_each(j, t->size)
-               make_bfloat(b, t, j, &min_key, &max_key);
+               __make_bfloat(b, t, j,
+                             bkey_to_packed(&min_key),
+                             bkey_to_packed(&max_key));
 }
 
 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
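
The split hoists boundary-key setup out of the per-node loop: __build_ro_aux_tree() packs min/max once and calls __make_bfloat() directly, while the make_bfloat() wrapper keeps lazy initialization for incremental callers. The ancestor arithmetic both rely on: in the 1-based eytzinger layout, j >> ffs(j) is the ancestor whose previous key bounds node j's subtree on the left, j >> (ffz(j) + 1) the ancestor bounding it on the right, and the power-of-two tests detect the leftmost/rightmost spines, where only the tree-wide min/max apply. A standalone sketch of that index math (kernel ffs() is 1-based like POSIX's; kernel ffz() is 0-based, reimplemented here):

    #include <stdbool.h>
    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    static bool is_power_of_2(unsigned long n)
    {
            return n && !(n & (n - 1));
    }

    static unsigned ffz(unsigned x) /* first zero bit, 0-based */
    {
            return ffs(~x) - 1;
    }

    int main(void)
    {
            for (unsigned j = 1; j < 16; j++) {
                    printf("node %2u: left = ", j);
                    if (is_power_of_2(j))
                            printf("min_key     ");
                    else
                            printf("prev_key(%u) ", j >> ffs(j));

                    if (is_power_of_2(j + 1))
                            printf("right = max_key\n");
                    else
                            printf("right = key(%u)\n", j >> (ffz(j) + 1));
            }
            return 0;
    }
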
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 325a16615a068ae149b22f880cebdc085a7add60..5bceff48078e27b8428945bab573105c722be485 100644
@@ -328,9 +328,9 @@ restart:
                        clear_btree_node_accessed(b);
        }
 
-       memalloc_nofs_restore(flags);
        mutex_unlock(&bc->lock);
 out:
+       memalloc_nofs_restore(flags);
        return (unsigned long) freed * btree_pages(c);
 }
 
@@ -381,11 +381,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 
                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
-               clear_btree_node_dirty(b);
+               clear_btree_node_dirty(c, b);
 
                btree_node_data_free(c, b);
        }
 
+       BUG_ON(atomic_read(&c->btree_cache.dirty));
+
        while (!list_empty(&bc->freed)) {
                b = list_first_entry(&bc->freed, struct btree, list);
                list_del(&b->list);
@@ -445,7 +447,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.seeks                = 4;
        bc->shrink.batch                = btree_pages(c) * 2;
-       register_shrinker(&bc->shrink);
+       ret = register_shrinker(&bc->shrink);
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
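
Two shrinker fixes here: register_shrinker()'s return value is now propagated instead of ignored, and memalloc_nofs_restore() moves below the out: label, making it the last step on every exit path rather than only on the fall-through; bch2_fs_btree_cache_exit() also asserts the new dirty counter has drained to zero. The save/restore pair is designed for exactly this bracketing: save() returns the previous PF_MEMALLOC_NOFS state so sections nest. A userspace model of the idiom (flag value is the model's own, not the kernel's):

    #include <assert.h>
    #include <stdio.h>

    #define PF_MEMALLOC_NOFS 1u     /* model value */

    static unsigned task_flags;

    static unsigned memalloc_nofs_save(void)
    {
            unsigned old = task_flags & PF_MEMALLOC_NOFS;

            task_flags |= PF_MEMALLOC_NOFS;
            return old;
    }

    static void memalloc_nofs_restore(unsigned old)
    {
            task_flags = (task_flags & ~PF_MEMALLOC_NOFS) | old;
    }

    static long scan(int bail)
    {
            unsigned flags = memalloc_nofs_save();
            long freed = 0;

            if (bail)
                    goto out;       /* early exits must restore too */

            freed = 42;             /* ... shrinking work ... */
    out:
            memalloc_nofs_restore(flags);
            return freed;
    }

    int main(void)
    {
            scan(1);
            assert(!(task_flags & PF_MEMALLOC_NOFS));       /* balanced */
            printf("task_flags = %u\n", task_flags);
            return 0;
    }
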
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 10a00085cdd6f951e628a9d0052f80cb1c968349..2406745fb3659aa427c52af3210f176ff478e26d 100644
@@ -1442,8 +1442,10 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 
        ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
                validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
-       if (ret)
+       if (ret) {
                bch2_inconsistent_error(c);
+               dump_stack();
+       }
 
        return ret;
 }
@@ -1498,6 +1500,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                new ^=  (1 << BTREE_NODE_write_idx);
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
+       atomic_dec(&c->btree_cache.dirty);
+
        BUG_ON(btree_node_fake(b));
        BUG_ON((b->will_make_reachable != 0) != !b->written);
 
@@ -1530,6 +1534,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                seq = max(seq, le64_to_cpu(i->journal_seq));
        }
 
+       /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+       bytes += 8;
+
        data = btree_bounce_alloc(c, bytes, &used_mempool);
 
        if (!b->written) {
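
Two guards land here: validate_bset_for_write() now dump_stack()s on failure (the kernel.h stub above makes that a no-op in userspace) so the corrupting caller shows up in logs, and the bounce buffer is oversized by 8 bytes because, per the comment, bch2_varint_decode() may read up to 7 bytes past the end. Fast varint decoders over-read by design: they load a whole word and mask instead of branching per byte. A toy decoder showing just that mechanism (this is not bcachefs's actual varint format; assumes little-endian):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Toy format: low 3 bits of byte 0 give total length 1..8; the
     * remaining bits, little-endian, are the value. */
    static unsigned toy_varint_decode(const uint8_t *in, uint64_t *out)
    {
            unsigned len = (in[0] & 7) + 1;
            uint64_t v;

            memcpy(&v, in, 8);      /* unconditional 8-byte load: touches up
                                     * to 7 bytes past a 1-byte encoding,
                                     * hence the buffer slack */
            v >>= 3;
            if (len < 8)
                    v &= (1ULL << (8 * len - 3)) - 1;
            *out = v;
            return len;
    }

    int main(void)
    {
            uint8_t buf[8] = { 21u << 3 };  /* 1-byte encoding + 7 bytes slack */
            uint64_t v;
            unsigned len = toy_varint_decode(buf, &v);

            printf("len=%u value=%llu\n", len, (unsigned long long) v); /* 1, 21 */
            return 0;
    }
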
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 626d0f071b7008d7f9f76f89df0a1bda34adb2b0..1a4b11e99cc40367457a69259860a3cff462eae6 100644
@@ -14,6 +14,23 @@ struct btree_write;
 struct btree;
 struct btree_iter;
 
+static inline bool btree_node_dirty(struct btree *b)
+{
+       return test_bit(BTREE_NODE_dirty, &b->flags);
+}
+
+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+       if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+               atomic_inc(&c->btree_cache.dirty);
+}
+
+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+       if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+               atomic_dec(&c->btree_cache.dirty);
+}
+
 struct btree_read_bio {
        struct bch_fs           *c;
        u64                     start_time;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 58f1a3dd97d30591f9cc936f39c31ee386edb8eb..96cc5394295e1d65b20c0fb449cedc8ad9d6b91d 100644
@@ -2342,12 +2342,15 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
        unsigned new_size = BTREE_ITER_MAX;
        size_t iters_bytes      = sizeof(struct btree_iter) * new_size;
        size_t updates_bytes    = sizeof(struct btree_insert_entry) * new_size;
-       void *p;
+       void *p = NULL;
 
        BUG_ON(trans->used_mempool);
 
-       p =     this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
-               mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+#ifdef __KERNEL__
+       p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+#endif
+       if (!p)
+               p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
 
        trans->iters            = p; p += iters_bytes;
        trans->updates          = p; p += updates_bytes;
@@ -2369,8 +2372,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
         */
        bch2_trans_alloc_iters(trans, c);
 
-       if (expected_mem_bytes)
-               bch2_trans_preload_mem(trans, expected_mem_bytes);
+       if (expected_mem_bytes) {
+               trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
+               trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+       }
+
+       trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
        trans->pid = current->pid;
@@ -2392,12 +2399,19 @@ int bch2_trans_exit(struct btree_trans *trans)
        mutex_unlock(&trans->c->btree_trans_lock);
 #endif
 
+       srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+
        bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
 
        kfree(trans->fs_usage_deltas);
        kfree(trans->mem);
 
+#ifdef __KERNEL__
+       /*
+        * Userspace doesn't have a real percpu implementation:
+        */
        trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+#endif
        if (trans->iters)
                mempool_free(trans->iters, &trans->c->btree_iters_pool);
 
@@ -2474,6 +2488,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
        mempool_exit(&c->btree_iters_pool);
+       cleanup_srcu_struct(&c->btree_trans_barrier);
 }
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
@@ -2483,7 +2498,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
        INIT_LIST_HEAD(&c->btree_trans_list);
        mutex_init(&c->btree_trans_lock);
 
-       return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+       return  init_srcu_struct(&c->btree_trans_barrier) ?:
+               mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
                        sizeof(struct btree_iter) * nr +
                        sizeof(struct btree_insert_entry) * nr +
                        sizeof(struct btree_insert_entry) * nr);
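
Three things happen in btree_iter.c: each transaction now holds an SRCU read lock for its whole lifetime (bch2_trans_init() to bch2_trans_exit()), which is the read side of the key cache's deferred freeing; the transaction's scratch memory is preallocated up front with __GFP_NOFAIL (stubbed to 0 in types.h above), replacing bch2_trans_preload_mem(); and the per-CPU iterator-buffer cache is compiled only under __KERNEL__, since the userspace shims don't give this_cpu_xchg() real per-CPU semantics. A userspace model of that one-slot buffer cache (one global slot standing in for the per-CPU variable, malloc/free for the mempool):

    #include <stdatomic.h>
    #include <stdlib.h>

    static _Atomic(void *) iter_buf;        /* c->btree_iters_bufs->iter */

    static void *trans_iters_alloc(size_t bytes)
    {
            void *p = atomic_exchange(&iter_buf, NULL); /* this_cpu_xchg(.., NULL) */

            return p ?: malloc(bytes);      /* miss: fall back to the mempool */
    }

    static void trans_iters_free(void *p)
    {
            p = atomic_exchange(&iter_buf, p);  /* stash for the next trans */
            if (p)
                    free(p);    /* slot was occupied: free the displaced buffer */
    }
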
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 0ee4f78ce67a1b5ea4acd767973ef68b2db4e098..d605ff181d2e4c9caf514ae6b000bc145955e167 100644
@@ -9,6 +9,7 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 
+#include <linux/sched/mm.h>
 #include <trace/events/bcachefs.h>
 
 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -66,12 +67,19 @@ static void bkey_cached_evict(struct btree_key_cache *c,
        BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
                                      bch2_btree_key_cache_params));
        memset(&ck->key, ~0, sizeof(ck->key));
+
+       c->nr_keys--;
 }
 
-static void bkey_cached_free(struct btree_key_cache *c,
+static void bkey_cached_free(struct btree_key_cache *bc,
                             struct bkey_cached *ck)
 {
-       list_move(&ck->list, &c->freed);
+       struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+       ck->btree_trans_barrier_seq =
+               start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+       list_move(&ck->list, &bc->freed);
 
        kfree(ck->k);
        ck->k           = NULL;
@@ -135,6 +143,8 @@ btree_key_cache_create(struct btree_key_cache *c,
                return NULL;
        }
 
+       c->nr_keys++;
+
        list_move(&ck->list, &c->clean);
        six_unlock_write(&ck->c.lock);
 
@@ -355,10 +365,14 @@ err:
 
        bch2_journal_pin_drop(j, &ck->journal);
        bch2_journal_preres_put(j, &ck->res);
-       clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 
        if (!evict) {
                mutex_lock(&c->btree_key_cache.lock);
+               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                       clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+                       c->btree_key_cache.nr_dirty--;
+               }
+
                list_move_tail(&ck->list, &c->btree_key_cache.clean);
                mutex_unlock(&c->btree_key_cache.lock);
        } else {
@@ -371,6 +385,11 @@ evict:
                six_lock_write(&ck->c.lock, NULL, NULL);
 
                mutex_lock(&c->btree_key_cache.lock);
+               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                       clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+                       c->btree_key_cache.nr_dirty--;
+               }
+
                bkey_cached_evict(&c->btree_key_cache, ck);
                bkey_cached_free(&c->btree_key_cache, ck);
                mutex_unlock(&c->btree_key_cache.lock);
@@ -391,19 +410,23 @@ static void btree_key_cache_journal_flush(struct journal *j,
        struct bkey_cached_key key;
        struct btree_trans trans;
 
+       int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+
        six_lock_read(&ck->c.lock, NULL, NULL);
        key = ck->key;
 
        if (ck->journal.seq != seq ||
            !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                six_unlock_read(&ck->c.lock);
-               return;
+               goto unlock;
        }
        six_unlock_read(&ck->c.lock);
 
        bch2_trans_init(&trans, c, 0, 0);
        btree_key_cache_flush_pos(&trans, key, seq, false);
        bch2_trans_exit(&trans);
+unlock:
+       srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 }
 
 /*
@@ -448,9 +471,10 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                mutex_lock(&c->btree_key_cache.lock);
-               list_del_init(&ck->list);
+               list_move(&ck->list, &c->btree_key_cache.dirty);
 
                set_bit(BKEY_CACHED_DIRTY, &ck->flags);
+               c->btree_key_cache.nr_dirty++;
                mutex_unlock(&c->btree_key_cache.lock);
        }
 
@@ -467,20 +491,97 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
 }
 #endif
 
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c)
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+                                          struct shrink_control *sc)
+{
+       struct bch_fs *c = container_of(shrink, struct bch_fs,
+                                       btree_key_cache.shrink);
+       struct btree_key_cache *bc = &c->btree_key_cache;
+       struct bkey_cached *ck, *t;
+       size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+       unsigned flags;
+
+       /* Return -1 if we can't do anything right now */
+       if (sc->gfp_mask & __GFP_FS)
+               mutex_lock(&bc->lock);
+       else if (!mutex_trylock(&bc->lock))
+               return -1;
+
+       flags = memalloc_nofs_save();
+
+       list_for_each_entry_safe(ck, t, &bc->freed, list) {
+               scanned++;
+
+               if (poll_state_synchronize_srcu(&c->btree_trans_barrier,
+                                               ck->btree_trans_barrier_seq)) {
+                       list_del(&ck->list);
+                       kfree(ck);
+                       freed++;
+               }
+
+               if (scanned >= nr)
+                       goto out;
+       }
+
+       list_for_each_entry_safe(ck, t, &bc->clean, list) {
+               scanned++;
+
+               if (bkey_cached_lock_for_evict(ck)) {
+                       bkey_cached_evict(bc, ck);
+                       bkey_cached_free(bc, ck);
+               }
+
+               if (scanned >= nr) {
+                       if (&t->list != &bc->clean)
+                               list_move_tail(&bc->clean, &t->list);
+                       goto out;
+               }
+       }
+out:
+       memalloc_nofs_restore(flags);
+       mutex_unlock(&bc->lock);
+
+       return freed;
+}
+
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+                                           struct shrink_control *sc)
 {
+       struct bch_fs *c = container_of(shrink, struct bch_fs,
+                                       btree_key_cache.shrink);
+       struct btree_key_cache *bc = &c->btree_key_cache;
+
+       return bc->nr_keys;
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+       struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
        struct bkey_cached *ck, *n;
 
-       mutex_lock(&c->lock);
-       list_for_each_entry_safe(ck, n, &c->clean, list) {
+       if (bc->shrink.list.next)
+               unregister_shrinker(&bc->shrink);
+
+       mutex_lock(&bc->lock);
+       list_splice(&bc->dirty, &bc->clean);
+
+       list_for_each_entry_safe(ck, n, &bc->clean, list) {
+               bch2_journal_pin_drop(&c->journal, &ck->journal);
+               bch2_journal_preres_put(&c->journal, &ck->res);
+
                kfree(ck->k);
                kfree(ck);
+               bc->nr_keys--;
        }
-       list_for_each_entry_safe(ck, n, &c->freed, list)
+
+       BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
+       BUG_ON(bc->nr_keys);
+
+       list_for_each_entry_safe(ck, n, &bc->freed, list)
                kfree(ck);
-       mutex_unlock(&c->lock);
+       mutex_unlock(&bc->lock);
 
-       rhashtable_destroy(&c->table);
+       rhashtable_destroy(&bc->table);
 }
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
@@ -488,11 +589,16 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
        mutex_init(&c->lock);
        INIT_LIST_HEAD(&c->freed);
        INIT_LIST_HEAD(&c->clean);
+       INIT_LIST_HEAD(&c->dirty);
 }
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
 {
-       return rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+       c->shrink.count_objects         = bch2_btree_key_cache_count;
+       c->shrink.scan_objects          = bch2_btree_key_cache_scan;
+
+       return  register_shrinker(&c->shrink) ?:
+               rhashtable_init(&c->table, &bch2_btree_key_cache_params);
 }
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
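
The bkey_cached lifecycle after this change, with the new counters (sketch):

    create:          -> clean          nr_keys++
    first insert:    clean -> dirty    nr_dirty++, journal pin taken
    flush:           dirty -> clean    nr_dirty--, pin dropped
    evict:           clean -> freed    nr_keys--, SRCU cookie stamped in
                                       bkey_cached_free()
    shrinker scan:   freed -> kfree()  once poll_state_synchronize_srcu()
                                       says the cookie's grace period elapsed

nr_keys backs the shrinker's count_objects callback; nr_dirty (together with the btree cache's new dirty counter) feeds the journal-reclaim trigger in journal_reclaim.c below. Teardown asserts both counters drain: nr_keys to zero, and nr_dirty to zero unless the journal died first.
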
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 93721fbc77949f858431679f9eb0dd2d9150b684..6013c9164f69edd4718af59febc576cfb85e4b99 100644
@@ -158,6 +158,7 @@ struct btree_cache {
        /* Number of elements in live + freeable lists */
        unsigned                used;
        unsigned                reserve;
+       atomic_t                dirty;
        struct shrinker         shrink;
 
        /*
@@ -294,6 +295,11 @@ struct btree_key_cache {
        struct rhashtable       table;
        struct list_head        freed;
        struct list_head        clean;
+       struct list_head        dirty;
+       struct shrinker         shrink;
+
+       size_t                  nr_keys;
+       size_t                  nr_dirty;
 };
 
 struct bkey_cached_key {
@@ -309,6 +315,7 @@ struct bkey_cached {
        unsigned long           flags;
        u8                      u64s;
        bool                    valid;
+       u32                     btree_trans_barrier_seq;
        struct bkey_cached_key  key;
 
        struct rhash_head       hash;
@@ -345,6 +352,7 @@ struct btree_trans {
        pid_t                   pid;
 #endif
        unsigned long           ip;
+       int                     srcu_idx;
 
        u64                     iters_linked;
        u64                     iters_live;
@@ -411,7 +419,6 @@ enum btree_flags {
 
 BTREE_FLAG(read_in_flight);
 BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
 BTREE_FLAG(need_write);
 BTREE_FLAG(noevict);
 BTREE_FLAG(write_idx);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 4ddd1697ffdec6fe000fa82a8097a1f06c0cad4b..d4f3dd7addcf4434163b4c06a9b6dec0d22c3c10 100644
@@ -11,6 +11,7 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "buckets.h"
+#include "error.h"
 #include "extents.h"
 #include "journal.h"
 #include "journal_reclaim.h"
@@ -149,7 +150,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 
        b->ob.nr = 0;
 
-       clear_btree_node_dirty(b);
+       clear_btree_node_dirty(c, b);
 
        btree_node_lock_type(c, b, SIX_LOCK_write);
        __btree_node_free(c, b);
@@ -264,7 +265,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
        set_btree_node_accessed(b);
-       set_btree_node_dirty(b);
+       set_btree_node_dirty(c, b);
        set_btree_node_need_write(b);
 
        bch2_bset_init_first(b, &b->data->keys);
@@ -523,6 +524,7 @@ static void btree_update_nodes_written(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
        struct btree *b = as->b;
+       struct btree_trans trans;
        u64 journal_seq = 0;
        unsigned i;
        int ret;
@@ -540,14 +542,16 @@ static void btree_update_nodes_written(struct btree_update *as)
         * journal reclaim does btree updates when flushing bkey_cached entries,
         * which may require allocations as well.
         */
-       ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
-                           BTREE_INSERT_NOFAIL|
-                           BTREE_INSERT_USE_RESERVE|
-                           BTREE_INSERT_USE_ALLOC_RESERVE|
-                           BTREE_INSERT_NOCHECK_RW|
-                           BTREE_INSERT_JOURNAL_RECLAIM|
-                           BTREE_INSERT_JOURNAL_RESERVED,
-                           btree_update_nodes_written_trans(&trans, as));
+       bch2_trans_init(&trans, c, 0, 512);
+       ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
+                             BTREE_INSERT_NOFAIL|
+                             BTREE_INSERT_USE_RESERVE|
+                             BTREE_INSERT_USE_ALLOC_RESERVE|
+                             BTREE_INSERT_NOCHECK_RW|
+                             BTREE_INSERT_JOURNAL_RECLAIM|
+                             BTREE_INSERT_JOURNAL_RESERVED,
+                             btree_update_nodes_written_trans(&trans, as));
+       bch2_trans_exit(&trans);
        BUG_ON(ret && !bch2_journal_error(&c->journal));
 
        if (b) {
@@ -827,7 +831,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                closure_wake_up(&c->btree_interior_update_wait);
        }
 
-       clear_btree_node_dirty(b);
+       clear_btree_node_dirty(c, b);
        clear_btree_node_need_write(b);
 
        /*
@@ -1018,7 +1022,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
                                        struct bkey_i *insert,
                                        struct btree_node_iter *node_iter)
 {
+       struct bch_fs *c = as->c;
        struct bkey_packed *k;
+       const char *invalid;
+
+       invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b));
+       if (invalid) {
+               char buf[160];
+
+               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
+               bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+               dump_stack();
+       }
 
        BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
               ARRAY_SIZE(as->journal_entries));
@@ -1034,7 +1049,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
                bch2_btree_node_iter_advance(node_iter, b);
 
        bch2_btree_bset_insert_key(iter, b, node_iter, insert);
-       set_btree_node_dirty(b);
+       set_btree_node_dirty(c, b);
        set_btree_node_need_write(b);
 }
 
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 7668225e72c66b386aabdb8ab6778497094c27c7..41854fc345d2ba21fb830b424aaa97d94e209bad 100644
@@ -237,6 +237,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
                b->whiteout_u64s;
        ssize_t total = c->opts.btree_node_size << 6;
 
+       /* Always leave one extra u64 for bch2_varint_decode: */
+       used++;
+
        return total - used;
 }
 
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index e386f8ed39222071592432400d325f7db2e57aa3..a2ca31e75a7e28d51dc292257606f2b6e8c8a796 100644
@@ -191,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
        bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
 
        if (unlikely(!btree_node_dirty(b)))
-               set_btree_node_dirty(b);
+               set_btree_node_dirty(c, b);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
        u64s_added = (int) bset_u64s(t) - old_u64s;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 82f1cc4ca6931f47a72f8b9dc25a588f34bbbc19..be65f2e78a62a7b37821d1c94cfeeb885563d86e 100644
@@ -323,7 +323,7 @@ static u64 reserve_factor(u64 r)
 
 static u64 avail_factor(u64 r)
 {
-       return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
+       return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
 }
 
 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
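
The div_u64() switch matters on 32-bit builds: a plain 64-bit division makes GCC emit a call to libgcc's __udivdi3, which the kernel doesn't link against; div_u64() (from linux/math64.h, u64 dividend / u32 divisor) routes through the kernel's own division helpers. The result is unchanged:

    div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1)
        == (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1)
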
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1eb69ed38b10bd2cbd56695f2f106bc40d91d976..389f23ee6f918fc720bbb7e0401ffeaabb828e71 100644
 #include <trace/events/bcachefs.h>
 #include <trace/events/writeback.h>
 
+static inline struct address_space *faults_disabled_mapping(void)
+{
+       return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+       current->faults_disabled_mapping =
+               (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+       return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
 struct quota_res {
        u64                             sectors;
 };
@@ -493,10 +509,35 @@ static void bch2_set_page_dirty(struct bch_fs *c,
 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
 {
        struct file *file = vmf->vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       struct address_space *fdm = faults_disabled_mapping();
        struct bch_inode_info *inode = file_bch_inode(file);
        int ret;
 
+       if (fdm == mapping)
+               return VM_FAULT_SIGBUS;
+
+       /* Lock ordering: */
+       if (fdm > mapping) {
+               struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+               if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+                       goto got_lock;
+
+               bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+
+               bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+               bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+               bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+
+               /* Signal that lock has been dropped: */
+               set_fdm_dropped_locks();
+               return VM_FAULT_SIGBUS;
+       }
+
        bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+got_lock:
        ret = filemap_fault(vmf);
        bch2_pagecache_add_put(&inode->ei_pagecache_lock);
 
@@ -1742,14 +1783,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
        struct bio *bio = &dio->op.wbio.bio;
        struct bvec_iter_all iter;
        struct bio_vec *bv;
-       unsigned unaligned;
-       bool sync = dio->sync;
+       unsigned unaligned, iter_count;
+       bool sync = dio->sync, dropped_locks;
        long ret;
 
        if (dio->loop)
                goto loop;
 
        while (1) {
+               iter_count = dio->iter.count;
+
                if (kthread)
                        kthread_use_mm(dio->mm);
                BUG_ON(current->faults_disabled_mapping);
@@ -1757,13 +1800,34 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 
                ret = bio_iov_iter_get_pages(bio, &dio->iter);
 
+               dropped_locks = fdm_dropped_locks();
+
                current->faults_disabled_mapping = NULL;
                if (kthread)
                        kthread_unuse_mm(dio->mm);
 
+               /*
+                * If the fault handler returned an error but also signalled
+                * that it dropped & retook ei_pagecache_lock, we just need to
+                * re-shoot down the page cache and retry:
+                */
+               if (dropped_locks && ret)
+                       ret = 0;
+
                if (unlikely(ret < 0))
                        goto err;
 
+               if (unlikely(dropped_locks)) {
+                       ret = write_invalidate_inode_pages_range(mapping,
+                                       req->ki_pos,
+                                       req->ki_pos + iter_count - 1);
+                       if (unlikely(ret))
+                               goto err;
+
+                       if (!bio->bi_iter.bi_size)
+                               continue;
+               }
+
                unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
                bio->bi_iter.bi_size -= unaligned;
                iov_iter_revert(&dio->iter, unaligned);
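
The deadlock being fixed: a direct write holds the inode's pagecache lock while bio_iov_iter_get_pages() faults in the user buffer; if that buffer is an mmap of the same file, the fault handler needs the lock the writer already holds (and for two files, two tasks can each hold one lock and fault into the other's mapping, which is why the fdm > mapping comparison imposes an address order). The handler detects this through current->faults_disabled_mapping and reports back by setting bit 0 of that pointer: a tagged pointer, valid because mapping pointers are word-aligned. A standalone model of the tagged-pointer helpers:

    #include <stdint.h>
    #include <stdio.h>

    static void *faults_disabled_mapping;   /* current->faults_disabled_mapping */

    static void *fdm_mapping(void)
    {
            return (void *) ((uintptr_t) faults_disabled_mapping & ~1UL);
    }

    static void set_fdm_dropped_locks(void)
    {
            faults_disabled_mapping =
                    (void *) ((uintptr_t) faults_disabled_mapping | 1);
    }

    static int fdm_dropped_locks(void)
    {
            return (uintptr_t) faults_disabled_mapping & 1;
    }

    int main(void)
    {
            static long mapping;    /* stand-in for a struct address_space */

            faults_disabled_mapping = &mapping;
            set_fdm_dropped_locks();
            printf("mapping=%p dropped=%d\n", fdm_mapping(), fdm_dropped_locks());
            return 0;
    }
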
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 3ac57ba29e9f6285eb16dd47dd5ef7a3d1307e63..6e3d4bea81b885bde9e478bb3d2559cb9c057494 100644
@@ -91,6 +91,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock)
        __pagecache_lock_put(lock, 1);
 }
 
+bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
+{
+       return __pagecache_lock_tryget(lock, 1);
+}
+
 void bch2_pagecache_add_get(struct pagecache_lock *lock)
 {
        __pagecache_lock_get(lock, 1);
@@ -271,7 +276,8 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
        if (!tmpfile)
                mutex_lock(&dir->ei_update_lock);
 
-       bch2_trans_init(&trans, c, 8, 1024);
+       bch2_trans_init(&trans, c, 8,
+                       2048 + (!tmpfile ? dentry->d_name.len : 0));
 retry:
        bch2_trans_begin(&trans);
 
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index eda903a45325ea94929003db292cbaaefe31a8f0..4ee1ac994420c6a1a092d6bf748c4b182fae7b63 100644
@@ -26,6 +26,7 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock)
 }
 
 void bch2_pagecache_add_put(struct pagecache_lock *);
+bool bch2_pagecache_add_tryget(struct pagecache_lock *);
 void bch2_pagecache_add_get(struct pagecache_lock *);
 void bch2_pagecache_block_put(struct pagecache_lock *);
 void bch2_pagecache_block_get(struct pagecache_lock *);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 42371de7f72a87be4ab91b174202b36f0124a33f..823a1ddec5aca57983b123fc13df6be2ba940aaa 100644
@@ -537,7 +537,9 @@ found_slot:
        inode_u->bi_inum        = k.k->p.offset;
        inode_u->bi_generation  = bkey_generation(k);
 
-       return bch2_inode_write(trans, iter, inode_u);
+       ret = bch2_inode_write(trans, iter, inode_u);
+       bch2_trans_iter_put(trans, iter);
+       return ret;
 }
 
 int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
@@ -574,16 +576,9 @@ retry:
 
        bi_generation = 0;
 
-       ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr));
-       if (ret) {
-               if (ret != -EINTR)
-                       bch_err(c, "error flushing btree key cache: %i", ret);
-               goto err;
-       }
-
        iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(iter);
+                                  BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_cached(iter);
 
        ret = bkey_err(k);
        if (ret)
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index c2cafd3892a4cf1d88404e9cc6764a58ffea2451..e99faad8098b4d4aeb4dfb57f7b1d0327b87c7d5 100644
 
 #include <trace/events/bcachefs.h>
 
-static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64);
+static u64 last_unwritten_seq(struct journal *j)
+{
+       union journal_res_state s = READ_ONCE(j->reservations);
+
+       lockdep_assert_held(&j->lock);
+
+       return journal_cur_seq(j) - s.prev_buf_unwritten;
+}
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+       return seq >= last_unwritten_seq(j);
+}
 
 static bool __journal_entry_is_open(union journal_res_state state)
 {
@@ -30,6 +42,22 @@ static bool journal_entry_is_open(struct journal *j)
        return __journal_entry_is_open(j->reservations);
 }
 
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+       struct journal_buf *buf = NULL;
+
+       EBUG_ON(seq > journal_cur_seq(j));
+       EBUG_ON(seq == journal_cur_seq(j) &&
+               j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+
+       if (journal_seq_unwritten(j, seq)) {
+               buf = j->buf + (seq & 1);
+               EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
+       }
+       return buf;
+}
+
 static void journal_pin_new_entry(struct journal *j, int count)
 {
        struct journal_entry_pin_list *p;
@@ -51,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
 {
        struct journal_buf *buf = journal_cur_buf(j);
 
+       bkey_extent_init(&buf->key);
+
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
        memset(buf->data, 0, sizeof(*buf->data));
@@ -72,6 +102,7 @@ void bch2_journal_halt(struct journal *j)
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
+       j->err_seq = journal_cur_seq(j);
        journal_wake(j);
        closure_wake_up(&journal_cur_buf(j)->wait);
 }
@@ -139,8 +170,6 @@ static bool __journal_entry_close(struct journal *j)
        BUG_ON(sectors > buf->sectors);
        buf->sectors = sectors;
 
-       bkey_extent_init(&buf->key);
-
        /*
         * We have to set last_seq here, _before_ opening a new journal entry:
         *
@@ -162,11 +191,6 @@ static bool __journal_entry_close(struct journal *j)
         */
        buf->data->last_seq     = cpu_to_le64(journal_last_seq(j));
 
-       if (journal_entry_empty(buf->data))
-               clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
-       else
-               set_bit(JOURNAL_NOT_EMPTY, &j->flags);
-
        journal_pin_new_entry(j, 1);
 
        bch2_journal_buf_init(j);
@@ -391,8 +415,17 @@ unlock:
                goto retry;
 
        if (ret == -ENOSPC) {
-               WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
-                         "JOURNAL_RES_GET_RESERVED set but journal full");
+               if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
+                             "JOURNAL_RES_GET_RESERVED set but journal full")) {
+                       char *buf;
+
+                       buf = kmalloc(4096, GFP_NOFS);
+                       if (buf) {
+                               bch2_journal_debug_to_text(&PBUF(buf), j);
+                               pr_err("\n%s", buf);
+                               kfree(buf);
+                       }
+               }
 
                /*
                 * Journal is full - can't rely on reclaim from work item due to
@@ -503,146 +536,28 @@ out:
 
 /* journal flushing: */
 
-u64 bch2_journal_last_unwritten_seq(struct journal *j)
-{
-       u64 seq;
-
-       spin_lock(&j->lock);
-       seq = journal_cur_seq(j);
-       if (j->reservations.prev_buf_unwritten)
-               seq--;
-       spin_unlock(&j->lock);
-
-       return seq;
-}
-
-/**
- * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
- * open yet, or wait if we cannot
- *
- * used by the btree interior update machinery, when it needs to write a new
- * btree root - every journal entry contains the roots of all the btrees, so it
- * doesn't need to bother with getting a journal reservation
- */
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       int ret;
-
-       spin_lock(&j->lock);
-
-       /*
-        * Can't try to open more than one sequence number ahead:
-        */
-       BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
-
-       if (journal_cur_seq(j) > seq ||
-           journal_entry_is_open(j)) {
-               spin_unlock(&j->lock);
-               return 0;
-       }
-
-       if (journal_cur_seq(j) < seq &&
-           !__journal_entry_close(j)) {
-               /* haven't finished writing out the previous one: */
-               trace_journal_entry_full(c);
-               ret = -EAGAIN;
-       } else {
-               BUG_ON(journal_cur_seq(j) != seq);
-
-               ret = journal_entry_open(j);
-       }
-
-       if ((ret == -EAGAIN || ret == -ENOSPC) &&
-           !j->res_get_blocked_start)
-               j->res_get_blocked_start = local_clock() ?: 1;
-
-       if (ret == -EAGAIN || ret == -ENOSPC)
-               closure_wait(&j->async_wait, cl);
-
-       spin_unlock(&j->lock);
-
-       if (ret == -ENOSPC) {
-               trace_journal_full(c);
-               bch2_journal_reclaim_work(&j->reclaim_work.work);
-               ret = -EAGAIN;
-       }
-
-       return ret;
-}
-
-static int journal_seq_error(struct journal *j, u64 seq)
-{
-       union journal_res_state state = READ_ONCE(j->reservations);
-
-       if (seq == journal_cur_seq(j))
-               return bch2_journal_error(j);
-
-       if (seq + 1 == journal_cur_seq(j) &&
-           !state.prev_buf_unwritten &&
-           seq > j->seq_ondisk)
-               return -EIO;
-
-       return 0;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
-       /* seq should be for a journal entry that has been opened: */
-       BUG_ON(seq > journal_cur_seq(j));
-       BUG_ON(seq == journal_cur_seq(j) &&
-              j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
-
-       if (seq == journal_cur_seq(j))
-               return journal_cur_buf(j);
-       if (seq + 1 == journal_cur_seq(j) &&
-           j->reservations.prev_buf_unwritten)
-               return journal_prev_buf(j);
-       return NULL;
-}
-
-/**
- * bch2_journal_wait_on_seq - wait for a journal entry to be written
- *
- * does _not_ cause @seq to be written immediately - if there is no other
- * activity to cause the relevant journal entry to be filled up or flushed it
- * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
- * configurable).
- */
-void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
-                             struct closure *parent)
-{
-       struct journal_buf *buf;
-
-       spin_lock(&j->lock);
-
-       if ((buf = journal_seq_to_buf(j, seq))) {
-               if (!closure_wait(&buf->wait, parent))
-                       BUG();
-
-               if (seq == journal_cur_seq(j)) {
-                       smp_mb();
-                       if (bch2_journal_error(j))
-                               closure_wake_up(&buf->wait);
-               }
-       }
-
-       spin_unlock(&j->lock);
-}
-
 /**
  * bch2_journal_flush_seq_async - wait for a journal entry to be written
  *
  * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
  * necessary
  */
-void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
                                  struct closure *parent)
 {
        struct journal_buf *buf;
+       int ret = 0;
 
        spin_lock(&j->lock);
+       if (seq <= j->err_seq) {
+               ret = -EIO;
+               goto out;
+       }
+
+       if (seq <= j->seq_ondisk) {
+               ret = 1;
+               goto out;
+       }
 
        if (parent &&
            (buf = journal_seq_to_buf(j, seq)))
@@ -651,20 +566,8 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 
        if (seq == journal_cur_seq(j))
                __journal_entry_close(j);
+out:
        spin_unlock(&j->lock);
-}
-
-static int journal_seq_flushed(struct journal *j, u64 seq)
-{
-       int ret;
-
-       spin_lock(&j->lock);
-       ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq);
-
-       if (seq == journal_cur_seq(j))
-               __journal_entry_close(j);
-       spin_unlock(&j->lock);
-
        return ret;
 }
 
@@ -673,28 +576,13 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
        u64 start_time = local_clock();
        int ret, ret2;
 
-       ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
+       ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 
        bch2_time_stats_update(j->flush_seq_time, start_time);
 
        return ret ?: ret2 < 0 ? ret2 : 0;
 }
 
-/**
- * bch2_journal_meta_async - force a journal entry to be written
- */
-void bch2_journal_meta_async(struct journal *j, struct closure *parent)
-{
-       struct journal_res res;
-
-       memset(&res, 0, sizeof(res));
-
-       bch2_journal_res_get(j, &res, jset_u64s(0), 0);
-       bch2_journal_res_put(j, &res);
-
-       bch2_journal_flush_seq_async(j, res.seq, parent);
-}
-
 int bch2_journal_meta(struct journal *j)
 {
        struct journal_res res;
@@ -989,7 +877,8 @@ void bch2_fs_journal_stop(struct journal *j)
        journal_quiesce(j);
 
        BUG_ON(!bch2_journal_error(j) &&
-              test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+              (journal_entry_is_open(j) ||
+               j->last_empty_seq + 1 != journal_cur_seq(j)));
 
        cancel_delayed_work_sync(&j->write_work);
        cancel_delayed_work_sync(&j->reclaim_work);
@@ -1047,6 +936,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        set_bit(JOURNAL_STARTED, &j->flags);
 
        journal_pin_new_entry(j, 1);
+
+       j->reservations.idx = journal_cur_seq(j);
+
        bch2_journal_buf_init(j);
 
        c->last_bucket_seq_cleanup = journal_cur_seq(j);
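
The flush API collapses to a single entry point with a tri-state return, which is what lets bch2_journal_flush_seq() drive it straight from wait_event_killable(); the journal_seq_flushed()/bch2_journal_wait_on_seq()/bch2_journal_open_seq_async() variants go away. The contract, as read off the code above:

    ret = bch2_journal_flush_seq_async(j, seq, parent);
    /*  < 0: journal error at or before seq (seq <= j->err_seq)
     *    1: seq already on disk, nothing to wait for
     *    0: parent (if any) parked on the buffer's closure; the current
     *       entry is closed to kick the write */

journal_seq_to_buf() can now find an unwritten entry's buffer by sequence parity (j->buf + (seq & 1)); bch2_fs_journal_start() seeding reservations.idx from the starting sequence number appears to be what keeps that parity in step from the first entry on.
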
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index f60bc964ee1f4cb99527b0c7eff6086fc63c70ee..25c6876765ac4b09b05055898cd36e165c851ea0 100644
@@ -464,13 +464,8 @@ void bch2_journal_entry_res_resize(struct journal *,
                                   struct journal_entry_res *,
                                   unsigned);
 
-u64 bch2_journal_last_unwritten_seq(struct journal *);
-int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
-
-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
 void bch2_journal_flush_async(struct journal *, struct closure *);
-void bch2_journal_meta_async(struct journal *, struct closure *);
 
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index bd0e6b371701b93dc56c43a2ad9835367b87b2bd..7c157bc50268ceaec3972396bcbcd7a8b524878f 100644
@@ -161,6 +161,8 @@ static void journal_entry_null_range(void *start, void *end)
 #define journal_entry_err_on(cond, c, msg, ...)                                \
        ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
 
+#define FSCK_DELETED_KEY       5
+
 static int journal_validate_key(struct bch_fs *c, struct jset *jset,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
@@ -173,28 +175,42 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
        int ret = 0;
 
        if (journal_entry_err_on(!k->k.u64s, c,
-                       "invalid %s in journal: k->u64s 0", type)) {
+                       "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0",
+                       type, le64_to_cpu(jset->seq),
+                       (u64 *) entry - jset->_data,
+                       le32_to_cpu(jset->u64s),
+                       (u64 *) k - entry->_data,
+                       le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
+               return FSCK_DELETED_KEY;
        }
 
        if (journal_entry_err_on((void *) bkey_next(k) >
                                (void *) vstruct_next(entry), c,
-                       "invalid %s in journal: extends past end of journal entry",
-                       type)) {
+                       "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry",
+                       type, le64_to_cpu(jset->seq),
+                       (u64 *) entry - jset->_data,
+                       le32_to_cpu(jset->u64s),
+                       (u64 *) k - entry->_data,
+                       le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
+               return FSCK_DELETED_KEY;
        }
 
        if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
-                       "invalid %s in journal: bad format %u",
-                       type, k->k.format)) {
-               le16_add_cpu(&entry->u64s, -k->k.u64s);
+                       "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u",
+                       type, le64_to_cpu(jset->seq),
+                       (u64 *) entry - jset->_data,
+                       le32_to_cpu(jset->u64s),
+                       (u64 *) k - entry->_data,
+                       le16_to_cpu(entry->u64s),
+                       k->k.format)) {
+               le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
+               return FSCK_DELETED_KEY;
        }
 
        if (!write)
@@ -208,13 +224,18 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
                char buf[160];
 
                bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
-               mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
-                                type, invalid, buf);
-
-               le16_add_cpu(&entry->u64s, -k->k.u64s);
+               mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s",
+                                type, le64_to_cpu(jset->seq),
+                                (u64 *) entry - jset->_data,
+                                le32_to_cpu(jset->u64s),
+                                (u64 *) k - entry->_data,
+                                le16_to_cpu(entry->u64s),
+                                invalid, buf);
+
+               le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
+               return FSCK_DELETED_KEY;
        }
 
        if (write)
@@ -230,15 +251,17 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
                                             struct jset_entry *entry,
                                             int write)
 {
-       struct bkey_i *k;
+       struct bkey_i *k = entry->start;
 
-       vstruct_for_each(entry, k) {
+       while (k != vstruct_last(entry)) {
                int ret = journal_validate_key(c, jset, entry,
                                               entry->level,
                                               entry->btree_id,
                                               k, "key", write);
-               if (ret)
-                       return ret;
+               if (ret == FSCK_DELETED_KEY)
+                       continue;
+
+               k = bkey_next(k);
        }
 
        return 0;
@@ -432,46 +455,45 @@ static int jset_validate(struct bch_fs *c,
                        "%s sector %llu seq %llu: unknown journal entry version %u",
                        ca->name, sector, le64_to_cpu(jset->seq),
                        version)) {
-               /* XXX: note we might have missing journal entries */
-               return JOURNAL_ENTRY_BAD;
+               /* don't try to continue: */
+               return EINVAL;
        }
 
+       if (bytes > (sectors_read << 9) &&
+           sectors_read < bucket_sectors_left)
+               return JOURNAL_ENTRY_REREAD;
+
        if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
                        "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
                        ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
-               /* XXX: note we might have missing journal entries */
-               return JOURNAL_ENTRY_BAD;
+               ret = JOURNAL_ENTRY_BAD;
+               le32_add_cpu(&jset->u64s,
+                            -((bytes - (bucket_sectors_left << 9)) / 8));
        }
 
-       if (bytes > sectors_read << 9)
-               return JOURNAL_ENTRY_REREAD;
-
        if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
                        ca->name, sector, le64_to_cpu(jset->seq),
-                       JSET_CSUM_TYPE(jset)))
-               return JOURNAL_ENTRY_BAD;
+                       JSET_CSUM_TYPE(jset))) {
+               ret = JOURNAL_ENTRY_BAD;
+               goto bad_csum_type;
+       }
 
        csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
        if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
                                 "%s sector %llu seq %llu: journal checksum bad",
-                                ca->name, sector, le64_to_cpu(jset->seq))) {
-               /* XXX: retry IO, when we start retrying checksum errors */
-               /* XXX: note we might have missing journal entries */
-               return JOURNAL_ENTRY_BAD;
-       }
+                                ca->name, sector, le64_to_cpu(jset->seq)))
+               ret = JOURNAL_ENTRY_BAD;
 
        bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                     jset->encrypted_start,
                     vstruct_end(jset) - (void *) jset->encrypted_start);
-
+bad_csum_type:
        if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
                                 "invalid journal entry: last_seq > seq")) {
                jset->last_seq = jset->seq;
                return JOURNAL_ENTRY_BAD;
        }
-
-       return 0;
 fsck_err:
        return ret;
 }
@@ -939,24 +961,29 @@ static void journal_write_done(struct closure *cl)
        struct bch_replicas_padded replicas;
        u64 seq = le64_to_cpu(w->data->seq);
        u64 last_seq = le64_to_cpu(w->data->last_seq);
+       int err = 0;
 
        bch2_time_stats_update(j->write_time, j->write_start_time);
 
        if (!devs.nr) {
                bch_err(c, "unable to write journal to sufficient devices");
-               goto err;
+               err = -EIO;
+       } else {
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
+               if (bch2_mark_replicas(c, &replicas.e))
+                       err = -EIO;
        }
 
-       bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
-
-       if (bch2_mark_replicas(c, &replicas.e))
-               goto err;
+       if (err)
+               bch2_fatal_error(c);
 
        spin_lock(&j->lock);
        if (seq >= j->pin.front)
                journal_seq_pin(j, seq)->devs = devs;
 
        j->seq_ondisk           = seq;
+       if (err && (!j->err_seq || seq < j->err_seq))
+               j->err_seq      = seq;
        j->last_seq_ondisk      = last_seq;
        bch2_journal_space_available(j);
 
@@ -968,7 +995,7 @@ static void journal_write_done(struct closure *cl)
         * bch2_fs_journal_stop():
         */
        mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
-out:
+
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);
 
@@ -982,11 +1009,6 @@ out:
        if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
                mod_delayed_work(system_freezable_wq, &j->write_work, 0);
        spin_unlock(&j->lock);
-       return;
-err:
-       bch2_fatal_error(c);
-       spin_lock(&j->lock);
-       goto out;
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1067,6 +1089,9 @@ void bch2_journal_write(struct closure *cl)
        SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
        SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
+       if (journal_entry_empty(jset))
+               j->last_empty_seq = le64_to_cpu(jset->seq);
+
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
                validate_before_checksum = true;
 
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 18e45296e7def4d2636246c504fb77966574bba0..7a04d06bb3426e2d53dfac40c481017b95920b13 100644
@@ -263,6 +263,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+               BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }
@@ -547,6 +548,12 @@ void bch2_journal_reclaim(struct journal *j)
 
                if (j->prereserved.reserved * 2 > j->prereserved.remaining)
                        min_nr = 1;
+
+               if ((atomic_read(&c->btree_cache.dirty) * 4 >
+                    c->btree_cache.used  * 3) ||
+                   (c->btree_key_cache.nr_dirty * 4 >
+                    c->btree_key_cache.nr_keys))
+                       min_nr = 1;
        } while (journal_flush_pins(j, seq_to_flush, min_nr));
 
        if (!bch2_journal_error(j))
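
The new trigger ties reclaim to the dirty counters introduced earlier, forcing at least one pin flush per pass once either cache gets too dirty; the thresholds are written multiplication-only to avoid division:

    atomic_read(&c->btree_cache.dirty) * 4 > c->btree_cache.used * 3
        <=>  more than 3/4 of cached btree nodes are dirty
    c->btree_key_cache.nr_dirty * 4 > c->btree_key_cache.nr_keys
        <=>  more than 1/4 of cached keys are dirty
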
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 154b51b891d33f1f6f89888e7afeef387392b75a..9757e3d55991ed8b29b2ca7cb9c745326ef4c6cc 100644
@@ -127,7 +127,6 @@ enum {
        JOURNAL_STARTED,
        JOURNAL_RECLAIM_STARTED,
        JOURNAL_NEED_WRITE,
-       JOURNAL_NOT_EMPTY,
        JOURNAL_MAY_GET_UNRESERVED,
 };
 
@@ -181,6 +180,8 @@ struct journal {
        /* seq, last_seq from the most recent journal entry successfully written */
        u64                     seq_ondisk;
        u64                     last_seq_ondisk;
+       u64                     err_seq;
+       u64                     last_empty_seq;
 
        /*
         * FIFO of journal entries whose btree updates have not yet been
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 1745cfac6b26aef400984100b5d82193dec1ad88..6750063663b5f888e39b77f3b514ce3ef12b0975 100644
@@ -456,6 +456,7 @@ retry:
                __bch2_btree_iter_set_pos(split_iter, split->k.p, false);
                bch2_trans_update(&trans, split_iter, split,
                                  BTREE_TRIGGER_NORUN);
+               bch2_trans_iter_put(&trans, split_iter);
 
                bch2_btree_iter_set_pos(iter, split->k.p);
 
@@ -481,6 +482,8 @@ retry:
                                BTREE_INSERT_LAZY_RW|
                                BTREE_INSERT_JOURNAL_REPLAY);
 err:
+       bch2_trans_iter_put(&trans, iter);
+
        if (ret == -EINTR)
                goto retry;
 
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index d7ad293aff4dfd7cb77c3dd228a44a17639f6790..58c00e26ebe8a62fb5aff3a40354336826db1bc9 100644
@@ -458,7 +458,7 @@ STORE(bch2_fs)
        /* Debugging: */
 
        if (attr == &sysfs_trigger_journal_flush)
-               bch2_journal_meta_async(&c->journal, NULL);
+               bch2_journal_meta(&c->journal);
 
        if (attr == &sysfs_trigger_btree_coalesce)
                bch2_coalesce(c);