Update bcachefs sources to e1d0fb8c5f bcachefs: Don't require flush/fua on every...
author    Kent Overstreet <kent.overstreet@gmail.com>
          Fri, 4 Dec 2020 18:41:49 +0000 (13:41 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Fri, 4 Dec 2020 18:45:33 +0000 (13:45 -0500)
28 files changed:
.bcachefs_revision
include/linux/list.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/ec.c
libbcachefs/error.h
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_seq_blacklist.c
libbcachefs/journal_types.h
libbcachefs/recovery.c
libbcachefs/replicas.c
libbcachefs/super-io.c
libbcachefs/super.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8f42b13966f1e2b34b81f1e4a3cde3c723ba437f..2cf8031141af2169ba5aaab32181347ede1aa78d 100644
@@ -1 +1 @@
-00104032654027a8f4406a82d28911b243f19d94
+e1d0fb8c5fbc70df1007ebf5d9ab03018dc05275
diff --git a/include/linux/list.h b/include/linux/list.h
index 4a317090621c4eda065d9e70b11493b4e5c1c479..3639dc997ed43de7ca1359ae3dc5dee299937083 100644
@@ -26,7 +26,6 @@
 #define list_for_each_entry(p, h, m)   cds_list_for_each_entry(p, h, m)
 #define list_for_each_entry_reverse(p, h, m) cds_list_for_each_entry_reverse(p, h, m)
 #define list_for_each_entry_safe(p, n, h, m) cds_list_for_each_entry_safe(p, n, h, m)
-#define list_for_each_entry_safe_reverse(p, n, h, m) cds_list_for_each_entry_safe_reverse(p, n, h, m)
 
 static inline int list_empty_careful(const struct list_head *head)
 {
@@ -54,6 +53,15 @@ static inline void list_splice_init(struct list_head *list,
 #define list_first_entry_or_null(ptr, type, member) \
        (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
 
+#define list_prev_entry(pos, member) \
+       list_entry((pos)->member.prev, typeof(*(pos)), member)
+
+#define list_for_each_entry_safe_reverse(pos, n, head, member)         \
+       for (pos = list_last_entry(head, typeof(*pos), member),         \
+               n = list_prev_entry(pos, member);                       \
+            &pos->member != (head);                                    \
+            pos = n, n = list_prev_entry(n, member))
+
 /* hlists: */
 
 #include <urcu/hlist.h>
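
The macro added above walks a list tail-to-head while keeping a pointer to the previous entry, so the current entry can be deleted mid-walk. A minimal usage sketch (struct foo and drop_all_reverse are hypothetical; the list helpers are the ones this userspace header wraps, and free() is plain stdlib):

        struct foo {
                int                     val;
                struct list_head        list;
        };

        static void drop_all_reverse(struct list_head *head)
        {
                struct foo *pos, *n;

                /* n already points at the previous entry, so freeing pos is safe: */
                list_for_each_entry_safe_reverse(pos, n, head, list) {
                        list_del(&pos->list);
                        free(pos);
                }
        }

journal_io.c below relies on exactly this property when it frees newer non-flush entries while scanning the replay list backwards.
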
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 6d54defcee58b5e9c341de2b4568e52aa6bbfe7a..eb5b4080477388f7d2b0dde43259b70b488f4b96 100644
         dynamic_fault("bcachefs:meta:write:" name)
 
 #ifdef __KERNEL__
-#define bch2_fmt(_c, fmt)      "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt(_c, fmt)              "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt_inum(_c, _inum, fmt)  "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
 #else
-#define bch2_fmt(_c, fmt)      fmt "\n"
+#define bch2_fmt(_c, fmt)              fmt "\n"
+#define bch2_fmt_inum(_c, _inum, fmt)  "inum %llu: " fmt "\n", (_inum)
 #endif
 
 #define bch_info(c, fmt, ...) \
        printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_warn_ratelimited(c, fmt, ...) \
        printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err(c, fmt, ...) \
        printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+
 #define bch_err_ratelimited(c, fmt, ...) \
        printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+       printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
 
 #define bch_verbose(c, fmt, ...)                                       \
 do {                                                                   \
@@ -668,6 +673,7 @@ struct bch_fs {
        unsigned                bucket_size_max;
 
        atomic64_t              sectors_available;
+       struct mutex            sectors_available_lock;
 
        struct bch_fs_pcpu __percpu     *pcpu;
 
@@ -675,7 +681,7 @@ struct bch_fs {
 
        seqcount_t                      usage_lock;
        struct bch_fs_usage             *usage_base;
-       struct bch_fs_usage __percpu    *usage[2];
+       struct bch_fs_usage __percpu    *usage[JOURNAL_BUF_NR];
        struct bch_fs_usage __percpu    *usage_gc;
 
        /* single element mempool: */
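
For reference, the new inum-tagged logging macro simply prepends the filesystem name and inode number to the message. A call such as (names hypothetical; inum is a u64):

        bch_err_inum_ratelimited(c, inum, "read error %i", ret);

expands via bch2_fmt_inum() to roughly:

        printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu): read error %i\n",
                           c->name, inum, ret);

which is what the io.c and fs-io.c changes below switch to in place of __bcache_io_error().
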
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 94b5418587e364591b29643fea73b3eb53a5103a..02a76c3d3acb7839d83d94d44e107e76aa8787d1 100644
@@ -1332,14 +1332,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,       struct bch_sb, flags[3],  0, 16);
        x(extents_above_btree_updates,  12)     \
        x(btree_updates_journalled,     13)     \
        x(reflink_inline_data,          14)     \
-       x(new_varint,                   15)
+       x(new_varint,                   15)     \
+       x(journal_no_flush,             16)
 
 #define BCH_SB_FEATURES_ALL                            \
        ((1ULL << BCH_FEATURE_new_siphash)|             \
         (1ULL << BCH_FEATURE_new_extent_overwrite)|    \
         (1ULL << BCH_FEATURE_btree_ptr_v2)|            \
         (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
-        (1ULL << BCH_FEATURE_new_varint))\
+        (1ULL << BCH_FEATURE_new_varint)|              \
+        (1ULL << BCH_FEATURE_journal_no_flush))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1575,6 +1577,7 @@ struct jset {
 
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH,    struct jset, flags, 5, 6);
 
 #define BCH_JOURNAL_BUCKETS_MIN                8
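
The new feature bit is generated from the x() list above and marks filesystems whose journal may contain non-flushed entries. A sketch of how such a bit is tested, assuming the BCH_FEATURE_* enum from this header and a struct bch_sb *sb with the features[] layout used elsewhere in this file:

        if (le64_to_cpu(sb->features[0]) & (1ULL << BCH_FEATURE_journal_no_flush)) {
                /* journal may end in JSET_NO_FLUSH entries; recovery must
                 * find the most recent flush entry (see journal_io.c below) */
        }
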
 
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index ac81c9b9a06abba73f5411536600fdbe0df7f865..6268ea637d19f7a717b1a3fafd776bd306643011 100644
@@ -603,7 +603,6 @@ static int bch2_gc_done(struct bch_fs *c,
                struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
                struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
                struct stripe *dst, *src;
-               unsigned i;
 
                c->ec_stripes_heap.used = 0;
 
@@ -651,8 +650,8 @@ static int bch2_gc_done(struct bch_fs *c,
                }
        };
 
-       bch2_fs_usage_acc_to_base(c, 0);
-       bch2_fs_usage_acc_to_base(c, 1);
+       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+               bch2_fs_usage_acc_to_base(c, i);
 
        bch2_dev_usage_from_buckets(c);
 
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 2406745fb3659aa427c52af3210f176ff478e26d..9b19432ae7a590f81072121d23a6d90c3a85979f 100644
@@ -597,18 +597,25 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
                bch2_btree_iter_reinit_node(iter, b);
 }
 
+static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
+                         struct btree *b)
+{
+       pr_buf(out, "%s level %u/%u\n  ",
+              bch2_btree_ids[b->c.btree_id],
+              b->c.level,
+              c->btree_roots[b->c.btree_id].level);
+       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
 static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
                          struct btree *b, struct bset *i,
                          unsigned offset, int write)
 {
-       pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n"
-              "pos ",
-              write ? "before write " : "",
-              b->c.btree_id, b->c.level,
-              c->btree_roots[b->c.btree_id].level);
-       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+       pr_buf(out, "error validating btree node %sat btree ",
+              write ? "before write " : "");
+       btree_pos_to_text(out, c, b);
 
-       pr_buf(out, " node offset %u", b->written);
+       pr_buf(out, "\n  node offset %u", b->written);
        if (i)
                pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
 }
@@ -1104,6 +1111,8 @@ static void btree_node_read_work(struct work_struct *work)
        struct btree *b         = rb->bio.bi_private;
        struct bio *bio         = &rb->bio;
        struct bch_io_failures failed = { .nr = 0 };
+       char buf[200];
+       struct printbuf out;
        bool can_retry;
 
        goto start;
@@ -1123,8 +1132,10 @@ static void btree_node_read_work(struct work_struct *work)
                        bio->bi_status = BLK_STS_REMOVED;
                }
 start:
-               bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s",
-                                  bch2_blk_status_to_str(bio->bi_status));
+               out = PBUF(buf);
+               btree_pos_to_text(&out, c, b);
+               bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+                                  bch2_blk_status_to_str(bio->bi_status), buf);
                if (rb->have_ioref)
                        percpu_ref_put(&ca->io_ref);
                rb->have_ioref = false;
@@ -1408,7 +1419,7 @@ static void btree_node_write_endio(struct bio *bio)
        if (wbio->have_ioref)
                bch2_latency_acct(ca, wbio->submit_time, WRITE);
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("btree")) {
                spin_lock_irqsave(&c->btree_write_error_lock, flags);
@@ -1488,6 +1499,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (!btree_node_may_write(b))
                        return;
 
+               if (old & (1 << BTREE_NODE_never_write))
+                       return;
+
                if (old & (1 << BTREE_NODE_write_in_flight)) {
                        btree_node_wait_on_io(b);
                        continue;
@@ -1534,6 +1548,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                seq = max(seq, le64_to_cpu(i->journal_seq));
        }
 
+       BUG_ON(b->written && !seq);
+
        /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
        bytes += 8;
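
The error paths above use bcachefs' printbuf helpers: PBUF() wraps a stack buffer and pr_buf() appends formatted text to it. A sketch of the pattern, assuming PBUF()/pr_buf() as defined in the tree's util.h:

        char buf[200];
        struct printbuf out = PBUF(buf);

        btree_pos_to_text(&out, c, b);  /* appends btree id, level and key */
        pr_buf(&out, " offset %u", b->written);

        /* buf now holds the accumulated NUL-terminated string: */
        printk(KERN_ERR "error at %s\n", buf);
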
 
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 15af60e9282051261585c53ec2eb82323ac2c260..dc7de27112c66d2c7b4c707eb99b767fd479b101 100644
@@ -415,6 +415,7 @@ enum btree_flags {
        BTREE_NODE_fake,
        BTREE_NODE_old_extent_overwrite,
        BTREE_NODE_need_rewrite,
+       BTREE_NODE_never_write,
 };
 
 BTREE_FLAG(read_in_flight);
@@ -429,6 +430,7 @@ BTREE_FLAG(dying);
 BTREE_FLAG(fake);
 BTREE_FLAG(old_extent_overwrite);
 BTREE_FLAG(need_rewrite);
+BTREE_FLAG(never_write);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
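
Each BTREE_FLAG() invocation generates test/set/clear helpers over b->flags, which is why the interior-update code below can call set_btree_node_never_write() without any further declaration. Assuming the usual definition of BTREE_FLAG() earlier in this header, BTREE_FLAG(never_write) expands to roughly:

        static inline bool btree_node_never_write(struct btree *b)
        {
                return test_bit(BTREE_NODE_never_write, &b->flags);
        }

        static inline void set_btree_node_never_write(struct btree *b)
        {
                set_bit(BTREE_NODE_never_write, &b->flags);
        }

        static inline void clear_btree_node_never_write(struct btree *b)
        {
                clear_bit(BTREE_NODE_never_write, &b->flags);
        }
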
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index edc11c22308c1dfa4de71cce501604d4da08fcea..4a169d36653832204a09ac5cd86b1ff48e06d783 100644
@@ -603,17 +603,30 @@ err:
 
                list_del(&as->write_blocked_list);
 
-               if (!ret && as->b == b) {
+               /*
+                * Node might have been freed, recheck under
+                * btree_interior_update_lock:
+                */
+               if (as->b == b) {
                        struct bset *i = btree_bset_last(b);
 
                        BUG_ON(!b->c.level);
                        BUG_ON(!btree_node_dirty(b));
 
-                       i->journal_seq = cpu_to_le64(
-                               max(journal_seq,
-                                   le64_to_cpu(i->journal_seq)));
-
-                       bch2_btree_add_journal_pin(c, b, journal_seq);
+                       if (!ret) {
+                               i->journal_seq = cpu_to_le64(
+                                       max(journal_seq,
+                                           le64_to_cpu(i->journal_seq)));
+
+                               bch2_btree_add_journal_pin(c, b, journal_seq);
+                       } else {
+                               /*
+                                * If we didn't get a journal sequence number we
+                                * can't write this btree node, because recovery
+                                * won't know to ignore this write:
+                                */
+                               set_btree_node_never_write(b);
+                       }
                }
 
                mutex_unlock(&c->btree_interior_update_lock);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 496855233c4c82a962c5ee6b91a8e0419d51e9ae..e7816afe4a08b0aa4573aff152b073f22cd90d1d 100644
@@ -649,13 +649,13 @@ int bch2_trans_commit_error(struct btree_trans *trans,
        case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
                bch2_trans_unlock(trans);
 
-               while (bch2_btree_key_cache_must_wait(c)) {
+               do {
                        mutex_lock(&c->journal.reclaim_lock);
-                       bch2_journal_reclaim(&c->journal);
+                       ret = bch2_journal_reclaim(&c->journal);
                        mutex_unlock(&c->journal.reclaim_lock);
-               }
+               } while (!ret && bch2_btree_key_cache_must_wait(c));
 
-               if (bch2_trans_relock(trans))
+               if (!ret && bch2_trans_relock(trans))
                        return 0;
 
                trace_trans_restart_journal_reclaim(trans->ip);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 1b1200c551346ddb73996de4d3653029917a1776..0000fc76d2d9c31c72576de743851d2888b03cfc 100644
@@ -142,8 +142,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
        percpu_down_write(&c->mark_lock);
        usage = c->usage_base;
 
-       bch2_fs_usage_acc_to_base(c, 0);
-       bch2_fs_usage_acc_to_base(c, 1);
+       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+               bch2_fs_usage_acc_to_base(c, i);
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++)
                usage->reserved += usage->persistent_reserved[i];
@@ -207,13 +207,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
 {
        return this_cpu_ptr(gc
                            ? c->usage_gc
-                           : c->usage[journal_seq & 1]);
+                           : c->usage[journal_seq & JOURNAL_BUF_MASK]);
 }
 
 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 {
        ssize_t offset = v - (u64 *) c->usage_base;
-       unsigned seq;
+       unsigned i, seq;
        u64 ret;
 
        BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
@@ -221,9 +221,10 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 
        do {
                seq = read_seqcount_begin(&c->usage_lock);
-               ret = *v +
-                       percpu_u64_get((u64 __percpu *) c->usage[0] + offset) +
-                       percpu_u64_get((u64 __percpu *) c->usage[1] + offset);
+               ret = *v;
+
+               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+                       ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
        } while (read_seqcount_retry(&c->usage_lock, seq));
 
        return ret;
@@ -232,7 +233,7 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
 {
        struct bch_fs_usage *ret;
-       unsigned seq, v, u64s = fs_usage_u64s(c);
+       unsigned seq, i, v, u64s = fs_usage_u64s(c);
 retry:
        ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
        if (unlikely(!ret))
@@ -251,8 +252,8 @@ retry:
        do {
                seq = read_seqcount_begin(&c->usage_lock);
                memcpy(ret, c->usage_base, u64s * sizeof(u64));
-               acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
-               acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s);
+               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+                       acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[i], u64s);
        } while (read_seqcount_retry(&c->usage_lock, seq));
 
        return ret;
@@ -262,7 +263,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
 {
        unsigned u64s = fs_usage_u64s(c);
 
-       BUG_ON(idx >= 2);
+       BUG_ON(idx >= ARRAY_SIZE(c->usage));
 
        preempt_disable();
        write_seqcount_begin(&c->usage_lock);
@@ -2031,13 +2032,6 @@ int bch2_trans_mark_update(struct btree_trans *trans,
 
 /* Disk reservations: */
 
-static u64 bch2_recalc_sectors_available(struct bch_fs *c)
-{
-       percpu_u64_set(&c->pcpu->sectors_available, 0);
-
-       return avail_factor(__bch2_fs_usage_read_short(c).free);
-}
-
 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
 {
        percpu_down_read(&c->mark_lock);
@@ -2072,7 +2066,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 
                if (get < sectors) {
                        preempt_enable();
-                       percpu_up_read(&c->mark_lock);
                        goto recalculate;
                }
        } while ((v = atomic64_cmpxchg(&c->sectors_available,
@@ -2090,9 +2083,10 @@ out:
        return 0;
 
 recalculate:
-       percpu_down_write(&c->mark_lock);
+       mutex_lock(&c->sectors_available_lock);
 
-       sectors_available = bch2_recalc_sectors_available(c);
+       percpu_u64_set(&c->pcpu->sectors_available, 0);
+       sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
 
        if (sectors <= sectors_available ||
            (flags & BCH_DISK_RESERVATION_NOFAIL)) {
@@ -2106,7 +2100,8 @@ recalculate:
                ret = -ENOSPC;
        }
 
-       percpu_up_write(&c->mark_lock);
+       mutex_unlock(&c->sectors_available_lock);
+       percpu_up_read(&c->mark_lock);
 
        return ret;
 }
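
The locking change above means the slow path no longer upgrades mark_lock from read to write; it keeps the read lock and serializes recalculation on the new sectors_available_lock. In outline (a sketch with details elided; reserve_fast_path() is a hypothetical name for the per-cpu/atomic path shown in the hunk above):

        percpu_down_read(&c->mark_lock);

        if (reserve_fast_path(c, res, sectors))
                goto out;                       /* per-cpu pool covered it */

        mutex_lock(&c->sectors_available_lock); /* slow path: recalculate */
        percpu_u64_set(&c->pcpu->sectors_available, 0);
        sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
        /* take from sectors_available, or fail with -ENOSPC */
        mutex_unlock(&c->sectors_available_lock);
out:
        percpu_up_read(&c->mark_lock);

Readers of mark_lock are no longer blocked while a reservation recalculates; only concurrent recalculations serialize against each other.
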
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index d7ba0e7fc3b3825e60416cae62acdd708125e96c..c409a4260f11ceac8e4ee2aa9dc4c6de73160e61 100644
@@ -264,7 +264,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
                                             len << 9);
 
                        if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
-                               __bcache_io_error(c,
+                               bch_err_ratelimited(c,
                                        "checksum error while doing reconstruct read (%u:%u)",
                                        i, j);
                                clear_bit(i, buf->valid);
@@ -305,7 +305,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
        unsigned bytes = buf->size << 9;
 
        if (ec_nr_failed(buf) > v->nr_redundant) {
-               __bcache_io_error(c,
+               bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                return -1;
        }
@@ -326,7 +326,7 @@ static void ec_block_endio(struct bio *bio)
        struct bch_dev *ca = ec_bio->ca;
        struct closure *cl = bio->bi_private;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
                               bio_data_dir(bio) ? "write" : "read",
                               bch2_blk_status_to_str(bio->bi_status)))
                clear_bit(ec_bio->idx, ec_bio->buf->valid);
@@ -420,7 +420,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
                                   BTREE_ITER_SLOTS);
        k = bch2_btree_iter_peek_slot(iter);
        if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
-               __bcache_io_error(c,
+               bch_err_ratelimited(c,
                        "error doing reconstruct read: stripe not found");
                kfree(buf);
                return bch2_trans_exit(&trans) ?: -EIO;
@@ -462,7 +462,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
                if (ptr_stale(ca, ptr)) {
-                       __bcache_io_error(c,
+                       bch_err_ratelimited(c,
                                          "error doing reconstruct read: stale pointer");
                        clear_bit(i, buf->valid);
                        continue;
@@ -474,7 +474,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
        closure_sync(&cl);
 
        if (ec_nr_failed(buf) > v->nr_redundant) {
-               __bcache_io_error(c,
+               bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                ret = -EIO;
                goto err;
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 94b53312fbbda9b61e2f192f34f413ff88bcbeff..0e49fd728e440cb5be02bf1da3e399fa52e3e9f0 100644
@@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *);
 /* Logs message and handles the error: */
 #define bch2_dev_io_error(ca, fmt, ...)                                        \
 do {                                                                   \
-       printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs,                  \
-               "IO error on %s for " fmt),                             \
+       printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt,              \
                (ca)->name, ##__VA_ARGS__);                             \
        bch2_io_error(ca);                                              \
 } while (0)
 
+#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...)           \
+do {                                                                   \
+       printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\
+               (ca)->name, (_inum), (_offset), ##__VA_ARGS__);         \
+       bch2_io_error(ca);                                              \
+} while (0)
+
 #define bch2_dev_io_err_on(cond, ca, ...)                              \
 ({                                                                     \
        bool _ret = (cond);                                             \
@@ -196,16 +202,13 @@ do {                                                                      \
        _ret;                                                           \
 })
 
-/* kill? */
-
-#define __bcache_io_error(c, fmt, ...)                                 \
-       printk_ratelimited(KERN_ERR bch2_fmt(c,                         \
-                       "IO error: " fmt), ##__VA_ARGS__)
-
-#define bcache_io_error(c, bio, fmt, ...)                              \
-do {                                                                   \
-       __bcache_io_error(c, fmt, ##__VA_ARGS__);                       \
-       (bio)->bi_status = BLK_STS_IOERR;                                       \
-} while (0)
+#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...)         \
+({                                                                     \
+       bool _ret = (cond);                                             \
+                                                                       \
+       if (_ret)                                                       \
+               bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\
+       _ret;                                                           \
+})
 
 #endif /* _BCACHEFS_ERROR_H */
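
The new inum-aware wrapper is used like the existing bch2_dev_io_err_on(), but tags the message with inode and offset. A usage sketch matching how io.c below calls it (inum and offset come from the read or write position):

        if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
                                    inum, offset,
                                    "data read error: %s",
                                    bch2_blk_status_to_str(bio->bi_status))) {
                /* error was logged and counted against ca; handle the failure */
        }
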
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 8e6f7300e9b61421931230b73250a4aab8dd40d1..8170e93ca4d487497aeb65ed7f5fa947824405f1 100644
@@ -856,7 +856,9 @@ retry:
                goto retry;
 
        if (ret) {
-               bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+               bch_err_inum_ratelimited(c, inum,
+                               "read error %i from btree lookup", ret);
+               rbio->bio.bi_status = BLK_STS_IOERR;
                bio_endio(&rbio->bio);
        }
 
@@ -1013,6 +1015,8 @@ static void bch2_writepage_io_done(struct closure *cl)
        unsigned i;
 
        if (io->op.error) {
+               set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
                bio_for_each_segment_all(bvec, bio, iter) {
                        struct bch_page_state *s;
 
@@ -1902,7 +1906,13 @@ loop:
 
                bio_for_each_segment_all(bv, bio, iter)
                        put_page(bv->bv_page);
-               if (!dio->iter.count || dio->op.error)
+
+               if (dio->op.error) {
+                       set_bit(EI_INODE_ERROR, &inode->ei_flags);
+                       break;
+               }
+
+               if (!dio->iter.count)
                        break;
 
                bio_reset(bio);
@@ -2290,7 +2300,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
        if (ret)
                goto err;
 
-       BUG_ON(inode->v.i_size < inode_u.bi_size);
+       WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+               inode->v.i_size < inode_u.bi_size);
 
        if (iattr->ia_size > inode->v.i_size) {
                ret = bch2_extend(inode, &inode_u, iattr);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index f3f6fe6c776a11a03020b64fe571f913d9f5004c..e3edca4d265b60fe094905e3f74b0d415d4fd6a0 100644
@@ -1151,6 +1151,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
        inode->v.i_generation   = bi->bi_generation;
        inode->v.i_size         = bi->bi_size;
 
+       inode->ei_flags         = 0;
        inode->ei_journal_seq   = 0;
        inode->ei_quota_reserved = 0;
        inode->ei_str_hash      = bch2_hash_info_init(c, bi);
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 4ee1ac994420c6a1a092d6bf748c4b182fae7b63..3df85ffb450ccbdb87cbc809d6c5e0efdb2279f1 100644
@@ -33,6 +33,7 @@ void bch2_pagecache_block_get(struct pagecache_lock *);
 
 struct bch_inode_info {
        struct inode            v;
+       unsigned long           ei_flags;
 
        struct mutex            ei_update_lock;
        u64                     ei_journal_seq;
@@ -50,6 +51,12 @@ struct bch_inode_info {
        struct bch_inode_unpacked ei_inode;
 };
 
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR                 0
+
 #define to_bch_ei(_inode)                                      \
        container_of_or_null(_inode, struct bch_inode_info, v)
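
ei_flags is a plain unsigned long manipulated with the standard atomic bitops; EI_INODE_ERROR is a bit number, not a mask. A sketch of the usage seen in fs-io.c above:

        set_bit(EI_INODE_ERROR, &inode->ei_flags);      /* on write error */

        if (!test_bit(EI_INODE_ERROR, &inode->ei_flags))
                WARN_ON(inode->v.i_size < inode_u.bi_size); /* only assert when consistent */
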
 
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 46856c669f52ac1edf5f626be80fb2cc6a46ec63..3489605e0127907d12471e41aa133565175361ec 100644
@@ -576,7 +576,8 @@ static void __bch2_write_index(struct bch_write_op *op)
                op->written += sectors_start - keylist_sectors(keys);
 
                if (ret) {
-                       __bcache_io_error(c, "btree IO error %i", ret);
+                       bch_err_inum_ratelimited(c, op->pos.inode,
+                               "write error %i from btree update", ret);
                        op->error = ret;
                }
        }
@@ -621,7 +622,10 @@ static void bch2_write_endio(struct bio *bio)
        struct bch_fs *c                = wbio->c;
        struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+       if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+                                   op->pos.inode,
+                                   op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
+                                   "data write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)))
                set_bit(wbio->dev, op->failed.d);
 
@@ -1279,15 +1283,14 @@ void bch2_write(struct closure *cl)
        wbio_init(bio)->put_bio = false;
 
        if (bio_sectors(bio) & (c->opts.block_size - 1)) {
-               __bcache_io_error(c, "misaligned write");
+               bch_err_inum_ratelimited(c, op->pos.inode,
+                                        "misaligned write");
                op->error = -EIO;
                goto err;
        }
 
        if (c->opts.nochanges ||
            !percpu_ref_tryget(&c->writes)) {
-               if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
-                       __bcache_io_error(c, "read only");
                op->error = -EROFS;
                goto err;
        }
@@ -1716,7 +1719,8 @@ retry:
         * reading a btree node
         */
        BUG_ON(!ret);
-       __bcache_io_error(c, "btree IO error: %i", ret);
+       bch_err_inum_ratelimited(c, inode,
+                       "read error %i from btree lookup", ret);
 err:
        rbio->bio.bi_status = BLK_STS_IOERR;
 out:
@@ -1920,17 +1924,15 @@ csum_err:
                return;
        }
 
-       bch2_dev_io_error(ca,
-               "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
-               rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+       bch2_dev_inum_io_error(ca, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+               "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
                rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
                csum.hi, csum.lo, crc.csum_type);
        bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
        return;
 decompression_err:
-       __bcache_io_error(c, "decompression error, inode %llu offset %llu",
-                         rbio->pos.inode,
-                         (u64) rbio->bvec_iter.bi_sector);
+       bch_err_inum_ratelimited(c, rbio->pos.inode,
+                                "decompression error");
        bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
        return;
 }
@@ -1952,7 +1954,14 @@ static void bch2_read_endio(struct bio *bio)
        if (!rbio->split)
                rbio->bio.bi_end_io = rbio->end_io;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
+       /*
+        * XXX: rbio->pos is not what we want here when reading from indirect
+        * extents
+        */
+       if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+                                   rbio->pos.inode,
+                                   rbio->pos.offset,
+                                   "data read error: %s",
                               bch2_blk_status_to_str(bio->bi_status))) {
                bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
                return;
@@ -2002,7 +2011,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
 
        if (k.k->type != KEY_TYPE_reflink_v &&
            k.k->type != KEY_TYPE_indirect_inline_data) {
-               __bcache_io_error(trans->c,
+               bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
                                "pointer to nonexistent indirect extent");
                ret = -EIO;
                goto err;
@@ -2048,7 +2057,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                goto hole;
 
        if (pick_ret < 0) {
-               __bcache_io_error(c, "no device to read from");
+               bch_err_inum_ratelimited(c, k.k->p.inode,
+                                        "no device to read from");
                goto err;
        }
 
@@ -2198,7 +2208,8 @@ get_bio:
 
        if (!rbio->pick.idx) {
                if (!rbio->have_ioref) {
-                       __bcache_io_error(c, "no device to read from");
+                       bch_err_inum_ratelimited(c, k.k->p.inode,
+                                                "no device to read from");
                        bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
                        goto out;
                }
@@ -2348,7 +2359,9 @@ err:
        if (ret == -EINTR)
                goto retry;
 
-       bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
+       bch_err_inum_ratelimited(c, inode,
+                                "read error %i from btree lookup", ret);
+       rbio->bio.bi_status = BLK_STS_IOERR;
        bch2_rbio_done(rbio);
        goto out;
 }
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index dd8db8c0c980544438355253ff5ed874ab31ff35..701521030c3d46bd92cab91e913795c7aa316bbf 100644
@@ -24,7 +24,7 @@ static u64 last_unwritten_seq(struct journal *j)
 
        lockdep_assert_held(&j->lock);
 
-       return journal_cur_seq(j) - s.prev_buf_unwritten;
+       return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
 }
 
 static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
@@ -52,7 +52,7 @@ journal_seq_to_buf(struct journal *j, u64 seq)
                j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
 
        if (journal_seq_unwritten(j, seq)) {
-               buf = j->buf + (seq & 1);
+               buf = j->buf + (seq & JOURNAL_BUF_MASK);
                EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
        }
        return buf;
@@ -80,6 +80,8 @@ static void bch2_journal_buf_init(struct journal *j)
        struct journal_buf *buf = journal_cur_buf(j);
 
        bkey_extent_init(&buf->key);
+       buf->noflush    = false;
+       buf->must_flush = false;
 
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
@@ -109,15 +111,8 @@ void bch2_journal_halt(struct journal *j)
 
 /* journal entry close/open: */
 
-void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+void __bch2_journal_buf_put(struct journal *j)
 {
-       if (!need_write_just_set &&
-           test_bit(JOURNAL_NEED_WRITE, &j->flags))
-               bch2_time_stats_update(j->delay_time,
-                                      j->need_write_time);
-
-       clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
        closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
@@ -130,7 +125,6 @@ static bool __journal_entry_close(struct journal *j)
        struct journal_buf *buf = journal_cur_buf(j);
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
-       bool set_need_write = false;
        unsigned sectors;
 
        lockdep_assert_held(&j->lock);
@@ -149,15 +143,13 @@ static bool __journal_entry_close(struct journal *j)
                if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
                        set_bit(JOURNAL_NEED_WRITE, &j->flags);
                        j->need_write_time = local_clock();
-                       set_need_write = true;
                }
 
-               if (new.prev_buf_unwritten)
-                       return false;
-
                new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
                new.idx++;
-               new.prev_buf_unwritten = 1;
+
+               if (new.idx == new.unwritten_idx)
+                       return false;
 
                BUG_ON(journal_state_count(new, new.idx));
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
@@ -191,24 +183,44 @@ static bool __journal_entry_close(struct journal *j)
         */
        buf->data->last_seq     = cpu_to_le64(journal_last_seq(j));
 
+       __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
+
        journal_pin_new_entry(j, 1);
 
        bch2_journal_buf_init(j);
 
        cancel_delayed_work(&j->write_work);
+       clear_bit(JOURNAL_NEED_WRITE, &j->flags);
 
        bch2_journal_space_available(j);
 
-       bch2_journal_buf_put(j, old.idx, set_need_write);
+       bch2_journal_buf_put(j, old.idx);
        return true;
 }
 
+static bool journal_entry_want_write(struct journal *j)
+{
+       union journal_res_state s = READ_ONCE(j->reservations);
+       bool ret = false;
+
+       /*
+        * Don't close it yet if we already have a write in flight, but do set
+        * NEED_WRITE:
+        */
+       if (s.idx != s.unwritten_idx)
+               set_bit(JOURNAL_NEED_WRITE, &j->flags);
+       else
+               ret = __journal_entry_close(j);
+
+       return ret;
+}
+
 static bool journal_entry_close(struct journal *j)
 {
        bool ret;
 
        spin_lock(&j->lock);
-       ret = __journal_entry_close(j);
+       ret = journal_entry_want_write(j);
        spin_unlock(&j->lock);
 
        return ret;
@@ -290,8 +302,8 @@ static int journal_entry_open(struct journal *j)
 
 static bool journal_quiesced(struct journal *j)
 {
-       union journal_res_state state = READ_ONCE(j->reservations);
-       bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
+       union journal_res_state s = READ_ONCE(j->reservations);
+       bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
 
        if (!ret)
                journal_entry_close(j);
@@ -318,17 +330,29 @@ static void journal_write_work(struct work_struct *work)
 u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 {
        size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
-       u64 seq = 0;
+       union journal_res_state s;
+       unsigned i;
+       u64 seq;
 
-       if (!test_bit(h, j->buf[0].has_inode) &&
-           !test_bit(h, j->buf[1].has_inode))
-               return 0;
 
        spin_lock(&j->lock);
-       if (test_bit(h, journal_cur_buf(j)->has_inode))
-               seq = journal_cur_seq(j);
-       else if (test_bit(h, journal_prev_buf(j)->has_inode))
-               seq = journal_cur_seq(j) - 1;
+       seq = journal_cur_seq(j);
+       s = READ_ONCE(j->reservations);
+       i = s.idx;
+
+       while (1) {
+               if (test_bit(h, j->buf[i].has_inode))
+                       goto out;
+
+               if (i == s.unwritten_idx)
+                       break;
+
+               i = (i - 1) & JOURNAL_BUF_MASK;
+               seq--;
+       }
+
+       seq = 0;
+out:
        spin_unlock(&j->lock);
 
        return seq;
@@ -553,7 +577,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
        struct journal_buf *buf;
        int ret = 0;
 
-       if (seq <= j->seq_ondisk)
+       if (seq <= j->flushed_seq_ondisk)
                return 1;
 
        spin_lock(&j->lock);
@@ -564,18 +588,55 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
                goto out;
        }
 
-       if (seq <= j->seq_ondisk) {
+       if (seq <= j->flushed_seq_ondisk) {
                ret = 1;
                goto out;
        }
 
-       if (parent &&
-           (buf = journal_seq_to_buf(j, seq)))
-               if (!closure_wait(&buf->wait, parent))
+       /* if seq was written, but not flushed - flush a newer one instead */
+       seq = max(seq, last_unwritten_seq(j));
+
+recheck_need_open:
+       if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+               struct journal_res res = { 0 };
+
+               spin_unlock(&j->lock);
+
+               ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+               if (ret)
+                       return ret;
+
+               seq = res.seq;
+               buf = j->buf + (seq & JOURNAL_BUF_MASK);
+               buf->must_flush = true;
+               set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+               if (parent && !closure_wait(&buf->wait, parent))
                        BUG();
 
+               bch2_journal_res_put(j, &res);
+
+               spin_lock(&j->lock);
+               goto want_write;
+       }
+
+       /*
+        * if write was kicked off without a flush, flush the next sequence
+        * number instead
+        */
+       buf = journal_seq_to_buf(j, seq);
+       if (buf->noflush) {
+               seq++;
+               goto recheck_need_open;
+       }
+
+       buf->must_flush = true;
+
+       if (parent && !closure_wait(&buf->wait, parent))
+               BUG();
+want_write:
        if (seq == journal_cur_seq(j))
-               __journal_entry_close(j);
+               journal_entry_want_write(j);
 out:
        spin_unlock(&j->lock);
        return ret;
@@ -864,15 +925,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
 static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
 {
        union journal_res_state state;
-       struct journal_buf *w;
-       bool ret;
+       bool ret = false;
+       unsigned i;
 
        spin_lock(&j->lock);
        state = READ_ONCE(j->reservations);
-       w = j->buf + !state.idx;
+       i = state.idx;
 
-       ret = state.prev_buf_unwritten &&
-               bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
+       while (i != state.unwritten_idx) {
+               i = (i - 1) & JOURNAL_BUF_MASK;
+               if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+                       ret = true;
+       }
        spin_unlock(&j->lock);
 
        return ret;
@@ -955,10 +1019,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        spin_lock(&j->lock);
 
        set_bit(JOURNAL_STARTED, &j->flags);
+       j->last_flush_write = jiffies;
 
        journal_pin_new_entry(j, 1);
 
-       j->reservations.idx = journal_cur_seq(j);
+       j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
 
        bch2_journal_buf_init(j);
 
@@ -1013,8 +1078,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-       kvpfree(j->buf[1].data, j->buf[1].buf_size);
-       kvpfree(j->buf[0].data, j->buf[0].buf_size);
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+               kvpfree(j->buf[i].data, j->buf[i].buf_size);
        free_fifo(&j->pin);
 }
 
@@ -1022,6 +1089,7 @@ int bch2_fs_journal_init(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        static struct lock_class_key res_key;
+       unsigned i;
        int ret = 0;
 
        pr_verbose_init(c->opts, "");
@@ -1036,8 +1104,6 @@ int bch2_fs_journal_init(struct journal *j)
 
        lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
 
-       j->buf[0].buf_size      = JOURNAL_ENTRY_SIZE_MIN;
-       j->buf[1].buf_size      = JOURNAL_ENTRY_SIZE_MIN;
        j->write_delay_ms       = 1000;
        j->reclaim_delay_ms     = 100;
 
@@ -1049,13 +1115,20 @@ int bch2_fs_journal_init(struct journal *j)
                ((union journal_res_state)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
 
-       if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
-           !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
-           !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
+       if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
                ret = -ENOMEM;
                goto out;
        }
 
+       for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+               j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+               j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+               if (!j->buf[i].data) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+       }
+
        j->pin.front = j->pin.back = 1;
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
@@ -1069,7 +1142,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        union journal_res_state s;
        struct bch_dev *ca;
-       unsigned iter;
+       unsigned i;
 
        rcu_read_lock();
        spin_lock(&j->lock);
@@ -1081,6 +1154,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               "last_seq:\t\t%llu\n"
               "last_seq_ondisk:\t%llu\n"
               "prereserved:\t\t%u/%u\n"
+              "nr flush writes:\t%llu\n"
+              "nr noflush writes:\t%llu\n"
               "nr direct reclaim:\t%llu\n"
               "nr background reclaim:\t%llu\n"
               "current entry sectors:\t%u\n"
@@ -1092,6 +1167,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               j->last_seq_ondisk,
               j->prereserved.reserved,
               j->prereserved.remaining,
+              j->nr_flush_writes,
+              j->nr_noflush_writes,
               j->nr_direct_reclaim,
               j->nr_background_reclaim,
               j->cur_entry_sectors,
@@ -1112,16 +1189,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        }
 
        pr_buf(out,
-              "current entry refs:\t%u\n"
-              "prev entry unwritten:\t",
-              journal_state_count(s, s.idx));
-
-       if (s.prev_buf_unwritten)
-               pr_buf(out, "yes, ref %u sectors %u\n",
-                      journal_state_count(s, !s.idx),
-                      journal_prev_buf(j)->sectors);
-       else
-               pr_buf(out, "no\n");
+              "current entry:\t\tidx %u refcount %u\n",
+              s.idx, journal_state_count(s, s.idx));
+
+       i = s.idx;
+       while (i != s.unwritten_idx) {
+               i = (i - 1) & JOURNAL_BUF_MASK;
+
+               pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
+                      i, journal_state_count(s, i), j->buf[i].sectors);
+       }
 
        pr_buf(out,
               "need write:\t\t%i\n"
@@ -1129,7 +1206,21 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               test_bit(JOURNAL_NEED_WRITE,     &j->flags),
               test_bit(JOURNAL_REPLAY_DONE,    &j->flags));
 
-       for_each_member_device_rcu(ca, c, iter,
+       pr_buf(out, "space:\n");
+       pr_buf(out, "\tdiscarded\t%u:%u\n",
+              j->space[journal_space_discarded].next_entry,
+              j->space[journal_space_discarded].total);
+       pr_buf(out, "\tclean ondisk\t%u:%u\n",
+              j->space[journal_space_clean_ondisk].next_entry,
+              j->space[journal_space_clean_ondisk].total);
+       pr_buf(out, "\tclean\t\t%u:%u\n",
+              j->space[journal_space_clean].next_entry,
+              j->space[journal_space_clean].total);
+       pr_buf(out, "\ttotal\t\t%u:%u\n",
+              j->space[journal_space_total].next_entry,
+              j->space[journal_space_total].total);
+
+       for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_journal]) {
                struct journal_device *ja = &ca->journal;
 
@@ -1139,12 +1230,13 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
                pr_buf(out,
                       "dev %u:\n"
                       "\tnr\t\t%u\n"
+                      "\tbucket size\t%u\n"
                       "\tavailable\t%u:%u\n"
-                      "\tdiscard_idx\t\t%u\n"
-                      "\tdirty_idx_ondisk\t%u (seq %llu)\n"
-                      "\tdirty_idx\t\t%u (seq %llu)\n"
+                      "\tdiscard_idx\t%u\n"
+                      "\tdirty_ondisk\t%u (seq %llu)\n"
+                      "\tdirty_idx\t%u (seq %llu)\n"
                       "\tcur_idx\t\t%u (seq %llu)\n",
-                      iter, ja->nr,
+                      i, ja->nr, ca->mi.bucket_size,
                       bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
                       ja->sectors_free,
                       ja->discard_idx,
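
A worked example of the ring arithmetic these hunks introduce (JOURNAL_BUF_NR = 4 and JOURNAL_BUF_MASK = 3, per the four buf*_count fields visible in journal.h below): with s.idx = 1 and s.unwritten_idx = 3, there are (1 - 3) & 3 = 2 unwritten entries, so last_unwritten_seq() returns journal_cur_seq(j) - 2, and walking newest to oldest visits buffers 0 and 3:

        unsigned i = s.idx;

        while (i != s.unwritten_idx) {
                i = (i - 1) & JOURNAL_BUF_MASK;
                /* j->buf[i] holds a journal entry not yet on disk */
        }
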
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 25c6876765ac4b09b05055898cd36e165c851ea0..a6ce03a724cba1e3ddb74cfe6f380bc19a9a5105 100644
@@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j)
        return j->buf + j->reservations.idx;
 }
 
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
-       return j->buf + !j->reservations.idx;
-}
-
 /* Sequence number of oldest dirty journal entry */
 
 static inline u64 journal_last_seq(struct journal *j)
@@ -141,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
 
 static inline u64 journal_cur_seq(struct journal *j)
 {
-       BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+       EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
        return j->pin.back - 1;
 }
@@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64);
 
 static inline int journal_state_count(union journal_res_state s, int idx)
 {
-       return idx == 0 ? s.buf0_count : s.buf1_count;
+       switch (idx) {
+       case 0: return s.buf0_count;
+       case 1: return s.buf1_count;
+       case 2: return s.buf2_count;
+       case 3: return s.buf3_count;
+       }
+       BUG();
 }
 
 static inline void journal_state_inc(union journal_res_state *s)
 {
        s->buf0_count += s->idx == 0;
        s->buf1_count += s->idx == 1;
+       s->buf2_count += s->idx == 2;
+       s->buf3_count += s->idx == 3;
 }
 
 static inline void bch2_journal_set_has_inode(struct journal *j,
@@ -255,21 +258,24 @@ static inline bool journal_entry_empty(struct jset *j)
        return true;
 }
 
-void __bch2_journal_buf_put(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *);
 
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
-                                      bool need_write_just_set)
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
 {
        union journal_res_state s;
 
        s.v = atomic64_sub_return(((union journal_res_state) {
                                    .buf0_count = idx == 0,
                                    .buf1_count = idx == 1,
+                                   .buf2_count = idx == 2,
+                                   .buf3_count = idx == 3,
                                    }).v, &j->reservations.counter);
-       if (!journal_state_count(s, idx)) {
-               EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
-               __bch2_journal_buf_put(j, need_write_just_set);
-       }
+
+       EBUG_ON(((s.idx - idx) & 3) >
+               ((s.idx - s.unwritten_idx) & 3));
+
+       if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
+               __bch2_journal_buf_put(j);
 }
 
 /*
@@ -289,7 +295,7 @@ static inline void bch2_journal_res_put(struct journal *j,
                                       BCH_JSET_ENTRY_btree_keys,
                                       0, 0, NULL, 0);
 
-       bch2_journal_buf_put(j, res->idx, false);
+       bch2_journal_buf_put(j, res->idx);
 
        res->ref = 0;
 }
@@ -325,11 +331,18 @@ static inline int journal_res_get_fast(struct journal *j,
                    !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
                        return 0;
 
-               if (flags & JOURNAL_RES_GET_CHECK)
-                       return 1;
-
                new.cur_entry_offset += res->u64s;
                journal_state_inc(&new);
+
+               /*
+                * If the refcount would overflow, we have to wait:
+                * XXX - tracepoint this:
+                */
+               if (!journal_state_count(new, new.idx))
+                       return 0;
+
+               if (flags & JOURNAL_RES_GET_CHECK)
+                       return 1;
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
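
A worked example of the EBUG_ON() distance check in bch2_journal_buf_put() above: with s.idx = 1 and s.unwritten_idx = 3, the unwritten window is (1 - 3) & 3 = 2 buffers (indices 3 and 0). Releasing a reference on idx = 0 gives (1 - 0) & 3 = 1 <= 2, inside the window; idx = 2 would give (1 - 2) & 3 = 3 > 2 and trip the assertion, since buffer 2 is already written. The write itself is only kicked off when the reference released is the last one on the oldest unwritten buffer, i.e. when both

        !journal_state_count(s, idx) && idx == s.unwritten_idx

hold after the atomic subtraction.
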
 
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index d1367cf067d3036b02072771ef3e2402ed8d532d..bb9a1936c24cdd4014c8e39a134d4535213b1731 100644
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 
 #include <trace/events/bcachefs.h>
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+       list_del(&i->list);
+       kvpfree(i, offsetof(struct journal_replay, j) +
+               vstruct_bytes(&i->j));
+
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+       i->ignore = true;
+
+       if (!c->opts.read_entire_journal)
+               __journal_replay_free(i);
+}
+
 struct journal_list {
        struct closure          cl;
        struct mutex            lock;
@@ -36,28 +53,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
        struct bch_devs_list devs = { .nr = 0 };
        struct list_head *where;
        size_t bytes = vstruct_bytes(j);
-       __le64 last_seq;
+       u64 last_seq = 0;
        int ret;
 
-       last_seq = !list_empty(jlist->head)
-               ? list_last_entry(jlist->head, struct journal_replay,
-                                 list)->j.last_seq
-               : 0;
-
-       if (!c->opts.read_entire_journal) {
-               /* Is this entry older than the range we need? */
-               if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-                       ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-                       goto out;
+       list_for_each_entry_reverse(i, jlist->head, list) {
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq = le64_to_cpu(i->j.last_seq);
+                       break;
                }
+       }
+
+       /* Is this entry older than the range we need? */
+       if (!c->opts.read_entire_journal &&
+           le64_to_cpu(j->seq) < last_seq) {
+               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+               goto out;
+       }
 
-               /* Drop entries we don't need anymore */
+       /* Drop entries we don't need anymore */
+       if (!JSET_NO_FLUSH(j)) {
                list_for_each_entry_safe(i, pos, jlist->head, list) {
                        if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                                break;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       journal_replay_free(c, i);
                }
        }
 
@@ -81,9 +99,7 @@ add:
        if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
                if (i->bad) {
                        devs = i->devs;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       __journal_replay_free(i);
                } else if (bad) {
                        goto found;
                } else {
@@ -105,6 +121,7 @@ add:
        list_add(&i->list, where);
        i->devs = devs;
        i->bad  = bad;
+       i->ignore = false;
        memcpy(&i->j, j, bytes);
 found:
        if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -558,7 +575,7 @@ reread:
                        bio_put(bio);
 
                        if (bch2_dev_io_err_on(ret, ca,
-                                              "journal read from sector %llu",
+                                              "journal read error: sector %llu",
                                               offset) ||
                            bch2_meta_read_fault("journal"))
                                return -EIO;
@@ -699,14 +716,16 @@ err:
        goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+                     u64 *blacklist_seq, u64 *start_seq)
 {
        struct journal_list jlist;
-       struct journal_replay *i;
+       struct journal_replay *i, *t;
        struct bch_dev *ca;
        unsigned iter;
        size_t keys = 0, entries = 0;
        bool degraded = false;
+       u64 seq, last_seq = 0;
        int ret = 0;
 
        closure_init_stack(&jlist.cl);
@@ -735,12 +754,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        if (jlist.ret)
                return jlist.ret;
 
+       if (list_empty(list)) {
+               bch_info(c, "journal read done, but no entries found");
+               return 0;
+       }
+
+       i = list_last_entry(list, struct journal_replay, list);
+       *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+       /*
+        * Find most recent flush entry, and ignore newer non flush entries -
+        * those entries will be blacklisted:
+        */
+       list_for_each_entry_safe_reverse(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq        = le64_to_cpu(i->j.last_seq);
+                       *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
+                       break;
+               }
+
+               journal_replay_free(c, i);
+       }
+
+       if (!last_seq) {
+               fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+               return -1;
+       }
+
+       /* Drop blacklisted entries and entries older than last_seq: */
+       list_for_each_entry_safe(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               seq = le64_to_cpu(i->j.seq);
+               if (seq < last_seq) {
+                       journal_replay_free(c, i);
+                       continue;
+               }
+
+               if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+                       fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+                                   "found blacklisted journal entry %llu", seq);
+
+                       journal_replay_free(c, i);
+               }
+       }
+
+       /* Check for missing entries: */
+       seq = last_seq;
+       list_for_each_entry(i, list, list) {
+               if (i->ignore)
+                       continue;
+
+               BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+               while (seq < le64_to_cpu(i->j.seq)) {
+                       u64 missing_start, missing_end;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       if (seq == le64_to_cpu(i->j.seq))
+                               break;
+
+                       missing_start = seq;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              !bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       missing_end = seq - 1;
+                       fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+                                missing_start, missing_end,
+                                last_seq, *blacklist_seq - 1);
+               }
+
+               seq++;
+       }
+
        list_for_each_entry(i, list, list) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas;
                char buf[80];
 
+               if (i->ignore)
+                       continue;
+
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto fsck_err;
@@ -768,12 +872,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                entries++;
        }
 
-       if (!list_empty(list)) {
-               i = list_last_entry(list, struct journal_replay, list);
+       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+                keys, entries, *start_seq);
 
-               bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-                        keys, entries, le64_to_cpu(i->j.seq));
-       }
+       if (*start_seq != *blacklist_seq)
+               bch_info(c, "dropped unflushed entries %llu-%llu",
+                        *blacklist_seq, *start_seq - 1);
 fsck_err:
        return ret;
 }
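
The three sequence numbers computed above split the journal into replay, drop, and restart regions. A minimal standalone sketch of the invariant, with hypothetical seqs (flush entries 10..17 on disk, followed by noflush entries 18..20):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t last_seq      = 10; /* oldest entry replay still needs */
            uint64_t blacklist_seq = 18; /* newest flush entry's seq + 1 */
            uint64_t start_seq     = 21; /* newest entry's seq + 1, flushed or not */

            /* [last_seq, blacklist_seq) is replayed; [blacklist_seq, start_seq)
             * held only unflushed entries, which were freed above and will be
             * blacklisted so stale btree writes referencing them are ignored: */
            assert(last_seq <= blacklist_seq && blacklist_seq <= start_seq);
            return 0;
    }
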
@@ -951,16 +1055,23 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
        buf->buf_size   = new_size;
 }
 
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{
+       return j->buf + j->reservations.unwritten_idx;
+}
+
 static void journal_write_done(struct closure *cl)
 {
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *w = journal_prev_buf(j);
+       struct journal_buf *w = journal_last_unwritten_buf(j);
        struct bch_devs_list devs =
                bch2_bkey_devs(bkey_i_to_s_c(&w->key));
        struct bch_replicas_padded replicas;
+       union journal_res_state old, new;
        u64 seq = le64_to_cpu(w->data->seq);
        u64 last_seq = le64_to_cpu(w->data->last_seq);
+       u64 v;
        int err = 0;
 
        bch2_time_stats_update(j->write_time, j->write_start_time);
@@ -984,8 +1095,12 @@ static void journal_write_done(struct closure *cl)
        j->seq_ondisk           = seq;
        if (err && (!j->err_seq || seq < j->err_seq))
                j->err_seq      = seq;
-       j->last_seq_ondisk      = last_seq;
-       bch2_journal_space_available(j);
+
+       if (!w->noflush) {
+               j->flushed_seq_ondisk = seq;
+               j->last_seq_ondisk = last_seq;
+               bch2_journal_space_available(j);
+       }
 
        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -999,9 +1114,14 @@ static void journal_write_done(struct closure *cl)
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);
 
-       BUG_ON(!j->reservations.prev_buf_unwritten);
-       atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
-                    &j->reservations.counter);
+       v = atomic64_read(&j->reservations.counter);
+       do {
+               old.v = new.v = v;
+               BUG_ON(new.idx == new.unwritten_idx);
+
+               new.unwritten_idx++;
+       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+                                      old.v, new.v)) != old.v);
 
        closure_wake_up(&w->wait);
        journal_wake(j);
@@ -1009,6 +1129,10 @@ static void journal_write_done(struct closure *cl)
        if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
                mod_delayed_work(system_freezable_wq, &j->write_work, 0);
        spin_unlock(&j->lock);
+
+       if (new.unwritten_idx != new.idx &&
+           !journal_state_count(new, new.unwritten_idx))
+               closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
 static void journal_write_endio(struct bio *bio)
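
journal_write_done now retires a buffer by advancing unwritten_idx with a compare-and-exchange loop instead of clearing the old single prev_buf_unwritten bit. A user-space analogue of that loop, as a sketch using C11 atomics (the u64 bitfield layout mirrors journal_res_state below but is implementation-defined, so illustrative only):

    #include <stdatomic.h>
    #include <stdint.h>

    union res_state {
            uint64_t v;
            struct {
                    uint64_t cur_entry_offset:20,
                             idx:2,
                             unwritten_idx:2,
                             counts:40;
            };
    };

    static void retire_one_buf(_Atomic uint64_t *counter)
    {
            union res_state new;
            uint64_t v = atomic_load(counter);

            do {
                    new.v = v;
                    new.unwritten_idx++; /* 2-bit field wraps mod 4 */
            } while (!atomic_compare_exchange_weak(counter, &v, new.v));
    }
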
@@ -1016,10 +1140,10 @@ static void journal_write_endio(struct bio *bio)
        struct bch_dev *ca = bio->bi_private;
        struct journal *j = &ca->fs->journal;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("journal")) {
-               struct journal_buf *w = journal_prev_buf(j);
+               struct journal_buf *w = journal_last_unwritten_buf(j);
                unsigned long flags;
 
                spin_lock_irqsave(&j->err_lock, flags);
@@ -1036,7 +1160,7 @@ void bch2_journal_write(struct closure *cl)
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       struct journal_buf *w = journal_prev_buf(j);
+       struct journal_buf *w = journal_last_unwritten_buf(j);
        struct jset_entry *start, *end;
        struct jset *jset;
        struct bio *bio;
@@ -1047,13 +1171,27 @@ void bch2_journal_write(struct closure *cl)
 
        BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
 
-       bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
-
        journal_buf_realloc(j, w);
        jset = w->data;
 
        j->write_start_time = local_clock();
 
+       spin_lock(&j->lock);
+       if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+           !w->must_flush &&
+           (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+           test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+               w->noflush = true;
+               SET_JSET_NO_FLUSH(jset, true);
+               jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+               j->nr_noflush_writes++;
+       } else {
+               j->last_flush_write = jiffies;
+               j->nr_flush_writes++;
+       }
+       spin_unlock(&j->lock);
+
        /*
         * New btree roots are set by journalling them; when the journal entry
         * gets written we have to propagate them to c->btree_roots
@@ -1175,8 +1313,9 @@ retry_alloc:
                bio->bi_iter.bi_sector  = ptr->offset;
                bio->bi_end_io          = journal_write_endio;
                bio->bi_private         = ca;
-               bio_set_op_attrs(bio, REQ_OP_WRITE,
-                                REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+               bio->bi_opf             = REQ_OP_WRITE|REQ_SYNC|REQ_META;
+               if (!JSET_NO_FLUSH(jset))
+                       bio->bi_opf    |= REQ_PREFLUSH|REQ_FUA;
                bch2_bio_map(bio, jset, sectors << 9);
 
                trace_journal_write(bio);
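
REQ_PREFLUSH asks the device to flush its volatile write cache before the journal entry lands, and REQ_FUA forces the entry itself through to stable media; noflush entries skip both, which is the point of this commit. A standalone sketch of the flag selection (the bit values here are illustrative placeholders, not the kernel's actual encoding):

    #include <stdbool.h>
    #include <stdint.h>

    #define REQ_OP_WRITE  (1U << 0) /* illustrative bit assignments */
    #define REQ_SYNC      (1U << 1)
    #define REQ_META      (1U << 2)
    #define REQ_PREFLUSH  (1U << 3) /* flush device cache first */
    #define REQ_FUA       (1U << 4) /* write through to media */

    static uint32_t journal_bio_opf(bool noflush)
    {
            uint32_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;

            if (!noflush)
                    opf |= REQ_PREFLUSH | REQ_FUA;
            return opf;
    }
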
@@ -1185,20 +1324,21 @@ retry_alloc:
                ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
        }
 
-       for_each_rw_member(ca, c, i)
-               if (journal_flushes_device(ca) &&
-                   !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-                       percpu_ref_get(&ca->io_ref);
-
-                       bio = ca->journal.bio;
-                       bio_reset(bio);
-                       bio_set_dev(bio, ca->disk_sb.bdev);
-                       bio->bi_opf             = REQ_OP_FLUSH;
-                       bio->bi_end_io          = journal_write_endio;
-                       bio->bi_private         = ca;
-                       closure_bio_submit(bio, cl);
-               }
-
+       if (!JSET_NO_FLUSH(jset)) {
+               for_each_rw_member(ca, c, i)
+                       if (journal_flushes_device(ca) &&
+                           !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+                               percpu_ref_get(&ca->io_ref);
+
+                               bio = ca->journal.bio;
+                               bio_reset(bio);
+                               bio_set_dev(bio, ca->disk_sb.bdev);
+                               bio->bi_opf             = REQ_OP_FLUSH;
+                               bio->bi_end_io          = journal_write_endio;
+                               bio->bi_private         = ca;
+                               closure_bio_submit(bio, cl);
+                       }
+       }
 no_io:
        bch2_bucket_seq_cleanup(c);
 
index 6958ee0f8cf23da1ab5a9c0588fedb3d8679678c..6b4c80968f52064370c4d3b767db29cd3cb59bed 100644 (file)
@@ -11,6 +11,7 @@ struct journal_replay {
        struct bch_devs_list    devs;
        /* checksum error, but we may want to try using it anyways: */
        bool                    bad;
+       bool                    ignore;
        /* must be last: */
        struct jset             j;
 };
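
Because j must stay the final member, each replay entry is a single variable-size allocation with the on-disk jset copied into its tail. A kernel-context sketch of how such an entry might be created (hypothetical; bytes and j stand for the payload size and source jset in the reading code):

    struct journal_replay *i =
            kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);

    if (i) {
            i->bad    = false;
            i->ignore = false;
            memcpy(&i->j, j, bytes);
    }
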
@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
        for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
                vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
 
index beaa39f7bf5ee5d75340c736e952d373010ffbc7..9d778306efc515a3caa11e561dada6fce09bed1c 100644 (file)
@@ -58,81 +58,107 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
                                       old.v, new.v)) != old.v);
 }
 
-static struct journal_space {
-       unsigned        next_entry;
-       unsigned        remaining;
-} __journal_space_available(struct journal *j, unsigned nr_devs_want,
-                           enum journal_space_from from)
+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
 {
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bch_dev *ca;
-       unsigned sectors_next_entry     = UINT_MAX;
-       unsigned sectors_total          = UINT_MAX;
-       unsigned i, nr_devs = 0;
-       unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
-               ? journal_prev_buf(j)->sectors
-               : 0;
+       unsigned sectors = 0;
 
-       rcu_read_lock();
-       for_each_member_device_rcu(ca, c, i,
-                                  &c->rw_devs[BCH_DATA_journal]) {
-               struct journal_device *ja = &ca->journal;
-               unsigned buckets_this_device, sectors_this_device;
+       while (!sectors && *idx != j->reservations.idx) {
+               sectors = j->buf[*idx].sectors;
 
-               if (!ja->nr)
-                       continue;
+               *idx = (*idx + 1) & JOURNAL_BUF_MASK;
+       }
 
-               buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
-               sectors_this_device = ja->sectors_free;
+       return sectors;
+}
 
-               /*
-                * We that we don't allocate the space for a journal entry
-                * until we write it out - thus, account for it here:
-                */
-               if (unwritten_sectors >= sectors_this_device) {
-                       if (!buckets_this_device)
-                               continue;
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+                           enum journal_space_from from)
+{
+       struct journal_device *ja = &ca->journal;
+       unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
 
-                       buckets_this_device--;
-                       sectors_this_device = ca->mi.bucket_size;
-               }
+       if (from == journal_space_total)
+               return (struct journal_space) {
+                       .next_entry     = ca->mi.bucket_size,
+                       .total          = ca->mi.bucket_size * ja->nr,
+               };
 
-               sectors_this_device -= unwritten_sectors;
+       buckets = bch2_journal_dev_buckets_available(j, ja, from);
+       sectors = ja->sectors_free;
 
-               if (sectors_this_device < ca->mi.bucket_size &&
-                   buckets_this_device) {
-                       buckets_this_device--;
-                       sectors_this_device = ca->mi.bucket_size;
+       /*
+        * We don't allocate the space for a journal entry until we write
+        * it out - thus, account for it here:
+        */
+       while ((unwritten = get_unwritten_sectors(j, &idx))) {
+               if (unwritten >= sectors) {
+                       if (!buckets) {
+                               sectors = 0;
+                               break;
+                       }
+
+                       buckets--;
+                       sectors = ca->mi.bucket_size;
                }
 
-               if (!sectors_this_device)
+               sectors -= unwritten;
+       }
+
+       if (sectors < ca->mi.bucket_size && buckets) {
+               buckets--;
+               sectors = ca->mi.bucket_size;
+       }
+
+       return (struct journal_space) {
+               .next_entry     = sectors,
+               .total          = sectors + buckets * ca->mi.bucket_size,
+       };
+}
+
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
+                           enum journal_space_from from)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       unsigned i, pos, nr_devs = 0;
+       struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+       BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i,
+                                  &c->rw_devs[BCH_DATA_journal]) {
+               if (!ca->journal.nr)
                        continue;
 
-               sectors_next_entry = min(sectors_next_entry,
-                                        sectors_this_device);
+               space = journal_dev_space_available(j, ca, from);
+               if (!space.next_entry)
+                       continue;
 
-               sectors_total = min(sectors_total,
-                       buckets_this_device * ca->mi.bucket_size +
-                       sectors_this_device);
+               for (pos = 0; pos < nr_devs; pos++)
+                       if (space.total > dev_space[pos].total)
+                               break;
 
-               nr_devs++;
+               array_insert_item(dev_space, nr_devs, pos, space);
        }
        rcu_read_unlock();
 
        if (nr_devs < nr_devs_want)
                return (struct journal_space) { 0, 0 };
 
-       return (struct journal_space) {
-               .next_entry     = sectors_next_entry,
-               .remaining      = max_t(int, 0, sectors_total - sectors_next_entry),
-       };
+       /*
+        * We sorted largest to smallest, and we want the smallest out of the
+        * @nr_devs_want largest devices:
+        */
+       return dev_space[nr_devs_want - 1];
 }
 
 void bch2_journal_space_available(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       struct journal_space discarded, clean_ondisk, clean;
+       unsigned clean, clean_ondisk, total;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
                                       j->buf[1].buf_size >> 9);
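
__journal_space_available now keeps the per-device results sorted by total, descending, and returns entry nr_devs_want - 1: the smallest of the nr_devs_want largest, i.e. the space actually writable at the wanted replication level. A runnable sketch of that selection with hypothetical sizes:

    #include <stdio.h>
    #include <string.h>

    struct space { unsigned next_entry, total; };

    int main(void)
    {
            struct space devs[] = { {64, 1000}, {64, 4000}, {64, 2000} };
            struct space sorted[3];
            unsigned nr = 0, nr_devs_want = 2;

            for (unsigned i = 0; i < 3; i++) {
                    unsigned pos = 0;

                    while (pos < nr && devs[i].total <= sorted[pos].total)
                            pos++;
                    memmove(&sorted[pos + 1], &sorted[pos],
                            (nr - pos) * sizeof(sorted[0]));
                    sorted[pos] = devs[i];
                    nr++;
            }

            /* smallest of the two largest totals: prints 2000 */
            printf("%u\n", sorted[nr_devs_want - 1].total);
            return 0;
    }
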
@@ -173,27 +199,33 @@ void bch2_journal_space_available(struct journal *j)
                goto out;
        }
 
-       if (!fifo_free(&j->pin)) {
-               ret = cur_entry_journal_pin_full;
-               goto out;
-       }
-
        nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
 
-       discarded       = __journal_space_available(j, nr_devs_want, journal_space_discarded);
-       clean_ondisk    = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
-       clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);
+       for (i = 0; i < journal_space_nr; i++)
+               j->space[i] = __journal_space_available(j, nr_devs_want, i);
+
+       clean_ondisk    = j->space[journal_space_clean_ondisk].total;
+       clean           = j->space[journal_space_clean].total;
+       total           = j->space[journal_space_total].total;
 
-       if (!discarded.next_entry)
+       if (!j->space[journal_space_discarded].next_entry)
                ret = cur_entry_journal_full;
+       else if (!fifo_free(&j->pin))
+               ret = cur_entry_journal_pin_full;
+
+       if ((clean - clean_ondisk <= total / 8) &&
+           (clean_ondisk * 2 > clean))
+               set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+       else
+               clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
 
-       overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
+       overhead = DIV_ROUND_UP(clean, max_entry_size) *
                journal_entry_overhead(j);
-       u64s_remaining = clean.remaining << 6;
+       u64s_remaining = clean << 6;
        u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
        u64s_remaining /= 4;
 out:
-       j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
+       j->cur_entry_sectors    = !ret ? j->space[journal_space_discarded].next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
        journal_check_may_get_unreserved(j);
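
The new JOURNAL_MAY_SKIP_FLUSH heuristic permits noflush writes only while unflushed entries pin little space and most clean space is already clean on disk. Worked through with hypothetical numbers, in 512-byte sectors:

    #include <stdbool.h>

    int main(void)
    {
            unsigned total = 1024, clean = 512, clean_ondisk = 448;

            bool may_skip_flush =
                    (clean - clean_ondisk <= total / 8) && /* 64 <= 128: ok */
                    (clean_ondisk * 2 > clean);            /* 896 > 512: ok */

            return !may_skip_flush; /* exits 0: flush/FUA may be skipped */
    }
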
@@ -277,6 +309,14 @@ static void bch2_journal_reclaim_fast(struct journal *j)
                bch2_journal_space_available(j);
 }
 
+void __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+       if (atomic_dec_and_test(&pin_list->count))
+               bch2_journal_reclaim_fast(j);
+}
+
 void bch2_journal_pin_put(struct journal *j, u64 seq)
 {
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
@@ -485,13 +525,14 @@ static u64 journal_seq_to_flush(struct journal *j)
  * 512 journal entries or 25% of all journal buckets, then
  * journal_next_bucket() should not stall.
  */
-static void __bch2_journal_reclaim(struct journal *j, bool direct)
+static int __bch2_journal_reclaim(struct journal *j, bool direct)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        u64 seq_to_flush, nr_flushed = 0;
        size_t min_nr;
        unsigned flags;
+       int ret = 0;
 
        /*
         * We can't invoke memory reclaim while holding the reclaim_lock -
@@ -506,6 +547,11 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct)
                if (kthread && kthread_should_stop())
                        break;
 
+               if (bch2_journal_error(j)) {
+                       ret = -EIO;
+                       break;
+               }
+
                bch2_journal_do_discards(j);
 
                seq_to_flush = journal_seq_to_flush(j);
@@ -547,27 +593,30 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct)
        } while (min_nr);
 
        memalloc_noreclaim_restore(flags);
+
+       return ret;
 }
 
-void bch2_journal_reclaim(struct journal *j)
+int bch2_journal_reclaim(struct journal *j)
 {
-       __bch2_journal_reclaim(j, true);
+       return __bch2_journal_reclaim(j, true);
 }
 
 static int bch2_journal_reclaim_thread(void *arg)
 {
        struct journal *j = arg;
        unsigned long next;
+       int ret = 0;
 
        set_freezable();
 
        kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
 
-       while (!kthread_should_stop()) {
+       while (!ret && !kthread_should_stop()) {
                j->reclaim_kicked = false;
 
                mutex_lock(&j->reclaim_lock);
-               __bch2_journal_reclaim(j, false);
+               ret = __bch2_journal_reclaim(j, false);
                mutex_unlock(&j->reclaim_lock);
 
                next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
index e25355042e6e4c7a6d821fbbecd026eac753276b..f02caa3d49ea74daf97d1054cc3ddbfba250d254 100644 (file)
@@ -4,12 +4,6 @@
 
 #define JOURNAL_PIN    (32 * 1024)
 
-enum journal_space_from {
-       journal_space_discarded,
-       journal_space_clean_ondisk,
-       journal_space_clean,
-};
-
 static inline void journal_reclaim_kick(struct journal *j)
 {
        struct task_struct *p = READ_ONCE(j->reclaim_thread);
@@ -39,6 +33,7 @@ journal_seq_pin(struct journal *j, u64 seq)
        return &j->pin.data[seq & j->pin.mask];
 }
 
+void __bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 
@@ -73,7 +68,7 @@ static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
 void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
 
 void bch2_journal_do_discards(struct journal *);
-void bch2_journal_reclaim(struct journal *);
+int bch2_journal_reclaim(struct journal *);
 
 void bch2_journal_reclaim_stop(struct journal *);
 int bch2_journal_reclaim_start(struct journal *);
index d0f1bbf8f6a7984ff5f96d997235b49d484d2eee..e1b63f3879f44e50cc2fdd92ca3de8db03a3c7fa 100644 (file)
@@ -118,7 +118,7 @@ out_write_sb:
 out:
        mutex_unlock(&c->sb_lock);
 
-       return ret;
+       return ret ?: bch2_blacklist_table_initialize(c);
 }
 
 static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
        struct journal_seq_blacklist_table *t;
        unsigned i, nr = blacklist_nr_entries(bl);
 
-       BUG_ON(c->journal_seq_blacklist_table);
-
        if (!bl)
                return 0;
 
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
                        journal_seq_blacklist_table_cmp,
                        NULL);
 
+       kfree(c->journal_seq_blacklist_table);
        c->journal_seq_blacklist_table = t;
        return 0;
 }
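
With the BUG_ON dropped and the old table freed, the lookup table can be rebuilt whenever a blacklist entry is added, so journal reads always consult current state. A sketch of a range lookup over sorted, disjoint half-open intervals (hypothetical helper; the real table uses its own search layout):

    #include <stdbool.h>
    #include <stdint.h>

    struct bl_entry { uint64_t start, end; }; /* blacklists [start, end) */

    static bool seq_is_blacklisted(const struct bl_entry *t, unsigned nr,
                                   uint64_t seq)
    {
            unsigned l = 0, r = nr;

            while (l < r) {
                    unsigned m = l + (r - l) / 2;

                    if (seq >= t[m].end)
                            l = m + 1;
                    else if (seq < t[m].start)
                            r = m;
                    else
                            return true;
            }
            return false;
    }
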
index 4640bb8687cc18414eb0ba9017d78a9949602f81..308b899b42145e99c0ce8c2309943e740a7efc80 100644 (file)
@@ -9,11 +9,13 @@
 #include "super_types.h"
 #include "fifo.h"
 
-struct journal_res;
+#define JOURNAL_BUF_BITS       2
+#define JOURNAL_BUF_NR         (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK       (JOURNAL_BUF_NR - 1)
 
 /*
- * We put two of these in struct journal; we used them for writes to the
- * journal that are being staged or in flight.
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
  */
 struct journal_buf {
        struct jset             *data;
@@ -27,6 +29,8 @@ struct journal_buf {
        unsigned                disk_sectors;   /* maximum size entry could have been, if
                                                   buf_size was bigger */
        unsigned                u64s_reserved;
+       bool                    noflush;        /* write has already been kicked off, and was noflush */
+       bool                    must_flush;     /* something wants a flush */
        /* bloom filter: */
        unsigned long           has_inode[1024 / sizeof(unsigned long)];
 };
@@ -81,10 +85,12 @@ union journal_res_state {
 
        struct {
                u64             cur_entry_offset:20,
-                               idx:1,
-                               prev_buf_unwritten:1,
-                               buf0_count:21,
-                               buf1_count:21;
+                               idx:2,
+                               unwritten_idx:2,
+                               buf0_count:10,
+                               buf1_count:10,
+                               buf2_count:10,
+                               buf3_count:10;
        };
 };
 
@@ -116,6 +122,20 @@ union journal_preres_state {
 #define JOURNAL_ENTRY_CLOSED_VAL       (JOURNAL_ENTRY_OFFSET_MAX - 1)
 #define JOURNAL_ENTRY_ERROR_VAL                (JOURNAL_ENTRY_OFFSET_MAX)
 
+struct journal_space {
+       /* Units of 512-byte sectors: */
+       unsigned        next_entry; /* How big the next journal entry can be */
+       unsigned        total;
+};
+
+enum journal_space_from {
+       journal_space_discarded,
+       journal_space_clean_ondisk,
+       journal_space_clean,
+       journal_space_total,
+       journal_space_nr,
+};
+
 /*
  * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
  * either because something's waiting on the write to complete or because it's
@@ -128,6 +148,7 @@ enum {
        JOURNAL_RECLAIM_STARTED,
        JOURNAL_NEED_WRITE,
        JOURNAL_MAY_GET_UNRESERVED,
+       JOURNAL_MAY_SKIP_FLUSH,
 };
 
 /* Embedded in struct bch_fs */
@@ -165,7 +186,7 @@ struct journal {
-        * Two journal entries -- one is currently open for new entries, the
-        * other is possibly being written out.
+        * JOURNAL_BUF_NR journal entries -- one is currently open for new
+        * entries, the others may be in flight being written out.
         */
-       struct journal_buf      buf[2];
+       struct journal_buf      buf[JOURNAL_BUF_NR];
 
        spinlock_t              lock;
 
@@ -185,6 +206,7 @@ struct journal {
 
        /* seq, last_seq from the most recent journal entry successfully written */
        u64                     seq_ondisk;
+       u64                     flushed_seq_ondisk;
        u64                     last_seq_ondisk;
        u64                     err_seq;
        u64                     last_empty_seq;
@@ -210,6 +232,8 @@ struct journal {
                struct journal_entry_pin_list *data;
        }                       pin;
 
+       struct journal_space    space[journal_space_nr];
+
        u64                     replay_journal_seq;
        u64                     replay_journal_seq_end;
 
@@ -232,11 +256,15 @@ struct journal {
 
        unsigned                write_delay_ms;
        unsigned                reclaim_delay_ms;
+       unsigned long           last_flush_write;
 
        u64                     res_get_blocked_start;
        u64                     need_write_time;
        u64                     write_start_time;
 
+       u64                     nr_flush_writes;
+       u64                     nr_noflush_writes;
+
        struct time_stats       *write_time;
        struct time_stats       *delay_time;
        struct time_stats       *blocked_time;
index d24cef2bf1aa3252b611e1fee6e84a41dc2b4964..ecd51d45743a3705da4f1a6261b2469876f73add 100644 (file)
@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 {
-       struct journal_replay *p;
+       struct journal_replay *i;
        struct jset_entry *entry;
        struct bkey_i *k, *_n;
        struct journal_keys keys = { NULL };
@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
        if (list_empty(journal_entries))
                return keys;
 
-       keys.journal_seq_base =
-               le64_to_cpu(list_last_entry(journal_entries,
-                               struct journal_replay, list)->j.last_seq);
-
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               if (!keys.journal_seq_base)
+                       keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        nr_keys++;
        }
 
-
        keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
        if (!keys.d)
                goto err;
 
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        keys.d[keys.nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
                                .level          = entry->level,
                                .k              = k,
-                               .journal_seq    = le64_to_cpu(p->j.seq) -
+                               .journal_seq    = le64_to_cpu(i->j.seq) -
                                        keys.journal_seq_base,
-                               .journal_offset = k->_data - p->j._data,
+                               .journal_offset = k->_data - i->j._data,
                        };
        }
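
journal_seq_base is now the seq of the first non-ignored entry, and each key stores its seq as an offset from it; the BUG_ON above implies the offset field is 32 bits wide. A sketch of the round trip with hypothetical seqs:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t journal_seq_base = 1000; /* first non-ignored entry */
            uint64_t seq = 1234;              /* some later entry */

            assert(seq - journal_seq_base <= UINT32_MAX);
            uint32_t offset = (uint32_t) (seq - journal_seq_base);

            assert(journal_seq_base + offset == seq); /* decode */
            return 0;
    }
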
 
@@ -643,46 +643,6 @@ err:
        return ret;
 }
 
-static bool journal_empty(struct list_head *journal)
-{
-       return list_empty(journal) ||
-               journal_entry_empty(&list_last_entry(journal,
-                                       struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
-                                                 struct list_head *journal)
-{
-       struct journal_replay *i =
-               list_last_entry(journal, struct journal_replay, list);
-       u64 start_seq   = le64_to_cpu(i->j.last_seq);
-       u64 end_seq     = le64_to_cpu(i->j.seq);
-       u64 seq         = start_seq;
-       int ret = 0;
-
-       list_for_each_entry(i, journal, list) {
-               if (le64_to_cpu(i->j.seq) < start_seq)
-                       continue;
-
-               fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
-                       "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-                       seq, le64_to_cpu(i->j.seq) - 1,
-                       start_seq, end_seq);
-
-               seq = le64_to_cpu(i->j.seq);
-
-               fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
-                           "found blacklisted journal entry %llu", seq);
-
-               do {
-                       seq++;
-               } while (bch2_journal_seq_is_blacklisted(c, seq, false));
-       }
-fsck_err:
-       return ret;
-}
-
 /* journal replay early: */
 
 static int journal_replay_entry_early(struct bch_fs *c,
@@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c,
                                struct bch_sb_field_clean *clean,
                                struct list_head *journal)
 {
+       struct journal_replay *i;
        struct jset_entry *entry;
        int ret;
 
@@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c,
                                return ret;
                }
        } else {
-               struct journal_replay *i =
-                       list_last_entry(journal, struct journal_replay, list);
+               list_for_each_entry(i, journal, list) {
+                       if (i->ignore)
+                               continue;
 
-               c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+                       c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+                       c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
 
-               list_for_each_entry(i, journal, list)
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
                                        return ret;
                        }
+               }
        }
 
        bch2_fs_usage_initialize(c);
@@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c,
        struct bch_sb_field_clean *clean = *cleanp;
        int ret = 0;
 
-       if (!c->sb.clean || !j)
-               return 0;
-
        if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
                        "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
                        le64_to_cpu(clean->journal_seq),
@@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 {
        const char *err = "cannot allocate memory";
        struct bch_sb_field_clean *clean = NULL;
-       u64 journal_seq;
+       struct jset *last_journal_entry = NULL;
+       u64 blacklist_seq, journal_seq;
        bool write_sb = false, need_write_alloc = false;
        int ret;
 
@@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c)
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }
 
+       ret = bch2_blacklist_table_initialize(c);
+       if (ret) {
+               bch_err(c, "error initializing blacklist table");
+               goto err;
+       }
+
        if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-               struct jset *j;
+               struct journal_replay *i;
 
-               ret = bch2_journal_read(c, &c->journal_entries);
+               ret = bch2_journal_read(c, &c->journal_entries,
+                                       &blacklist_seq, &journal_seq);
                if (ret)
                        goto err;
 
-               if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+               list_for_each_entry_reverse(i, &c->journal_entries, list)
+                       if (!i->ignore) {
+                               last_journal_entry = &i->j;
+                               break;
+                       }
+
+               if (mustfix_fsck_err_on(c->sb.clean &&
+                                       last_journal_entry &&
+                                       !journal_entry_empty(last_journal_entry), c,
                                "filesystem marked clean but journal not empty")) {
                        c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
                        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
                        c->sb.clean = false;
                }
 
-               if (!c->sb.clean && list_empty(&c->journal_entries)) {
-                       bch_err(c, "no journal entries found");
-                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
-                       goto err;
+               if (!last_journal_entry) {
+                       fsck_err_on(!c->sb.clean, c, "no journal entries found");
+                       goto use_clean;
                }
 
                c->journal_keys = journal_keys_sort(&c->journal_entries);
@@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c)
                        goto err;
                }
 
-               j = &list_last_entry(&c->journal_entries,
-                                    struct journal_replay, list)->j;
-
-               ret = verify_superblock_clean(c, &clean, j);
-               if (ret)
+               if (c->sb.clean && last_journal_entry) {
+                       ret = verify_superblock_clean(c, &clean,
+                                                     last_journal_entry);
+                       if (ret)
+                               goto err;
+               }
+       } else {
+use_clean:
+               if (!clean) {
+                       bch_err(c, "no superblock clean section found");
+                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
                        goto err;
 
-               journal_seq = le64_to_cpu(j->seq) + 1;
-       } else {
-               journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+               }
+               blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
        }
 
        if (!c->sb.clean &&
@@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
        if (ret)
                goto err;
 
-       if (!c->sb.clean) {
+       /*
+        * After an unclean shutdown, skip the next few journal sequence
+        * numbers as they may have been referenced by btree writes that
+        * happened before their corresponding journal writes - those btree
+        * writes need to be ignored, by skipping and blacklisting the next few
+        * journal sequence numbers:
+        */
+       if (!c->sb.clean)
+               journal_seq += 8;
+
+       if (blacklist_seq != journal_seq) {
                ret = bch2_journal_seq_blacklist_add(c,
-                                                    journal_seq,
-                                                    journal_seq + 4);
+                                       blacklist_seq, journal_seq);
                if (ret) {
                        bch_err(c, "error creating new journal seq blacklist entry");
                        goto err;
                }
-
-               journal_seq += 4;
-
-               /*
-                * The superblock needs to be written before we do any btree
-                * node writes: it will be in the read_write() path
-                */
-       }
-
-       ret = bch2_blacklist_table_initialize(c);
-
-       if (!list_empty(&c->journal_entries)) {
-               ret = verify_journal_entries_not_blacklisted_or_missing(c,
-                                                       &c->journal_entries);
-               if (ret)
-                       goto err;
        }
 
        ret = bch2_fs_journal_start(&c->journal, journal_seq,
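
Putting the recovery pieces together, a hypothetical unclean boot (sequence numbers invented for illustration):

    /*
     * bch2_journal_read()  -> blacklist_seq = 18, journal_seq = 21
     * journal_seq += 8;    -> journal_seq = 29   (unclean shutdown)
     * bch2_journal_seq_blacklist_add(c, 18, 29);
     * bch2_fs_journal_start(&c->journal, 29, ...);
     *
     * Btree node writes tagged with seqs 18..28, whose journal entries
     * never reached disk, are ignored from here on.
     */
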
index 91518c0d67948b5a8d3832dc2168f90763614953..00a197b65e0b1412a76f28facd6df4cdf01b89a9 100644 (file)
@@ -275,53 +275,55 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
 static int replicas_table_update(struct bch_fs *c,
                                 struct bch_replicas_cpu *new_r)
 {
-       struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
+       struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
        struct bch_fs_usage *new_scratch = NULL;
        struct bch_fs_usage __percpu *new_gc = NULL;
        struct bch_fs_usage *new_base = NULL;
-       unsigned bytes = sizeof(struct bch_fs_usage) +
+       unsigned i, bytes = sizeof(struct bch_fs_usage) +
                sizeof(u64) * new_r->nr;
-       int ret = -ENOMEM;
+       int ret = 0;
+
+       memset(new_usage, 0, sizeof(new_usage));
+
+       for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+               if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+                                       sizeof(u64), GFP_NOIO)))
+                       goto err;
 
        if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
-           !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
-                                               GFP_NOIO)) ||
-           !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
-                                               GFP_NOIO)) ||
            !(new_scratch  = kmalloc(bytes, GFP_NOIO)) ||
            (c->usage_gc &&
-            !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) {
-               bch_err(c, "error updating replicas table: memory allocation failure");
+            !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
                goto err;
-       }
 
+       for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+               if (c->usage[i])
+                       __replicas_table_update_pcpu(new_usage[i], new_r,
+                                                    c->usage[i], &c->replicas);
        if (c->usage_base)
                __replicas_table_update(new_base,               new_r,
                                        c->usage_base,          &c->replicas);
-       if (c->usage[0])
-               __replicas_table_update_pcpu(new_usage[0],      new_r,
-                                            c->usage[0],       &c->replicas);
-       if (c->usage[1])
-               __replicas_table_update_pcpu(new_usage[1],      new_r,
-                                            c->usage[1],       &c->replicas);
        if (c->usage_gc)
                __replicas_table_update_pcpu(new_gc,            new_r,
                                             c->usage_gc,       &c->replicas);
 
+       for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+               swap(c->usage[i],       new_usage[i]);
        swap(c->usage_base,     new_base);
-       swap(c->usage[0],       new_usage[0]);
-       swap(c->usage[1],       new_usage[1]);
        swap(c->usage_scratch,  new_scratch);
        swap(c->usage_gc,       new_gc);
        swap(c->replicas,       *new_r);
-       ret = 0;
-err:
+out:
        free_percpu(new_gc);
        kfree(new_scratch);
-       free_percpu(new_usage[1]);
-       free_percpu(new_usage[0]);
+       for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+               free_percpu(new_usage[i]);
        kfree(new_base);
        return ret;
+err:
+       bch_err(c, "error updating replicas table: memory allocation failure");
+       ret = -ENOMEM;
+       goto out;
 }
 
 static unsigned reserve_journal_replicas(struct bch_fs *c,
@@ -496,9 +498,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
                struct bch_replicas_cpu n;
 
                if (!__replicas_has_entry(&c->replicas_gc, e) &&
-                   (c->usage_base->replicas[i] ||
-                    percpu_u64_get(&c->usage[0]->replicas[i]) ||
-                    percpu_u64_get(&c->usage[1]->replicas[i]))) {
+                   bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
                        n = cpu_replicas_add_entry(&c->replicas_gc, e);
                        if (!n.entries) {
                                ret = -ENOSPC;
@@ -603,9 +603,7 @@ retry:
                        cpu_replicas_entry(&c->replicas, i);
 
                if (e->data_type == BCH_DATA_journal ||
-                   c->usage_base->replicas[i] ||
-                   percpu_u64_get(&c->usage[0]->replicas[i]) ||
-                   percpu_u64_get(&c->usage[1]->replicas[i]))
+                   bch2_fs_usage_read_one(c, &c->usage_base->replicas[i]))
                        memcpy(cpu_replicas_entry(&new, new.nr++),
                               e, new.entry_size);
        }
index cee6cc9387340c7a4b288d3427fd3fa6b415eeb7..abe46c539c2e20d0991ae023cbed7c46e5301db5 100644 (file)
@@ -636,7 +636,7 @@ static void write_super_endio(struct bio *bio)
 
        /* XXX: return errors directly */
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)))
                ca->sb_write_error = 1;
 
@@ -995,10 +995,10 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
        percpu_down_write(&c->mark_lock);
 
        if (!journal_seq) {
-               bch2_fs_usage_acc_to_base(c, 0);
-               bch2_fs_usage_acc_to_base(c, 1);
+               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+                       bch2_fs_usage_acc_to_base(c, i);
        } else {
-               bch2_fs_usage_acc_to_base(c, journal_seq & 1);
+               bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
        }
 
        {
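
Accumulating usage into c->usage[journal_seq & JOURNAL_BUF_MASK] extends the old two-way split (journal_seq & 1) to all four journal buffers: seqs 40..44 land in slots 0, 1, 2, 3, 0. A trivial sketch of the index selection:

    #include <stdint.h>

    #define JOURNAL_BUF_MASK 3 /* JOURNAL_BUF_NR - 1, from journal_types.h */

    static unsigned usage_idx(uint64_t journal_seq)
    {
            return (unsigned) (journal_seq & JOURNAL_BUF_MASK);
    }
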
index e3bbd0b0d6989deba07e110ad6bf746fe57301a5..651fbc5d52b1c4077f079e983d7e384941b16cc6 100644 (file)
@@ -475,8 +475,8 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_journal_entries_free(&c->journal_entries);
        percpu_free_rwsem(&c->mark_lock);
        kfree(c->usage_scratch);
-       free_percpu(c->usage[1]);
-       free_percpu(c->usage[0]);
+       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+               free_percpu(c->usage[i]);
        kfree(c->usage_base);
 
        if (c->btree_iters_bufs)
@@ -716,6 +716,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        bch2_fs_btree_cache_init_early(&c->btree_cache);
 
+       mutex_init(&c->sectors_available_lock);
+
        if (percpu_init_rwsem(&c->mark_lock))
                goto err;