]> git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/journal_io.c
Update bcachefs sources to 7958ebe324 bcachefs: Fix alloc_v4_backpointers()
[bcachefs-tools-debian] / libbcachefs / journal_io.c
index 68113a08f1b9c4a42cab61b57277c9fb4bafe537..d6f259348b3dbb67eec2740a19b93b333765d038 100644 (file)
@@ -987,7 +987,6 @@ static void bch2_journal_read_device(struct closure *cl)
        struct journal_replay *r, **_r;
        struct genradix_iter iter;
        struct journal_read_buf buf = { NULL, 0 };
-       u64 min_seq = U64_MAX;
        unsigned i;
        int ret = 0;
 
@@ -1006,45 +1005,27 @@ static void bch2_journal_read_device(struct closure *cl)
                        goto err;
        }
 
-       /* Find the journal bucket with the highest sequence number: */
-       for (i = 0; i < ja->nr; i++) {
-               if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
-                       ja->cur_idx = i;
-
-               min_seq = min(ja->bucket_seq[i], min_seq);
-       }
-
-       /*
-        * If there's duplicate journal entries in multiple buckets (which
-        * definitely isn't supposed to happen, but...) - make sure to start
-        * cur_idx at the last of those buckets, so we don't deadlock trying to
-        * allocate
-        */
-       while (ja->bucket_seq[ja->cur_idx] > min_seq &&
-              ja->bucket_seq[ja->cur_idx] ==
-              ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
-               ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
-
        ja->sectors_free = ca->mi.bucket_size;
 
        mutex_lock(&jlist->lock);
-       genradix_for_each(&c->journal_entries, iter, _r) {
+       genradix_for_each_reverse(&c->journal_entries, iter, _r) {
                r = *_r;
 
                if (!r)
                        continue;
 
                for (i = 0; i < r->nr_ptrs; i++) {
-                       if (r->ptrs[i].dev == ca->dev_idx &&
-                           sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
+                       if (r->ptrs[i].dev == ca->dev_idx) {
                                unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
                                        vstruct_sectors(&r->j, c->block_bits);
 
-                               ja->sectors_free = min(ja->sectors_free,
-                                                      ca->mi.bucket_size - wrote);
+                               ja->cur_idx = r->ptrs[i].bucket;
+                               ja->sectors_free = ca->mi.bucket_size - wrote;
+                               goto found;
                        }
                }
        }
+found:
        mutex_unlock(&jlist->lock);
 
        if (ja->bucket_seq[ja->cur_idx] &&
@@ -1099,7 +1080,10 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        }
 }
 
-int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c,
+                     u64 *last_seq,
+                     u64 *blacklist_seq,
+                     u64 *start_seq)
 {
        struct journal_list jlist;
        struct journal_replay *i, **_i, *prev = NULL;
@@ -1107,9 +1091,8 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
        struct bch_dev *ca;
        unsigned iter;
        struct printbuf buf = PRINTBUF;
-       size_t keys = 0, entries = 0;
-       bool degraded = false;
-       u64 seq, last_seq = 0;
+       bool degraded = false, last_write_torn = false;
+       u64 seq;
        int ret = 0;
 
        closure_init_stack(&jlist.cl);
@@ -1138,41 +1121,46 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
        if (jlist.ret)
                return jlist.ret;
 
-       *start_seq = 0;
+       *last_seq       = 0;
+       *start_seq      = 0;
+       *blacklist_seq  = 0;
 
        /*
         * Find most recent flush entry, and ignore newer non flush entries -
         * those entries will be blacklisted:
         */
        genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+               int write = READ;
+
                i = *_i;
 
                if (!i || i->ignore)
                        continue;
 
                if (!*start_seq)
-                       *start_seq = le64_to_cpu(i->j.seq) + 1;
-
-               if (!JSET_NO_FLUSH(&i->j)) {
-                       int write = READ;
-                       if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
-                                                c, &i->j, NULL,
-                                                "invalid journal entry: last_seq > seq (%llu > %llu)",
-                                                le64_to_cpu(i->j.last_seq),
-                                                le64_to_cpu(i->j.seq)))
-                               i->j.last_seq = i->j.seq;
-
-                       pr_info("last flush %llu-%llu csum good %u",
-                               le64_to_cpu(i->j.last_seq),
-                               le64_to_cpu(i->j.seq),
-                               i->csum_good);
-
-                       last_seq        = le64_to_cpu(i->j.last_seq);
-                       *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
-                       break;
+                       *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+               if (JSET_NO_FLUSH(&i->j)) {
+                       i->ignore = true;
+                       continue;
+               }
+
+               if (!last_write_torn && !i->csum_good) {
+                       last_write_torn = true;
+                       i->ignore = true;
+                       continue;
                }
 
-               journal_replay_free(c, i);
+               if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+                                        c, &i->j, NULL,
+                                        "invalid journal entry: last_seq > seq (%llu > %llu)",
+                                        le64_to_cpu(i->j.last_seq),
+                                        le64_to_cpu(i->j.seq)))
+                       i->j.last_seq = i->j.seq;
+
+               *last_seq       = le64_to_cpu(i->j.last_seq);
+               *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
+               break;
        }
 
        if (!*start_seq) {
@@ -1180,12 +1168,18 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                return 0;
        }
 
-       if (!last_seq) {
+       if (!*last_seq) {
                fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
-               ret = -1;
-               goto err;
+               return 0;
        }
 
+       bch_info(c, "journal read done, replaying entries %llu-%llu",
+                *last_seq, *blacklist_seq - 1);
+
+       if (*start_seq != *blacklist_seq)
+               bch_info(c, "dropped unflushed entries %llu-%llu",
+                        *blacklist_seq, *start_seq - 1);
+
        /* Drop blacklisted entries and entries older than last_seq: */
        genradix_for_each(&c->journal_entries, radix_iter, _i) {
                i = *_i;
@@ -1194,7 +1188,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                        continue;
 
                seq = le64_to_cpu(i->j.seq);
-               if (seq < last_seq) {
+               if (seq < *last_seq) {
                        journal_replay_free(c, i);
                        continue;
                }
@@ -1202,13 +1196,12 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
                        fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
                                    "found blacklisted journal entry %llu", seq);
-
-                       journal_replay_free(c, i);
+                       i->ignore = true;
                }
        }
 
        /* Check for missing entries: */
-       seq = last_seq;
+       seq = *last_seq;
        genradix_for_each(&c->journal_entries, radix_iter, _i) {
                i = *_i;
 
@@ -1246,7 +1239,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                                 "  prev at %s\n"
                                 "  next at %s",
                                 missing_start, missing_end,
-                                last_seq, *blacklist_seq - 1,
+                                *last_seq, *blacklist_seq - 1,
                                 buf1.buf, buf2.buf);
 
                        printbuf_exit(&buf1);
@@ -1258,8 +1251,6 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
        }
 
        genradix_for_each(&c->journal_entries, radix_iter, _i) {
-               struct jset_entry *entry;
-               struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas = {
                        .e.data_type = BCH_DATA_journal,
                        .e.nr_required = 1,
@@ -1274,10 +1265,10 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                        struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
 
                        if (!i->ptrs[ptr].csum_good)
-                               printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n",
-                                      ca->name, i->ptrs[ptr].sector,
-                                      le64_to_cpu(i->j.seq),
-                                      i->csum_good ? " (had good copy on another device)" : "");
+                               bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+                                                  "invalid journal checksum, seq %llu%s",
+                                                  le64_to_cpu(i->j.seq),
+                                                  i->csum_good ? " (had good copy on another device)" : "");
                }
 
                ret = jset_validate(c,
@@ -1309,18 +1300,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                        if (ret)
                                goto err;
                }
-
-               for_each_jset_key(k, _n, entry, &i->j)
-                       keys++;
-               entries++;
        }
-
-       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-                keys, entries, *start_seq);
-
-       if (*start_seq != *blacklist_seq)
-               bch_info(c, "dropped unflushed entries %llu-%llu",
-                        *blacklist_seq, *start_seq - 1);
 err:
 fsck_err:
        printbuf_exit(&buf);
@@ -1661,20 +1641,42 @@ void bch2_journal_write(struct closure *cl)
        j->write_start_time = local_clock();
 
        spin_lock(&j->lock);
-       if (bch2_journal_error(j) ||
-           w->noflush ||
-           (!w->must_flush &&
-            (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
-            test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+
+       /*
+        * If the journal is in an error state - we did an emergency shutdown -
+        * we prefer to continue doing journal writes. We just mark them as
+        * noflush so they'll never be used, but they'll still be visible by the
+        * list_journal tool - this helps in debugging.
+        *
+        * There's a caveat: the first journal write after marking the
+        * superblock dirty must always be a flush write, because on startup
+        * from a clean shutdown we didn't necessarily read the journal and the
+        * new journal write might overwrite whatever was in the journal
+        * previously - we can't leave the journal without any flush writes in
+        * it.
+        *
+        * So if we're in an error state, and we're still starting up, we don't
+        * write anything at all.
+        */
+       if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
+           (bch2_journal_error(j) ||
+            w->noflush ||
+            (!w->must_flush &&
+             (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+             test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
                w->noflush = true;
                SET_JSET_NO_FLUSH(jset, true);
                jset->last_seq  = 0;
                w->last_seq     = 0;
 
                j->nr_noflush_writes++;
-       } else {
+       } else if (!bch2_journal_error(j)) {
                j->last_flush_write = jiffies;
                j->nr_flush_writes++;
+               clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+       } else {
+               spin_unlock(&j->lock);
+               goto err;
        }
        spin_unlock(&j->lock);