Update bcachefs sources to 4837f82ee1 bcachefs: Use cached iterators for alloc btree
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 16cb6be87cbf17b07cc5e8d878fd94b69ea9e67f..b7625285b3ad6c7d58885cd617cf443cc2f60687 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1,15 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
-#include "alloc_background.h"
 #include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_update.h"
+#include "btree_io.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "error.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
 #include "replicas.h"
 
 #include <trace/events/bcachefs.h>
@@ -42,19 +41,21 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                                  list)->j.last_seq
                : 0;
 
-       /* Is this entry older than the range we need? */
-       if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-               goto out;
-       }
+       if (!c->opts.read_entire_journal) {
+               /* Is this entry older than the range we need? */
+               if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
+                       ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+                       goto out;
+               }
 
-       /* Drop entries we don't need anymore */
-       list_for_each_entry_safe(i, pos, jlist->head, list) {
-               if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
-                       break;
-               list_del(&i->list);
-               kvpfree(i, offsetof(struct journal_replay, j) +
-                       vstruct_bytes(&i->j));
+               /* Drop entries we don't need anymore */
+               list_for_each_entry_safe(i, pos, jlist->head, list) {
+                       if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+                               break;
+                       list_del(&i->list);
+                       kvpfree(i, offsetof(struct journal_replay, j) +
+                               vstruct_bytes(&i->j));
+               }
        }
 
        list_for_each_entry_reverse(i, jlist->head, list) {
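
An aside on the hunk above: the pruning logic itself is unchanged, it is just gated behind the new read_entire_journal option. A minimal standalone sketch of the resulting behaviour (illustrative types, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct entry { unsigned long long seq; bool keep; };

/* Entries older than last_seq are normally dropped; with
 * read_entire_journal set, everything is kept for inspection. */
static void prune(struct entry *e, int n, unsigned long long last_seq,
                  bool read_entire_journal)
{
        for (int i = 0; i < n; i++)
                e[i].keep = read_entire_journal || e[i].seq >= last_seq;
}

int main(void)
{
        struct entry e[] = { {5, 0}, {9, 0}, {12, 0} };

        prune(e, 3, 10, false);
        for (int i = 0; i < 3; i++)
                printf("seq %llu: %s\n", e[i].seq, e[i].keep ? "keep" : "drop");
        return 0;
}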
@@ -141,7 +142,8 @@ static void journal_entry_null_range(void *start, void *end)
 
 static int journal_validate_key(struct bch_fs *c, struct jset *jset,
                                struct jset_entry *entry,
-                               struct bkey_i *k, enum btree_node_type key_type,
+                               unsigned level, enum btree_id btree_id,
+                               struct bkey_i *k,
                                const char *type, int write)
 {
        void *next = vstruct_next(entry);
@@ -174,14 +176,13 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
                return 0;
        }
 
-       if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
-               bch2_bkey_swab(NULL, bkey_to_packed(k));
-
-       if (!write &&
-           version < bcachefs_metadata_version_bkey_renumber)
-               bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
+       if (!write)
+               bch2_bkey_compat(level, btree_id, version,
+                           JSET_BIG_ENDIAN(jset), write,
+                           NULL, bkey_to_packed(k));
 
-       invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type);
+       invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+                                   __btree_node_type(level, btree_id));
        if (invalid) {
                char buf[160];
 
@@ -195,9 +196,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
                return 0;
        }
 
-       if (write &&
-           version < bcachefs_metadata_version_bkey_renumber)
-               bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
+       if (write)
+               bch2_bkey_compat(level, btree_id, version,
+                           JSET_BIG_ENDIAN(jset), write,
+                           NULL, bkey_to_packed(k));
 fsck_err:
        return ret;
 }
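
Worth spelling out the ordering this change establishes: bch2_bkey_compat() now runs before validation on the read path and after validation on the write path, so bch2_bkey_invalid() always sees keys in the current format. A compilable sketch with stand-in types (struct key, compat_fixup() and validate() are illustrations, not bcachefs API):

struct key { int current_format; int valid; };

static void compat_fixup(struct key *k, int to_current)
{
        /* stand-in for bch2_bkey_compat(): translate between the
         * on-disk format of jset->version and the current format */
        k->current_format = to_current;
}

static int validate(const struct key *k)
{
        return k->valid ? 0 : -1;
}

static int process_key(struct key *k, int write)
{
        if (!write)
                compat_fixup(k, 1);     /* read: upgrade, then validate */

        if (validate(k))
                return -1;              /* invalid keys are dropped */

        if (write)
                compat_fixup(k, 0);     /* write: validate, then downgrade */
        return 0;
}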
@@ -210,10 +212,10 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
        struct bkey_i *k;
 
        vstruct_for_each(entry, k) {
-               int ret = journal_validate_key(c, jset, entry, k,
-                               __btree_node_type(entry->level,
-                                                 entry->btree_id),
-                               "key", write);
+               int ret = journal_validate_key(c, jset, entry,
+                                              entry->level,
+                                              entry->btree_id,
+                                              k, "key", write);
                if (ret)
                        return ret;
        }
@@ -243,7 +245,7 @@ static int journal_entry_validate_btree_root(struct bch_fs *c,
                return 0;
        }
 
-       return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE,
+       return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
                                    "btree root", write);
 fsck_err:
        return ret;
@@ -498,9 +500,8 @@ reread:
                                                    sectors_read << 9));
                        bio_set_dev(bio, ca->disk_sb.bdev);
                        bio->bi_iter.bi_sector  = offset;
-                       bio->bi_iter.bi_size    = sectors_read << 9;
                        bio_set_op_attrs(bio, REQ_OP_READ, 0);
-                       bch2_bio_map(bio, buf->data);
+                       bch2_bio_map(bio, buf->data, sectors_read << 9);
 
                        ret = submit_bio_wait(bio);
                        bio_put(bio);
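
The bch2_bio_map() change here (and in the write path below) is a call-convention change: the length is now an explicit parameter instead of being staged through bio->bi_iter.bi_size by the caller. A sketch of the pattern with a stand-in type (bio_sketch is illustrative):

struct bio_sketch { void *data; unsigned size; };

/* new style: the mapping helper owns the size assignment */
static void map_buf(struct bio_sketch *bio, void *data, unsigned size)
{
        bio->size = size;
        bio->data = data;
}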
@@ -625,11 +626,12 @@ static void bch2_journal_read_device(struct closure *cl)
        ja->sectors_free = 0;
 
        /*
-        * Set last_idx to indicate the entire journal is full and needs to be
+        * Set dirty_idx to indicate the entire journal is full and needs to be
         * reclaimed - journal reclaim will immediately reclaim whatever isn't
         * pinned when it first runs:
         */
-       ja->last_idx = (ja->cur_idx + 1) % ja->nr;
+       ja->discard_idx = ja->dirty_idx_ondisk =
+               ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
 out:
        kvpfree(buf.data, buf.size);
        percpu_ref_put(&ca->io_ref);
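
The single last_idx is replaced by three indices into the ring of journal buckets (discard_idx, dirty_idx_ondisk, dirty_idx); setting all three to cur_idx + 1 marks every bucket dirty, as the updated comment says. A simplified model of the arithmetic (field names from the diff, everything else illustrative):

struct ja_model {
        unsigned nr;                    /* number of journal buckets */
        unsigned cur_idx;               /* bucket currently being written */
        unsigned discard_idx;           /* next bucket to discard */
        unsigned dirty_idx_ondisk;
        unsigned dirty_idx;
};

/* With dirty_idx == cur_idx + 1 (mod nr), the dirty region
 * [dirty_idx, cur_idx] wraps all the way around: every bucket is
 * dirty, and reclaim starts from a "journal full" state. */
static void mark_all_dirty(struct ja_model *ja)
{
        ja->discard_idx = ja->dirty_idx_ondisk =
                ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
}

static unsigned dirty_buckets(const struct ja_model *ja)
{
        return (ja->cur_idx - ja->dirty_idx + ja->nr) % ja->nr + 1;
}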
@@ -642,57 +644,11 @@ err:
        goto out;
 }
 
-void bch2_journal_entries_free(struct list_head *list)
-{
-
-       while (!list_empty(list)) {
-               struct journal_replay *i =
-                       list_first_entry(list, struct journal_replay, list);
-               list_del(&i->list);
-               kvpfree(i, offsetof(struct journal_replay, j) +
-                       vstruct_bytes(&i->j));
-       }
-}
-
-int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
-{
-       struct journal *j = &c->journal;
-       struct journal_entry_pin_list *p;
-       u64 seq, nr = end_seq - last_seq + 1;
-
-       if (nr > j->pin.size) {
-               free_fifo(&j->pin);
-               init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
-               if (!j->pin.data) {
-                       bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
-                       return -ENOMEM;
-               }
-       }
-
-       atomic64_set(&j->seq, end_seq);
-       j->last_seq_ondisk = last_seq;
-
-       j->pin.front    = last_seq;
-       j->pin.back     = end_seq + 1;
-
-       fifo_for_each_entry_ptr(p, &j->pin, seq) {
-               INIT_LIST_HEAD(&p->list);
-               INIT_LIST_HEAD(&p->flushed);
-               atomic_set(&p->count, 0);
-               p->devs.nr = 0;
-       }
-
-       return 0;
-}
-
 int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 {
-       struct journal *j = &c->journal;
        struct journal_list jlist;
        struct journal_replay *i;
-       struct journal_entry_pin_list *p;
        struct bch_dev *ca;
-       u64 cur_seq, end_seq;
        unsigned iter;
        size_t keys = 0, entries = 0;
        bool degraded = false;
@@ -724,17 +680,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        if (jlist.ret)
                return jlist.ret;
 
-       if (list_empty(list)){
-               bch_err(c, "no journal entries found");
-               return BCH_FSCK_REPAIR_IMPOSSIBLE;
-       }
-
        list_for_each_entry(i, list, list) {
+               struct jset_entry *entry;
+               struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas;
                char buf[80];
 
-               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
-
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto fsck_err;
@@ -744,6 +695,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                 * the devices - this is wrong:
                 */
 
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
                if (!degraded &&
                    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
                     fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
@@ -754,128 +707,19 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                        if (ret)
                                return ret;
                }
-       }
-
-       i = list_last_entry(list, struct journal_replay, list);
-
-       ret = bch2_journal_set_seq(c,
-                                  le64_to_cpu(i->j.last_seq),
-                                  le64_to_cpu(i->j.seq));
-       if (ret)
-               return ret;
-
-       mutex_lock(&j->blacklist_lock);
-
-       list_for_each_entry(i, list, list) {
-               p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
-
-               atomic_set(&p->count, 1);
-               p->devs = i->devs;
-
-               if (bch2_journal_seq_blacklist_read(j, i)) {
-                       mutex_unlock(&j->blacklist_lock);
-                       return -ENOMEM;
-               }
-       }
-
-       mutex_unlock(&j->blacklist_lock);
-
-       cur_seq = journal_last_seq(j);
-       end_seq = le64_to_cpu(list_last_entry(list,
-                               struct journal_replay, list)->j.seq);
-
-       list_for_each_entry(i, list, list) {
-               struct jset_entry *entry;
-               struct bkey_i *k, *_n;
-               bool blacklisted;
-
-               mutex_lock(&j->blacklist_lock);
-               while (cur_seq < le64_to_cpu(i->j.seq) &&
-                      bch2_journal_seq_blacklist_find(j, cur_seq))
-                       cur_seq++;
-
-               blacklisted = bch2_journal_seq_blacklist_find(j,
-                                                        le64_to_cpu(i->j.seq));
-               mutex_unlock(&j->blacklist_lock);
-
-               fsck_err_on(blacklisted, c,
-                           "found blacklisted journal entry %llu",
-                           le64_to_cpu(i->j.seq));
-
-               fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
-                       "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-                       cur_seq, le64_to_cpu(i->j.seq) - 1,
-                       journal_last_seq(j), end_seq);
-
-               cur_seq = le64_to_cpu(i->j.seq) + 1;
 
                for_each_jset_key(k, _n, entry, &i->j)
                        keys++;
                entries++;
        }
 
-       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-                keys, entries, journal_cur_seq(j));
-fsck_err:
-       return ret;
-}
-
-/* journal replay: */
-
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
-       struct journal *j = &c->journal;
-       struct bkey_i *k, *_n;
-       struct jset_entry *entry;
-       struct journal_replay *i, *n;
-       int ret = 0;
-
-       list_for_each_entry_safe(i, n, list, list) {
-               j->replay_journal_seq = le64_to_cpu(i->j.seq);
-
-               for_each_jset_key(k, _n, entry, &i->j) {
-
-                       if (entry->btree_id == BTREE_ID_ALLOC) {
-                               /*
-                                * allocation code handles replay for
-                                * BTREE_ID_ALLOC keys:
-                                */
-                               ret = bch2_alloc_replay_key(c, k);
-                       } else {
-                               /*
-                                * We might cause compressed extents to be
-                                * split, so we need to pass in a
-                                * disk_reservation:
-                                */
-                               struct disk_reservation disk_res =
-                                       bch2_disk_reservation_init(c, 0);
-
-                               ret = bch2_btree_insert(c, entry->btree_id, k,
-                                               &disk_res, NULL,
-                                               BTREE_INSERT_NOFAIL|
-                                               BTREE_INSERT_JOURNAL_REPLAY|
-                                               BTREE_INSERT_NOMARK);
-                       }
-
-                       if (ret) {
-                               bch_err(c, "journal replay: error %d while replaying key",
-                                       ret);
-                               goto err;
-                       }
-
-                       cond_resched();
-               }
+       if (!list_empty(list)) {
+               i = list_last_entry(list, struct journal_replay, list);
 
-               bch2_journal_pin_put(j, j->replay_journal_seq);
+               bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+                        keys, entries, le64_to_cpu(i->j.seq));
        }
-
-       j->replay_journal_seq = 0;
-
-       bch2_journal_set_replay_done(j);
-       bch2_journal_flush_all_pins(j);
-       ret = bch2_journal_error(j);
-err:
-       bch2_journal_entries_free(list);
+fsck_err:
        return ret;
 }
 
@@ -969,9 +813,16 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 
                if (sectors > ja->sectors_free &&
                    sectors <= ca->mi.bucket_size &&
-                   bch2_journal_dev_buckets_available(j, ja)) {
+                   bch2_journal_dev_buckets_available(j, ja,
+                                       journal_space_discarded)) {
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
                        ja->sectors_free = ca->mi.bucket_size;
+
+                       /*
+                        * ja->bucket_seq[ja->cur_idx] must always have
+                        * something sensible:
+                        */
+                       ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
                }
        }
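
A sketch of the bucket-advance step, including the new bucket_seq requirement the comment describes (names from the diff; the free-space check and device iteration are elided):

struct ja_sketch {
        unsigned nr, cur_idx, sectors_free;
        unsigned long long *bucket_seq;
};

static void advance_bucket(struct ja_sketch *ja, unsigned bucket_size,
                           unsigned long long seq)
{
        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
        ja->sectors_free = bucket_size;
        /* record the seq that will land in this bucket immediately,
         * so bucket_seq[cur_idx] never holds a stale value */
        ja->bucket_seq[ja->cur_idx] = seq;
}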
 
@@ -1069,12 +920,13 @@ static void journal_write_done(struct closure *cl)
                goto err;
 
        spin_lock(&j->lock);
-       j->seq_ondisk           = seq;
-       j->last_seq_ondisk      = last_seq;
-
        if (seq >= j->pin.front)
                journal_seq_pin(j, seq)->devs = devs;
 
+       j->seq_ondisk           = seq;
+       j->last_seq_ondisk      = last_seq;
+       bch2_journal_space_available(j);
+
        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
         * more buckets:
@@ -1100,7 +952,6 @@ out:
        return;
 err:
        bch2_fatal_error(c);
-       bch2_journal_halt(j);
        spin_lock(&j->lock);
        goto out;
 }
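
Two things change in journal_write_done(): seq_ondisk/last_seq_ondisk are now published together with a bch2_journal_space_available() recalculation under j->lock, and a failed write no longer halts the journal (bch2_fatal_error() alone marks the filesystem). A sketch of the locking shape, using a pthread mutex as a stand-in for the spinlock:

#include <pthread.h>

struct j_sketch {
        pthread_mutex_t lock;
        unsigned long long seq_ondisk, last_seq_ondisk;
        unsigned space;         /* stand-in for the space accounting */
};

static void write_done(struct j_sketch *j, unsigned long long seq,
                       unsigned long long last_seq)
{
        pthread_mutex_lock(&j->lock);
        j->seq_ondisk      = seq;
        j->last_seq_ondisk = last_seq;
        /* recompute available space under the same lock, so nobody
         * sees the new seq with stale space accounting */
        j->space = 1;
        pthread_mutex_unlock(&j->lock);
}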
@@ -1145,8 +996,24 @@ void bch2_journal_write(struct closure *cl)
 
        j->write_start_time = local_clock();
 
-       start   = vstruct_last(jset);
-       end     = bch2_journal_super_entries_add_common(c, start);
+       /*
+        * New btree roots are set by journalling them; when the journal entry
+        * gets written we have to propagate them to c->btree_roots
+        *
+        * But, every journal entry we write has to contain all the btree roots
+        * (at least for now); so after we copy btree roots to c->btree_roots we
+        * have to get any missing btree roots and add them to this journal
+        * entry:
+        */
+
+       bch2_journal_entries_to_btree_roots(c, jset);
+
+       start = end = vstruct_last(jset);
+
+       end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+
+       end     = bch2_journal_super_entries_add_common(c, end,
+                                               le64_to_cpu(jset->seq));
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
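
The comment above describes a two-way sync, and both helpers are new here. A sketch of the flow it describes (only the function names come from the diff; the bodies and types below are imagined):

struct root_sketch { int present; unsigned long long ptr; };

/* Roots recorded in this journal entry update the in-memory table;
 * roots the entry lacked are then copied back in, so every written
 * entry carries the complete set of btree roots. */
static void sync_roots(struct root_sketch *in_entry,
                       struct root_sketch *in_memory, int nr)
{
        for (int i = 0; i < nr; i++) {
                if (in_entry[i].present)        /* journal -> c->btree_roots */
                        in_memory[i] = in_entry[i];
                else if (in_memory[i].present)  /* c->btree_roots -> journal */
                        in_entry[i] = in_memory[i];
        }
}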
 
@@ -1169,8 +1036,7 @@ void bch2_journal_write(struct closure *cl)
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
                validate_before_checksum = true;
 
-       if (le32_to_cpu(jset->version) <
-           bcachefs_metadata_version_bkey_renumber)
+       if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
                validate_before_checksum = true;
 
        if (validate_before_checksum &&
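
The version check widens from bcachefs_metadata_version_bkey_renumber to bcachefs_metadata_version_max: any entry below the current maximum version is now validated before checksumming, since bch2_bkey_compat() may rewrite key bytes on the write path. A decision sketch (my reading of the condition, not authoritative):

#include <stdbool.h>

static bool validate_before_checksum(bool csum_is_encryption,
                                     unsigned version, unsigned version_max)
{
        /* encrypted payloads can't be inspected after checksumming;
         * older versions need their keys rewritten first, which must
         * happen before the checksum is computed */
        return csum_is_encryption || version < version_max;
}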
@@ -1194,9 +1060,16 @@ void bch2_journal_write(struct closure *cl)
        bytes = vstruct_bytes(jset);
        memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
 
+retry_alloc:
        spin_lock(&j->lock);
        ret = journal_write_alloc(j, w, sectors);
 
+       if (ret && j->can_discard) {
+               spin_unlock(&j->lock);
+               bch2_journal_do_discards(j);
+               goto retry_alloc;
+       }
+
        /*
         * write is allocated, no longer need to account for it in
         * bch2_journal_space_available():
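
The retry_alloc loop is new: rather than failing the write when no bucket is free, the code drops j->lock, issues discards, and retries. A self-contained model of the loop's termination (the state transitions in do_discards() are illustrative):

#include <stdbool.h>

struct js { bool can_discard; int free_buckets; };

static int try_alloc(struct js *j)
{
        return j->free_buckets > 0 ? (j->free_buckets--, 0) : -1;
}

static void do_discards(struct js *j)
{
        j->free_buckets++;              /* reclaim a discardable bucket */
        j->can_discard = false;         /* model: one round of discards */
}

/* mirrors the retry_alloc loop above, minus the locking: */
static int alloc_with_retry(struct js *j)
{
        int ret;

        while ((ret = try_alloc(j)) && j->can_discard)
                do_discards(j);
        return ret;
}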
@@ -1211,7 +1084,6 @@ void bch2_journal_write(struct closure *cl)
        spin_unlock(&j->lock);
 
        if (ret) {
-               bch2_journal_halt(j);
                bch_err(c, "Unable to allocate journal write");
                bch2_fatal_error(c);
                continue_at(cl, journal_write_done, system_highpri_wq);
@@ -1240,12 +1112,11 @@ void bch2_journal_write(struct closure *cl)
                bio_reset(bio);
                bio_set_dev(bio, ca->disk_sb.bdev);
                bio->bi_iter.bi_sector  = ptr->offset;
-               bio->bi_iter.bi_size    = sectors << 9;
                bio->bi_end_io          = journal_write_endio;
                bio->bi_private         = ca;
                bio_set_op_attrs(bio, REQ_OP_WRITE,
                                 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
-               bch2_bio_map(bio, jset);
+               bch2_bio_map(bio, jset, sectors << 9);
 
                trace_journal_write(bio);
                closure_bio_submit(bio, cl);
@@ -1255,7 +1126,7 @@ void bch2_journal_write(struct closure *cl)
 
        for_each_rw_member(ca, c, i)
                if (journal_flushes_device(ca) &&
-                   !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
+                   !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
                        percpu_ref_get(&ca->io_ref);
 
                        bio = ca->journal.bio;