Update bcachefs sources to bdf6d7c135 fixup! bcachefs: Kill journal buf bloom filter
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index cbde21a4c5479dd9f3583fe125ed415118f886fe..e537a578c44316a1dd9cb7dc72b4604f5386cc7f 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
 
 #include <trace/events/bcachefs.h>
 
-static void __journal_replay_free(struct journal_replay *i)
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
 {
-       list_del(&i->list);
+       return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+                                 struct journal_replay *i)
+{
+       struct journal_replay **p =
+               genradix_ptr(&c->journal_entries,
+                            journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
+
+       BUG_ON(*p != i);
+       *p = NULL;
        kvpfree(i, offsetof(struct journal_replay, j) +
                vstruct_bytes(&i->j));
-
 }
 
 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@@ -30,13 +40,13 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
        i->ignore = true;
 
        if (!c->opts.read_entire_journal)
-               __journal_replay_free(i);
+               __journal_replay_free(c, i);
 }
 
 struct journal_list {
        struct closure          cl;
+       u64                     last_seq;
        struct mutex            lock;
-       struct list_head        *head;
        int                     ret;
 };
 
@@ -52,56 +62,54 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                             struct journal_list *jlist, struct jset *j,
                             bool bad)
 {
-       struct journal_replay *i, *pos, *dup = NULL;
+       struct genradix_iter iter;
+       struct journal_replay **_i, *i, *dup;
        struct journal_ptr *ptr;
-       struct list_head *where;
        size_t bytes = vstruct_bytes(j);
-       u64 last_seq = 0;
+       u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
        int ret = JOURNAL_ENTRY_ADD_OK;
 
-       list_for_each_entry_reverse(i, jlist->head, list) {
-               if (!JSET_NO_FLUSH(&i->j)) {
-                       last_seq = le64_to_cpu(i->j.last_seq);
-                       break;
-               }
-       }
-
        /* Is this entry older than the range we need? */
        if (!c->opts.read_entire_journal &&
-           le64_to_cpu(j->seq) < last_seq) {
-               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-               goto out;
-       }
+           le64_to_cpu(j->seq) < jlist->last_seq)
+               return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
+       /*
+        * genradixes are indexed by a ulong, not a u64, so we can't index them
+        * by sequence number directly: Assume instead that they will all fall
+        * within the range of +-2 billion of the first one we find.
+        */
+       if (!c->journal_entries_base_seq)
+               c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
 
        /* Drop entries we don't need anymore */
-       if (!JSET_NO_FLUSH(j)) {
-               list_for_each_entry_safe(i, pos, jlist->head, list) {
-                       if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+       if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+               genradix_for_each_from(&c->journal_entries, iter, _i,
+                                      journal_entry_radix_idx(c, jlist->last_seq)) {
+                       i = *_i;
+
+                       if (!i || i->ignore)
+                               continue;
+
+                       if (le64_to_cpu(i->j.seq) >= last_seq)
                                break;
                        journal_replay_free(c, i);
                }
        }
 
-       list_for_each_entry_reverse(i, jlist->head, list) {
-               if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
-                       where = &i->list;
-                       goto add;
-               }
-       }
+       jlist->last_seq = max(jlist->last_seq, last_seq);
 
-       where = jlist->head;
-add:
-       dup = where->next != jlist->head
-               ? container_of(where->next, struct journal_replay, list)
-               : NULL;
-
-       if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
-               dup = NULL;
+       _i = genradix_ptr_alloc(&c->journal_entries,
+                               journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+                               GFP_KERNEL);
+       if (!_i)
+               return -ENOMEM;
 
        /*
         * Duplicate journal entries? If so we want the one that didn't have a
         * checksum error:
         */
+       dup = *_i;
        if (dup) {
                if (dup->bad) {
                        /* we'll replace @dup: */
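
For illustration only (not part of the patch): a minimal standalone sketch of the windowed indexing that journal_entry_radix_idx() and the base-seq initialization above implement. radix_idx(), S32_MAX_STANDIN and first_seq are hypothetical stand-ins, and the clamp of the base to >= 1 is omitted. The mask ~0U >> 1 truncates to 31 bits, so any entry within roughly two billion sequence numbers of the first one read maps to a distinct index that fits an unsigned long:

#include <assert.h>
#include <stdint.h>

#define S32_MAX_STANDIN 0x7fffffffLL    /* stand-in for the kernel's S32_MAX */

/* same computation as journal_entry_radix_idx(), outside the kernel */
static uint32_t radix_idx(uint64_t base_seq, uint64_t seq)
{
        return (uint32_t)((seq - base_seq) & (~0U >> 1));
}

int main(void)
{
        uint64_t first_seq = 1ULL << 40;                  /* hypothetical first entry read */
        uint64_t base = first_seq - S32_MAX_STANDIN;      /* roughly what the patch picks */

        /* entries well before and well after first_seq get distinct indices */
        assert(radix_idx(base, first_seq - 1000) != radix_idx(base, first_seq + 1000));

        /* only sequence numbers a full 2^31 apart would alias to the same slot */
        assert(radix_idx(base, first_seq) == radix_idx(base, first_seq + (1ULL << 31)));
        return 0;
}
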
@@ -119,10 +127,8 @@ add:
        }
 
        i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
-       if (!i) {
-               ret = -ENOMEM;
-               goto out;
-       }
+       if (!i)
+               return -ENOMEM;
 
        i->nr_ptrs       = 0;
        i->bad          = bad;
@@ -132,10 +138,11 @@ add:
        if (dup) {
                i->nr_ptrs = dup->nr_ptrs;
                memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
-               __journal_replay_free(dup);
+               __journal_replay_free(c, dup);
        }
 
-       list_add(&i->list, where);
+
+       *_i = i;
 found:
        for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
                if (ptr->dev == ca->dev_idx) {
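
Purely as an illustration of the pointer hand-off above, a minimal sketch using simplified, hypothetical stand-ins for struct journal_replay and struct journal_ptr: when a duplicate entry is replaced, the new copy first inherits the pointers already collected for that sequence number, so nothing learned from other devices is lost before the old copy is freed:

#include <assert.h>
#include <string.h>

/* simplified, hypothetical stand-ins for struct journal_replay / journal_ptr */
struct ptr    { unsigned dev; unsigned long long sector; };
struct replay { unsigned nr_ptrs; struct ptr ptrs[8]; };

int main(void)
{
        struct replay dup  = { .nr_ptrs = 2,
                               .ptrs = { { .dev = 0, .sector = 64 },
                                         { .dev = 1, .sector = 128 } } };
        struct replay repl = { 0 };

        /* as in the patch: the replacement inherits the duplicate's pointers */
        repl.nr_ptrs = dup.nr_ptrs;
        memcpy(repl.ptrs, dup.ptrs, sizeof(dup.ptrs));

        assert(repl.nr_ptrs == 2 && repl.ptrs[1].dev == 1);
        return 0;
}
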
@@ -586,9 +593,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
                       le64_to_cpu(u->d[i].fragmented));
        }
 
-       pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu",
-              le64_to_cpu(u->buckets_ec),
-              le64_to_cpu(u->buckets_unavailable));
+       pr_buf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
 }
 
 static int journal_entry_log_validate(struct bch_fs *c,
@@ -916,7 +921,8 @@ static void bch2_journal_read_device(struct closure *cl)
        struct bch_fs *c = ca->fs;
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
-       struct journal_replay *r;
+       struct journal_replay *r, **_r;
+       struct genradix_iter iter;
        struct journal_read_buf buf = { NULL, 0 };
        u64 min_seq = U64_MAX;
        unsigned i;
@@ -959,11 +965,16 @@ static void bch2_journal_read_device(struct closure *cl)
        ja->sectors_free = ca->mi.bucket_size;
 
        mutex_lock(&jlist->lock);
-       list_for_each_entry(r, jlist->head, list) {
+       genradix_for_each(&c->journal_entries, iter, _r) {
+               r = *_r;
+
+               if (!r)
+                       continue;
+
                for (i = 0; i < r->nr_ptrs; i++) {
                        if (r->ptrs[i].dev == ca->dev_idx &&
                            sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
-                               unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) +
+                               unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
                                        vstruct_sectors(&r->j, c->block_bits);
 
                                ja->sectors_free = min(ja->sectors_free,
@@ -978,7 +989,7 @@ static void bch2_journal_read_device(struct closure *cl)
                bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
                bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
                for (i = 0; i < 3; i++) {
-                       unsigned idx = ja->cur_idx - 1 + i;
+                       unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
                        bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
                }
                ja->sectors_free = 0;
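
An aside on the index change above, with hypothetical values: ja->cur_idx is unsigned, so with cur_idx == 0 the old expression cur_idx - 1 + i wrapped around instead of pointing at the last bucket; adding ja->nr before the modulo keeps the index inside [0, nr):

#include <assert.h>
#include <limits.h>

int main(void)
{
        unsigned cur_idx = 0, nr = 8, i = 0;

        /* old arithmetic: underflows instead of selecting the last bucket */
        assert(cur_idx - 1 + i == UINT_MAX);

        /* new arithmetic: wraps cleanly to the previous bucket */
        assert((cur_idx + nr - 1 + i) % nr == 7);
        return 0;
}
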
@@ -1025,11 +1036,11 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        }
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list,
-                     u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
 {
        struct journal_list jlist;
-       struct journal_replay *i, *t;
+       struct journal_replay *i, **_i, *prev = NULL;
+       struct genradix_iter radix_iter;
        struct bch_dev *ca;
        unsigned iter;
        struct printbuf buf = PRINTBUF;
@@ -1040,11 +1051,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
 
        closure_init_stack(&jlist.cl);
        mutex_init(&jlist.lock);
-       jlist.head = list;
+       jlist.last_seq = 0;
        jlist.ret = 0;
 
        for_each_member_device(ca, c, iter) {
-               if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+               if (!c->opts.fsck &&
                    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
                        continue;
 
@@ -1064,22 +1075,21 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
        if (jlist.ret)
                return jlist.ret;
 
-       if (list_empty(list)) {
-               bch_info(c, "journal read done, but no entries found");
-               return 0;
-       }
-
-       i = list_last_entry(list, struct journal_replay, list);
-       *start_seq = le64_to_cpu(i->j.seq) + 1;
+       *start_seq = 0;
 
        /*
         * Find most recent flush entry, and ignore newer non flush entries -
         * those entries will be blacklisted:
         */
-       list_for_each_entry_safe_reverse(i, t, list, list) {
-               if (i->ignore)
+       genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
+               if (!*start_seq)
+                       *start_seq = le64_to_cpu(i->j.seq) + 1;
+
                if (!JSET_NO_FLUSH(&i->j)) {
                        last_seq        = le64_to_cpu(i->j.last_seq);
                        *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
@@ -1089,6 +1099,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                journal_replay_free(c, i);
        }
 
+       if (!*start_seq) {
+               bch_info(c, "journal read done, but no entries found");
+               return 0;
+       }
+
        if (!last_seq) {
                fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
                ret = -1;
@@ -1096,8 +1111,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
        }
 
        /* Drop blacklisted entries and entries older than last_seq: */
-       list_for_each_entry_safe(i, t, list, list) {
-               if (i->ignore)
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
                seq = le64_to_cpu(i->j.seq);
@@ -1116,8 +1133,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
 
        /* Check for missing entries: */
        seq = last_seq;
-       list_for_each_entry(i, list, list) {
-               if (i->ignore)
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
                        continue;
 
                BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1139,11 +1158,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                               !bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;
 
-                       if (i->list.prev != list) {
-                               struct journal_replay *p = list_prev_entry(i, list);
-
-                               bch2_journal_ptrs_to_text(&buf1, c, p);
-                               pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits));
+                       if (prev) {
+                               bch2_journal_ptrs_to_text(&buf1, c, prev);
+                               pr_buf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
                        } else
                                pr_buf(&buf1, "(none)");
                        bch2_journal_ptrs_to_text(&buf2, c, i);
@@ -1160,10 +1177,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                        printbuf_exit(&buf2);
                }
 
+               prev = i;
                seq++;
        }
 
-       list_for_each_entry(i, list, list) {
+       genradix_for_each(&c->journal_entries, radix_iter, _i) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas = {
@@ -1172,7 +1190,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                };
                unsigned ptr;
 
-               if (i->ignore)
+               i = *_i;
+               if (!i || i->ignore)
                        continue;
 
                ret = jset_validate_entries(c, &i->j, READ);
@@ -1193,10 +1212,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                bch2_replicas_entry_to_text(&buf, &replicas.e);
 
                if (!degraded &&
-                   (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                    fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
-                                "superblock not marked as containing replicas %s",
-                                buf.buf))) {
+                   fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
+                               "superblock not marked as containing replicas %s",
+                               buf.buf)) {
                        ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
                                goto err;
@@ -1423,7 +1441,8 @@ static void journal_write_done(struct closure *cl)
         * Must come before signaling write completion, for
         * bch2_fs_journal_stop():
         */
-       journal_reclaim_kick(&c->journal);
+       if (j->watermark)
+               journal_reclaim_kick(&c->journal);
 
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);