git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/journal_io.c
Update bcachefs sources to c7defb5793 bcachefs: Split btree_iter_traverse and bch2_bt...
[bcachefs-tools-debian] / libbcachefs / journal_io.c
index eacc9b2c362fc476126871f19983a59f2e239037..54f2e2053bc0bb43acfd81c7af4e62dfa3f5b618 100644 (file)
@@ -5,6 +5,7 @@
 #include "btree_update_interior.h"
 #include "buckets.h"
 #include "checksum.h"
+#include "disk_groups.h"
 #include "error.h"
 #include "io.h"
 #include "journal.h"
@@ -201,22 +202,19 @@ static void journal_entry_null_range(void *start, void *end)
 
 #define FSCK_DELETED_KEY       5
 
-static int journal_validate_key(struct bch_fs *c, struct jset *jset,
+static int journal_validate_key(struct bch_fs *c, const char *where,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
-                               struct bkey_i *k,
-                               const char *type, int write)
+                               struct bkey_i *k, const char *type,
+                               unsigned version, int big_endian, int write)
 {
        void *next = vstruct_next(entry);
        const char *invalid;
-       unsigned version = le32_to_cpu(jset->version);
        int ret = 0;
 
        if (journal_entry_err_on(!k->k.u64s, c,
-                       "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0",
-                       type, le64_to_cpu(jset->seq),
-                       (u64 *) entry - jset->_data,
-                       le32_to_cpu(jset->u64s),
+                       "invalid %s in %s entry offset %zi/%u: k->u64s 0",
+                       type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
@@ -226,10 +224,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
 
        if (journal_entry_err_on((void *) bkey_next(k) >
                                (void *) vstruct_next(entry), c,
-                       "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry",
-                       type, le64_to_cpu(jset->seq),
-                       (u64 *) entry - jset->_data,
-                       le32_to_cpu(jset->u64s),
+                       "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
+                       type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
@@ -238,10 +234,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
        }
 
        if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
-                       "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u",
-                       type, le64_to_cpu(jset->seq),
-                       (u64 *) entry - jset->_data,
-                       le32_to_cpu(jset->u64s),
+                       "invalid %s in %s entry offset %zi/%u: bad format %u",
+                       type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s),
                        k->k.format)) {
@@ -252,9 +246,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
        }
 
        if (!write)
-               bch2_bkey_compat(level, btree_id, version,
-                           JSET_BIG_ENDIAN(jset), write,
-                           NULL, bkey_to_packed(k));
+               bch2_bkey_compat(level, btree_id, version, big_endian,
+                                write, NULL, bkey_to_packed(k));
 
        invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                                    __btree_node_type(level, btree_id));
@@ -262,10 +255,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
                char buf[160];
 
                bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
-               mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s",
-                                type, le64_to_cpu(jset->seq),
-                                (u64 *) entry - jset->_data,
-                                le32_to_cpu(jset->u64s),
+               mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
+                                type, where,
                                 (u64 *) k - entry->_data,
                                 le16_to_cpu(entry->u64s),
                                 invalid, buf);
@@ -277,25 +268,24 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
        }
 
        if (write)
-               bch2_bkey_compat(level, btree_id, version,
-                           JSET_BIG_ENDIAN(jset), write,
-                           NULL, bkey_to_packed(k));
+               bch2_bkey_compat(level, btree_id, version, big_endian,
+                                write, NULL, bkey_to_packed(k));
 fsck_err:
        return ret;
 }
 
 static int journal_entry_validate_btree_keys(struct bch_fs *c,
-                                            struct jset *jset,
+                                            const char *where,
                                             struct jset_entry *entry,
-                                            int write)
+                                            unsigned version, int big_endian, int write)
 {
        struct bkey_i *k = entry->start;
 
        while (k != vstruct_last(entry)) {
-               int ret = journal_validate_key(c, jset, entry,
+               int ret = journal_validate_key(c, where, entry,
                                               entry->level,
                                               entry->btree_id,
-                                              k, "key", write);
+                                              k, "key", version, big_endian, write);
                if (ret == FSCK_DELETED_KEY)
                        continue;
 
@@ -306,9 +296,9 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
 }
 
 static int journal_entry_validate_btree_root(struct bch_fs *c,
-                                            struct jset *jset,
+                                            const char *where,
                                             struct jset_entry *entry,
-                                            int write)
+                                            unsigned version, int big_endian, int write)
 {
        struct bkey_i *k = entry->start;
        int ret = 0;
@@ -327,25 +317,25 @@ static int journal_entry_validate_btree_root(struct bch_fs *c,
                return 0;
        }
 
-       return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
-                                   "btree root", write);
+       return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
+                                   "btree root", version, big_endian, write);
 fsck_err:
        return ret;
 }
 
 static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
-                                           struct jset *jset,
+                                           const char *where,
                                            struct jset_entry *entry,
-                                           int write)
+                                           unsigned version, int big_endian, int write)
 {
        /* obsolete, don't care: */
        return 0;
 }
 
 static int journal_entry_validate_blacklist(struct bch_fs *c,
-                                           struct jset *jset,
+                                           const char *where,
                                            struct jset_entry *entry,
-                                           int write)
+                                           unsigned version, int big_endian, int write)
 {
        int ret = 0;
 
@@ -358,9 +348,9 @@ fsck_err:
 }
 
 static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
-                                              struct jset *jset,
+                                              const char *where,
                                               struct jset_entry *entry,
-                                              int write)
+                                              unsigned version, int big_endian, int write)
 {
        struct jset_entry_blacklist_v2 *bl_entry;
        int ret = 0;
@@ -384,9 +374,9 @@ fsck_err:
 }
 
 static int journal_entry_validate_usage(struct bch_fs *c,
-                                       struct jset *jset,
+                                       const char *where,
                                        struct jset_entry *entry,
-                                       int write)
+                                       unsigned version, int big_endian, int write)
 {
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);
@@ -405,9 +395,9 @@ fsck_err:
 }
 
 static int journal_entry_validate_data_usage(struct bch_fs *c,
-                                       struct jset *jset,
+                                       const char *where,
                                        struct jset_entry *entry,
-                                       int write)
+                                       unsigned version, int big_endian, int write)
 {
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);
@@ -426,9 +416,72 @@ fsck_err:
        return ret;
 }
 
+static int journal_entry_validate_clock(struct bch_fs *c,
+                                       const char *where,
+                                       struct jset_entry *entry,
+                                       unsigned version, int big_endian, int write)
+{      /* Validate a clock journal entry: check its size and its rw field. */
+       struct jset_entry_clock *clock =
+               container_of(entry, struct jset_entry_clock, entry);
+       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+       int ret = 0;
+       /* Size must equal sizeof(*clock) exactly; if not, hand the whole entry to journal_entry_null_range(). */
+       if (journal_entry_err_on(bytes != sizeof(*clock),
+                                c, "invalid journal entry clock: bad size")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+       /* rw values above 1 are rejected and handled the same way. */
+       if (journal_entry_err_on(clock->rw > 1,
+                                c, "invalid journal entry clock: bad rw")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+       /* NOTE(review): no goto in view targets fsck_err; presumably journal_entry_err_on() uses it and ret - confirm. */
+fsck_err:
+       return ret;
+}
+
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+                                           const char *where,
+                                           struct jset_entry *entry,
+                                           unsigned version, int big_endian, int write)
+{      /* Validate a dev usage journal entry: minimum size, device index, and zero padding. */
+       struct jset_entry_dev_usage *u =
+               container_of(entry, struct jset_entry_dev_usage, entry);
+       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+       unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+       unsigned dev;
+       int ret = 0;
+       /* Only a lower bound: shorter entries go to journal_entry_null_range(); longer ones pass this check. */
+       if (journal_entry_err_on(bytes < expected,
+                                c, "invalid journal entry dev usage: bad size (%u < %u)",
+                                bytes, expected)) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+       /* Decode the little-endian device index; it must be accepted by bch2_dev_exists2(). */
+       dev = le32_to_cpu(u->dev);
+
+       if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+                                c, "invalid journal entry dev usage: bad dev")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+       /* pad must be zero. */
+       if (journal_entry_err_on(u->pad,
+                                c, "invalid journal entry dev usage: bad pad")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+       /* NOTE(review): no goto in view targets fsck_err; presumably journal_entry_err_on() uses it and ret - confirm. */
+fsck_err:
+       return ret;
+}
+
 struct jset_entry_ops {
-       int (*validate)(struct bch_fs *, struct jset *,
-                       struct jset_entry *, int);
+       int (*validate)(struct bch_fs *, const char *,
+                       struct jset_entry *, unsigned, int, int);
 };
 
 static const struct jset_entry_ops bch2_jset_entry_ops[] = {
@@ -440,22 +493,29 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = {
 #undef x
 };
 
-static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
-                                 struct jset_entry *entry, int write)
+int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian, int write)
 {
        return entry->type < BCH_JSET_ENTRY_NR
-               ? bch2_jset_entry_ops[entry->type].validate(c, jset,
-                                                           entry, write)
+               ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
+                               version, big_endian, write)
                : 0;
 }
 
 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                                 int write)
 {
+       char buf[100];
        struct jset_entry *entry;
        int ret = 0;
 
        vstruct_for_each(jset, entry) {
+               scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
+                         le64_to_cpu(jset->seq),
+                         (u64 *) entry - jset->_data,
+                         le32_to_cpu(jset->u64s));
+
                if (journal_entry_err_on(vstruct_next(entry) >
                                         vstruct_last(jset), c,
                                "journal entry extends past end of jset")) {
@@ -463,7 +523,9 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                        break;
                }
 
-               ret = journal_entry_validate(c, jset, entry, write);
+               ret = bch2_journal_entry_validate(c, buf, entry,
+                                       le32_to_cpu(jset->version),
+                                       JSET_BIG_ENDIAN(jset), write);
                if (ret)
                        break;
        }
@@ -773,13 +835,15 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 
        for (i = 0; i < j->nr_ptrs; i++) {
                struct bch_dev *ca = c->devs[j->ptrs[i].dev];
+               u64 offset;
+
+               div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
 
                if (i)
                        pr_buf(out, " ");
                pr_buf(out, "%u:%llu (offset %llu)",
                       j->ptrs[i].dev,
-                      (u64) j->ptrs[i].offset,
-                      (u64) j->ptrs[i].offset % ca->mi.bucket_size);
+                      (u64) j->ptrs[i].offset, offset);
        }
 }
 
@@ -805,8 +869,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
                        continue;
 
-               if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
-                    ca->mi.state == BCH_MEMBER_STATE_RO) &&
+               if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
+                    ca->mi.state == BCH_MEMBER_STATE_ro) &&
                    percpu_ref_tryget(&ca->io_ref))
                        closure_call(&ca->journal.read,
                                     bch2_journal_read_device,
@@ -937,6 +1001,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                for (ptr = 0; ptr < i->nr_ptrs; ptr++)
                        replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
 
+               bch2_replicas_entry_sort(&replicas.e);
+
                /*
                 * If we're mounting in degraded mode - if we didn't read all
                 * the devices - this is wrong:
@@ -997,7 +1063,7 @@ static void __journal_write_alloc(struct journal *j,
                 * it:
                 */
                if (!ca->mi.durability ||
-                   ca->mi.state != BCH_MEMBER_STATE_RW ||
+                   ca->mi.state != BCH_MEMBER_STATE_rw ||
                    !ja->nr ||
                    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
                                         ca->dev_idx) ||
@@ -1032,16 +1098,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
                               unsigned sectors)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_devs_mask devs;
        struct journal_device *ja;
        struct bch_dev *ca;
        struct dev_alloc_list devs_sorted;
+       unsigned target = c->opts.metadata_target ?:
+               c->opts.foreground_target;
        unsigned i, replicas = 0, replicas_want =
                READ_ONCE(c->opts.metadata_replicas);
 
        rcu_read_lock();
+retry:
+       devs = target_rw_devs(c, BCH_DATA_journal, target);
 
-       devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
-                                         &c->rw_devs[BCH_DATA_journal]);
+       devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
 
        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);
@@ -1073,6 +1143,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 
        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);
+
+       if (replicas < replicas_want && target) {
+               /* Retry from all devices: */
+               target = 0;
+               goto retry;
+       }
 done:
        rcu_read_unlock();
 
@@ -1278,6 +1354,9 @@ static void do_journal_write(struct closure *cl)
                bio->bi_private         = ca;
                bio->bi_opf             = REQ_OP_WRITE|REQ_SYNC|REQ_META;
 
+               BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+               ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
                if (!JSET_NO_FLUSH(w->data))
                        bio->bi_opf    |= REQ_FUA;
                if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
@@ -1305,6 +1384,7 @@ void bch2_journal_write(struct closure *cl)
        struct jset_entry *start, *end;
        struct jset *jset;
        struct bio *bio;
+       char *journal_debug_buf = NULL;
        bool validate_before_checksum = false;
        unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
        int ret;
@@ -1348,8 +1428,8 @@ void bch2_journal_write(struct closure *cl)
 
        end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
 
-       end     = bch2_journal_super_entries_add_common(c, end,
-                                               le64_to_cpu(jset->seq));
+       bch2_journal_super_entries_add_common(c, &end,
+                               le64_to_cpu(jset->seq));
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
 
@@ -1358,10 +1438,7 @@ void bch2_journal_write(struct closure *cl)
 
        journal_write_compact(jset);
 
-       jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
-       jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
        jset->magic             = cpu_to_le64(jset_magic(c));
-
        jset->version           = c->sb.version < bcachefs_metadata_version_new_versioning
                ? cpu_to_le32(BCH_JSET_VERSION_OLD)
                : cpu_to_le32(c->sb.version);
@@ -1409,6 +1486,12 @@ retry_alloc:
                goto retry_alloc;
        }
 
+       if (ret) {
+               journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+               if (journal_debug_buf)
+                       __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+       }
+
        /*
         * write is allocated, no longer need to account for it in
         * bch2_journal_space_available():
@@ -1423,7 +1506,9 @@ retry_alloc:
        spin_unlock(&j->lock);
 
        if (ret) {
-               bch_err(c, "Unable to allocate journal write");
+               bch_err(c, "Unable to allocate journal write:\n%s",
+                       journal_debug_buf);
+               kfree(journal_debug_buf);
                bch2_fatal_error(c);
                continue_at(cl, journal_write_done, system_highpri_wq);
                return;