git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 6a361fb68c bcachefs: Rework btree read error handling
author Kent Overstreet <kent.overstreet@gmail.com>
Sat, 11 Nov 2017 07:18:52 +0000 (22:18 -0900)
committer Kent Overstreet <kent.overstreet@gmail.com>
Sat, 11 Nov 2017 07:18:52 +0000 (22:18 -0900)
.bcachefs_revision
libbcachefs/bkey.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/debug.c
libbcachefs/extents.c
libbcachefs/extents.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 3f7faac0f11410795818d920364afd4dfeeba394..aa0579cca4e02130de3241154a3c0ff109edfc24 100644 (file)
@@ -1 +1 @@
-58b77cfec62e8cdf6c1f7863a5066356ab77e7ad
+6a361fb68c8b0b7cd3bc0085b8d21b808fdc13eb
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 19bf1b8fec890af3badc65cdef3568e2212070fb..d33bc4e1331aafffa830152b7b75f29e662d463a 100644 (file)
@@ -626,25 +626,25 @@ const char *bch2_bkey_format_validate(struct bkey_format *f)
        unsigned i, bits = KEY_PACKED_BITS_START;
 
        if (f->nr_fields != BKEY_NR_FIELDS)
-               return "invalid format: incorrect number of fields";
+               return "incorrect number of fields";
 
        for (i = 0; i < f->nr_fields; i++) {
                u64 field_offset = le64_to_cpu(f->field_offset[i]);
 
                if (f->bits_per_field[i] > 64)
-                       return "invalid format: field too large";
+                       return "field too large";
 
                if (field_offset &&
                    (f->bits_per_field[i] == 64 ||
                    (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
                     field_offset)))
-                       return "invalid format: offset + bits overflow";
+                       return "offset + bits overflow";
 
                bits += f->bits_per_field[i];
        }
 
        if (f->key_u64s != DIV_ROUND_UP(bits, 64))
-               return "invalid format: incorrect key_u64s";
+               return "incorrect key_u64s";
 
        return NULL;
 }
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 302546f2c2abd63a3271c1e2eafe16659544007b..e5cc00cc040b961238689b35f1bf69389aad48a9 100644 (file)
@@ -986,8 +986,7 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
                bch2_btree_iter_cond_resched(&iter);
        }
 err:
-       bch2_btree_iter_unlock(&iter);
-       return ret;
+       return bch2_btree_iter_unlock(&iter) ?: ret;
 }
 
 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index e0735afa36c9054b61103fa968f73c091666990a..b600842ba629b06a339d0af42e9215c0b5c824e5 100644 (file)
@@ -855,9 +855,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
                bch2_btree_iter_reinit_node(iter, b);
 }
 
-static struct nonce btree_nonce(struct btree *b,
-                               struct bset *i,
-                               unsigned offset)
+static struct nonce btree_nonce(struct bset *i, unsigned offset)
 {
        return (struct nonce) {{
                [0] = cpu_to_le32(offset),
@@ -867,63 +865,165 @@ static struct nonce btree_nonce(struct btree *b,
        }};
 }
 
-static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
+static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
 {
+       struct nonce nonce = btree_nonce(i, offset);
+
+       if (!offset) {
+               struct btree_node *bn = container_of(i, struct btree_node, keys);
+               unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+               bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
+                            bytes);
+
+               nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+       }
+
        bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
-                   vstruct_end(i) - (void *) i->_data);
+                    vstruct_end(i) - (void *) i->_data);
+}
+
+static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
+                        unsigned offset, int write, char *buf, size_t len)
+{
+       char *out = buf, *end = buf + len;
+
+       out += scnprintf(out, end - out,
+                        "error validating btree node %s "
+                        "at btree %u level %u/%u\n"
+                        "pos %llu:%llu node offset %u",
+                        write ? "before write " : "",
+                        b->btree_id, b->level,
+                        c->btree_roots[b->btree_id].level,
+                        b->key.k.p.inode, b->key.k.p.offset,
+                        b->written);
+       if (i)
+               out += scnprintf(out, end - out,
+                                " bset u64s %u",
+                                le16_to_cpu(i->u64s));
+
+       return out - buf;
 }
 
-#define btree_node_error(c, b, msg, ...)                               \
-do {                                                                   \
-       if (write == READ &&                                            \
+enum btree_err_type {
+       BTREE_ERR_FIXABLE,
+       BTREE_ERR_WANT_RETRY,
+       BTREE_ERR_MUST_RETRY,
+       BTREE_ERR_FATAL,
+};
+
+enum btree_validate_ret {
+       BTREE_RETRY_READ = 64,
+};
+
+#define btree_err(type, c, b, i, msg, ...)                             \
+({                                                                     \
+       char buf[200], *out = buf, *end = out + sizeof(buf);            \
+                                                                       \
+       out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
+       out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__);      \
+                                                                       \
+       if (type == BTREE_ERR_FIXABLE &&                                \
+           write == READ &&                                            \
            !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
-               mustfix_fsck_err(c,                                     \
-                       "btree node read error at btree %u level %u/%u\n"\
-                       "pos %llu:%llu node offset %u bset u64s %u: " msg,\
-                       (b)->btree_id, (b)->level,                      \
-                       (c)->btree_roots[(b)->btree_id].level,          \
-                       (b)->key.k.p.inode, (b)->key.k.p.offset,        \
-                       (b)->written, le16_to_cpu((i)->u64s),           \
-                       ##__VA_ARGS__);                                 \
+               mustfix_fsck_err(c, "%s", buf);                         \
        } else {                                                        \
-               bch_err(c, "%s at btree %u level %u/%u\n"               \
-                       "pos %llu:%llu node offset %u bset u64s %u: " msg,\
-                       write == WRITE                                  \
-                       ? "corrupt metadata in btree node write"        \
-                       : "btree node error",                           \
-                       (b)->btree_id, (b)->level,                      \
-                       (c)->btree_roots[(b)->btree_id].level,          \
-                       (b)->key.k.p.inode, (b)->key.k.p.offset,        \
-                       (b)->written, le16_to_cpu((i)->u64s),           \
-                       ##__VA_ARGS__);                                 \
-               ret = BCH_FSCK_ERRORS_NOT_FIXED;                        \
-               goto fsck_err;                                          \
+               bch_err(c, "%s", buf);                                  \
+                                                                       \
+               switch (type) {                                         \
+               case BTREE_ERR_FIXABLE:                                 \
+                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       goto fsck_err;                                  \
+               case BTREE_ERR_WANT_RETRY:                              \
+                       if (have_retry) {                               \
+                               ret = BTREE_RETRY_READ;                 \
+                               goto fsck_err;                          \
+                       }                                               \
+                       break;                                          \
+               case BTREE_ERR_MUST_RETRY:                              \
+                       ret = BTREE_RETRY_READ;                         \
+                       goto fsck_err;                                  \
+               case BTREE_ERR_FATAL:                                   \
+                       ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
+                       goto fsck_err;                                  \
+               }                                                       \
        }                                                               \
-} while (0)
+       true;                                                           \
+})
+
+#define btree_err_on(cond, ...)        ((cond) ? btree_err(__VA_ARGS__) : false)
 
 static int validate_bset(struct bch_fs *c, struct btree *b,
                         struct bset *i, unsigned sectors,
-                        unsigned *whiteout_u64s, int write)
+                        unsigned *whiteout_u64s, int write,
+                        bool have_retry)
 {
        struct bkey_packed *k, *prev = NULL;
        struct bpos prev_pos = POS_MIN;
        bool seen_non_whiteout = false;
+       const char *err;
        int ret = 0;
 
-       if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
-               btree_node_error(c, b, "unsupported bset version");
+       if (i == &b->data->keys) {
+               /* These indicate that we read the wrong btree node: */
+               btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
+                            BTREE_ERR_MUST_RETRY, c, b, i,
+                            "incorrect btree id");
+
+               btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
+                            BTREE_ERR_MUST_RETRY, c, b, i,
+                            "incorrect level");
+
+               if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
+                       u64 *p = (u64 *) &b->data->ptr;
+
+                       *p = swab64(*p);
+                       bch2_bpos_swab(&b->data->min_key);
+                       bch2_bpos_swab(&b->data->max_key);
+               }
+
+               btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
+                            BTREE_ERR_MUST_RETRY, c, b, i,
+                            "incorrect max key");
+
+               /* XXX: ideally we would be validating min_key too */
+#if 0
+               /*
+                * not correct anymore, due to btree node write error
+                * handling
+                *
+                * need to add b->data->seq to btree keys and verify
+                * against that
+                */
+               btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
+                                                 b->data->ptr),
+                            BTREE_ERR_FATAL, c, b, i,
+                            "incorrect backpointer");
+#endif
+               err = bch2_bkey_format_validate(&b->data->format);
+               btree_err_on(err,
+                            BTREE_ERR_FATAL, c, b, i,
+                            "invalid bkey format: %s", err);
+       }
+
+       if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION,
+                        BTREE_ERR_FIXABLE, c, b, i,
+                        "unsupported bset version")) {
+               i->version = cpu_to_le16(BCACHE_BSET_VERSION);
                i->u64s = 0;
                return 0;
        }
 
-       if (b->written + sectors > c->opts.btree_node_size) {
-               btree_node_error(c, b, "bset past end of btree node");
+       if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
+                        BTREE_ERR_FIXABLE, c, b, i,
+                        "bset past end of btree node")) {
                i->u64s = 0;
                return 0;
        }
 
-       if (b->written && !i->u64s)
-               btree_node_error(c, b, "empty set");
+       btree_err_on(b->written && !i->u64s,
+                    BTREE_ERR_FIXABLE, c, b, i,
+                    "empty bset");
 
        if (!BSET_SEPARATE_WHITEOUTS(i)) {
                seen_non_whiteout = true;
@@ -936,27 +1036,24 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                struct bkey tmp;
                const char *invalid;
 
-               if (!k->u64s) {
-                       btree_node_error(c, b,
-                               "KEY_U64s 0: %zu bytes of metadata lost",
-                               vstruct_end(i) - (void *) k);
-
+               if (btree_err_on(!k->u64s,
+                                BTREE_ERR_FIXABLE, c, b, i,
+                                "KEY_U64s 0: %zu bytes of metadata lost",
+                                vstruct_end(i) - (void *) k)) {
                        i->u64s = cpu_to_le16((u64 *) k - i->_data);
                        break;
                }
 
-               if (bkey_next(k) > vstruct_last(i)) {
-                       btree_node_error(c, b,
-                                        "key extends past end of bset");
-
+               if (btree_err_on(bkey_next(k) > vstruct_last(i),
+                                BTREE_ERR_FIXABLE, c, b, i,
+                                "key extends past end of bset")) {
                        i->u64s = cpu_to_le16((u64 *) k - i->_data);
                        break;
                }
 
-               if (k->format > KEY_FORMAT_CURRENT) {
-                       btree_node_error(c, b,
-                                        "invalid bkey format %u", k->format);
-
+               if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
+                                BTREE_ERR_FIXABLE, c, b, i,
+                                "invalid bkey format %u", k->format)) {
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
                                          (u64 *) vstruct_end(i) - (u64 *) k);
@@ -974,8 +1071,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 
                        bch2_bkey_val_to_text(c, btree_node_type(b),
                                              buf, sizeof(buf), u);
-                       btree_node_error(c, b,
-                                        "invalid bkey %s: %s", buf, invalid);
+                       btree_err(BTREE_ERR_FIXABLE, c, b, i,
+                                 "invalid bkey %s: %s", buf, invalid);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
@@ -995,12 +1092,12 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                        *whiteout_u64s = k->_data - i->_data;
                        seen_non_whiteout = true;
                } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
-                       btree_node_error(c, b,
-                                        "keys out of order: %llu:%llu > %llu:%llu",
-                                        prev_pos.inode,
-                                        prev_pos.offset,
-                                        u.k->p.inode,
-                                        bkey_start_offset(u.k));
+                       btree_err(BTREE_ERR_FATAL, c, b, i,
+                                 "keys out of order: %llu:%llu > %llu:%llu",
+                                 prev_pos.inode,
+                                 prev_pos.offset,
+                                 u.k->p.inode,
+                                 bkey_start_offset(u.k));
                        /* XXX: repair this */
                }
 
@@ -1014,101 +1111,55 @@ fsck_err:
        return ret;
 }
 
-int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b)
+int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
 {
        struct btree_node_entry *bne;
-       struct bset *i = &b->data->keys;
        struct btree_node_iter *iter;
        struct btree_node *sorted;
        bool used_mempool;
        unsigned u64s;
-       const char *err;
-       struct bch_csum csum;
-       struct nonce nonce;
-       int ret, should_retry = 0, write = READ;
+       int ret, retry_read = 0, write = READ;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
        __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
 
-       err = "dynamic fault";
        if (bch2_meta_read_fault("btree"))
-               goto err;
+               btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
+                         "dynamic fault");
+
+       btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
+                    BTREE_ERR_MUST_RETRY, c, b, NULL,
+                    "bad magic");
+
+       btree_err_on(!b->data->keys.seq,
+                    BTREE_ERR_MUST_RETRY, c, b, NULL,
+                    "bad btree header");
 
        while (b->written < c->opts.btree_node_size) {
                unsigned sectors, whiteout_u64s = 0;
+               struct nonce nonce;
+               struct bch_csum csum;
+               struct bset *i;
 
                if (!b->written) {
                        i = &b->data->keys;
 
-                       err = "bad magic";
-                       if (le64_to_cpu(b->data->magic) != bset_magic(c))
-                               goto retry_err;
-
-                       err = "bad btree header";
-                       if (!b->data->keys.seq)
-                               goto retry_err;
-
-                       err = "unknown checksum type";
-                       if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
-                               goto retry_err;
+                       btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    "unknown checksum type");
 
-                       nonce = btree_nonce(b, i, b->written << 9);
+                       nonce = btree_nonce(i, b->written << 9);
                        csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
 
-                       err = "bad checksum";
-                       if (bch2_crc_cmp(csum, b->data->csum))
-                               goto retry_err;
+                       btree_err_on(bch2_crc_cmp(csum, b->data->csum),
+                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    "invalid checksum");
 
-                       bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
-                                   &b->data->flags,
-                                   (void *) &b->data->keys -
-                                   (void *) &b->data->flags);
-                       nonce = nonce_add(nonce,
-                                         round_up((void *) &b->data->keys -
-                                                  (void *) &b->data->flags,
-                                                  CHACHA20_BLOCK_SIZE));
-                       bset_encrypt(c, i, nonce);
+                       bset_encrypt(c, i, b->written << 9);
 
                        sectors = vstruct_sectors(b->data, c->block_bits);
 
-                       if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
-                               u64 *p = (u64 *) &b->data->ptr;
-
-                               *p = swab64(*p);
-                               bch2_bpos_swab(&b->data->min_key);
-                               bch2_bpos_swab(&b->data->max_key);
-                       }
-
-                       err = "incorrect btree id";
-                       if (BTREE_NODE_ID(b->data) != b->btree_id)
-                               goto err;
-
-                       err = "incorrect level";
-                       if (BTREE_NODE_LEVEL(b->data) != b->level)
-                               goto err;
-
-                       err = "incorrect max key";
-                       if (bkey_cmp(b->data->max_key, b->key.k.p))
-                               goto err;
-#if 0
-                       /*
-                        * not correct anymore, due to btree node write error
-                        * handling
-                        *
-                        * need to add b->data->seq to btree keys and verify
-                        * against that
-                        */
-                       err = "incorrect backpointer";
-                       if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
-                                                b->data->ptr))
-                               goto err;
-#endif
-                       err = bch2_bkey_format_validate(&b->data->format);
-                       if (err)
-                               goto err;
-
                        set_btree_bset(b, b->set, &b->data->keys);
-
                        btree_node_set_format(b, b->data->format);
                } else {
                        bne = write_block(b);
@@ -1117,32 +1168,35 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b)
                        if (i->seq != b->data->keys.seq)
                                break;
 
-                       err = "unknown checksum type";
-                       if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
-                               goto retry_err;
+                       btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    "unknown checksum type");
 
-                       nonce = btree_nonce(b, i, b->written << 9);
+                       nonce = btree_nonce(i, b->written << 9);
                        csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
 
-                       err = "bad checksum";
-                       if (bch2_crc_cmp(csum, bne->csum))
-                               goto retry_err;
+                       btree_err_on(bch2_crc_cmp(csum, bne->csum),
+                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    "invalid checksum");
 
-                       bset_encrypt(c, i, nonce);
+                       bset_encrypt(c, i, b->written << 9);
 
                        sectors = vstruct_sectors(bne, c->block_bits);
                }
 
-               ret = validate_bset(c, b, i, sectors, &whiteout_u64s, READ);
+               ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
+                                   READ, have_retry);
                if (ret)
                        goto fsck_err;
 
                b->written += sectors;
 
-               err = "insufficient memory";
                ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
-               if (ret < 0)
+               if (ret < 0) {
+                       btree_err(BTREE_ERR_FATAL, c, b, i,
+                                 "insufficient memory");
                        goto err;
+               }
 
                if (ret)
                        continue;
@@ -1156,12 +1210,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b)
                                           vstruct_last(i));
        }
 
-       err = "corrupted btree";
        for (bne = write_block(b);
             bset_byte_offset(b, bne) < btree_bytes(c);
             bne = (void *) bne + block_bytes(c))
-               if (bne->keys.seq == b->data->keys.seq)
-                       goto err;
+               btree_err_on(bne->keys.seq == b->data->keys.seq,
+                            BTREE_ERR_WANT_RETRY, c, b, NULL,
+                            "found bset signature after last bset");
 
        sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
        sorted->keys.u64s = 0;
@@ -1188,15 +1242,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b)
        btree_node_reset_sib_u64s(b);
 out:
        mempool_free(iter, &c->fill_iter);
-       return should_retry;
+       return retry_read;
 err:
-       btree_node_error(c, b, "%s", err);
 fsck_err:
-       bch2_inconsistent_error(c);
-       set_btree_node_read_error(b);
-       goto out;
-retry_err:
-       should_retry = -1;
+       if (ret == BTREE_RETRY_READ) {
+               retry_read = 1;
+       } else {
+               bch2_inconsistent_error(c);
+               set_btree_node_read_error(b);
+       }
        goto out;
 }
 
@@ -1205,55 +1259,41 @@ static void btree_node_read_work(struct work_struct *work)
        struct btree_read_bio *rb =
                container_of(work, struct btree_read_bio, work);
        struct bch_fs *c        = rb->c;
-       struct bch_dev *ca      = rb->pick.ca;
        struct btree *b         = rb->bio.bi_private;
        struct bio *bio         = &rb->bio;
-       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-       const struct bch_extent_ptr *ptr;
        struct bch_devs_mask avoid;
 
-       bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
-       percpu_ref_put(&rb->pick.ca->io_ref);
-
-       if (!bio->bi_error &&
-           !bch2_btree_node_read_done(c, b))
-               goto out;
-
-       goto err;
-out:
-       bch2_time_stats_update(&c->btree_read_time, rb->start_time);
-       bio_put(&rb->bio);
-       clear_btree_node_read_in_flight(b);
-       wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
-       return;
-err:
        memset(&avoid, 0, sizeof(avoid));
-       __set_bit(ca->dev_idx, avoid.d);
-
-       extent_for_each_ptr(e, ptr) {
-               memset(&rb->pick, 0, sizeof(rb->pick));
-               bch2_get_read_device(c, e.k, ptr, NULL, &avoid, &rb->pick);
-
-               if (!rb->pick.ca)
-                       continue;
 
+       goto start;
+       do {
                bio_reset(bio);
                bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
                bio->bi_bdev            = rb->pick.ca->disk_sb.bdev;
                bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
                bio->bi_iter.bi_size    = btree_bytes(c);
                submit_bio_wait(bio);
-
+start:
                bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
                percpu_ref_put(&rb->pick.ca->io_ref);
 
+               __set_bit(rb->pick.ca->dev_idx, avoid.d);
+               rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
+
                if (!bio->bi_error &&
-                   !bch2_btree_node_read_done(c, b))
+                   !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
                        goto out;
-       }
+       } while (!IS_ERR_OR_NULL(rb->pick.ca));
 
        set_btree_node_read_error(b);
-       goto out;
+out:
+       if (!IS_ERR_OR_NULL(rb->pick.ca))
+               percpu_ref_put(&rb->pick.ca->io_ref);
+
+       bch2_time_stats_update(&c->btree_read_time, rb->start_time);
+       bio_put(&rb->bio);
+       clear_btree_node_read_in_flight(b);
+       wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
 }
 
 static void btree_node_read_endio(struct bio *bio)
@@ -1274,7 +1314,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 
        trace_btree_read(c, b);
 
-       pick = bch2_btree_pick_ptr(c, b);
+       pick = bch2_btree_pick_ptr(c, b, NULL);
        if (bch2_fs_fatal_err_on(!pick.ca, c,
                        "btree node read error: no device to read from")) {
                set_btree_node_read_error(b);
@@ -1469,7 +1509,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
        extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
                break;
 
-       ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE);
+       ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
        if (ret)
                bch2_inconsistent_error(c);
 
@@ -1619,31 +1659,19 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        i->version = cpu_to_le16(BCACHE_BSET_VERSION);
        SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
 
-       nonce = btree_nonce(b, i, b->written << 9);
-
        /* if we're going to be encrypting, check metadata validity first: */
        if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
            validate_bset_for_write(c, b, i, sectors_to_write))
                goto err;
 
-       if (bn) {
-               bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
-                           &bn->flags,
-                           (void *) &b->data->keys -
-                           (void *) &b->data->flags);
-               nonce = nonce_add(nonce,
-                                 round_up((void *) &b->data->keys -
-                                          (void *) &b->data->flags,
-                                          CHACHA20_BLOCK_SIZE));
-               bset_encrypt(c, i, nonce);
-
-               nonce = btree_nonce(b, i, b->written << 9);
-               bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
-       } else {
-               bset_encrypt(c, i, nonce);
+       bset_encrypt(c, i, b->written << 9);
+
+       nonce = btree_nonce(i, b->written << 9);
 
+       if (bn)
+               bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
+       else
                bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-       }
 
        /* if we're not encrypting, check metadata after checksumming: */
        if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 537b8e1d0a7f9d3a784d5d00da05195b69d2b564..f3290f989d6b5a3a7b062f5e2954d15af8c2d75d 100644 (file)
@@ -72,7 +72,7 @@ void bch2_btree_build_aux_trees(struct btree *);
 void bch2_btree_init_next(struct bch_fs *, struct btree *,
                         struct btree_iter *);
 
-int bch2_btree_node_read_done(struct bch_fs *, struct btree *);
+int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
 void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
 int bch2_btree_root_read(struct bch_fs *, enum btree_id,
                         const struct bkey_i *, unsigned);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index f4f73bfc85470f036d5c847fc78c580828b3007c..b1b6233993dc0a48c25424f776fda367ef6c0ded 100644 (file)
@@ -928,7 +928,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
 
        ret = bch2_btree_iter_traverse(iter);
        if (ret)
-               return NULL;
+               return ERR_PTR(ret);
 
        b = iter->nodes[iter->level];
 
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index db03a3415e52fb7f3b1891c247a2d792fc4dd205..ccfb0386166ed6a7b1373f06552bd99b52cfa3c2 100644 (file)
@@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        v->btree_id     = b->btree_id;
        bch2_btree_keys_init(v, &c->expensive_debug_checks);
 
-       pick = bch2_btree_pick_ptr(c, b);
+       pick = bch2_btree_pick_ptr(c, b, NULL);
        if (IS_ERR_OR_NULL(pick.ca))
                return;
 
@@ -68,14 +68,14 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        submit_bio_wait(bio);
 
        bio_put(bio);
+       percpu_ref_put(&pick.ca->io_ref);
 
        memcpy(n_ondisk, n_sorted, btree_bytes(c));
 
-       bch2_btree_node_read_done(c, v);
-       n_sorted = c->verify_data->data;
-
-       percpu_ref_put(&pick.ca->io_ref);
+       if (bch2_btree_node_read_done(c, v, false))
+               goto out;
 
+       n_sorted = c->verify_data->data;
        sorted = &n_sorted->keys;
        inmemory = &n_inmemory->keys;
 
@@ -127,7 +127,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
                console_unlock();
                panic("verify failed at %u\n", j);
        }
-
+out:
        mutex_unlock(&c->verify_lock);
        btree_node_io_unlock(b);
 }
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 742a9a005b28beb81987bad076b7f20730638bb1..1937f4cb9f05891ccbbe3a533b6f918926d23305 100644 (file)
@@ -499,52 +499,43 @@ out:
        return out - buf;
 }
 
-void bch2_get_read_device(struct bch_fs *c,
-                         const struct bkey *k,
-                         const struct bch_extent_ptr *ptr,
-                         const union bch_extent_crc *crc,
-                         struct bch_devs_mask *avoid,
-                         struct extent_pick_ptr *pick)
+static void extent_pick_read_device(struct bch_fs *c,
+                                   struct bkey_s_c_extent e,
+                                   struct bch_devs_mask *avoid,
+                                   struct extent_pick_ptr *pick)
 {
-       struct bch_dev *ca = c->devs[ptr->dev];
-
-       if (ptr->cached && ptr_stale(ca, ptr))
-               return;
+       const union bch_extent_crc *crc;
+       const struct bch_extent_ptr *ptr;
 
-       if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-               return;
+       extent_for_each_ptr_crc(e, ptr, crc) {  /* consider every ptr in e */
+               struct bch_dev *ca = c->devs[ptr->dev];
 
-       if (avoid && test_bit(ca->dev_idx, avoid->d))
-               return;
+               if (ptr->cached && ptr_stale(ca, ptr))
+                       continue;       /* stale cached ptr: try the next one */
 
-       if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
-               return;
+               if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
+                       continue;       /* device is marked failed */
 
-       if (!percpu_ref_tryget(&ca->io_ref))
-               return;
+               if (avoid && test_bit(ca->dev_idx, avoid->d))
+                       continue;       /* caller asked to avoid this device */
 
-       if (pick->ca)
-               percpu_ref_put(&pick->ca->io_ref);
+               if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
+                       continue;       /* keep current pick: it has the lower tier */
 
-       *pick = (struct extent_pick_ptr) {
-               .ptr    = *ptr,
-               .ca     = ca,
-       };
+               if (!percpu_ref_tryget(&ca->io_ref))
+                       continue;       /* could not take an io_ref on this device */
 
-       if (k->size)
-               pick->crc = crc_to_128(k, crc);
-}
+               if (pick->ca)
+                       percpu_ref_put(&pick->ca->io_ref); /* drop ref on replaced pick */
 
-static void extent_pick_read_device(struct bch_fs *c,
-                                   struct bkey_s_c_extent e,
-                                   struct bch_devs_mask *avoid,
-                                   struct extent_pick_ptr *pick)
-{
-       const union bch_extent_crc *crc;
-       const struct bch_extent_ptr *ptr;
+               *pick = (struct extent_pick_ptr) {
+                       .ptr    = *ptr,
+                       .ca     = ca,
+               };
 
-       extent_for_each_ptr_crc(e, ptr, crc)
-               bch2_get_read_device(c, e.k, ptr, crc, avoid, pick);
+               if (e.k->size)
+                       pick->crc = crc_to_128(e.k, crc);
+       }
 }
 
 /* Btree ptrs */
@@ -667,12 +658,13 @@ static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
 }
 
 struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
+bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
+                   struct bch_devs_mask *avoid)
 {
        struct extent_pick_ptr pick = { .ca = NULL };
 
        extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
-                               NULL, &pick);
+                               avoid, &pick);  /* avoid may be NULL: no device excluded */
 
        return pick;
 }
index dc2fcbc11412da298bc167ff9e03989a1515ea59..634159f279585787efac9438c7ca622592273f1e 100644 (file)
@@ -25,14 +25,9 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
 extern const struct bkey_ops bch2_bkey_btree_ops;
 extern const struct bkey_ops bch2_bkey_extent_ops;
 
-void bch2_get_read_device(struct bch_fs *,
-                         const struct bkey *,
-                         const struct bch_extent_ptr *,
-                         const union bch_extent_crc *,
-                         struct bch_devs_mask *,
-                         struct extent_pick_ptr *);
 struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *, const struct btree *);
+bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
+                   struct bch_devs_mask *avoid);
 
 void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
                          struct bch_devs_mask *,