+
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
+
+ queue_work(c->io_complete_wq, &rb->work);
+}
+
+struct btree_node_read_all {
+ struct closure cl;
+ struct bch_fs *c;
+ struct btree *b;
+ unsigned nr;
+ void *buf[BCH_REPLICAS_MAX];
+ struct bio *bio[BCH_REPLICAS_MAX];
+ blk_status_t err[BCH_REPLICAS_MAX];
+};
+
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+ unsigned offset = 0;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return 0;
+
+ while (offset < btree_sectors(c)) {
+ if (!offset) {
+ offset += vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = data + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ offset += vstruct_sectors(bne, c->block_bits);
+ }
+ }
+
+ return offset;
+}
+
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+
+ if (!offset)
+ return false;
+
+ while (offset < btree_sectors(c)) {
+ bne = data + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq)
+ return true;
+ offset++;
+ }
+
+ return false;
+ return offset;
+}
+
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
+{
+ closure_type(ra, struct btree_node_read_all, cl);
+ struct bch_fs *c = ra->c;
+ struct btree *b = ra->b;
+ struct printbuf buf = PRINTBUF;
+ bool dump_bset_maps = false;
+ bool have_retry = false;
+ int ret = 0, best = -1, write = READ;
+ unsigned i, written = 0, written2 = 0;
+ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+ bool _saw_error = false, *saw_error = &_saw_error;
+
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+
+ if (ra->err[i])
+ continue;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c) ||
+ (seq && seq != bn->keys.seq))
+ continue;
+
+ if (best < 0) {
+ best = i;
+ written = btree_node_sectors_written(c, bn);
+ continue;
+ }
+
+ written2 = btree_node_sectors_written(c, ra->buf[i]);
+ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_sectors_written_mismatch,
+ "btree node sectors written mismatch: %u != %u",
+ written, written2) ||
+ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_bset_after_end,
+ "found bset signature after last bset") ||
+ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_data_mismatch,
+ "btree node replicas content mismatch"))
+ dump_bset_maps = true;
+
+ if (written2 > written) {
+ written = written2;
+ best = i;
+ }
+ }
+fsck_err:
+ if (dump_bset_maps) {
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+ struct btree_node_entry *bne = NULL;
+ unsigned offset = 0, sectors;
+ bool gap = false;
+
+ if (ra->err[i])
+ continue;
+
+ printbuf_reset(&buf);
+
+ while (offset < btree_sectors(c)) {
+ if (!offset) {
+ sectors = vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
+ if (bne && bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ prt_printf(&buf, "*");
+ offset += sectors;
+ }
+
+ while (offset < btree_sectors(c)) {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq) {
+ if (!gap)
+ prt_printf(&buf, " GAP");
+ gap = true;
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
+ if (bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ prt_printf(&buf, "*");
+ }
+ offset++;
+ }
+
+ bch_err(c, "replica %u:%s", i, buf.buf);
+ }
+ }
+
+ if (best >= 0) {
+ memcpy(b->data, ra->buf[best], btree_bytes(c));
+ ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
+ } else {
+ ret = -1;
+ }
+
+ if (ret)
+ set_btree_node_read_error(b);
+ else if (*saw_error)
+ bch2_btree_node_rewrite_async(c, b);
+
+ for (i = 0; i < ra->nr; i++) {
+ mempool_free(ra->buf[i], &c->btree_bounce_pool);
+ bio_put(ra->bio[i]);
+ }
+
+ closure_debug_destroy(&ra->cl);
+ kfree(ra);
+ printbuf_exit(&buf);
+
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_all_replicas_endio(struct bio *bio)
+{
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
+ struct btree_node_read_all *ra = rb->ra;
+
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+