+ btree_err_on(btree_node_is_extents(b) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
+ BTREE_ERR_FATAL, c, NULL, b, NULL,
+ "btree node does not have NEW_EXTENT_OVERWRITE set");
+
+ sectors = vstruct_sectors(b->data, c->block_bits);
+ } else {
+ bne = write_block(b);
+ i = &bne->keys;
+
+ if (i->seq != b->data->keys.seq)
+ break;
+
+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
+ "unknown checksum type %llu",
+ BSET_CSUM_TYPE(i));
+
+ nonce = btree_nonce(i, b->written << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+ btree_err_on(bch2_crc_cmp(csum, bne->csum),
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
+ "invalid checksum");
+
+ bset_encrypt(c, i, b->written << 9);
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ b->version_ondisk = min(b->version_ondisk,
+ le16_to_cpu(i->version));
+
+ ret = validate_bset(c, ca, b, i, b->written, sectors,
+ READ, have_retry);
+ if (ret)
+ goto fsck_err;
+
+ if (!b->written)
+ btree_node_set_format(b, b->data->format);
+
+ ret = validate_bset_keys(c, b, i, &whiteout_u64s,
+ READ, have_retry);
+ if (ret)
+ goto fsck_err;
+
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ b->written += sectors;
+
+ blacklisted = bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(i->journal_seq),
+ true);
+
+ btree_err_on(blacklisted && first,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
+ "first btree node bset has blacklisted journal seq");
+
+ btree_err_on(blacklisted && ptr_written,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
+ "found blacklisted bset in btree node with sectors_written");
+ if (blacklisted && !first)
+ continue;
+
+ sort_iter_add(iter, i->start,
+ vstruct_idx(i, whiteout_u64s));
+
+ sort_iter_add(iter,
+ vstruct_idx(i, whiteout_u64s),
+ vstruct_last(i));
+
+ nonblacklisted_written = b->written;
+ }
+
+ if (ptr_written) {
+ btree_err_on(b->written < ptr_written,
+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+ "btree node data missing: expected %u sectors, found %u",
+ ptr_written, b->written);
+ } else {
+ for (bne = write_block(b);
+ bset_byte_offset(b, bne) < btree_bytes(c);
+ bne = (void *) bne + block_bytes(c))
+ btree_err_on(bne->keys.seq == b->data->keys.seq &&
+ !bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq),
+ true),
+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+ "found bset signature after last bset");
+
+ /*
+ * Blacklisted bsets are those that were written after the most recent
+ * (flush) journal write. Since there wasn't a flush, they may not have
+ * made it to all devices - which means we shouldn't write new bsets
+ * after them, as that could leave a gap and then reads from that device
+ * wouldn't find all the bsets in that btree node - which means it's
+ * important that we start writing new bsets after the most recent _non_
+ * blacklisted bset:
+ */
+ blacklisted_written = b->written;
+ b->written = nonblacklisted_written;
+ }
+
+ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+ sorted->keys.u64s = 0;
+
+ set_btree_bset(b, b->set, &b->data->keys);
+
+ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+
+ u64s = le16_to_cpu(sorted->keys.u64s);
+ *sorted = *b->data;
+ sorted->keys.u64s = cpu_to_le16(u64s);
+ swap(sorted, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ b->nsets = 1;
+
+ BUG_ON(b->nr.live_u64s != u64s);
+
+ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+
+ if (updated_range)
+ bch2_btree_node_drop_keys_outside_node(b);
+
+ i = &b->data->keys;
+ for (k = i->start; k != vstruct_last(i);) {
+ struct bkey tmp;
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+ const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
+
+ if (invalid ||
+ (bch2_inject_invalid_keys &&
+ !bversion_cmp(u.k->version, MAX_VERSION))) {
+ char buf[160];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
+ "invalid bkey %s: %s", buf, invalid);
+
+ btree_keys_account_key_drop(&b->nr, 0, k);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_bset_end(b, b->set);
+ continue;
+ }
+
+ if (u.k->type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
+
+ bp.v->mem_ptr = 0;
+ }
+
+ k = bkey_next(k);
+ }
+
+ bch2_bset_build_aux_tree(b, b->set, false);
+
+ set_needs_whiteout(btree_bset_first(b), true);
+
+ btree_node_reset_sib_u64s(b);
+
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
+ set_btree_node_need_rewrite(b);
+ }
+
+ if (!ptr_written)
+ set_btree_node_need_rewrite(b);
+out:
+ mempool_free(iter, &c->fill_iter);
+ return retry_read;
+fsck_err:
+ if (ret == BTREE_RETRY_READ) {
+ retry_read = 1;
+ } else {
+ bch2_inconsistent_error(c);
+ set_btree_node_read_error(b);
+ }
+ goto out;
+}
+
+static void btree_node_read_work(struct work_struct *work)
+{
+ struct btree_read_bio *rb =
+ container_of(work, struct btree_read_bio, work);
+ struct bch_fs *c = rb->c;
+ struct btree *b = rb->b;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bio *bio = &rb->bio;
+ struct bch_io_failures failed = { .nr = 0 };
+ char buf[200];
+ struct printbuf out;
+ bool saw_error = false;
+ bool can_retry;
+
+ goto start;
+ while (1) {
+ bch_info(c, "retrying read");
+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ bio_reset(bio);
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
+ bio->bi_iter.bi_sector = rb->pick.ptr.offset;
+ bio->bi_iter.bi_size = btree_bytes(c);
+
+ if (rb->have_ioref) {
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ submit_bio_wait(bio);
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+ }
+start:
+ out = PBUF(buf);
+ btree_pos_to_text(&out, c, b);
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+ bch2_blk_status_to_str(bio->bi_status), buf);
+ if (rb->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+ rb->have_ioref = false;
+
+ bch2_mark_io_failure(&failed, &rb->pick);
+
+ can_retry = bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ &failed, &rb->pick) > 0;
+
+ if (!bio->bi_status &&
+ !bch2_btree_node_read_done(c, ca, b, can_retry))
+ break;