bool used_mempool, void *p)
{
if (used_mempool)
- mempool_free(virt_to_page(p), &c->btree_bounce_pool);
+ mempool_free(p, &c->btree_bounce_pool);
else
- free_pages((unsigned long) p, order);
+ vpfree(p, PAGE_SIZE << order);
}
static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
				bool *used_mempool)
{
void *p;
- BUG_ON(1 << order > btree_pages(c));
+ BUG_ON(order > btree_page_order(c));
*used_mempool = false;
	p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
	if (p)
		return p;

	*used_mempool = true;
- return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
+ return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
}
typedef int (*sort_cmp_fn)(struct btree *,
vstruct_end(i) - (void *) i->_data);
}
-#define btree_node_error(b, c, ptr, fmt, ...) \
- bch2_fs_inconsistent(c, \
- "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
- (b)->btree_id, (b)->level, btree_node_root(c, b) \
- ? btree_node_root(c, b)->level : -1, \
- PTR_BUCKET_NR(ca, ptr), (b)->written, \
- le16_to_cpu((i)->u64s), ##__VA_ARGS__)
-
-static const char *validate_bset(struct bch_fs *c, struct btree *b,
- struct bch_dev *ca,
- const struct bch_extent_ptr *ptr,
- struct bset *i, unsigned sectors,
- unsigned *whiteout_u64s)
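+/*
+ * Report a btree node error: on the read path, before initial GC has
+ * finished, errors go through mustfix_fsck_err() so they can be repaired;
+ * otherwise (including the write path) they're logged and we bail out with
+ * BCH_FSCK_ERRORS_NOT_FIXED. Expects 'i', 'write', 'ret' and an fsck_err
+ * label in the calling scope.
+ */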
+#define btree_node_error(c, b, ptr, msg, ...) \
+do { \
+ if (write == READ && \
+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
+ mustfix_fsck_err(c, \
+ "btree node read error at btree %u level %u/%u\n"\
+ "sector %llu node offset %u bset u64s %u: " msg,\
+ (b)->btree_id, (b)->level, \
+ (c)->btree_roots[(b)->btree_id].level, \
+ (u64) ptr->offset, (b)->written, \
+ le16_to_cpu((i)->u64s), ##__VA_ARGS__); \
+ } else { \
+ bch_err(c, "%s at btree %u level %u/%u\n" \
+ "sector %llu node offset %u bset u64s %u: " msg,\
+ write == WRITE \
+ ? "corrupt metadata in btree node write" \
+ : "btree node error", \
+ (b)->btree_id, (b)->level, \
+ (c)->btree_roots[(b)->btree_id].level, \
+ (u64) ptr->offset, (b)->written, \
+ le16_to_cpu((i)->u64s), ##__VA_ARGS__); \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+} while (0)
+
+static int validate_bset(struct bch_fs *c, struct btree *b,
+ const struct bch_extent_ptr *ptr,
+ struct bset *i, unsigned sectors,
+ unsigned *whiteout_u64s,
+ int write)
{
struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN;
bool seen_non_whiteout = false;
+ int ret = 0;
- if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
- return "unsupported bset version";
+ if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
+ btree_node_error(c, b, ptr, "unsupported bset version");
+ i->u64s = 0;
+ return 0;
+ }
- if (b->written + sectors > c->sb.btree_node_size)
- return "bset past end of btree node";
+ if (b->written + sectors > c->sb.btree_node_size) {
+ btree_node_error(c, b, ptr, "bset past end of btree node");
+ i->u64s = 0;
+ return 0;
+ }
- if (i != &b->data->keys && !i->u64s)
- btree_node_error(b, c, ptr, "empty set");
+ if (b->written && !i->u64s)
+ btree_node_error(c, b, ptr, "empty set");
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
const char *invalid;
if (!k->u64s) {
- btree_node_error(b, c, ptr,
+ btree_node_error(c, b, ptr,
"KEY_U64s 0: %zu bytes of metadata lost",
vstruct_end(i) - (void *) k);
}
if (bkey_next(k) > vstruct_last(i)) {
- btree_node_error(b, c, ptr,
+ btree_node_error(c, b, ptr,
"key extends past end of bset");
i->u64s = cpu_to_le16((u64 *) k - i->_data);
}
if (k->format > KEY_FORMAT_CURRENT) {
- btree_node_error(b, c, ptr,
+ btree_node_error(c, b, ptr,
"invalid bkey format %u", k->format);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
char buf[160];
bch2_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), u);
- btree_node_error(b, c, ptr,
+ buf, sizeof(buf), u);
+ btree_node_error(c, b, ptr,
"invalid bkey %s: %s", buf, invalid);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
} else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
- btree_node_error(b, c, ptr,
+ btree_node_error(c, b, ptr,
"keys out of order: %llu:%llu > %llu:%llu",
prev_pos.inode,
prev_pos.offset,
}
SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
- return NULL;
+fsck_err:
+ return ret;
}
static bool extent_contains_ptr(struct bkey_s_c_extent e,
return false;
}
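+/*
+ * Common completion for sync and async reads: drop the bio and the device
+ * ref, clear the in-flight flag and wake up any waiters.
+ */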
+static void bch2_btree_node_read_complete(struct btree_read_bio *rb,
+ struct btree *b)
+{
+ struct bch_dev *ca = rb->pick.ca;
+
+ bio_put(&rb->bio);
+ percpu_ref_put(&ca->io_ref);
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
const char *err;
struct bch_csum csum;
struct nonce nonce;
- int ret;
+ int ret, write = READ;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
__bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
sectors = vstruct_sectors(bne, c->block_bits);
}
- err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
- if (err)
- goto err;
+ ret = validate_bset(c, b, ptr, i, sectors,
+ &whiteout_u64s, READ);
+ if (ret)
+ goto fsck_err;
b->written += sectors;
if (bne->keys.seq == b->data->keys.seq)
goto err;
- sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
sorted->keys.u64s = 0;
b->nr = btree_node_is_extents(b)
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
+ btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
bch2_bset_build_aux_tree(b, b->set, false);
mempool_free(iter, &c->fill_iter);
return;
err:
+ btree_node_error(c, b, ptr, "%s", err);
+fsck_err:
+ bch2_inconsistent_error(c);
set_btree_node_read_error(b);
- btree_node_error(b, c, ptr, "%s", err);
goto out;
}
-void bch2_btree_node_read(struct bch_fs *c, struct btree *b)
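+/* finish an async read from process context (validation is too heavy for the endio path) */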
+static void btree_node_read_work(struct work_struct *work)
+{
+ struct btree_read_bio *rb =
+ container_of(work, struct btree_read_bio, work);
+
+ bch2_btree_node_read_done(rb->c, rb->bio.bi_private,
+ rb->pick.ca, &rb->pick.ptr);
+ bch2_btree_node_read_complete(rb, rb->bio.bi_private);
+}
+
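+/* read completion: on IO error mark the node bad and finish; otherwise punt to btree_node_read_work() */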
+static void btree_node_read_endio(struct bio *bio)
+{
+ struct btree *b = bio->bi_private;
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+
+ if (bch2_dev_fatal_io_err_on(bio->bi_error,
+ rb->pick.ca, "IO error reading bucket %zu",
+ PTR_BUCKET_NR(rb->pick.ca, &rb->pick.ptr)) ||
+ bch2_meta_read_fault("btree")) {
+ set_btree_node_read_error(b);
+ bch2_btree_node_read_complete(rb, rb->bio.bi_private);
+ return;
+ }
+
+ INIT_WORK(&rb->work, btree_node_read_work);
+ schedule_work(&rb->work);
+}
+
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+ bool sync)
{
uint64_t start_time = local_clock();
- struct bio *bio;
struct extent_pick_ptr pick;
+ struct btree_read_bio *rb;
+ struct bio *bio;
trace_btree_read(c, b);
}
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ rb = container_of(bio, struct btree_read_bio, bio);
+ rb->c = c;
+ rb->pick = pick;
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_bdev = pick.ca->disk_sb.bdev;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
bch2_bio_map(bio, b->data);
- submit_bio_wait(bio);
+ set_btree_node_read_in_flight(b);
- if (bch2_dev_fatal_io_err_on(bio->bi_error,
- pick.ca, "IO error reading bucket %zu",
- PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
- bch2_meta_read_fault("btree")) {
- set_btree_node_read_error(b);
- goto out;
- }
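+	/*
+	 * Sync reads are validated here; async reads are completed from
+	 * btree_node_read_endio() via btree_node_read_work().
+	 */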
+ if (sync) {
+ submit_bio_wait(bio);
+
+ if (bch2_dev_fatal_io_err_on(bio->bi_error,
+ pick.ca, "IO error reading bucket %zu",
+ PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
+ bch2_meta_read_fault("btree")) {
+ set_btree_node_read_error(b);
+ goto out;
+ }
- bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
- bch2_time_stats_update(&c->btree_read_time, start_time);
+ bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
+ bch2_time_stats_update(&c->btree_read_time, start_time);
out:
- bio_put(bio);
- percpu_ref_put(&pick.ca->io_ref);
+ bch2_btree_node_read_complete(rb, b);
+ } else {
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = b;
+ submit_bio(bio);
+ }
}
int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
bkey_copy(&b->key, k);
BUG_ON(bch2_btree_node_hash_insert(c, b, level, id));
- bch2_btree_node_read(c, b);
+ bch2_btree_node_read(c, b, true);
six_unlock_write(&b->lock);
if (btree_node_read_error(b)) {
{
struct btree_write *w = btree_prev_write(b);
- /*
- * Before calling bch2_btree_complete_write() - if the write errored, we
- * have to halt new journal writes before they see this btree node
- * write as completed:
- */
- if (btree_node_write_error(b))
- bch2_journal_halt(&c->journal);
-
bch2_btree_complete_write(c, b, w);
btree_node_io_unlock(b);
}
-static void btree_node_write_endio(struct bio *bio)
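+/*
+ * Some replicas failed to write this node: drop the failed pointers from the
+ * node's key and try to update it; if no good replica remains (or updating
+ * the key fails) this is a fatal error.
+ */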
+static void bch2_btree_node_write_error(struct bch_fs *c,
+ struct bch_write_bio *wbio)
{
- struct btree *b = bio->bi_private;
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_fs *c = wbio->c;
- struct bio *orig = wbio->split ? wbio->orig : NULL;
- struct closure *cl = !wbio->split ? wbio->cl : NULL;
- struct bch_dev *ca = wbio->ca;
+ struct btree *b = wbio->bio.bi_private;
+ struct closure *cl = wbio->cl;
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct bkey_i_extent *new_key;
+
+ bkey_copy(&tmp.k, &b->key);
+ new_key = bkey_i_to_extent(&tmp.k);
- if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
+ while (wbio->replicas_failed) {
+ unsigned idx = __fls(wbio->replicas_failed);
+
+ bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
+ wbio->replicas_failed ^= 1 << idx;
+ }
+
+ if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
+ bch2_btree_node_update_key(c, b, new_key)) {
+ set_btree_node_noevict(b);
+ bch2_fatal_error(c);
+ }
+
+ bio_put(&wbio->bio);
+ btree_node_write_done(c, b);
+ if (cl)
+ closure_put(cl);
+}
+
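+/* drain the bios queued by btree_node_write_endio() for failed writes: */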
+void bch2_btree_write_error_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ btree_write_error_work);
+ struct bio *bio;
+
+ while (1) {
+		spin_lock_irq(&c->btree_write_error_lock);
+		bio = bio_list_pop(&c->btree_write_error_list);
+		spin_unlock_irq(&c->btree_write_error_lock);
+
+ if (!bio)
+ break;
+
+ bch2_btree_node_write_error(c, to_wbio(bio));
+ }
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+ struct btree *b = bio->bi_private;
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_write_bio *orig = parent ?: wbio;
+ struct closure *cl = !wbio->split ? wbio->cl : NULL;
+ struct bch_fs *c = wbio->c;
+ struct bch_dev *ca = wbio->ca;
+
+ if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, "btree write") ||
bch2_meta_write_fault("btree"))
- set_btree_node_write_error(b);
+ set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
- if (wbio->bounce)
- btree_bounce_free(c,
- wbio->order,
- wbio->used_mempool,
- page_address(bio->bi_io_vec[0].bv_page));
-
- if (wbio->put_bio)
+ if (parent) {
bio_put(bio);
+ bio_endio(&parent->bio);
+ return;
+ }
- if (orig) {
- bio_endio(orig);
- } else {
- btree_node_write_done(c, b);
- if (cl)
- closure_put(cl);
+ btree_bounce_free(c,
+ wbio->order,
+ wbio->used_mempool,
+ wbio->data);
+
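+	/* some replicas failed: punt error handling to the write error worker */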
+ if (wbio->replicas_failed) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+		bio_list_add(&c->btree_write_error_list, &wbio->bio);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ queue_work(c->wq, &c->btree_write_error_work);
+ return;
}
+
+ bio_put(bio);
+ btree_node_write_done(c, b);
+ if (cl)
+ closure_put(cl);
+}
+
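+/*
+ * Pre-write validation; the first pointer in the node's key is only used for
+ * error messages.
+ */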
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+ struct bset *i, unsigned sectors)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned whiteout_u64s = 0;
+ int ret;
+
+ extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
+ break;
+
+ ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
+ if (ret)
+ bch2_inconsistent_error(c);
+
+ return ret;
}
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct closure *parent,
enum six_lock_type lock_type_held)
{
- struct bio *bio;
struct bch_write_bio *wbio;
struct bset_tree *t;
struct bset *i;
if (!(old & (1 << BTREE_NODE_dirty)))
return;
+ if (b->written &&
+ !btree_node_may_write(b))
+ return;
+
if (old & (1 << BTREE_NODE_write_in_flight)) {
btree_node_wait_on_io(b);
continue;
}
new &= ~(1 << BTREE_NODE_dirty);
+ new &= ~(1 << BTREE_NODE_need_write);
new |= (1 << BTREE_NODE_write_in_flight);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON((b->will_make_reachable != NULL) != !b->written);
BUG_ON(b->written >= c->sb.btree_node_size);
BUG_ON(bset_written(b, btree_bset_last(b)));
clear_needs_whiteout(i);
- if (b->written && !i->u64s) {
- /* Nothing to write: */
- btree_bounce_free(c, order, used_mempool, data);
- btree_node_write_done(c, b);
- return;
- }
+ /* do we have data to write? */
+ if (b->written && !i->u64s)
+ goto nowrite;
+ bytes_to_write = vstruct_end(i) - data;
+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+ memset(data + bytes_to_write, 0,
+ (sectors_to_write << 9) - bytes_to_write);
+
+ BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
BUG_ON(i->seq != b->data->keys.seq);
nonce = btree_nonce(b, i, b->written << 9);
+ /* if we're going to be encrypting, check metadata validity first: */
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+ validate_bset_for_write(c, b, i, sectors_to_write))
+ goto err;
+
if (bn) {
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
&bn->flags,
bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
}
- bytes_to_write = vstruct_end(i) - data;
- sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
- memset(data + bytes_to_write, 0,
- (sectors_to_write << 9) - bytes_to_write);
-
- BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
-
- trace_btree_write(b, bytes_to_write, sectors_to_write);
+ /* if we're not encrypting, check metadata after checksumming: */
+ if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+ validate_bset_for_write(c, b, i, sectors_to_write))
+ goto err;
/*
* We handle btree write errors by immediately halting the journal -
* break:
*/
if (bch2_journal_error(&c->journal) ||
- c->opts.nochanges) {
- set_btree_node_noevict(b);
- b->written += sectors_to_write;
-
- btree_bounce_free(c, order, used_mempool, data);
- btree_node_write_done(c, b);
- return;
- }
+ c->opts.nochanges)
+ goto err;
- bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
+ trace_btree_write(b, bytes_to_write, sectors_to_write);
- wbio = to_wbio(bio);
+ wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent;
- wbio->bounce = true;
- wbio->put_bio = true;
wbio->order = order;
wbio->used_mempool = used_mempool;
- bio->bi_iter.bi_size = sectors_to_write << 9;
- bio->bi_end_io = btree_node_write_endio;
- bio->bi_private = b;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
+ wbio->data = data;
+ wbio->bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
+ wbio->bio.bi_iter.bi_size = sectors_to_write << 9;
+ wbio->bio.bi_end_io = btree_node_write_endio;
+ wbio->bio.bi_private = b;
if (parent)
closure_get(parent);
- bch2_bio_map(bio, data);
+ bch2_bio_map(&wbio->bio, data);
/*
* If we're appending to a leaf node, we don't technically need FUA -
b->written += sectors_to_write;
bch2_submit_wbio_replicas(wbio, c, &k.key);
+ return;
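+	/* not writing this bset (validation failed, journal error or nochanges): */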
+err:
+ set_btree_node_noevict(b);
+ b->written += sectors_to_write;
+nowrite:
+ btree_bounce_free(c, order, used_mempool, data);
+ btree_node_write_done(c, b);
}
/*
}
}
-/*
- * Write all dirty btree nodes to disk, including roots
- */
-void bch2_btree_flush(struct bch_fs *c)
+void bch2_btree_verify_flushed(struct bch_fs *c)
{
- struct closure cl;
- struct btree *b;
struct bucket_table *tbl;
struct rhash_head *pos;
- bool saw_dirty;
+ struct btree *b;
unsigned i;
- closure_init_stack(&cl);
-
rcu_read_lock();
+ tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
+ &c->btree_cache_table);
- do {
- saw_dirty = false;
- i = 0;
-restart:
- tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
- &c->btree_cache_table);
-
- for (; i < tbl->size; i++)
- rht_for_each_entry_rcu(b, pos, tbl, i, hash) {
- saw_dirty |= btree_node_dirty(b);
-
- if (btree_node_dirty(b) &&
- btree_node_may_write(b)) {
- rcu_read_unlock();
- six_lock_read(&b->lock);
- bch2_btree_node_write_dirty(c, b, &cl, 1);
- six_unlock_read(&b->lock);
- rcu_read_lock();
- goto restart;
- }
- }
- } while (saw_dirty);
-
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(b, pos, tbl, i, hash)
+ BUG_ON(btree_node_dirty(b));
rcu_read_unlock();
-
- closure_sync(&cl);
-}
-
-/**
- * bch_btree_node_flush_journal - flush any journal entries that contain keys
- * from this node
- *
- * The bset's journal sequence number is used for preserving ordering of index
- * updates across unclean shutdowns - it's used to ignore bsets newer than the
- * most recent journal entry.
- *
- * But when rewriting btree nodes we compact all the bsets in a btree node - and
- * if we compacted a bset that should be ignored with bsets we do need, that
- * would be bad. So to avoid that, prior to making the new node visible ensure
- * that the journal has been flushed so that all the bsets we compacted should
- * be visible.
- */
-void bch2_btree_node_flush_journal_entries(struct bch_fs *c,
- struct btree *b,
- struct closure *cl)
-{
- int i = b->nsets;
-
- /*
- * Journal sequence numbers in the different bsets will always be in
- * ascending order, we only need to flush the highest - except that the
- * most recent bset might not have a journal sequence number yet, so we
- * need to loop:
- */
- while (i--) {
- u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
-
- if (seq) {
- bch2_journal_flush_seq_async(&c->journal, seq, cl);
- break;
- }
- }
}