X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;ds=inline;f=libbcachefs%2Falloc_background.c;h=8d8481fc1ecce4563a2c708e37cd76f65f35760c;hb=ffa950ce1d59344ff621659b845416b8a526127f;hp=97d3ffb4d392cbe3d6bdca1c2b86f4ec766cefb6;hpb=b6fca67693002848f21c2f27131c212debf8a396;p=bcachefs-tools-debian

diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 97d3ffb..8d8481f 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -9,6 +9,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_gc.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "buckets_waiting_for_journal.h"
 #include "clock.h"
@@ -17,6 +18,7 @@
 #include "error.h"
 #include "lru.h"
 #include "recovery.h"
+#include "trace.h"
 #include "varint.h"
 
 #include <linux/kthread.h>
@@ -26,7 +28,6 @@
 #include <linux/rcupdate.h>
 #include <linux/sched/task.h>
 #include <linux/sort.h>
-#include <trace/events/bcachefs.h>
 
 /* Persistent alloc info: */
 
@@ -222,7 +223,8 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
 }
 
 int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
-			  int rw, struct printbuf *err)
+			  enum bkey_invalid_flags flags,
+			  struct printbuf *err)
 {
 	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
@@ -237,7 +239,8 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
-			  int rw, struct printbuf *err)
+			  enum bkey_invalid_flags flags,
+			  struct printbuf *err)
 {
 	struct bkey_alloc_unpacked u;
 
@@ -250,7 +253,8 @@ int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
-			  int rw, struct printbuf *err)
+			  enum bkey_invalid_flags flags,
+			  struct printbuf *err)
 {
 	struct bkey_alloc_unpacked u;
 
@@ -263,13 +267,14 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
-			  int rw, struct printbuf *err)
+			  unsigned flags, struct printbuf *err)
 {
 	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+	int rw = flags & WRITE;
 
-	if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
-		prt_printf(err, "bad val size (%lu != %u)",
-		       bkey_val_u64s(k.k), alloc_v4_u64s(a.v));
+	if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
+		prt_printf(err, "bad val size (%u > %lu)",
+		       alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
 		return -BCH_ERR_invalid_bkey;
 	}
 
@@ -279,11 +284,9 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
 		return -BCH_ERR_invalid_bkey;
 	}
 
-	/*
-	 * XXX this is wrong, we'll be checking updates that happened from
-	 * before BCH_FS_CHECK_BACKPOINTERS_DONE
-	 */
-	if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+	if (rw == WRITE &&
+	    !(flags & BKEY_INVALID_JOURNAL) &&
+	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) {
 		unsigned i, bp_len = 0;
 
 		for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
@@ -333,7 +336,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
 	}
 
 	if (!a.v->io_time[READ] &&
-	    test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) {
+	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) {
 		prt_printf(err, "cached bucket with read_time == 0");
 		return -BCH_ERR_invalid_bkey;
 	}
@@ -415,14 +418,15 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 	prt_newline(out);
 	prt_printf(out, "io_time[WRITE]    %llu",	a->io_time[WRITE]);
 	prt_newline(out);
+	prt_printf(out, "fragmentation     %llu",	a->fragmentation_lru);
+	prt_newline(out);
+	prt_printf(out, "bp_start 
%llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_newline(out); - if (k.k->type == KEY_TYPE_alloc_v4) { + if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); - prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a_raw.v)); - prt_newline(out); - prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); printbuf_indent_add(out, 2); @@ -450,6 +454,8 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) if (src < dst) memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); } else { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); @@ -475,38 +481,26 @@ static noinline struct bkey_i_alloc_v4 * __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i_alloc_v4 *ret; + + ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); + if (IS_ERR(ret)) + return ret; + if (k.k->type == KEY_TYPE_alloc_v4) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - unsigned bytes = sizeof(struct bkey_i_alloc_v4) + - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) * - sizeof(struct bch_backpointer); void *src, *dst; - /* - * Reserve space for one more backpointer here: - * Not sketchy at doing it this way, nope... - */ - ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); - if (IS_ERR(ret)) - return ret; - bkey_reassemble(&ret->k_i, k); src = alloc_v4_backpointers(&ret->v); SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(&ret->v); - memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * - sizeof(struct bch_backpointer)); if (src < dst) memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); set_alloc_v4_u64s(ret); } else { - ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4) + - sizeof(struct bch_backpointer)); - if (IS_ERR(ret)) - return ret; - bkey_alloc_v4_init(&ret->k_i); ret->k.p = k.k->p; bch2_alloc_to_v4(k, &ret->v); @@ -516,18 +510,12 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) { + struct bkey_s_c_alloc_v4 a; + if (likely(k.k->type == KEY_TYPE_alloc_v4) && - BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { - /* - * Reserve space for one more backpointer here: - * Not sketchy at doing it this way, nope... 
- */ - struct bkey_i_alloc_v4 *ret = - bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer)); - if (!IS_ERR(ret)) - bkey_reassemble(&ret->k_i, k); - return ret; - } + ((a = bkey_s_c_to_alloc_v4(k), true) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) + return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); return __bch2_alloc_to_v4_mut(trans, k); } @@ -545,14 +533,13 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter struct bkey_i_alloc_v4 *a; int ret; - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, BTREE_ITER_WITH_UPDATES| BTREE_ITER_CACHED| BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (unlikely(ret)) - goto err; + return ERR_PTR(ret); a = bch2_alloc_to_v4_mut_inlined(trans, k); ret = PTR_ERR_OR_ZERO(a); @@ -564,40 +551,6 @@ err: return ERR_PTR(ret); } -int bch2_alloc_read(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a; - struct bch_dev *ca; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!bch2_dev_bucket_exists(c, k.k->p)) - continue; - - ca = bch_dev_bkey_exists(c, k.k->p.inode); - - *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - - if (ret) - bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); - - return ret; -} - static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) { *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; @@ -621,7 +574,8 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { prt_printf(err, "bad val size (%lu != %zu)", @@ -675,7 +629,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i)); + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; have_bucket_gens_key = false; @@ -695,58 +649,79 @@ int bch2_bucket_gens_init(struct bch_fs *c) ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i)); + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); bch2_trans_exit(&trans); if (ret) - bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); - + bch_err_fn(c, ret); return ret; } -int bch2_bucket_gens_read(struct bch_fs *c) +int bch2_alloc_read(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - const struct bch_bucket_gens *g; struct bch_dev *ca; - u64 b; int ret; + down_read(&c->gc_lock); bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; - u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { + const struct bch_bucket_gens *g; + u64 b; - if 
(k.k->type != KEY_TYPE_bucket_gens) - continue; + for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; - g = bkey_s_c_to_bucket_gens(k).v; + if (k.k->type != KEY_TYPE_bucket_gens) + continue; - /* - * Not a fsck error because this is checked/repaired by - * bch2_check_alloc_key() which runs later: - */ - if (!bch2_dev_exists2(c, k.k->p.inode)) - continue; + g = bkey_s_c_to_bucket_gens(k).v; + + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_exists2(c, k.k->p.inode)) + continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + ca = bch_dev_bkey_exists(c, k.k->p.inode); - for (b = max_t(u64, ca->mi.first_bucket, start); - b < min_t(u64, ca->mi.nbuckets, end); - b++) - *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; + for (b = max_t(u64, ca->mi.first_bucket, start); + b < min_t(u64, ca->mi.nbuckets, end); + b++) + *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; + } + bch2_trans_iter_exit(&trans, &iter); + } else { + struct bch_alloc_v4 a; + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; + } + bch2_trans_iter_exit(&trans, &iter); } - bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); + up_read(&c->gc_lock); if (ret) - bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -794,21 +769,22 @@ static int bch2_bucket_do_index(struct btree_trans *trans, return 0; } - bch2_trans_iter_init(trans, &iter, btree, + old = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - old = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(old); if (ret) - goto err; + return ret; if (ca->mi.freespace_initialized && - test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && bch2_trans_inconsistent_on(old.k->type != old_type, trans, - "incorrect key when %s %s btree (got %s should be %s)\n" + "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" " for %s", set ? 
"setting" : "clearing", bch2_btree_ids[btree], + iter.pos.inode, + iter.pos.offset, bch2_bkey_types[old.k->type], bch2_bkey_types[old_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { @@ -838,13 +814,12 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, if (ret) return ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_bucket_gens, pos, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); ret = bkey_err(k); if (ret) - goto err; + return ret; if (k.k->type != KEY_TYPE_bucket_gens) { bkey_bucket_gens_init(&g->k_i); @@ -856,7 +831,6 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, g->v.gens[offset] = gen; ret = bch2_trans_update(trans, &iter, &g->k_i, 0); -err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -911,17 +885,27 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - old_lru = alloc_lru_idx(*old_a); - new_lru = alloc_lru_idx(*new_a); + old_lru = alloc_lru_idx_read(*old_a); + new_lru = alloc_lru_idx_read(*new_a); if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, - old_lru, &new_lru, old); + ret = bch2_lru_change(trans, new->k.p.inode, + bucket_to_u64(new->k.p), + old_lru, new_lru); if (ret) return ret; + } + + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, + bch_dev_bkey_exists(c, new->k.p.inode)); - if (new_a->data_type == BCH_DATA_cached) - new_a->io_time[READ] = new_lru; + if (old_a->fragmentation_lru != new_a->fragmentation_lru) { + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new->k.p), + old_a->fragmentation_lru, new_a->fragmentation_lru); + if (ret) + return ret; } if (old_a->gen != new_a->gen) { @@ -937,7 +921,7 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for * extents style btrees, but works on non-extents btrees: */ -struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); @@ -951,10 +935,17 @@ struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, s struct bpos next; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_upto(&iter2, - bkey_min(bkey_min(end, - iter->path->l[0].b->key.k.p), - POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1))); + + if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + + end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); + + /* + * btree node min/max is a closed interval, upto takes a half + * open interval: + */ + k = bch2_btree_iter_peek_upto(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); @@ -995,13 +986,13 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket) iter = bucket->inode; ca = __bch2_next_dev(c, &iter, NULL); if (ca) - bucket->offset = ca->mi.first_bucket; + *bucket = POS(ca->dev_idx, ca->mi.first_bucket); rcu_read_unlock(); return ca != NULL; } -struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +static struct bkey_s_c 
bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) { struct bch_fs *c = iter->trans->c; struct bkey_s_c k; @@ -1031,12 +1022,13 @@ again: return k; } -static int bch2_check_alloc_key(struct btree_trans *trans, - struct bkey_s_c alloc_k, - struct btree_iter *alloc_iter, - struct btree_iter *discard_iter, - struct btree_iter *freespace_iter, - struct btree_iter *bucket_gens_iter) +static noinline_for_stack +int bch2_check_alloc_key(struct btree_trans *trans, + struct bkey_s_c alloc_k, + struct btree_iter *alloc_iter, + struct btree_iter *discard_iter, + struct btree_iter *freespace_iter, + struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; @@ -1160,10 +1152,11 @@ fsck_err: return ret; } -static int bch2_check_alloc_hole_freespace(struct btree_trans *trans, - struct bpos start, - struct bpos *end, - struct btree_iter *freespace_iter) +static noinline_for_stack +int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *freespace_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; @@ -1215,10 +1208,11 @@ fsck_err: return ret; } -static int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, - struct bpos start, - struct bpos *end, - struct btree_iter *bucket_gens_iter) +static noinline_for_stack +int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bkey_s_c k; @@ -1226,8 +1220,7 @@ static int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, unsigned i, gens_offset, gens_end_offset; int ret; - if (c->sb.version < bcachefs_metadata_version_bucket_gens && - !c->opts.version_upgrade) + if (c->sb.version < bcachefs_metadata_version_bucket_gens) return 0; bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); @@ -1280,8 +1273,8 @@ fsck_err: return ret; } -static int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter) +static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter; @@ -1300,49 +1293,70 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, pos.offset &= ~(~0ULL << 56); genbits = iter->pos.offset & (~0ULL << 56); - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + ret = bkey_err(alloc_k); + if (ret) + return ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, "entry in %s btree for nonexistant dev:bucket %llu:%llu", bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) goto delete; - alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(alloc_k); - if (ret) - goto err; - a = bch2_alloc_to_v4(alloc_k, &a_convert); if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a)), c, - "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, a->data_type == state, genbits >> 56, alloc_freespace_genbits(*a) >> 56)) goto delete; out: -err: fsck_err: + set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); 
printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_delete_extent_at(trans, iter, - iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0); + ret = bch2_btree_delete_extent_at(trans, iter, + iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); goto out; } +static int bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end) +{ + if (!btree_node_type_is_extents(iter->btree_id)) { + return __bch2_check_discard_freespace_key(trans, iter); + } else { + int ret; + + while (!bkey_eq(iter->pos, end) && + !(ret = btree_trans_too_many_iters(trans) ?: + __bch2_check_discard_freespace_key(trans, iter))) + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + + return ret; + } +} + /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent * buckets. */ -static int bch2_check_bucket_gens_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +static noinline_for_stack +int bch2_check_bucket_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_i_bucket_gens g; @@ -1350,17 +1364,21 @@ static int bch2_check_bucket_gens_key(struct btree_trans *trans, u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; u64 b; - bool need_update = false; + bool need_update = false, dev_exists; struct printbuf buf = PRINTBUF; int ret = 0; BUG_ON(k.k->type != KEY_TYPE_bucket_gens); bkey_reassemble(&g.k_i, k); - if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, - "bucket_gens key for invalid device:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, 0); + /* if no bch_dev, skip out whether we repair or not */ + dev_exists = bch2_dev_exists2(c, k.k->p.inode); + if (!dev_exists) { + if (fsck_err_on(!dev_exists, c, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + } goto out; } @@ -1482,16 +1500,14 @@ bkey_err: if (ret < 0) goto err; - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key2(&trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)) ?: - for_each_btree_key_commit(&trans, iter, + bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)) ?: + bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: for_each_btree_key_commit(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, @@ -1499,7 +1515,9 @@ bkey_err: bch2_check_bucket_gens_key(&trans, &iter, k)); err: bch2_trans_exit(&trans); - return ret < 0 ? 
ret : 0; + if (ret) + bch_err_fn(c, ret); + return ret; } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, @@ -1509,9 +1527,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct btree_iter lru_iter; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k, k; + struct bkey_s_c alloc_k, lru_k; struct printbuf buf = PRINTBUF; - struct printbuf buf2 = PRINTBUF; int ret; alloc_k = bch2_btree_iter_peek(alloc_iter); @@ -1527,34 +1544,31 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (a->data_type != BCH_DATA_cached) return 0; - bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, - POS(alloc_k.k->p.inode, a->io_time[READ]), 0); - - k = bch2_btree_iter_peek_slot(&lru_iter); - ret = bkey_err(k); + lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ]), 0); + ret = bkey_err(lru_k); if (ret) - goto err; + return ret; if (fsck_err_on(!a->io_time[READ], c, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || - fsck_err_on(k.k->type != KEY_TYPE_lru || - le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, - "incorrect/missing lru entry\n" - " %s\n" + fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + "missing lru entry\n" " %s", (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { u64 read_time = a->io_time[READ] ?: atomic64_read(&c->io_clock[READ].now); ret = bch2_lru_set(trans, alloc_k.k->p.inode, - alloc_k.k->p.offset, - &read_time); + bucket_to_u64(alloc_k.k->p), + read_time); if (ret) goto err; @@ -1575,27 +1589,24 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &lru_iter); - printbuf_exit(&buf2); printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_alloc_to_lru_ref(&trans, &iter)); - - bch2_trans_exit(&trans); - return ret < 0 ? 
ret : 0; + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(&trans, &iter))); + if (ret) + bch_err_fn(c, ret); + return ret; } static int bch2_discard_one_bucket(struct btree_trans *trans, @@ -1613,7 +1624,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; - bool did_discard = false; int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); @@ -1634,10 +1644,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - need_discard_iter->pos, - BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + need_discard_iter->pos, + BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) goto out; @@ -1654,7 +1663,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { bch2_trans_inconsistent(trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" "%s", @@ -1667,7 +1676,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } if (a->v.data_type != BCH_DATA_need_discard) { - if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { bch2_trans_inconsistent(trans, "bucket incorrectly set in need_discard btree\n" "%s", @@ -1689,29 +1698,27 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, GFP_KERNEL); + *discard_pos_done = iter.pos; - ret = bch2_trans_relock(trans); + ret = bch2_trans_relock_notrace(trans); if (ret) goto out; } - *discard_pos_done = iter.pos; - did_discard = true; - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); if (ret) goto out; - if (did_discard) { - this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); - (*discarded)++; - } + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); + (*discarded)++; out: + (*seen)++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); @@ -1748,7 +1755,7 @@ static void bch2_do_discards_work(struct work_struct *work) if (need_journal_commit * 2 > seen) bch2_journal_flush_async(&c->journal, NULL); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); trace_discard_buckets(c, seen, open, need_journal_commit, discarded, bch2_err_str(ret)); @@ -1756,62 +1763,45 @@ static void bch2_do_discards_work(struct work_struct *work) void bch2_do_discards(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && - !queue_work(system_long_wq, &c->discard_work)) - percpu_ref_put(&c->writes); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && + !queue_work(c->write_ref_wq, &c->discard_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } static int invalidate_one_bucket(struct btree_trans *trans, - struct btree_iter *lru_iter, struct bkey_s_c k, - unsigned dev_idx, s64 *nr_to_invalidate) + struct btree_iter *lru_iter, + struct bkey_s_c lru_k, + 
s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; - struct bkey_i_alloc_v4 *a; - struct bpos bucket; + struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; + struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); unsigned cached_sectors; int ret = 0; - if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx) + if (*nr_to_invalidate <= 0) return 1; - if (k.k->type != KEY_TYPE_lru) { - prt_printf(&buf, "non lru key in lru btree:\n "); - bch2_bkey_val_to_text(&buf, c, k); - - if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { - bch_err(c, "%s", buf.buf); - } else { - bch2_trans_inconsistent(trans, "%s", buf.buf); - ret = -EINVAL; - } - - goto out; + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "lru entry points to invalid bucket"); + goto err; } - bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); + if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) + return 0; a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; - if (k.k->p.offset != alloc_lru_idx(a->v)) { - prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - - if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { - bch_err(c, "%s", buf.buf); - } else { - bch2_trans_inconsistent(trans, "%s", buf.buf); - ret = -EINVAL; - } - + /* We expect harmless races here due to the btree write buffer: */ + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) goto out; - } + + BUG_ON(a->v.data_type != BCH_DATA_cached); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); @@ -1829,7 +1819,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, BTREE_TRIGGER_BUCKET_INVALIDATE) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); if (ret) goto out; @@ -1839,6 +1830,26 @@ out: bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; +err: + prt_str(&buf, "\n lru key: "); + bch2_bkey_val_to_text(&buf, c, lru_k); + + prt_str(&buf, "\n lru entry: "); + bch2_lru_pos_to_text(&buf, lru_iter->pos); + + prt_str(&buf, "\n alloc key: "); + if (!a) + bch2_bpos_to_text(&buf, bucket); + else + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + + bch_err(c, "%s", buf.buf); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { + bch2_inconsistent_error(c); + ret = -EINVAL; + } + + goto out; } static void bch2_do_invalidates_work(struct work_struct *work) @@ -1853,32 +1864,39 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_init(&trans, c, 0, 0); + ret = bch2_btree_write_buffer_flush(&trans); + if (ret) + goto err; + for_each_member_device(ca, c, i) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, - POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, - invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, 0), + lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), + BTREE_ITER_INTENT, k, + invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); break; } } - +err: bch2_trans_exit(&trans); - percpu_ref_put(&c->writes); 
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && - !queue_work(system_long_wq, &c->invalidate_work)) - percpu_ref_put(&c->writes); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && + !queue_work(c->write_ref_wq, &c->invalidate_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } -static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) +static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + unsigned long *last_updated) { struct btree_trans trans; struct btree_iter iter; @@ -1898,6 +1916,12 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { + if (*last_updated + HZ * 10 < jiffies) { + bch_info(ca, "%s: currently at %llu/%llu", + __func__, iter.pos.offset, ca->mi.nbuckets); + *last_updated = jiffies; + } + bch2_trans_begin(&trans); if (bkey_ge(iter.pos, end)) { @@ -1939,7 +1963,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) freespace->k.p = k.k->p; freespace->k.size = k.k->size; - ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace) ?: + ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL); @@ -1977,6 +2001,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) unsigned i; int ret = 0; bool doing_init = false; + unsigned long last_updated = jiffies; /* * We can crash during the device add path, so we need to check this on @@ -1992,9 +2017,10 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca); + ret = bch2_dev_freespace_init(c, ca, &last_updated); if (ret) { percpu_ref_put(&ca->ref); + bch_err_fn(c, ret); return ret; } } @@ -2003,11 +2029,10 @@ int bch2_fs_freespace_init(struct bch_fs *c) mutex_lock(&c->sb_lock); bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch_verbose(c, "done initializing freespace"); } - return ret; + return 0; } /* Bucket IO clocks: */ @@ -2146,40 +2171,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) */ bch2_recalc_capacity(c); - /* Next, close write points that point to this device... */ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_writepoint_stop(c, ca, &c->write_points[i]); - - bch2_writepoint_stop(c, ca, &c->copygc_write_point); - bch2_writepoint_stop(c, ca, &c->rebalance_write_point); - bch2_writepoint_stop(c, ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch2_open_buckets_put(c, &a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - while (1) { - struct open_bucket *ob; - - spin_lock(&c->freelist_lock); - if (!ca->open_buckets_partial_nr) { - spin_unlock(&c->freelist_lock); - break; - } - ob = c->open_buckets + - ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - ob->on_partial_list = false; - spin_unlock(&c->freelist_lock); - - bch2_open_bucket_put(c, ob); - } - - bch2_ec_stop_dev(c, ca); + bch2_open_buckets_stop(c, ca, false); /* * Wake up threads that were blocked on allocation, so they can notice