X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fmove.c;h=4ef7595fa6a2d5f43c098b40d10a5ca73ae029cf;hb=1ee7dc7a55273d34358a0ee525a9e823c999ffe6;hp=1d11cf0d4f38700181b577dbc349ac177020893f;hpb=c1e4d447f6dd0ee60495b651436d2055db7777ed;p=bcachefs-tools-debian

diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 1d11cf0..4ef7595 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -1,12 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "backpointers.h"
 #include "bkey_buf.h"
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
+#include "btree_write_buffer.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "errcode.h"
@@ -60,7 +62,7 @@ static void move_free(struct moving_io *io)
 
 	bch2_data_update_exit(&io->write);
 	wake_up(&ctxt->wait);
-	percpu_ref_put(&c->writes);
+	bch2_write_ref_put(c, BCH_WRITE_REF_move);
 	kfree(io);
 }
 
@@ -73,6 +75,7 @@ static void move_write_done(struct bch_write_op *op)
 		ctxt->write_error = true;
 
 	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
+	atomic_dec(&io->write.ctxt->write_ios);
 	move_free(io);
 	closure_put(&ctxt->cl);
 }
@@ -86,11 +89,12 @@ static void move_write(struct moving_io *io)
 
 	closure_get(&io->write.ctxt->cl);
 	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
+	atomic_inc(&io->write.ctxt->write_ios);
 
 	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
 }
 
-static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
 {
 	struct moving_io *io =
 		list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
@@ -104,35 +108,27 @@ static void move_read_endio(struct bio *bio)
 	struct moving_context *ctxt = io->write.ctxt;
 
 	atomic_sub(io->read_sectors, &ctxt->read_sectors);
+	atomic_dec(&ctxt->read_ios);
 	io->read_completed = true;
 
 	wake_up(&ctxt->wait);
 	closure_put(&ctxt->cl);
 }
 
-static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
+					struct btree_trans *trans)
 {
 	struct moving_io *io;
 
 	if (trans)
 		bch2_trans_unlock(trans);
 
-	while ((io = next_pending_write(ctxt))) {
+	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
 		list_del(&io->list);
 		move_write(io);
 	}
 }
 
-#define move_ctxt_wait_event(_ctxt, _trans, _cond)		\
-do {								\
-	do_pending_writes(_ctxt, _trans);			\
-								\
-	if (_cond)						\
-		break;						\
-	__wait_event((_ctxt)->wait,				\
-		     next_pending_write(_ctxt) || (_cond));	\
-} while (1)
-
 static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
 				       struct btree_trans *trans)
 {
@@ -147,7 +143,11 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
 {
 	move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
 	closure_sync(&ctxt->cl);
+
 	EBUG_ON(atomic_read(&ctxt->write_sectors));
+	EBUG_ON(atomic_read(&ctxt->write_ios));
+	EBUG_ON(atomic_read(&ctxt->read_sectors));
+	EBUG_ON(atomic_read(&ctxt->read_ios));
 
 	if (ctxt->stats) {
 		progress_list_del(ctxt->c, ctxt->stats);
@@ -198,13 +198,11 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
 	struct bkey_i *n;
 	int ret;
 
-	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+	n = bch2_bkey_make_mut(trans, k);
 	ret = PTR_ERR_OR_ZERO(n);
 	if (ret)
 		return ret;
 
-	bkey_reassemble(n, k);
-
 	while (data_opts.kill_ptrs) {
 		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
 		struct bch_extent_ptr *ptr;
@@ -229,7 +227,8 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
 	if (bkey_deleted(&n->k))
 		n->k.size = 0;
 
-	return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+	return bch2_trans_relock(trans) ?:
+		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
 		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 }
 
@@ -258,8 +257,14 @@ static int bch2_move_extent(struct btree_trans *trans,
 		return 0;
 	}
 
-	if (!percpu_ref_tryget_live(&c->writes))
-		return -EROFS;
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move))
+		return -BCH_ERR_erofs_no_writes;
+
+	/*
+	 * Before memory allocations & taking nocow locks in
+	 * bch2_data_update_init():
+	 */
+	bch2_trans_unlock(trans);
 
 	/* write path might have to decompress data: */
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
@@ -294,9 +299,9 @@ static int bch2_move_extent(struct btree_trans *trans,
 	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
 	io->rbio.bio.bi_end_io		= move_read_endio;
 
-	ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts,
-				    data_opts, btree_id, k);
-	if (ret)
+	ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
+				    io_opts, data_opts, btree_id, k);
+	if (ret && ret != -BCH_ERR_unwritten_extent_update)
 		goto err_free_pages;
 
 	io->write.ctxt = ctxt;
@@ -304,11 +309,21 @@ static int bch2_move_extent(struct btree_trans *trans,
 
 	atomic64_inc(&ctxt->stats->keys_moved);
 	atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+
+	if (ret == -BCH_ERR_unwritten_extent_update) {
+		bch2_update_unwritten_extent(trans, &io->write);
+		move_free(io);
+		return 0;
+	}
+
+	BUG_ON(ret);
+
 	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
 	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
 	trace_move_extent_read(k.k);
 
 	atomic_add(io->read_sectors, &ctxt->read_sectors);
+	atomic_inc(&ctxt->read_ios);
 	list_add_tail(&io->list, &ctxt->reads);
 
 	/*
@@ -327,7 +342,7 @@ err_free_pages:
 err_free:
 	kfree(io);
 err:
-	percpu_ref_put(&c->writes);
+	bch2_write_ref_put(c, BCH_WRITE_REF_move);
 	trace_and_count(c, move_extent_alloc_mem_fail, k.k);
 	return ret;
 }
@@ -346,7 +361,7 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos,
 	if (ret)
 		goto err;
 
-	if (!k.k || bkey_cmp(k.k->p, pos)) {
+	if (!k.k || !bkey_eq(k.k->p, pos)) {
 		ret = -ENOENT;
 		goto err;
 	}
@@ -398,13 +413,15 @@ static int move_ratelimit(struct btree_trans *trans,
 		}
 	} while (delay);
 
+	/*
+	 * XXX: these limits really ought to be per device, SSDs and hard drives
+	 * will want different limits
+	 */
 	move_ctxt_wait_event(ctxt, trans,
-		atomic_read(&ctxt->write_sectors) <
-		c->opts.move_bytes_in_flight >> 9);
-
-	move_ctxt_wait_event(ctxt, trans,
-		atomic_read(&ctxt->read_sectors) <
-		c->opts.move_bytes_in_flight >> 9);
+		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
+		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
 
 	return 0;
 }
@@ -419,8 +436,6 @@ static int move_get_io_opts(struct btree_trans *trans,
 	if (*cur_inum == k.k->p.inode)
 		return 0;
 
-	*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
-
 	ret = lookup_inode(trans,
 			   SPOS(0, k.k->p.inode, k.k->p.snapshot),
 			   &inode);
@@ -428,8 +443,9 @@ static int move_get_io_opts(struct btree_trans *trans,
 		return ret;
 
 	if (!ret)
-		bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode));
-
+		bch2_inode_opts_get(io_opts, trans->c, &inode);
+	else
+		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
 	*cur_inum = k.k->p.inode;
 	return 0;
 }
@@ -477,7 +493,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
 		if (ret)
 			break;
 
-		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+		if (bkey_ge(bkey_start_pos(k.k), end))
 			break;
 
 		ctxt->stats->pos = iter.pos;
@@ -499,6 +515,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
 		 */
 		bch2_bkey_buf_reassemble(&sk, c, k);
 		k = bkey_i_to_s_c(sk.k);
+		bch2_trans_unlock(&trans);
 
 		ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts,
 					btree_id, k, data_opts);
@@ -568,65 +585,122 @@ int bch2_move_data(struct bch_fs *c,
 	return ret;
 }
 
-static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen)
+static noinline void verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
 	struct bkey_s_c k;
+	struct printbuf buf = PRINTBUF;
+	struct bch_backpointer bp;
+	u64 bp_offset = 0;
 	int ret;
 
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
 			     bucket, BTREE_ITER_CACHED);
 again:
-	k = bch2_btree_iter_peek_slot(&iter);
-	ret = bkey_err(k);
+	ret = lockrestart_do(trans,
+			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
 
 	if (!ret && k.k->type == KEY_TYPE_alloc_v4) {
 		struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
 
 		if (a.v->gen == gen &&
 		    a.v->dirty_sectors) {
-			struct printbuf buf = PRINTBUF;
-
 			if (a.v->data_type == BCH_DATA_btree) {
 				bch2_trans_unlock(trans);
 				if (bch2_btree_interior_updates_flush(c))
 					goto again;
+				goto failed_to_evacuate;
 			}
-
-			prt_str(&buf, "failed to evacuate bucket ");
-			bch2_bkey_val_to_text(&buf, c, k);
-
-			bch_err(c, "%s", buf.buf);
-			printbuf_exit(&buf);
 		}
 	}
 
 	bch2_trans_iter_exit(trans, &iter);
-	return ret;
+	return;
+failed_to_evacuate:
+	bch2_trans_iter_exit(trans, &iter);
+
+	prt_printf(&buf, bch2_log_msg(c, "failed to evacuate bucket "));
+	bch2_bkey_val_to_text(&buf, c, k);
+
+	while (1) {
+		bch2_trans_begin(trans);
+
+		ret = bch2_get_next_backpointer(trans, bucket, gen,
+						&bp_offset, &bp,
+						BTREE_ITER_CACHED);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret)
+			break;
+		if (bp_offset == U64_MAX)
+			break;
+
+		k = bch2_backpointer_get_key(trans, &iter,
+					     bucket, bp_offset, bp);
+		ret = bkey_err(k);
+		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			continue;
+		if (ret)
+			break;
+		if (!k.k)
+			continue;
+		prt_newline(&buf);
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	bch2_print_string_as_lines(KERN_ERR, buf.buf);
+	printbuf_exit(&buf);
 }
 
-int __bch2_evacuate_bucket(struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct btree_trans *trans,
+			   struct moving_context *ctxt,
 			   struct bpos bucket, int gen,
 			   struct data_update_opts _data_opts)
 {
 	struct bch_fs *c = ctxt->c;
 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-	struct btree_trans trans;
 	struct btree_iter iter;
 	struct bkey_buf sk;
 	struct bch_backpointer bp;
+	struct bch_alloc_v4 a_convert;
+	const struct bch_alloc_v4 *a;
+	struct bkey_s_c k;
 	struct data_update_opts data_opts;
+	unsigned dirty_sectors, bucket_size;
+	u64 fragmentation;
 	u64 bp_offset = 0, cur_inum = U64_MAX;
 	int ret = 0;
 
 	bch2_bkey_buf_init(&sk);
-	bch2_trans_init(&trans, c, 0, 0);
 
-	while (!(ret = move_ratelimit(&trans, ctxt))) {
-		bch2_trans_begin(&trans);
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+			     bucket, BTREE_ITER_CACHED);
+	ret = lockrestart_do(trans,
+			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (ret) {
+		bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret));
+		goto err;
+	}
 
-		ret = bch2_get_next_backpointer(&trans, bucket, gen,
+	a = bch2_alloc_to_v4(k, &a_convert);
+	dirty_sectors = a->dirty_sectors;
+	bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+	fragmentation = a->fragmentation_lru;
+
+	ret = bch2_btree_write_buffer_flush(trans);
+	if (ret) {
+		bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret));
+		goto err;
+	}
+
+	while (!(ret = move_ratelimit(trans, ctxt))) {
+		bch2_trans_begin(trans);
+
+		ret = bch2_get_next_backpointer(trans, bucket, gen,
 						&bp_offset, &bp,
 						BTREE_ITER_CACHED);
 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -641,7 +715,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
 			struct bkey_s_c k;
 			unsigned i = 0;
 
-			k = bch2_backpointer_get_key(&trans, &iter,
+			k = bch2_backpointer_get_key(trans, &iter,
 						     bucket, bp_offset, bp);
 			ret = bkey_err(k);
 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -649,14 +723,14 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
 			if (ret)
 				goto err;
 			if (!k.k)
-				continue;
+				goto next;
 
 			bch2_bkey_buf_reassemble(&sk, c, k);
 			k = bkey_i_to_s_c(sk.k);
 
-			ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+			ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
 			if (ret) {
-				bch2_trans_iter_exit(&trans, &iter);
+				bch2_trans_iter_exit(trans, &iter);
 				continue;
 			}
 
@@ -670,15 +744,15 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
 				i++;
 			}
 
-			ret = bch2_move_extent(&trans, &iter, ctxt, io_opts,
+			ret = bch2_move_extent(trans, &iter, ctxt, io_opts,
 					       bp.btree_id, k, data_opts);
-			bch2_trans_iter_exit(&trans, &iter);
+			bch2_trans_iter_exit(trans, &iter);
 
 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 				continue;
 			if (ret == -ENOMEM) {
 				/* memory allocation failure, wait for some IO to finish */
-				bch2_move_ctxt_wait_for_io(ctxt, &trans);
+				bch2_move_ctxt_wait_for_io(ctxt, trans);
 				continue;
 			}
 			if (ret)
@@ -690,7 +764,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
 		} else {
 			struct btree *b;
 
-			b = bch2_backpointer_get_node(&trans, &iter,
+			b = bch2_backpointer_get_node(trans, &iter,
 						      bucket, bp_offset, bp);
 			ret = PTR_ERR_OR_ZERO(b);
 			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
@@ -700,10 +774,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
 			if (ret)
 				goto err;
 			if (!b)
-				continue;
+				goto next;
 
-			ret = bch2_btree_node_rewrite(&trans, &iter, b, 0);
-			bch2_trans_iter_exit(&trans, &iter);
+			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+			bch2_trans_iter_exit(trans, &iter);
 
 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 				continue;
@@ -716,19 +790,20 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
 			atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
 			atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
 		}
-
+next:
 		bp_offset++;
 	}
 
+	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
+
 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
-		bch2_trans_unlock(&trans);
+		bch2_trans_unlock(trans);
 		move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
 		closure_sync(&ctxt->cl);
 		if (!ctxt->write_error)
-			lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen));
+			verify_bucket_evacuated(trans, bucket, gen);
 	}
 err:
-	bch2_trans_exit(&trans);
 	bch2_bkey_buf_exit(&sk, c);
 	return ret;
 }
@@ -741,12 +816,15 @@ int bch2_evacuate_bucket(struct bch_fs *c,
 			 struct write_point_specifier wp,
 			 bool wait_on_copygc)
 {
+	struct btree_trans trans;
 	struct moving_context ctxt;
 	int ret;
 
+	bch2_trans_init(&trans, c, 0, 0);
 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-	ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts);
+	ret = __bch2_evacuate_bucket(&trans, &ctxt, bucket, gen, data_opts);
 	bch2_moving_ctxt_exit(&ctxt);
+	bch2_trans_exit(&trans);
 
 	return ret;
 }