// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>

#define REBALANCE_WORK_SCAN_OFFSET	(U64_MAX - 1)

static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
	BCH_REBALANCE_STATES()
	NULL
#undef x
};

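/*
 * Scan requests are recorded as KEY_TYPE_cookie keys at
 * (inum, REBALANCE_WORK_SCAN_OFFSET) in the rebalance_work btree; inum 0
 * requests a scan of the whole filesystem. The cookie value is incremented on
 * every request, so that clearing a cookie can detect a racing new request:
 */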
static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_cookie *cookie;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
	ret = PTR_ERR_OR_ZERO(cookie);
	if (ret)
		goto err;

	bkey_cookie_init(&cookie->k_i);
	cookie->k.p = iter.pos;
	cookie->v.cookie = cpu_to_le64(v + 1);

	ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
	int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
			    __bch2_set_rebalance_needs_scan(trans, inum));
	rebalance_wakeup(c);
	return ret;
}

int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
	return bch2_set_rebalance_needs_scan(c, 0);
}

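/*
 * Drop a scan cookie, but only if it still has the value observed when the
 * scan began - if it was re-incremented, a new scan was requested and must
 * not be lost:
 */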
static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	if (v == cookie)
		ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

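/* Peek the next queued rebalance_work entry, bailing out on kthread shutdown: */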
static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
					    struct btree_iter *work_iter)
{
	return !kthread_should_stop()
		? bch2_btree_iter_peek(work_iter)
		: bkey_s_c_null;
}

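/*
 * Drop the bch_extent_rebalance entry from an extent that turned out not to
 * need any work, so it no longer shows up in the rebalance_work btree:
 */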
static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
					   struct btree_iter *iter,
					   struct bkey_s_c k)
{
	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	extent_entry_drop(bkey_i_to_s(n),
			  (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
	return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}

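/*
 * Look up the extent a rebalance_work entry refers to - entries with inode 0
 * point into the reflink btree - and compute the data update options
 * describing which pointers need rewriting:
 */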
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
					     struct bpos work_pos,
					     struct btree_iter *extent_iter,
					     struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;

	bch2_trans_iter_exit(trans, extent_iter);
	bch2_trans_iter_init(trans, extent_iter,
			     work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
			     work_pos,
			     BTREE_ITER_ALL_SNAPSHOTS);
	k = bch2_btree_iter_peek_slot(extent_iter);
	if (bkey_err(k))
		return k;

	const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
	if (!r) {
		/* raced due to btree write buffer, nothing to do */
		return bkey_s_c_null;
	}

	memset(data_opts, 0, sizeof(*data_opts));

	data_opts->rewrite_ptrs =
		bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
	data_opts->target	= r->target;

	if (!data_opts->rewrite_ptrs) {
		/*
		 * device we would want to write to offline? devices in target
		 * changed?
		 *
		 * We'll now need a full scan before this extent is picked up
		 * again:
		 */
		int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
		if (ret)
			return bkey_s_c_err(ret);
		return bkey_s_c_null;
	}

	return k;
}

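/* Move a single extent flagged in the rebalance_work btree: */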
static int do_rebalance_extent(struct moving_context *ctxt,
			       struct bpos work_pos,
			       struct btree_iter *extent_iter)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;
	struct data_update_opts data_opts;
	struct bch_io_opts io_opts;
	struct bkey_s_c k;
	struct bkey_buf sk;
	int ret;

	ctxt->stats = &r->work_stats;
	r->state = BCH_REBALANCE_working;

	bch2_bkey_buf_init(&sk);

	ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
						 extent_iter, &data_opts));
	if (ret || !k.k)
		goto out;

	ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
	if (ret)
		goto out;

	atomic64_add(k.k->size, &ctxt->stats->sectors_seen);

	/*
	 * The iterator gets unlocked by __bch2_read_extent - need to
	 * save a copy of @k elsewhere:
	 */
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
	if (ret) {
		if (bch2_err_matches(ret, ENOMEM)) {
			/* memory allocation failure, wait for some IO to finish */
			bch2_move_ctxt_wait_for_io(ctxt);
			ret = -BCH_ERR_transaction_restart_nested;
		}

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto out;

		/* skip it and continue, XXX signal failure */
		ret = 0;
	}
out:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

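/*
 * Scan predicate: pick target and compression from the io options - or, for
 * reflink btree keys (inode 0), from the extent's own embedded rebalance
 * options - and flag any pointers that don't match:
 */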
static bool rebalance_pred(struct bch_fs *c, void *arg,
			   struct bkey_s_c k,
			   struct bch_io_opts *io_opts,
			   struct data_update_opts *data_opts)
{
	unsigned target, compression;

	if (k.k->p.inode) {
		target		= io_opts->background_target;
		compression	= io_opts->background_compression ?: io_opts->compression;
	} else {
		const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);

		target		= r ? r->target : io_opts->background_target;
		compression	= r ? r->compression :
			(io_opts->background_compression ?: io_opts->compression);
	}

	data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
	data_opts->target	= target;
	return data_opts->rewrite_ptrs != 0;
}

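/*
 * Scan the extents of one inode (or of the whole filesystem, for inum 0),
 * moving whatever rebalance_pred flags, then clear the scan cookie:
 */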
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs_rebalance *r = &trans->c->rebalance;
	int ret;

	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
	ctxt->stats = &r->scan_stats;

	if (!inum) {
		r->scan_start	= BBPOS_MIN;
		r->scan_end	= BBPOS_MAX;
	} else {
		r->scan_start	= BBPOS(BTREE_ID_extents, POS(inum, 0));
		r->scan_end	= BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
	}

	r->state = BCH_REBALANCE_scanning;

	ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
		commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
			  bch2_clear_rebalance_needs_scan(trans, inum, cookie));

	bch2_move_stats_exit(&r->scan_stats, trans->c);
	return ret;
}

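/*
 * Nothing left to do: sleep on the write io clock until enough writes have
 * happened that there's likely new work, tracking wait times for status
 * reporting:
 */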
static void rebalance_wait(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;
	struct io_clock *clock = &c->io_clock[WRITE];
	u64 now = atomic64_read(&clock->now);
	u64 min_member_capacity = 128 * 2048;
	struct bch_dev *ca;
	unsigned i;

	for_each_rw_member(ca, c, i)
		min_member_capacity = min(min_member_capacity,
					  ca->mi.nbuckets * ca->mi.bucket_size);

	r->wait_iotime_end		= now + (min_member_capacity >> 6);

	if (r->state != BCH_REBALANCE_waiting) {
		r->wait_iotime_start	= now;
		r->wait_wallclock_start	= ktime_get_real_ns();
		r->state		= BCH_REBALANCE_waiting;
	}

	bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}

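/*
 * Main work loop: walk the rebalance_work btree, handling cookie entries as
 * scan requests and everything else as extents to move, until there's no
 * work left, then wait:
 */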
static int do_rebalance(struct moving_context *ctxt)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct btree_iter rebalance_work_iter, extent_iter = { NULL };
	struct bkey_s_c k;
	int ret = 0;

	bch2_move_stats_init(&r->work_stats, "rebalance_work");
	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");

	bch2_trans_iter_init(trans, &rebalance_work_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_ALL_SNAPSHOTS);

	while (!bch2_move_ratelimit(ctxt) &&
	       !kthread_wait_freezable(r->enabled)) {
		bch2_trans_begin(trans);

		ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret || !k.k)
			break;

		ret = k.k->type == KEY_TYPE_cookie
			? do_rebalance_scan(ctxt, k.k->p.inode,
					    le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
			: do_rebalance_extent(ctxt, k.k->p, &extent_iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_btree_iter_advance(&rebalance_work_iter);
	}

	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_work_iter);
	bch2_move_stats_exit(&r->scan_stats, c);

	if (!ret &&
	    !kthread_should_stop() &&
	    !atomic64_read(&r->work_stats.sectors_seen) &&
	    !atomic64_read(&r->scan_stats.sectors_seen)) {
		bch2_trans_unlock(trans);
		rebalance_wait(c);
	}

	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

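/* Thread entry point: run do_rebalance() until an error or kthread stop: */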
static int bch2_rebalance_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct moving_context ctxt;
	int ret;

	set_freezable();

	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
			      writepoint_ptr(&c->rebalance_write_point),
			      true);

	while (!kthread_should_stop() &&
	       !(ret = do_rebalance(&ctxt)))
		;

	bch2_moving_ctxt_exit(&ctxt);

	return 0;
}

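/* Print the current rebalance state, with per-state detail: */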
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;

	prt_str(out, bch2_rebalance_state_strs[r->state]);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	switch (r->state) {
	case BCH_REBALANCE_waiting: {
		u64 now = atomic64_read(&c->io_clock[WRITE].now);

		prt_str(out, "io wait duration:  ");
		bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
		prt_newline(out);

		prt_str(out, "io wait remaining: ");
		bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
		prt_newline(out);

		prt_str(out, "duration waited:   ");
		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
		prt_newline(out);
		break;
	}
	case BCH_REBALANCE_working:
		bch2_move_stats_to_text(out, &r->work_stats);
		break;
	case BCH_REBALANCE_scanning:
		bch2_move_stats_to_text(out, &r->scan_stats);
		break;
	}
	prt_newline(out);
	printbuf_indent_sub(out, 2);
}

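/*
 * Shut down the rebalance thread; the rate is first bumped to UINT_MAX so a
 * thread sleeping in the ratelimit code wakes promptly:
 */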
void bch2_rebalance_stop(struct bch_fs *c)
{
	struct task_struct *p;

	c->rebalance.pd.rate.rate	= UINT_MAX;
	bch2_ratelimit_reset(&c->rebalance.pd.rate);

	p = rcu_dereference_protected(c->rebalance.thread, 1);
	c->rebalance.thread = NULL;

	if (p) {
		/* for synchronizing with rebalance_wakeup() */
		synchronize_rcu();

		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_rebalance_start(struct bch_fs *c)
{
	struct task_struct *p;
	int ret;

	if (c->rebalance.thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
	ret = PTR_ERR_OR_ZERO(p);
	if (ret) {
		bch_err_msg(c, ret, "creating rebalance thread");
		return ret;
	}

	get_task_struct(p);
	rcu_assign_pointer(c->rebalance.thread, p);
	wake_up_process(p);
	return 0;
}

void bch2_fs_rebalance_init(struct bch_fs *c)
{
	bch2_pd_controller_init(&c->rebalance.pd);
}