// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
        BCH_DATA_OPS()
#undef x
        NULL
};

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent(c, buf.buf);
                printbuf_exit(&buf);
        }
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_read_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent_read(c, buf.buf);
                printbuf_exit(&buf);
        }
}

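/*
 * Tracks a single extent move: the read half (rbio) and the write half
 * (data_update) share one allocation, along with the bucket being evacuated
 * from, if any. Lives on ctxt->reads until the read completes and on
 * ctxt->ios until the write finishes.
 */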
struct moving_io {
        struct list_head                read_list;
        struct list_head                io_list;
        struct move_bucket_in_flight    *b;
        struct closure                  cl;
        bool                            read_completed;

        unsigned                        read_sectors;
        unsigned                        write_sectors;

        struct bch_read_bio             rbio;

        struct data_update              write;
        /* Must be last since it is variable size */
        struct bio_vec                  bi_inline_vecs[];
};

static void move_free(struct moving_io *io)
{
        struct moving_context *ctxt = io->write.ctxt;

        if (io->b)
                atomic_dec(&io->b->count);

        bch2_data_update_exit(&io->write);

        mutex_lock(&ctxt->lock);
        list_del(&io->io_list);
        wake_up(&ctxt->wait);
        mutex_unlock(&ctxt->lock);

        kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
        struct moving_io *io = container_of(op, struct moving_io, write.op);
        struct moving_context *ctxt = io->write.ctxt;

        if (io->write.op.error)
                ctxt->write_error = true;

        atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_dec(&io->write.ctxt->write_ios);
        move_free(io);
        closure_put(&ctxt->cl);
}

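/*
 * Called once the read half has completed: if the read errored or hit a hole
 * there is nothing to rewrite, otherwise account the pending write and kick
 * off the data update.
 */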
static void move_write(struct moving_io *io)
{
        if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
                move_free(io);
                return;
        }

        closure_get(&io->write.ctxt->cl);
        atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_inc(&io->write.ctxt->write_ios);

        bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
        struct moving_io *io =
                list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

        return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
        struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
        struct moving_context *ctxt = io->write.ctxt;

        atomic_sub(io->read_sectors, &ctxt->read_sectors);
        atomic_dec(&ctxt->read_ios);
        io->read_completed = true;

        wake_up(&ctxt->wait);
        closure_put(&ctxt->cl);
}

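/*
 * Issue writes for the reads at the head of ctxt->reads that have completed;
 * a read still in flight stops the walk, so writes are issued in read
 * submission order.
 */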
void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
        struct moving_io *io;

        while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
                bch2_trans_unlock_long(ctxt->trans);
                list_del(&io->read_list);
                move_write(io);
        }
}

void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

        move_ctxt_wait_event(ctxt,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
}

static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
        move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
        bch2_trans_unlock_long(ctxt->trans);
        closure_sync(&ctxt->cl);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;

        bch2_moving_ctxt_flush_all(ctxt);

        EBUG_ON(atomic_read(&ctxt->write_sectors));
        EBUG_ON(atomic_read(&ctxt->write_ios));
        EBUG_ON(atomic_read(&ctxt->read_sectors));
        EBUG_ON(atomic_read(&ctxt->read_ios));

        mutex_lock(&c->moving_context_lock);
        list_del(&ctxt->list);
        mutex_unlock(&c->moving_context_lock);

        bch2_trans_put(ctxt->trans);
        memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
                           struct bch_fs *c,
                           struct bch_ratelimit *rate,
                           struct bch_move_stats *stats,
                           struct write_point_specifier wp,
                           bool wait_on_copygc)
{
        memset(ctxt, 0, sizeof(*ctxt));

        ctxt->trans     = bch2_trans_get(c);
        ctxt->fn        = (void *) _RET_IP_;
        ctxt->rate      = rate;
        ctxt->stats     = stats;
        ctxt->wp        = wp;
        ctxt->wait_on_copygc = wait_on_copygc;

        closure_init_stack(&ctxt->cl);

        mutex_init(&ctxt->lock);
        INIT_LIST_HEAD(&ctxt->reads);
        INIT_LIST_HEAD(&ctxt->ios);
        init_waitqueue_head(&ctxt->wait);

        mutex_lock(&c->moving_context_lock);
        list_add(&ctxt->list, &c->moving_context_list);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
        trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
        memset(stats, 0, sizeof(*stats));
        stats->data_type = BCH_DATA_user;
        scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

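/*
 * Start moving a single extent: allocate a moving_io, issue the read, and
 * arrange for move_write() to rewrite the extent once the read completes.
 * Returns 0 on success, or if there turned out to be nothing to do.
 */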
int bch2_move_extent(struct moving_context *ctxt,
                     struct move_bucket_in_flight *bucket_in_flight,
                     struct btree_iter *iter,
                     struct bkey_s_c k,
                     struct bch_io_opts io_opts,
                     struct data_update_opts data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct moving_io *io;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;

        if (ctxt->stats)
                ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
        trace_move_extent2(c, k);

        bch2_data_update_opts_normalize(k, &data_opts);

        if (!data_opts.rewrite_ptrs &&
            !data_opts.extra_replicas) {
                if (data_opts.kill_ptrs)
                        return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
                return 0;
        }

        /*
         * Before memory allocations & taking nocow locks in
         * bch2_data_update_init():
         */
        bch2_trans_unlock(trans);

        /* write path might have to decompress data: */
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

        pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        io = kzalloc(sizeof(struct moving_io) +
                     sizeof(struct bio_vec) * pages, GFP_KERNEL);
        if (!io)
                goto err;

        INIT_LIST_HEAD(&io->io_list);
        io->write.ctxt          = ctxt;
        io->read_sectors        = k.k->size;
        io->write_sectors       = k.k->size;

        bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        bio_set_prio(&io->write.op.wbio.bio,
                     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

        if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
                                 GFP_KERNEL))
                goto err_free;

        io->rbio.c              = c;
        io->rbio.opts           = io_opts;
        bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        io->rbio.bio.bi_vcnt = pages;
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;

        io->rbio.bio.bi_opf             = REQ_OP_READ;
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;

        ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
                                    io_opts, data_opts, iter->btree_id, k);
        if (ret)
                goto err_free_pages;

        io->write.op.end_io = move_write_done;

        if (ctxt->rate)
                bch2_ratelimit_increment(ctxt->rate, k.k->size);

        if (ctxt->stats) {
                atomic64_inc(&ctxt->stats->keys_moved);
                atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
        }

        if (bucket_in_flight) {
                io->b = bucket_in_flight;
                atomic_inc(&io->b->count);
        }

        this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
        this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
        trace_move_extent_read2(c, k);

        mutex_lock(&ctxt->lock);
        atomic_add(io->read_sectors, &ctxt->read_sectors);
        atomic_inc(&ctxt->read_ios);

        list_add_tail(&io->read_list, &ctxt->reads);
        list_add_tail(&io->io_list, &ctxt->ios);
        mutex_unlock(&ctxt->lock);

        /*
         * Dropped by move_read_endio() - guards against use-after-free of
         * ctxt when doing the wakeup
         */
        closure_get(&ctxt->cl);
        bch2_read_extent(trans, &io->rbio,
                         bkey_start_pos(k.k),
                         iter->btree_id, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
err_free_pages:
        bio_free_pages(&io->write.op.wbio.bio);
err_free:
        kfree(io);
err:
        if (ret == -BCH_ERR_data_update_done)
                return 0;

        this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]);
        if (trace_move_extent_start_fail_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                prt_str(&buf, ": ");
                prt_str(&buf, bch2_err_str(ret));
                trace_move_extent_start_fail(c, buf.buf);
                printbuf_exit(&buf);
        }
        return ret;
}

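/*
 * Look up the io options that apply to @extent_k, caching the options for
 * every snapshot of the extent's inode so that repeated lookups within one
 * inode don't have to walk the inodes btree each time.
 */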
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
                          struct per_snapshot_io_opts *io_opts,
                          struct bkey_s_c extent_k)
{
        struct bch_fs *c = trans->c;
        u32 restart_count = trans->restart_count;
        int ret = 0;

        if (io_opts->cur_inum != extent_k.k->p.inode) {
                struct btree_iter iter;
                struct bkey_s_c k;

                io_opts->d.nr = 0;

                for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
                                   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                        if (k.k->p.offset != extent_k.k->p.inode)
                                break;

                        if (!bkey_is_inode(k.k))
                                continue;

                        struct bch_inode_unpacked inode;
                        BUG_ON(bch2_inode_unpack(k, &inode));

                        struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
                        bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

                        ret = darray_push(&io_opts->d, e);
                        if (ret)
                                break;
                }
                bch2_trans_iter_exit(trans, &iter);
                io_opts->cur_inum = extent_k.k->p.inode;
        }

        ret = ret ?: trans_was_restarted(trans, restart_count);
        if (ret)
                return ERR_PTR(ret);

        if (extent_k.k->p.snapshot) {
                struct snapshot_io_opts_entry *i;
                darray_for_each(io_opts->d, i)
                        if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
                                return &i->io_opts;
        }

        return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
                              struct bch_io_opts *io_opts,
                              struct bkey_s_c extent_k)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        /* reflink btree? */
        if (!extent_k.k->p.inode) {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
                return 0;
        }

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                               SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
                               BTREE_ITER_CACHED);
        ret = bkey_err(k);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ret;

        if (!ret && bkey_is_inode(k.k)) {
                struct bch_inode_unpacked inode;
                bch2_inode_unpack(k, &inode);
                bch2_inode_opts_get(io_opts, trans->c, &inode);
        } else {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
        }

        bch2_trans_iter_exit(trans, &iter);
        return 0;
}

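/*
 * Throttle the move path: optionally wait for copygc, honour the configured
 * rate limit, and cap the number of IOs and sectors in flight. Returns
 * nonzero if the thread should stop.
 */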
int bch2_move_ratelimit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;
        u64 delay;

        if (ctxt->wait_on_copygc && c->copygc_running) {
                bch2_moving_ctxt_flush_all(ctxt);
                wait_event_killable(c->copygc_running_wq,
                                    !c->copygc_running ||
                                    kthread_should_stop());
        }

        do {
                delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

                if (kthread_should_stop())
                        return 1;

                if (delay)
                        move_ctxt_wait_event_timeout(ctxt,
                                        freezing(current) || kthread_should_stop(),
                                        delay);

                if (unlikely(freezing(current))) {
                        bch2_moving_ctxt_flush_all(ctxt);
                        try_to_freeze();
                }
        } while (delay);

        /*
         * XXX: these limits really ought to be per device; SSDs and hard
         * drives will want different limits
         */
        move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
                atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

        return 0;
}

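/*
 * Walk one btree from @start to @end, calling @pred on each extent to decide
 * whether (and how) it should be moved.
 */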
static int bch2_move_data_btree(struct moving_context *ctxt,
                                struct bpos start,
                                struct bpos end,
                                move_pred_fn pred, void *arg,
                                enum btree_id btree_id)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct per_snapshot_io_opts snapshot_io_opts;
        struct bch_io_opts *io_opts;
        struct bkey_buf sk;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        int ret = 0, ret2;

        per_snapshot_io_opts_init(&snapshot_io_opts, c);
        bch2_bkey_buf_init(&sk);

        if (ctxt->stats) {
                ctxt->stats->data_type  = BCH_DATA_user;
                ctxt->stats->pos        = BBPOS(btree_id, start);
        }

        bch2_trans_iter_init(trans, &iter, btree_id, start,
                             BTREE_ITER_PREFETCH|
                             BTREE_ITER_ALL_SNAPSHOTS);

        if (ctxt->rate)
                bch2_ratelimit_reset(ctxt->rate);

        while (!bch2_move_ratelimit(ctxt)) {
                bch2_trans_begin(trans);

                k = bch2_btree_iter_peek(&iter);
                if (!k.k)
                        break;

                ret = bkey_err(k);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;

                if (bkey_ge(bkey_start_pos(k.k), end))
                        break;

                if (ctxt->stats)
                        ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;

                io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
                ret = PTR_ERR_OR_ZERO(io_opts);
                if (ret)
                        continue;

                memset(&data_opts, 0, sizeof(data_opts));
                if (!pred(c, arg, k, io_opts, &data_opts))
                        goto next;

                /*
                 * The iterator gets unlocked by __bch2_read_extent - need to
                 * save a copy of @k elsewhere:
                 */
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);

                ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
                if (ret2) {
                        if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
                                continue;

                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }

                        /* XXX signal failure */
                        goto next;
                }
next:
                if (ctxt->stats)
                        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
                bch2_btree_iter_advance(&iter);
        }

        bch2_trans_iter_exit(trans, &iter);
        bch2_bkey_buf_exit(&sk, c);
        per_snapshot_io_opts_exit(&snapshot_io_opts);

        return ret;
}

int __bch2_move_data(struct moving_context *ctxt,
                     struct bbpos start,
                     struct bbpos end,
                     move_pred_fn pred, void *arg)
{
        struct bch_fs *c = ctxt->trans->c;
        enum btree_id id;
        int ret = 0;

        for (id = start.btree;
             id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             id++) {
                ctxt->stats->pos = BBPOS(id, POS_MIN);

                if (!btree_type_has_ptrs(id) ||
                    !bch2_btree_id_root(c, id)->b)
                        continue;

                ret = bch2_move_data_btree(ctxt,
                                       id == start.btree ? start.pos : POS_MIN,
                                       id == end.btree   ? end.pos   : POS_MAX,
                                       pred, arg, id);
                if (ret)
                        break;
        }

        return ret;
}

int bch2_move_data(struct bch_fs *c,
                   struct bbpos start,
                   struct bbpos end,
                   struct bch_ratelimit *rate,
                   struct bch_move_stats *stats,
                   struct write_point_specifier wp,
                   bool wait_on_copygc,
                   move_pred_fn pred, void *arg)
{
        struct moving_context ctxt;
        int ret;

        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
        ret = __bch2_move_data(&ctxt, start, end, pred, arg);
        bch2_moving_ctxt_exit(&ctxt);

        return ret;
}

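/*
 * Evacuate a single bucket: walk its backpointers, rewriting every extent
 * that still points into the bucket and rewriting any btree nodes that do.
 */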
int __bch2_evacuate_bucket(struct moving_context *ctxt,
                           struct move_bucket_in_flight *bucket_in_flight,
                           struct bpos bucket, int gen,
                           struct data_update_opts _data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_iter iter;
        struct bkey_buf sk;
        struct bch_backpointer bp;
        struct bch_alloc_v4 a_convert;
        const struct bch_alloc_v4 *a;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
        u64 fragmentation;
        struct bpos bp_pos = POS_MIN;
        int ret = 0;

        trace_bucket_evacuate(c, &bucket);

        bch2_bkey_buf_init(&sk);

        /*
         * We're not called from a context that handles transaction restarts:
         */
        bch2_trans_begin(trans);

        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             bucket, BTREE_ITER_CACHED);
        ret = lockrestart_do(trans,
                        bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
        bch2_trans_iter_exit(trans, &iter);

        if (ret) {
                bch_err_msg(c, ret, "looking up alloc key");
                goto err;
        }

        a = bch2_alloc_to_v4(k, &a_convert);
        dirty_sectors = bch2_bucket_sectors_dirty(*a);
        bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
        fragmentation = a->fragmentation_lru;

        ret = bch2_btree_write_buffer_tryflush(trans);
        bch_err_msg(c, ret, "flushing btree write buffer");
        if (ret)
                goto err;

        while (!(ret = bch2_move_ratelimit(ctxt))) {
                bch2_trans_begin(trans);

                ret = bch2_get_next_backpointer(trans, bucket, gen,
                                                &bp_pos, &bp,
                                                BTREE_ITER_CACHED);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        goto err;
                if (bkey_eq(bp_pos, POS_MAX))
                        break;

                if (!bp.level) {
                        const struct bch_extent_ptr *ptr;
                        unsigned i = 0;

                        k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
                        ret = bkey_err(k);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!k.k)
                                goto next;

                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);

                        ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
                        if (ret) {
                                bch2_trans_iter_exit(trans, &iter);
                                continue;
                        }

                        data_opts = _data_opts;
                        data_opts.target        = io_opts.background_target;
                        data_opts.rewrite_ptrs = 0;

                        bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
                                if (ptr->dev == bucket.inode) {
                                        data_opts.rewrite_ptrs |= 1U << i;
                                        if (ptr->cached) {
                                                bch2_trans_iter_exit(trans, &iter);
                                                goto next;
                                        }
                                }
                                i++;
                        }

                        ret = bch2_move_extent(ctxt, bucket_in_flight,
                                               &iter, k, io_opts, data_opts);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
                        if (ret)
                                goto err;

                        if (ctxt->stats)
                                atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
                } else {
                        struct btree *b;

                        b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
                        ret = PTR_ERR_OR_ZERO(b);
                        if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
                                continue;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!b)
                                goto next;

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;

                        if (ctxt->rate)
                                bch2_ratelimit_increment(ctxt->rate,
                                                         c->opts.btree_node_size >> 9);
                        if (ctxt->stats) {
                                atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
                                atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
                        }
                }
next:
                bp_pos = bpos_nosnap_successor(bp_pos);
        }

        trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
        bch2_bkey_buf_exit(&sk, c);
        return ret;
}

int bch2_evacuate_bucket(struct bch_fs *c,
                         struct bpos bucket, int gen,
                         struct data_update_opts data_opts,
                         struct bch_ratelimit *rate,
                         struct bch_move_stats *stats,
                         struct write_point_specifier wp,
                         bool wait_on_copygc)
{
        struct moving_context ctxt;
        int ret;

        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
        ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
        bch2_moving_ctxt_exit(&ctxt);

        return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
                                struct btree *, struct bch_io_opts *,
                                struct data_update_opts *);

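/*
 * Walk btree nodes (rather than keys) between @start and @end, rewriting the
 * nodes that @pred selects.
 */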
static int bch2_move_btree(struct bch_fs *c,
                           struct bbpos start,
                           struct bbpos end,
                           move_btree_pred pred, void *arg,
                           struct bch_move_stats *stats)
{
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct moving_context ctxt;
        struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        enum btree_id btree;
        struct data_update_opts data_opts;
        int ret = 0;

        bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
                              writepoint_ptr(&c->btree_write_point),
                              true);
        trans = ctxt.trans;

        stats->data_type = BCH_DATA_btree;

        for (btree = start.btree;
             btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             btree++) {
                stats->pos = BBPOS(btree, POS_MIN);

                if (!bch2_btree_id_root(c, btree)->b)
                        continue;

                bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
                                          BTREE_ITER_PREFETCH);
retry:
                ret = 0;
                while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (kthread_should_stop())
                                break;

                        if ((cmp_int(btree, end.btree) ?:
                             bpos_cmp(b->key.k.p, end.pos)) > 0)
                                break;

                        stats->pos = BBPOS(iter.btree_id, iter.pos);

                        if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                break;
next:
                        bch2_btree_iter_next_node(&iter);
                }
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;

                bch2_trans_iter_exit(trans, &iter);

                if (kthread_should_stop())
                        break;
        }

        bch_err_fn(c, ret);
        bch2_moving_ctxt_exit(&ctxt);
        bch2_btree_interior_updates_flush(c);

        return ret;
}

static bool rereplicate_pred(struct bch_fs *c, void *arg,
                             struct bkey_s_c k,
                             struct bch_io_opts *io_opts,
                             struct data_update_opts *data_opts)
{
        unsigned nr_good = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;

        if (!nr_good || nr_good >= replicas)
                return false;

        data_opts->target               = 0;
        data_opts->extra_replicas       = replicas - nr_good;
        data_opts->btree_insert_flags   = 0;
        return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
                         struct bkey_s_c k,
                         struct bch_io_opts *io_opts,
                         struct data_update_opts *data_opts)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
        struct bch_ioctl_data *op = arg;
        unsigned i = 0;

        data_opts->rewrite_ptrs         = 0;
        data_opts->target               = 0;
        data_opts->extra_replicas       = 0;
        data_opts->btree_insert_flags   = 0;

        bkey_for_each_ptr(ptrs, ptr) {
                if (ptr->dev == op->migrate.dev)
                        data_opts->rewrite_ptrs |= 1U << i;
                i++;
        }

        return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
                               struct btree *b,
                               struct bch_io_opts *io_opts,
                               struct data_update_opts *data_opts)
{
        return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

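/*
 * Check for btree node formats affected by the old bformat overflow bug:
 * packed keys whose fields can't round trip through the current unpacked
 * format mean the node has to be rewritten (see BCH_COMPAT_bformat_overflow_done).
 */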
static bool bformat_needs_redo(struct bkey_format *f)
{
        unsigned i;

        for (i = 0; i < f->nr_fields; i++) {
                unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
                u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
                u64 field_offset = le64_to_cpu(f->field_offset[i]);

                if (f->bits_per_field[i] > unpacked_bits)
                        return true;

                if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
                        return true;

                if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
                     unpacked_mask) <
                    field_offset)
                        return true;
        }

        return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        if (b->version_ondisk != c->sb.version ||
            btree_node_need_rewrite(b) ||
            bformat_needs_redo(&b->format)) {
                data_opts->target               = 0;
                data_opts->extra_replicas       = 0;
                data_opts->btree_insert_flags   = 0;
                return true;
        }

        return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
        int ret;

        ret = bch2_move_btree(c,
                              BBPOS_MIN,
                              BBPOS_MAX,
                              rewrite_old_nodes_pred, c, stats);
        if (!ret) {
                mutex_lock(&c->sb_lock);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
                c->disk_sb.sb->version_min = c->disk_sb.sb->version;
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
        }

        bch_err_fn(c, ret);
        return ret;
}

static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
                             struct bkey_s_c k,
                             struct bch_io_opts *io_opts,
                             struct data_update_opts *data_opts)
{
        unsigned durability = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned i = 0;

        bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
                unsigned d = bch2_extent_ptr_durability(c, &p);

                if (d && durability - d >= replicas) {
                        data_opts->kill_ptrs |= BIT(i);
                        durability -= d;
                }

                i++;
        }

        return data_opts->kill_ptrs != 0;
}

static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

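/*
 * Dispatch a data job described by struct bch_ioctl_data: rereplicate,
 * migrate, rewrite_old_nodes, or drop_extra_replicas.
 */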
int bch2_data_job(struct bch_fs *c,
                  struct bch_move_stats *stats,
                  struct bch_ioctl_data op)
{
        struct bbpos start      = BBPOS(op.start_btree, op.start_pos);
        struct bbpos end        = BBPOS(op.end_btree, op.end_pos);
        int ret = 0;

        if (op.op >= BCH_DATA_OP_NR)
                return -EINVAL;

        bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);

        switch (op.op) {
        case BCH_DATA_OP_rereplicate:
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
                ret = bch2_move_btree(c, start, end,
                                      rereplicate_btree_pred, c, stats) ?: ret;
                ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_migrate:
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;

                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
                ret = bch2_move_btree(c, start, end,
                                      migrate_btree_pred, &op, stats) ?: ret;
                ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_rewrite_old_nodes:
                ret = bch2_scan_old_btree_nodes(c, stats);
                break;
        case BCH_DATA_OP_drop_extra_replicas:
                ret = bch2_move_btree(c, start, end,
                                drop_extra_replicas_btree_pred, c, stats) ?: ret;
                ret = bch2_move_data(c, start, end, NULL, stats,
                                writepoint_hashed((unsigned long) current),
                                true,
                                drop_extra_replicas_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        default:
                ret = -EINVAL;
        }

        bch2_move_stats_exit(stats, c);
        return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
        prt_printf(out, "%s: data type=%s pos=",
                   stats->name,
                   bch2_data_types[stats->data_type]);
        bch2_bbpos_to_text(out, stats->pos);
        prt_newline(out);
        printbuf_indent_add(out, 2);

        prt_str(out, "keys moved:  ");
        prt_u64(out, atomic64_read(&stats->keys_moved));
        prt_newline(out);

        prt_str(out, "keys raced:  ");
        prt_u64(out, atomic64_read(&stats->keys_raced));
        prt_newline(out);

        prt_str(out, "bytes seen:  ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
        prt_newline(out);

        prt_str(out, "bytes moved: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
        prt_newline(out);

        prt_str(out, "bytes raced: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
        prt_newline(out);

        printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
        struct moving_io *io;

        bch2_move_stats_to_text(out, ctxt->stats);
        printbuf_indent_add(out, 2);

        prt_printf(out, "reads: ios %u/%u sectors %u/%u",
                   atomic_read(&ctxt->read_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->read_sectors),
                   c->opts.move_bytes_in_flight >> 9);
        prt_newline(out);

        prt_printf(out, "writes: ios %u/%u sectors %u/%u",
                   atomic_read(&ctxt->write_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->write_sectors),
                   c->opts.move_bytes_in_flight >> 9);
        prt_newline(out);

        printbuf_indent_add(out, 2);

        mutex_lock(&ctxt->lock);
        list_for_each_entry(io, &ctxt->ios, io_list)
                bch2_write_op_to_text(out, &io->write.op);
        mutex_unlock(&ctxt->lock);

        printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct moving_context *ctxt;

        mutex_lock(&c->moving_context_lock);
        list_for_each_entry(ctxt, &c->moving_context_list, list)
                bch2_moving_ctxt_to_text(out, c, ctxt);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
        INIT_LIST_HEAD(&c->moving_context_list);
        mutex_init(&c->moving_context_lock);
}