1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "alloc_foreground.h"
6 #include "backpointers.h"
7 #include "bkey_buf.h"
8 #include "btree_gc.h"
9 #include "btree_update.h"
10 #include "btree_update_interior.h"
11 #include "btree_write_buffer.h"
12 #include "disk_groups.h"
13 #include "ec.h"
14 #include "errcode.h"
15 #include "error.h"
16 #include "inode.h"
17 #include "io.h"
18 #include "journal_reclaim.h"
19 #include "move.h"
20 #include "replicas.h"
21 #include "super-io.h"
22 #include "keylist.h"
23
24 #include <linux/ioprio.h>
25 #include <linux/kthread.h>
26
27 #include <trace/events/bcachefs.h>
28
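/*
 * Moves with a bch_move_stats attached register them on c->data_progress_list
 * for the duration of the move, so that in-flight data jobs can be enumerated
 * (e.g. for progress reporting):
 */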
29 static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
30 {
31         mutex_lock(&c->data_progress_lock);
32         list_add(&stats->list, &c->data_progress_list);
33         mutex_unlock(&c->data_progress_lock);
34 }
35
36 static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
37 {
38         mutex_lock(&c->data_progress_lock);
39         list_del(&stats->list);
40         mutex_unlock(&c->data_progress_lock);
41 }
42
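/*
 * Tracks a single extent in flight: the read into @rbio, plus the data update
 * that will write it back out. Completed reads sit on ctxt->reads until the
 * write is issued from bch2_moving_ctxt_do_pending_writes(); @b, if set, is
 * the bucket being evacuated, for in-flight accounting.
 */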
43 struct moving_io {
44         struct list_head                list;
45         struct move_bucket_in_flight    *b;
46         struct closure                  cl;
47         bool                            read_completed;
48
49         unsigned                        read_sectors;
50         unsigned                        write_sectors;
51
52         struct bch_read_bio             rbio;
53
54         struct data_update              write;
55         /* Must be last since it is variable size */
56         struct bio_vec                  bi_inline_vecs[];
57 };
58
59 static void move_free(struct moving_io *io)
60 {
61         struct moving_context *ctxt = io->write.ctxt;
62         struct bch_fs *c = ctxt->c;
63
64         if (io->b)
65                 atomic_dec(&io->b->count);
66
67         bch2_data_update_exit(&io->write);
68         wake_up(&ctxt->wait);
69         bch2_write_ref_put(c, BCH_WRITE_REF_move);
70         kfree(io);
71 }
72
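/*
 * Write completion for a move: note any write error on the context, drop the
 * in-flight write accounting, free the io and drop the closure ref taken in
 * move_write().
 */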
73 static void move_write_done(struct bch_write_op *op)
74 {
75         struct moving_io *io = container_of(op, struct moving_io, write.op);
76         struct moving_context *ctxt = io->write.ctxt;
77
78         if (io->write.op.error)
79                 ctxt->write_error = true;
80
81         atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
82         atomic_dec(&io->write.ctxt->write_ios);
83         move_free(io);
84         closure_put(&ctxt->cl);
85 }
86
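/*
 * Issue the write half of a move once the read has completed. If the read
 * errored or hit a hole there's nothing to write, so the io is just freed.
 */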
87 static void move_write(struct moving_io *io)
88 {
89         if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
90                 move_free(io);
91                 return;
92         }
93
94         closure_get(&io->write.ctxt->cl);
95         atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
96         atomic_inc(&io->write.ctxt->write_ios);
97
98         bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
99 }
100
101 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
102 {
103         struct moving_io *io =
104                 list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
105
106         return io && io->read_completed ? io : NULL;
107 }
108
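/*
 * Read completion: drop the in-flight read accounting and mark the io ready
 * so that bch2_moving_ctxt_do_pending_writes() will issue the write; then
 * wake waiters and drop the closure ref taken before bch2_read_extent().
 */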
109 static void move_read_endio(struct bio *bio)
110 {
111         struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
112         struct moving_context *ctxt = io->write.ctxt;
113
114         atomic_sub(io->read_sectors, &ctxt->read_sectors);
115         atomic_dec(&ctxt->read_ios);
116         io->read_completed = true;
117
118         wake_up(&ctxt->wait);
119         closure_put(&ctxt->cl);
120 }
121
122 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
123                                         struct btree_trans *trans)
124 {
125         struct moving_io *io;
126
127         if (trans)
128                 bch2_trans_unlock(trans);
129
130         while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
131                 list_del(&io->list);
132                 move_write(io);
133         }
134 }
135
136 static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
137                                        struct btree_trans *trans)
138 {
139         unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
140
141         move_ctxt_wait_event(ctxt, trans,
142                 !atomic_read(&ctxt->write_sectors) ||
143                 atomic_read(&ctxt->write_sectors) != sectors_pending);
144 }
145
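/*
 * Tear down a moving_context: wait for every outstanding read to complete and
 * every pending write to be issued and finished, then unregister the stats
 * from the progress list.
 */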
146 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
147 {
148         move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
149         closure_sync(&ctxt->cl);
150
151         EBUG_ON(atomic_read(&ctxt->write_sectors));
152         EBUG_ON(atomic_read(&ctxt->write_ios));
153         EBUG_ON(atomic_read(&ctxt->read_sectors));
154         EBUG_ON(atomic_read(&ctxt->read_ios));
155
156         if (ctxt->stats) {
157                 progress_list_del(ctxt->c, ctxt->stats);
158
159                 trace_move_data(ctxt->c,
160                                 atomic64_read(&ctxt->stats->sectors_moved),
161                                 atomic64_read(&ctxt->stats->keys_moved));
162         }
163 }
164
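/*
 * Set up a moving_context. The usual pattern, as in bch2_move_data() and
 * bch2_evacuate_bucket() below, is roughly:
 *
 *	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
 *	... move extents with bch2_move_extent() ...
 *	bch2_moving_ctxt_exit(&ctxt);
 *
 * (illustrative only - see the callers below for the real control flow)
 */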
165 void bch2_moving_ctxt_init(struct moving_context *ctxt,
166                            struct bch_fs *c,
167                            struct bch_ratelimit *rate,
168                            struct bch_move_stats *stats,
169                            struct write_point_specifier wp,
170                            bool wait_on_copygc)
171 {
172         memset(ctxt, 0, sizeof(*ctxt));
173
174         ctxt->c         = c;
175         ctxt->rate      = rate;
176         ctxt->stats     = stats;
177         ctxt->wp        = wp;
178         ctxt->wait_on_copygc = wait_on_copygc;
179
180         closure_init_stack(&ctxt->cl);
181         INIT_LIST_HEAD(&ctxt->reads);
182         init_waitqueue_head(&ctxt->wait);
183
184         if (stats) {
185                 progress_list_add(c, stats);
186                 stats->data_type = BCH_DATA_user;
187         }
188 }
189
190 void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
191 {
192         memset(stats, 0, sizeof(*stats));
193         scnprintf(stats->name, sizeof(stats->name), "%s", name);
194 }
195
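/*
 * Handle data_opts.kill_ptrs without moving any data: drop the requested
 * pointers from the extent and update the key in place.
 */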
196 static int bch2_extent_drop_ptrs(struct btree_trans *trans,
197                                  struct btree_iter *iter,
198                                  struct bkey_s_c k,
199                                  struct data_update_opts data_opts)
200 {
201         struct bch_fs *c = trans->c;
202         struct bkey_i *n;
203         int ret;
204
205         n = bch2_bkey_make_mut(trans, k);
206         ret = PTR_ERR_OR_ZERO(n);
207         if (ret)
208                 return ret;
209
210         while (data_opts.kill_ptrs) {
211                 unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
212                 struct bch_extent_ptr *ptr;
213
214                 bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
215                 data_opts.kill_ptrs ^= 1U << drop;
216         }
217
218         /*
219          * If the new extent no longer has any pointers, bch2_extent_normalize()
220          * will do the appropriate thing with it (turning it into a
221          * KEY_TYPE_error key, or just a discard if it was a cached extent)
222          */
223         bch2_extent_normalize(c, bkey_i_to_s(n));
224
225         /*
226          * Since we're not inserting through an extent iterator
227          * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
228          * we aren't using the extent overwrite path to delete; we're
229          * just using the normal key deletion path:
230          */
231         if (bkey_deleted(&n->k))
232                 n->k.size = 0;
233
234         return bch2_trans_relock(trans) ?:
235                 bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
236                 bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
237 }
238
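/*
 * Start moving a single extent: allocate a moving_io sized for the (possibly
 * decompressed) extent, set up the data update, and issue the read. The write
 * is issued later, from the context's reads list, once the read completes.
 * Unwritten extents are special cased - there's no data to copy, only the
 * extent itself needs updating.
 */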
239 static int bch2_move_extent(struct btree_trans *trans,
240                             struct btree_iter *iter,
241                             struct moving_context *ctxt,
242                             struct move_bucket_in_flight *bucket_in_flight,
243                             struct bch_io_opts io_opts,
244                             enum btree_id btree_id,
245                             struct bkey_s_c k,
246                             struct data_update_opts data_opts)
247 {
248         struct bch_fs *c = trans->c;
249         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
250         struct moving_io *io;
251         const union bch_extent_entry *entry;
252         struct extent_ptr_decoded p;
253         unsigned sectors = k.k->size, pages;
254         int ret = -ENOMEM;
255
256         bch2_data_update_opts_normalize(k, &data_opts);
257
258         if (!data_opts.rewrite_ptrs &&
259             !data_opts.extra_replicas) {
260                 if (data_opts.kill_ptrs)
261                         return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
262                 return 0;
263         }
264
265         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move))
266                 return -BCH_ERR_erofs_no_writes;
267
268         /*
269          * Before memory allocations & taking nocow locks in
270          * bch2_data_update_init():
271          */
272         bch2_trans_unlock(trans);
273
274         /* write path might have to decompress data: */
275         bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
276                 sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
277
278         pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
279         io = kzalloc(sizeof(struct moving_io) +
280                      sizeof(struct bio_vec) * pages, GFP_KERNEL);
281         if (!io)
282                 goto err;
283
284         io->write.ctxt          = ctxt;
285         io->read_sectors        = k.k->size;
286         io->write_sectors       = k.k->size;
287
288         bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
289         bio_set_prio(&io->write.op.wbio.bio,
290                      IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
291
292         if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
293                                  GFP_KERNEL))
294                 goto err_free;
295
296         io->rbio.c              = c;
297         io->rbio.opts           = io_opts;
298         bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
299         io->rbio.bio.bi_vcnt = pages;
300         bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
301         io->rbio.bio.bi_iter.bi_size = sectors << 9;
302
303         io->rbio.bio.bi_opf             = REQ_OP_READ;
304         io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
305         io->rbio.bio.bi_end_io          = move_read_endio;
306
307         ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
308                                     io_opts, data_opts, btree_id, k);
309         if (ret && ret != -BCH_ERR_unwritten_extent_update)
310                 goto err_free_pages;
311
312         if (ret == -BCH_ERR_unwritten_extent_update) {
313                 bch2_update_unwritten_extent(trans, &io->write);
314                 move_free(io);
315                 return 0;
316         }
317
318         BUG_ON(ret);
319
320         io->write.ctxt = ctxt;
321         io->write.op.end_io = move_write_done;
322
323         if (ctxt->stats) {
324                 atomic64_inc(&ctxt->stats->keys_moved);
325                 atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
326         }
327
328         if (bucket_in_flight) {
329                 io->b = bucket_in_flight;
330                 atomic_inc(&io->b->count);
331         }
332
333         this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
334         this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
335         trace_move_extent_read(k.k);
336
337         atomic_add(io->read_sectors, &ctxt->read_sectors);
338         atomic_inc(&ctxt->read_ios);
339         list_add_tail(&io->list, &ctxt->reads);
340
341                 /*
342                  * The ref taken here is dropped by move_read_endio() - it guards
343                  * against use-after-free of ctxt when doing the wakeup
344                  */
345         closure_get(&ctxt->cl);
346         bch2_read_extent(trans, &io->rbio,
347                          bkey_start_pos(k.k),
348                          btree_id, k, 0,
349                          BCH_READ_NODECODE|
350                          BCH_READ_LAST_FRAGMENT);
351         return 0;
352 err_free_pages:
353         bio_free_pages(&io->write.op.wbio.bio);
354 err_free:
355         kfree(io);
356 err:
357         bch2_write_ref_put(c, BCH_WRITE_REF_move);
358         trace_and_count(c, move_extent_alloc_mem_fail, k.k);
359         return ret;
360 }
361
362 static int lookup_inode(struct btree_trans *trans, struct bpos pos,
363                         struct bch_inode_unpacked *inode)
364 {
365         struct btree_iter iter;
366         struct bkey_s_c k;
367         int ret;
368
369         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
370                              BTREE_ITER_ALL_SNAPSHOTS);
371         k = bch2_btree_iter_peek(&iter);
372         ret = bkey_err(k);
373         if (ret)
374                 goto err;
375
376         if (!k.k || !bkey_eq(k.k->p, pos)) {
377                 ret = -ENOENT;
378                 goto err;
379         }
380
381         ret = bkey_is_inode(k.k) ? 0 : -EIO;
382         if (ret)
383                 goto err;
384
385         ret = bch2_inode_unpack(k, inode);
386         if (ret)
387                 goto err;
388 err:
389         bch2_trans_iter_exit(trans, &iter);
390         return ret;
391 }
392
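/*
 * Throttle the move path: optionally wait for copygc to finish, honour the
 * configured rate limit, then wait until in-flight reads and writes are back
 * under the move_bytes_in_flight/move_ios_in_flight limits. Returns nonzero
 * if the calling kthread should stop.
 */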
393 static int move_ratelimit(struct btree_trans *trans,
394                           struct moving_context *ctxt)
395 {
396         struct bch_fs *c = trans->c;
397         u64 delay;
398
399         if (ctxt->wait_on_copygc) {
400                 bch2_trans_unlock(trans);
401                 wait_event_killable(c->copygc_running_wq,
402                                     !c->copygc_running ||
403                                     kthread_should_stop());
404         }
405
406         do {
407                 delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
408
409                 if (delay) {
410                         bch2_trans_unlock(trans);
411                         set_current_state(TASK_INTERRUPTIBLE);
412                 }
413
414                 if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
415                         __set_current_state(TASK_RUNNING);
416                         return 1;
417                 }
418
419                 if (delay)
420                         schedule_timeout(delay);
421
422                 if (unlikely(freezing(current))) {
423                         move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
424                         try_to_freeze();
425                 }
426         } while (delay);
427
428         /*
429          * XXX: these limits really ought to be per device, SSDs and hard drives
430          * will want different limits
431          */
432         move_ctxt_wait_event(ctxt, trans,
433                 atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
434                 atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
435                 atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
436                 atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
437
438         return 0;
439 }
440
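/*
 * Look up the IO options for the inode that owns @k, caching via @cur_inum so
 * the lookup only happens once per inode number; fall back to the filesystem
 * defaults if the inode can't be found.
 */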
441 static int move_get_io_opts(struct btree_trans *trans,
442                             struct bch_io_opts *io_opts,
443                             struct bkey_s_c k, u64 *cur_inum)
444 {
445         struct bch_inode_unpacked inode;
446         int ret;
447
448         if (*cur_inum == k.k->p.inode)
449                 return 0;
450
451         ret = lookup_inode(trans,
452                            SPOS(0, k.k->p.inode, k.k->p.snapshot),
453                            &inode);
454         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
455                 return ret;
456
457         if (!ret)
458                 bch2_inode_opts_get(io_opts, trans->c, &inode);
459         else
460                 *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
461         *cur_inum = k.k->p.inode;
462         return 0;
463 }
464
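/*
 * Walk a single btree from @start to @end, asking @pred which extents to move
 * and with what options. Keys are copied out of the iterator before the read
 * (which drops btree locks) is issued; transaction restarts retry the same
 * key, and allocation failures wait for in-flight IO before retrying.
 */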
465 static int __bch2_move_data(struct moving_context *ctxt,
466                             struct bpos start,
467                             struct bpos end,
468                             move_pred_fn pred, void *arg,
469                             enum btree_id btree_id)
470 {
471         struct bch_fs *c = ctxt->c;
472         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
473         struct bkey_buf sk;
474         struct btree_trans trans;
475         struct btree_iter iter;
476         struct bkey_s_c k;
477         struct data_update_opts data_opts;
478         u64 cur_inum = U64_MAX;
479         int ret = 0, ret2;
480
481         bch2_bkey_buf_init(&sk);
482         bch2_trans_init(&trans, c, 0, 0);
483
484         if (ctxt->stats) {
485                 ctxt->stats->data_type  = BCH_DATA_user;
486                 ctxt->stats->btree_id   = btree_id;
487                 ctxt->stats->pos        = start;
488         }
489
490         bch2_trans_iter_init(&trans, &iter, btree_id, start,
491                              BTREE_ITER_PREFETCH|
492                              BTREE_ITER_ALL_SNAPSHOTS);
493
494         if (ctxt->rate)
495                 bch2_ratelimit_reset(ctxt->rate);
496
497         while (!move_ratelimit(&trans, ctxt)) {
498                 bch2_trans_begin(&trans);
499
500                 k = bch2_btree_iter_peek(&iter);
501                 if (!k.k)
502                         break;
503
504                 ret = bkey_err(k);
505                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
506                         continue;
507                 if (ret)
508                         break;
509
510                 if (bkey_ge(bkey_start_pos(k.k), end))
511                         break;
512
513                 if (ctxt->stats)
514                         ctxt->stats->pos = iter.pos;
515
516                 if (!bkey_extent_is_direct_data(k.k))
517                         goto next_nondata;
518
519                 ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
520                 if (ret)
521                         continue;
522
523                 memset(&data_opts, 0, sizeof(data_opts));
524                 if (!pred(c, arg, k, &io_opts, &data_opts))
525                         goto next;
526
527                 /*
528                  * The iterator gets unlocked by __bch2_read_extent - need to
529                  * save a copy of @k elsewhere:
530                  */
531                 bch2_bkey_buf_reassemble(&sk, c, k);
532                 k = bkey_i_to_s_c(sk.k);
533                 bch2_trans_unlock(&trans);
534
535                 ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL,
536                                         io_opts, btree_id, k, data_opts);
537                 if (ret2) {
538                         if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
539                                 continue;
540
541                         if (ret2 == -ENOMEM) {
542                                 /* memory allocation failure, wait for some IO to finish */
543                                 bch2_move_ctxt_wait_for_io(ctxt, &trans);
544                                 continue;
545                         }
546
547                         /* XXX signal failure */
548                         goto next;
549                 }
550
551                 if (ctxt->rate)
552                         bch2_ratelimit_increment(ctxt->rate, k.k->size);
553 next:
554                 if (ctxt->stats)
555                         atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
556 next_nondata:
557                 bch2_btree_iter_advance(&iter);
558         }
559
560         bch2_trans_iter_exit(&trans, &iter);
561         bch2_trans_exit(&trans);
562         bch2_bkey_buf_exit(&sk, c);
563
564         return ret;
565 }
566
567 int bch2_move_data(struct bch_fs *c,
568                    enum btree_id start_btree_id, struct bpos start_pos,
569                    enum btree_id end_btree_id,   struct bpos end_pos,
570                    struct bch_ratelimit *rate,
571                    struct bch_move_stats *stats,
572                    struct write_point_specifier wp,
573                    bool wait_on_copygc,
574                    move_pred_fn pred, void *arg)
575 {
576         struct moving_context ctxt;
577         enum btree_id id;
578         int ret;
579
580         bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
581
582         for (id = start_btree_id;
583              id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
584              id++) {
585                 stats->btree_id = id;
586
587                 if (id != BTREE_ID_extents &&
588                     id != BTREE_ID_reflink)
589                         continue;
590
591                 ret = __bch2_move_data(&ctxt,
592                                        id == start_btree_id ? start_pos : POS_MIN,
593                                        id == end_btree_id   ? end_pos   : POS_MAX,
594                                        pred, arg, id);
595                 if (ret)
596                         break;
597         }
598
599         bch2_moving_ctxt_exit(&ctxt);
600
601         return ret;
602 }
603
604 void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen)
605 {
606         struct bch_fs *c = trans->c;
607         struct btree_iter iter;
608         struct bkey_s_c k;
609         struct printbuf buf = PRINTBUF;
610         struct bch_backpointer bp;
611         u64 bp_offset = 0;
612         int ret;
613
614         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
615                              bucket, BTREE_ITER_CACHED);
616 again:
617         ret = lockrestart_do(trans,
618                         bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
619
620         if (!ret && k.k->type == KEY_TYPE_alloc_v4) {
621                 struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
622
623                 if (a.v->gen == gen &&
624                     a.v->dirty_sectors) {
625                         if (a.v->data_type == BCH_DATA_btree) {
626                                 bch2_trans_unlock(trans);
627                                 if (bch2_btree_interior_updates_flush(c))
628                                         goto again;
629                                 goto failed_to_evacuate;
630                         }
631                 }
632         }
633
634         bch2_trans_iter_exit(trans, &iter);
635         return;
636 failed_to_evacuate:
637         bch2_trans_iter_exit(trans, &iter);
638
639         if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
640                 return;
641
642         prt_printf(&buf, bch2_log_msg(c, "failed to evacuate bucket "));
643         bch2_bkey_val_to_text(&buf, c, k);
644
645         while (1) {
646                 bch2_trans_begin(trans);
647
648                 ret = bch2_get_next_backpointer(trans, bucket, gen,
649                                                 &bp_offset, &bp,
650                                                 BTREE_ITER_CACHED);
651                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
652                         continue;
653                 if (ret)
654                         break;
655                 if (bp_offset == U64_MAX)
656                         break;
657
658                 k = bch2_backpointer_get_key(trans, &iter,
659                                              bucket, bp_offset, bp);
660                 ret = bkey_err(k);
661                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
662                         continue;
663                 if (ret)
664                         break;
665                 if (!k.k)
666                         continue;
667                 prt_newline(&buf);
668                 bch2_bkey_val_to_text(&buf, c, k);
669                 bch2_trans_iter_exit(trans, &iter);
670         }
671
672         bch2_print_string_as_lines(KERN_ERR, buf.buf);
673         printbuf_exit(&buf);
674 }
675
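/*
 * Move everything out of a bucket by walking its backpointers: extent
 * backpointers have their pointers into this bucket rewritten via
 * bch2_move_extent(), btree node backpointers trigger a node rewrite. With
 * CONFIG_BCACHEFS_DEBUG, the bucket is verified to be empty afterwards.
 */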
676 int __bch2_evacuate_bucket(struct btree_trans *trans,
677                            struct moving_context *ctxt,
678                            struct move_bucket_in_flight *bucket_in_flight,
679                            struct bpos bucket, int gen,
680                            struct data_update_opts _data_opts)
681 {
682         struct bch_fs *c = ctxt->c;
683         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
684         struct btree_iter iter;
685         struct bkey_buf sk;
686         struct bch_backpointer bp;
687         struct bch_alloc_v4 a_convert;
688         const struct bch_alloc_v4 *a;
689         struct bkey_s_c k;
690         struct data_update_opts data_opts;
691         unsigned dirty_sectors, bucket_size;
692         u64 fragmentation;
693         u64 bp_offset = 0, cur_inum = U64_MAX;
694         int ret = 0;
695
696         bch2_bkey_buf_init(&sk);
697
698         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
699                              bucket, BTREE_ITER_CACHED);
700         ret = lockrestart_do(trans,
701                         bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
702         bch2_trans_iter_exit(trans, &iter);
703
704         if (ret) {
705                 bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret));
706                 goto err;
707         }
708
709         a = bch2_alloc_to_v4(k, &a_convert);
710         dirty_sectors = a->dirty_sectors;
711         bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
712         fragmentation = a->fragmentation_lru;
713
714         ret = bch2_btree_write_buffer_flush(trans);
715         if (ret) {
716                 bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret));
717                 goto err;
718         }
719
720         while (!(ret = move_ratelimit(trans, ctxt))) {
721                 bch2_trans_begin(trans);
722
723                 ret = bch2_get_next_backpointer(trans, bucket, gen,
724                                                 &bp_offset, &bp,
725                                                 BTREE_ITER_CACHED);
726                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
727                         continue;
728                 if (ret)
729                         goto err;
730                 if (bp_offset == U64_MAX)
731                         break;
732
733                 if (!bp.level) {
734                         const struct bch_extent_ptr *ptr;
735                         struct bkey_s_c k;
736                         unsigned i = 0;
737
738                         k = bch2_backpointer_get_key(trans, &iter,
739                                                 bucket, bp_offset, bp);
740                         ret = bkey_err(k);
741                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
742                                 continue;
743                         if (ret)
744                                 goto err;
745                         if (!k.k)
746                                 goto next;
747
748                         bch2_bkey_buf_reassemble(&sk, c, k);
749                         k = bkey_i_to_s_c(sk.k);
750
751                         ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
752                         if (ret) {
753                                 bch2_trans_iter_exit(trans, &iter);
754                                 continue;
755                         }
756
757                         data_opts = _data_opts;
758                         data_opts.target        = io_opts.background_target;
759                         data_opts.rewrite_ptrs = 0;
760
761                         bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
762                                 if (ptr->dev == bucket.inode)
763                                         data_opts.rewrite_ptrs |= 1U << i;
764                                 i++;
765                         }
766
767                         ret = bch2_move_extent(trans, &iter, ctxt,
768                                         bucket_in_flight,
769                                         io_opts, bp.btree_id, k, data_opts);
770                         bch2_trans_iter_exit(trans, &iter);
771
772                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
773                                 continue;
774                         if (ret == -ENOMEM) {
775                                 /* memory allocation failure, wait for some IO to finish */
776                                 bch2_move_ctxt_wait_for_io(ctxt, trans);
777                                 continue;
778                         }
779                         if (ret)
780                                 goto err;
781
782                         if (ctxt->rate)
783                                 bch2_ratelimit_increment(ctxt->rate, k.k->size);
784                         if (ctxt->stats)
785                                 atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
786                 } else {
787                         struct btree *b;
788
789                         b = bch2_backpointer_get_node(trans, &iter,
790                                                 bucket, bp_offset, bp);
791                         ret = PTR_ERR_OR_ZERO(b);
792                         if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
793                                 continue;
794                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
795                                 continue;
796                         if (ret)
797                                 goto err;
798                         if (!b)
799                                 goto next;
800
801                         ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
802                         bch2_trans_iter_exit(trans, &iter);
803
804                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
805                                 continue;
806                         if (ret)
807                                 goto err;
808
809                         if (ctxt->rate)
810                                 bch2_ratelimit_increment(ctxt->rate,
811                                                          c->opts.btree_node_size >> 9);
812                         if (ctxt->stats) {
813                                 atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
814                                 atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
815                         }
816                 }
817 next:
818                 bp_offset++;
819         }
820
821         trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
822
823         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
824                 bch2_trans_unlock(trans);
825                 move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
826                 closure_sync(&ctxt->cl);
827                 if (!ctxt->write_error)
828                         bch2_verify_bucket_evacuated(trans, bucket, gen);
829         }
830 err:
831         bch2_bkey_buf_exit(&sk, c);
832         return ret;
833 }
834
835 int bch2_evacuate_bucket(struct bch_fs *c,
836                          struct bpos bucket, int gen,
837                          struct data_update_opts data_opts,
838                          struct bch_ratelimit *rate,
839                          struct bch_move_stats *stats,
840                          struct write_point_specifier wp,
841                          bool wait_on_copygc)
842 {
843         struct btree_trans trans;
844         struct moving_context ctxt;
845         int ret;
846
847         bch2_trans_init(&trans, c, 0, 0);
848         bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
849         ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts);
850         bch2_moving_ctxt_exit(&ctxt);
851         bch2_trans_exit(&trans);
852
853         return ret;
854 }
855
856 typedef bool (*move_btree_pred)(struct bch_fs *, void *,
857                                 struct btree *, struct bch_io_opts *,
858                                 struct data_update_opts *);
859
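/*
 * Walk btree nodes between the given start and end positions, across btree
 * IDs, and rewrite the nodes selected by @pred; used by rereplicate, migrate
 * and the old-node rewrite pass below.
 */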
860 static int bch2_move_btree(struct bch_fs *c,
861                            enum btree_id start_btree_id, struct bpos start_pos,
862                            enum btree_id end_btree_id,   struct bpos end_pos,
863                            move_btree_pred pred, void *arg,
864                            struct bch_move_stats *stats)
865 {
866         bool kthread = (current->flags & PF_KTHREAD) != 0;
867         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
868         struct btree_trans trans;
869         struct btree_iter iter;
870         struct btree *b;
871         enum btree_id id;
872         struct data_update_opts data_opts;
873         int ret = 0;
874
875         bch2_trans_init(&trans, c, 0, 0);
876         progress_list_add(c, stats);
877
878         stats->data_type = BCH_DATA_btree;
879
880         for (id = start_btree_id;
881              id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
882              id++) {
883                 stats->btree_id = id;
884
885                 bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
886                                           BTREE_ITER_PREFETCH);
887 retry:
888                 ret = 0;
889                 while (bch2_trans_begin(&trans),
890                        (b = bch2_btree_iter_peek_node(&iter)) &&
891                        !(ret = PTR_ERR_OR_ZERO(b))) {
892                         if (kthread && kthread_should_stop())
893                                 break;
894
895                         if ((cmp_int(id, end_btree_id) ?:
896                              bpos_cmp(b->key.k.p, end_pos)) > 0)
897                                 break;
898
899                         stats->pos = iter.pos;
900
901                         if (!pred(c, arg, b, &io_opts, &data_opts))
902                                 goto next;
903
904                         ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
905                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
906                                 continue;
907                         if (ret)
908                                 break;
909 next:
910                         bch2_btree_iter_next_node(&iter);
911                 }
912                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
913                         goto retry;
914
915                 bch2_trans_iter_exit(&trans, &iter);
916
917                 if (kthread && kthread_should_stop())
918                         break;
919         }
920
921         bch2_trans_exit(&trans);
922
923         if (ret)
924                 bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
925
926         bch2_btree_interior_updates_flush(c);
927
928         progress_list_del(c, stats);
929         return ret;
930 }
931
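/*
 * Select keys whose durability is below the configured number of replicas and
 * request enough extra replicas to make up the difference.
 */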
932 static bool rereplicate_pred(struct bch_fs *c, void *arg,
933                              struct bkey_s_c k,
934                              struct bch_io_opts *io_opts,
935                              struct data_update_opts *data_opts)
936 {
937         unsigned nr_good = bch2_bkey_durability(c, k);
938         unsigned replicas = bkey_is_btree_ptr(k.k)
939                 ? c->opts.metadata_replicas
940                 : io_opts->data_replicas;
941
942         if (!nr_good || nr_good >= replicas)
943                 return false;
944
945         data_opts->target               = 0;
946         data_opts->extra_replicas       = replicas - nr_good;
947         data_opts->btree_insert_flags   = 0;
948         return true;
949 }
950
951 static bool migrate_pred(struct bch_fs *c, void *arg,
952                          struct bkey_s_c k,
953                          struct bch_io_opts *io_opts,
954                          struct data_update_opts *data_opts)
955 {
956         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
957         const struct bch_extent_ptr *ptr;
958         struct bch_ioctl_data *op = arg;
959         unsigned i = 0;
960
961         data_opts->rewrite_ptrs         = 0;
962         data_opts->target               = 0;
963         data_opts->extra_replicas       = 0;
964         data_opts->btree_insert_flags   = 0;
965
966         bkey_for_each_ptr(ptrs, ptr) {
967                 if (ptr->dev == op->migrate.dev)
968                         data_opts->rewrite_ptrs |= 1U << i;
969                 i++;
970         }
971
972         return data_opts->rewrite_ptrs != 0;
973 }
974
975 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
976                                    struct btree *b,
977                                    struct bch_io_opts *io_opts,
978                                    struct data_update_opts *data_opts)
979 {
980         return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
981 }
982
983 static bool migrate_btree_pred(struct bch_fs *c, void *arg,
984                                struct btree *b,
985                                struct bch_io_opts *io_opts,
986                                struct data_update_opts *data_opts)
987 {
988         return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
989 }
990
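/*
 * Returns true if keys packed with this format could overflow the current
 * unpacked format - such nodes need to be rewritten (cf.
 * BCH_COMPAT_bformat_overflow_done, set by bch2_scan_old_btree_nodes() once
 * they have all been rewritten).
 */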
991 static bool bformat_needs_redo(struct bkey_format *f)
992 {
993         unsigned i;
994
995         for (i = 0; i < f->nr_fields; i++) {
996                 unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
997                 u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
998                 u64 field_offset = le64_to_cpu(f->field_offset[i]);
999
1000                 if (f->bits_per_field[i] > unpacked_bits)
1001                         return true;
1002
1003                 if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
1004                         return true;
1005
1006                 if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
1007                      unpacked_mask) <
1008                     field_offset)
1009                         return true;
1010         }
1011
1012         return false;
1013 }
1014
1015 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
1016                                    struct btree *b,
1017                                    struct bch_io_opts *io_opts,
1018                                    struct data_update_opts *data_opts)
1019 {
1020         if (b->version_ondisk != c->sb.version ||
1021             btree_node_need_rewrite(b) ||
1022             bformat_needs_redo(&b->format)) {
1023                 data_opts->target               = 0;
1024                 data_opts->extra_replicas       = 0;
1025                 data_opts->btree_insert_flags   = 0;
1026                 return true;
1027         }
1028
1029         return false;
1030 }
1031
1032 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
1033 {
1034         int ret;
1035
1036         ret = bch2_move_btree(c,
1037                               0,                POS_MIN,
1038                               BTREE_ID_NR,      SPOS_MAX,
1039                               rewrite_old_nodes_pred, c, stats);
1040         if (!ret) {
1041                 mutex_lock(&c->sb_lock);
1042                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
1043                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
1044                 c->disk_sb.sb->version_min = c->disk_sb.sb->version;
1045                 bch2_write_super(c);
1046                 mutex_unlock(&c->sb_lock);
1047         }
1048
1049         return ret;
1050 }
1051
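/*
 * Entry point for data job ioctls (struct bch_ioctl_data): dispatches to the
 * rereplicate, migrate and rewrite-old-nodes operations, flushing journal
 * pins and running replicas GC as appropriate.
 */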
1052 int bch2_data_job(struct bch_fs *c,
1053                   struct bch_move_stats *stats,
1054                   struct bch_ioctl_data op)
1055 {
1056         int ret = 0;
1057
1058         switch (op.op) {
1059         case BCH_DATA_OP_REREPLICATE:
1060                 bch2_move_stats_init(stats, "rereplicate");
1061                 stats->data_type = BCH_DATA_journal;
1062                 ret = bch2_journal_flush_device_pins(&c->journal, -1);
1063
1064                 ret = bch2_move_btree(c,
1065                                       op.start_btree,   op.start_pos,
1066                                       op.end_btree,     op.end_pos,
1067                                       rereplicate_btree_pred, c, stats) ?: ret;
1068                 ret = bch2_replicas_gc2(c) ?: ret;
1069
1070                 ret = bch2_move_data(c,
1071                                      op.start_btree,    op.start_pos,
1072                                      op.end_btree,      op.end_pos,
1073                                      NULL,
1074                                      stats,
1075                                      writepoint_hashed((unsigned long) current),
1076                                      true,
1077                                      rereplicate_pred, c) ?: ret;
1078                 ret = bch2_replicas_gc2(c) ?: ret;
1079                 break;
1080         case BCH_DATA_OP_MIGRATE:
1081                 if (op.migrate.dev >= c->sb.nr_devices)
1082                         return -EINVAL;
1083
1084                 bch2_move_stats_init(stats, "migrate");
1085                 stats->data_type = BCH_DATA_journal;
1086                 ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
1087
1088                 ret = bch2_move_btree(c,
1089                                       op.start_btree,   op.start_pos,
1090                                       op.end_btree,     op.end_pos,
1091                                       migrate_btree_pred, &op, stats) ?: ret;
1092                 ret = bch2_replicas_gc2(c) ?: ret;
1093
1094                 ret = bch2_move_data(c,
1095                                      op.start_btree,    op.start_pos,
1096                                      op.end_btree,      op.end_pos,
1097                                      NULL,
1098                                      stats,
1099                                      writepoint_hashed((unsigned long) current),
1100                                      true,
1101                                      migrate_pred, &op) ?: ret;
1102                 ret = bch2_replicas_gc2(c) ?: ret;
1103                 break;
1104         case BCH_DATA_OP_REWRITE_OLD_NODES:
1105                 bch2_move_stats_init(stats, "rewrite_old_nodes");
1106                 ret = bch2_scan_old_btree_nodes(c, stats);
1107                 break;
1108         default:
1109                 ret = -EINVAL;
1110         }
1111
1112         return ret;
1113 }