// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include "keylist.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

#include <trace/events/bcachefs.h>

static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
{
	mutex_lock(&c->data_progress_lock);
	list_add(&stats->list, &c->data_progress_list);
	mutex_unlock(&c->data_progress_lock);
}

static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
{
	mutex_lock(&c->data_progress_lock);
	list_del(&stats->list);
	mutex_unlock(&c->data_progress_lock);
}

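/*
 * Per-extent move state: data is read into @rbio, then rewritten via the
 * @write data update once the read completes.
 */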
struct moving_io {
	struct list_head	list;
	struct closure		cl;
	bool			read_completed;

	unsigned		read_sectors;
	unsigned		write_sectors;

	struct bch_read_bio	rbio;

	struct data_update	write;
	/* Must be last since it is variable size */
	struct bio_vec		bi_inline_vecs[];
};

static void move_free(struct moving_io *io)
{
	struct moving_context *ctxt = io->write.ctxt;
	struct bch_fs *c = ctxt->c;

	bch2_data_update_exit(&io->write);
	wake_up(&ctxt->wait);
	percpu_ref_put(&c->writes);
	kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
	struct moving_io *io = container_of(op, struct moving_io, write.op);
	struct moving_context *ctxt = io->write.ctxt;

	if (io->write.op.error)
		ctxt->write_error = true;

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	move_free(io);
	closure_put(&ctxt->cl);
}

static void move_write(struct moving_io *io)
{
	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
		move_free(io);
		return;
	}

	closure_get(&io->write.ctxt->cl);
	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);

	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, list);

	return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	io->read_completed = true;

	wake_up(&ctxt->wait);
	closure_put(&ctxt->cl);
}

static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans)
{
	struct moving_io *io;

	if (trans)
		bch2_trans_unlock(trans);

	while ((io = next_pending_write(ctxt))) {
		list_del(&io->list);
		move_write(io);
	}
}

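/*
 * Kick off writes for any completed reads, then sleep until either @_cond is
 * true or another read completes (@_trans is unlocked first):
 */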
#define move_ctxt_wait_event(_ctxt, _trans, _cond)		\
do {								\
	do_pending_writes(_ctxt, _trans);			\
								\
	if (_cond)						\
		break;						\
	__wait_event((_ctxt)->wait,				\
		     next_pending_write(_ctxt) || (_cond));	\
} while (1)

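/* Wait for the number of sectors of write IO in flight to change, or hit zero: */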
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
				       struct btree_trans *trans)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt, trans,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
	move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
	closure_sync(&ctxt->cl);
	EBUG_ON(atomic_read(&ctxt->write_sectors));

	if (ctxt->stats) {
		progress_list_del(ctxt->c, ctxt->stats);

		trace_move_data(ctxt->c,
				atomic64_read(&ctxt->stats->sectors_moved),
				atomic64_read(&ctxt->stats->keys_moved));
	}
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
			   struct bch_fs *c,
			   struct bch_ratelimit *rate,
			   struct bch_move_stats *stats,
			   struct write_point_specifier wp,
			   bool wait_on_copygc)
{
	memset(ctxt, 0, sizeof(*ctxt));

	ctxt->c		= c;
	ctxt->rate	= rate;
	ctxt->stats	= stats;
	ctxt->wp	= wp;
	ctxt->wait_on_copygc = wait_on_copygc;

	closure_init_stack(&ctxt->cl);
	INIT_LIST_HEAD(&ctxt->reads);
	init_waitqueue_head(&ctxt->wait);

	if (stats) {
		progress_list_add(c, stats);
		stats->data_type = BCH_DATA_user;
	}
}

void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
	memset(stats, 0, sizeof(*stats));
	scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

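/*
 * Drop the pointers selected by @data_opts.kill_ptrs from @k and commit the
 * updated key:
 */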
static int bch2_extent_drop_ptrs(struct btree_trans *trans,
				 struct btree_iter *iter,
				 struct bkey_s_c k,
				 struct data_update_opts data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_i *n;
	int ret;

	n = bch2_bkey_make_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	while (data_opts.kill_ptrs) {
		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
		struct bch_extent_ptr *ptr;

		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
		data_opts.kill_ptrs ^= 1U << drop;
	}

	/*
	 * If the new extent no longer has any pointers, bch2_extent_normalize()
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
	bch2_extent_normalize(c, bkey_i_to_s(n));

	/*
	 * Since we're not inserting through an extent iterator
	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), we
	 * aren't using the extent overwrite path to delete; we're just using
	 * the normal key deletion path:
	 */
	if (bkey_deleted(&n->k))
		n->k.size = 0;

	return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}

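/*
 * Move a single extent: allocate a moving_io, set up the read and the data
 * update that will rewrite it, and issue the read; the write is started from
 * move_write() once the read completes.
 */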
static int bch2_move_extent(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct moving_context *ctxt,
			    struct bch_io_opts io_opts,
			    enum btree_id btree_id,
			    struct bkey_s_c k,
			    struct data_update_opts data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct moving_io *io;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = k.k->size, pages;
	int ret = -ENOMEM;

	bch2_data_update_opts_normalize(k, &data_opts);

	if (!data_opts.rewrite_ptrs &&
	    !data_opts.extra_replicas) {
		if (data_opts.kill_ptrs)
			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
		return 0;
	}

	if (!percpu_ref_tryget_live(&c->writes))
		return -EROFS;

	/*
	 * Before memory allocations & taking nocow locks in
	 * bch2_data_update_init():
	 */
	bch2_trans_unlock(trans);

	/* write path might have to decompress data: */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	io = kzalloc(sizeof(struct moving_io) +
		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!io)
		goto err;

	io->write.ctxt		= ctxt;
	io->read_sectors	= k.k->size;
	io->write_sectors	= k.k->size;

	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	bio_set_prio(&io->write.op.wbio.bio,
		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
				 GFP_KERNEL))
		goto err_free;

	io->rbio.c		= c;
	io->rbio.opts		= io_opts;
	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	io->rbio.bio.bi_vcnt = pages;
	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	io->rbio.bio.bi_iter.bi_size = sectors << 9;

	bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
	io->rbio.bio.bi_end_io		= move_read_endio;

	ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts,
				    data_opts, btree_id, k);
	if (ret && ret != -BCH_ERR_unwritten_extent_update)
		goto err_free_pages;

	io->write.ctxt = ctxt;
	io->write.op.end_io = move_write_done;

	atomic64_inc(&ctxt->stats->keys_moved);
	atomic64_add(k.k->size, &ctxt->stats->sectors_moved);

	if (ret == -BCH_ERR_unwritten_extent_update) {
		bch2_update_unwritten_extent(trans, &io->write);
		move_free(io);
		return 0;
	}

	BUG_ON(ret);

	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
	trace_move_extent_read(k.k);

	atomic_add(io->read_sectors, &ctxt->read_sectors);
	list_add_tail(&io->list, &ctxt->reads);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	bch2_read_extent(trans, &io->rbio,
			 bkey_start_pos(k.k),
			 btree_id, k, 0,
			 BCH_READ_NODECODE|
			 BCH_READ_LAST_FRAGMENT);
	return 0;
err_free_pages:
	bio_free_pages(&io->write.op.wbio.bio);
err_free:
	kfree(io);
err:
	percpu_ref_put(&c->writes);
	trace_and_count(c, move_extent_alloc_mem_fail, k.k);
	return ret;
}

static int lookup_inode(struct btree_trans *trans, struct bpos pos,
			struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
			     BTREE_ITER_ALL_SNAPSHOTS);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!k.k || !bkey_eq(k.k->p, pos)) {
		ret = -ENOENT;
		goto err;
	}

	ret = bkey_is_inode(k.k) ? 0 : -EIO;
	if (ret)
		goto err;

	ret = bch2_inode_unpack(k, inode);
	if (ret)
		goto err;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

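/*
 * Throttle the move path: optionally wait for copygc, honour @ctxt->rate if
 * set, and cap the amount of read and write IO in flight. Returns nonzero if
 * the calling kthread should exit:
 */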
static int move_ratelimit(struct btree_trans *trans,
			  struct moving_context *ctxt)
{
	struct bch_fs *c = trans->c;
	u64 delay;

	if (ctxt->wait_on_copygc) {
		bch2_trans_unlock(trans);
		wait_event_killable(c->copygc_running_wq,
				    !c->copygc_running ||
				    kthread_should_stop());
	}

	do {
		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

		if (delay) {
			bch2_trans_unlock(trans);
			set_current_state(TASK_INTERRUPTIBLE);
		}

		if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			return 1;
		}

		if (delay)
			schedule_timeout(delay);

		if (unlikely(freezing(current))) {
			move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
			try_to_freeze();
		}
	} while (delay);

	move_ctxt_wait_event(ctxt, trans,
		atomic_read(&ctxt->write_sectors) <
		c->opts.move_bytes_in_flight >> 9);

	move_ctxt_wait_event(ctxt, trans,
		atomic_read(&ctxt->read_sectors) <
		c->opts.move_bytes_in_flight >> 9);

	return 0;
}

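/*
 * Look up the IO options for the inode that @k belongs to, caching the last
 * inode number in @cur_inum to avoid repeating the lookup for every extent:
 */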
static int move_get_io_opts(struct btree_trans *trans,
			    struct bch_io_opts *io_opts,
			    struct bkey_s_c k, u64 *cur_inum)
{
	struct bch_inode_unpacked inode;
	int ret;

	if (*cur_inum == k.k->p.inode)
		return 0;

	ret = lookup_inode(trans,
			   SPOS(0, k.k->p.inode, k.k->p.snapshot),
			   &inode);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	if (!ret)
		bch2_inode_opts_get(io_opts, trans->c, &inode);
	else
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
	*cur_inum = k.k->p.inode;
	return 0;
}

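/*
 * Walk keys in @btree_id from @start to @end, calling @pred on each extent to
 * decide whether (and how) to move it:
 */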
static int __bch2_move_data(struct moving_context *ctxt,
			    struct bpos start,
			    struct bpos end,
			    move_pred_fn pred, void *arg,
			    enum btree_id btree_id)
{
	struct bch_fs *c = ctxt->c;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct bkey_buf sk;
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	u64 cur_inum = U64_MAX;
	int ret = 0, ret2;

	bch2_bkey_buf_init(&sk);
	bch2_trans_init(&trans, c, 0, 0);

	ctxt->stats->data_type	= BCH_DATA_user;
	ctxt->stats->btree_id	= btree_id;
	ctxt->stats->pos	= start;

	bch2_trans_iter_init(&trans, &iter, btree_id, start,
			     BTREE_ITER_PREFETCH|
			     BTREE_ITER_ALL_SNAPSHOTS);

	if (ctxt->rate)
		bch2_ratelimit_reset(ctxt->rate);

	while (!move_ratelimit(&trans, ctxt)) {
		bch2_trans_begin(&trans);

		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(bkey_start_pos(k.k), end))
			break;

		ctxt->stats->pos = iter.pos;

		if (!bkey_extent_is_direct_data(k.k))
			goto next_nondata;

		ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
		if (ret)
			continue;

		memset(&data_opts, 0, sizeof(data_opts));
		if (!pred(c, arg, k, &io_opts, &data_opts))
			goto next;

		/*
		 * The iterator gets unlocked by __bch2_read_extent - need to
		 * save a copy of @k elsewhere:
		 */
		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);
		bch2_trans_unlock(&trans);

		ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts,
					btree_id, k, data_opts);
		if (ret2) {
			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
				continue;

			if (ret2 == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt, &trans);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}

		if (ctxt->rate)
			bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
		atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
		bch2_btree_iter_advance(&iter);
	}

	bch2_trans_iter_exit(&trans, &iter);
	bch2_trans_exit(&trans);
	bch2_bkey_buf_exit(&sk, c);

	return ret;
}

int bch2_move_data(struct bch_fs *c,
		   enum btree_id start_btree_id, struct bpos start_pos,
		   enum btree_id end_btree_id,   struct bpos end_pos,
		   struct bch_ratelimit *rate,
		   struct bch_move_stats *stats,
		   struct write_point_specifier wp,
		   bool wait_on_copygc,
		   move_pred_fn pred, void *arg)
{
	struct moving_context ctxt;
	enum btree_id id;
	int ret;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);

	for (id = start_btree_id;
	     id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
	     id++) {
		stats->btree_id = id;

		if (id != BTREE_ID_extents &&
		    id != BTREE_ID_reflink)
			continue;

		ret = __bch2_move_data(&ctxt,
				       id == start_btree_id ? start_pos : POS_MIN,
				       id == end_btree_id   ? end_pos   : POS_MAX,
				       pred, arg, id);
		if (ret)
			break;
	}

	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

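/*
 * Debug helper: after evacuating a bucket, check the alloc key for dirty btree
 * data still pointing at it; if flushing interior btree updates doesn't clear
 * it, print the keys that still reference the bucket:
 */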
static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	struct bch_backpointer bp;
	u64 bp_offset = 0;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     bucket, BTREE_ITER_CACHED);
again:
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);

	if (!ret && k.k->type == KEY_TYPE_alloc_v4) {
		struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);

		if (a.v->gen == gen &&
		    a.v->dirty_sectors) {
			if (a.v->data_type == BCH_DATA_btree) {
				bch2_trans_unlock(trans);
				if (bch2_btree_interior_updates_flush(c))
					goto again;
				goto failed_to_evacuate;
			}
		}
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
failed_to_evacuate:
	bch2_trans_iter_exit(trans, &iter);

	prt_printf(&buf, bch2_log_msg(c, "failed to evacuate bucket "));
	bch2_bkey_val_to_text(&buf, c, k);

	while (1) {
		bch2_trans_begin(trans);

		ret = bch2_get_next_backpointer(trans, bucket, gen,
						&bp_offset, &bp,
						BTREE_ITER_CACHED);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
		if (bp_offset == U64_MAX)
			break;

		k = bch2_backpointer_get_key(trans, &iter,
					     bucket, bp_offset, bp);
		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
		if (!k.k)
			continue;
		prt_newline(&buf);
		bch2_bkey_val_to_text(&buf, c, k);
		bch2_trans_iter_exit(trans, &iter);
	}

	bch2_print_string_as_lines(KERN_ERR, buf.buf);
	printbuf_exit(&buf);
	return 0;
}

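/*
 * Evacuate a single bucket: walk its backpointers, moving the extents and
 * rewriting the btree nodes that still point into it:
 */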
int __bch2_evacuate_bucket(struct moving_context *ctxt,
			   struct bpos bucket, int gen,
			   struct data_update_opts _data_opts)
{
	struct bch_fs *c = ctxt->c;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bch_backpointer bp;
	struct data_update_opts data_opts;
	u64 bp_offset = 0, cur_inum = U64_MAX;
	int ret = 0;

	bch2_bkey_buf_init(&sk);
	bch2_trans_init(&trans, c, 0, 0);

	while (!(ret = move_ratelimit(&trans, ctxt))) {
		bch2_trans_begin(&trans);

		ret = bch2_get_next_backpointer(&trans, bucket, gen,
						&bp_offset, &bp,
						BTREE_ITER_CACHED);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;
		if (bp_offset == U64_MAX)
			break;

		if (!bp.level) {
			const struct bch_extent_ptr *ptr;
			struct bkey_s_c k;
			unsigned i = 0;

			k = bch2_backpointer_get_key(&trans, &iter,
						bucket, bp_offset, bp);
			ret = bkey_err(k);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!k.k)
				continue;

			bch2_bkey_buf_reassemble(&sk, c, k);
			k = bkey_i_to_s_c(sk.k);

			ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
			if (ret) {
				bch2_trans_iter_exit(&trans, &iter);
				continue;
			}

			data_opts = _data_opts;
			data_opts.target	= io_opts.background_target;
			data_opts.rewrite_ptrs	= 0;

			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
				if (ptr->dev == bucket.inode)
					data_opts.rewrite_ptrs |= 1U << i;
				i++;
			}

			ret = bch2_move_extent(&trans, &iter, ctxt, io_opts,
					       bp.btree_id, k, data_opts);
			bch2_trans_iter_exit(&trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt, &trans);
				continue;
			}
			if (ret)
				goto err;

			if (ctxt->rate)
				bch2_ratelimit_increment(ctxt->rate, k.k->size);
			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
		} else {
			struct btree *b;

			b = bch2_backpointer_get_node(&trans, &iter,
						bucket, bp_offset, bp);
			ret = PTR_ERR_OR_ZERO(b);
			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!b)
				continue;

			ret = bch2_btree_node_rewrite(&trans, &iter, b, 0);
			bch2_trans_iter_exit(&trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;

			if (ctxt->rate)
				bch2_ratelimit_increment(ctxt->rate,
							 c->opts.btree_node_size >> 9);
			atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
			atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
		}

		bp_offset++;
	}

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
		bch2_trans_unlock(&trans);
		move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
		closure_sync(&ctxt->cl);
		if (!ctxt->write_error)
			lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen));
	}
err:
	bch2_trans_exit(&trans);
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

int bch2_evacuate_bucket(struct bch_fs *c,
			 struct bpos bucket, int gen,
			 struct data_update_opts data_opts,
			 struct bch_ratelimit *rate,
			 struct bch_move_stats *stats,
			 struct write_point_specifier wp,
			 bool wait_on_copygc)
{
	struct moving_context ctxt;
	int ret;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts);
	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
				struct btree *, struct bch_io_opts *,
				struct data_update_opts *);

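/*
 * Walk btree nodes in the given range, rewriting those selected by @pred:
 */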
static int bch2_move_btree(struct bch_fs *c,
			   enum btree_id start_btree_id, struct bpos start_pos,
			   enum btree_id end_btree_id,   struct bpos end_pos,
			   move_btree_pred pred, void *arg,
			   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_trans trans;
	struct btree_iter iter;
	struct btree *b;
	enum btree_id id;
	struct data_update_opts data_opts;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);
	progress_list_add(c, stats);

	stats->data_type = BCH_DATA_btree;

	for (id = start_btree_id;
	     id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
	     id++) {
		stats->btree_id = id;

		bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
					  BTREE_ITER_PREFETCH);
retry:
		ret = 0;
		while (bch2_trans_begin(&trans),
		       (b = bch2_btree_iter_peek_node(&iter)) &&
		       !(ret = PTR_ERR_OR_ZERO(b))) {
			if (kthread && kthread_should_stop())
				break;

			if ((cmp_int(id, end_btree_id) ?:
			     bpos_cmp(b->key.k.p, end_pos)) > 0)
				break;

			stats->pos = iter.pos;

			if (!pred(c, arg, b, &io_opts, &data_opts))
				goto next;

			ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;
next:
			bch2_btree_iter_next_node(&iter);
		}
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		bch2_trans_iter_exit(&trans, &iter);

		if (kthread && kthread_should_stop())
			break;
	}

	bch2_trans_exit(&trans);

	if (ret)
		bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));

	bch2_btree_interior_updates_flush(c);

	progress_list_del(c, stats);
	return ret;
}

static bool rereplicate_pred(struct bch_fs *c, void *arg,
			     struct bkey_s_c k,
			     struct bch_io_opts *io_opts,
			     struct data_update_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;

	if (!nr_good || nr_good >= replicas)
		return false;

	data_opts->target		= 0;
	data_opts->extra_replicas	= replicas - nr_good;
	data_opts->btree_insert_flags	= 0;
	return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
			 struct bkey_s_c k,
			 struct bch_io_opts *io_opts,
			 struct data_update_opts *data_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	struct bch_ioctl_data *op = arg;
	unsigned i = 0;

	data_opts->rewrite_ptrs		= 0;
	data_opts->target		= 0;
	data_opts->extra_replicas	= 0;
	data_opts->btree_insert_flags	= 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == op->migrate.dev)
			data_opts->rewrite_ptrs |= 1U << i;
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
			       struct btree *b,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

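/*
 * Detect packed bkey formats that can encode values which don't fit in the
 * current unpacked format (see BCH_COMPAT_bformat_overflow_done below); nodes
 * using such a format need to be rewritten:
 */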
static bool bformat_needs_redo(struct bkey_format *f)
{
	unsigned i;

	for (i = 0; i < f->nr_fields; i++) {
		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
		u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
		u64 field_offset = le64_to_cpu(f->field_offset[i]);

		if (f->bits_per_field[i] > unpacked_bits)
			return true;

		if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
			return true;

		if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
		     unpacked_mask) <
		    field_offset)
			return true;
	}

	return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (b->version_ondisk != c->sb.version ||
	    btree_node_need_rewrite(b) ||
	    bformat_needs_redo(&b->format)) {
		data_opts->target		= 0;
		data_opts->extra_replicas	= 0;
		data_opts->btree_insert_flags	= 0;
		return true;
	}

	return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
	int ret;

	ret = bch2_move_btree(c,
			      0,		POS_MIN,
			      BTREE_ID_NR,	SPOS_MAX,
			      rewrite_old_nodes_pred, c, stats);
	if (!ret) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	return ret;
}

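/*
 * Run a data job described by @op (from the data ioctl interface):
 * rereplicate, migrate data off a device, or rewrite old-format btree nodes:
 */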
int bch2_data_job(struct bch_fs *c,
		  struct bch_move_stats *stats,
		  struct bch_ioctl_data op)
{
	int ret = 0;

	switch (op.op) {
	case BCH_DATA_OP_REREPLICATE:
		bch2_move_stats_init(stats, "rereplicate");
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);

		ret = bch2_move_btree(c,
				      op.start_btree,	op.start_pos,
				      op.end_btree,	op.end_pos,
				      rereplicate_btree_pred, c, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		ret = bch2_move_data(c,
				     op.start_btree,	op.start_pos,
				     op.end_btree,	op.end_pos,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     rereplicate_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_MIGRATE:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		bch2_move_stats_init(stats, "migrate");
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);

		ret = bch2_move_btree(c,
				      op.start_btree,	op.start_pos,
				      op.end_btree,	op.end_pos,
				      migrate_btree_pred, &op, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		ret = bch2_move_data(c,
				     op.start_btree,	op.start_pos,
				     op.end_btree,	op.end_pos,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     migrate_pred, &op) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_REWRITE_OLD_NODES:
		bch2_move_stats_init(stats, "rewrite_old_nodes");
		ret = bch2_scan_old_btree_nodes(c, stats);
		break;
	default:
		ret = -EINVAL;
	}

	return ret;
}