// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
        BCH_DATA_OPS()
#undef x
        NULL
};

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent(c, buf.buf);
                printbuf_exit(&buf);
        }
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_read_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent_read(c, buf.buf);
                printbuf_exit(&buf);
        }
}

static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_alloc_mem_fail_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent_alloc_mem_fail(c, buf.buf);
                printbuf_exit(&buf);
        }
}

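/*
 * Tracks a single extent move in flight: the read into @rbio, and the
 * corresponding write, issued as a data_update once the read completes:
 */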
struct moving_io {
        struct list_head                read_list;
        struct list_head                io_list;
        struct move_bucket_in_flight    *b;
        struct closure                  cl;
        bool                            read_completed;

        unsigned                        read_sectors;
        unsigned                        write_sectors;

        struct bch_read_bio             rbio;

        struct data_update              write;
        /* Must be last since it is variable size */
        struct bio_vec                  bi_inline_vecs[0];
};

static void move_free(struct moving_io *io)
{
        struct moving_context *ctxt = io->write.ctxt;

        if (io->b)
                atomic_dec(&io->b->count);

        bch2_data_update_exit(&io->write);

        mutex_lock(&ctxt->lock);
        list_del(&io->io_list);
        wake_up(&ctxt->wait);
        mutex_unlock(&ctxt->lock);

        kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
        struct moving_io *io = container_of(op, struct moving_io, write.op);
        struct moving_context *ctxt = io->write.ctxt;

        if (io->write.op.error)
                ctxt->write_error = true;

        atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_dec(&io->write.ctxt->write_ios);
        move_free(io);
        closure_put(&ctxt->cl);
}

static void move_write(struct moving_io *io)
{
        if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
                move_free(io);
                return;
        }

        closure_get(&io->write.ctxt->cl);
        atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_inc(&io->write.ctxt->write_ios);

        bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

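/*
 * Return the oldest read on @ctxt's list, if it has completed - writes are
 * issued in the same order the reads were started:
 */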
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
        struct moving_io *io =
                list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

        return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
        struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
        struct moving_context *ctxt = io->write.ctxt;

        atomic_sub(io->read_sectors, &ctxt->read_sectors);
        atomic_dec(&ctxt->read_ios);
        io->read_completed = true;

        wake_up(&ctxt->wait);
        closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
        struct moving_io *io;

        while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
                bch2_trans_unlock_long(ctxt->trans);
                list_del(&io->read_list);
                move_write(io);
        }
}

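/*
 * Wait until the count of in flight write sectors changes - used to throttle
 * callers (e.g. after a memory allocation failure) until some writes complete:
 */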
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

        move_ctxt_wait_event(ctxt,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
}

static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
        move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
        bch2_trans_unlock_long(ctxt->trans);
        closure_sync(&ctxt->cl);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;

        bch2_moving_ctxt_flush_all(ctxt);

        EBUG_ON(atomic_read(&ctxt->write_sectors));
        EBUG_ON(atomic_read(&ctxt->write_ios));
        EBUG_ON(atomic_read(&ctxt->read_sectors));
        EBUG_ON(atomic_read(&ctxt->read_ios));

        mutex_lock(&c->moving_context_lock);
        list_del(&ctxt->list);
        mutex_unlock(&c->moving_context_lock);

        bch2_trans_put(ctxt->trans);
        memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
                           struct bch_fs *c,
                           struct bch_ratelimit *rate,
                           struct bch_move_stats *stats,
                           struct write_point_specifier wp,
                           bool wait_on_copygc)
{
        memset(ctxt, 0, sizeof(*ctxt));

        ctxt->trans     = bch2_trans_get(c);
        ctxt->fn        = (void *) _RET_IP_;
        ctxt->rate      = rate;
        ctxt->stats     = stats;
        ctxt->wp        = wp;
        ctxt->wait_on_copygc = wait_on_copygc;

        closure_init_stack(&ctxt->cl);

        mutex_init(&ctxt->lock);
        INIT_LIST_HEAD(&ctxt->reads);
        INIT_LIST_HEAD(&ctxt->ios);
        init_waitqueue_head(&ctxt->wait);

        mutex_lock(&c->moving_context_lock);
        list_add(&ctxt->list, &c->moving_context_list);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
        trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
        memset(stats, 0, sizeof(*stats));
        stats->data_type = BCH_DATA_user;
        scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

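/*
 * Move a single extent: allocate a moving_io, issue the read, and arrange for
 * the write (a data_update) to be submitted from move_write() once the read
 * completes:
 */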
int bch2_move_extent(struct moving_context *ctxt,
                     struct move_bucket_in_flight *bucket_in_flight,
                     struct btree_iter *iter,
                     struct bkey_s_c k,
                     struct bch_io_opts io_opts,
                     struct data_update_opts data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct moving_io *io;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;

        if (ctxt->stats)
                ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
        trace_move_extent2(c, k);

        bch2_data_update_opts_normalize(k, &data_opts);

        if (!data_opts.rewrite_ptrs &&
            !data_opts.extra_replicas) {
                if (data_opts.kill_ptrs)
                        return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
                return 0;
        }

        /*
         * Before memory allocations & taking nocow locks in
         * bch2_data_update_init():
         */
        bch2_trans_unlock(trans);

        /* write path might have to decompress data: */
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

        pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        io = kzalloc(sizeof(struct moving_io) +
                     sizeof(struct bio_vec) * pages, GFP_KERNEL);
        if (!io)
                goto err;

        INIT_LIST_HEAD(&io->io_list);
        io->write.ctxt          = ctxt;
        io->read_sectors        = k.k->size;
        io->write_sectors       = k.k->size;

        bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        bio_set_prio(&io->write.op.wbio.bio,
                     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

        if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
                                 GFP_KERNEL))
                goto err_free;

        io->rbio.c              = c;
        io->rbio.opts           = io_opts;
        bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        io->rbio.bio.bi_vcnt = pages;
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;

        io->rbio.bio.bi_opf             = REQ_OP_READ;
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;

        ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
                                    io_opts, data_opts, iter->btree_id, k);
        if (ret)
                goto err_free_pages;

        io->write.op.end_io = move_write_done;

        if (ctxt->rate)
                bch2_ratelimit_increment(ctxt->rate, k.k->size);

        if (ctxt->stats) {
                atomic64_inc(&ctxt->stats->keys_moved);
                atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
        }

        if (bucket_in_flight) {
                io->b = bucket_in_flight;
                atomic_inc(&io->b->count);
        }

        this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
        this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
        trace_move_extent_read2(c, k);

        mutex_lock(&ctxt->lock);
        atomic_add(io->read_sectors, &ctxt->read_sectors);
        atomic_inc(&ctxt->read_ios);

        list_add_tail(&io->read_list, &ctxt->reads);
        list_add_tail(&io->io_list, &ctxt->ios);
        mutex_unlock(&ctxt->lock);

        /*
         * dropped by move_read_endio() - guards against use after free of
         * ctxt when doing wakeup
         */
        closure_get(&ctxt->cl);
        bch2_read_extent(trans, &io->rbio,
                         bkey_start_pos(k.k),
                         iter->btree_id, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
err_free_pages:
        bio_free_pages(&io->write.op.wbio.bio);
err_free:
        kfree(io);
err:
        if (ret == -BCH_ERR_data_update_done)
                return 0;

        this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
        trace_move_extent_alloc_mem_fail2(c, k);
        return ret;
}

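/*
 * Look up the IO options for the inode that owns @extent_k, caching one entry
 * per snapshot of the current inode number in @io_opts:
 */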
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
                          struct per_snapshot_io_opts *io_opts,
                          struct bkey_s_c extent_k)
{
        struct bch_fs *c = trans->c;
        u32 restart_count = trans->restart_count;
        int ret = 0;

        if (io_opts->cur_inum != extent_k.k->p.inode) {
                struct btree_iter iter;
                struct bkey_s_c k;

                io_opts->d.nr = 0;

                for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
                                   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                        if (k.k->p.offset != extent_k.k->p.inode)
                                break;

                        if (!bkey_is_inode(k.k))
                                continue;

                        struct bch_inode_unpacked inode;
                        BUG_ON(bch2_inode_unpack(k, &inode));

                        struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
                        bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

                        ret = darray_push(&io_opts->d, e);
                        if (ret)
                                break;
                }
                bch2_trans_iter_exit(trans, &iter);
                io_opts->cur_inum = extent_k.k->p.inode;
        }

        ret = ret ?: trans_was_restarted(trans, restart_count);
        if (ret)
                return ERR_PTR(ret);

        if (extent_k.k->p.snapshot) {
                struct snapshot_io_opts_entry *i;
                darray_for_each(io_opts->d, i)
                        if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
                                return &i->io_opts;
        }

        return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
                              struct bch_io_opts *io_opts,
                              struct bkey_s_c extent_k)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        /* reflink btree? */
        if (!extent_k.k->p.inode) {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
                return 0;
        }

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                               SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
                               BTREE_ITER_CACHED);
        ret = bkey_err(k);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ret;

        if (!ret && bkey_is_inode(k.k)) {
                struct bch_inode_unpacked inode;
                bch2_inode_unpack(k, &inode);
                bch2_inode_opts_get(io_opts, trans->c, &inode);
        } else {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
        }

        bch2_trans_iter_exit(trans, &iter);
        return 0;
}

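/*
 * Throttle the move path: optionally wait for copygc to finish, honour the
 * configured rate limit, and block while too much IO is in flight. Returns
 * nonzero if the caller should stop:
 */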
int bch2_move_ratelimit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;
        u64 delay;

        if (ctxt->wait_on_copygc && c->copygc_running) {
                bch2_moving_ctxt_flush_all(ctxt);
                wait_event_killable(c->copygc_running_wq,
                                    !c->copygc_running ||
                                    kthread_should_stop());
        }

        do {
                delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

                if (kthread_should_stop())
                        return 1;

                if (delay)
                        move_ctxt_wait_event_timeout(ctxt,
                                        freezing(current) || kthread_should_stop(),
                                        delay);

                if (unlikely(freezing(current))) {
                        bch2_moving_ctxt_flush_all(ctxt);
                        try_to_freeze();
                }
        } while (delay);

        /*
         * XXX: these limits really ought to be per device, SSDs and hard drives
         * will want different limits
         */
        move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
                atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

        return 0;
}

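/*
 * Walk extents in a single btree between @start and @end, moving those
 * selected by @pred:
 */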
static int bch2_move_data_btree(struct moving_context *ctxt,
                                struct bpos start,
                                struct bpos end,
                                move_pred_fn pred, void *arg,
                                enum btree_id btree_id)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct per_snapshot_io_opts snapshot_io_opts;
        struct bch_io_opts *io_opts;
        struct bkey_buf sk;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        int ret = 0, ret2;

        per_snapshot_io_opts_init(&snapshot_io_opts, c);
        bch2_bkey_buf_init(&sk);

        if (ctxt->stats) {
                ctxt->stats->data_type  = BCH_DATA_user;
                ctxt->stats->pos        = BBPOS(btree_id, start);
        }

        bch2_trans_iter_init(trans, &iter, btree_id, start,
                             BTREE_ITER_PREFETCH|
                             BTREE_ITER_ALL_SNAPSHOTS);

        if (ctxt->rate)
                bch2_ratelimit_reset(ctxt->rate);

        while (!bch2_move_ratelimit(ctxt)) {
                bch2_trans_begin(trans);

                k = bch2_btree_iter_peek(&iter);
                if (!k.k)
                        break;

                ret = bkey_err(k);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;

                if (bkey_ge(bkey_start_pos(k.k), end))
                        break;

                if (ctxt->stats)
                        ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;

                io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
                ret = PTR_ERR_OR_ZERO(io_opts);
                if (ret)
                        continue;

                memset(&data_opts, 0, sizeof(data_opts));
                if (!pred(c, arg, k, io_opts, &data_opts))
                        goto next;

                /*
                 * The iterator gets unlocked by __bch2_read_extent - need to
                 * save a copy of @k elsewhere:
                 */
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);

                ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
                if (ret2) {
                        if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
                                continue;

                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }

                        /* XXX signal failure */
                        goto next;
                }
next:
                if (ctxt->stats)
                        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
                bch2_btree_iter_advance(&iter);
        }

        bch2_trans_iter_exit(trans, &iter);
        bch2_bkey_buf_exit(&sk, c);
        per_snapshot_io_opts_exit(&snapshot_io_opts);

        return ret;
}

int __bch2_move_data(struct moving_context *ctxt,
                     struct bbpos start,
                     struct bbpos end,
                     move_pred_fn pred, void *arg)
{
        struct bch_fs *c = ctxt->trans->c;
        enum btree_id id;
        int ret = 0;

        for (id = start.btree;
             id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             id++) {
                ctxt->stats->pos = BBPOS(id, POS_MIN);

                if (!btree_type_has_ptrs(id) ||
                    !bch2_btree_id_root(c, id)->b)
                        continue;

                ret = bch2_move_data_btree(ctxt,
                                       id == start.btree ? start.pos : POS_MIN,
                                       id == end.btree   ? end.pos   : POS_MAX,
                                       pred, arg, id);
                if (ret)
                        break;
        }

        return ret;
}

int bch2_move_data(struct bch_fs *c,
                   struct bbpos start,
                   struct bbpos end,
                   struct bch_ratelimit *rate,
                   struct bch_move_stats *stats,
                   struct write_point_specifier wp,
                   bool wait_on_copygc,
                   move_pred_fn pred, void *arg)
{
        struct moving_context ctxt;
        int ret;

        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
        ret = __bch2_move_data(&ctxt, start, end, pred, arg);
        bch2_moving_ctxt_exit(&ctxt);

        return ret;
}

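/*
 * Walk the backpointers for @bucket and move all data out of it: extents are
 * moved with bch2_move_extent(), btree nodes are rewritten with
 * bch2_btree_node_rewrite():
 */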
int __bch2_evacuate_bucket(struct moving_context *ctxt,
                           struct move_bucket_in_flight *bucket_in_flight,
                           struct bpos bucket, int gen,
                           struct data_update_opts _data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_iter iter;
        struct bkey_buf sk;
        struct bch_backpointer bp;
        struct bch_alloc_v4 a_convert;
        const struct bch_alloc_v4 *a;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
        u64 fragmentation;
        struct bpos bp_pos = POS_MIN;
        int ret = 0;

        trace_bucket_evacuate(c, &bucket);

        bch2_bkey_buf_init(&sk);

        /*
         * We're not run in a context that handles transaction restarts:
         */
        bch2_trans_begin(trans);

        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             bucket, BTREE_ITER_CACHED);
        ret = lockrestart_do(trans,
                        bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
        bch2_trans_iter_exit(trans, &iter);

        if (ret) {
                bch_err_msg(c, ret, "looking up alloc key");
                goto err;
        }

        a = bch2_alloc_to_v4(k, &a_convert);
        dirty_sectors = bch2_bucket_sectors_dirty(*a);
        bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
        fragmentation = a->fragmentation_lru;

        ret = bch2_btree_write_buffer_tryflush(trans);
        bch_err_msg(c, ret, "flushing btree write buffer");
        if (ret)
                goto err;

        while (!(ret = bch2_move_ratelimit(ctxt))) {
                bch2_trans_begin(trans);

                ret = bch2_get_next_backpointer(trans, bucket, gen,
                                                &bp_pos, &bp,
                                                BTREE_ITER_CACHED);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        goto err;
                if (bkey_eq(bp_pos, POS_MAX))
                        break;

                if (!bp.level) {
                        const struct bch_extent_ptr *ptr;
                        unsigned i = 0;

                        k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
                        ret = bkey_err(k);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!k.k)
                                goto next;

                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);

                        ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
                        if (ret) {
                                bch2_trans_iter_exit(trans, &iter);
                                continue;
                        }

                        data_opts = _data_opts;
                        data_opts.target        = io_opts.background_target;
                        data_opts.rewrite_ptrs = 0;

                        bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
                                if (ptr->dev == bucket.inode) {
                                        data_opts.rewrite_ptrs |= 1U << i;
                                        if (ptr->cached) {
                                                bch2_trans_iter_exit(trans, &iter);
                                                goto next;
                                        }
                                }
                                i++;
                        }

                        ret = bch2_move_extent(ctxt, bucket_in_flight,
                                               &iter, k, io_opts, data_opts);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
                        if (ret)
                                goto err;

                        if (ctxt->stats)
                                atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
                } else {
                        struct btree *b;

                        b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
                        ret = PTR_ERR_OR_ZERO(b);
                        if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
                                continue;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!b)
                                goto next;

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;

                        if (ctxt->rate)
                                bch2_ratelimit_increment(ctxt->rate,
                                                         c->opts.btree_node_size >> 9);
                        if (ctxt->stats) {
                                atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
                                atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
                        }
                }
next:
                bp_pos = bpos_nosnap_successor(bp_pos);
        }

        trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
        bch2_bkey_buf_exit(&sk, c);
        return ret;
}

int bch2_evacuate_bucket(struct bch_fs *c,
                         struct bpos bucket, int gen,
                         struct data_update_opts data_opts,
                         struct bch_ratelimit *rate,
                         struct bch_move_stats *stats,
                         struct write_point_specifier wp,
                         bool wait_on_copygc)
{
        struct moving_context ctxt;
        int ret;

        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
        ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
        bch2_moving_ctxt_exit(&ctxt);

        return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
                                struct btree *, struct bch_io_opts *,
                                struct data_update_opts *);

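/*
 * Walk btree nodes between @start and @end, rewriting those selected by
 * @pred:
 */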
static int bch2_move_btree(struct bch_fs *c,
                           struct bbpos start,
                           struct bbpos end,
                           move_btree_pred pred, void *arg,
                           struct bch_move_stats *stats)
{
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct moving_context ctxt;
        struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        enum btree_id btree;
        struct data_update_opts data_opts;
        int ret = 0;

        bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
                              writepoint_ptr(&c->btree_write_point),
                              true);
        trans = ctxt.trans;

        stats->data_type = BCH_DATA_btree;

        for (btree = start.btree;
             btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             btree++) {
                stats->pos = BBPOS(btree, POS_MIN);

                if (!bch2_btree_id_root(c, btree)->b)
                        continue;

                bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
                                          BTREE_ITER_PREFETCH);
retry:
                ret = 0;
                while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (kthread_should_stop())
                                break;

                        if ((cmp_int(btree, end.btree) ?:
                             bpos_cmp(b->key.k.p, end.pos)) > 0)
                                break;

                        stats->pos = BBPOS(iter.btree_id, iter.pos);

                        if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                break;
next:
                        bch2_btree_iter_next_node(&iter);
                }
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;

                bch2_trans_iter_exit(trans, &iter);

                if (kthread_should_stop())
                        break;
        }

        bch_err_fn(c, ret);
        bch2_moving_ctxt_exit(&ctxt);
        bch2_btree_interior_updates_flush(c);

        return ret;
}

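/*
 * Returns true if @k has fewer replicas than required, setting
 * extra_replicas to make up the difference:
 */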
static bool rereplicate_pred(struct bch_fs *c, void *arg,
                             struct bkey_s_c k,
                             struct bch_io_opts *io_opts,
                             struct data_update_opts *data_opts)
{
        unsigned nr_good = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;

        if (!nr_good || nr_good >= replicas)
                return false;

        data_opts->target               = 0;
        data_opts->extra_replicas       = replicas - nr_good;
        data_opts->btree_insert_flags   = 0;
        return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
                         struct bkey_s_c k,
                         struct bch_io_opts *io_opts,
                         struct data_update_opts *data_opts)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
        struct bch_ioctl_data *op = arg;
        unsigned i = 0;

        data_opts->rewrite_ptrs         = 0;
        data_opts->target               = 0;
        data_opts->extra_replicas       = 0;
        data_opts->btree_insert_flags   = 0;

        bkey_for_each_ptr(ptrs, ptr) {
                if (ptr->dev == op->migrate.dev)
                        data_opts->rewrite_ptrs |= 1U << i;
                i++;
        }

        return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
                               struct btree *b,
                               struct bch_io_opts *io_opts,
                               struct data_update_opts *data_opts)
{
        return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

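/*
 * Returns true if the node's bkey format could produce packed keys that
 * overflow when unpacked - such nodes need to be rewritten (see
 * BCH_COMPAT_bformat_overflow_done):
 */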
static bool bformat_needs_redo(struct bkey_format *f)
{
        unsigned i;

        for (i = 0; i < f->nr_fields; i++) {
                unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
                u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
                u64 field_offset = le64_to_cpu(f->field_offset[i]);

                if (f->bits_per_field[i] > unpacked_bits)
                        return true;

                if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
                        return true;

                if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
                     unpacked_mask) <
                    field_offset)
                        return true;
        }

        return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        if (b->version_ondisk != c->sb.version ||
            btree_node_need_rewrite(b) ||
            bformat_needs_redo(&b->format)) {
                data_opts->target               = 0;
                data_opts->extra_replicas       = 0;
                data_opts->btree_insert_flags   = 0;
                return true;
        }

        return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
        int ret;

        ret = bch2_move_btree(c,
                              BBPOS_MIN,
                              BBPOS_MAX,
                              rewrite_old_nodes_pred, c, stats);
        if (!ret) {
                mutex_lock(&c->sb_lock);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
                c->disk_sb.sb->version_min = c->disk_sb.sb->version;
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
        }

        bch_err_fn(c, ret);
        return ret;
}

static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
                             struct bkey_s_c k,
                             struct bch_io_opts *io_opts,
                             struct data_update_opts *data_opts)
{
        unsigned durability = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned i = 0;

        bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
                unsigned d = bch2_extent_ptr_durability(c, &p);

                if (d && durability - d >= replicas) {
                        data_opts->kill_ptrs |= BIT(i);
                        durability -= d;
                }

                i++;
        }

        return data_opts->kill_ptrs != 0;
}

static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

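/*
 * Run a data job described by @op: dispatches to rereplicate, migrate,
 * rewrite_old_nodes or drop_extra_replicas:
 */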
int bch2_data_job(struct bch_fs *c,
                  struct bch_move_stats *stats,
                  struct bch_ioctl_data op)
{
        struct bbpos start      = BBPOS(op.start_btree, op.start_pos);
        struct bbpos end        = BBPOS(op.end_btree, op.end_pos);
        int ret = 0;

        if (op.op >= BCH_DATA_OP_NR)
                return -EINVAL;

        bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);

        switch (op.op) {
        case BCH_DATA_OP_rereplicate:
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
                ret = bch2_move_btree(c, start, end,
                                      rereplicate_btree_pred, c, stats) ?: ret;
                ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_migrate:
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;

                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
                ret = bch2_move_btree(c, start, end,
                                      migrate_btree_pred, &op, stats) ?: ret;
                ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_rewrite_old_nodes:
                ret = bch2_scan_old_btree_nodes(c, stats);
                break;
        case BCH_DATA_OP_drop_extra_replicas:
                ret = bch2_move_btree(c, start, end,
                                drop_extra_replicas_btree_pred, c, stats) ?: ret;
                ret = bch2_move_data(c, start, end, NULL, stats,
                                writepoint_hashed((unsigned long) current),
                                true,
                                drop_extra_replicas_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        default:
                ret = -EINVAL;
        }

        bch2_move_stats_exit(stats, c);
        return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
        prt_printf(out, "%s: data type=%s pos=",
                   stats->name,
                   bch2_data_types[stats->data_type]);
        bch2_bbpos_to_text(out, stats->pos);
        prt_newline(out);
        printbuf_indent_add(out, 2);

        prt_str(out, "keys moved:  ");
        prt_u64(out, atomic64_read(&stats->keys_moved));
        prt_newline(out);

        prt_str(out, "keys raced:  ");
        prt_u64(out, atomic64_read(&stats->keys_raced));
        prt_newline(out);

        prt_str(out, "bytes seen:  ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
        prt_newline(out);

        prt_str(out, "bytes moved: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
        prt_newline(out);

        prt_str(out, "bytes raced: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
        prt_newline(out);

        printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
        struct moving_io *io;

        bch2_move_stats_to_text(out, ctxt->stats);
        printbuf_indent_add(out, 2);

        prt_printf(out, "reads: ios %u/%u sectors %u/%u",
                   atomic_read(&ctxt->read_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->read_sectors),
                   c->opts.move_bytes_in_flight >> 9);
        prt_newline(out);

        prt_printf(out, "writes: ios %u/%u sectors %u/%u",
                   atomic_read(&ctxt->write_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->write_sectors),
                   c->opts.move_bytes_in_flight >> 9);
        prt_newline(out);

        printbuf_indent_add(out, 2);

        mutex_lock(&ctxt->lock);
        list_for_each_entry(io, &ctxt->ios, io_list)
                bch2_write_op_to_text(out, &io->write.op);
        mutex_unlock(&ctxt->lock);

        printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct moving_context *ctxt;

        mutex_lock(&c->moving_context_lock);
        list_for_each_entry(ctxt, &c->moving_context_list, list)
                bch2_moving_ctxt_to_text(out, c, ctxt);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
        INIT_LIST_HEAD(&c->moving_context_list);
        mutex_init(&c->moving_context_lock);
}