// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent(c, buf.buf);
                printbuf_exit(&buf);
        }
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_read_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent_read(c, buf.buf);
                printbuf_exit(&buf);
        }
}

static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
{
        if (trace_move_extent_alloc_mem_fail_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, k);
                trace_move_extent_alloc_mem_fail(c, buf.buf);
                printbuf_exit(&buf);
        }
}

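/*
 * Tracks a single extent being moved: the read that fills the bounce pages
 * and the data_update that rewrites them, plus the list/refcount bookkeeping
 * on the owning moving_context.
 */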
struct moving_io {
        struct list_head                read_list;
        struct list_head                io_list;
        struct move_bucket_in_flight    *b;
        struct closure                  cl;
        bool                            read_completed;

        unsigned                        read_sectors;
        unsigned                        write_sectors;

        struct bch_read_bio             rbio;

        struct data_update              write;
        /* Must be last since it is variable size */
        struct bio_vec                  bi_inline_vecs[0];
};

static void move_free(struct moving_io *io)
{
        struct moving_context *ctxt = io->write.ctxt;

        if (io->b)
                atomic_dec(&io->b->count);

        bch2_data_update_exit(&io->write);

        mutex_lock(&ctxt->lock);
        list_del(&io->io_list);
        wake_up(&ctxt->wait);
        mutex_unlock(&ctxt->lock);

        kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
        struct moving_io *io = container_of(op, struct moving_io, write.op);
        struct moving_context *ctxt = io->write.ctxt;

        if (io->write.op.error)
                ctxt->write_error = true;

        atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_dec(&io->write.ctxt->write_ios);
        move_free(io);
        closure_put(&ctxt->cl);
}

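/*
 * Called once the read has completed: if the read failed or hit a hole there
 * is nothing to rewrite and the io is freed; otherwise the data update write
 * is kicked off.
 */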
static void move_write(struct moving_io *io)
{
        if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
                move_free(io);
                return;
        }

        closure_get(&io->write.ctxt->cl);
        atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_inc(&io->write.ctxt->write_ios);

        bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
        struct moving_io *io =
                list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

        return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
        struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
        struct moving_context *ctxt = io->write.ctxt;

        atomic_sub(io->read_sectors, &ctxt->read_sectors);
        atomic_dec(&ctxt->read_ios);
        io->read_completed = true;

        wake_up(&ctxt->wait);
        closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
        struct moving_io *io;

        while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
                bch2_trans_unlock_long(ctxt->trans);
                list_del(&io->read_list);
                move_write(io);
        }
}

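/*
 * Wait for in-flight writes to make progress: blocks until write_sectors
 * drops to zero or changes from its value on entry.
 */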
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

        move_ctxt_wait_event(ctxt,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;

        move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
        closure_sync(&ctxt->cl);

        EBUG_ON(atomic_read(&ctxt->write_sectors));
        EBUG_ON(atomic_read(&ctxt->write_ios));
        EBUG_ON(atomic_read(&ctxt->read_sectors));
        EBUG_ON(atomic_read(&ctxt->read_ios));

        mutex_lock(&c->moving_context_lock);
        list_del(&ctxt->list);
        mutex_unlock(&c->moving_context_lock);

        bch2_trans_put(ctxt->trans);
        memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
                           struct bch_fs *c,
                           struct bch_ratelimit *rate,
                           struct bch_move_stats *stats,
                           struct write_point_specifier wp,
                           bool wait_on_copygc)
{
        memset(ctxt, 0, sizeof(*ctxt));

        ctxt->trans     = bch2_trans_get(c);
        ctxt->fn        = (void *) _RET_IP_;
        ctxt->rate      = rate;
        ctxt->stats     = stats;
        ctxt->wp        = wp;
        ctxt->wait_on_copygc = wait_on_copygc;

        closure_init_stack(&ctxt->cl);

        mutex_init(&ctxt->lock);
        INIT_LIST_HEAD(&ctxt->reads);
        INIT_LIST_HEAD(&ctxt->ios);
        init_waitqueue_head(&ctxt->wait);

        mutex_lock(&c->moving_context_lock);
        list_add(&ctxt->list, &c->moving_context_list);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
        trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
        memset(stats, 0, sizeof(*stats));
        stats->data_type = BCH_DATA_user;
        scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

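/*
 * Drop the pointers selected by data_opts.kill_ptrs from @k, normalize the
 * result, and commit the update through the normal (non-extent) key path.
 */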
static int bch2_extent_drop_ptrs(struct btree_trans *trans,
                                 struct btree_iter *iter,
                                 struct bkey_s_c k,
                                 struct data_update_opts data_opts)
{
        struct bch_fs *c = trans->c;
        struct bkey_i *n;
        int ret;

        n = bch2_bkey_make_mut_noupdate(trans, k);
        ret = PTR_ERR_OR_ZERO(n);
        if (ret)
                return ret;

        while (data_opts.kill_ptrs) {
                unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
                struct bch_extent_ptr *ptr;

                bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
                data_opts.kill_ptrs ^= 1U << drop;
        }

        /*
         * If the new extent no longer has any pointers, bch2_extent_normalize()
         * will do the appropriate thing with it (turning it into a
         * KEY_TYPE_error key, or just a discard if it was a cached extent)
         */
        bch2_extent_normalize(c, bkey_i_to_s(n));

        /*
         * Since we're not inserting through an extent iterator
         * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
         * we aren't using the extent overwrite path to delete, we're
         * just using the normal key deletion path:
         */
        if (bkey_deleted(&n->k))
                n->k.size = 0;

        return bch2_trans_relock(trans) ?:
                bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}

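/*
 * Move a single extent: allocate a moving_io sized for the (possibly
 * decompressed) data, set up the data update, and issue the read; the write
 * is kicked off from the read completion path.
 */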
int bch2_move_extent(struct moving_context *ctxt,
                     struct move_bucket_in_flight *bucket_in_flight,
                     struct btree_iter *iter,
                     struct bkey_s_c k,
                     struct bch_io_opts io_opts,
                     struct data_update_opts data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct moving_io *io;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;

        if (ctxt->stats)
                ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
        trace_move_extent2(c, k);

        bch2_data_update_opts_normalize(k, &data_opts);

        if (!data_opts.rewrite_ptrs &&
            !data_opts.extra_replicas) {
                if (data_opts.kill_ptrs)
                        return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
                return 0;
        }

        /*
         * Before memory allocations & taking nocow locks in
         * bch2_data_update_init():
         */
        bch2_trans_unlock(trans);

        /* write path might have to decompress data: */
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

        pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        io = kzalloc(sizeof(struct moving_io) +
                     sizeof(struct bio_vec) * pages, GFP_KERNEL);
        if (!io)
                goto err;

        INIT_LIST_HEAD(&io->io_list);
        io->write.ctxt          = ctxt;
        io->read_sectors        = k.k->size;
        io->write_sectors       = k.k->size;

        bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        bio_set_prio(&io->write.op.wbio.bio,
                     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

        if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
                                 GFP_KERNEL))
                goto err_free;

        io->rbio.c              = c;
        io->rbio.opts           = io_opts;
        bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
        io->rbio.bio.bi_vcnt = pages;
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;

        io->rbio.bio.bi_opf             = REQ_OP_READ;
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;

        ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
                                    io_opts, data_opts, iter->btree_id, k);
        if (ret && ret != -BCH_ERR_unwritten_extent_update)
                goto err_free_pages;

        if (ret == -BCH_ERR_unwritten_extent_update) {
                bch2_update_unwritten_extent(trans, &io->write);
                move_free(io);
                return 0;
        }

        BUG_ON(ret);

        io->write.op.end_io = move_write_done;

        if (ctxt->rate)
                bch2_ratelimit_increment(ctxt->rate, k.k->size);

        if (ctxt->stats) {
                atomic64_inc(&ctxt->stats->keys_moved);
                atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
        }

        if (bucket_in_flight) {
                io->b = bucket_in_flight;
                atomic_inc(&io->b->count);
        }

        this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
        this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
        trace_move_extent_read2(c, k);

        mutex_lock(&ctxt->lock);
        atomic_add(io->read_sectors, &ctxt->read_sectors);
        atomic_inc(&ctxt->read_ios);

        list_add_tail(&io->read_list, &ctxt->reads);
        list_add_tail(&io->io_list, &ctxt->ios);
        mutex_unlock(&ctxt->lock);

        /*
         * dropped by move_read_endio() - guards against use after free of
         * ctxt when doing wakeup
         */
        closure_get(&ctxt->cl);
        bch2_read_extent(trans, &io->rbio,
                         bkey_start_pos(k.k),
                         iter->btree_id, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
err_free_pages:
        bio_free_pages(&io->write.op.wbio.bio);
err_free:
        kfree(io);
err:
        this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
        trace_move_extent_alloc_mem_fail2(c, k);
        return ret;
}

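/*
 * Look up the io options for the inode that owns @extent_k, caching one entry
 * per snapshot so repeated extents from the same inode don't require another
 * inode lookup.
 */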
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
                          struct per_snapshot_io_opts *io_opts,
                          struct bkey_s_c extent_k)
{
        struct bch_fs *c = trans->c;
        u32 restart_count = trans->restart_count;
        int ret = 0;

        if (io_opts->cur_inum != extent_k.k->p.inode) {
                struct btree_iter iter;
                struct bkey_s_c k;

                io_opts->d.nr = 0;

                for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
                                   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                        if (k.k->p.offset != extent_k.k->p.inode)
                                break;

                        if (!bkey_is_inode(k.k))
                                continue;

                        struct bch_inode_unpacked inode;
                        BUG_ON(bch2_inode_unpack(k, &inode));

                        struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
                        bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

                        ret = darray_push(&io_opts->d, e);
                        if (ret)
                                break;
                }
                bch2_trans_iter_exit(trans, &iter);
                io_opts->cur_inum = extent_k.k->p.inode;
        }

        ret = ret ?: trans_was_restarted(trans, restart_count);
        if (ret)
                return ERR_PTR(ret);

        if (extent_k.k->p.snapshot) {
                struct snapshot_io_opts_entry *i;
                darray_for_each(io_opts->d, i)
                        if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
                                return &i->io_opts;
        }

        return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
                              struct bch_io_opts *io_opts,
                              struct bkey_s_c extent_k)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        /* reflink btree? */
        if (!extent_k.k->p.inode) {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
                return 0;
        }

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                               SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
                               BTREE_ITER_CACHED);
        ret = bkey_err(k);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ret;

        if (!ret && bkey_is_inode(k.k)) {
                struct bch_inode_unpacked inode;
                bch2_inode_unpack(k, &inode);
                bch2_inode_opts_get(io_opts, trans->c, &inode);
        } else {
                *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
        }

        bch2_trans_iter_exit(trans, &iter);
        return 0;
}

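/*
 * Throttle the move path: optionally wait for copygc, honour the rate limit,
 * and block until in-flight reads/writes are under the configured limits.
 * Returns nonzero if the calling kthread should stop.
 */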
int bch2_move_ratelimit(struct moving_context *ctxt)
{
        struct bch_fs *c = ctxt->trans->c;
        u64 delay;

        if (ctxt->wait_on_copygc && !c->copygc_running) {
                bch2_trans_unlock_long(ctxt->trans);
                wait_event_killable(c->copygc_running_wq,
                                    !c->copygc_running ||
                                    kthread_should_stop());
        }

        do {
                delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

                if (delay) {
                        if (delay > HZ / 10)
                                bch2_trans_unlock_long(ctxt->trans);
                        else
                                bch2_trans_unlock(ctxt->trans);
                        set_current_state(TASK_INTERRUPTIBLE);
                }

                if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        return 1;
                }

                if (delay)
                        schedule_timeout(delay);

                if (unlikely(freezing(current))) {
                        move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
                        try_to_freeze();
                }
        } while (delay);

        /*
         * XXX: these limits really ought to be per device, SSDs and hard drives
         * will want different limits
         */
        move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
                atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

        return 0;
}

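/*
 * Walk the extents of one btree in [start, end) across all snapshots and
 * move those that @pred selects.
 */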
static int bch2_move_data_btree(struct moving_context *ctxt,
                                struct bpos start,
                                struct bpos end,
                                move_pred_fn pred, void *arg,
                                enum btree_id btree_id)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct per_snapshot_io_opts snapshot_io_opts;
        struct bch_io_opts *io_opts;
        struct bkey_buf sk;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        int ret = 0, ret2;

        per_snapshot_io_opts_init(&snapshot_io_opts, c);
        bch2_bkey_buf_init(&sk);

        if (ctxt->stats) {
                ctxt->stats->data_type  = BCH_DATA_user;
                ctxt->stats->pos        = BBPOS(btree_id, start);
        }

        bch2_trans_iter_init(trans, &iter, btree_id, start,
                             BTREE_ITER_PREFETCH|
                             BTREE_ITER_ALL_SNAPSHOTS);

        if (ctxt->rate)
                bch2_ratelimit_reset(ctxt->rate);

        while (!bch2_move_ratelimit(ctxt)) {
                bch2_trans_begin(trans);

                k = bch2_btree_iter_peek(&iter);
                if (!k.k)
                        break;

                ret = bkey_err(k);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;

                if (bkey_ge(bkey_start_pos(k.k), end))
                        break;

                if (ctxt->stats)
                        ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;

                io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
                ret = PTR_ERR_OR_ZERO(io_opts);
                if (ret)
                        continue;

                memset(&data_opts, 0, sizeof(data_opts));
                if (!pred(c, arg, k, io_opts, &data_opts))
                        goto next;

                /*
                 * The iterator gets unlocked by __bch2_read_extent - need to
                 * save a copy of @k elsewhere:
                 */
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);

                ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
                if (ret2) {
                        if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
                                continue;

                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }

                        /* XXX signal failure */
                        goto next;
                }
next:
                if (ctxt->stats)
                        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
                bch2_btree_iter_advance(&iter);
        }

        bch2_trans_iter_exit(trans, &iter);
        bch2_bkey_buf_exit(&sk, c);
        per_snapshot_io_opts_exit(&snapshot_io_opts);

        return ret;
}

int __bch2_move_data(struct moving_context *ctxt,
                     struct bbpos start,
                     struct bbpos end,
                     move_pred_fn pred, void *arg)
{
        struct bch_fs *c = ctxt->trans->c;
        enum btree_id id;
        int ret = 0;

        for (id = start.btree;
             id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             id++) {
                ctxt->stats->pos = BBPOS(id, POS_MIN);

                if (!btree_type_has_ptrs(id) ||
                    !bch2_btree_id_root(c, id)->b)
                        continue;

                ret = bch2_move_data_btree(ctxt,
                                       id == start.btree ? start.pos : POS_MIN,
                                       id == end.btree   ? end.pos   : POS_MAX,
                                       pred, arg, id);
                if (ret)
                        break;
        }

        return ret;
}

int bch2_move_data(struct bch_fs *c,
                   struct bbpos start,
                   struct bbpos end,
                   struct bch_ratelimit *rate,
                   struct bch_move_stats *stats,
                   struct write_point_specifier wp,
                   bool wait_on_copygc,
                   move_pred_fn pred, void *arg)
{
        struct moving_context ctxt;
        int ret;

        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
        ret = __bch2_move_data(&ctxt, start, end, pred, arg);
        bch2_moving_ctxt_exit(&ctxt);

        return ret;
}

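/*
 * Evacuate a single bucket: walk its backpointers and either move the extents
 * or rewrite the btree nodes that still reference it.
 */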
int __bch2_evacuate_bucket(struct moving_context *ctxt,
                           struct move_bucket_in_flight *bucket_in_flight,
                           struct bpos bucket, int gen,
                           struct data_update_opts _data_opts)
{
        struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_iter iter;
        struct bkey_buf sk;
        struct bch_backpointer bp;
        struct bch_alloc_v4 a_convert;
        const struct bch_alloc_v4 *a;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
        u64 fragmentation;
        struct bpos bp_pos = POS_MIN;
        int ret = 0;

        trace_bucket_evacuate(c, &bucket);

        bch2_bkey_buf_init(&sk);

        /*
         * We're not run in a context that handles transaction restarts:
         */
        bch2_trans_begin(trans);

        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             bucket, BTREE_ITER_CACHED);
        ret = lockrestart_do(trans,
                        bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
        bch2_trans_iter_exit(trans, &iter);

        if (ret) {
                bch_err_msg(c, ret, "looking up alloc key");
                goto err;
        }

        a = bch2_alloc_to_v4(k, &a_convert);
        dirty_sectors = a->dirty_sectors;
        bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
        fragmentation = a->fragmentation_lru;

        ret = bch2_btree_write_buffer_flush(trans);
        if (ret) {
                bch_err_msg(c, ret, "flushing btree write buffer");
                goto err;
        }

        while (!(ret = bch2_move_ratelimit(ctxt))) {
                bch2_trans_begin(trans);

                ret = bch2_get_next_backpointer(trans, bucket, gen,
                                                &bp_pos, &bp,
                                                BTREE_ITER_CACHED);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        goto err;
                if (bkey_eq(bp_pos, POS_MAX))
                        break;

                if (!bp.level) {
                        const struct bch_extent_ptr *ptr;
                        unsigned i = 0;

                        k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
                        ret = bkey_err(k);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!k.k)
                                goto next;

                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);

                        ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
                        if (ret) {
                                bch2_trans_iter_exit(trans, &iter);
                                continue;
                        }

                        data_opts = _data_opts;
                        data_opts.target        = io_opts.background_target;
                        data_opts.rewrite_ptrs = 0;

                        bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
                                if (ptr->dev == bucket.inode) {
                                        data_opts.rewrite_ptrs |= 1U << i;
                                        if (ptr->cached) {
                                                bch2_trans_iter_exit(trans, &iter);
                                                goto next;
                                        }
                                }
                                i++;
                        }

                        ret = bch2_move_extent(ctxt, bucket_in_flight,
                                               &iter, k, io_opts, data_opts);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
                        if (ret)
                                goto err;

                        if (ctxt->stats)
                                atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
                } else {
                        struct btree *b;

                        b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
                        ret = PTR_ERR_OR_ZERO(b);
                        if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
                                continue;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;
                        if (!b)
                                goto next;

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
                        bch2_trans_iter_exit(trans, &iter);

                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                goto err;

                        if (ctxt->rate)
                                bch2_ratelimit_increment(ctxt->rate,
                                                         c->opts.btree_node_size >> 9);
                        if (ctxt->stats) {
                                atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
                                atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
                        }
                }
next:
                bp_pos = bpos_nosnap_successor(bp_pos);
        }

        trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
        bch2_bkey_buf_exit(&sk, c);
        return ret;
}

int bch2_evacuate_bucket(struct bch_fs *c,
                         struct bpos bucket, int gen,
                         struct data_update_opts data_opts,
                         struct bch_ratelimit *rate,
                         struct bch_move_stats *stats,
                         struct write_point_specifier wp,
                         bool wait_on_copygc)
{
        struct moving_context ctxt;
        int ret;

        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
        ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
        bch2_moving_ctxt_exit(&ctxt);

        return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
                                struct btree *, struct bch_io_opts *,
                                struct data_update_opts *);

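/*
 * Walk btree nodes in the given range and rewrite those that @pred selects;
 * used by the rereplicate, migrate and rewrite_old_nodes jobs for metadata.
 */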
static int bch2_move_btree(struct bch_fs *c,
                           enum btree_id start_btree_id, struct bpos start_pos,
                           enum btree_id end_btree_id,   struct bpos end_pos,
                           move_btree_pred pred, void *arg,
                           struct bch_move_stats *stats)
{
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct moving_context ctxt;
        struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        enum btree_id id;
        struct data_update_opts data_opts;
        int ret = 0;

        bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
                              writepoint_ptr(&c->btree_write_point),
                              true);
        trans = ctxt.trans;

        stats->data_type = BCH_DATA_btree;

        for (id = start_btree_id;
             id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
             id++) {
                stats->pos = BBPOS(id, POS_MIN);

                if (!bch2_btree_id_root(c, id)->b)
                        continue;

                bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
                                          BTREE_ITER_PREFETCH);
retry:
                ret = 0;
                while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (kthread && kthread_should_stop())
                                break;

                        if ((cmp_int(id, end_btree_id) ?:
                             bpos_cmp(b->key.k.p, end_pos)) > 0)
                                break;

                        stats->pos = BBPOS(iter.btree_id, iter.pos);

                        if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;

                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
                                break;
next:
                        bch2_btree_iter_next_node(&iter);
                }
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;

                bch2_trans_iter_exit(trans, &iter);

                if (kthread && kthread_should_stop())
                        break;
        }

        bch_err_fn(c, ret);
        bch2_moving_ctxt_exit(&ctxt);
        bch2_btree_interior_updates_flush(c);

        return ret;
}

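/*
 * Predicate for rereplicate: select keys whose durability is below the
 * configured number of replicas, requesting the missing copies.
 */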
static bool rereplicate_pred(struct bch_fs *c, void *arg,
                             struct bkey_s_c k,
                             struct bch_io_opts *io_opts,
                             struct data_update_opts *data_opts)
{
        unsigned nr_good = bch2_bkey_durability(c, k);
        unsigned replicas = bkey_is_btree_ptr(k.k)
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;

        if (!nr_good || nr_good >= replicas)
                return false;

        data_opts->target               = 0;
        data_opts->extra_replicas       = replicas - nr_good;
        data_opts->btree_insert_flags   = 0;
        return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
                         struct bkey_s_c k,
                         struct bch_io_opts *io_opts,
                         struct data_update_opts *data_opts)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
        struct bch_ioctl_data *op = arg;
        unsigned i = 0;

        data_opts->rewrite_ptrs         = 0;
        data_opts->target               = 0;
        data_opts->extra_replicas       = 0;
        data_opts->btree_insert_flags   = 0;

        bkey_for_each_ptr(ptrs, ptr) {
                if (ptr->dev == op->migrate.dev)
                        data_opts->rewrite_ptrs |= 1U << i;
                i++;
        }

        return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
                               struct btree *b,
                               struct bch_io_opts *io_opts,
                               struct data_update_opts *data_opts)
{
        return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

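/*
 * Detect btree node key formats that could produce packed keys which wrap
 * around when unpacked; such nodes must be rewritten with the current format.
 */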
static bool bformat_needs_redo(struct bkey_format *f)
{
        unsigned i;

        for (i = 0; i < f->nr_fields; i++) {
                unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
                u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
                u64 field_offset = le64_to_cpu(f->field_offset[i]);

                if (f->bits_per_field[i] > unpacked_bits)
                        return true;

                if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
                        return true;

                if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
                     unpacked_mask) <
                    field_offset)
                        return true;
        }

        return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
                                   struct btree *b,
                                   struct bch_io_opts *io_opts,
                                   struct data_update_opts *data_opts)
{
        if (b->version_ondisk != c->sb.version ||
            btree_node_need_rewrite(b) ||
            bformat_needs_redo(&b->format)) {
                data_opts->target               = 0;
                data_opts->extra_replicas       = 0;
                data_opts->btree_insert_flags   = 0;
                return true;
        }

        return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
        int ret;

        ret = bch2_move_btree(c,
                              0,                POS_MIN,
                              BTREE_ID_NR,      SPOS_MAX,
                              rewrite_old_nodes_pred, c, stats);
        if (!ret) {
                mutex_lock(&c->sb_lock);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
                c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
                c->disk_sb.sb->version_min = c->disk_sb.sb->version;
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
        }

        bch_err_fn(c, ret);
        return ret;
}

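/*
 * Entry point for data job requests (struct bch_ioctl_data): dispatches the
 * rereplicate, migrate and rewrite_old_nodes operations over both btree
 * metadata and user data.
 */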
int bch2_data_job(struct bch_fs *c,
                  struct bch_move_stats *stats,
                  struct bch_ioctl_data op)
{
        int ret = 0;

        switch (op.op) {
        case BCH_DATA_OP_REREPLICATE:
                bch2_move_stats_init(stats, "rereplicate");
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);

                ret = bch2_move_btree(c,
                                      op.start_btree,   op.start_pos,
                                      op.end_btree,     op.end_pos,
                                      rereplicate_btree_pred, c, stats) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;

                ret = bch2_move_data(c,
                                     (struct bbpos) { op.start_btree,   op.start_pos },
                                     (struct bbpos) { op.end_btree,     op.end_pos },
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;

                bch2_move_stats_exit(stats, c);
                break;
        case BCH_DATA_OP_MIGRATE:
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;

                bch2_move_stats_init(stats, "migrate");
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);

                ret = bch2_move_btree(c,
                                      op.start_btree,   op.start_pos,
                                      op.end_btree,     op.end_pos,
                                      migrate_btree_pred, &op, stats) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;

                ret = bch2_move_data(c,
                                     (struct bbpos) { op.start_btree,   op.start_pos },
                                     (struct bbpos) { op.end_btree,     op.end_pos },
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;

                bch2_move_stats_exit(stats, c);
                break;
        case BCH_DATA_OP_REWRITE_OLD_NODES:
                bch2_move_stats_init(stats, "rewrite_old_nodes");
                ret = bch2_scan_old_btree_nodes(c, stats);
                bch2_move_stats_exit(stats, c);
                break;
        default:
                ret = -EINVAL;
        }

        return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
        prt_printf(out, "%s: data type=%s pos=",
                   stats->name,
                   bch2_data_types[stats->data_type]);
        bch2_bbpos_to_text(out, stats->pos);
        prt_newline(out);
        printbuf_indent_add(out, 2);

        prt_str(out, "keys moved:  ");
        prt_u64(out, atomic64_read(&stats->keys_moved));
        prt_newline(out);

        prt_str(out, "keys raced:  ");
        prt_u64(out, atomic64_read(&stats->keys_raced));
        prt_newline(out);

        prt_str(out, "bytes seen:  ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
        prt_newline(out);

        prt_str(out, "bytes moved: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
        prt_newline(out);

        prt_str(out, "bytes raced: ");
        prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
        prt_newline(out);

        printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
        struct moving_io *io;

        bch2_move_stats_to_text(out, ctxt->stats);
        printbuf_indent_add(out, 2);

        prt_printf(out, "reads: ios %u/%u sectors %u/%u",
                   atomic_read(&ctxt->read_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->read_sectors),
                   c->opts.move_bytes_in_flight >> 9);
        prt_newline(out);

        prt_printf(out, "writes: ios %u/%u sectors %u/%u",
                   atomic_read(&ctxt->write_ios),
                   c->opts.move_ios_in_flight,
                   atomic_read(&ctxt->write_sectors),
                   c->opts.move_bytes_in_flight >> 9);
        prt_newline(out);

        printbuf_indent_add(out, 2);

        mutex_lock(&ctxt->lock);
        list_for_each_entry(io, &ctxt->ios, io_list)
                bch2_write_op_to_text(out, &io->write.op);
        mutex_unlock(&ctxt->lock);

        printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct moving_context *ctxt;

        mutex_lock(&c->moving_context_lock);
        list_for_each_entry(ctxt, &c->moving_context_list, list)
                bch2_moving_ctxt_to_text(out, c, ctxt);
        mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
        INIT_LIST_HEAD(&c->moving_context_list);
        mutex_init(&c->moving_context_lock);
}