1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "alloc_foreground.h"
6 #include "backpointers.h"
7 #include "bkey_buf.h"
8 #include "btree_gc.h"
9 #include "btree_update.h"
10 #include "btree_update_interior.h"
11 #include "btree_write_buffer.h"
12 #include "disk_groups.h"
13 #include "ec.h"
14 #include "errcode.h"
15 #include "error.h"
16 #include "inode.h"
17 #include "io_read.h"
18 #include "io_write.h"
19 #include "journal_reclaim.h"
20 #include "keylist.h"
21 #include "move.h"
22 #include "replicas.h"
23 #include "snapshot.h"
24 #include "super-io.h"
25 #include "trace.h"
26
27 #include <linux/ioprio.h>
28 #include <linux/kthread.h>
29
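/*
 * Tracepoint helpers: only go to the trouble of rendering the bkey to text
 * when the corresponding tracepoint is actually enabled.
 */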
30 static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
31 {
32         if (trace_move_extent_enabled()) {
33                 struct printbuf buf = PRINTBUF;
34
35                 bch2_bkey_val_to_text(&buf, c, k);
36                 trace_move_extent(c, buf.buf);
37                 printbuf_exit(&buf);
38         }
39 }
40
41 static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
42 {
43         if (trace_move_extent_read_enabled()) {
44                 struct printbuf buf = PRINTBUF;
45
46                 bch2_bkey_val_to_text(&buf, c, k);
47                 trace_move_extent_read(c, buf.buf);
48                 printbuf_exit(&buf);
49         }
50 }
51
52 static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
53 {
54         if (trace_move_extent_alloc_mem_fail_enabled()) {
55                 struct printbuf buf = PRINTBUF;
56
57                 bch2_bkey_val_to_text(&buf, c, k);
58                 trace_move_extent_alloc_mem_fail(c, buf.buf);
59                 printbuf_exit(&buf);
60         }
61 }
62
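/*
 * Tracks one extent move in flight: the bounce read (rbio) and the
 * data_update that writes the data back out.  Lives on ctxt->reads until the
 * read completes and on ctxt->ios until it is freed; bi_inline_vecs backs
 * both the read and write bios.
 */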
63 struct moving_io {
64         struct list_head                read_list;
65         struct list_head                io_list;
66         struct move_bucket_in_flight    *b;
67         struct closure                  cl;
68         bool                            read_completed;
69
70         unsigned                        read_sectors;
71         unsigned                        write_sectors;
72
73         struct bch_read_bio             rbio;
74
75         struct data_update              write;
76         /* Must be last since it is variable size */
77         struct bio_vec                  bi_inline_vecs[];
78 };
79
80 static void move_free(struct moving_io *io)
81 {
82         struct moving_context *ctxt = io->write.ctxt;
83
84         if (io->b)
85                 atomic_dec(&io->b->count);
86
87         bch2_data_update_exit(&io->write);
88
89         mutex_lock(&ctxt->lock);
90         list_del(&io->io_list);
91         wake_up(&ctxt->wait);
92         mutex_unlock(&ctxt->lock);
93
94         kfree(io);
95 }
96
97 static void move_write_done(struct bch_write_op *op)
98 {
99         struct moving_io *io = container_of(op, struct moving_io, write.op);
100         struct moving_context *ctxt = io->write.ctxt;
101
102         if (io->write.op.error)
103                 ctxt->write_error = true;
104
105         atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
106         atomic_dec(&io->write.ctxt->write_ios);
107         move_free(io);
108         closure_put(&ctxt->cl);
109 }
110
111 static void move_write(struct moving_io *io)
112 {
113         if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
114                 move_free(io);
115                 return;
116         }
117
118         closure_get(&io->write.ctxt->cl);
119         atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
120         atomic_inc(&io->write.ctxt->write_ios);
121
122         bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
123 }
124
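/*
 * Return the oldest read on ctxt->reads, but only once its data has actually
 * arrived - reads are queued with list_add_tail(), so writes are issued in
 * the order the reads were submitted.
 */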
125 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
126 {
127         struct moving_io *io =
128                 list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
129
130         return io && io->read_completed ? io : NULL;
131 }
132
133 static void move_read_endio(struct bio *bio)
134 {
135         struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
136         struct moving_context *ctxt = io->write.ctxt;
137
138         atomic_sub(io->read_sectors, &ctxt->read_sectors);
139         atomic_dec(&ctxt->read_ios);
140         io->read_completed = true;
141
142         wake_up(&ctxt->wait);
143         closure_put(&ctxt->cl);
144 }
145
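/*
 * Issue the write side of every read that has completed; btree locks are
 * dropped first, as issuing the writes can block.
 */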
146 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
147 {
148         struct moving_io *io;
149
150         bch2_trans_unlock(ctxt->trans);
151
152         while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
153                 list_del(&io->read_list);
154                 move_write(io);
155         }
156 }
157
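/*
 * Wait until the count of in-flight write sectors either drops to zero or
 * changes from the value sampled at entry - i.e. until at least one write
 * completes.  Used on the -ENOMEM paths to wait for memory to be freed up.
 */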
158 void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
159 {
160         unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
161
162         move_ctxt_wait_event(ctxt,
163                 !atomic_read(&ctxt->write_sectors) ||
164                 atomic_read(&ctxt->write_sectors) != sectors_pending);
165 }
166
167 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
168 {
169         struct bch_fs *c = ctxt->trans->c;
170
171         move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
172         closure_sync(&ctxt->cl);
173
174         EBUG_ON(atomic_read(&ctxt->write_sectors));
175         EBUG_ON(atomic_read(&ctxt->write_ios));
176         EBUG_ON(atomic_read(&ctxt->read_sectors));
177         EBUG_ON(atomic_read(&ctxt->read_ios));
178
179         mutex_lock(&c->moving_context_lock);
180         list_del(&ctxt->list);
181         mutex_unlock(&c->moving_context_lock);
182
183         bch2_trans_put(ctxt->trans);
184         memset(ctxt, 0, sizeof(*ctxt));
185 }
186
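/*
 * Set up a moving_context: grabs a btree_trans, records the caller for
 * debugging, and registers the context on c->moving_context_list so it shows
 * up in bch2_fs_moving_ctxts_to_text().  The usual pattern (see
 * bch2_move_data() below) is roughly:
 *
 *	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
 *	ret = __bch2_move_data(&ctxt, start, end, pred, arg);
 *	bch2_moving_ctxt_exit(&ctxt);
 */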
187 void bch2_moving_ctxt_init(struct moving_context *ctxt,
188                            struct bch_fs *c,
189                            struct bch_ratelimit *rate,
190                            struct bch_move_stats *stats,
191                            struct write_point_specifier wp,
192                            bool wait_on_copygc)
193 {
194         memset(ctxt, 0, sizeof(*ctxt));
195
196         ctxt->trans     = bch2_trans_get(c);
197         ctxt->fn        = (void *) _RET_IP_;
198         ctxt->rate      = rate;
199         ctxt->stats     = stats;
200         ctxt->wp        = wp;
201         ctxt->wait_on_copygc = wait_on_copygc;
202
203         closure_init_stack(&ctxt->cl);
204
205         mutex_init(&ctxt->lock);
206         INIT_LIST_HEAD(&ctxt->reads);
207         INIT_LIST_HEAD(&ctxt->ios);
208         init_waitqueue_head(&ctxt->wait);
209
210         mutex_lock(&c->moving_context_lock);
211         list_add(&ctxt->list, &c->moving_context_list);
212         mutex_unlock(&c->moving_context_lock);
213 }
214
215 void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
216 {
217         trace_move_data(c, stats);
218 }
219
220 void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
221 {
222         memset(stats, 0, sizeof(*stats));
223         stats->data_type = BCH_DATA_user;
224         scnprintf(stats->name, sizeof(stats->name), "%s", name);
225 }
226
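/*
 * Drop the pointers selected by the data_opts.kill_ptrs bitmask from @k and
 * commit the stripped-down key through the normal (non-extent) update path.
 */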
227 static int bch2_extent_drop_ptrs(struct btree_trans *trans,
228                                  struct btree_iter *iter,
229                                  struct bkey_s_c k,
230                                  struct data_update_opts data_opts)
231 {
232         struct bch_fs *c = trans->c;
233         struct bkey_i *n;
234         int ret;
235
236         n = bch2_bkey_make_mut_noupdate(trans, k);
237         ret = PTR_ERR_OR_ZERO(n);
238         if (ret)
239                 return ret;
240
241         while (data_opts.kill_ptrs) {
242                 unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
243                 struct bch_extent_ptr *ptr;
244
245                 bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
246                 data_opts.kill_ptrs ^= 1U << drop;
247         }
248
249         /*
250          * If the new extent no longer has any pointers, bch2_extent_normalize()
251          * will do the appropriate thing with it (turning it into a
252          * KEY_TYPE_error key, or just a discard if it was a cached extent)
253          */
254         bch2_extent_normalize(c, bkey_i_to_s(n));
255
256         /*
257          * Since we're not inserting through an extent iterator
258          * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
259          * we aren't using the extent overwrite path to delete; we're
260          * just using the normal key deletion path:
261          */
262         if (bkey_deleted(&n->k))
263                 n->k.size = 0;
264
265         return bch2_trans_relock(trans) ?:
266                 bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
267                 bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
268 }
269
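/*
 * Start moving a single extent: allocate a moving_io big enough for the
 * (possibly decompressed) data, set up the bounce read and the data update,
 * and submit the read.  The write is kicked off from move_write() once the
 * read completes, so a successful return just means the IO is in flight.
 * Extents that only need pointers dropped are handled synchronously via
 * bch2_extent_drop_ptrs(), and unwritten extents via
 * bch2_update_unwritten_extent().
 */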
270 int bch2_move_extent(struct moving_context *ctxt,
271                      struct move_bucket_in_flight *bucket_in_flight,
272                      struct btree_iter *iter,
273                      struct bkey_s_c k,
274                      struct bch_io_opts io_opts,
275                      struct data_update_opts data_opts)
276 {
277         struct btree_trans *trans = ctxt->trans;
278         struct bch_fs *c = trans->c;
279         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
280         struct moving_io *io;
281         const union bch_extent_entry *entry;
282         struct extent_ptr_decoded p;
283         unsigned sectors = k.k->size, pages;
284         int ret = -ENOMEM;
285
286         if (ctxt->stats)
287                 ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
288         trace_move_extent2(c, k);
289
290         bch2_data_update_opts_normalize(k, &data_opts);
291
292         if (!data_opts.rewrite_ptrs &&
293             !data_opts.extra_replicas) {
294                 if (data_opts.kill_ptrs)
295                         return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
296                 return 0;
297         }
298
299         /*
300          * Unlock before the memory allocations & nocow locks taken in
301          * bch2_data_update_init():
302          */
303         bch2_trans_unlock(trans);
304
305         /* write path might have to decompress data: */
306         bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
307                 sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
308
309         pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
310         io = kzalloc(sizeof(struct moving_io) +
311                      sizeof(struct bio_vec) * pages, GFP_KERNEL);
312         if (!io)
313                 goto err;
314
315         INIT_LIST_HEAD(&io->io_list);
316         io->write.ctxt          = ctxt;
317         io->read_sectors        = k.k->size;
318         io->write_sectors       = k.k->size;
319
320         bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
321         bio_set_prio(&io->write.op.wbio.bio,
322                      IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
323
324         if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
325                                  GFP_KERNEL))
326                 goto err_free;
327
328         io->rbio.c              = c;
329         io->rbio.opts           = io_opts;
330         bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
331         io->rbio.bio.bi_vcnt = pages;
332         bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
333         io->rbio.bio.bi_iter.bi_size = sectors << 9;
334
335         io->rbio.bio.bi_opf             = REQ_OP_READ;
336         io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
337         io->rbio.bio.bi_end_io          = move_read_endio;
338
339         ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
340                                     io_opts, data_opts, iter->btree_id, k);
341         if (ret && ret != -BCH_ERR_unwritten_extent_update)
342                 goto err_free_pages;
343
344         if (ret == -BCH_ERR_unwritten_extent_update) {
345                 bch2_update_unwritten_extent(trans, &io->write);
346                 move_free(io);
347                 return 0;
348         }
349
350         BUG_ON(ret);
351
352         io->write.op.end_io = move_write_done;
353
354         if (ctxt->rate)
355                 bch2_ratelimit_increment(ctxt->rate, k.k->size);
356
357         if (ctxt->stats) {
358                 atomic64_inc(&ctxt->stats->keys_moved);
359                 atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
360         }
361
362         if (bucket_in_flight) {
363                 io->b = bucket_in_flight;
364                 atomic_inc(&io->b->count);
365         }
366
367         this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
368         this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
369         trace_move_extent_read2(c, k);
370
371         mutex_lock(&ctxt->lock);
372         atomic_add(io->read_sectors, &ctxt->read_sectors);
373         atomic_inc(&ctxt->read_ios);
374
375         list_add_tail(&io->read_list, &ctxt->reads);
376         list_add_tail(&io->io_list, &ctxt->ios);
377         mutex_unlock(&ctxt->lock);
378
379         /*
380          * This ref is dropped by move_read_endio() - it guards against
381          * use-after-free of ctxt when doing the wakeup
382          */
383         closure_get(&ctxt->cl);
384         bch2_read_extent(trans, &io->rbio,
385                          bkey_start_pos(k.k),
386                          iter->btree_id, k, 0,
387                          BCH_READ_NODECODE|
388                          BCH_READ_LAST_FRAGMENT);
389         return 0;
390 err_free_pages:
391         bio_free_pages(&io->write.op.wbio.bio);
392 err_free:
393         kfree(io);
394 err:
395         this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
396         trace_move_extent_alloc_mem_fail2(c, k);
397         return ret;
398 }
399
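/*
 * Look up the IO options governing @extent_k.  Per-inode options are cached
 * in @io_opts, keyed by inode number with one entry per snapshot version of
 * the inode; the first entry whose snapshot is an ancestor of the extent's
 * snapshot is used, falling back to the filesystem-wide options.
 */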
400 struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
401                           struct per_snapshot_io_opts *io_opts,
402                           struct bkey_s_c extent_k)
403 {
404         struct bch_fs *c = trans->c;
405         u32 restart_count = trans->restart_count;
406         int ret = 0;
407
408         if (io_opts->cur_inum != extent_k.k->p.inode) {
409                 struct btree_iter iter;
410                 struct bkey_s_c k;
411
412                 io_opts->d.nr = 0;
413
414                 for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
415                                    BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
416                         if (k.k->p.offset != extent_k.k->p.inode)
417                                 break;
418
419                         if (!bkey_is_inode(k.k))
420                                 continue;
421
422                         struct bch_inode_unpacked inode;
423                         BUG_ON(bch2_inode_unpack(k, &inode));
424
425                         struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
426                         bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
427
428                         ret = darray_push(&io_opts->d, e);
429                         if (ret)
430                                 break;
431                 }
432                 bch2_trans_iter_exit(trans, &iter);
433                 io_opts->cur_inum = extent_k.k->p.inode;
434         }
435
436         ret = ret ?: trans_was_restarted(trans, restart_count);
437         if (ret)
438                 return ERR_PTR(ret);
439
440         if (extent_k.k->p.snapshot) {
441                 struct snapshot_io_opts_entry *i;
442                 darray_for_each(io_opts->d, i)
443                         if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
444                                 return &i->io_opts;
445         }
446
447         return &io_opts->fs_io_opts;
448 }
449
450 int bch2_move_get_io_opts_one(struct btree_trans *trans,
451                               struct bch_io_opts *io_opts,
452                               struct bkey_s_c extent_k)
453 {
454         struct btree_iter iter;
455         struct bkey_s_c k;
456         int ret;
457
458         /* reflink btree? */
459         if (!extent_k.k->p.inode) {
460                 *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
461                 return 0;
462         }
463
464         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
465                                SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
466                                BTREE_ITER_CACHED);
467         ret = bkey_err(k);
468         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
469                 return ret;
470
471         if (!ret && bkey_is_inode(k.k)) {
472                 struct bch_inode_unpacked inode;
473                 bch2_inode_unpack(k, &inode);
474                 bch2_inode_opts_get(io_opts, trans->c, &inode);
475         } else {
476                 *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
477         }
478
479         bch2_trans_iter_exit(trans, &iter);
480         return 0;
481 }
482
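/*
 * Throttle the move path: optionally wait for copygc to finish, honour the
 * rate limit if one was given, and block until in-flight reads and writes are
 * below the move_bytes_in_flight/move_ios_in_flight limits.  Returns nonzero
 * if the calling kthread should stop.
 */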
483 int bch2_move_ratelimit(struct moving_context *ctxt)
484 {
485         struct bch_fs *c = ctxt->trans->c;
486         u64 delay;
487
488         if (ctxt->wait_on_copygc) {
489                 bch2_trans_unlock(ctxt->trans);
490                 wait_event_killable(c->copygc_running_wq,
491                                     !c->copygc_running ||
492                                     kthread_should_stop());
493         }
494
495         do {
496                 delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
497
498                 if (delay) {
499                         bch2_trans_unlock(ctxt->trans);
500                         set_current_state(TASK_INTERRUPTIBLE);
501                 }
502
503                 if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
504                         __set_current_state(TASK_RUNNING);
505                         return 1;
506                 }
507
508                 if (delay)
509                         schedule_timeout(delay);
510
511                 if (unlikely(freezing(current))) {
512                         move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
513                         try_to_freeze();
514                 }
515         } while (delay);
516
517         /*
518          * XXX: these limits really ought to be per device; SSDs and hard drives
519          * will want different limits
520          */
521         move_ctxt_wait_event(ctxt,
522                 atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
523                 atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
524                 atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
525                 atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
526
527         return 0;
528 }
529
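/*
 * Core data-move loop for one btree: iterate extents in [start, end) across
 * all snapshots, look up the IO options for each, and call bch2_move_extent()
 * on the keys @pred accepts, honouring bch2_move_ratelimit() throughout.
 */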
530 static int bch2_move_data_btree(struct moving_context *ctxt,
531                                 struct bpos start,
532                                 struct bpos end,
533                                 move_pred_fn pred, void *arg,
534                                 enum btree_id btree_id)
535 {
536         struct btree_trans *trans = ctxt->trans;
537         struct bch_fs *c = trans->c;
538         struct per_snapshot_io_opts snapshot_io_opts;
539         struct bch_io_opts *io_opts;
540         struct bkey_buf sk;
541         struct btree_iter iter;
542         struct bkey_s_c k;
543         struct data_update_opts data_opts;
544         int ret = 0, ret2;
545
546         per_snapshot_io_opts_init(&snapshot_io_opts, c);
547         bch2_bkey_buf_init(&sk);
548
549         if (ctxt->stats) {
550                 ctxt->stats->data_type  = BCH_DATA_user;
551                 ctxt->stats->pos        = BBPOS(btree_id, start);
552         }
553
554         bch2_trans_iter_init(trans, &iter, btree_id, start,
555                              BTREE_ITER_PREFETCH|
556                              BTREE_ITER_ALL_SNAPSHOTS);
557
558         if (ctxt->rate)
559                 bch2_ratelimit_reset(ctxt->rate);
560
561         while (!bch2_move_ratelimit(ctxt)) {
562                 bch2_trans_begin(trans);
563
564                 k = bch2_btree_iter_peek(&iter);
565                 if (!k.k)
566                         break;
567
568                 ret = bkey_err(k);
569                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
570                         continue;
571                 if (ret)
572                         break;
573
574                 if (bkey_ge(bkey_start_pos(k.k), end))
575                         break;
576
577                 if (ctxt->stats)
578                         ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
579
580                 if (!bkey_extent_is_direct_data(k.k))
581                         goto next_nondata;
582
583                 io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
584                 ret = PTR_ERR_OR_ZERO(io_opts);
585                 if (ret)
586                         continue;
587
588                 memset(&data_opts, 0, sizeof(data_opts));
589                 if (!pred(c, arg, k, io_opts, &data_opts))
590                         goto next;
591
592                 /*
593                  * The iterator gets unlocked by __bch2_read_extent, so we
594                  * need to save a copy of @k elsewhere:
595                  */
596                 bch2_bkey_buf_reassemble(&sk, c, k);
597                 k = bkey_i_to_s_c(sk.k);
598
599                 ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
600                 if (ret2) {
601                         if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
602                                 continue;
603
604                         if (ret2 == -ENOMEM) {
605                                 /* memory allocation failure, wait for some IO to finish */
606                                 bch2_move_ctxt_wait_for_io(ctxt);
607                                 continue;
608                         }
609
610                         /* XXX signal failure */
611                         goto next;
612                 }
613 next:
614                 if (ctxt->stats)
615                         atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
616 next_nondata:
617                 bch2_btree_iter_advance(&iter);
618         }
619
620         bch2_trans_iter_exit(trans, &iter);
621         bch2_bkey_buf_exit(&sk, c);
622         per_snapshot_io_opts_exit(&snapshot_io_opts);
623
624         return ret;
625 }
626
627 int __bch2_move_data(struct moving_context *ctxt,
628                      struct bbpos start,
629                      struct bbpos end,
630                      move_pred_fn pred, void *arg)
631 {
632         struct bch_fs *c = ctxt->trans->c;
633         enum btree_id id;
634         int ret = 0;
635
636         for (id = start.btree;
637              id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
638              id++) {
639                 ctxt->stats->pos = BBPOS(id, POS_MIN);
640
641                 if (!btree_type_has_ptrs(id) ||
642                     !bch2_btree_id_root(c, id)->b)
643                         continue;
644
645                 ret = bch2_move_data_btree(ctxt,
646                                        id == start.btree ? start.pos : POS_MIN,
647                                        id == end.btree   ? end.pos   : POS_MAX,
648                                        pred, arg, id);
649                 if (ret)
650                         break;
651         }
652
653         return ret;
654 }
655
656 int bch2_move_data(struct bch_fs *c,
657                    struct bbpos start,
658                    struct bbpos end,
659                    struct bch_ratelimit *rate,
660                    struct bch_move_stats *stats,
661                    struct write_point_specifier wp,
662                    bool wait_on_copygc,
663                    move_pred_fn pred, void *arg)
664 {
665
666         struct moving_context ctxt;
667         int ret;
668
669         bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
670         ret = __bch2_move_data(&ctxt, start, end, pred, arg);
671         bch2_moving_ctxt_exit(&ctxt);
672
673         return ret;
674 }
675
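/*
 * Move all live data out of @bucket: flush the btree write buffer so the
 * backpointers are current, then walk the bucket's backpointers, moving data
 * extents with bch2_move_extent() and rewriting btree nodes with
 * bch2_btree_node_rewrite().
 */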
676 int __bch2_evacuate_bucket(struct moving_context *ctxt,
677                            struct move_bucket_in_flight *bucket_in_flight,
678                            struct bpos bucket, int gen,
679                            struct data_update_opts _data_opts)
680 {
681         struct btree_trans *trans = ctxt->trans;
682         struct bch_fs *c = trans->c;
683         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
684         struct btree_iter iter;
685         struct bkey_buf sk;
686         struct bch_backpointer bp;
687         struct bch_alloc_v4 a_convert;
688         const struct bch_alloc_v4 *a;
689         struct bkey_s_c k;
690         struct data_update_opts data_opts;
691         unsigned dirty_sectors, bucket_size;
692         u64 fragmentation;
693         struct bpos bp_pos = POS_MIN;
694         int ret = 0;
695
696         trace_bucket_evacuate(c, &bucket);
697
698         bch2_bkey_buf_init(&sk);
699
700         /*
701          * We're not being run in a context that handles transaction restarts:
702          */
703         bch2_trans_begin(trans);
704
705         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
706                              bucket, BTREE_ITER_CACHED);
707         ret = lockrestart_do(trans,
708                         bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
709         bch2_trans_iter_exit(trans, &iter);
710
711         if (ret) {
712                 bch_err_msg(c, ret, "looking up alloc key");
713                 goto err;
714         }
715
716         a = bch2_alloc_to_v4(k, &a_convert);
717         dirty_sectors = a->dirty_sectors;
718         bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
719         fragmentation = a->fragmentation_lru;
720
721         ret = bch2_btree_write_buffer_flush(trans);
722         if (ret) {
723                 bch_err_msg(c, ret, "flushing btree write buffer");
724                 goto err;
725         }
726
727         while (!(ret = bch2_move_ratelimit(ctxt))) {
728                 bch2_trans_begin(trans);
729
730                 ret = bch2_get_next_backpointer(trans, bucket, gen,
731                                                 &bp_pos, &bp,
732                                                 BTREE_ITER_CACHED);
733                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
734                         continue;
735                 if (ret)
736                         goto err;
737                 if (bkey_eq(bp_pos, POS_MAX))
738                         break;
739
740                 if (!bp.level) {
741                         const struct bch_extent_ptr *ptr;
742                         unsigned i = 0;
743
744                         k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
745                         ret = bkey_err(k);
746                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
747                                 continue;
748                         if (ret)
749                                 goto err;
750                         if (!k.k)
751                                 goto next;
752
753                         bch2_bkey_buf_reassemble(&sk, c, k);
754                         k = bkey_i_to_s_c(sk.k);
755
756                         ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
757                         if (ret) {
758                                 bch2_trans_iter_exit(trans, &iter);
759                                 continue;
760                         }
761
762                         data_opts = _data_opts;
763                         data_opts.target        = io_opts.background_target;
764                         data_opts.rewrite_ptrs = 0;
765
766                         bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
767                                 if (ptr->dev == bucket.inode) {
768                                         data_opts.rewrite_ptrs |= 1U << i;
769                                         if (ptr->cached) {
770                                                 bch2_trans_iter_exit(trans, &iter);
771                                                 goto next;
772                                         }
773                                 }
774                                 i++;
775                         }
776
777                         ret = bch2_move_extent(ctxt, bucket_in_flight,
778                                                &iter, k, io_opts, data_opts);
779                         bch2_trans_iter_exit(trans, &iter);
780
781                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
782                                 continue;
783                         if (ret == -ENOMEM) {
784                                 /* memory allocation failure, wait for some IO to finish */
785                                 bch2_move_ctxt_wait_for_io(ctxt);
786                                 continue;
787                         }
788                         if (ret)
789                                 goto err;
790
791                         if (ctxt->stats)
792                                 atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
793                 } else {
794                         struct btree *b;
795
796                         b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
797                         ret = PTR_ERR_OR_ZERO(b);
798                         if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
799                                 continue;
800                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
801                                 continue;
802                         if (ret)
803                                 goto err;
804                         if (!b)
805                                 goto next;
806
807                         ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
808                         bch2_trans_iter_exit(trans, &iter);
809
810                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
811                                 continue;
812                         if (ret)
813                                 goto err;
814
815                         if (ctxt->rate)
816                                 bch2_ratelimit_increment(ctxt->rate,
817                                                          c->opts.btree_node_size >> 9);
818                         if (ctxt->stats) {
819                                 atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
820                                 atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
821                         }
822                 }
823 next:
824                 bp_pos = bpos_nosnap_successor(bp_pos);
825         }
826
827         trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
828 err:
829         bch2_bkey_buf_exit(&sk, c);
830         return ret;
831 }
832
833 int bch2_evacuate_bucket(struct bch_fs *c,
834                          struct bpos bucket, int gen,
835                          struct data_update_opts data_opts,
836                          struct bch_ratelimit *rate,
837                          struct bch_move_stats *stats,
838                          struct write_point_specifier wp,
839                          bool wait_on_copygc)
840 {
841         struct moving_context ctxt;
842         int ret;
843
844         bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
845         ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
846         bch2_moving_ctxt_exit(&ctxt);
847
848         return ret;
849 }
850
851 typedef bool (*move_btree_pred)(struct bch_fs *, void *,
852                                 struct btree *, struct bch_io_opts *,
853                                 struct data_update_opts *);
854
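/*
 * Walk btree nodes from (start_btree_id, start_pos) to (end_btree_id,
 * end_pos), rewriting every node that @pred accepts; rewrites go through the
 * btree write point.
 */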
855 static int bch2_move_btree(struct bch_fs *c,
856                            enum btree_id start_btree_id, struct bpos start_pos,
857                            enum btree_id end_btree_id,   struct bpos end_pos,
858                            move_btree_pred pred, void *arg,
859                            struct bch_move_stats *stats)
860 {
861         bool kthread = (current->flags & PF_KTHREAD) != 0;
862         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
863         struct moving_context ctxt;
864         struct btree_trans *trans;
865         struct btree_iter iter;
866         struct btree *b;
867         enum btree_id id;
868         struct data_update_opts data_opts;
869         int ret = 0;
870
871         bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
872                               writepoint_ptr(&c->btree_write_point),
873                               true);
874         trans = ctxt.trans;
875
876         stats->data_type = BCH_DATA_btree;
877
878         for (id = start_btree_id;
879              id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
880              id++) {
881                 stats->pos = BBPOS(id, POS_MIN);
882
883                 if (!bch2_btree_id_root(c, id)->b)
884                         continue;
885
886                 bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
887                                           BTREE_ITER_PREFETCH);
888 retry:
889                 ret = 0;
890                 while (bch2_trans_begin(trans),
891                        (b = bch2_btree_iter_peek_node(&iter)) &&
892                        !(ret = PTR_ERR_OR_ZERO(b))) {
893                         if (kthread && kthread_should_stop())
894                                 break;
895
896                         if ((cmp_int(id, end_btree_id) ?:
897                              bpos_cmp(b->key.k.p, end_pos)) > 0)
898                                 break;
899
900                         stats->pos = BBPOS(iter.btree_id, iter.pos);
901
902                         if (!pred(c, arg, b, &io_opts, &data_opts))
903                                 goto next;
904
905                         ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
906                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
907                                 continue;
908                         if (ret)
909                                 break;
910 next:
911                         bch2_btree_iter_next_node(&iter);
912                 }
913                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
914                         goto retry;
915
916                 bch2_trans_iter_exit(trans, &iter);
917
918                 if (kthread && kthread_should_stop())
919                         break;
920         }
921
922         bch_err_fn(c, ret);
923         bch2_moving_ctxt_exit(&ctxt);
924         bch2_btree_interior_updates_flush(c);
925
926         return ret;
927 }
928
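/*
 * Predicates for bch2_data_job(): rereplicate_pred flags keys with fewer
 * replicas than required, migrate_pred flags every pointer on the device
 * being migrated off.
 */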
929 static bool rereplicate_pred(struct bch_fs *c, void *arg,
930                              struct bkey_s_c k,
931                              struct bch_io_opts *io_opts,
932                              struct data_update_opts *data_opts)
933 {
934         unsigned nr_good = bch2_bkey_durability(c, k);
935         unsigned replicas = bkey_is_btree_ptr(k.k)
936                 ? c->opts.metadata_replicas
937                 : io_opts->data_replicas;
938
939         if (!nr_good || nr_good >= replicas)
940                 return false;
941
942         data_opts->target               = 0;
943         data_opts->extra_replicas       = replicas - nr_good;
944         data_opts->btree_insert_flags   = 0;
945         return true;
946 }
947
948 static bool migrate_pred(struct bch_fs *c, void *arg,
949                          struct bkey_s_c k,
950                          struct bch_io_opts *io_opts,
951                          struct data_update_opts *data_opts)
952 {
953         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
954         const struct bch_extent_ptr *ptr;
955         struct bch_ioctl_data *op = arg;
956         unsigned i = 0;
957
958         data_opts->rewrite_ptrs         = 0;
959         data_opts->target               = 0;
960         data_opts->extra_replicas       = 0;
961         data_opts->btree_insert_flags   = 0;
962
963         bkey_for_each_ptr(ptrs, ptr) {
964                 if (ptr->dev == op->migrate.dev)
965                         data_opts->rewrite_ptrs |= 1U << i;
966                 i++;
967         }
968
969         return data_opts->rewrite_ptrs != 0;
970 }
971
972 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
973                                    struct btree *b,
974                                    struct bch_io_opts *io_opts,
975                                    struct data_update_opts *data_opts)
976 {
977         return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
978 }
979
980 static bool migrate_btree_pred(struct bch_fs *c, void *arg,
981                                struct btree *b,
982                                struct bch_io_opts *io_opts,
983                                struct data_update_opts *data_opts)
984 {
985         return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
986 }
987
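/*
 * Returns true if a node's packed key format could overflow when keys are
 * unpacked (a field wider than the current unpacked format, or an offset +
 * field range that wraps) - presumably the old format-overflow problem that
 * BCH_COMPAT_bformat_overflow_done, set below, records as repaired.
 */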
988 static bool bformat_needs_redo(struct bkey_format *f)
989 {
990         unsigned i;
991
992         for (i = 0; i < f->nr_fields; i++) {
993                 unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
994                 u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
995                 u64 field_offset = le64_to_cpu(f->field_offset[i]);
996
997                 if (f->bits_per_field[i] > unpacked_bits)
998                         return true;
999
1000                 if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
1001                         return true;
1002
1003                 if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
1004                      unpacked_mask) <
1005                     field_offset)
1006                         return true;
1007         }
1008
1009         return false;
1010 }
1011
1012 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
1013                                    struct btree *b,
1014                                    struct bch_io_opts *io_opts,
1015                                    struct data_update_opts *data_opts)
1016 {
1017         if (b->version_ondisk != c->sb.version ||
1018             btree_node_need_rewrite(b) ||
1019             bformat_needs_redo(&b->format)) {
1020                 data_opts->target               = 0;
1021                 data_opts->extra_replicas       = 0;
1022                 data_opts->btree_insert_flags   = 0;
1023                 return true;
1024         }
1025
1026         return false;
1027 }
1028
1029 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
1030 {
1031         int ret;
1032
1033         ret = bch2_move_btree(c,
1034                               0,                POS_MIN,
1035                               BTREE_ID_NR,      SPOS_MAX,
1036                               rewrite_old_nodes_pred, c, stats);
1037         if (!ret) {
1038                 mutex_lock(&c->sb_lock);
1039                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
1040                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
1041                 c->disk_sb.sb->version_min = c->disk_sb.sb->version;
1042                 bch2_write_super(c);
1043                 mutex_unlock(&c->sb_lock);
1044         }
1045
1046         bch_err_fn(c, ret);
1047         return ret;
1048 }
1049
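/*
 * Dispatch for struct bch_ioctl_data operations: rereplicate degraded data,
 * migrate data off a device, or rewrite old-format btree nodes.  The first
 * two flush the relevant journal device pins, then run over btree nodes and
 * user data in turn.
 */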
1050 int bch2_data_job(struct bch_fs *c,
1051                   struct bch_move_stats *stats,
1052                   struct bch_ioctl_data op)
1053 {
1054         int ret = 0;
1055
1056         switch (op.op) {
1057         case BCH_DATA_OP_REREPLICATE:
1058                 bch2_move_stats_init(stats, "rereplicate");
1059                 stats->data_type = BCH_DATA_journal;
1060                 ret = bch2_journal_flush_device_pins(&c->journal, -1);
1061
1062                 ret = bch2_move_btree(c,
1063                                       op.start_btree,   op.start_pos,
1064                                       op.end_btree,     op.end_pos,
1065                                       rereplicate_btree_pred, c, stats) ?: ret;
1066                 ret = bch2_replicas_gc2(c) ?: ret;
1067
1068                 ret = bch2_move_data(c,
1069                                      (struct bbpos) { op.start_btree,   op.start_pos },
1070                                      (struct bbpos) { op.end_btree,     op.end_pos },
1071                                      NULL,
1072                                      stats,
1073                                      writepoint_hashed((unsigned long) current),
1074                                      true,
1075                                      rereplicate_pred, c) ?: ret;
1076                 ret = bch2_replicas_gc2(c) ?: ret;
1077
1078                 bch2_move_stats_exit(stats, c);
1079                 break;
1080         case BCH_DATA_OP_MIGRATE:
1081                 if (op.migrate.dev >= c->sb.nr_devices)
1082                         return -EINVAL;
1083
1084                 bch2_move_stats_init(stats, "migrate");
1085                 stats->data_type = BCH_DATA_journal;
1086                 ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
1087
1088                 ret = bch2_move_btree(c,
1089                                       op.start_btree,   op.start_pos,
1090                                       op.end_btree,     op.end_pos,
1091                                       migrate_btree_pred, &op, stats) ?: ret;
1092                 ret = bch2_replicas_gc2(c) ?: ret;
1093
1094                 ret = bch2_move_data(c,
1095                                      (struct bbpos) { op.start_btree,   op.start_pos },
1096                                      (struct bbpos) { op.end_btree,     op.end_pos },
1097                                      NULL,
1098                                      stats,
1099                                      writepoint_hashed((unsigned long) current),
1100                                      true,
1101                                      migrate_pred, &op) ?: ret;
1102                 ret = bch2_replicas_gc2(c) ?: ret;
1103
1104                 bch2_move_stats_exit(stats, c);
1105                 break;
1106         case BCH_DATA_OP_REWRITE_OLD_NODES:
1107                 bch2_move_stats_init(stats, "rewrite_old_nodes");
1108                 ret = bch2_scan_old_btree_nodes(c, stats);
1109                 bch2_move_stats_exit(stats, c);
1110                 break;
1111         default:
1112                 ret = -EINVAL;
1113         }
1114
1115         return ret;
1116 }
1117
1118 void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
1119 {
1120         prt_printf(out, "%s: data type=%s pos=",
1121                    stats->name,
1122                    bch2_data_types[stats->data_type]);
1123         bch2_bbpos_to_text(out, stats->pos);
1124         prt_newline(out);
1125         printbuf_indent_add(out, 2);
1126
1127         prt_str(out, "keys moved:  ");
1128         prt_u64(out, atomic64_read(&stats->keys_moved));
1129         prt_newline(out);
1130
1131         prt_str(out, "keys raced:  ");
1132         prt_u64(out, atomic64_read(&stats->keys_raced));
1133         prt_newline(out);
1134
1135         prt_str(out, "bytes seen:  ");
1136         prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
1137         prt_newline(out);
1138
1139         prt_str(out, "bytes moved: ");
1140         prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
1141         prt_newline(out);
1142
1143         prt_str(out, "bytes raced: ");
1144         prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
1145         prt_newline(out);
1146
1147         printbuf_indent_sub(out, 2);
1148 }
1149
1150 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
1151 {
1152         struct moving_io *io;
1153
1154         bch2_move_stats_to_text(out, ctxt->stats);
1155         printbuf_indent_add(out, 2);
1156
1157         prt_printf(out, "reads: ios %u/%u sectors %u/%u",
1158                    atomic_read(&ctxt->read_ios),
1159                    c->opts.move_ios_in_flight,
1160                    atomic_read(&ctxt->read_sectors),
1161                    c->opts.move_bytes_in_flight >> 9);
1162         prt_newline(out);
1163
1164         prt_printf(out, "writes: ios %u/%u sectors %u/%u",
1165                    atomic_read(&ctxt->write_ios),
1166                    c->opts.move_ios_in_flight,
1167                    atomic_read(&ctxt->write_sectors),
1168                    c->opts.move_bytes_in_flight >> 9);
1169         prt_newline(out);
1170
1171         printbuf_indent_add(out, 2);
1172
1173         mutex_lock(&ctxt->lock);
1174         list_for_each_entry(io, &ctxt->ios, io_list)
1175                 bch2_write_op_to_text(out, &io->write.op);
1176         mutex_unlock(&ctxt->lock);
1177
1178         printbuf_indent_sub(out, 4);
1179 }
1180
1181 void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
1182 {
1183         struct moving_context *ctxt;
1184
1185         mutex_lock(&c->moving_context_lock);
1186         list_for_each_entry(ctxt, &c->moving_context_list, list)
1187                 bch2_moving_ctxt_to_text(out, c, ctxt);
1188         mutex_unlock(&c->moving_context_lock);
1189 }
1190
1191 void bch2_fs_move_init(struct bch_fs *c)
1192 {
1193         INIT_LIST_HEAD(&c->moving_context_list);
1194         mutex_init(&c->moving_context_lock);
1195 }