/*
 * libbcachefs/move.c (bcachefs-tools-debian)
 * Update bcachefs sources to f7ccf51390 bcachefs: durability
 */

#include "bcachefs.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "inode.h"
#include "io.h"
#include "move.h"
#include "super-io.h"
#include "keylist.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

#include <trace/events/bcachefs.h>

#define SECTORS_IN_FLIGHT_PER_DEVICE    2048

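/*
 * One extent being moved: the read (rbio) and the subsequent write (write.op)
 * share the pages described by bi_inline_vecs.  Queued on
 * moving_context::reads until the read completes; read_sectors/write_sectors
 * feed the in-flight throttling.
 */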
struct moving_io {
        struct list_head        list;
        struct closure          cl;
        bool                    read_completed;

        unsigned                read_dev;
        unsigned                read_sectors;
        unsigned                write_sectors;

        struct bch_read_bio     rbio;

        struct migrate_write    write;
        /* Must be last since it is variable size */
        struct bio_vec          bi_inline_vecs[0];
};

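/*
 * Per-call state for bch2_move_data(): the list of in-flight reads, the
 * in-flight sector counts used for throttling, and the closure/waitqueue used
 * to wait for IO completions.
 */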
struct moving_context {
        /* Closure for waiting on all reads and writes to complete */
        struct closure          cl;

        struct bch_move_stats   *stats;

        struct list_head        reads;

        /* in flight sectors: */
        atomic_t                read_sectors[BCH_SB_MEMBERS_MAX];
        atomic_t                write_sectors;

        wait_queue_head_t       wait;
};

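/*
 * Index update hook for migrate writes (op->index_update_fn): after the data
 * write completes, walk the extent btree over the keys we wrote and splice the
 * new pointers into the existing extents, dropping the old pointer for
 * DATA_REWRITE.  Extents that changed underneath us (version mismatch, data
 * gone or pointer no longer present) are counted as raced and skipped.
 */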
static int bch2_migrate_index_update(struct bch_write_op *op)
{
        struct bch_fs *c = op->c;
        struct migrate_write *m =
                container_of(op, struct migrate_write, op);
        struct keylist *keys = &op->insert_keys;
        struct btree_iter iter;
        int ret = 0;

        bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
                             bkey_start_pos(&bch2_keylist_front(keys)->k),
                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

        while (1) {
                struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
                struct bkey_i_extent *insert, *new =
                        bkey_i_to_extent(bch2_keylist_front(keys));
                BKEY_PADDED(k) _new, _insert;
                struct bch_extent_ptr *ptr;
                struct bch_extent_crc_unpacked crc;
                unsigned nr_dirty;
                bool did_work = false;

                if (btree_iter_err(k)) {
                        ret = bch2_btree_iter_unlock(&iter);
                        break;
                }

                if (bversion_cmp(k.k->version, new->k.version) ||
                    !bkey_extent_is_data(k.k) ||
                    !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
                                             m->ptr, m->offset))
                        goto nomatch;

                if (m->data_cmd == DATA_REWRITE &&
                    !bch2_extent_has_device(bkey_s_c_to_extent(k),
                                            m->data_opts.rewrite_dev))
                        goto nomatch;

                bkey_reassemble(&_insert.k, k);
                insert = bkey_i_to_extent(&_insert.k);

                bkey_copy(&_new.k, bch2_keylist_front(keys));
                new = bkey_i_to_extent(&_new.k);

                bch2_cut_front(iter.pos, &insert->k_i);
                bch2_cut_back(new->k.p, &insert->k);
                bch2_cut_back(insert->k.p, &new->k);

                if (m->data_cmd == DATA_REWRITE) {
                        ptr = (struct bch_extent_ptr *)
                                bch2_extent_has_device(extent_i_to_s_c(insert),
                                                       m->data_opts.rewrite_dev);
                        bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
                }

                extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
                        if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
                                /*
                                 * raced with another move op? extent already
                                 * has a pointer to the device we just wrote
                                 * data to
                                 */
                                continue;
                        }

                        bch2_extent_crc_append(insert, crc);
                        extent_ptr_append(insert, *ptr);
                        did_work = true;
                }

                if (!did_work)
                        goto nomatch;

                bch2_extent_narrow_crcs(insert,
                                (struct bch_extent_crc_unpacked) { 0 });
                bch2_extent_normalize(c, extent_i_to_s(insert).s);
                bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
                                                 op->opts.background_target,
                                                 op->opts.data_replicas);

                /*
                 * It's possible we race, and for whatever reason the extent now
                 * has fewer replicas than when we last looked at it - meaning
                 * we need to get a disk reservation here:
                 */
                nr_dirty = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i));
                if (m->nr_ptrs_reserved < nr_dirty) {
                        unsigned sectors = (nr_dirty - m->nr_ptrs_reserved) *
                                        keylist_sectors(keys);

                        /*
                         * can't call bch2_disk_reservation_add() with btree
                         * locks held, at least not without a song and dance
                         */
                        bch2_btree_iter_unlock(&iter);

                        ret = bch2_disk_reservation_add(c, &op->res, sectors, 0);
                        if (ret)
                                goto out;

                        m->nr_ptrs_reserved = nr_dirty;
                        goto next;
                }

                ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
                                              extent_i_to_s_c(insert).s_c);
                if (ret)
                        break;

                ret = bch2_btree_insert_at(c, &op->res,
                                NULL, op_journal_seq(op),
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOFAIL|
                                BTREE_INSERT_USE_RESERVE|
                                m->data_opts.btree_insert_flags,
                                BTREE_INSERT_ENTRY(&iter, &insert->k_i));
                if (!ret)
                        atomic_long_inc(&c->extent_migrate_done);
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
                        break;
next:
                while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
                        bch2_keylist_pop_front(keys);
                        if (bch2_keylist_empty(keys))
                                goto out;
                }

                bch2_cut_front(iter.pos, bch2_keylist_front(keys));
                continue;
nomatch:
                if (m->ctxt)
                        atomic64_add(k.k->p.offset - iter.pos.offset,
                                     &m->ctxt->stats->sectors_raced);
                atomic_long_inc(&c->extent_migrate_raced);
                trace_move_race(&new->k);
                bch2_btree_iter_next_slot(&iter);
                goto next;
        }
out:
        bch2_btree_iter_unlock(&iter);
        return ret;
}

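/*
 * Called once the read has completed: copy the extent pointer, position,
 * version and crc from the read bio into the write op, so the write goes out
 * with the same (still encoded) data and checksum it was read with.
 */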
void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
{
        /* write bio must own pages: */
        BUG_ON(!m->op.wbio.bio.bi_vcnt);

        m->ptr          = rbio->pick.ptr;
        m->offset       = rbio->pos.offset - rbio->pick.crc.offset;
        m->op.devs_have = rbio->devs_have;
        m->op.pos       = rbio->pos;
        m->op.version   = rbio->version;
        m->op.crc       = rbio->pick.crc;
        m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;

        if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
                m->op.nonce     = m->op.crc.nonce + m->op.crc.offset;
                m->op.csum_type = m->op.crc.csum_type;
        }

        if (m->data_cmd == DATA_REWRITE)
                bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
}

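/*
 * Set up the write half of a migrate: one replica per write by default, with
 * the disk reservation and write flags chosen according to data_cmd
 * (DATA_ADD_REPLICAS reserves space for the missing replicas, DATA_PROMOTE
 * writes a cached copy and won't block on allocation).
 */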
int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
                            struct write_point_specifier wp,
                            struct bch_io_opts io_opts,
                            enum data_cmd data_cmd,
                            struct data_opts data_opts,
                            struct bkey_s_c k)
{
        int ret;

        m->data_cmd     = data_cmd;
        m->data_opts    = data_opts;
        m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k);

        bch2_write_op_init(&m->op, c, io_opts);
        m->op.compression_type =
                bch2_compression_opt_to_type[io_opts.background_compression ?:
                                             io_opts.compression];
        m->op.target    = data_opts.target;
        m->op.write_point = wp;

        if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
                m->op.alloc_reserve = RESERVE_MOVINGGC;

        m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
                BCH_WRITE_PAGES_STABLE|
                BCH_WRITE_PAGES_OWNED|
                BCH_WRITE_DATA_ENCODED|
                BCH_WRITE_NOMARK_REPLICAS;

        m->op.nr_replicas       = 1;
        m->op.nr_replicas_required = 1;
        m->op.index_update_fn   = bch2_migrate_index_update;

        switch (data_cmd) {
        case DATA_ADD_REPLICAS:
                if (m->nr_ptrs_reserved < io_opts.data_replicas) {
                        m->op.nr_replicas = io_opts.data_replicas - m->nr_ptrs_reserved;

                        ret = bch2_disk_reservation_get(c, &m->op.res,
                                                        k.k->size,
                                                        m->op.nr_replicas, 0);
                        if (ret)
                                return ret;

                        m->nr_ptrs_reserved = io_opts.data_replicas;
                }
                break;
        case DATA_REWRITE:
                break;
        case DATA_PROMOTE:
                m->op.flags     |= BCH_WRITE_ALLOC_NOWAIT;
                m->op.flags     |= BCH_WRITE_CACHED;
                break;
        default:
                BUG();
        }

        return 0;
}

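/*
 * Final destructor for a moving_io: drop the disk reservation, free the data
 * pages and wake anyone throttled in move_ctxt_wait_event().
 */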
static void move_free(struct closure *cl)
{
        struct moving_io *io = container_of(cl, struct moving_io, cl);
        struct moving_context *ctxt = io->write.ctxt;
        struct bio_vec *bv;
        int i;

        bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);

        bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
                if (bv->bv_page)
                        __free_page(bv->bv_page);

        wake_up(&ctxt->wait);

        kfree(io);
}

static void move_write_done(struct closure *cl)
{
        struct moving_io *io = container_of(cl, struct moving_io, cl);

        atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
        closure_return_with_destructor(cl, move_free);
}

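/*
 * Called (via closure) once the read has completed: if it succeeded, finish
 * setting up the write op and submit it, completing via move_write_done();
 * otherwise just tear the moving_io down.
 */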
static void move_write(struct closure *cl)
{
        struct moving_io *io = container_of(cl, struct moving_io, cl);

        if (likely(!io->rbio.bio.bi_status)) {
                bch2_migrate_read_done(&io->write, &io->rbio);

                atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
                closure_call(&io->write.op.cl, bch2_write, NULL, cl);
                continue_at(cl, move_write_done, NULL);
        }

        closure_return_with_destructor(cl, move_free);
}

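/*
 * Writes are issued in the order the reads were started, so only the oldest
 * read on the list is eligible, and only once it has completed.
 */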
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
{
        struct moving_io *io =
                list_first_entry_or_null(&ctxt->reads, struct moving_io, list);

        return io && io->read_completed ? io : NULL;
}

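/* Read completion: update the in-flight counters and kick the write path. */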
static void move_read_endio(struct bio *bio)
{
        struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
        struct moving_context *ctxt = io->write.ctxt;

        atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
        io->read_completed = true;

        if (next_pending_write(ctxt))
                wake_up(&ctxt->wait);

        closure_put(&ctxt->cl);
}

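/* Issue writes, in order, for all reads that have completed. */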
static void do_pending_writes(struct moving_context *ctxt)
{
        struct moving_io *io;

        while ((io = next_pending_write(ctxt))) {
                list_del(&io->list);
                closure_call(&io->cl, move_write, NULL, &ctxt->cl);
        }
}

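/*
 * Wait for @_cond to become true, flushing any writes whose reads have
 * already completed before each sleep so they keep making progress.
 */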
#define move_ctxt_wait_event(_ctxt, _cond)                      \
do {                                                            \
        do_pending_writes(_ctxt);                               \
                                                                \
        if (_cond)                                              \
                break;                                          \
        __wait_event((_ctxt)->wait,                             \
                     next_pending_write(_ctxt) || (_cond));     \
} while (1)

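/*
 * Wait for outstanding writes to either drain completely or at least make
 * some progress (i.e. write_sectors changes from what it was on entry).
 */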
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

        move_ctxt_wait_event(ctxt,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
}

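/*
 * Start moving a single extent: throttle against the in-flight limits, pick a
 * pointer to read from, allocate a moving_io with enough pages for the
 * uncompressed size and submit a BCH_READ_NODECODE read; the write is issued
 * from the read completion path.
 */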
static int bch2_move_extent(struct bch_fs *c,
                            struct moving_context *ctxt,
                            struct write_point_specifier wp,
                            struct bch_io_opts io_opts,
                            struct bkey_s_c_extent e,
                            enum data_cmd data_cmd,
                            struct data_opts data_opts)
{
        struct extent_pick_ptr pick;
        struct moving_io *io;
        const struct bch_extent_ptr *ptr;
        struct bch_extent_crc_unpacked crc;
        unsigned sectors = e.k->size, pages;
        int ret = -ENOMEM;

        move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->write_sectors) <
                SECTORS_IN_FLIGHT_PER_DEVICE);

        bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
        if (IS_ERR_OR_NULL(pick.ca))
                return pick.ca ? PTR_ERR(pick.ca) : 0;

        move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) <
                SECTORS_IN_FLIGHT_PER_DEVICE);

        /* write path might have to decompress data: */
        extent_for_each_ptr_crc(e, ptr, crc)
                sectors = max_t(unsigned, sectors, crc.uncompressed_size);

        pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        io = kzalloc(sizeof(struct moving_io) +
                     sizeof(struct bio_vec) * pages, GFP_KERNEL);
        if (!io)
                goto err;

        io->write.ctxt          = ctxt;
        io->read_dev            = pick.ca->dev_idx;
        io->read_sectors        = pick.crc.uncompressed_size;
        io->write_sectors       = e.k->size;

        bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
        bio_set_prio(&io->write.op.wbio.bio,
                     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;

        bch2_bio_map(&io->write.op.wbio.bio, NULL);
        if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
                goto err_free;

        io->rbio.opts = io_opts;
        bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;

        bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(e.k);
        io->rbio.bio.bi_end_io          = move_read_endio;

        ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
                                      data_cmd, data_opts, e.s_c);
        if (ret)
                goto err_free_pages;

        atomic64_inc(&ctxt->stats->keys_moved);
        atomic64_add(e.k->size, &ctxt->stats->sectors_moved);

        trace_move_extent(e.k);

        atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
        list_add_tail(&io->list, &ctxt->reads);

        /*
         * dropped by move_read_endio() - guards against use after free of
         * ctxt when doing wakeup
         */
        closure_get(&ctxt->cl);
        bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE);
        return 0;
err_free_pages:
        bio_free_pages(&io->write.op.wbio.bio);
err_free:
        kfree(io);
err:
        percpu_ref_put(&pick.ca->io_ref);
        trace_move_alloc_fail(e.k);
        return ret;
}

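/*
 * Walk user extents in [start, end), calling @pred with the per-inode io
 * options to decide what (if anything) to do with each extent, and moving the
 * ones it selects.  IO is issued asynchronously and rate limited via @rate;
 * everything is waited on before returning.
 */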
int bch2_move_data(struct bch_fs *c,
                   struct bch_ratelimit *rate,
                   struct write_point_specifier wp,
                   struct bpos start,
                   struct bpos end,
                   move_pred_fn pred, void *arg,
                   struct bch_move_stats *stats)
{
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct moving_context ctxt = { .stats = stats };
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        BKEY_PADDED(k) tmp;
        struct bkey_s_c k;
        struct bkey_s_c_extent e;
        struct data_opts data_opts;
        enum data_cmd data_cmd;
        u64 cur_inum = U64_MAX;
        int ret = 0, ret2;

        closure_init_stack(&ctxt.cl);
        INIT_LIST_HEAD(&ctxt.reads);
        init_waitqueue_head(&ctxt.wait);

        stats->data_type = BCH_DATA_USER;
        bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
                             BTREE_ITER_PREFETCH);

        if (rate)
                bch2_ratelimit_reset(rate);

        while (!kthread || !(ret = kthread_should_stop())) {
                if (rate &&
                    bch2_ratelimit_delay(rate) &&
                    (bch2_btree_iter_unlock(&stats->iter),
                     (ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
                        break;
peek:
                k = bch2_btree_iter_peek(&stats->iter);
                if (!k.k)
                        break;
                ret = btree_iter_err(k);
                if (ret)
                        break;
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;

                if (!bkey_extent_is_data(k.k))
                        goto next_nondata;

                e = bkey_s_c_to_extent(k);

                if (cur_inum != k.k->p.inode) {
                        struct bch_inode_unpacked inode;

                        /* don't hold btree locks while looking up inode: */
                        bch2_btree_iter_unlock(&stats->iter);

                        io_opts = bch2_opts_to_inode_opts(c->opts);
                        if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
                                bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
                        cur_inum = k.k->p.inode;
                        goto peek;
                }

                switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
                                         &io_opts, &data_opts))) {
                case DATA_SKIP:
                        goto next;
                case DATA_SCRUB:
                        BUG();
                case DATA_ADD_REPLICAS:
                case DATA_REWRITE:
                case DATA_PROMOTE:
                        break;
                default:
                        BUG();
                }

                /* unlock before doing IO: */
                bkey_reassemble(&tmp.k, k);
                k = bkey_i_to_s_c(&tmp.k);
                bch2_btree_iter_unlock(&stats->iter);

                ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
                                        bkey_s_c_to_extent(k),
                                        data_cmd, data_opts);
                if (ret2) {
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(&ctxt);
                                continue;
                        }

                        /* XXX signal failure */
                        goto next;
                }

                if (rate)
                        bch2_ratelimit_increment(rate, k.k->size);
next:
                atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
                             &stats->sectors_seen);
next_nondata:
                bch2_btree_iter_next(&stats->iter);
                bch2_btree_iter_cond_resched(&stats->iter);
        }

        bch2_btree_iter_unlock(&stats->iter);

        move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
        closure_sync(&ctxt.cl);

        EBUG_ON(atomic_read(&ctxt.write_sectors));

        trace_move_data(c,
                        atomic64_read(&stats->sectors_moved),
                        atomic64_read(&stats->keys_moved));

        return ret;
}

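/*
 * After a rereplicate/migrate pass, re-mark the replicas entries for user
 * data (and, in the function below, for btree nodes) so that entries no
 * longer referenced by any key can be garbage collected.
 */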
static int bch2_gc_data_replicas(struct bch_fs *c)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));

        for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
                           BTREE_ITER_PREFETCH, k) {
                ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
                if (ret)
                        break;
        }
        ret = bch2_btree_iter_unlock(&iter) ?: ret;

        bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}

static int bch2_gc_btree_replicas(struct bch_fs *c)
{
        struct btree_iter iter;
        struct btree *b;
        unsigned id;
        int ret = 0;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);

        for (id = 0; id < BTREE_ID_NR; id++) {
                for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
                        ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
                                                      bkey_i_to_s_c(&b->key));

                        bch2_btree_iter_cond_resched(&iter);
                }

                ret = bch2_btree_iter_unlock(&iter) ?: ret;
        }

        bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}

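/*
 * Btree-node counterpart of bch2_move_data(): walk every node of every btree
 * and rewrite the ones @pred selects.
 */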
static int bch2_move_btree(struct bch_fs *c,
                           move_pred_fn pred,
                           void *arg,
                           struct bch_move_stats *stats)
{
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree *b;
        unsigned id;
        struct data_opts data_opts;
        enum data_cmd cmd;
        int ret = 0;

        stats->data_type = BCH_DATA_BTREE;

        for (id = 0; id < BTREE_ID_NR; id++) {
                for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
                        switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
                                            bkey_i_to_s_c_extent(&b->key),
                                            &io_opts,
                                            &data_opts))) {
                        case DATA_SKIP:
                                goto next;
                        case DATA_SCRUB:
                                BUG();
                        case DATA_ADD_REPLICAS:
                        case DATA_REWRITE:
                                break;
                        default:
                                BUG();
                        }

                        ret = bch2_btree_node_rewrite(c, &stats->iter,
                                        b->data->keys.seq, 0) ?: ret;
next:
                        bch2_btree_iter_cond_resched(&stats->iter);
                }

                ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
        }

        return ret;
}

#if 0
static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
                                enum bkey_type type,
                                struct bkey_s_c_extent e,
                                struct bch_io_opts *io_opts,
                                struct data_opts *data_opts)
{
        return DATA_SCRUB;
}
#endif

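/*
 * Add replicas to any extent whose durability is below the configured number
 * of replicas for its data type.
 */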
static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
                                      enum bkey_type type,
                                      struct bkey_s_c_extent e,
                                      struct bch_io_opts *io_opts,
                                      struct data_opts *data_opts)
{
        unsigned nr_good = bch2_extent_durability(c, e);
        unsigned replicas = type == BKEY_TYPE_BTREE
                ? c->opts.metadata_replicas
                : io_opts->data_replicas;

        if (!nr_good || nr_good >= replicas)
                return DATA_SKIP;

        data_opts->target               = 0;
        data_opts->btree_insert_flags   = 0;
        return DATA_ADD_REPLICAS;
}

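/* Rewrite any extent that has a pointer to the device being evacuated. */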
static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
                                  enum bkey_type type,
                                  struct bkey_s_c_extent e,
                                  struct bch_io_opts *io_opts,
                                  struct data_opts *data_opts)
{
        struct bch_ioctl_data *op = arg;

        if (!bch2_extent_has_device(e, op->migrate.dev))
                return DATA_SKIP;

        data_opts->target               = 0;
        data_opts->btree_insert_flags   = 0;
        data_opts->rewrite_dev          = op->migrate.dev;
        return DATA_REWRITE;
}

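/*
 * Entry point for data jobs (struct bch_ioctl_data): flush the journal, then
 * rewrite btree and user data as selected by the job's predicate, garbage
 * collecting the replicas lists afterwards.
 */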
int bch2_data_job(struct bch_fs *c,
                  struct bch_move_stats *stats,
                  struct bch_ioctl_data op)
{
        int ret = 0;

        switch (op.op) {
        case BCH_DATA_OP_REREPLICATE:
                stats->data_type = BCH_DATA_JOURNAL;
                ret = bch2_journal_flush_device(&c->journal, -1);

                ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
                ret = bch2_gc_btree_replicas(c) ?: ret;

                ret = bch2_move_data(c, NULL,
                                     writepoint_hashed((unsigned long) current),
                                     op.start,
                                     op.end,
                                     rereplicate_pred, c, stats) ?: ret;
                ret = bch2_gc_data_replicas(c) ?: ret;
                break;
        case BCH_DATA_OP_MIGRATE:
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;

                stats->data_type = BCH_DATA_JOURNAL;
                ret = bch2_journal_flush_device(&c->journal, op.migrate.dev);

                ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
                ret = bch2_gc_btree_replicas(c) ?: ret;

                ret = bch2_move_data(c, NULL,
                                     writepoint_hashed((unsigned long) current),
                                     op.start,
                                     op.end,
                                     migrate_pred, &op, stats) ?: ret;
                ret = bch2_gc_data_replicas(c) ?: ret;
                break;
        default:
                ret = -EINVAL;
        }

        return ret;
}