libbcachefs/move.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "alloc_foreground.h"
6 #include "backpointers.h"
7 #include "bkey_buf.h"
8 #include "btree_gc.h"
9 #include "btree_update.h"
10 #include "btree_update_interior.h"
11 #include "btree_write_buffer.h"
12 #include "disk_groups.h"
13 #include "ec.h"
14 #include "errcode.h"
15 #include "error.h"
16 #include "inode.h"
17 #include "io_read.h"
18 #include "io_write.h"
19 #include "journal_reclaim.h"
20 #include "keylist.h"
21 #include "move.h"
22 #include "replicas.h"
23 #include "super-io.h"
24 #include "trace.h"
25
26 #include <linux/ioprio.h>
27 #include <linux/kthread.h>
28
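/*
 * Tracepoint helpers: only format the key to text when the corresponding
 * tracepoint is actually enabled, to avoid the printbuf overhead otherwise.
 */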
29 static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
30 {
31         if (trace_move_extent_enabled()) {
32                 struct printbuf buf = PRINTBUF;
33
34                 bch2_bkey_val_to_text(&buf, c, k);
35                 trace_move_extent(c, buf.buf);
36                 printbuf_exit(&buf);
37         }
38 }
39
40 static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
41 {
42         if (trace_move_extent_read_enabled()) {
43                 struct printbuf buf = PRINTBUF;
44
45                 bch2_bkey_val_to_text(&buf, c, k);
46                 trace_move_extent_read(c, buf.buf);
47                 printbuf_exit(&buf);
48         }
49 }
50
51 static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
52 {
53         if (trace_move_extent_alloc_mem_fail_enabled()) {
54                 struct printbuf buf = PRINTBUF;
55
56                 bch2_bkey_val_to_text(&buf, c, k);
57                 trace_move_extent_alloc_mem_fail(c, buf.buf);
58                 printbuf_exit(&buf);
59         }
60 }
61
62 static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
63 {
64         mutex_lock(&c->data_progress_lock);
65         list_add(&stats->list, &c->data_progress_list);
66         mutex_unlock(&c->data_progress_lock);
67 }
68
69 static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
70 {
71         mutex_lock(&c->data_progress_lock);
72         list_del(&stats->list);
73         mutex_unlock(&c->data_progress_lock);
74 }
75
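/*
 * Per-extent state for an in-flight data move: the extent is read into @rbio,
 * then written back out through @write (a struct data_update) once the read
 * completes.  Tracked on the owning moving_context's reads and ios lists.
 */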
76 struct moving_io {
77         struct list_head                read_list;
78         struct list_head                io_list;
79         struct move_bucket_in_flight    *b;
80         struct closure                  cl;
81         bool                            read_completed;
82
83         unsigned                        read_sectors;
84         unsigned                        write_sectors;
85
86         struct bch_read_bio             rbio;
87
88         struct data_update              write;
89         /* Must be last since it is variable size */
90         struct bio_vec                  bi_inline_vecs[];
91 };
92
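/*
 * Teardown path for a single moving_io: drops the bucket-in-flight count (if
 * this move came from bucket evacuation), releases the data update, and takes
 * the io off the context's list before freeing.  Called both on write
 * completion (move_write_done) and on read error/hole (move_write).
 */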
93 static void move_free(struct moving_io *io)
94 {
95         struct moving_context *ctxt = io->write.ctxt;
96
97         if (io->b)
98                 atomic_dec(&io->b->count);
99
100         bch2_data_update_exit(&io->write);
101
102         mutex_lock(&ctxt->lock);
103         list_del(&io->io_list);
104         wake_up(&ctxt->wait);
105         mutex_unlock(&ctxt->lock);
106
107         kfree(io);
108 }
109
110 static void move_write_done(struct bch_write_op *op)
111 {
112         struct moving_io *io = container_of(op, struct moving_io, write.op);
113         struct moving_context *ctxt = io->write.ctxt;
114
115         if (io->write.op.error)
116                 ctxt->write_error = true;
117
118         atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
119         atomic_dec(&io->write.ctxt->write_ios);
120         move_free(io);
121         closure_put(&ctxt->cl);
122 }
123
124 static void move_write(struct moving_io *io)
125 {
126         if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
127                 move_free(io);
128                 return;
129         }
130
131         closure_get(&io->write.ctxt->cl);
132         atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
133         atomic_inc(&io->write.ctxt->write_ios);
134
135         bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
136 }
137
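/*
 * Writes are issued in the same order the reads were started: only the head of
 * the reads list is considered, and it is returned only once its read has
 * completed.
 */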
138 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
139 {
140         struct moving_io *io =
141                 list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
142
143         return io && io->read_completed ? io : NULL;
144 }
145
146 static void move_read_endio(struct bio *bio)
147 {
148         struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
149         struct moving_context *ctxt = io->write.ctxt;
150
151         atomic_sub(io->read_sectors, &ctxt->read_sectors);
152         atomic_dec(&ctxt->read_ios);
153         io->read_completed = true;
154
155         wake_up(&ctxt->wait);
156         closure_put(&ctxt->cl);
157 }
158
159 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
160                                         struct btree_trans *trans)
161 {
162         struct moving_io *io;
163
164         if (trans)
165                 bch2_trans_unlock(trans);
166
167         while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
168                 list_del(&io->read_list);
169                 move_write(io);
170         }
171 }
172
173 static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
174                                        struct btree_trans *trans)
175 {
176         unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
177
178         move_ctxt_wait_event(ctxt, trans,
179                 !atomic_read(&ctxt->write_sectors) ||
180                 atomic_read(&ctxt->write_sectors) != sectors_pending);
181 }
182
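/*
 * Tears down a moving_context: waits for all reads issued through it to be
 * written back out (or dropped), then for the closure to drain, before
 * removing it from the filesystem's list of active contexts.
 */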
183 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
184 {
185         struct bch_fs *c = ctxt->c;
186
187         move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
188         closure_sync(&ctxt->cl);
189
190         EBUG_ON(atomic_read(&ctxt->write_sectors));
191         EBUG_ON(atomic_read(&ctxt->write_ios));
192         EBUG_ON(atomic_read(&ctxt->read_sectors));
193         EBUG_ON(atomic_read(&ctxt->read_ios));
194
195         if (ctxt->stats) {
196                 progress_list_del(c, ctxt->stats);
197                 trace_move_data(c,
198                                 atomic64_read(&ctxt->stats->sectors_moved),
199                                 atomic64_read(&ctxt->stats->keys_moved));
200         }
201
202         mutex_lock(&c->moving_context_lock);
203         list_del(&ctxt->list);
204         mutex_unlock(&c->moving_context_lock);
205 }
206
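/*
 * A moving_context owns the in-flight IO state for one data move operation and
 * must be paired with bch2_moving_ctxt_exit().  Rough usage, as in
 * bch2_move_data() below:
 *
 *	struct moving_context ctxt;
 *
 *	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
 *	ret = __bch2_move_data(&ctxt, start, end, pred, arg, btree_id);
 *	bch2_moving_ctxt_exit(&ctxt);
 */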
207 void bch2_moving_ctxt_init(struct moving_context *ctxt,
208                            struct bch_fs *c,
209                            struct bch_ratelimit *rate,
210                            struct bch_move_stats *stats,
211                            struct write_point_specifier wp,
212                            bool wait_on_copygc)
213 {
214         memset(ctxt, 0, sizeof(*ctxt));
215
216         ctxt->c         = c;
217         ctxt->fn        = (void *) _RET_IP_;
218         ctxt->rate      = rate;
219         ctxt->stats     = stats;
220         ctxt->wp        = wp;
221         ctxt->wait_on_copygc = wait_on_copygc;
222
223         closure_init_stack(&ctxt->cl);
224
225         mutex_init(&ctxt->lock);
226         INIT_LIST_HEAD(&ctxt->reads);
227         INIT_LIST_HEAD(&ctxt->ios);
228         init_waitqueue_head(&ctxt->wait);
229
230         mutex_lock(&c->moving_context_lock);
231         list_add(&ctxt->list, &c->moving_context_list);
232         mutex_unlock(&c->moving_context_lock);
233
234         if (stats) {
235                 progress_list_add(c, stats);
236                 stats->data_type = BCH_DATA_user;
237         }
238 }
239
240 void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
241 {
242         memset(stats, 0, sizeof(*stats));
243         scnprintf(stats->name, sizeof(stats->name), "%s", name);
244 }
245
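/*
 * Handles the kill_ptrs-only case: instead of rewriting data, just drop the
 * pointers selected by data_opts.kill_ptrs (a bitmask of pointer indices) from
 * the extent and commit the update.
 */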
246 static int bch2_extent_drop_ptrs(struct btree_trans *trans,
247                                  struct btree_iter *iter,
248                                  struct bkey_s_c k,
249                                  struct data_update_opts data_opts)
250 {
251         struct bch_fs *c = trans->c;
252         struct bkey_i *n;
253         int ret;
254
255         n = bch2_bkey_make_mut_noupdate(trans, k);
256         ret = PTR_ERR_OR_ZERO(n);
257         if (ret)
258                 return ret;
259
260         while (data_opts.kill_ptrs) {
261                 unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
262                 struct bch_extent_ptr *ptr;
263
264                 bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
265                 data_opts.kill_ptrs ^= 1U << drop;
266         }
267
268         /*
269          * If the new extent no longer has any pointers, bch2_extent_normalize()
270          * will do the appropriate thing with it (turning it into a
271          * KEY_TYPE_error key, or just a discard if it was a cached extent)
272          */
273         bch2_extent_normalize(c, bkey_i_to_s(n));
274
275         /*
276          * Since we're not inserting through an extent iterator
277          * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
278          * we aren't using the extent overwrite path to delete; we're
279          * just using the normal key deletion path:
280          */
281         if (bkey_deleted(&n->k))
282                 n->k.size = 0;
283
284         return bch2_trans_relock(trans) ?:
285                 bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
286                 bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
287 }
288
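/*
 * Kicks off the move of a single extent: allocates a moving_io, sets up the
 * data update, and issues the read.  The write side runs asynchronously -
 * move_read_endio() marks the read complete, and the pending write is issued
 * later via bch2_moving_ctxt_do_pending_writes().
 */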
289 static int bch2_move_extent(struct btree_trans *trans,
290                             struct btree_iter *iter,
291                             struct moving_context *ctxt,
292                             struct move_bucket_in_flight *bucket_in_flight,
293                             struct bch_io_opts io_opts,
294                             enum btree_id btree_id,
295                             struct bkey_s_c k,
296                             struct data_update_opts data_opts)
297 {
298         struct bch_fs *c = trans->c;
299         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
300         struct moving_io *io;
301         const union bch_extent_entry *entry;
302         struct extent_ptr_decoded p;
303         unsigned sectors = k.k->size, pages;
304         int ret = -ENOMEM;
305
306         trace_move_extent2(c, k);
307
308         bch2_data_update_opts_normalize(k, &data_opts);
309
310         if (!data_opts.rewrite_ptrs &&
311             !data_opts.extra_replicas) {
312                 if (data_opts.kill_ptrs)
313                         return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
314                 return 0;
315         }
316
317         /*
318          * Before memory allocations & taking nocow locks in
319          * bch2_data_update_init():
320          */
321         bch2_trans_unlock(trans);
322
323         /* write path might have to decompress data: */
324         bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
325                 sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
326
327         pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
328         io = kzalloc(sizeof(struct moving_io) +
329                      sizeof(struct bio_vec) * pages, GFP_KERNEL);
330         if (!io)
331                 goto err;
332
333         INIT_LIST_HEAD(&io->io_list);
334         io->write.ctxt          = ctxt;
335         io->read_sectors        = k.k->size;
336         io->write_sectors       = k.k->size;
337
338         bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
339         bio_set_prio(&io->write.op.wbio.bio,
340                      IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
341
342         if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
343                                  GFP_KERNEL))
344                 goto err_free;
345
346         io->rbio.c              = c;
347         io->rbio.opts           = io_opts;
348         bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
349         io->rbio.bio.bi_vcnt = pages;
350         bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
351         io->rbio.bio.bi_iter.bi_size = sectors << 9;
352
353         io->rbio.bio.bi_opf             = REQ_OP_READ;
354         io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
355         io->rbio.bio.bi_end_io          = move_read_endio;
356
357         ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
358                                     io_opts, data_opts, btree_id, k);
359         if (ret && ret != -BCH_ERR_unwritten_extent_update)
360                 goto err_free_pages;
361
362         if (ret == -BCH_ERR_unwritten_extent_update) {
363                 bch2_update_unwritten_extent(trans, &io->write);
364                 move_free(io);
365                 return 0;
366         }
367
368         BUG_ON(ret);
369
370         io->write.ctxt = ctxt;
371         io->write.op.end_io = move_write_done;
372
373         if (ctxt->stats) {
374                 atomic64_inc(&ctxt->stats->keys_moved);
375                 atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
376         }
377
378         if (bucket_in_flight) {
379                 io->b = bucket_in_flight;
380                 atomic_inc(&io->b->count);
381         }
382
383         this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
384         this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
385         trace_move_extent_read2(c, k);
386
387         mutex_lock(&ctxt->lock);
388         atomic_add(io->read_sectors, &ctxt->read_sectors);
389         atomic_inc(&ctxt->read_ios);
390
391         list_add_tail(&io->read_list, &ctxt->reads);
392         list_add_tail(&io->io_list, &ctxt->ios);
393         mutex_unlock(&ctxt->lock);
394
395         /*
396          * dropped by move_read_endio() - guards against use-after-free of
397          * ctxt when doing wakeup
398          */
399         closure_get(&ctxt->cl);
400         bch2_read_extent(trans, &io->rbio,
401                          bkey_start_pos(k.k),
402                          btree_id, k, 0,
403                          BCH_READ_NODECODE|
404                          BCH_READ_LAST_FRAGMENT);
405         return 0;
406 err_free_pages:
407         bio_free_pages(&io->write.op.wbio.bio);
408 err_free:
409         kfree(io);
410 err:
411         this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
412         trace_move_extent_alloc_mem_fail2(c, k);
413         return ret;
414 }
415
416 static int lookup_inode(struct btree_trans *trans, struct bpos pos,
417                         struct bch_inode_unpacked *inode)
418 {
419         struct btree_iter iter;
420         struct bkey_s_c k;
421         int ret;
422
423         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
424                              BTREE_ITER_ALL_SNAPSHOTS);
425         k = bch2_btree_iter_peek(&iter);
426         ret = bkey_err(k);
427         if (ret)
428                 goto err;
429
430         if (!k.k || !bkey_eq(k.k->p, pos)) {
431                 ret = -BCH_ERR_ENOENT_inode;
432                 goto err;
433         }
434
435         ret = bkey_is_inode(k.k) ? 0 : -EIO;
436         if (ret)
437                 goto err;
438
439         ret = bch2_inode_unpack(k, inode);
440         if (ret)
441                 goto err;
442 err:
443         bch2_trans_iter_exit(trans, &iter);
444         return ret;
445 }
446
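/*
 * Throttle point for the move paths: optionally waits for copygc, honors the
 * context's ratelimit, and waits until in-flight reads and writes drop below
 * the move_bytes_in_flight / move_ios_in_flight limits.  Returns nonzero if
 * the caller is a kthread that should stop.
 */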
447 static int move_ratelimit(struct btree_trans *trans,
448                           struct moving_context *ctxt)
449 {
450         struct bch_fs *c = trans->c;
451         u64 delay;
452
453         if (ctxt->wait_on_copygc) {
454                 bch2_trans_unlock(trans);
455                 wait_event_killable(c->copygc_running_wq,
456                                     !c->copygc_running ||
457                                     kthread_should_stop());
458         }
459
460         do {
461                 delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
462
463                 if (delay) {
464                         bch2_trans_unlock(trans);
465                         set_current_state(TASK_INTERRUPTIBLE);
466                 }
467
468                 if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
469                         __set_current_state(TASK_RUNNING);
470                         return 1;
471                 }
472
473                 if (delay)
474                         schedule_timeout(delay);
475
476                 if (unlikely(freezing(current))) {
477                         move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
478                         try_to_freeze();
479                 }
480         } while (delay);
481
482         /*
483          * XXX: these limits really ought to be per device, SSDs and hard drives
484          * will want different limits
485          */
486         move_ctxt_wait_event(ctxt, trans,
487                 atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
488                 atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
489                 atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
490                 atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
491
492         return 0;
493 }
494
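/*
 * Looks up the IO options for the inode that owns @k, caching the last inode
 * number in *cur_inum so consecutive extents from the same file don't repeat
 * the lookup.  Falls back to the filesystem defaults if the inode isn't found.
 */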
495 static int move_get_io_opts(struct btree_trans *trans,
496                             struct bch_io_opts *io_opts,
497                             struct bkey_s_c k, u64 *cur_inum)
498 {
499         struct bch_inode_unpacked inode;
500         int ret;
501
502         if (*cur_inum == k.k->p.inode)
503                 return 0;
504
505         ret = lookup_inode(trans,
506                            SPOS(0, k.k->p.inode, k.k->p.snapshot),
507                            &inode);
508         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
509                 return ret;
510
511         if (!ret)
512                 bch2_inode_opts_get(io_opts, trans->c, &inode);
513         else
514                 *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
515         *cur_inum = k.k->p.inode;
516         return 0;
517 }
518
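/*
 * Main data move loop for one btree: walks keys in [start, end), asks @pred
 * which extents should be moved and with what options, and hands matching
 * extents to bch2_move_extent().  Transaction restarts and -ENOMEM from the
 * move path are retried here.
 */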
519 static int __bch2_move_data(struct moving_context *ctxt,
520                             struct bpos start,
521                             struct bpos end,
522                             move_pred_fn pred, void *arg,
523                             enum btree_id btree_id)
524 {
525         struct bch_fs *c = ctxt->c;
526         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
527         struct bkey_buf sk;
528         struct btree_trans *trans = bch2_trans_get(c);
529         struct btree_iter iter;
530         struct bkey_s_c k;
531         struct data_update_opts data_opts;
532         u64 cur_inum = U64_MAX;
533         int ret = 0, ret2;
534
535         bch2_bkey_buf_init(&sk);
536
537         if (ctxt->stats) {
538                 ctxt->stats->data_type  = BCH_DATA_user;
539                 ctxt->stats->btree_id   = btree_id;
540                 ctxt->stats->pos        = start;
541         }
542
543         bch2_trans_iter_init(trans, &iter, btree_id, start,
544                              BTREE_ITER_PREFETCH|
545                              BTREE_ITER_ALL_SNAPSHOTS);
546
547         if (ctxt->rate)
548                 bch2_ratelimit_reset(ctxt->rate);
549
550         while (!move_ratelimit(trans, ctxt)) {
551                 bch2_trans_begin(trans);
552
553                 k = bch2_btree_iter_peek(&iter);
554                 if (!k.k)
555                         break;
556
557                 ret = bkey_err(k);
558                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
559                         continue;
560                 if (ret)
561                         break;
562
563                 if (bkey_ge(bkey_start_pos(k.k), end))
564                         break;
565
566                 if (ctxt->stats)
567                         ctxt->stats->pos = iter.pos;
568
569                 if (!bkey_extent_is_direct_data(k.k))
570                         goto next_nondata;
571
572                 ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
573                 if (ret)
574                         continue;
575
576                 memset(&data_opts, 0, sizeof(data_opts));
577                 if (!pred(c, arg, k, &io_opts, &data_opts))
578                         goto next;
579
580                 /*
581                  * The iterator gets unlocked by __bch2_read_extent - need to
582                  * save a copy of @k elsewhere:
583                  */
584                 bch2_bkey_buf_reassemble(&sk, c, k);
585                 k = bkey_i_to_s_c(sk.k);
586
587                 ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
588                                         io_opts, btree_id, k, data_opts);
589                 if (ret2) {
590                         if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
591                                 continue;
592
593                         if (ret2 == -ENOMEM) {
594                                 /* memory allocation failure, wait for some IO to finish */
595                                 bch2_move_ctxt_wait_for_io(ctxt, trans);
596                                 continue;
597                         }
598
599                         /* XXX signal failure */
600                         goto next;
601                 }
602
603                 if (ctxt->rate)
604                         bch2_ratelimit_increment(ctxt->rate, k.k->size);
605 next:
606                 if (ctxt->stats)
607                         atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
608 next_nondata:
609                 bch2_btree_iter_advance(&iter);
610         }
611
612         bch2_trans_iter_exit(trans, &iter);
613         bch2_trans_put(trans);
614         bch2_bkey_buf_exit(&sk, c);
615
616         return ret;
617 }
618
619 int bch2_move_data(struct bch_fs *c,
620                    enum btree_id start_btree_id, struct bpos start_pos,
621                    enum btree_id end_btree_id,   struct bpos end_pos,
622                    struct bch_ratelimit *rate,
623                    struct bch_move_stats *stats,
624                    struct write_point_specifier wp,
625                    bool wait_on_copygc,
626                    move_pred_fn pred, void *arg)
627 {
628         struct moving_context ctxt;
629         enum btree_id id;
630         int ret = 0;
631
632         bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
633
634         for (id = start_btree_id;
635              id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
636              id++) {
637                 stats->btree_id = id;
638
639                 if (id != BTREE_ID_extents &&
640                     id != BTREE_ID_reflink)
641                         continue;
642
643                 if (!bch2_btree_id_root(c, id)->b)
644                         continue;
645
646                 ret = __bch2_move_data(&ctxt,
647                                        id == start_btree_id ? start_pos : POS_MIN,
648                                        id == end_btree_id   ? end_pos   : POS_MAX,
649                                        pred, arg, id);
650                 if (ret)
651                         break;
652         }
653
654         bch2_moving_ctxt_exit(&ctxt);
655
656         return ret;
657 }
658
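/*
 * Evacuates a single bucket by walking its backpointers: extent backpointers
 * are handed to bch2_move_extent() with rewrite_ptrs set for the pointers into
 * this bucket, while btree node backpointers are handled by rewriting the node
 * via bch2_btree_node_rewrite().
 */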
659 int __bch2_evacuate_bucket(struct btree_trans *trans,
660                            struct moving_context *ctxt,
661                            struct move_bucket_in_flight *bucket_in_flight,
662                            struct bpos bucket, int gen,
663                            struct data_update_opts _data_opts)
664 {
665         struct bch_fs *c = ctxt->c;
666         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
667         struct btree_iter iter;
668         struct bkey_buf sk;
669         struct bch_backpointer bp;
670         struct bch_alloc_v4 a_convert;
671         const struct bch_alloc_v4 *a;
672         struct bkey_s_c k;
673         struct data_update_opts data_opts;
674         unsigned dirty_sectors, bucket_size;
675         u64 fragmentation;
676         u64 cur_inum = U64_MAX;
677         struct bpos bp_pos = POS_MIN;
678         int ret = 0;
679
680         trace_bucket_evacuate(c, &bucket);
681
682         bch2_bkey_buf_init(&sk);
683
684         /*
685          * We aren't called from a context that handles transaction restarts:
686          */
687         bch2_trans_begin(trans);
688
689         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
690                              bucket, BTREE_ITER_CACHED);
691         ret = lockrestart_do(trans,
692                         bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
693         bch2_trans_iter_exit(trans, &iter);
694
695         if (ret) {
696                 bch_err_msg(c, ret, "looking up alloc key");
697                 goto err;
698         }
699
700         a = bch2_alloc_to_v4(k, &a_convert);
701         dirty_sectors = a->dirty_sectors;
702         bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
703         fragmentation = a->fragmentation_lru;
704
705         ret = bch2_btree_write_buffer_flush(trans);
706         if (ret) {
707                 bch_err_msg(c, ret, "flushing btree write buffer");
708                 goto err;
709         }
710
711         while (!(ret = move_ratelimit(trans, ctxt))) {
712                 bch2_trans_begin(trans);
713
714                 ret = bch2_get_next_backpointer(trans, bucket, gen,
715                                                 &bp_pos, &bp,
716                                                 BTREE_ITER_CACHED);
717                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
718                         continue;
719                 if (ret)
720                         goto err;
721                 if (bkey_eq(bp_pos, POS_MAX))
722                         break;
723
724                 if (!bp.level) {
725                         const struct bch_extent_ptr *ptr;
726                         unsigned i = 0;
727
728                         k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
729                         ret = bkey_err(k);
730                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
731                                 continue;
732                         if (ret)
733                                 goto err;
734                         if (!k.k)
735                                 goto next;
736
737                         bch2_bkey_buf_reassemble(&sk, c, k);
738                         k = bkey_i_to_s_c(sk.k);
739
740                         ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
741                         if (ret) {
742                                 bch2_trans_iter_exit(trans, &iter);
743                                 continue;
744                         }
745
746                         data_opts = _data_opts;
747                         data_opts.target        = io_opts.background_target;
748                         data_opts.rewrite_ptrs = 0;
749
750                         bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
751                                 if (ptr->dev == bucket.inode) {
752                                         data_opts.rewrite_ptrs |= 1U << i;
753                                         if (ptr->cached) {
754                                                 bch2_trans_iter_exit(trans, &iter);
755                                                 goto next;
756                                         }
757                                 }
758                                 i++;
759                         }
760
761                         ret = bch2_move_extent(trans, &iter, ctxt,
762                                         bucket_in_flight,
763                                         io_opts, bp.btree_id, k, data_opts);
764                         bch2_trans_iter_exit(trans, &iter);
765
766                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
767                                 continue;
768                         if (ret == -ENOMEM) {
769                                 /* memory allocation failure, wait for some IO to finish */
770                                 bch2_move_ctxt_wait_for_io(ctxt, trans);
771                                 continue;
772                         }
773                         if (ret)
774                                 goto err;
775
776                         if (ctxt->rate)
777                                 bch2_ratelimit_increment(ctxt->rate, k.k->size);
778                         if (ctxt->stats)
779                                 atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
780                 } else {
781                         struct btree *b;
782
783                         b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
784                         ret = PTR_ERR_OR_ZERO(b);
785                         if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
786                                 continue;
787                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
788                                 continue;
789                         if (ret)
790                                 goto err;
791                         if (!b)
792                                 goto next;
793
794                         ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
795                         bch2_trans_iter_exit(trans, &iter);
796
797                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
798                                 continue;
799                         if (ret)
800                                 goto err;
801
802                         if (ctxt->rate)
803                                 bch2_ratelimit_increment(ctxt->rate,
804                                                          c->opts.btree_node_size >> 9);
805                         if (ctxt->stats) {
806                                 atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
807                                 atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
808                         }
809                 }
810 next:
811                 bp_pos = bpos_nosnap_successor(bp_pos);
812         }
813
814         trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
815 err:
816         bch2_bkey_buf_exit(&sk, c);
817         return ret;
818 }
819
820 int bch2_evacuate_bucket(struct bch_fs *c,
821                          struct bpos bucket, int gen,
822                          struct data_update_opts data_opts,
823                          struct bch_ratelimit *rate,
824                          struct bch_move_stats *stats,
825                          struct write_point_specifier wp,
826                          bool wait_on_copygc)
827 {
828         struct btree_trans *trans = bch2_trans_get(c);
829         struct moving_context ctxt;
830         int ret;
831
832         bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
833         ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
834         bch2_moving_ctxt_exit(&ctxt);
835         bch2_trans_put(trans);
836
837         return ret;
838 }
839
840 typedef bool (*move_btree_pred)(struct bch_fs *, void *,
841                                 struct btree *, struct bch_io_opts *,
842                                 struct data_update_opts *);
843
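/*
 * Walks btree nodes (rather than keys) in the given range and rewrites those
 * that @pred selects - used for rereplication, device migration, and rewriting
 * old-format nodes.
 */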
844 static int bch2_move_btree(struct bch_fs *c,
845                            enum btree_id start_btree_id, struct bpos start_pos,
846                            enum btree_id end_btree_id,   struct bpos end_pos,
847                            move_btree_pred pred, void *arg,
848                            struct bch_move_stats *stats)
849 {
850         bool kthread = (current->flags & PF_KTHREAD) != 0;
851         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
852         struct btree_trans *trans = bch2_trans_get(c);
853         struct btree_iter iter;
854         struct btree *b;
855         enum btree_id id;
856         struct data_update_opts data_opts;
857         int ret = 0;
858
859         progress_list_add(c, stats);
860
861         stats->data_type = BCH_DATA_btree;
862
863         for (id = start_btree_id;
864              id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
865              id++) {
866                 stats->btree_id = id;
867
868                 if (!bch2_btree_id_root(c, id)->b)
869                         continue;
870
871                 bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
872                                           BTREE_ITER_PREFETCH);
873 retry:
874                 ret = 0;
875                 while (bch2_trans_begin(trans),
876                        (b = bch2_btree_iter_peek_node(&iter)) &&
877                        !(ret = PTR_ERR_OR_ZERO(b))) {
878                         if (kthread && kthread_should_stop())
879                                 break;
880
881                         if ((cmp_int(id, end_btree_id) ?:
882                              bpos_cmp(b->key.k.p, end_pos)) > 0)
883                                 break;
884
885                         stats->pos = iter.pos;
886
887                         if (!pred(c, arg, b, &io_opts, &data_opts))
888                                 goto next;
889
890                         ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
891                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
892                                 continue;
893                         if (ret)
894                                 break;
895 next:
896                         bch2_btree_iter_next_node(&iter);
897                 }
898                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
899                         goto retry;
900
901                 bch2_trans_iter_exit(trans, &iter);
902
903                 if (kthread && kthread_should_stop())
904                         break;
905         }
906
907         bch2_trans_put(trans);
908
909         if (ret)
910                 bch_err_fn(c, ret);
911
912         bch2_btree_interior_updates_flush(c);
913
914         progress_list_del(c, stats);
915         return ret;
916 }
917
918 static bool rereplicate_pred(struct bch_fs *c, void *arg,
919                              struct bkey_s_c k,
920                              struct bch_io_opts *io_opts,
921                              struct data_update_opts *data_opts)
922 {
923         unsigned nr_good = bch2_bkey_durability(c, k);
924         unsigned replicas = bkey_is_btree_ptr(k.k)
925                 ? c->opts.metadata_replicas
926                 : io_opts->data_replicas;
927
928         if (!nr_good || nr_good >= replicas)
929                 return false;
930
931         data_opts->target               = 0;
932         data_opts->extra_replicas       = replicas - nr_good;
933         data_opts->btree_insert_flags   = 0;
934         return true;
935 }
936
937 static bool migrate_pred(struct bch_fs *c, void *arg,
938                          struct bkey_s_c k,
939                          struct bch_io_opts *io_opts,
940                          struct data_update_opts *data_opts)
941 {
942         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
943         const struct bch_extent_ptr *ptr;
944         struct bch_ioctl_data *op = arg;
945         unsigned i = 0;
946
947         data_opts->rewrite_ptrs         = 0;
948         data_opts->target               = 0;
949         data_opts->extra_replicas       = 0;
950         data_opts->btree_insert_flags   = 0;
951
952         bkey_for_each_ptr(ptrs, ptr) {
953                 if (ptr->dev == op->migrate.dev)
954                         data_opts->rewrite_ptrs |= 1U << i;
955                 i++;
956         }
957
958         return data_opts->rewrite_ptrs != 0;
959 }
960
961 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
962                                    struct btree *b,
963                                    struct bch_io_opts *io_opts,
964                                    struct data_update_opts *data_opts)
965 {
966         return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
967 }
968
969 static bool migrate_btree_pred(struct bch_fs *c, void *arg,
970                                struct btree *b,
971                                struct bch_io_opts *io_opts,
972                                struct data_update_opts *data_opts)
973 {
974         return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
975 }
976
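/*
 * Detects bkey formats that can encode values too large for the in-memory
 * (unpacked) field width (cf. BCH_COMPAT_bformat_overflow_done): for each
 * field, the largest decodable value is field_offset + (1ULL << bits) - 1,
 * and if that wraps the unpacked field's mask the node must be rewritten.
 */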
977 static bool bformat_needs_redo(struct bkey_format *f)
978 {
979         unsigned i;
980
981         for (i = 0; i < f->nr_fields; i++) {
982                 unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
983                 u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
984                 u64 field_offset = le64_to_cpu(f->field_offset[i]);
985
986                 if (f->bits_per_field[i] > unpacked_bits)
987                         return true;
988
989                 if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
990                         return true;
991
992                 if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
993                      unpacked_mask) <
994                     field_offset)
995                         return true;
996         }
997
998         return false;
999 }
1000
1001 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
1002                                    struct btree *b,
1003                                    struct bch_io_opts *io_opts,
1004                                    struct data_update_opts *data_opts)
1005 {
1006         if (b->version_ondisk != c->sb.version ||
1007             btree_node_need_rewrite(b) ||
1008             bformat_needs_redo(&b->format)) {
1009                 data_opts->target               = 0;
1010                 data_opts->extra_replicas       = 0;
1011                 data_opts->btree_insert_flags   = 0;
1012                 return true;
1013         }
1014
1015         return false;
1016 }
1017
1018 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
1019 {
1020         int ret;
1021
1022         ret = bch2_move_btree(c,
1023                               0,                POS_MIN,
1024                               BTREE_ID_NR,      SPOS_MAX,
1025                               rewrite_old_nodes_pred, c, stats);
1026         if (!ret) {
1027                 mutex_lock(&c->sb_lock);
1028                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
1029                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
1030                 c->disk_sb.sb->version_min = c->disk_sb.sb->version;
1031                 bch2_write_super(c);
1032                 mutex_unlock(&c->sb_lock);
1033         }
1034
1035         if (ret)
1036                 bch_err_fn(c, ret);
1037         return ret;
1038 }
1039
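/*
 * Dispatcher for the data job operations (struct bch_ioctl_data): rereplicate,
 * migrate (off a device), and rewrite_old_nodes.  Rereplicate and migrate
 * flush journal pins, rewrite matching btree nodes, then move matching user
 * data, running replicas GC after each pass.
 */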
1040 int bch2_data_job(struct bch_fs *c,
1041                   struct bch_move_stats *stats,
1042                   struct bch_ioctl_data op)
1043 {
1044         int ret = 0;
1045
1046         switch (op.op) {
1047         case BCH_DATA_OP_REREPLICATE:
1048                 bch2_move_stats_init(stats, "rereplicate");
1049                 stats->data_type = BCH_DATA_journal;
1050                 ret = bch2_journal_flush_device_pins(&c->journal, -1);
1051
1052                 ret = bch2_move_btree(c,
1053                                       op.start_btree,   op.start_pos,
1054                                       op.end_btree,     op.end_pos,
1055                                       rereplicate_btree_pred, c, stats) ?: ret;
1056                 ret = bch2_replicas_gc2(c) ?: ret;
1057
1058                 ret = bch2_move_data(c,
1059                                      op.start_btree,    op.start_pos,
1060                                      op.end_btree,      op.end_pos,
1061                                      NULL,
1062                                      stats,
1063                                      writepoint_hashed((unsigned long) current),
1064                                      true,
1065                                      rereplicate_pred, c) ?: ret;
1066                 ret = bch2_replicas_gc2(c) ?: ret;
1067                 break;
1068         case BCH_DATA_OP_MIGRATE:
1069                 if (op.migrate.dev >= c->sb.nr_devices)
1070                         return -EINVAL;
1071
1072                 bch2_move_stats_init(stats, "migrate");
1073                 stats->data_type = BCH_DATA_journal;
1074                 ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
1075
1076                 ret = bch2_move_btree(c,
1077                                       op.start_btree,   op.start_pos,
1078                                       op.end_btree,     op.end_pos,
1079                                       migrate_btree_pred, &op, stats) ?: ret;
1080                 ret = bch2_replicas_gc2(c) ?: ret;
1081
1082                 ret = bch2_move_data(c,
1083                                      op.start_btree,    op.start_pos,
1084                                      op.end_btree,      op.end_pos,
1085                                      NULL,
1086                                      stats,
1087                                      writepoint_hashed((unsigned long) current),
1088                                      true,
1089                                      migrate_pred, &op) ?: ret;
1090                 ret = bch2_replicas_gc2(c) ?: ret;
1091                 break;
1092         case BCH_DATA_OP_REWRITE_OLD_NODES:
1093                 bch2_move_stats_init(stats, "rewrite_old_nodes");
1094                 ret = bch2_scan_old_btree_nodes(c, stats);
1095                 break;
1096         default:
1097                 ret = -EINVAL;
1098         }
1099
1100         return ret;
1101 }
1102
1103 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
1104 {
1105         struct bch_move_stats *stats = ctxt->stats;
1106         struct moving_io *io;
1107
1108         prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
1109         prt_newline(out);
1110
1111         prt_printf(out, " data type %s btree_id %s position: ",
1112                    bch2_data_types[stats->data_type],
1113                    bch2_btree_id_str(stats->btree_id));
1114         bch2_bpos_to_text(out, stats->pos);
1115         prt_newline(out);
1116         printbuf_indent_add(out, 2);
1117
1118         prt_printf(out, "reads: ios %u/%u sectors %u/%u",
1119                    atomic_read(&ctxt->read_ios),
1120                    c->opts.move_ios_in_flight,
1121                    atomic_read(&ctxt->read_sectors),
1122                    c->opts.move_bytes_in_flight >> 9);
1123         prt_newline(out);
1124
1125         prt_printf(out, "writes: ios %u/%u sectors %u/%u",
1126                    atomic_read(&ctxt->write_ios),
1127                    c->opts.move_ios_in_flight,
1128                    atomic_read(&ctxt->write_sectors),
1129                    c->opts.move_bytes_in_flight >> 9);
1130         prt_newline(out);
1131
1132         printbuf_indent_add(out, 2);
1133
1134         mutex_lock(&ctxt->lock);
1135         list_for_each_entry(io, &ctxt->ios, io_list)
1136                 bch2_write_op_to_text(out, &io->write.op);
1137         mutex_unlock(&ctxt->lock);
1138
1139         printbuf_indent_sub(out, 4);
1140 }
1141
1142 void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
1143 {
1144         struct moving_context *ctxt;
1145
1146         mutex_lock(&c->moving_context_lock);
1147         list_for_each_entry(ctxt, &c->moving_context_list, list)
1148                 bch2_moving_ctxt_to_text(out, c, ctxt);
1149         mutex_unlock(&c->moving_context_lock);
1150 }
1151
1152 void bch2_fs_move_init(struct bch_fs *c)
1153 {
1154         INIT_LIST_HEAD(&c->moving_context_list);
1155         mutex_init(&c->moving_context_lock);
1156
1157         INIT_LIST_HEAD(&c->data_progress_list);
1158         mutex_init(&c->data_progress_lock);
1159 }