[bcachefs-tools-debian] / libbcachefs / move.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "alloc_foreground.h"
6 #include "backpointers.h"
7 #include "bkey_buf.h"
8 #include "btree_gc.h"
9 #include "btree_update.h"
10 #include "btree_update_interior.h"
11 #include "btree_write_buffer.h"
12 #include "disk_groups.h"
13 #include "ec.h"
14 #include "errcode.h"
15 #include "error.h"
16 #include "inode.h"
17 #include "io.h"
18 #include "journal_reclaim.h"
19 #include "move.h"
20 #include "replicas.h"
21 #include "super-io.h"
22 #include "keylist.h"
23
24 #include <linux/ioprio.h>
25 #include <linux/kthread.h>
26
27 #include <trace/events/bcachefs.h>
28
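/*
 * Tracepoint helpers: only render the key to text (which allocates a
 * printbuf) when the corresponding tracepoint is actually enabled.
 */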
29 static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
30 {
31         if (trace_move_extent_enabled()) {
32                 struct printbuf buf = PRINTBUF;
33
34                 bch2_bkey_val_to_text(&buf, c, k);
35                 trace_move_extent(c, buf.buf);
36                 printbuf_exit(&buf);
37         }
38 }
39
40 static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
41 {
42         if (trace_move_extent_read_enabled()) {
43                 struct printbuf buf = PRINTBUF;
44
45                 bch2_bkey_val_to_text(&buf, c, k);
46                 trace_move_extent_read(c, buf.buf);
47                 printbuf_exit(&buf);
48         }
49 }
50
51 static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
52 {
53         if (trace_move_extent_alloc_mem_fail_enabled()) {
54                 struct printbuf buf = PRINTBUF;
55
56                 bch2_bkey_val_to_text(&buf, c, k);
57                 trace_move_extent_alloc_mem_fail(c, buf.buf);
58                 printbuf_exit(&buf);
59         }
60 }
61
62 static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
63 {
64         mutex_lock(&c->data_progress_lock);
65         list_add(&stats->list, &c->data_progress_list);
66         mutex_unlock(&c->data_progress_lock);
67 }
68
69 static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
70 {
71         mutex_lock(&c->data_progress_lock);
72         list_del(&stats->list);
73         mutex_unlock(&c->data_progress_lock);
74 }
75
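/*
 * Tracks one extent move in flight: the extent is read into @rbio, then
 * rewritten via the data_update machinery in @write.  Both bios are backed
 * by @bi_inline_vecs, so the data pages are only allocated once.
 */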
76 struct moving_io {
77         struct list_head                read_list;
78         struct list_head                io_list;
79         struct move_bucket_in_flight    *b;
80         struct closure                  cl;
81         bool                            read_completed;
82
83         unsigned                        read_sectors;
84         unsigned                        write_sectors;
85
86         struct bch_read_bio             rbio;
87
88         struct data_update              write;
89         /* Must be last since it is variable size */
90         struct bio_vec                  bi_inline_vecs[];
91 };
92
93 static void move_free(struct moving_io *io)
94 {
95         struct moving_context *ctxt = io->write.ctxt;
96
97         if (io->b)
98                 atomic_dec(&io->b->count);
99
100         bch2_data_update_exit(&io->write);
101
102         mutex_lock(&ctxt->lock);
103         list_del(&io->io_list);
104         wake_up(&ctxt->wait);
105         mutex_unlock(&ctxt->lock);
106
107         kfree(io);
108 }
109
110 static void move_write_done(struct bch_write_op *op)
111 {
112         struct moving_io *io = container_of(op, struct moving_io, write.op);
113         struct moving_context *ctxt = io->write.ctxt;
114
115         if (io->write.op.error)
116                 ctxt->write_error = true;
117
118         atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
119         atomic_dec(&io->write.ctxt->write_ios);
120         move_free(io);
121         closure_put(&ctxt->cl);
122 }
123
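/*
 * Reads complete asynchronously; writes are then issued in the order the
 * reads were started, by bch2_moving_ctxt_do_pending_writes().  If the read
 * failed or hit a hole there is nothing to rewrite and the io is just freed.
 */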
124 static void move_write(struct moving_io *io)
125 {
126         if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
127                 move_free(io);
128                 return;
129         }
130
131         closure_get(&io->write.ctxt->cl);
132         atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
133         atomic_inc(&io->write.ctxt->write_ios);
134
135         bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
136 }
137
138 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
139 {
140         struct moving_io *io =
141                 list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
142
143         return io && io->read_completed ? io : NULL;
144 }
145
146 static void move_read_endio(struct bio *bio)
147 {
148         struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
149         struct moving_context *ctxt = io->write.ctxt;
150
151         atomic_sub(io->read_sectors, &ctxt->read_sectors);
152         atomic_dec(&ctxt->read_ios);
153         io->read_completed = true;
154
155         wake_up(&ctxt->wait);
156         closure_put(&ctxt->cl);
157 }
158
159 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
160                                         struct btree_trans *trans)
161 {
162         struct moving_io *io;
163
164         if (trans)
165                 bch2_trans_unlock(trans);
166
167         while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
168                 list_del(&io->read_list);
169                 move_write(io);
170         }
171 }
172
173 static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
174                                        struct btree_trans *trans)
175 {
176         unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
177
178         move_ctxt_wait_event(ctxt, trans,
179                 !atomic_read(&ctxt->write_sectors) ||
180                 atomic_read(&ctxt->write_sectors) != sectors_pending);
181 }
182
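/*
 * Tear down a moving_context: wait for all outstanding reads and writes to
 * finish, then drop it from the filesystem's list of moving contexts.
 */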
183 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
184 {
185         struct bch_fs *c = ctxt->c;
186
187         move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
188         closure_sync(&ctxt->cl);
189
190         EBUG_ON(atomic_read(&ctxt->write_sectors));
191         EBUG_ON(atomic_read(&ctxt->write_ios));
192         EBUG_ON(atomic_read(&ctxt->read_sectors));
193         EBUG_ON(atomic_read(&ctxt->read_ios));
194
195         if (ctxt->stats) {
196                 progress_list_del(c, ctxt->stats);
197                 trace_move_data(c,
198                                 atomic64_read(&ctxt->stats->sectors_moved),
199                                 atomic64_read(&ctxt->stats->keys_moved));
200         }
201
202         mutex_lock(&c->moving_context_lock);
203         list_del(&ctxt->list);
204         mutex_unlock(&c->moving_context_lock);
205 }
206
207 void bch2_moving_ctxt_init(struct moving_context *ctxt,
208                            struct bch_fs *c,
209                            struct bch_ratelimit *rate,
210                            struct bch_move_stats *stats,
211                            struct write_point_specifier wp,
212                            bool wait_on_copygc)
213 {
214         memset(ctxt, 0, sizeof(*ctxt));
215
216         ctxt->c         = c;
217         ctxt->fn        = (void *) _RET_IP_;
218         ctxt->rate      = rate;
219         ctxt->stats     = stats;
220         ctxt->wp        = wp;
221         ctxt->wait_on_copygc = wait_on_copygc;
222
223         closure_init_stack(&ctxt->cl);
224
225         mutex_init(&ctxt->lock);
226         INIT_LIST_HEAD(&ctxt->reads);
227         INIT_LIST_HEAD(&ctxt->ios);
228         init_waitqueue_head(&ctxt->wait);
229
230         mutex_lock(&c->moving_context_lock);
231         list_add(&ctxt->list, &c->moving_context_list);
232         mutex_unlock(&c->moving_context_lock);
233
234         if (stats) {
235                 progress_list_add(c, stats);
236                 stats->data_type = BCH_DATA_user;
237         }
238 }
239
240 void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
241 {
242         memset(stats, 0, sizeof(*stats));
243         scnprintf(stats->name, sizeof(stats->name), "%s", name);
244 }
245
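/*
 * Drop the pointers selected by data_opts.kill_ptrs (a bitmask of pointer
 * indices) from @k and commit the result, without moving any data.
 */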
246 static int bch2_extent_drop_ptrs(struct btree_trans *trans,
247                                  struct btree_iter *iter,
248                                  struct bkey_s_c k,
249                                  struct data_update_opts data_opts)
250 {
251         struct bch_fs *c = trans->c;
252         struct bkey_i *n;
253         int ret;
254
255         n = bch2_bkey_make_mut(trans, k);
256         ret = PTR_ERR_OR_ZERO(n);
257         if (ret)
258                 return ret;
259
260         while (data_opts.kill_ptrs) {
261                 unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
262                 struct bch_extent_ptr *ptr;
263
264                 bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
265                 data_opts.kill_ptrs ^= 1U << drop;
266         }
267
268         /*
269          * If the new extent no longer has any pointers, bch2_extent_normalize()
270          * will do the appropriate thing with it (turning it into a
271          * KEY_TYPE_error key, or just a discard if it was a cached extent)
272          */
273         bch2_extent_normalize(c, bkey_i_to_s(n));
274
275         /*
276          * Since we're not inserting through an extent iterator
277          * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
278          * we aren't using the extent overwrite path to delete; we're
279          * just using the normal key deletion path:
280          */
281         if (bkey_deleted(&n->k))
282                 n->k.size = 0;
283
284         return bch2_trans_relock(trans) ?:
285                 bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
286                 bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
287 }
288
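/*
 * Kick off the move of a single extent: allocate a moving_io sized for the
 * extent's uncompressed data, start an asynchronous read, and queue it on
 * the context.  The rewrite itself is issued later, once the read completes,
 * from bch2_moving_ctxt_do_pending_writes().  If data_opts only asks for
 * pointers to be dropped, no data is moved and bch2_extent_drop_ptrs() is
 * used instead.
 */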
289 static int bch2_move_extent(struct btree_trans *trans,
290                             struct btree_iter *iter,
291                             struct moving_context *ctxt,
292                             struct move_bucket_in_flight *bucket_in_flight,
293                             struct bch_io_opts io_opts,
294                             enum btree_id btree_id,
295                             struct bkey_s_c k,
296                             struct data_update_opts data_opts)
297 {
298         struct bch_fs *c = trans->c;
299         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
300         struct moving_io *io;
301         const union bch_extent_entry *entry;
302         struct extent_ptr_decoded p;
303         unsigned sectors = k.k->size, pages;
304         int ret = -ENOMEM;
305
306         trace_move_extent2(c, k);
307
308         bch2_data_update_opts_normalize(k, &data_opts);
309
310         if (!data_opts.rewrite_ptrs &&
311             !data_opts.extra_replicas) {
312                 if (data_opts.kill_ptrs)
313                         return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
314                 return 0;
315         }
316
317         /*
318          * Unlock before memory allocations & taking nocow locks in
319          * bch2_data_update_init():
320          */
321         bch2_trans_unlock(trans);
322
323         /* write path might have to decompress data: */
324         bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
325                 sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
326
327         pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
328         io = kzalloc(sizeof(struct moving_io) +
329                      sizeof(struct bio_vec) * pages, GFP_KERNEL);
330         if (!io)
331                 goto err;
332
333         INIT_LIST_HEAD(&io->io_list);
334         io->write.ctxt          = ctxt;
335         io->read_sectors        = k.k->size;
336         io->write_sectors       = k.k->size;
337
338         bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
339         bio_set_prio(&io->write.op.wbio.bio,
340                      IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
341
342         if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
343                                  GFP_KERNEL))
344                 goto err_free;
345
346         io->rbio.c              = c;
347         io->rbio.opts           = io_opts;
348         bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
349         io->rbio.bio.bi_vcnt = pages;
350         bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
351         io->rbio.bio.bi_iter.bi_size = sectors << 9;
352
353         io->rbio.bio.bi_opf             = REQ_OP_READ;
354         io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
355         io->rbio.bio.bi_end_io          = move_read_endio;
356
357         ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
358                                     io_opts, data_opts, btree_id, k);
359         if (ret && ret != -BCH_ERR_unwritten_extent_update)
360                 goto err_free_pages;
361
362         if (ret == -BCH_ERR_unwritten_extent_update) {
363                 bch2_update_unwritten_extent(trans, &io->write);
364                 move_free(io);
365                 return 0;
366         }
367
368         BUG_ON(ret);
369
370         io->write.ctxt = ctxt;
371         io->write.op.end_io = move_write_done;
372
373         if (ctxt->stats) {
374                 atomic64_inc(&ctxt->stats->keys_moved);
375                 atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
376         }
377
378         if (bucket_in_flight) {
379                 io->b = bucket_in_flight;
380                 atomic_inc(&io->b->count);
381         }
382
383         this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
384         this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
385         trace_move_extent_read2(c, k);
386
387         mutex_lock(&ctxt->lock);
388         atomic_add(io->read_sectors, &ctxt->read_sectors);
389         atomic_inc(&ctxt->read_ios);
390
391         list_add_tail(&io->read_list, &ctxt->reads);
392         list_add_tail(&io->io_list, &ctxt->ios);
393         mutex_unlock(&ctxt->lock);
394
395         /*
396          * Dropped by move_read_endio(); guards against use-after-free of
397          * ctxt when doing the wakeup.
398          */
399         closure_get(&ctxt->cl);
400         bch2_read_extent(trans, &io->rbio,
401                          bkey_start_pos(k.k),
402                          btree_id, k, 0,
403                          BCH_READ_NODECODE|
404                          BCH_READ_LAST_FRAGMENT);
405         return 0;
406 err_free_pages:
407         bio_free_pages(&io->write.op.wbio.bio);
408 err_free:
409         kfree(io);
410 err:
411         this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
412         trace_move_extent_alloc_mem_fail2(c, k);
413         return ret;
414 }
415
416 static int lookup_inode(struct btree_trans *trans, struct bpos pos,
417                         struct bch_inode_unpacked *inode)
418 {
419         struct btree_iter iter;
420         struct bkey_s_c k;
421         int ret;
422
423         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
424                              BTREE_ITER_ALL_SNAPSHOTS);
425         k = bch2_btree_iter_peek(&iter);
426         ret = bkey_err(k);
427         if (ret)
428                 goto err;
429
430         if (!k.k || !bkey_eq(k.k->p, pos)) {
431                 ret = -ENOENT;
432                 goto err;
433         }
434
435         ret = bkey_is_inode(k.k) ? 0 : -EIO;
436         if (ret)
437                 goto err;
438
439         ret = bch2_inode_unpack(k, inode);
440         if (ret)
441                 goto err;
442 err:
443         bch2_trans_iter_exit(trans, &iter);
444         return ret;
445 }
446
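/*
 * Throttle the move path: optionally wait for copygc to finish, honour the
 * caller's rate limit, and bound in-flight read/write sectors and IOs
 * according to the move_bytes_in_flight/move_ios_in_flight options.
 * Returns nonzero if the calling kthread has been asked to stop.
 */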
447 static int move_ratelimit(struct btree_trans *trans,
448                           struct moving_context *ctxt)
449 {
450         struct bch_fs *c = trans->c;
451         u64 delay;
452
453         if (ctxt->wait_on_copygc) {
454                 bch2_trans_unlock(trans);
455                 wait_event_killable(c->copygc_running_wq,
456                                     !c->copygc_running ||
457                                     kthread_should_stop());
458         }
459
460         do {
461                 delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
462
463                 if (delay) {
464                         bch2_trans_unlock(trans);
465                         set_current_state(TASK_INTERRUPTIBLE);
466                 }
467
468                 if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
469                         __set_current_state(TASK_RUNNING);
470                         return 1;
471                 }
472
473                 if (delay)
474                         schedule_timeout(delay);
475
476                 if (unlikely(freezing(current))) {
477                         move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
478                         try_to_freeze();
479                 }
480         } while (delay);
481
482         /*
483          * XXX: these limits really ought to be per device; SSDs and hard drives
484          * will want different limits
485          */
486         move_ctxt_wait_event(ctxt, trans,
487                 atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
488                 atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
489                 atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
490                 atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
491
492         return 0;
493 }
494
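/*
 * Look up the IO options for the inode that owns @k, caching the last inode
 * number seen in *cur_inum; if the inode can't be found, fall back to the
 * filesystem-wide options.
 */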
495 static int move_get_io_opts(struct btree_trans *trans,
496                             struct bch_io_opts *io_opts,
497                             struct bkey_s_c k, u64 *cur_inum)
498 {
499         struct bch_inode_unpacked inode;
500         int ret;
501
502         if (*cur_inum == k.k->p.inode)
503                 return 0;
504
505         ret = lookup_inode(trans,
506                            SPOS(0, k.k->p.inode, k.k->p.snapshot),
507                            &inode);
508         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
509                 return ret;
510
511         if (!ret)
512                 bch2_inode_opts_get(io_opts, trans->c, &inode);
513         else
514                 *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
515         *cur_inum = k.k->p.inode;
516         return 0;
517 }
518
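/*
 * Walk one btree from @start to @end, calling @pred on each extent to decide
 * whether to rewrite it.  The key is copied into a bkey_buf before
 * bch2_move_extent() is called, since the iterator may be unlocked while the
 * read is in flight; -ENOMEM from bch2_move_extent() is handled by waiting
 * for in-flight IO to complete and retrying.
 */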
519 static int __bch2_move_data(struct moving_context *ctxt,
520                             struct bpos start,
521                             struct bpos end,
522                             move_pred_fn pred, void *arg,
523                             enum btree_id btree_id)
524 {
525         struct bch_fs *c = ctxt->c;
526         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
527         struct bkey_buf sk;
528         struct btree_trans trans;
529         struct btree_iter iter;
530         struct bkey_s_c k;
531         struct data_update_opts data_opts;
532         u64 cur_inum = U64_MAX;
533         int ret = 0, ret2;
534
535         bch2_bkey_buf_init(&sk);
536         bch2_trans_init(&trans, c, 0, 0);
537
538         if (ctxt->stats) {
539                 ctxt->stats->data_type  = BCH_DATA_user;
540                 ctxt->stats->btree_id   = btree_id;
541                 ctxt->stats->pos        = start;
542         }
543
544         bch2_trans_iter_init(&trans, &iter, btree_id, start,
545                              BTREE_ITER_PREFETCH|
546                              BTREE_ITER_ALL_SNAPSHOTS);
547
548         if (ctxt->rate)
549                 bch2_ratelimit_reset(ctxt->rate);
550
551         while (!move_ratelimit(&trans, ctxt)) {
552                 bch2_trans_begin(&trans);
553
554                 k = bch2_btree_iter_peek(&iter);
555                 if (!k.k)
556                         break;
557
558                 ret = bkey_err(k);
559                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
560                         continue;
561                 if (ret)
562                         break;
563
564                 if (bkey_ge(bkey_start_pos(k.k), end))
565                         break;
566
567                 if (ctxt->stats)
568                         ctxt->stats->pos = iter.pos;
569
570                 if (!bkey_extent_is_direct_data(k.k))
571                         goto next_nondata;
572
573                 ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
574                 if (ret)
575                         continue;
576
577                 memset(&data_opts, 0, sizeof(data_opts));
578                 if (!pred(c, arg, k, &io_opts, &data_opts))
579                         goto next;
580
581                 /*
582                  * The iterator gets unlocked by __bch2_read_extent - need to
583                  * save a copy of @k elsewhere:
584                  */
585                 bch2_bkey_buf_reassemble(&sk, c, k);
586                 k = bkey_i_to_s_c(sk.k);
587                 bch2_trans_unlock(&trans);
588
589                 ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL,
590                                         io_opts, btree_id, k, data_opts);
591                 if (ret2) {
592                         if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
593                                 continue;
594
595                         if (ret2 == -ENOMEM) {
596                                 /* memory allocation failure, wait for some IO to finish */
597                                 bch2_move_ctxt_wait_for_io(ctxt, &trans);
598                                 continue;
599                         }
600
601                         /* XXX signal failure */
602                         goto next;
603                 }
604
605                 if (ctxt->rate)
606                         bch2_ratelimit_increment(ctxt->rate, k.k->size);
607 next:
608                 if (ctxt->stats)
609                         atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
610 next_nondata:
611                 bch2_btree_iter_advance(&iter);
612         }
613
614         bch2_trans_iter_exit(&trans, &iter);
615         bch2_trans_exit(&trans);
616         bch2_bkey_buf_exit(&sk, c);
617
618         return ret;
619 }
620
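/*
 * Move (rewrite) user data in the given range, for every extent the
 * predicate accepts; only the extents and reflink btrees are scanned.
 * For example, BCH_DATA_OP_REREPLICATE in bch2_data_job() below calls:
 *
 *	bch2_move_data(c, op.start_btree, op.start_pos,
 *		       op.end_btree, op.end_pos,
 *		       NULL, stats,
 *		       writepoint_hashed((unsigned long) current),
 *		       true, rereplicate_pred, c);
 */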
621 int bch2_move_data(struct bch_fs *c,
622                    enum btree_id start_btree_id, struct bpos start_pos,
623                    enum btree_id end_btree_id,   struct bpos end_pos,
624                    struct bch_ratelimit *rate,
625                    struct bch_move_stats *stats,
626                    struct write_point_specifier wp,
627                    bool wait_on_copygc,
628                    move_pred_fn pred, void *arg)
629 {
630         struct moving_context ctxt;
631         enum btree_id id;
632         int ret;
633
634         bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
635
636         for (id = start_btree_id;
637              id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
638              id++) {
639                 stats->btree_id = id;
640
641                 if (id != BTREE_ID_extents &&
642                     id != BTREE_ID_reflink)
643                         continue;
644
645                 ret = __bch2_move_data(&ctxt,
646                                        id == start_btree_id ? start_pos : POS_MIN,
647                                        id == end_btree_id   ? end_pos   : POS_MAX,
648                                        pred, arg, id);
649                 if (ret)
650                         break;
651         }
652
653         bch2_moving_ctxt_exit(&ctxt);
654
655         return ret;
656 }
657
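/*
 * Evacuate a single bucket: flush the btree write buffer (so recently
 * written backpointers are visible), then walk the bucket's backpointers.
 * Extent backpointers are rewritten with bch2_move_extent() (extents whose
 * pointer into this bucket is cached are skipped); btree node backpointers
 * are handled by rewriting the node with bch2_btree_node_rewrite().
 */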
658 int __bch2_evacuate_bucket(struct btree_trans *trans,
659                            struct moving_context *ctxt,
660                            struct move_bucket_in_flight *bucket_in_flight,
661                            struct bpos bucket, int gen,
662                            struct data_update_opts _data_opts)
663 {
664         struct bch_fs *c = ctxt->c;
665         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
666         struct btree_iter iter;
667         struct bkey_buf sk;
668         struct bch_backpointer bp;
669         struct bch_alloc_v4 a_convert;
670         const struct bch_alloc_v4 *a;
671         struct bkey_s_c k;
672         struct data_update_opts data_opts;
673         unsigned dirty_sectors, bucket_size;
674         u64 fragmentation;
675         u64 cur_inum = U64_MAX;
676         struct bpos bp_pos = POS_MIN;
677         int ret = 0;
678
679         trace_bucket_evacuate(c, bucket);
680
681         bch2_bkey_buf_init(&sk);
682
683         /*
684          * We're not run in a context that handles transaction restarts:
685          */
686         bch2_trans_begin(trans);
687
688         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
689                              bucket, BTREE_ITER_CACHED);
690         ret = lockrestart_do(trans,
691                         bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
692         bch2_trans_iter_exit(trans, &iter);
693
694         if (ret) {
695                 bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret));
696                 goto err;
697         }
698
699         a = bch2_alloc_to_v4(k, &a_convert);
700         dirty_sectors = a->dirty_sectors;
701         bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
702         fragmentation = a->fragmentation_lru;
703
704         ret = bch2_btree_write_buffer_flush(trans);
705         if (ret) {
706                 bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret));
707                 goto err;
708         }
709
710         while (!(ret = move_ratelimit(trans, ctxt))) {
711                 bch2_trans_begin(trans);
712
713                 ret = bch2_get_next_backpointer(trans, bucket, gen,
714                                                 &bp_pos, &bp,
715                                                 BTREE_ITER_CACHED);
716                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
717                         continue;
718                 if (ret)
719                         goto err;
720                 if (bkey_eq(bp_pos, POS_MAX))
721                         break;
722
723                 if (!bp.level) {
724                         const struct bch_extent_ptr *ptr;
725                         struct bkey_s_c k;
726                         unsigned i = 0;
727
728                         k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
729                         ret = bkey_err(k);
730                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
731                                 continue;
732                         if (ret)
733                                 goto err;
734                         if (!k.k)
735                                 goto next;
736
737                         bch2_bkey_buf_reassemble(&sk, c, k);
738                         k = bkey_i_to_s_c(sk.k);
739
740                         ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
741                         if (ret) {
742                                 bch2_trans_iter_exit(trans, &iter);
743                                 continue;
744                         }
745
746                         data_opts = _data_opts;
747                         data_opts.target        = io_opts.background_target;
748                         data_opts.rewrite_ptrs = 0;
749
750                         bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
751                                 if (ptr->dev == bucket.inode) {
752                                         data_opts.rewrite_ptrs |= 1U << i;
753                                         if (ptr->cached) {
754                                                 bch2_trans_iter_exit(trans, &iter);
755                                                 goto next;
756                                         }
757                                 }
758                                 i++;
759                         }
760
761                         ret = bch2_move_extent(trans, &iter, ctxt,
762                                         bucket_in_flight,
763                                         io_opts, bp.btree_id, k, data_opts);
764                         bch2_trans_iter_exit(trans, &iter);
765
766                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
767                                 continue;
768                         if (ret == -ENOMEM) {
769                                 /* memory allocation failure, wait for some IO to finish */
770                                 bch2_move_ctxt_wait_for_io(ctxt, trans);
771                                 continue;
772                         }
773                         if (ret)
774                                 goto err;
775
776                         if (ctxt->rate)
777                                 bch2_ratelimit_increment(ctxt->rate, k.k->size);
778                         if (ctxt->stats)
779                                 atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
780                 } else {
781                         struct btree *b;
782
783                         b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
784                         ret = PTR_ERR_OR_ZERO(b);
785                         if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
786                                 continue;
787                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
788                                 continue;
789                         if (ret)
790                                 goto err;
791                         if (!b)
792                                 goto next;
793
794                         ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
795                         bch2_trans_iter_exit(trans, &iter);
796
797                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
798                                 continue;
799                         if (ret)
800                                 goto err;
801
802                         if (ctxt->rate)
803                                 bch2_ratelimit_increment(ctxt->rate,
804                                                          c->opts.btree_node_size >> 9);
805                         if (ctxt->stats) {
806                                 atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
807                                 atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
808                         }
809                 }
810 next:
811                 bp_pos = bpos_nosnap_successor(bp_pos);
812         }
813
814         trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
815 err:
816         bch2_bkey_buf_exit(&sk, c);
817         return ret;
818 }
819
820 int bch2_evacuate_bucket(struct bch_fs *c,
821                          struct bpos bucket, int gen,
822                          struct data_update_opts data_opts,
823                          struct bch_ratelimit *rate,
824                          struct bch_move_stats *stats,
825                          struct write_point_specifier wp,
826                          bool wait_on_copygc)
827 {
828         struct btree_trans trans;
829         struct moving_context ctxt;
830         int ret;
831
832         bch2_trans_init(&trans, c, 0, 0);
833         bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
834         ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts);
835         bch2_moving_ctxt_exit(&ctxt);
836         bch2_trans_exit(&trans);
837
838         return ret;
839 }
840
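/*
 * Like bch2_move_data(), but for btree nodes: walk the given range of btrees
 * and rewrite every node the predicate accepts, flushing interior btree
 * updates when done.
 */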
841 typedef bool (*move_btree_pred)(struct bch_fs *, void *,
842                                 struct btree *, struct bch_io_opts *,
843                                 struct data_update_opts *);
844
845 static int bch2_move_btree(struct bch_fs *c,
846                            enum btree_id start_btree_id, struct bpos start_pos,
847                            enum btree_id end_btree_id,   struct bpos end_pos,
848                            move_btree_pred pred, void *arg,
849                            struct bch_move_stats *stats)
850 {
851         bool kthread = (current->flags & PF_KTHREAD) != 0;
852         struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
853         struct btree_trans trans;
854         struct btree_iter iter;
855         struct btree *b;
856         enum btree_id id;
857         struct data_update_opts data_opts;
858         int ret = 0;
859
860         bch2_trans_init(&trans, c, 0, 0);
861         progress_list_add(c, stats);
862
863         stats->data_type = BCH_DATA_btree;
864
865         for (id = start_btree_id;
866              id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
867              id++) {
868                 stats->btree_id = id;
869
870                 bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
871                                           BTREE_ITER_PREFETCH);
872 retry:
873                 ret = 0;
874                 while (bch2_trans_begin(&trans),
875                        (b = bch2_btree_iter_peek_node(&iter)) &&
876                        !(ret = PTR_ERR_OR_ZERO(b))) {
877                         if (kthread && kthread_should_stop())
878                                 break;
879
880                         if ((cmp_int(id, end_btree_id) ?:
881                              bpos_cmp(b->key.k.p, end_pos)) > 0)
882                                 break;
883
884                         stats->pos = iter.pos;
885
886                         if (!pred(c, arg, b, &io_opts, &data_opts))
887                                 goto next;
888
889                         ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
890                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
891                                 continue;
892                         if (ret)
893                                 break;
894 next:
895                         bch2_btree_iter_next_node(&iter);
896                 }
897                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
898                         goto retry;
899
900                 bch2_trans_iter_exit(&trans, &iter);
901
902                 if (kthread && kthread_should_stop())
903                         break;
904         }
905
906         bch2_trans_exit(&trans);
907
908         if (ret)
909                 bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
910
911         bch2_btree_interior_updates_flush(c);
912
913         progress_list_del(c, stats);
914         return ret;
915 }
916
917 static bool rereplicate_pred(struct bch_fs *c, void *arg,
918                              struct bkey_s_c k,
919                              struct bch_io_opts *io_opts,
920                              struct data_update_opts *data_opts)
921 {
922         unsigned nr_good = bch2_bkey_durability(c, k);
923         unsigned replicas = bkey_is_btree_ptr(k.k)
924                 ? c->opts.metadata_replicas
925                 : io_opts->data_replicas;
926
927         if (!nr_good || nr_good >= replicas)
928                 return false;
929
930         data_opts->target               = 0;
931         data_opts->extra_replicas       = replicas - nr_good;
932         data_opts->btree_insert_flags   = 0;
933         return true;
934 }
935
936 static bool migrate_pred(struct bch_fs *c, void *arg,
937                          struct bkey_s_c k,
938                          struct bch_io_opts *io_opts,
939                          struct data_update_opts *data_opts)
940 {
941         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
942         const struct bch_extent_ptr *ptr;
943         struct bch_ioctl_data *op = arg;
944         unsigned i = 0;
945
946         data_opts->rewrite_ptrs         = 0;
947         data_opts->target               = 0;
948         data_opts->extra_replicas       = 0;
949         data_opts->btree_insert_flags   = 0;
950
951         bkey_for_each_ptr(ptrs, ptr) {
952                 if (ptr->dev == op->migrate.dev)
953                         data_opts->rewrite_ptrs |= 1U << i;
954                 i++;
955         }
956
957         return data_opts->rewrite_ptrs != 0;
958 }
959
960 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
961                                    struct btree *b,
962                                    struct bch_io_opts *io_opts,
963                                    struct data_update_opts *data_opts)
964 {
965         return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
966 }
967
968 static bool migrate_btree_pred(struct bch_fs *c, void *arg,
969                                struct btree *b,
970                                struct bch_io_opts *io_opts,
971                                struct data_update_opts *data_opts)
972 {
973         return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
974 }
975
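/*
 * Returns true if a btree node's packed key format could produce keys that
 * overflow when unpacked: a packed field wider than the unpacked field, a
 * full-width field with a nonzero offset, or an offset plus maximum packed
 * value that wraps past the unpacked field's range.  Used by
 * rewrite_old_nodes_pred() below to decide which nodes need rewriting.
 */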
976 static bool bformat_needs_redo(struct bkey_format *f)
977 {
978         unsigned i;
979
980         for (i = 0; i < f->nr_fields; i++) {
981                 unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
982                 u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
983                 u64 field_offset = le64_to_cpu(f->field_offset[i]);
984
985                 if (f->bits_per_field[i] > unpacked_bits)
986                         return true;
987
988                 if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
989                         return true;
990
991                 if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
992                      unpacked_mask) <
993                     field_offset)
994                         return true;
995         }
996
997         return false;
998 }
999
1000 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
1001                                    struct btree *b,
1002                                    struct bch_io_opts *io_opts,
1003                                    struct data_update_opts *data_opts)
1004 {
1005         if (b->version_ondisk != c->sb.version ||
1006             btree_node_need_rewrite(b) ||
1007             bformat_needs_redo(&b->format)) {
1008                 data_opts->target               = 0;
1009                 data_opts->extra_replicas       = 0;
1010                 data_opts->btree_insert_flags   = 0;
1011                 return true;
1012         }
1013
1014         return false;
1015 }
1016
1017 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
1018 {
1019         int ret;
1020
1021         ret = bch2_move_btree(c,
1022                               0,                POS_MIN,
1023                               BTREE_ID_NR,      SPOS_MAX,
1024                               rewrite_old_nodes_pred, c, stats);
1025         if (!ret) {
1026                 mutex_lock(&c->sb_lock);
1027                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
1028                 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
1029                 c->disk_sb.sb->version_min = c->disk_sb.sb->version;
1030                 bch2_write_super(c);
1031                 mutex_unlock(&c->sb_lock);
1032         }
1033
1034         return ret;
1035 }
1036
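/*
 * Dispatch a data job described by struct bch_ioctl_data: rereplicate,
 * migrate, or rewrite_old_nodes, using the move paths above.
 */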
1037 int bch2_data_job(struct bch_fs *c,
1038                   struct bch_move_stats *stats,
1039                   struct bch_ioctl_data op)
1040 {
1041         int ret = 0;
1042
1043         switch (op.op) {
1044         case BCH_DATA_OP_REREPLICATE:
1045                 bch2_move_stats_init(stats, "rereplicate");
1046                 stats->data_type = BCH_DATA_journal;
1047                 ret = bch2_journal_flush_device_pins(&c->journal, -1);
1048
1049                 ret = bch2_move_btree(c,
1050                                       op.start_btree,   op.start_pos,
1051                                       op.end_btree,     op.end_pos,
1052                                       rereplicate_btree_pred, c, stats) ?: ret;
1053                 ret = bch2_replicas_gc2(c) ?: ret;
1054
1055                 ret = bch2_move_data(c,
1056                                      op.start_btree,    op.start_pos,
1057                                      op.end_btree,      op.end_pos,
1058                                      NULL,
1059                                      stats,
1060                                      writepoint_hashed((unsigned long) current),
1061                                      true,
1062                                      rereplicate_pred, c) ?: ret;
1063                 ret = bch2_replicas_gc2(c) ?: ret;
1064                 break;
1065         case BCH_DATA_OP_MIGRATE:
1066                 if (op.migrate.dev >= c->sb.nr_devices)
1067                         return -EINVAL;
1068
1069                 bch2_move_stats_init(stats, "migrate");
1070                 stats->data_type = BCH_DATA_journal;
1071                 ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
1072
1073                 ret = bch2_move_btree(c,
1074                                       op.start_btree,   op.start_pos,
1075                                       op.end_btree,     op.end_pos,
1076                                       migrate_btree_pred, &op, stats) ?: ret;
1077                 ret = bch2_replicas_gc2(c) ?: ret;
1078
1079                 ret = bch2_move_data(c,
1080                                      op.start_btree,    op.start_pos,
1081                                      op.end_btree,      op.end_pos,
1082                                      NULL,
1083                                      stats,
1084                                      writepoint_hashed((unsigned long) current),
1085                                      true,
1086                                      migrate_pred, &op) ?: ret;
1087                 ret = bch2_replicas_gc2(c) ?: ret;
1088                 break;
1089         case BCH_DATA_OP_REWRITE_OLD_NODES:
1090                 bch2_move_stats_init(stats, "rewrite_old_nodes");
1091                 ret = bch2_scan_old_btree_nodes(c, stats);
1092                 break;
1093         default:
1094                 ret = -EINVAL;
1095         }
1096
1097         return ret;
1098 }
1099
1100 void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c)
1101 {
1102         struct bch_move_stats *stats;
1103
1104         mutex_lock(&c->data_progress_lock);
1105         list_for_each_entry(stats, &c->data_progress_list, list) {
1106                 prt_printf(out, "%s: data type %s btree_id %s position: ",
1107                        stats->name,
1108                        bch2_data_types[stats->data_type],
1109                        bch2_btree_ids[stats->btree_id]);
1110                 bch2_bpos_to_text(out, stats->pos);
1111                 prt_printf(out, "%s", "\n");
1112         }
1113         mutex_unlock(&c->data_progress_lock);
1114 }
1115
1116 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt)
1117 {
1118         struct moving_io *io;
1119
1120         prt_printf(out, "%ps:", ctxt->fn);
1121         prt_newline(out);
1122         printbuf_indent_add(out, 2);
1123
1124         prt_printf(out, "reads: %u sectors %u",
1125                    atomic_read(&ctxt->read_ios),
1126                    atomic_read(&ctxt->read_sectors));
1127         prt_newline(out);
1128
1129         prt_printf(out, "writes: %u sectors %u",
1130                    atomic_read(&ctxt->write_ios),
1131                    atomic_read(&ctxt->write_sectors));
1132         prt_newline(out);
1133
1134         printbuf_indent_add(out, 2);
1135
1136         mutex_lock(&ctxt->lock);
1137         list_for_each_entry(io, &ctxt->ios, io_list) {
1138                 bch2_write_op_to_text(out, &io->write.op);
1139         }
1140         mutex_unlock(&ctxt->lock);
1141
1142         printbuf_indent_sub(out, 4);
1143 }
1144
1145 void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
1146 {
1147         struct moving_context *ctxt;
1148
1149         mutex_lock(&c->moving_context_lock);
1150         list_for_each_entry(ctxt, &c->moving_context_list, list)
1151                 bch2_moving_ctxt_to_text(out, ctxt);
1152         mutex_unlock(&c->moving_context_lock);
1153 }
1154
1155 void bch2_fs_move_init(struct bch_fs *c)
1156 {
1157         INIT_LIST_HEAD(&c->moving_context_list);
1158         mutex_init(&c->moving_context_lock);
1159
1160         INIT_LIST_HEAD(&c->data_progress_list);
1161         mutex_init(&c->data_progress_lock);
1162 }