libbcachefs/btree_gc.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4  * Copyright (C) 2014 Datera Inc.
5  */
6
7 #include "bcachefs.h"
8 #include "alloc_background.h"
9 #include "alloc_foreground.h"
10 #include "bkey_methods.h"
11 #include "bkey_buf.h"
12 #include "btree_key_cache.h"
13 #include "btree_locking.h"
14 #include "btree_update_interior.h"
15 #include "btree_io.h"
16 #include "btree_gc.h"
17 #include "buckets.h"
18 #include "clock.h"
19 #include "debug.h"
20 #include "ec.h"
21 #include "error.h"
22 #include "extents.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "move.h"
26 #include "recovery.h"
27 #include "reflink.h"
28 #include "replicas.h"
29 #include "super-io.h"
30 #include "trace.h"
31
32 #include <linux/slab.h>
33 #include <linux/bitops.h>
34 #include <linux/freezer.h>
35 #include <linux/kthread.h>
36 #include <linux/preempt.h>
37 #include <linux/rcupdate.h>
38 #include <linux/sched/task.h>
39
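/*
 * Return codes used internally by the topology repair pass, telling the
 * caller to evict either the node just checked or the one before it:
 */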
40 #define DROP_THIS_NODE          10
41 #define DROP_PREV_NODE          11
42
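/*
 * c->gc_pos tracks how far mark and sweep has progressed; it is updated under
 * gc_pos_lock (a seqcount) with preemption disabled so readers can take a
 * consistent snapshot, and gc_pos_set() asserts that GC only moves forward.
 */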
43 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
44 {
45         preempt_disable();
46         write_seqcount_begin(&c->gc_pos_lock);
47         c->gc_pos = new_pos;
48         write_seqcount_end(&c->gc_pos_lock);
49         preempt_enable();
50 }
51
52 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
53 {
54         BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
55         __gc_pos_set(c, new_pos);
56 }
57
58 /*
59  * Missing: if an interior btree node is empty, we need to do something -
60  * perhaps just kill it
61  */
62 static int bch2_gc_check_topology(struct bch_fs *c,
63                                   struct btree *b,
64                                   struct bkey_buf *prev,
65                                   struct bkey_buf cur,
66                                   bool is_last)
67 {
68         struct bpos node_start  = b->data->min_key;
69         struct bpos node_end    = b->data->max_key;
70         struct bpos expected_start = bkey_deleted(&prev->k->k)
71                 ? node_start
72                 : bpos_successor(prev->k->k.p);
73         struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
74         int ret = 0;
75
76         if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
77                 struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
78
79                 if (!bpos_eq(expected_start, bp->v.min_key)) {
80                         bch2_topology_error(c);
81
82                         if (bkey_deleted(&prev->k->k)) {
83                                 prt_printf(&buf1, "start of node: ");
84                                 bch2_bpos_to_text(&buf1, node_start);
85                         } else {
86                                 bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
87                         }
88                         bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
89
90                         if (__fsck_err(c,
91                                   FSCK_CAN_FIX|
92                                   FSCK_CAN_IGNORE|
93                                   FSCK_NO_RATELIMIT,
94                                   "btree node with incorrect min_key at btree %s level %u:\n"
95                                   "  prev %s\n"
96                                   "  cur %s",
97                                   bch2_btree_ids[b->c.btree_id], b->c.level,
98                                   buf1.buf, buf2.buf) &&
99                             !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
100                                 bch_info(c, "Halting mark and sweep to start topology repair pass");
101                                 ret = -BCH_ERR_need_topology_repair;
102                                 goto err;
103                         } else {
104                                 set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
105                         }
106                 }
107         }
108
109         if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
110                 bch2_topology_error(c);
111
112                 printbuf_reset(&buf1);
113                 printbuf_reset(&buf2);
114
115                 bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
116                 bch2_bpos_to_text(&buf2, node_end);
117
118                 if (__fsck_err(c,
119                           FSCK_CAN_FIX|
120                           FSCK_CAN_IGNORE|
121                           FSCK_NO_RATELIMIT,
122                           "btree node with incorrect max_key at btree %s level %u:\n"
123                           "  %s\n"
124                           "  expected %s",
125                           bch2_btree_ids[b->c.btree_id], b->c.level,
126                           buf1.buf, buf2.buf) &&
127                     !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
128                         bch_info(c, "Halting mark and sweep to start topology repair pass");
129                         ret = -BCH_ERR_need_topology_repair;
130                         goto err;
131                 } else {
132                         set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
133                 }
134         }
135
136         bch2_bkey_buf_copy(prev, c, cur.k);
137 err:
138 fsck_err:
139         printbuf_exit(&buf2);
140         printbuf_exit(&buf1);
141         return ret;
142 }
143
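/*
 * Convert a btree node's key to the btree_ptr_v2 format (copying it verbatim
 * if it already is v2), so the repair helpers below can adjust min_key/max_key
 * and set the RANGE_UPDATED flag:
 */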
144 static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
145 {
146         switch (b->key.k.type) {
147         case KEY_TYPE_btree_ptr: {
148                 struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
149
150                 dst->k.p                = src->k.p;
151                 dst->v.mem_ptr          = 0;
152                 dst->v.seq              = b->data->keys.seq;
153                 dst->v.sectors_written  = 0;
154                 dst->v.flags            = 0;
155                 dst->v.min_key          = b->data->min_key;
156                 set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
157                 memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
158                 break;
159         }
160         case KEY_TYPE_btree_ptr_v2:
161                 bkey_copy(&dst->k_i, &b->key);
162                 break;
163         default:
164                 BUG();
165         }
166 }
167
168 static void bch2_btree_node_update_key_early(struct btree_trans *trans,
169                                              enum btree_id btree, unsigned level,
170                                              struct bkey_s_c old, struct bkey_i *new)
171 {
172         struct bch_fs *c = trans->c;
173         struct btree *b;
174         struct bkey_buf tmp;
175         int ret;
176
177         bch2_bkey_buf_init(&tmp);
178         bch2_bkey_buf_reassemble(&tmp, c, old);
179
180         b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
181         if (!IS_ERR_OR_NULL(b)) {
182                 mutex_lock(&c->btree_cache.lock);
183
184                 bch2_btree_node_hash_remove(&c->btree_cache, b);
185
186                 bkey_copy(&b->key, new);
187                 ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
188                 BUG_ON(ret);
189
190                 mutex_unlock(&c->btree_cache.lock);
191                 six_unlock_read(&b->c.lock);
192         }
193
194         bch2_bkey_buf_exit(&tmp, c);
195 }
196
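/*
 * set_node_min()/set_node_max() repair a node's boundaries: they rewrite the
 * node's key with the corrected min_key or max_key, insert the new key into
 * the journal keys for the parent level, and drop any keys that now fall
 * outside the node. Changing max_key changes the key's position, so
 * set_node_max() also re-hashes the node in the btree node cache.
 */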
197 static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
198 {
199         struct bkey_i_btree_ptr_v2 *new;
200         int ret;
201
202         new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
203         if (!new)
204                 return -BCH_ERR_ENOMEM_gc_repair_key;
205
206         btree_ptr_to_v2(b, new);
207         b->data->min_key        = new_min;
208         new->v.min_key          = new_min;
209         SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
210
211         ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
212         if (ret) {
213                 kfree(new);
214                 return ret;
215         }
216
217         bch2_btree_node_drop_keys_outside_node(b);
218         bkey_copy(&b->key, &new->k_i);
219         return 0;
220 }
221
222 static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
223 {
224         struct bkey_i_btree_ptr_v2 *new;
225         int ret;
226
227         ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
228         if (ret)
229                 return ret;
230
231         new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
232         if (!new)
233                 return -BCH_ERR_ENOMEM_gc_repair_key;
234
235         btree_ptr_to_v2(b, new);
236         b->data->max_key        = new_max;
237         new->k.p                = new_max;
238         SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
239
240         ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
241         if (ret) {
242                 kfree(new);
243                 return ret;
244         }
245
246         bch2_btree_node_drop_keys_outside_node(b);
247
248         mutex_lock(&c->btree_cache.lock);
249         bch2_btree_node_hash_remove(&c->btree_cache, b);
250
251         bkey_copy(&b->key, &new->k_i);
252         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
253         BUG_ON(ret);
254         mutex_unlock(&c->btree_cache.lock);
255         return 0;
256 }
257
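/*
 * Check that child node cur begins exactly where the previous child ended
 * (or at the parent's min_key). When nodes overlap, the one with the newer
 * sequence number wins: the other is either dropped (DROP_PREV_NODE/
 * DROP_THIS_NODE) or has its boundary rewritten via set_node_min()/
 * set_node_max().
 */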
258 static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
259                                         struct btree *prev, struct btree *cur)
260 {
261         struct bpos expected_start = !prev
262                 ? b->data->min_key
263                 : bpos_successor(prev->key.k.p);
264         struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
265         int ret = 0;
266
267         if (!prev) {
268                 prt_printf(&buf1, "start of node: ");
269                 bch2_bpos_to_text(&buf1, b->data->min_key);
270         } else {
271                 bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
272         }
273
274         bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
275
276         if (prev &&
277             bpos_gt(expected_start, cur->data->min_key) &&
278             BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
279                 /* cur overwrites prev: */
280
281                 if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
282                                                 cur->data->min_key), c,
283                                 "btree node overwritten by next node at btree %s level %u:\n"
284                                 "  node %s\n"
285                                 "  next %s",
286                                 bch2_btree_ids[b->c.btree_id], b->c.level,
287                                 buf1.buf, buf2.buf)) {
288                         ret = DROP_PREV_NODE;
289                         goto out;
290                 }
291
292                 if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
293                                                  bpos_predecessor(cur->data->min_key)), c,
294                                 "btree node with incorrect max_key at btree %s level %u:\n"
295                                 "  node %s\n"
296                                 "  next %s",
297                                 bch2_btree_ids[b->c.btree_id], b->c.level,
298                                 buf1.buf, buf2.buf))
299                         ret = set_node_max(c, prev,
300                                            bpos_predecessor(cur->data->min_key));
301         } else {
302                 /* prev overwrites cur: */
303
304                 if (mustfix_fsck_err_on(bpos_ge(expected_start,
305                                                 cur->data->max_key), c,
306                                 "btree node overwritten by prev node at btree %s level %u:\n"
307                                 "  prev %s\n"
308                                 "  node %s",
309                                 bch2_btree_ids[b->c.btree_id], b->c.level,
310                                 buf1.buf, buf2.buf)) {
311                         ret = DROP_THIS_NODE;
312                         goto out;
313                 }
314
315                 if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
316                                 "btree node with incorrect min_key at btree %s level %u:\n"
317                                 "  prev %s\n"
318                                 "  node %s",
319                                 bch2_btree_ids[b->c.btree_id], b->c.level,
320                                 buf1.buf, buf2.buf))
321                         ret = set_node_min(c, cur, expected_start);
322         }
323 out:
324 fsck_err:
325         printbuf_exit(&buf2);
326         printbuf_exit(&buf1);
327         return ret;
328 }
329
330 static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
331                                  struct btree *child)
332 {
333         struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
334         int ret = 0;
335
336         bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
337         bch2_bpos_to_text(&buf2, b->key.k.p);
338
339         if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
340                         "btree node with incorrect max_key at btree %s level %u:\n"
341                         "  %s\n"
342                         "  expected %s",
343                         bch2_btree_ids[b->c.btree_id], b->c.level,
344                         buf1.buf, buf2.buf)) {
345                 ret = set_node_max(c, child, b->key.k.p);
346                 if (ret)
347                         goto err;
348         }
349 err:
350 fsck_err:
351         printbuf_exit(&buf2);
352         printbuf_exit(&buf1);
353         return ret;
354 }
355
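/*
 * Walk an interior node's children twice: the first pass drops unreadable
 * children and repairs boundary mismatches between siblings, the second pass
 * recurses into each child. If every child ends up dropped, DROP_THIS_NODE is
 * returned so the caller can drop this node as well.
 */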
356 static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
357 {
358         struct bch_fs *c = trans->c;
359         struct btree_and_journal_iter iter;
360         struct bkey_s_c k;
361         struct bkey_buf prev_k, cur_k;
362         struct btree *prev = NULL, *cur = NULL;
363         bool have_child, dropped_children = false;
364         struct printbuf buf = PRINTBUF;
365         int ret = 0;
366
367         if (!b->c.level)
368                 return 0;
369 again:
370         prev = NULL;
371         have_child = dropped_children = false;
372         bch2_bkey_buf_init(&prev_k);
373         bch2_bkey_buf_init(&cur_k);
374         bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
375
376         while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
377                 BUG_ON(bpos_lt(k.k->p, b->data->min_key));
378                 BUG_ON(bpos_gt(k.k->p, b->data->max_key));
379
380                 bch2_btree_and_journal_iter_advance(&iter);
381                 bch2_bkey_buf_reassemble(&cur_k, c, k);
382
383                 cur = bch2_btree_node_get_noiter(trans, cur_k.k,
384                                         b->c.btree_id, b->c.level - 1,
385                                         false);
386                 ret = PTR_ERR_OR_ZERO(cur);
387
388                 printbuf_reset(&buf);
389                 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
390
391                 if (mustfix_fsck_err_on(ret == -EIO, c,
392                                 "Topology repair: unreadable btree node at btree %s level %u:\n"
393                                 "  %s",
394                                 bch2_btree_ids[b->c.btree_id],
395                                 b->c.level - 1,
396                                 buf.buf)) {
397                         bch2_btree_node_evict(trans, cur_k.k);
398                         ret = bch2_journal_key_delete(c, b->c.btree_id,
399                                                       b->c.level, cur_k.k->k.p);
400                         cur = NULL;
401                         if (ret)
402                                 break;
403                         continue;
404                 }
405
406                 if (ret) {
407                         bch_err_msg(c, ret, "getting btree node");
408                         break;
409                 }
410
411                 ret = btree_repair_node_boundaries(c, b, prev, cur);
412
413                 if (ret == DROP_THIS_NODE) {
414                         six_unlock_read(&cur->c.lock);
415                         bch2_btree_node_evict(trans, cur_k.k);
416                         ret = bch2_journal_key_delete(c, b->c.btree_id,
417                                                       b->c.level, cur_k.k->k.p);
418                         cur = NULL;
419                         if (ret)
420                                 break;
421                         continue;
422                 }
423
424                 if (prev)
425                         six_unlock_read(&prev->c.lock);
426                 prev = NULL;
427
428                 if (ret == DROP_PREV_NODE) {
429                         bch2_btree_node_evict(trans, prev_k.k);
430                         ret = bch2_journal_key_delete(c, b->c.btree_id,
431                                                       b->c.level, prev_k.k->k.p);
432                         if (ret)
433                                 break;
434
435                         bch2_btree_and_journal_iter_exit(&iter);
436                         bch2_bkey_buf_exit(&prev_k, c);
437                         bch2_bkey_buf_exit(&cur_k, c);
438                         goto again;
439                 } else if (ret)
440                         break;
441
442                 prev = cur;
443                 cur = NULL;
444                 bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
445         }
446
447         if (!ret && !IS_ERR_OR_NULL(prev)) {
448                 BUG_ON(cur);
449                 ret = btree_repair_node_end(c, b, prev);
450         }
451
452         if (!IS_ERR_OR_NULL(prev))
453                 six_unlock_read(&prev->c.lock);
454         prev = NULL;
455         if (!IS_ERR_OR_NULL(cur))
456                 six_unlock_read(&cur->c.lock);
457         cur = NULL;
458
459         if (ret)
460                 goto err;
461
462         bch2_btree_and_journal_iter_exit(&iter);
463         bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
464
465         while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
466                 bch2_bkey_buf_reassemble(&cur_k, c, k);
467                 bch2_btree_and_journal_iter_advance(&iter);
468
469                 cur = bch2_btree_node_get_noiter(trans, cur_k.k,
470                                         b->c.btree_id, b->c.level - 1,
471                                         false);
472                 ret = PTR_ERR_OR_ZERO(cur);
473
474                 if (ret) {
475                         bch_err_msg(c, ret, "getting btree node");
476                         goto err;
477                 }
478
479                 ret = bch2_btree_repair_topology_recurse(trans, cur);
480                 six_unlock_read(&cur->c.lock);
481                 cur = NULL;
482
483                 if (ret == DROP_THIS_NODE) {
484                         bch2_btree_node_evict(trans, cur_k.k);
485                         ret = bch2_journal_key_delete(c, b->c.btree_id,
486                                                       b->c.level, cur_k.k->k.p);
487                         dropped_children = true;
488                 }
489
490                 if (ret)
491                         goto err;
492
493                 have_child = true;
494         }
495
496         printbuf_reset(&buf);
497         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
498
499         if (mustfix_fsck_err_on(!have_child, c,
500                         "empty interior btree node at btree %s level %u\n"
501                         "  %s",
502                         bch2_btree_ids[b->c.btree_id],
503                         b->c.level, buf.buf))
504                 ret = DROP_THIS_NODE;
505 err:
506 fsck_err:
507         if (!IS_ERR_OR_NULL(prev))
508                 six_unlock_read(&prev->c.lock);
509         if (!IS_ERR_OR_NULL(cur))
510                 six_unlock_read(&cur->c.lock);
511
512         bch2_btree_and_journal_iter_exit(&iter);
513         bch2_bkey_buf_exit(&prev_k, c);
514         bch2_bkey_buf_exit(&cur_k, c);
515
516         if (!ret && dropped_children)
517                 goto again;
518
519         printbuf_exit(&buf);
520         return ret;
521 }
522
523 static int bch2_repair_topology(struct bch_fs *c)
524 {
525         struct btree_trans trans;
526         struct btree *b;
527         unsigned i;
528         int ret = 0;
529
530         bch2_trans_init(&trans, c, 0, 0);
531
532         for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
533                 struct btree_root *r = bch2_btree_id_root(c, i);
534
535                 if (!r->alive)
536                         continue;
537
538                 b = r->b;
539                 if (btree_node_fake(b))
540                         continue;
541
542                 btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
543                 ret = bch2_btree_repair_topology_recurse(&trans, b);
544                 six_unlock_read(&b->c.lock);
545
546                 if (ret == DROP_THIS_NODE) {
547                         bch_err(c, "empty btree root - repair unimplemented");
548                         ret = -BCH_ERR_fsck_repair_unimplemented;
549                 }
550         }
551
552         bch2_trans_exit(&trans);
553
554         return ret;
555 }
556
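/*
 * Compare each pointer in a key against the in-memory GC bucket state: bucket
 * gens that are missing, in the future or too stale, mismatched data types,
 * and dangling stripe references. Simple problems are fixed by updating the
 * bucket; otherwise the key is rewritten with the bad pointers dropped (btree
 * node pointers are never dropped, only their gens updated) and the result is
 * inserted into the journal keys.
 */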
557 static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
558                                unsigned level, bool is_root,
559                                struct bkey_s_c *k)
560 {
561         struct bch_fs *c = trans->c;
562         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
563         const union bch_extent_entry *entry;
564         struct extent_ptr_decoded p = { 0 };
565         bool do_update = false;
566         struct printbuf buf = PRINTBUF;
567         int ret = 0;
568
569         /*
570          * XXX
571          * use check_bucket_ref here
572          */
573         bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
574                 struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
575                 struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
576                 enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
577
578                 if (!g->gen_valid &&
579                     (c->opts.reconstruct_alloc ||
580                      fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
581                               "while marking %s",
582                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
583                               bch2_data_types[ptr_data_type(k->k, &p.ptr)],
584                               p.ptr.gen,
585                               (printbuf_reset(&buf),
586                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
587                         if (!p.ptr.cached) {
588                                 g->gen_valid            = true;
589                                 g->gen                  = p.ptr.gen;
590                         } else {
591                                 do_update = true;
592                         }
593                 }
594
595                 if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
596                     (c->opts.reconstruct_alloc ||
597                      fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
598                               "while marking %s",
599                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
600                               bch2_data_types[ptr_data_type(k->k, &p.ptr)],
601                               p.ptr.gen, g->gen,
602                               (printbuf_reset(&buf),
603                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
604                         if (!p.ptr.cached) {
605                                 g->gen_valid            = true;
606                                 g->gen                  = p.ptr.gen;
607                                 g->data_type            = 0;
608                                 g->dirty_sectors        = 0;
609                                 g->cached_sectors       = 0;
610                                 set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
611                         } else {
612                                 do_update = true;
613                         }
614                 }
615
616                 if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
617                     (c->opts.reconstruct_alloc ||
618                      fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
619                               "while marking %s",
620                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
621                               bch2_data_types[ptr_data_type(k->k, &p.ptr)],
622                               p.ptr.gen,
623                               (printbuf_reset(&buf),
624                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
625                         do_update = true;
626
627                 if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
628                     (c->opts.reconstruct_alloc ||
629                      fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
630                               "while marking %s",
631                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
632                               bch2_data_types[ptr_data_type(k->k, &p.ptr)],
633                               p.ptr.gen, g->gen,
634                               (printbuf_reset(&buf),
635                                bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
636                         do_update = true;
637
638                 if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
639                         continue;
640
641                 if (fsck_err_on(bucket_data_type(g->data_type) &&
642                                 bucket_data_type(g->data_type) != data_type, c,
643                                 "bucket %u:%zu different types of data in same bucket: %s, %s\n"
644                                 "while marking %s",
645                                 p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
646                                 bch2_data_types[g->data_type],
647                                 bch2_data_types[data_type],
648                                 (printbuf_reset(&buf),
649                                  bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
650                         if (data_type == BCH_DATA_btree) {
651                                 g->data_type    = data_type;
652                                 set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
653                         } else {
654                                 do_update = true;
655                         }
656                 }
657
658                 if (p.has_ec) {
659                         struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
660
661                         if (fsck_err_on(!m || !m->alive, c,
662                                         "pointer to nonexistent stripe %llu\n"
663                                         "while marking %s",
664                                         (u64) p.ec.idx,
665                                         (printbuf_reset(&buf),
666                                          bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
667                                 do_update = true;
668
669                         if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
670                                         "pointer does not match stripe %llu\n"
671                                         "while marking %s",
672                                         (u64) p.ec.idx,
673                                         (printbuf_reset(&buf),
674                                          bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
675                                 do_update = true;
676                 }
677         }
678
679         if (do_update) {
680                 struct bkey_ptrs ptrs;
681                 union bch_extent_entry *entry;
682                 struct bch_extent_ptr *ptr;
683                 struct bkey_i *new;
684
685                 if (is_root) {
686                         bch_err(c, "cannot update btree roots yet");
687                         ret = -EINVAL;
688                         goto err;
689                 }
690
691                 new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
692                 if (!new) {
693                         ret = -BCH_ERR_ENOMEM_gc_repair_key;
694                         bch_err_msg(c, ret, "allocating new key");
695                         goto err;
696                 }
697
698                 bkey_reassemble(new, *k);
699
700                 if (level) {
701                         /*
702                          * We don't want to drop btree node pointers - if the
703                          * btree node isn't there anymore, the read path will
704                          * sort it out:
705                          */
706                         ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
707                         bkey_for_each_ptr(ptrs, ptr) {
708                                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
709                                 struct bucket *g = PTR_GC_BUCKET(ca, ptr);
710
711                                 ptr->gen = g->gen;
712                         }
713                 } else {
714                         bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
715                                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
716                                 struct bucket *g = PTR_GC_BUCKET(ca, ptr);
717                                 enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
718
719                                 (ptr->cached &&
720                                  (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
721                                 (!ptr->cached &&
722                                  gen_cmp(ptr->gen, g->gen) < 0) ||
723                                 gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
724                                 (g->data_type &&
725                                  g->data_type != data_type);
726                         }));
727 again:
728                         ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
729                         bkey_extent_entry_for_each(ptrs, entry) {
730                                 if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
731                                         struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
732                                                                         entry->stripe_ptr.idx);
733                                         union bch_extent_entry *next_ptr;
734
735                                         bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
736                                                 if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
737                                                         goto found;
738                                         next_ptr = NULL;
739 found:
740                                         if (!next_ptr) {
741                                                 bch_err(c, "aieee, found stripe ptr with no data ptr");
742                                                 continue;
743                                         }
744
745                                         if (!m || !m->alive ||
746                                             !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
747                                                                        &next_ptr->ptr,
748                                                                        m->sectors)) {
749                                                 bch2_bkey_extent_entry_drop(new, entry);
750                                                 goto again;
751                                         }
752                                 }
753                         }
754                 }
755
756                 ret = bch2_journal_key_insert_take(c, btree_id, level, new);
757                 if (ret) {
758                         kfree(new);
759                         goto err;
760                 }
761
762                 if (level)
763                         bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
764
765                 if (0) {
766                         printbuf_reset(&buf);
767                         bch2_bkey_val_to_text(&buf, c, *k);
768                         bch_info(c, "updated %s", buf.buf);
769
770                         printbuf_reset(&buf);
771                         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
772                         bch_info(c, "new key %s", buf.buf);
773                 }
774
775                 *k = bkey_i_to_s_c(new);
776         }
777 err:
778 fsck_err:
779         printbuf_exit(&buf);
780         return ret;
781 }
782
783 /* marking of btree keys/nodes: */
784
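/*
 * Mark a single key for GC. In initial GC we first repair pointers via
 * bch2_check_fix_ptrs() and check that key version numbers aren't ahead of
 * what has been recorded; marking itself goes through bch2_mark_key() with
 * the BTREE_TRIGGER_GC flag.
 */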
785 static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
786                             unsigned level, bool is_root,
787                             struct bkey_s_c *k,
788                             bool initial)
789 {
790         struct bch_fs *c = trans->c;
791         struct bkey deleted = KEY(0, 0, 0);
792         struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
793         unsigned flags =
794                 BTREE_TRIGGER_GC|
795                 (initial ? BTREE_TRIGGER_NOATOMIC : 0);
796         int ret = 0;
797
798         deleted.p = k->k->p;
799
800         if (initial) {
801                 BUG_ON(bch2_journal_seq_verify &&
802                        k->k->version.lo > atomic64_read(&c->journal.seq));
803
804                 ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
805                 if (ret)
806                         goto err;
807
808                 if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
809                                 "key version number higher than recorded: %llu > %llu",
810                                 k->k->version.lo,
811                                 atomic64_read(&c->key_version)))
812                         atomic64_set(&c->key_version, k->k->version.lo);
813         }
814
815         ret = commit_do(trans, NULL, NULL, 0,
816                         bch2_mark_key(trans, btree_id, level, old, *k, flags));
817 fsck_err:
818 err:
819         if (ret)
820                 bch_err_fn(c, ret);
821         return ret;
822 }
823
824 static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
825 {
826         struct bch_fs *c = trans->c;
827         struct btree_node_iter iter;
828         struct bkey unpacked;
829         struct bkey_s_c k;
830         struct bkey_buf prev, cur;
831         int ret = 0;
832
833         if (!btree_node_type_needs_gc(btree_node_type(b)))
834                 return 0;
835
836         bch2_btree_node_iter_init_from_start(&iter, b);
837         bch2_bkey_buf_init(&prev);
838         bch2_bkey_buf_init(&cur);
839         bkey_init(&prev.k->k);
840
841         while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
842                 ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
843                                        &k, initial);
844                 if (ret)
845                         break;
846
847                 bch2_btree_node_iter_advance(&iter, b);
848
849                 if (b->c.level) {
850                         bch2_bkey_buf_reassemble(&cur, c, k);
851
852                         ret = bch2_gc_check_topology(c, b, &prev, cur,
853                                         bch2_btree_node_iter_end(&iter));
854                         if (ret)
855                                 break;
856                 }
857         }
858
859         bch2_bkey_buf_exit(&cur, c);
860         bch2_bkey_buf_exit(&prev, c);
861         return ret;
862 }
863
864 static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
865                          bool initial, bool metadata_only)
866 {
867         struct bch_fs *c = trans->c;
868         struct btree_iter iter;
869         struct btree *b;
870         unsigned depth = metadata_only ? 1 : 0;
871         int ret = 0;
872
873         gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
874
875         __for_each_btree_node(trans, iter, btree_id, POS_MIN,
876                               0, depth, BTREE_ITER_PREFETCH, b, ret) {
877                 bch2_verify_btree_nr_keys(b);
878
879                 gc_pos_set(c, gc_pos_btree_node(b));
880
881                 ret = btree_gc_mark_node(trans, b, initial);
882                 if (ret)
883                         break;
884         }
885         bch2_trans_iter_exit(trans, &iter);
886
887         if (ret)
888                 return ret;
889
890         mutex_lock(&c->btree_root_lock);
891         b = bch2_btree_id_root(c, btree_id)->b;
892         if (!btree_node_fake(b)) {
893                 struct bkey_s_c k = bkey_i_to_s_c(&b->key);
894
895                 ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
896                                        true, &k, initial);
897         }
898         gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
899         mutex_unlock(&c->btree_root_lock);
900
901         return ret;
902 }
903
904 static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
905                                       unsigned target_depth)
906 {
907         struct bch_fs *c = trans->c;
908         struct btree_and_journal_iter iter;
909         struct bkey_s_c k;
910         struct bkey_buf cur, prev;
911         struct printbuf buf = PRINTBUF;
912         int ret = 0;
913
914         bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
915         bch2_bkey_buf_init(&prev);
916         bch2_bkey_buf_init(&cur);
917         bkey_init(&prev.k->k);
918
919         while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
920                 BUG_ON(bpos_lt(k.k->p, b->data->min_key));
921                 BUG_ON(bpos_gt(k.k->p, b->data->max_key));
922
923                 ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
924                                        false, &k, true);
925                 if (ret)
926                         goto fsck_err;
927
928                 if (b->c.level) {
929                         bch2_bkey_buf_reassemble(&cur, c, k);
930                         k = bkey_i_to_s_c(cur.k);
931
932                         bch2_btree_and_journal_iter_advance(&iter);
933
934                         ret = bch2_gc_check_topology(c, b,
935                                         &prev, cur,
936                                         !bch2_btree_and_journal_iter_peek(&iter).k);
937                         if (ret)
938                                 goto fsck_err;
939                 } else {
940                         bch2_btree_and_journal_iter_advance(&iter);
941                 }
942         }
943
944         if (b->c.level > target_depth) {
945                 bch2_btree_and_journal_iter_exit(&iter);
946                 bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
947
948                 while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
949                         struct btree *child;
950
951                         bch2_bkey_buf_reassemble(&cur, c, k);
952                         bch2_btree_and_journal_iter_advance(&iter);
953
954                         child = bch2_btree_node_get_noiter(trans, cur.k,
955                                                 b->c.btree_id, b->c.level - 1,
956                                                 false);
957                         ret = PTR_ERR_OR_ZERO(child);
958
959                         if (ret == -EIO) {
960                                 bch2_topology_error(c);
961
962                                 if (__fsck_err(c,
963                                           FSCK_CAN_FIX|
964                                           FSCK_CAN_IGNORE|
965                                           FSCK_NO_RATELIMIT,
966                                           "Unreadable btree node at btree %s level %u:\n"
967                                           "  %s",
968                                           bch2_btree_ids[b->c.btree_id],
969                                           b->c.level - 1,
970                                           (printbuf_reset(&buf),
971                                            bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
972                                     !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
973                                         ret = -BCH_ERR_need_topology_repair;
974                                         bch_info(c, "Halting mark and sweep to start topology repair pass");
975                                         goto fsck_err;
976                                 } else {
977                                         /* Continue marking when opted to not
978                                          * fix the error: */
979                                         ret = 0;
980                                         set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
981                                         continue;
982                                 }
983                         } else if (ret) {
984                                 bch_err_msg(c, ret, "getting btree node");
985                                 break;
986                         }
987
988                         ret = bch2_gc_btree_init_recurse(trans, child,
989                                                          target_depth);
990                         six_unlock_read(&child->c.lock);
991
992                         if (ret)
993                                 break;
994                 }
995         }
996 fsck_err:
997         bch2_bkey_buf_exit(&cur, c);
998         bch2_bkey_buf_exit(&prev, c);
999         bch2_btree_and_journal_iter_exit(&iter);
1000         printbuf_exit(&buf);
1001         return ret;
1002 }
1003
1004 static int bch2_gc_btree_init(struct btree_trans *trans,
1005                               enum btree_id btree_id,
1006                               bool metadata_only)
1007 {
1008         struct bch_fs *c = trans->c;
1009         struct btree *b;
1010         unsigned target_depth = metadata_only ? 1 : 0;
1011         struct printbuf buf = PRINTBUF;
1012         int ret = 0;
1013
1014         b = bch2_btree_id_root(c, btree_id)->b;
1015
1016         if (btree_node_fake(b))
1017                 return 0;
1018
1019         six_lock_read(&b->c.lock, NULL, NULL);
1020         printbuf_reset(&buf);
1021         bch2_bpos_to_text(&buf, b->data->min_key);
1022         if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
1023                         "btree root with incorrect min_key: %s", buf.buf)) {
1024                 bch_err(c, "repair unimplemented");
1025                 ret = -BCH_ERR_fsck_repair_unimplemented;
1026                 goto fsck_err;
1027         }
1028
1029         printbuf_reset(&buf);
1030         bch2_bpos_to_text(&buf, b->data->max_key);
1031         if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
1032                         "btree root with incorrect max_key: %s", buf.buf)) {
1033                 bch_err(c, "repair unimplemented");
1034                 ret = -BCH_ERR_fsck_repair_unimplemented;
1035                 goto fsck_err;
1036         }
1037
1038         if (b->c.level >= target_depth)
1039                 ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
1040
1041         if (!ret) {
1042                 struct bkey_s_c k = bkey_i_to_s_c(&b->key);
1043
1044                 ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
1045                                        &k, true);
1046         }
1047 fsck_err:
1048         six_unlock_read(&b->c.lock);
1049
1050         if (ret < 0)
1051                 bch_err_fn(c, ret);
1052         printbuf_exit(&buf);
1053         return ret;
1054 }
1055
1056 static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
1057 {
1058         return  (int) btree_id_to_gc_phase(l) -
1059                 (int) btree_id_to_gc_phase(r);
1060 }
1061
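/*
 * Mark every btree: ids are sorted by GC phase first so that the GC position
 * only ever advances (see gc_pos_set()); btrees beyond BTREE_ID_NR whose
 * roots are still alive are walked afterwards.
 */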
1062 static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
1063 {
1064         struct btree_trans trans;
1065         enum btree_id ids[BTREE_ID_NR];
1066         unsigned i;
1067         int ret = 0;
1068
1069         bch2_trans_init(&trans, c, 0, 0);
1070
1071         if (initial)
1072                 trans.is_initial_gc = true;
1073
1074         for (i = 0; i < BTREE_ID_NR; i++)
1075                 ids[i] = i;
1076         bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
1077
1078         for (i = 0; i < BTREE_ID_NR && !ret; i++)
1079                 ret = initial
1080                         ? bch2_gc_btree_init(&trans, ids[i], metadata_only)
1081                         : bch2_gc_btree(&trans, ids[i], initial, metadata_only);
1082
1083         for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
1084                 if (!bch2_btree_id_root(c, i)->alive)
1085                         continue;
1086
1087                 ret = initial
1088                         ? bch2_gc_btree_init(&trans, i, metadata_only)
1089                         : bch2_gc_btree(&trans, i, initial, metadata_only);
1090         }
1091
1092         if (ret < 0)
1093                 bch_err_fn(c, ret);
1094
1095         bch2_trans_exit(&trans);
1096         return ret;
1097 }
1098
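/*
 * Mark a range of sectors as metadata, one bucket at a time (a superblock
 * range may span several buckets):
 */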
1099 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
1100                                   u64 start, u64 end,
1101                                   enum bch_data_type type,
1102                                   unsigned flags)
1103 {
1104         u64 b = sector_to_bucket(ca, start);
1105
1106         do {
1107                 unsigned sectors =
1108                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
1109
1110                 bch2_mark_metadata_bucket(c, ca, b, type, sectors,
1111                                           gc_phase(GC_PHASE_SB), flags);
1112                 b++;
1113                 start += sectors;
1114         } while (start < end);
1115 }
1116
1117 static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
1118                                      unsigned flags)
1119 {
1120         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
1121         unsigned i;
1122         u64 b;
1123
1124         for (i = 0; i < layout->nr_superblocks; i++) {
1125                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
1126
1127                 if (offset == BCH_SB_SECTOR)
1128                         mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
1129                                               BCH_DATA_sb, flags);
1130
1131                 mark_metadata_sectors(c, ca, offset,
1132                                       offset + (1 << layout->sb_max_size_bits),
1133                                       BCH_DATA_sb, flags);
1134         }
1135
1136         for (i = 0; i < ca->journal.nr; i++) {
1137                 b = ca->journal.buckets[i];
1138                 bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
1139                                           ca->mi.bucket_size,
1140                                           gc_phase(GC_PHASE_SB), flags);
1141         }
1142 }
1143
1144 static void bch2_mark_superblocks(struct bch_fs *c)
1145 {
1146         struct bch_dev *ca;
1147         unsigned i;
1148
1149         mutex_lock(&c->sb_lock);
1150         gc_pos_set(c, gc_phase(GC_PHASE_SB));
1151
1152         for_each_online_member(ca, c, i)
1153                 bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
1154         mutex_unlock(&c->sb_lock);
1155 }
1156
1157 #if 0
1158 /* Also see bch2_pending_btree_node_free_insert_done() */
1159 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
1160 {
1161         struct btree_update *as;
1162         struct pending_btree_node_free *d;
1163
1164         mutex_lock(&c->btree_interior_update_lock);
1165         gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
1166
1167         for_each_pending_btree_node_free(c, as, d)
1168                 if (d->index_update_done)
1169                         bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
1170
1171         mutex_unlock(&c->btree_interior_update_lock);
1172 }
1173 #endif
1174
1175 static void bch2_gc_free(struct bch_fs *c)
1176 {
1177         struct bch_dev *ca;
1178         unsigned i;
1179
1180         genradix_free(&c->reflink_gc_table);
1181         genradix_free(&c->gc_stripes);
1182
1183         for_each_member_device(ca, c, i) {
1184                 kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
1185                         sizeof(struct bucket_array) +
1186                         ca->mi.nbuckets * sizeof(struct bucket));
1187                 ca->buckets_gc = NULL;
1188
1189                 free_percpu(ca->usage_gc);
1190                 ca->usage_gc = NULL;
1191         }
1192
1193         free_percpu(c->usage_gc);
1194         c->usage_gc = NULL;
1195 }
1196
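/*
 * Compare the usage GC just computed against the filesystem's live counters.
 * With verify set, each mismatch is reported through fsck_err() before the
 * stale value is overwritten with the GC result.
 */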
1197 static int bch2_gc_done(struct bch_fs *c,
1198                         bool initial, bool metadata_only)
1199 {
1200         struct bch_dev *ca = NULL;
1201         struct printbuf buf = PRINTBUF;
1202         bool verify = !metadata_only &&
1203                 !c->opts.reconstruct_alloc &&
1204                 (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
1205         unsigned i, dev;
1206         int ret = 0;
1207
1208         percpu_down_write(&c->mark_lock);
1209
1210 #define copy_field(_f, _msg, ...)                                       \
1211         if (dst->_f != src->_f &&                                       \
1212             (!verify ||                                                 \
1213              fsck_err(c, _msg ": got %llu, should be %llu"              \
1214                       , ##__VA_ARGS__, dst->_f, src->_f)))              \
1215                 dst->_f = src->_f
1216 #define copy_stripe_field(_f, _msg, ...)                                \
1217         if (dst->_f != src->_f &&                                       \
1218             (!verify ||                                                 \
1219              fsck_err(c, "stripe %zu has wrong "_msg                    \
1220                       ": got %u, should be %u",                         \
1221                       iter.pos, ##__VA_ARGS__,                          \
1222                       dst->_f, src->_f)))                               \
1223                 dst->_f = src->_f
1224 #define copy_dev_field(_f, _msg, ...)                                   \
1225         copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
1226 #define copy_fs_field(_f, _msg, ...)                                    \
1227         copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
1228
1229         for (i = 0; i < ARRAY_SIZE(c->usage); i++)
1230                 bch2_fs_usage_acc_to_base(c, i);
1231
1232         for_each_member_device(ca, c, dev) {
1233                 struct bch_dev_usage *dst = ca->usage_base;
1234                 struct bch_dev_usage *src = (void *)
1235                         bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
1236                                              dev_usage_u64s());
1237
1238                 copy_dev_field(buckets_ec,              "buckets_ec");
1239
1240                 for (i = 0; i < BCH_DATA_NR; i++) {
1241                         copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
1242                         copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
1243                         copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
1244                 }
1245         }
1246
1247         {
1248                 unsigned nr = fs_usage_u64s(c);
1249                 struct bch_fs_usage *dst = c->usage_base;
1250                 struct bch_fs_usage *src = (void *)
1251                         bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
1252
1253                 copy_fs_field(hidden,           "hidden");
1254                 copy_fs_field(btree,            "btree");
1255
1256                 if (!metadata_only) {
1257                         copy_fs_field(data,     "data");
1258                         copy_fs_field(cached,   "cached");
1259                         copy_fs_field(reserved, "reserved");
1260                         copy_fs_field(nr_inodes,"nr_inodes");
1261
1262                         for (i = 0; i < BCH_REPLICAS_MAX; i++)
1263                                 copy_fs_field(persistent_reserved[i],
1264                                               "persistent_reserved[%i]", i);
1265                 }
1266
1267                 for (i = 0; i < c->replicas.nr; i++) {
1268                         struct bch_replicas_entry *e =
1269                                 cpu_replicas_entry(&c->replicas, i);
1270
1271                         if (metadata_only &&
1272                             (e->data_type == BCH_DATA_user ||
1273                              e->data_type == BCH_DATA_cached))
1274                                 continue;
1275
1276                         printbuf_reset(&buf);
1277                         bch2_replicas_entry_to_text(&buf, e);
1278
1279                         copy_fs_field(replicas[i], "%s", buf.buf);
1280                 }
1281         }
1282
1283 #undef copy_fs_field
1284 #undef copy_dev_field
1285 #undef copy_stripe_field
1286 #undef copy_field
1287 fsck_err:
1288         if (ca)
1289                 percpu_ref_put(&ca->ref);
1290         if (ret)
1291                 bch_err_fn(c, ret);
1292
1293         percpu_up_write(&c->mark_lock);
1294         printbuf_exit(&buf);
1295         return ret;
1296 }
1297
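/*
 * Allocate the percpu counters GC accumulates into (c->usage_gc and each
 * device's usage_gc), seeding every device's free-bucket count:
 */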
1298 static int bch2_gc_start(struct bch_fs *c)
1299 {
1300         struct bch_dev *ca = NULL;
1301         unsigned i;
1302
1303         BUG_ON(c->usage_gc);
1304
1305         c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
1306                                          sizeof(u64), GFP_KERNEL);
1307         if (!c->usage_gc) {
1308                 bch_err(c, "error allocating c->usage_gc");
1309                 return -BCH_ERR_ENOMEM_gc_start;
1310         }
1311
1312         for_each_member_device(ca, c, i) {
1313                 BUG_ON(ca->usage_gc);
1314
1315                 ca->usage_gc = alloc_percpu(struct bch_dev_usage);
1316                 if (!ca->usage_gc) {
1317                         bch_err(c, "error allocating ca->usage_gc");
1318                         percpu_ref_put(&ca->ref);
1319                         return -BCH_ERR_ENOMEM_gc_start;
1320                 }
1321
1322                 this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
1323                                ca->mi.nbuckets - ca->mi.first_bucket);
1324         }
1325
1326         return 0;
1327 }
1328
1329 static int bch2_gc_reset(struct bch_fs *c)
1330 {
1331         struct bch_dev *ca;
1332         unsigned i;
1333
1334         for_each_member_device(ca, c, i) {
1335                 free_percpu(ca->usage_gc);
1336                 ca->usage_gc = NULL;
1337         }
1338
1339         free_percpu(c->usage_gc);
1340         c->usage_gc = NULL;
1341
1342         return bch2_gc_start(c);
1343 }
1344
1345 /* returns true if not equal */
1346 static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
1347                                      struct bch_alloc_v4 r)
1348 {
1349         return  l.gen != r.gen                          ||
1350                 l.oldest_gen != r.oldest_gen            ||
1351                 l.data_type != r.data_type              ||
1352                 l.dirty_sectors != r.dirty_sectors      ||
1353                 l.cached_sectors != r.cached_sectors     ||
1354                 l.stripe_redundancy != r.stripe_redundancy ||
1355                 l.stripe != r.stripe;
1356 }
1357
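/*
 * Compare one alloc key against the bucket state GC reconstructed, and - if
 * they disagree and fsck says to fix it (or reconstruct_alloc is set) -
 * rewrite the key to match.  Returns 1 once the iterator is past the
 * device's last bucket, which ends the walk for that device.
 */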
1358 static int bch2_alloc_write_key(struct btree_trans *trans,
1359                                 struct btree_iter *iter,
1360                                 struct bkey_s_c k,
1361                                 bool metadata_only)
1362 {
1363         struct bch_fs *c = trans->c;
1364         struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
1365         struct bucket gc, *b;
1366         struct bkey_i_alloc_v4 *a;
1367         struct bch_alloc_v4 old_convert, new;
1368         const struct bch_alloc_v4 *old;
1369         enum bch_data_type type;
1370         int ret;
1371
1372         if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
1373                 return 1;
1374
1375         old = bch2_alloc_to_v4(k, &old_convert);
1376         new = *old;
1377
1378         percpu_down_read(&c->mark_lock);
1379         b = gc_bucket(ca, iter->pos.offset);
1380
1381         /*
1382          * b->data_type doesn't yet include need_discard & need_gc_gen states -
1383          * fix that here:
1384          */
1385         type = __alloc_data_type(b->dirty_sectors,
1386                                  b->cached_sectors,
1387                                  b->stripe,
1388                                  *old,
1389                                  b->data_type);
1390         if (b->data_type != type) {
1391                 struct bch_dev_usage *u;
1392
1393                 preempt_disable();
1394                 u = this_cpu_ptr(ca->usage_gc);
1395                 u->d[b->data_type].buckets--;
1396                 b->data_type = type;
1397                 u->d[b->data_type].buckets++;
1398                 preempt_enable();
1399         }
1400
1401         gc = *b;
1402         percpu_up_read(&c->mark_lock);
1403
1404         if (metadata_only &&
1405             gc.data_type != BCH_DATA_sb &&
1406             gc.data_type != BCH_DATA_journal &&
1407             gc.data_type != BCH_DATA_btree)
1408                 return 0;
1409
1410         if (gen_after(old->gen, gc.gen))
1411                 return 0;
1412
1413         if (c->opts.reconstruct_alloc ||
1414             fsck_err_on(new.data_type != gc.data_type, c,
1415                         "bucket %llu:%llu gen %u has wrong data_type"
1416                         ": got %s, should be %s",
1417                         iter->pos.inode, iter->pos.offset,
1418                         gc.gen,
1419                         bch2_data_types[new.data_type],
1420                         bch2_data_types[gc.data_type]))
1421                 new.data_type = gc.data_type;
1422
1423 #define copy_bucket_field(_f)                                           \
1424         if (c->opts.reconstruct_alloc ||                                \
1425             fsck_err_on(new._f != gc._f, c,                             \
1426                         "bucket %llu:%llu gen %u data type %s has wrong " #_f   \
1427                         ": got %u, should be %u",                       \
1428                         iter->pos.inode, iter->pos.offset,              \
1429                         gc.gen,                                         \
1430                         bch2_data_types[gc.data_type],                  \
1431                         new._f, gc._f))                                 \
1432                 new._f = gc._f;
1433
1434         copy_bucket_field(gen);
1435         copy_bucket_field(dirty_sectors);
1436         copy_bucket_field(cached_sectors);
1437         copy_bucket_field(stripe_redundancy);
1438         copy_bucket_field(stripe);
1439 #undef copy_bucket_field
1440
1441         if (!bch2_alloc_v4_cmp(*old, new))
1442                 return 0;
1443
1444         a = bch2_alloc_to_v4_mut(trans, k);
1445         ret = PTR_ERR_OR_ZERO(a);
1446         if (ret)
1447                 return ret;
1448
1449         a->v = new;
1450
1451         /*
1452          * The trigger normally makes sure this is set, but we're not running
1453          * triggers:
1454          */
1455         if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
1456                 a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
1457
1458         ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
1459 fsck_err:
1460         return ret;
1461 }
1462
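/*
 * Write out corrected allocation info: for each member device, walk its
 * slice of the alloc btree and commit the fixes from bch2_alloc_write_key().
 */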
1463 static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
1464 {
1465         struct btree_trans trans;
1466         struct btree_iter iter;
1467         struct bkey_s_c k;
1468         struct bch_dev *ca;
1469         unsigned i;
1470         int ret = 0;
1471
1472         bch2_trans_init(&trans, c, 0, 0);
1473
1474         for_each_member_device(ca, c, i) {
1475                 ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
1476                                 POS(ca->dev_idx, ca->mi.first_bucket),
1477                                 BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
1478                                 NULL, NULL, BTREE_INSERT_LAZY_RW,
1479                         bch2_alloc_write_key(&trans, &iter, k, metadata_only));
1480
1481                 if (ret < 0) {
1482                         bch_err(c, "error writing alloc info: %s", bch2_err_str(ret));
1483                         percpu_ref_put(&ca->ref);
1484                         break;
1485                 }
1486         }
1487
1488         bch2_trans_exit(&trans);
1489         return ret < 0 ? ret : 0;
1490 }
1491
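/*
 * Allocate the shadow bucket arrays (ca->buckets_gc) that GC marks into, and
 * seed each bucket's gen from the existing alloc keys.  In metadata_only
 * mode, user/cached/parity buckets also keep their existing counters, since
 * those references won't be re-walked.
 */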
1492 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
1493 {
1494         struct bch_dev *ca;
1495         struct btree_trans trans;
1496         struct btree_iter iter;
1497         struct bkey_s_c k;
1498         struct bucket *g;
1499         struct bch_alloc_v4 a_convert;
1500         const struct bch_alloc_v4 *a;
1501         unsigned i;
1502         int ret;
1503
1504         for_each_member_device(ca, c, i) {
1505                 struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
1506                                 ca->mi.nbuckets * sizeof(struct bucket),
1507                                 GFP_KERNEL|__GFP_ZERO);
1508                 if (!buckets) {
1509                         percpu_ref_put(&ca->ref);
1510                         bch_err(c, "error allocating ca->buckets[gc]");
1511                         return -BCH_ERR_ENOMEM_gc_alloc_start;
1512                 }
1513
1514                 buckets->first_bucket   = ca->mi.first_bucket;
1515                 buckets->nbuckets       = ca->mi.nbuckets;
1516                 rcu_assign_pointer(ca->buckets_gc, buckets);
1517         }
1518
1519         bch2_trans_init(&trans, c, 0, 0);
1520
1521         for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
1522                            BTREE_ITER_PREFETCH, k, ret) {
1523                 ca = bch_dev_bkey_exists(c, k.k->p.inode);
1524                 g = gc_bucket(ca, k.k->p.offset);
1525
1526                 a = bch2_alloc_to_v4(k, &a_convert);
1527
1528                 g->gen_valid    = 1;
1529                 g->gen          = a->gen;
1530
1531                 if (metadata_only &&
1532                     (a->data_type == BCH_DATA_user ||
1533                      a->data_type == BCH_DATA_cached ||
1534                      a->data_type == BCH_DATA_parity)) {
1535                         g->data_type            = a->data_type;
1536                         g->dirty_sectors        = a->dirty_sectors;
1537                         g->cached_sectors       = a->cached_sectors;
1538                         g->stripe               = a->stripe;
1539                         g->stripe_redundancy    = a->stripe_redundancy;
1540                 }
1541         }
1542         bch2_trans_iter_exit(&trans, &iter);
1543
1544         bch2_trans_exit(&trans);
1545
1546         if (ret)
1547                 bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret));
1548
1549         return ret;
1550 }
1551
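/*
 * Zero the shadow bucket counters before restarting a GC pass (buckets that
 * metadata_only mode doesn't re-mark are left untouched).
 */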
1552 static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
1553 {
1554         struct bch_dev *ca;
1555         unsigned i;
1556
1557         for_each_member_device(ca, c, i) {
1558                 struct bucket_array *buckets = gc_bucket_array(ca);
1559                 struct bucket *g;
1560
1561                 for_each_bucket(g, buckets) {
1562                         if (metadata_only &&
1563                             (g->data_type == BCH_DATA_user ||
1564                              g->data_type == BCH_DATA_cached ||
1565                              g->data_type == BCH_DATA_parity))
1566                                 continue;
1567                         g->data_type = 0;
1568                         g->dirty_sectors = 0;
1569                         g->cached_sectors = 0;
1570                 }
1571         }
1572 }
1573
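/*
 * Check one reflink key's refcount against the count GC accumulated in
 * c->reflink_gc_table, rewriting the key - or deleting it, if the correct
 * refcount is zero - when they disagree.
 */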
1574 static int bch2_gc_write_reflink_key(struct btree_trans *trans,
1575                                      struct btree_iter *iter,
1576                                      struct bkey_s_c k,
1577                                      size_t *idx)
1578 {
1579         struct bch_fs *c = trans->c;
1580         const __le64 *refcount = bkey_refcount_c(k);
1581         struct printbuf buf = PRINTBUF;
1582         struct reflink_gc *r;
1583         int ret = 0;
1584
1585         if (!refcount)
1586                 return 0;
1587
1588         while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
1589                r->offset < k.k->p.offset)
1590                 ++*idx;
1591
1592         if (!r ||
1593             r->offset != k.k->p.offset ||
1594             r->size != k.k->size) {
1595                 bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
1596                 return -EINVAL;
1597         }
1598
1599         if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
1600                         "reflink key has wrong refcount:\n"
1601                         "  %s\n"
1602                         "  should be %llu",
1603                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
1604                         r->refcount)) {
1605                 struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
1606
1607                 ret = PTR_ERR_OR_ZERO(new);
1608                 if (ret)
1609                         return ret;
1610
1611                 if (!r->refcount)
1612                         new->k.type = KEY_TYPE_deleted;
1613                 else
1614                         *bkey_refcount(new) = cpu_to_le64(r->refcount);
1615         }
1616 fsck_err:
1617         printbuf_exit(&buf);
1618         return ret;
1619 }
1620
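/*
 * Walk the reflink btree and repair any refcounts that don't match what GC
 * observed, via bch2_gc_write_reflink_key().
 */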
1621 static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
1622 {
1623         struct btree_trans trans;
1624         struct btree_iter iter;
1625         struct bkey_s_c k;
1626         size_t idx = 0;
1627         int ret = 0;
1628
1629         if (metadata_only)
1630                 return 0;
1631
1632         bch2_trans_init(&trans, c, 0, 0);
1633
1634         ret = for_each_btree_key_commit(&trans, iter,
1635                         BTREE_ID_reflink, POS_MIN,
1636                         BTREE_ITER_PREFETCH, k,
1637                         NULL, NULL, BTREE_INSERT_NOFAIL,
1638                 bch2_gc_write_reflink_key(&trans, &iter, k, &idx));
1639
1640         c->reflink_gc_nr = 0;
1641         bch2_trans_exit(&trans);
1642         return ret;
1643 }
1644
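/*
 * Build c->reflink_gc_table: one entry (offset, size, refcount initially 0)
 * per reflink extent, so that walking reflink pointers during GC can count
 * the references that actually exist.
 */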
1645 static int bch2_gc_reflink_start(struct bch_fs *c,
1646                                  bool metadata_only)
1647 {
1648         struct btree_trans trans;
1649         struct btree_iter iter;
1650         struct bkey_s_c k;
1651         struct reflink_gc *r;
1652         int ret = 0;
1653
1654         if (metadata_only)
1655                 return 0;
1656
1657         bch2_trans_init(&trans, c, 0, 0);
1658         c->reflink_gc_nr = 0;
1659
1660         for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
1661                            BTREE_ITER_PREFETCH, k, ret) {
1662                 const __le64 *refcount = bkey_refcount_c(k);
1663
1664                 if (!refcount)
1665                         continue;
1666
1667                 r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
1668                                        GFP_KERNEL);
1669                 if (!r) {
1670                         ret = -BCH_ERR_ENOMEM_gc_reflink_start;
1671                         break;
1672                 }
1673
1674                 r->offset       = k.k->p.offset;
1675                 r->size         = k.k->size;
1676                 r->refcount     = 0;
1677         }
1678         bch2_trans_iter_exit(&trans, &iter);
1679
1680         bch2_trans_exit(&trans);
1681         return ret;
1682 }
1683
1684 static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
1685 {
1686         struct genradix_iter iter;
1687         struct reflink_gc *r;
1688
1689         genradix_for_each(&c->reflink_gc_table, iter, r)
1690                 r->refcount = 0;
1691 }
1692
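/*
 * Compare one stripe key's per-block sector counts against what GC observed
 * in c->gc_stripes, and rewrite the key if any block's count is wrong.
 */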
1693 static int bch2_gc_write_stripes_key(struct btree_trans *trans,
1694                                      struct btree_iter *iter,
1695                                      struct bkey_s_c k)
1696 {
1697         struct bch_fs *c = trans->c;
1698         struct printbuf buf = PRINTBUF;
1699         const struct bch_stripe *s;
1700         struct gc_stripe *m;
1701         bool bad = false;
1702         unsigned i;
1703         int ret = 0;
1704
1705         if (k.k->type != KEY_TYPE_stripe)
1706                 return 0;
1707
1708         s = bkey_s_c_to_stripe(k).v;
1709         m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
1710
1711         for (i = 0; i < s->nr_blocks; i++) {
1712                 u32 old = stripe_blockcount_get(s, i);
1713                 u32 new = (m ? m->block_sectors[i] : 0);
1714
1715                 if (old != new) {
1716                         prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
1717                                    i, old, new);
1718                         bad = true;
1719                 }
1720         }
1721
1722         if (bad)
1723                 bch2_bkey_val_to_text(&buf, c, k);
1724
1725         if (fsck_err_on(bad, c, "%s", buf.buf)) {
1726                 struct bkey_i_stripe *new;
1727
1728                 new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1729                 ret = PTR_ERR_OR_ZERO(new);
1730                 if (ret)
1731                         return ret;
1732
1733                 bkey_reassemble(&new->k_i, k);
1734
1735                 for (i = 0; i < new->v.nr_blocks; i++)
1736                         stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
1737
1738                 ret = bch2_trans_update(trans, iter, &new->k_i, 0);
1739         }
1740 fsck_err:
1741         printbuf_exit(&buf);
1742         return ret;
1743 }
1744
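/*
 * Walk the stripes btree and repair block sector counts via
 * bch2_gc_write_stripes_key().
 */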
1745 static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
1746 {
1747         struct btree_trans trans;
1748         struct btree_iter iter;
1749         struct bkey_s_c k;
1750         int ret = 0;
1751
1752         if (metadata_only)
1753                 return 0;
1754
1755         bch2_trans_init(&trans, c, 0, 0);
1756
1757         ret = for_each_btree_key_commit(&trans, iter,
1758                         BTREE_ID_stripes, POS_MIN,
1759                         BTREE_ITER_PREFETCH, k,
1760                         NULL, NULL, BTREE_INSERT_NOFAIL,
1761                 bch2_gc_write_stripes_key(&trans, &iter, k));
1762
1763         bch2_trans_exit(&trans);
1764         return ret;
1765 }
1766
1767 static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
1768 {
1769         genradix_free(&c->gc_stripes);
1770 }
1771
1772 /**
1773  * bch2_gc - walk _all_ references to buckets, and recompute them:
1774  *
1775  * Order matters here:
1776  *  - Concurrent GC relies on the fact that we have a total ordering for
1777  *    everything that GC walks - see  gc_will_visit_node(),
1778  *    everything that GC walks - see gc_will_visit_node(),
1779  *
1780  *  - also, references move around in the course of index updates and
1781  *    various other crap: everything needs to agree on the ordering
1782  *    references are allowed to move around in - e.g., we're allowed to
1783  *    start with a reference owned by an open_bucket (the allocator) and
1784  *    move it to the btree, but not the reverse.
1785  *
1786  *    This is necessary to ensure that gc doesn't miss references that
1787  *    move around - if references move backwards in the ordering GC
1788  *    uses, GC could skip past them
1789  */
1790 int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
1791 {
1792         unsigned iter = 0;
1793         int ret;
1794
1795         lockdep_assert_held(&c->state_lock);
1796
1797         down_write(&c->gc_lock);
1798
1799         bch2_btree_interior_updates_flush(c);
1800
1801         ret   = bch2_gc_start(c) ?:
1802                 bch2_gc_alloc_start(c, metadata_only) ?:
1803                 bch2_gc_reflink_start(c, metadata_only);
1804         if (ret)
1805                 goto out;
1806 again:
1807         gc_pos_set(c, gc_phase(GC_PHASE_START));
1808
1809         bch2_mark_superblocks(c);
1810
1811         if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) ||
1812             (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
1813              c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations &&
1814              c->opts.fix_errors != FSCK_OPT_NO)) {
1815                 bch_info(c, "Starting topology repair pass");
1816                 ret = bch2_repair_topology(c);
1817                 if (ret)
1818                         goto out;
1819                 bch_info(c, "Topology repair pass done");
1820
1821                 set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags);
1822         }
1823
1824         ret = bch2_gc_btrees(c, initial, metadata_only);
1825
1826         if (ret == -BCH_ERR_need_topology_repair &&
1827             !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) &&
1828             c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) {
1829                 set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
1830                 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true);
1831                 ret = 0;
1832         }
1833
1834         if (ret == -BCH_ERR_need_topology_repair)
1835                 ret = -BCH_ERR_fsck_errors_not_fixed;
1836
1837         if (ret)
1838                 goto out;
1839
1840 #if 0
1841         bch2_mark_pending_btree_node_frees(c);
1842 #endif
1843         c->gc_count++;
1844
1845         if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
1846             (!iter && bch2_test_restart_gc)) {
1847                 if (iter++ > 2) {
1848                         bch_info(c, "Unable to fix bucket gens, looping");
1849                         ret = -EINVAL;
1850                         goto out;
1851                 }
1852
1853                 /*
1854                  * XXX: make sure gens we fixed got saved
1855                  */
1856                 bch_info(c, "Second GC pass needed, restarting:");
1857                 clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
1858                 __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
1859
1860                 bch2_gc_stripes_reset(c, metadata_only);
1861                 bch2_gc_alloc_reset(c, metadata_only);
1862                 bch2_gc_reflink_reset(c, metadata_only);
1863                 ret = bch2_gc_reset(c);
1864                 if (ret)
1865                         goto out;
1866
1867                 /* flush fsck errors, reset counters */
1868                 bch2_flush_fsck_errs(c);
1869                 goto again;
1870         }
1871 out:
1872         if (!ret) {
1873                 bch2_journal_block(&c->journal);
1874
1875                 ret   = bch2_gc_stripes_done(c, metadata_only) ?:
1876                         bch2_gc_reflink_done(c, metadata_only) ?:
1877                         bch2_gc_alloc_done(c, metadata_only) ?:
1878                         bch2_gc_done(c, initial, metadata_only);
1879
1880                 bch2_journal_unblock(&c->journal);
1881         }
1882
1883         percpu_down_write(&c->mark_lock);
1884         /* Indicates that gc is no longer in progress: */
1885         __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
1886
1887         bch2_gc_free(c);
1888         percpu_up_write(&c->mark_lock);
1889
1890         up_write(&c->gc_lock);
1891
1892         /*
1893          * At startup, allocations can happen directly instead of via the
1894          * allocator thread - issue wakeup in case they blocked on gc_lock:
1895          */
1896         closure_wake_up(&c->freelist_wait);
1897
1898         if (ret)
1899                 bch_err_fn(c, ret);
1900         return ret;
1901 }
1902
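/*
 * Per-key helper for bch2_gc_gens(): if any pointer is badly stale (more
 * than 16 gens behind its bucket), rewrite the key so that
 * bch2_extent_normalize() drops the stale cached pointers; otherwise record
 * the oldest pointer gen seen for each referenced bucket in ca->oldest_gen.
 */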
1903 static int gc_btree_gens_key(struct btree_trans *trans,
1904                              struct btree_iter *iter,
1905                              struct bkey_s_c k)
1906 {
1907         struct bch_fs *c = trans->c;
1908         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1909         const struct bch_extent_ptr *ptr;
1910         struct bkey_i *u;
1911         int ret;
1912
1913         percpu_down_read(&c->mark_lock);
1914         bkey_for_each_ptr(ptrs, ptr) {
1915                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
1916
1917                 if (ptr_stale(ca, ptr) > 16) {
1918                         percpu_up_read(&c->mark_lock);
1919                         goto update;
1920                 }
1921         }
1922
1923         bkey_for_each_ptr(ptrs, ptr) {
1924                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
1925                 u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
1926
1927                 if (gen_after(*gen, ptr->gen))
1928                         *gen = ptr->gen;
1929         }
1930         percpu_up_read(&c->mark_lock);
1931         return 0;
1932 update:
1933         u = bch2_bkey_make_mut(trans, iter, &k, 0);
1934         ret = PTR_ERR_OR_ZERO(u);
1935         if (ret)
1936                 return ret;
1937
1938         bch2_extent_normalize(c, bkey_i_to_s(u));
1939         return 0;
1940 }
1941
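/*
 * Write the newly computed oldest_gen back to a bucket's alloc key if it
 * changed, recomputing the bucket's data type at the same time.
 */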
1942 static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
1943                                        struct bkey_s_c k)
1944 {
1945         struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
1946         struct bch_alloc_v4 a_convert;
1947         const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
1948         struct bkey_i_alloc_v4 *a_mut;
1949         int ret;
1950
1951         if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
1952                 return 0;
1953
1954         a_mut = bch2_alloc_to_v4_mut(trans, k);
1955         ret = PTR_ERR_OR_ZERO(a_mut);
1956         if (ret)
1957                 return ret;
1958
1959         a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
1960         a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
1961
1962         return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
1963 }
1964
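/*
 * bch2_gc_gens() - lightweight GC pass that only recomputes each bucket's
 * oldest_gen by walking every btree that contains pointers, then writes the
 * results back to the alloc btree; this lets bucket generation numbers be
 * reused without wrapping past still-live pointers.
 *
 * Illustrative call site (the real callers include the gc thread later in
 * this file):
 *
 *     ret = bch2_gc_gens(c);
 *     if (ret)
 *             bch_err(c, "gc_gens failed: %s", bch2_err_str(ret));
 */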
1965 int bch2_gc_gens(struct bch_fs *c)
1966 {
1967         struct btree_trans trans;
1968         struct btree_iter iter;
1969         struct bkey_s_c k;
1970         struct bch_dev *ca;
1971         u64 b, start_time = local_clock();
1972         unsigned i;
1973         int ret;
1974
1975         /*
1976          * Ideally we would be using state_lock and not gc_lock here, but that
1977          * introduces a deadlock in the RO path - we currently take the state
1978          * lock at the start of going RO, thus the gc thread may get stuck:
1979          */
1980         if (!mutex_trylock(&c->gc_gens_lock))
1981                 return 0;
1982
1983         trace_and_count(c, gc_gens_start, c);
1984         down_read(&c->gc_lock);
1985         bch2_trans_init(&trans, c, 0, 0);
1986
1987         for_each_member_device(ca, c, i) {
1988                 struct bucket_gens *gens;
1989
1990                 BUG_ON(ca->oldest_gen);
1991
1992                 ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
1993                 if (!ca->oldest_gen) {
1994                         percpu_ref_put(&ca->ref);
1995                         ret = -BCH_ERR_ENOMEM_gc_gens;
1996                         goto err;
1997                 }
1998
1999                 gens = bucket_gens(ca);
2000
2001                 for (b = gens->first_bucket;
2002                      b < gens->nbuckets; b++)
2003                         ca->oldest_gen[b] = gens->b[b];
2004         }
2005
2006         for (i = 0; i < BTREE_ID_NR; i++)
2007                 if (btree_type_has_ptrs(i)) {
2008                         struct btree_iter iter;
2009                         struct bkey_s_c k;
2010
2011                         c->gc_gens_btree = i;
2012                         c->gc_gens_pos = POS_MIN;
2013                         ret = for_each_btree_key_commit(&trans, iter, i,
2014                                         POS_MIN,
2015                                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
2016                                         k,
2017                                         NULL, NULL,
2018                                         BTREE_INSERT_NOFAIL,
2019                                 gc_btree_gens_key(&trans, &iter, k));
2020                         if (ret && !bch2_err_matches(ret, EROFS))
2021                                 bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
2022                         if (ret)
2023                                 goto err;
2024                 }
2025
2026         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
2027                         POS_MIN,
2028                         BTREE_ITER_PREFETCH,
2029                         k,
2030                         NULL, NULL,
2031                         BTREE_INSERT_NOFAIL,
2032                 bch2_alloc_write_oldest_gen(&trans, &iter, k));
2033         if (ret && !bch2_err_matches(ret, EROFS))
2034                 bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
2035         if (ret)
2036                 goto err;
2037
2038         c->gc_gens_btree        = 0;
2039         c->gc_gens_pos          = POS_MIN;
2040
2041         c->gc_count++;
2042
2043         bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
2044         trace_and_count(c, gc_gens_end, c);
2045 err:
2046         for_each_member_device(ca, c, i) {
2047                 kvfree(ca->oldest_gen);
2048                 ca->oldest_gen = NULL;
2049         }
2050
2051         bch2_trans_exit(&trans);
2052         up_read(&c->gc_lock);
2053         mutex_unlock(&c->gc_gens_lock);
2054         return ret;
2055 }
2056
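/*
 * Background GC thread: sleeps until kicked via c->kick_gc or, when
 * btree_gc_periodic is set, until roughly capacity/16 sectors' worth of
 * writes have gone by on the write io_clock, then runs bch2_gc_gens().
 */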
2057 static int bch2_gc_thread(void *arg)
2058 {
2059         struct bch_fs *c = arg;
2060         struct io_clock *clock = &c->io_clock[WRITE];
2061         unsigned long last = atomic64_read(&clock->now);
2062         unsigned last_kick = atomic_read(&c->kick_gc);
2063         int ret;
2064
2065         set_freezable();
2066
2067         while (1) {
2068                 while (1) {
2069                         set_current_state(TASK_INTERRUPTIBLE);
2070
2071                         if (kthread_should_stop()) {
2072                                 __set_current_state(TASK_RUNNING);
2073                                 return 0;
2074                         }
2075
2076                         if (atomic_read(&c->kick_gc) != last_kick)
2077                                 break;
2078
2079                         if (c->btree_gc_periodic) {
2080                                 unsigned long next = last + c->capacity / 16;
2081
2082                                 if (atomic64_read(&clock->now) >= next)
2083                                         break;
2084
2085                                 bch2_io_clock_schedule_timeout(clock, next);
2086                         } else {
2087                                 schedule();
2088                         }
2089
2090                         try_to_freeze();
2091                 }
2092                 __set_current_state(TASK_RUNNING);
2093
2094                 last = atomic64_read(&clock->now);
2095                 last_kick = atomic_read(&c->kick_gc);
2096
2097                 /*
2098                  * Full gc is currently incompatible with btree key cache:
2099                  */
2100 #if 0
2101                 ret = bch2_gc(c, false, false);
2102 #else
2103                 ret = bch2_gc_gens(c);
2104 #endif
2105                 if (ret < 0)
2106                         bch_err(c, "btree gc failed: %s", bch2_err_str(ret));
2107
2108                 debug_check_no_locks_held();
2109         }
2110
2111         return 0;
2112 }
2113
2114 void bch2_gc_thread_stop(struct bch_fs *c)
2115 {
2116         struct task_struct *p;
2117
2118         p = c->gc_thread;
2119         c->gc_thread = NULL;
2120
2121         if (p) {
2122                 kthread_stop(p);
2123                 put_task_struct(p);
2124         }
2125 }
2126
2127 int bch2_gc_thread_start(struct bch_fs *c)
2128 {
2129         struct task_struct *p;
2130
2131         if (c->gc_thread)
2132                 return 0;
2133
2134         p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
2135         if (IS_ERR(p)) {
2136                 bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p)));
2137                 return PTR_ERR(p);
2138         }
2139
2140         get_task_struct(p);
2141         c->gc_thread = p;
2142         wake_up_process(p);
2143         return 0;
2144 }