libbcachefs/btree_gc.c
1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  * Copyright (C) 2014 Datera Inc.
4  */
5
6 #include "bcachefs.h"
7 #include "alloc_background.h"
8 #include "alloc_foreground.h"
9 #include "bkey_methods.h"
10 #include "btree_locking.h"
11 #include "btree_update_interior.h"
12 #include "btree_io.h"
13 #include "btree_gc.h"
14 #include "buckets.h"
15 #include "clock.h"
16 #include "debug.h"
17 #include "ec.h"
18 #include "error.h"
19 #include "extents.h"
20 #include "journal.h"
21 #include "journal_io.h"
22 #include "keylist.h"
23 #include "move.h"
24 #include "replicas.h"
25 #include "super-io.h"
26
27 #include <linux/slab.h>
28 #include <linux/bitops.h>
29 #include <linux/freezer.h>
30 #include <linux/kthread.h>
31 #include <linux/preempt.h>
32 #include <linux/rcupdate.h>
33 #include <linux/sched/task.h>
34 #include <trace/events/bcachefs.h>
35
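/*
 * c->gc_pos records how far mark and sweep has gotten; readers compare against
 * it (see gc_will_visit_node()/gc_will_visit_root()) to decide whether GC has
 * already visited a given position, and the seqcount lets them take a
 * consistent snapshot.  gc_pos_set() below enforces that the position only
 * ever moves forward through the total order.
 */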
36 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
37 {
38         write_seqcount_begin(&c->gc_pos_lock);
39         c->gc_pos = new_pos;
40         write_seqcount_end(&c->gc_pos_lock);
41 }
42
43 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
44 {
45         BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
46         __gc_pos_set(c, new_pos);
47 }
48
49 /* range_checks - for validating min/max pos of each btree node: */
50
51 struct range_checks {
52         struct range_level {
53                 struct bpos     min;
54                 struct bpos     max;
55         }                       l[BTREE_MAX_DEPTH];
56         unsigned                depth;
57 };
58
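/*
 * One running [min, max] interval per btree level: as the walk in
 * bch2_gc_btree() visits nodes, each node's min_key must pick up exactly where
 * the previous node at that level left off, and an interior node must cover
 * precisely the range spanned by its children.
 */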
59 static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
60 {
61         unsigned i;
62
63         for (i = 0; i < BTREE_MAX_DEPTH; i++)
64                 r->l[i].min = r->l[i].max = POS_MIN;
65         r->depth = depth;
66 }
67
68 static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
69                                     struct range_checks *r)
70 {
71         struct range_level *l = &r->l[b->level];
72
73         struct bpos expected_min = bkey_cmp(l->min, l->max)
74                 ? btree_type_successor(b->btree_id, l->max)
75                 : l->max;
76
77         bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
78                 "btree node has incorrect min key: %llu:%llu != %llu:%llu",
79                 b->data->min_key.inode,
80                 b->data->min_key.offset,
81                 expected_min.inode,
82                 expected_min.offset);
83
84         l->max = b->data->max_key;
85
86         if (b->level > r->depth) {
87                 l = &r->l[b->level - 1];
88
89                 bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
90                         "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
91                         b->data->min_key.inode,
92                         b->data->min_key.offset,
93                         l->min.inode,
94                         l->min.offset);
95
96                 bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
97                         "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
98                         b->data->max_key.inode,
99                         b->data->max_key.offset,
100                         l->max.inode,
101                         l->max.offset);
102
103                 if (bkey_cmp(b->data->max_key, POS_MAX))
104                         l->min = l->max =
105                                 btree_type_successor(b->btree_id,
106                                                      b->data->max_key);
107         }
108 }
109
110 /* marking of btree keys/nodes: */
111
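/*
 * Mark a single key: in the initial (startup/fsck) pass this also repairs
 * bucket gens that are missing or behind the pointer's gen, and makes sure the
 * superblock has a replicas entry for the key; in both passes it updates
 * oldest_gens, tracks the most stale pointer seen, and calls bch2_mark_key()
 * with BCH_BUCKET_MARK_GC so the counts go into the GC copies.
 */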
112 static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
113                             u8 *max_stale, bool initial)
114 {
115         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
116         const struct bch_extent_ptr *ptr;
117         struct gc_pos pos = { 0 };
118         unsigned flags =
119                 BCH_BUCKET_MARK_GC|
120                 (initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
121         int ret = 0;
122
123         if (initial) {
124                 BUG_ON(journal_seq_verify(c) &&
125                        k.k->version.lo > journal_cur_seq(&c->journal));
126
127                 if (k.k->version.lo > atomic64_read(&c->key_version))
128                         atomic64_set(&c->key_version, k.k->version.lo);
129
130                 if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
131                     fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c,
132                                 "superblock not marked as containing replicas (type %u)",
133                                 k.k->type)) {
134                         ret = bch2_mark_bkey_replicas(c, k);
135                         if (ret)
136                                 return ret;
137                 }
138
139                 bkey_for_each_ptr(ptrs, ptr) {
140                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
141                         size_t b = PTR_BUCKET_NR(ca, ptr);
142                         struct bucket *g = PTR_BUCKET(ca, ptr);
143
144                         if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
145                                         "found ptr with missing gen in alloc btree,\n"
146                                         "type %u gen %u",
147                                         k.k->type, ptr->gen)) {
148                                 g->_mark.gen = ptr->gen;
149                                 g->_mark.gen_valid = 1;
150                                 set_bit(b, ca->buckets_dirty);
151                         }
152
153                         if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
154                                         "type %u ptr gen in the future: %u > %u",
155                                         k.k->type, ptr->gen, g->mark.gen)) {
156                                 g->_mark.gen = ptr->gen;
157                                 g->_mark.gen_valid = 1;
158                                 set_bit(b, ca->buckets_dirty);
159                                 set_bit(BCH_FS_FIXED_GENS, &c->flags);
160                         }
161                 }
162         }
163
164         bkey_for_each_ptr(ptrs, ptr) {
165                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
166                 size_t b = PTR_BUCKET_NR(ca, ptr);
167
168                 if (gen_after(ca->oldest_gens[b], ptr->gen))
169                         ca->oldest_gens[b] = ptr->gen;
170
171                 *max_stale = max(*max_stale, ptr_stale(ca, ptr));
172         }
173
174         bch2_mark_key(c, k, true, k.k->size, pos, NULL, 0, flags);
175 fsck_err:
176         return ret;
177 }
178
179 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
180                               u8 *max_stale, bool initial)
181 {
182         struct btree_node_iter iter;
183         struct bkey unpacked;
184         struct bkey_s_c k;
185         int ret = 0;
186
187         *max_stale = 0;
188
189         if (!btree_node_type_needs_gc(btree_node_type(b)))
190                 return 0;
191
192         for_each_btree_node_key_unpack(b, k, &iter,
193                                        &unpacked) {
194                 bch2_bkey_debugcheck(c, b, k);
195
196                 ret = bch2_gc_mark_key(c, k, max_stale, initial);
197                 if (ret)
198                         break;
199         }
200
201         return ret;
202 }
203
204 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
205                          bool initial)
206 {
207         struct btree_iter iter;
208         struct btree *b;
209         struct range_checks r;
210         unsigned depth = btree_node_type_needs_gc(btree_id) ? 0 : 1;
211         u8 max_stale;
212         int ret = 0;
213
214         gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
215
216         /*
217          * If expensive_debug_checks is on, walk all the way down to the
218          * leaves so range_checks covers every node; and on startup, we have
219          * to read every btree node anyway (XXX: only if it was an unclean
220          * shutdown)
221          */
222         if (initial || expensive_debug_checks(c))
223                 depth = 0;
224
225         btree_node_range_checks_init(&r, depth);
226
227         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
228                               0, depth, BTREE_ITER_PREFETCH, b) {
229                 btree_node_range_checks(c, b, &r);
230
231                 bch2_verify_btree_nr_keys(b);
232
233                 ret = btree_gc_mark_node(c, b, &max_stale, initial);
234                 if (ret)
235                         break;
236
237                 gc_pos_set(c, gc_pos_btree_node(b));
238
239                 if (!initial) {
240                         if (max_stale > 64)
241                                 bch2_btree_node_rewrite(c, &iter,
242                                                 b->data->keys.seq,
243                                                 BTREE_INSERT_USE_RESERVE|
244                                                 BTREE_INSERT_NOWAIT|
245                                                 BTREE_INSERT_GC_LOCK_HELD);
246                         else if (!btree_gc_rewrite_disabled(c) &&
247                                  (btree_gc_always_rewrite(c) || max_stale > 16))
248                                 bch2_btree_node_rewrite(c, &iter,
249                                                 b->data->keys.seq,
250                                                 BTREE_INSERT_NOWAIT|
251                                                 BTREE_INSERT_GC_LOCK_HELD);
252                 }
253
254                 bch2_btree_iter_cond_resched(&iter);
255         }
256         ret = bch2_btree_iter_unlock(&iter) ?: ret;
257         if (ret)
258                 return ret;
259
260         mutex_lock(&c->btree_root_lock);
261
262         b = c->btree_roots[btree_id].b;
263         if (!btree_node_fake(b))
264                 ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
265                                        &max_stale, initial);
266         gc_pos_set(c, gc_pos_btree_root(b->btree_id));
267
268         mutex_unlock(&c->btree_root_lock);
269         return ret;
270 }
271
272 static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
273 {
274         return  (int) btree_id_to_gc_phase(l) -
275                 (int) btree_id_to_gc_phase(r);
276 }
277
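/*
 * Walk the btrees in gc phase order, so that gc_pos advances in the same order
 * that concurrent marking compares against; keys found in the journal for a
 * given btree type are marked immediately after that btree has been walked.
 */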
278 static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
279                           bool initial)
280 {
281         enum btree_id ids[BTREE_ID_NR];
282         u8 max_stale;
283         unsigned i;
284
285         for (i = 0; i < BTREE_ID_NR; i++)
286                 ids[i] = i;
287         bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
288
289         for (i = 0; i < BTREE_ID_NR; i++) {
290                 enum btree_id id = ids[i];
291                 enum btree_node_type type = __btree_node_type(0, id);
292
293                 int ret = bch2_gc_btree(c, id, initial);
294                 if (ret)
295                         return ret;
296
297                 if (journal && btree_node_type_needs_gc(type)) {
298                         struct bkey_i *k, *n;
299                         struct jset_entry *j;
300                         struct journal_replay *r;
302
303                         list_for_each_entry(r, journal, list)
304                                 for_each_jset_key(k, n, j, &r->j) {
305                                         if (type == __btree_node_type(j->level, j->btree_id)) {
306                                                 ret = bch2_gc_mark_key(c,
307                                                         bkey_i_to_s_c(k),
308                                                         &max_stale, initial);
309                                                 if (ret)
310                                                         return ret;
311                                         }
312                                 }
313                 }
314         }
315
316         return 0;
317 }
318
319 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
320                                   u64 start, u64 end,
321                                   enum bch_data_type type,
322                                   unsigned flags)
323 {
324         u64 b = sector_to_bucket(ca, start);
325
326         do {
327                 unsigned sectors =
328                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
329
330                 bch2_mark_metadata_bucket(c, ca, b, type, sectors,
331                                           gc_phase(GC_PHASE_SB), flags);
332                 b++;
333                 start += sectors;
334         } while (start < end);
335 }
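/*
 * Illustrative example (assuming 128 sector buckets): marking sectors
 * [8, 520) above touches buckets 0 through 4 - 120 sectors in bucket 0, 128
 * in each of buckets 1-3 and 8 in bucket 4 - since a superblock region need
 * not start or end on a bucket boundary.
 */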
336
337 void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
338                               unsigned flags)
339 {
340         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
341         unsigned i;
342         u64 b;
343
344         /*
345          * This conditional is kind of gross, but we may be called from the
346          * device add path, before the new device has actually been added to the
347          * running filesystem:
348          */
349         if (c) {
350                 lockdep_assert_held(&c->sb_lock);
351                 percpu_down_read_preempt_disable(&c->usage_lock);
352         } else {
353                 preempt_disable();
354         }
355
356         for (i = 0; i < layout->nr_superblocks; i++) {
357                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
358
359                 if (offset == BCH_SB_SECTOR)
360                         mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
361                                               BCH_DATA_SB, flags);
362
363                 mark_metadata_sectors(c, ca, offset,
364                                       offset + (1 << layout->sb_max_size_bits),
365                                       BCH_DATA_SB, flags);
366         }
367
368         for (i = 0; i < ca->journal.nr; i++) {
369                 b = ca->journal.buckets[i];
370                 bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
371                                           ca->mi.bucket_size,
372                                           gc_phase(GC_PHASE_SB), flags);
373         }
374
375         if (c) {
376                 percpu_up_read_preempt_enable(&c->usage_lock);
377         } else {
378                 preempt_enable();
379         }
380 }
381
382 static void bch2_mark_superblocks(struct bch_fs *c)
383 {
384         struct bch_dev *ca;
385         unsigned i;
386
387         mutex_lock(&c->sb_lock);
388         gc_pos_set(c, gc_phase(GC_PHASE_SB));
389
390         for_each_online_member(ca, c, i)
391                 bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
392         mutex_unlock(&c->sb_lock);
393 }
394
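/*
 * Btree nodes freed by in-flight interior updates still occupy their buckets
 * on disk until the pending free is processed, so GC has to count them here
 * even though no btree key references them any more.
 */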
395 /* Also see bch2_pending_btree_node_free_insert_done() */
396 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
397 {
398         struct gc_pos pos = { 0 };
399         struct btree_update *as;
400         struct pending_btree_node_free *d;
401
402         mutex_lock(&c->btree_interior_update_lock);
403         gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
404
405         for_each_pending_btree_node_free(c, as, d)
406                 if (d->index_update_done)
407                         bch2_mark_key(c, bkey_i_to_s_c(&d->key),
408                                       true, 0,
409                                       pos, NULL, 0,
410                                       BCH_BUCKET_MARK_GC);
411
412         mutex_unlock(&c->btree_interior_update_lock);
413 }
414
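/*
 * Buckets owned by the allocator (sitting on free_inc, the freelists, or held
 * by an open_bucket) aren't referenced by any key in the btree, so mark and
 * sweep has to account for them explicitly or it would see them as unused.
 */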
415 static void bch2_mark_allocator_buckets(struct bch_fs *c)
416 {
417         struct bch_dev *ca;
418         struct open_bucket *ob;
419         size_t i, j, iter;
420         unsigned ci;
421
422         percpu_down_read_preempt_disable(&c->usage_lock);
423
424         spin_lock(&c->freelist_lock);
425         gc_pos_set(c, gc_pos_alloc(c, NULL));
426
427         for_each_member_device(ca, c, ci) {
428                 fifo_for_each_entry(i, &ca->free_inc, iter)
429                         bch2_mark_alloc_bucket(c, ca, i, true,
430                                                gc_pos_alloc(c, NULL),
431                                                BCH_BUCKET_MARK_GC);
432
435                 for (j = 0; j < RESERVE_NR; j++)
436                         fifo_for_each_entry(i, &ca->free[j], iter)
437                                 bch2_mark_alloc_bucket(c, ca, i, true,
438                                                        gc_pos_alloc(c, NULL),
439                                                        BCH_BUCKET_MARK_GC);
440         }
441
442         spin_unlock(&c->freelist_lock);
443
444         for (ob = c->open_buckets;
445              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
446              ob++) {
447                 spin_lock(&ob->lock);
448                 if (ob->valid) {
449                         gc_pos_set(c, gc_pos_alloc(c, ob));
450                         ca = bch_dev_bkey_exists(c, ob->ptr.dev);
451                         bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
452                                                gc_pos_alloc(c, ob),
453                                                BCH_BUCKET_MARK_GC);
454                 }
455                 spin_unlock(&ob->lock);
456         }
457
458         percpu_up_read_preempt_enable(&c->usage_lock);
459 }
460
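/*
 * The [1] entries of c->usage, c->stripes and the per-device buckets/usage
 * arrays hold the copies that GC recomputes from scratch; they're allocated in
 * bch2_gc_start() and freed here once bch2_gc_done() has compared them
 * against, and folded them into, the live [0] copies.
 */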
461 static void bch2_gc_free(struct bch_fs *c)
462 {
463         struct bch_dev *ca;
464         unsigned i;
465
466         genradix_free(&c->stripes[1]);
467
468         for_each_member_device(ca, c, i) {
469                 kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
470                         sizeof(struct bucket_array) +
471                         ca->mi.nbuckets * sizeof(struct bucket));
472                 ca->buckets[1] = NULL;
473
474                 free_percpu(ca->usage[1]);
475                 ca->usage[1] = NULL;
476         }
477
478         free_percpu(c->usage[1]);
479         c->usage[1] = NULL;
480 }
481
482 static void bch2_gc_done_nocheck(struct bch_fs *c)
483 {
484         struct bch_dev *ca;
485         unsigned i;
486         int cpu;
487
488         {
489                 struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
490                 struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
491                 struct stripe *dst, *src;
492
493                 c->ec_stripes_heap.used = 0;
494
495                 while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
496                        (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
497                         *dst = *src;
498
499                         if (dst->alive)
500                                 bch2_stripes_heap_insert(c, dst, dst_iter.pos);
501
502                         genradix_iter_advance(&dst_iter, &c->stripes[0]);
503                         genradix_iter_advance(&src_iter, &c->stripes[1]);
504                 }
505         }
506
507         for_each_member_device(ca, c, i) {
508                 struct bucket_array *src = __bucket_array(ca, 1);
509
510                 memcpy(__bucket_array(ca, 0), src,
511                        sizeof(struct bucket_array) +
512                        sizeof(struct bucket) * src->nbuckets);
513         }
514
515         for_each_member_device(ca, c, i) {
516                 struct bch_dev_usage *p;
517
518                 for_each_possible_cpu(cpu) {
519                         p = per_cpu_ptr(ca->usage[0], cpu);
520                         memset(p, 0, sizeof(*p));
521                 }
522
523                 preempt_disable();
524                 *this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1);
525                 preempt_enable();
526         }
527
528         {
529                 struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
530                 struct bch_fs_usage *p;
531
532                 for_each_possible_cpu(cpu) {
533                         p = per_cpu_ptr(c->usage[0], cpu);
534                         memset(p, 0, offsetof(typeof(*p), online_reserved));
535                 }
536
537                 preempt_disable();
538                 memcpy(this_cpu_ptr(c->usage[0]),
539                        &src,
540                        offsetof(typeof(*p), online_reserved));
541                 preempt_enable();
542         }
544 }
545
546 static void bch2_gc_done(struct bch_fs *c, bool initial)
547 {
548         struct bch_dev *ca;
549         unsigned i;
550         int cpu;
551
552 #define copy_field(_f, _msg, ...)                                       \
553         if (dst._f != src._f) {                                         \
554                 bch_err(c, _msg ": got %llu, should be %llu, fixing"\
555                         , ##__VA_ARGS__, dst._f, src._f);               \
556                 dst._f = src._f;                                        \
557         }
558 #define copy_stripe_field(_f, _msg, ...)                                \
559         if (dst->_f != src->_f) {                                       \
560                 bch_err_ratelimited(c, "stripe %zu has wrong "_msg      \
561                         ": got %u, should be %u, fixing",               \
562                         dst_iter.pos, ##__VA_ARGS__,                    \
563                         dst->_f, src->_f);                              \
564                 dst->_f = src->_f;                                      \
565         }
566 #define copy_bucket_field(_f)                                           \
567         if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
568                 bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
569                         ": got %u, should be %u, fixing",               \
570                         i, b, dst->b[b].mark._f, src->b[b].mark._f);    \
571                 dst->b[b]._mark._f = src->b[b].mark._f;                 \
572         }
573 #define copy_dev_field(_f, _msg, ...)                                   \
574         copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
575 #define copy_fs_field(_f, _msg, ...)                                    \
576         copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
577
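        /*
         * Each copy_*_field() compares the in-memory value ([0]) against the
         * value GC just recomputed ([1]); on a mismatch it logs the
         * discrepancy and overwrites the live value with the GC one.
         */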
578         percpu_down_write(&c->usage_lock);
579
580         if (initial) {
581                 bch2_gc_done_nocheck(c);
582                 goto out;
583         }
584
585         {
586                 struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
587                 struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
588                 struct stripe *dst, *src;
589                 unsigned i;
590
591                 c->ec_stripes_heap.used = 0;
592
593                 while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
594                        (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
595                         copy_stripe_field(alive,        "alive");
596                         copy_stripe_field(sectors,      "sectors");
597                         copy_stripe_field(algorithm,    "algorithm");
598                         copy_stripe_field(nr_blocks,    "nr_blocks");
599                         copy_stripe_field(nr_redundant, "nr_redundant");
600                         copy_stripe_field(blocks_nonempty.counter,
601                                           "blocks_nonempty");
602
603                         for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
604                                 copy_stripe_field(block_sectors[i].counter,
605                                                   "block_sectors[%u]", i);
606
607                         if (dst->alive)
608                                 bch2_stripes_heap_insert(c, dst, dst_iter.pos);
609
610                         genradix_iter_advance(&dst_iter, &c->stripes[0]);
611                         genradix_iter_advance(&src_iter, &c->stripes[1]);
612                 }
613         }
614
615         for_each_member_device(ca, c, i) {
616                 struct bucket_array *dst = __bucket_array(ca, 0);
617                 struct bucket_array *src = __bucket_array(ca, 1);
618                 size_t b;
619
620                 if (initial) {
621                         memcpy(dst, src,
622                                sizeof(struct bucket_array) +
623                                sizeof(struct bucket) * dst->nbuckets);
624                 }
625
626                 for (b = 0; b < src->nbuckets; b++) {
627                         copy_bucket_field(gen);
628                         copy_bucket_field(data_type);
629                         copy_bucket_field(owned_by_allocator);
630                         copy_bucket_field(stripe);
631                         copy_bucket_field(dirty_sectors);
632                         copy_bucket_field(cached_sectors);
633                 }
634         }
635
636         for_each_member_device(ca, c, i) {
637                 struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0);
638                 struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1);
639                 struct bch_dev_usage *p;
640                 unsigned b;
641
642                 for (b = 0; b < BCH_DATA_NR; b++)
643                         copy_dev_field(buckets[b],
644                                        "buckets[%s]", bch2_data_types[b]);
645                 copy_dev_field(buckets_alloc, "buckets_alloc");
646                 copy_dev_field(buckets_ec, "buckets_ec");
647
648                 for (b = 0; b < BCH_DATA_NR; b++)
649                         copy_dev_field(sectors[b],
650                                        "sectors[%s]", bch2_data_types[b]);
651                 copy_dev_field(sectors_fragmented,
652                                "sectors_fragmented");
653
654                 for_each_possible_cpu(cpu) {
655                         p = per_cpu_ptr(ca->usage[0], cpu);
656                         memset(p, 0, sizeof(*p));
657                 }
658
659                 preempt_disable();
660                 p = this_cpu_ptr(ca->usage[0]);
661                 *p = dst;
662                 preempt_enable();
663         }
664
665         {
666                 struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0);
667                 struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
668                 struct bch_fs_usage *p;
669                 unsigned r, b;
670
671                 for (r = 0; r < BCH_REPLICAS_MAX; r++) {
672                         for (b = 0; b < BCH_DATA_NR; b++)
673                                 copy_fs_field(replicas[r].data[b],
674                                               "replicas[%i].data[%s]",
675                                               r, bch2_data_types[b]);
676                         copy_fs_field(replicas[r].ec_data,
677                                       "replicas[%i].ec_data", r);
678                         copy_fs_field(replicas[r].persistent_reserved,
679                                       "replicas[%i].persistent_reserved", r);
680                 }
681
682                 for (b = 0; b < BCH_DATA_NR; b++)
683                         copy_fs_field(buckets[b],
684                                       "buckets[%s]", bch2_data_types[b]);
685
686                 for_each_possible_cpu(cpu) {
687                         p = per_cpu_ptr(c->usage[0], cpu);
688                         memset(p, 0, offsetof(typeof(*p), online_reserved));
689                 }
690
691                 preempt_disable();
692                 p = this_cpu_ptr(c->usage[0]);
693                 memcpy(p, &dst, offsetof(typeof(*p), online_reserved));
694                 preempt_enable();
695         }
696 out:
697         percpu_up_write(&c->usage_lock);
698
699 #undef copy_fs_field
700 #undef copy_dev_field
701 #undef copy_bucket_field
702 #undef copy_stripe_field
703 #undef copy_field
704 }
705
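/*
 * Allocate the GC shadow copies of the usage, bucket and stripe tables. The
 * new bucket array only inherits the gens from the live copy (so pointer
 * staleness checks keep working during the walk); every other counter starts
 * from zero and is recomputed by marking.
 */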
706 static int bch2_gc_start(struct bch_fs *c)
707 {
708         struct bch_dev *ca;
709         unsigned i;
710
711         /*
712          * indicate to stripe code that we need to allocate for the gc stripes
713          * radix tree, too
714          */
715         gc_pos_set(c, gc_phase(GC_PHASE_START));
716
717         BUG_ON(c->usage[1]);
718
719         c->usage[1] = alloc_percpu(struct bch_fs_usage);
720         if (!c->usage[1])
721                 return -ENOMEM;
722
723         for_each_member_device(ca, c, i) {
724                 BUG_ON(ca->buckets[1]);
725                 BUG_ON(ca->usage[1]);
726
727                 ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
728                                 ca->mi.nbuckets * sizeof(struct bucket),
729                                 GFP_KERNEL|__GFP_ZERO);
730                 if (!ca->buckets[1]) {
731                         percpu_ref_put(&ca->ref);
732                         return -ENOMEM;
733                 }
734
735                 ca->usage[1] = alloc_percpu(struct bch_dev_usage);
736                 if (!ca->usage[1]) {
737                         percpu_ref_put(&ca->ref);
738                         return -ENOMEM;
739                 }
740         }
741
742         percpu_down_write(&c->usage_lock);
743
744         for_each_member_device(ca, c, i) {
745                 struct bucket_array *dst = __bucket_array(ca, 1);
746                 struct bucket_array *src = __bucket_array(ca, 0);
747                 size_t b;
748
749                 dst->first_bucket       = src->first_bucket;
750                 dst->nbuckets           = src->nbuckets;
751
752                 for (b = 0; b < src->nbuckets; b++)
753                         dst->b[b]._mark.gen = src->b[b].mark.gen;
754         }
755
756         percpu_up_write(&c->usage_lock);
757
758         return bch2_ec_mem_alloc(c, true);
759 }
760
761 /**
762  * bch2_gc - walk _all_ references to buckets, and recompute them:
763  *
764  * Order matters here:
765  *  - Concurrent GC relies on the fact that we have a total ordering for
766  *    everything that GC walks - see  gc_will_visit_node(),
767  *    gc_will_visit_root()
768  *
769  *  - also, references move around in the course of index updates and
770  *    various other crap: everything needs to agree on the ordering
771  *    references are allowed to move around in - e.g., we're allowed to
772  *    start with a reference owned by an open_bucket (the allocator) and
773  *    move it to the btree, but not the reverse.
774  *
775  *    This is necessary to ensure that gc doesn't miss references that
776  *    move around - if references move backwards in the ordering GC
777  *    uses, GC could skip past them
778  */
779 int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
780 {
781         struct bch_dev *ca;
782         u64 start_time = local_clock();
783         unsigned i, iter = 0;
784         int ret;
785
786         trace_gc_start(c);
787
788         down_write(&c->gc_lock);
789 again:
790         ret = bch2_gc_start(c);
791         if (ret)
792                 goto out;
793
794         bch2_mark_superblocks(c);
795
796         ret = bch2_gc_btrees(c, journal, initial);
797         if (ret)
798                 goto out;
799
800         bch2_mark_pending_btree_node_frees(c);
801         bch2_mark_allocator_buckets(c);
802
803         c->gc_count++;
804 out:
805         if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
806                 /*
807                  * XXX: make sure gens we fixed got saved
808                  */
809                 if (iter++ <= 2) {
810                         bch_info(c, "Fixed gens, restarting mark and sweep:");
811                         clear_bit(BCH_FS_FIXED_GENS, &c->flags);
812                         goto again;
813                 }
814
815                 bch_info(c, "Unable to fix bucket gens, giving up");
816                 ret = -EINVAL;
817         }
818
819         if (!ret)
820                 bch2_gc_done(c, initial);
821
822         /* Indicates that gc is no longer in progress: */
823         __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
824
825         bch2_gc_free(c);
826         up_write(&c->gc_lock);
827
828         if (!ret && initial)
829                 set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
830
831         trace_gc_end(c);
832         bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
833
834         /*
835          * Wake up allocator in case it was waiting for buckets
836          * because of not being able to inc gens
837          */
838         for_each_member_device(ca, c, i)
839                 bch2_wake_allocator(ca);
840
841         /*
842          * At startup, allocations can happen directly instead of via the
843          * allocator thread - issue wakeup in case they blocked on gc_lock:
844          */
845         closure_wake_up(&c->freelist_wait);
846         return ret;
847 }
848
849 /* Btree coalescing */
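/*
 * Coalescing walks each btree with a sliding window of up to GC_MERGE_NODES
 * adjacent nodes; whenever the live keys in the window would fit into one
 * fewer node (filled to roughly 2/3), the nodes are rewritten with a common
 * format and the old ones freed.
 */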
850
851 static void recalc_packed_keys(struct btree *b)
852 {
853         struct bset *i = btree_bset_first(b);
854         struct bkey_packed *k;
855
856         memset(&b->nr, 0, sizeof(b->nr));
857
858         BUG_ON(b->nsets != 1);
859
860         vstruct_for_each(i, k)
861                 btree_keys_account_key_add(&b->nr, 0, k);
862 }
863
864 static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
865                                 struct btree *old_nodes[GC_MERGE_NODES])
866 {
867         struct btree *parent = btree_node_parent(iter, old_nodes[0]);
868         unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
869         unsigned blocks = btree_blocks(c) * 2 / 3;
870         struct btree *new_nodes[GC_MERGE_NODES];
871         struct btree_update *as;
872         struct keylist keylist;
873         struct bkey_format_state format_state;
874         struct bkey_format new_format;
875
876         memset(new_nodes, 0, sizeof(new_nodes));
877         bch2_keylist_init(&keylist, NULL);
878
879         /* Count keys that are not deleted */
880         for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
881                 u64s += old_nodes[i]->nr.live_u64s;
882
883         nr_old_nodes = nr_new_nodes = i;
884
885         /* Check if all keys in @old_nodes could fit in one fewer node */
886         if (nr_old_nodes <= 1 ||
887             __vstruct_blocks(struct btree_node, c->block_bits,
888                              DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
889                 return;
890
891         /* Find a format that all keys in @old_nodes can pack into */
892         bch2_bkey_format_init(&format_state);
893
894         for (i = 0; i < nr_old_nodes; i++)
895                 __bch2_btree_calc_format(&format_state, old_nodes[i]);
896
897         new_format = bch2_bkey_format_done(&format_state);
898
899         /* Check if repacking would make any nodes too big to fit */
900         for (i = 0; i < nr_old_nodes; i++)
901                 if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
902                         trace_btree_gc_coalesce_fail(c,
903                                         BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
904                         return;
905                 }
906
907         if (bch2_keylist_realloc(&keylist, NULL, 0,
908                         (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
909                 trace_btree_gc_coalesce_fail(c,
910                                 BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
911                 return;
912         }
913
914         as = bch2_btree_update_start(c, iter->btree_id,
915                         btree_update_reserve_required(c, parent) + nr_old_nodes,
916                         BTREE_INSERT_NOFAIL|
917                         BTREE_INSERT_USE_RESERVE,
918                         NULL);
919         if (IS_ERR(as)) {
920                 trace_btree_gc_coalesce_fail(c,
921                                 BTREE_GC_COALESCE_FAIL_RESERVE_GET);
922                 bch2_keylist_free(&keylist, NULL);
923                 return;
924         }
925
926         trace_btree_gc_coalesce(c, old_nodes[0]);
927
928         for (i = 0; i < nr_old_nodes; i++)
929                 bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
930
931         /* Repack everything with @new_format and sort down to one bset */
932         for (i = 0; i < nr_old_nodes; i++)
933                 new_nodes[i] =
934                         __bch2_btree_node_alloc_replacement(as, old_nodes[i],
935                                                             new_format);
936
937         /*
938          * Conceptually we concatenate the nodes together and slice them
939          * up at different boundaries.
940          */
941         for (i = nr_new_nodes - 1; i > 0; --i) {
942                 struct btree *n1 = new_nodes[i];
943                 struct btree *n2 = new_nodes[i - 1];
944
945                 struct bset *s1 = btree_bset_first(n1);
946                 struct bset *s2 = btree_bset_first(n2);
947                 struct bkey_packed *k, *last = NULL;
948
949                 /* Calculate how many keys from @n2 we could fit inside @n1 */
950                 u64s = 0;
951
952                 for (k = s2->start;
953                      k < vstruct_last(s2) &&
954                      vstruct_blocks_plus(n1->data, c->block_bits,
955                                          u64s + k->u64s) <= blocks;
956                      k = bkey_next(k)) {
957                         last = k;
958                         u64s += k->u64s;
959                 }
960
961                 if (u64s == le16_to_cpu(s2->u64s)) {
962                         /* n2 fits entirely in n1 */
963                         n1->key.k.p = n1->data->max_key = n2->data->max_key;
964
965                         memcpy_u64s(vstruct_last(s1),
966                                     s2->start,
967                                     le16_to_cpu(s2->u64s));
968                         le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
969
970                         set_btree_bset_end(n1, n1->set);
971
972                         six_unlock_write(&n2->lock);
973                         bch2_btree_node_free_never_inserted(c, n2);
974                         six_unlock_intent(&n2->lock);
975
976                         memmove(new_nodes + i - 1,
977                                 new_nodes + i,
978                                 sizeof(new_nodes[0]) * (nr_new_nodes - i));
979                         new_nodes[--nr_new_nodes] = NULL;
980                 } else if (u64s) {
981                         /* move part of n2 into n1 */
982                         n1->key.k.p = n1->data->max_key =
983                                 bkey_unpack_pos(n1, last);
984
985                         n2->data->min_key =
986                                 btree_type_successor(iter->btree_id,
987                                                      n1->data->max_key);
988
989                         memcpy_u64s(vstruct_last(s1),
990                                     s2->start, u64s);
991                         le16_add_cpu(&s1->u64s, u64s);
992
993                         memmove(s2->start,
994                                 vstruct_idx(s2, u64s),
995                                 (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
996                         s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
997
998                         set_btree_bset_end(n1, n1->set);
999                         set_btree_bset_end(n2, n2->set);
1000                 }
1001         }
1002
1003         for (i = 0; i < nr_new_nodes; i++) {
1004                 struct btree *n = new_nodes[i];
1005
1006                 recalc_packed_keys(n);
1007                 btree_node_reset_sib_u64s(n);
1008
1009                 bch2_btree_build_aux_trees(n);
1010                 six_unlock_write(&n->lock);
1011
1012                 bch2_btree_node_write(c, n, SIX_LOCK_intent);
1013         }
1014
1015         /*
1016          * The keys for the old nodes get deleted. We don't want to insert keys
1017          * that compare equal to the keys for the new nodes we'll also be
1018          * inserting - we can't because keys on a keylist must be strictly
1019          * greater than the previous keys, and we also don't need to since the
1020          * key for the new node will serve the same purpose (overwriting the key
1021          * for the old node).
1022          */
1023         for (i = 0; i < nr_old_nodes; i++) {
1024                 struct bkey_i delete;
1025                 unsigned j;
1026
1027                 for (j = 0; j < nr_new_nodes; j++)
1028                         if (!bkey_cmp(old_nodes[i]->key.k.p,
1029                                       new_nodes[j]->key.k.p))
1030                                 goto next;
1031
1032                 bkey_init(&delete.k);
1033                 delete.k.p = old_nodes[i]->key.k.p;
1034                 bch2_keylist_add_in_order(&keylist, &delete);
1035 next:
1036                 i = i;     /* no-op: a label must be followed by a statement */
1037         }
1038
1039         /*
1040          * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
1041          * does the lookup once and thus expects the keys to be in sorted order
1042          * so we have to make sure the new keys are correctly ordered with
1043          * respect to the deleted keys added in the previous loop
1044          */
1045         for (i = 0; i < nr_new_nodes; i++)
1046                 bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
1047
1048         /* Insert the newly coalesced nodes */
1049         bch2_btree_insert_node(as, parent, iter, &keylist, 0);
1050
1051         BUG_ON(!bch2_keylist_empty(&keylist));
1052
1053         BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
1054
1055         bch2_btree_iter_node_replace(iter, new_nodes[0]);
1056
1057         for (i = 0; i < nr_new_nodes; i++)
1058                 bch2_open_buckets_put(c, &new_nodes[i]->ob);
1059
1060         /* Free the old nodes and update our sliding window */
1061         for (i = 0; i < nr_old_nodes; i++) {
1062                 bch2_btree_node_free_inmem(c, old_nodes[i], iter);
1063
1064                 /*
1065                  * the index update might have triggered a split, in which case
1066                  * the nodes we coalesced - the new nodes we just created -
1067                  * might not be sibling nodes anymore - don't add them to the
1068                  * sliding window (except the first):
1069                  */
1070                 if (!i) {
1071                         old_nodes[i] = new_nodes[i];
1072                 } else {
1073                         old_nodes[i] = NULL;
1074                         if (new_nodes[i])
1075                                 six_unlock_intent(&new_nodes[i]->lock);
1076                 }
1077         }
1078
1079         bch2_btree_update_done(as);
1080         bch2_keylist_free(&keylist, NULL);
1081 }
1082
1083 static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
1084 {
1085         struct btree_iter iter;
1086         struct btree *b;
1087         bool kthread = (current->flags & PF_KTHREAD) != 0;
1088         unsigned i;
1089
1090         /* Sliding window of adjacent btree nodes */
1091         struct btree *merge[GC_MERGE_NODES];
1092         u32 lock_seq[GC_MERGE_NODES];
1093
1094         /*
1095          * XXX: We don't have a good way of positively matching on sibling nodes
1096          * that have the same parent - this code works by handling the cases
1097          * where they might not have the same parent, and is thus fragile. Ugh.
1098          *
1099          * Perhaps redo this to use multiple linked iterators?
1100          */
1101         memset(merge, 0, sizeof(merge));
1102
1103         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
1104                               BTREE_MAX_DEPTH, 0,
1105                               BTREE_ITER_PREFETCH, b) {
1106                 memmove(merge + 1, merge,
1107                         sizeof(merge) - sizeof(merge[0]));
1108                 memmove(lock_seq + 1, lock_seq,
1109                         sizeof(lock_seq) - sizeof(lock_seq[0]));
1110
1111                 merge[0] = b;
1112
1113                 for (i = 1; i < GC_MERGE_NODES; i++) {
1114                         if (!merge[i] ||
1115                             !six_relock_intent(&merge[i]->lock, lock_seq[i]))
1116                                 break;
1117
1118                         if (merge[i]->level != merge[0]->level) {
1119                                 six_unlock_intent(&merge[i]->lock);
1120                                 break;
1121                         }
1122                 }
1123                 memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
1124
1125                 bch2_coalesce_nodes(c, &iter, merge);
1126
1127                 for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
1128                         lock_seq[i] = merge[i]->lock.state.seq;
1129                         six_unlock_intent(&merge[i]->lock);
1130                 }
1131
1132                 lock_seq[0] = merge[0]->lock.state.seq;
1133
1134                 if (kthread && kthread_should_stop()) {
1135                         bch2_btree_iter_unlock(&iter);
1136                         return -ESHUTDOWN;
1137                 }
1138
1139                 bch2_btree_iter_cond_resched(&iter);
1140
1141                 /*
1142                  * If the parent node wasn't relocked, it might have been split
1143                  * and the nodes in our sliding window might not have the same
1144                  * parent anymore - blow away the sliding window:
1145                  */
1146                 if (btree_iter_node(&iter, iter.level + 1) &&
1147                     !btree_node_intent_locked(&iter, iter.level + 1))
1148                         memset(merge + 1, 0,
1149                                (GC_MERGE_NODES - 1) * sizeof(merge[0]));
1150         }
1151         return bch2_btree_iter_unlock(&iter);
1152 }
1153
1154 /**
1155  * bch2_coalesce - coalesce adjacent nodes with low occupancy
1156  */
1157 void bch2_coalesce(struct bch_fs *c)
1158 {
1159         enum btree_id id;
1160
1161         down_read(&c->gc_lock);
1162         trace_gc_coalesce_start(c);
1163
1164         for (id = 0; id < BTREE_ID_NR; id++) {
1165                 int ret = c->btree_roots[id].b
1166                         ? bch2_coalesce_btree(c, id)
1167                         : 0;
1168
1169                 if (ret) {
1170                         if (ret != -ESHUTDOWN)
1171                                 bch_err(c, "btree coalescing failed: %d", ret);
                        up_read(&c->gc_lock);
1172                         return;
1173                 }
1174         }
1175
1176         trace_gc_coalesce_end(c);
1177         up_read(&c->gc_lock);
1178 }
1179
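/*
 * GC runs either when something kicks it via c->kick_gc or, if
 * btree_gc_periodic is set, once the write io clock has advanced by
 * c->capacity / 16 since the last run (i.e. after very roughly 1/16th of the
 * device's capacity has been written).
 */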
1180 static int bch2_gc_thread(void *arg)
1181 {
1182         struct bch_fs *c = arg;
1183         struct io_clock *clock = &c->io_clock[WRITE];
1184         unsigned long last = atomic_long_read(&clock->now);
1185         unsigned last_kick = atomic_read(&c->kick_gc);
1186         int ret;
1187
1188         set_freezable();
1189
1190         while (1) {
1191                 while (1) {
1192                         set_current_state(TASK_INTERRUPTIBLE);
1193
1194                         if (kthread_should_stop()) {
1195                                 __set_current_state(TASK_RUNNING);
1196                                 return 0;
1197                         }
1198
1199                         if (atomic_read(&c->kick_gc) != last_kick)
1200                                 break;
1201
1202                         if (c->btree_gc_periodic) {
1203                                 unsigned long next = last + c->capacity / 16;
1204
1205                                 if (atomic_long_read(&clock->now) >= next)
1206                                         break;
1207
1208                                 bch2_io_clock_schedule_timeout(clock, next);
1209                         } else {
1210                                 schedule();
1211                         }
1212
1213                         try_to_freeze();
1214                 }
1215                 __set_current_state(TASK_RUNNING);
1216
1217                 last = atomic_long_read(&clock->now);
1218                 last_kick = atomic_read(&c->kick_gc);
1219
1220                 ret = bch2_gc(c, NULL, false);
1221                 if (ret)
1222                         bch_err(c, "btree gc failed: %i", ret);
1223
1224                 debug_check_no_locks_held();
1225         }
1226
1227         return 0;
1228 }
1229
1230 void bch2_gc_thread_stop(struct bch_fs *c)
1231 {
1232         struct task_struct *p;
1233
1234         p = c->gc_thread;
1235         c->gc_thread = NULL;
1236
1237         if (p) {
1238                 kthread_stop(p);
1239                 put_task_struct(p);
1240         }
1241 }
1242
1243 int bch2_gc_thread_start(struct bch_fs *c)
1244 {
1245         struct task_struct *p;
1246
1247         BUG_ON(c->gc_thread);
1248
1249         p = kthread_create(bch2_gc_thread, c, "bch_gc");
1250         if (IS_ERR(p))
1251                 return PTR_ERR(p);
1252
1253         get_task_struct(p);
1254         c->gc_thread = p;
1255         wake_up_process(p);
1256         return 0;
1257 }
1258
1259 /* Initial GC computes bucket marks during startup */
1260
1261 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1262 {
1263         int ret = bch2_gc(c, journal, true);
1264
1265         /*
1266          * Skip past versions that might have possibly been used (as nonces),
1267          * but hadn't had their pointers written:
1268          */
1269         if (c->sb.encryption_type)
1270                 atomic64_add(1 << 16, &c->key_version);
1271
1272         return ret;
1273 }