1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  * Copyright (C) 2014 Datera Inc.
4  */
5
6 #include "bcachefs.h"
7 #include "alloc.h"
8 #include "bkey_methods.h"
9 #include "btree_locking.h"
10 #include "btree_update_interior.h"
11 #include "btree_io.h"
12 #include "btree_gc.h"
13 #include "buckets.h"
14 #include "clock.h"
15 #include "debug.h"
16 #include "error.h"
17 #include "extents.h"
18 #include "journal.h"
19 #include "keylist.h"
20 #include "move.h"
21 #include "super-io.h"
22
23 #include <linux/slab.h>
24 #include <linux/bitops.h>
25 #include <linux/freezer.h>
26 #include <linux/kthread.h>
27 #include <linux/preempt.h>
28 #include <linux/rcupdate.h>
29 #include <trace/events/bcachefs.h>
30
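/*
 * Sanity checks performed while walking the btree during GC: for each level we
 * track the range of keys seen so far, so we can check that node min/max keys
 * are contiguous and consistent with their child nodes.
 */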
31 struct range_checks {
32         struct range_level {
33                 struct bpos     min;
34                 struct bpos     max;
35         }                       l[BTREE_MAX_DEPTH];
36         unsigned                depth;
37 };
38
39 static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
40 {
41         unsigned i;
42
43         for (i = 0; i < BTREE_MAX_DEPTH; i++)
44                 r->l[i].min = r->l[i].max = POS_MIN;
45         r->depth = depth;
46 }
47
48 static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
49                                     struct range_checks *r)
50 {
51         struct range_level *l = &r->l[b->level];
52
53         struct bpos expected_min = bkey_cmp(l->min, l->max)
54                 ? btree_type_successor(b->btree_id, l->max)
55                 : l->max;
56
57         bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
58                 "btree node has incorrect min key: %llu:%llu != %llu:%llu",
59                 b->data->min_key.inode,
60                 b->data->min_key.offset,
61                 expected_min.inode,
62                 expected_min.offset);
63
64         l->max = b->data->max_key;
65
66         if (b->level > r->depth) {
67                 l = &r->l[b->level - 1];
68
69                 bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
70                         "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
71                         b->data->min_key.inode,
72                         b->data->min_key.offset,
73                         l->min.inode,
74                         l->min.offset);
75
76                 bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
77                         "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
78                         b->data->max_key.inode,
79                         b->data->max_key.offset,
80                         l->max.inode,
81                         l->max.offset);
82
83                 if (bkey_cmp(b->data->max_key, POS_MAX))
84                         l->min = l->max =
85                                 btree_type_successor(b->btree_id,
86                                                      b->data->max_key);
87         }
88 }
89
90 u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
91 {
92         const struct bch_extent_ptr *ptr;
93         u8 max_stale = 0;
94
95         if (bkey_extent_is_data(k.k)) {
96                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
97
98                 extent_for_each_ptr(e, ptr) {
99                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
100                         size_t b = PTR_BUCKET_NR(ca, ptr);
101
102                         if (gen_after(ca->oldest_gens[b], ptr->gen))
103                                 ca->oldest_gens[b] = ptr->gen;
104
105                         max_stale = max(max_stale, ptr_stale(ca, ptr));
106                 }
107         }
108
109         return max_stale;
110 }
111
112 /*
113  * For runtime mark and sweep:
114  */
115 static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
116                            struct bkey_s_c k, unsigned flags)
117 {
118         struct gc_pos pos = { 0 };
119         struct bch_fs_usage *stats;
120         u8 ret = 0;
121
122         preempt_disable();
123         stats = this_cpu_ptr(c->usage_percpu);
124         switch (type) {
125         case BKEY_TYPE_BTREE:
126                 bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
127                               0, flags|
128                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
129                               BCH_BUCKET_MARK_GC_LOCK_HELD);
130                 break;
131         case BKEY_TYPE_EXTENTS:
132                 bch2_mark_key(c, k, k.k->size, false, pos, stats,
133                               0, flags|
134                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
135                               BCH_BUCKET_MARK_GC_LOCK_HELD);
136                 ret = bch2_btree_key_recalc_oldest_gen(c, k);
137                 break;
138         default:
139                 BUG();
140         }
141         preempt_enable();
142
143         return ret;
144 }
145
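/*
 * Initial mark, run at startup before journal replay: make sure the superblock
 * lists replicas for this key's devices, repair bucket gens that are missing
 * or behind the gens of the pointers referencing them, then mark the key.
 */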
146 int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
147                                 struct bkey_s_c k)
148 {
149         enum bch_data_type data_type = type == BKEY_TYPE_BTREE
150                 ? BCH_DATA_BTREE : BCH_DATA_USER;
151         struct bch_devs_list devs = bch2_bkey_devs(k);
152         int ret = 0;
153
154         if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
155             fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c,
156                         "superblock not marked as containing replicas (type %u)",
157                         data_type)) {
158                 ret = bch2_check_mark_super(c, data_type, devs);
159                 if (ret)
160                         return ret;
161         }
162
163         switch (k.k->type) {
164         case BCH_EXTENT:
165         case BCH_EXTENT_CACHED: {
166                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
167                 const struct bch_extent_ptr *ptr;
168
169                 extent_for_each_ptr(e, ptr) {
170                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
171                         size_t b = PTR_BUCKET_NR(ca, ptr);
172                         struct bucket *g = PTR_BUCKET(ca, ptr);
173
174                         if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
175                                         "found ptr with missing gen in alloc btree,\n"
176                                         "type %s gen %u",
177                                         bch2_data_types[data_type],
178                                         ptr->gen)) {
179                                 g->_mark.gen = ptr->gen;
180                                 g->_mark.gen_valid = 1;
181                                 set_bit(b, ca->buckets_dirty);
182                         }
183
184                         if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
185                                         "%s ptr gen in the future: %u > %u",
186                                         bch2_data_types[data_type],
187                                         ptr->gen, g->mark.gen)) {
188                                 g->_mark.gen = ptr->gen;
189                                 g->_mark.gen_valid = 1;
190                                 set_bit(b, ca->buckets_dirty);
191                                 set_bit(BCH_FS_FIXED_GENS, &c->flags);
192                         }
193
194                 }
195                 break;
196         }
197         }
198
199         atomic64_set(&c->key_version,
200                      max_t(u64, k.k->version.lo,
201                            atomic64_read(&c->key_version)));
202
203         bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
204 fsck_err:
205         return ret;
206 }
207
208 static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
209 {
210         enum bkey_type type = btree_node_type(b);
211         struct btree_node_iter iter;
212         struct bkey unpacked;
213         struct bkey_s_c k;
214         u8 stale = 0;
215
216         if (btree_node_has_ptrs(b))
217                 for_each_btree_node_key_unpack(b, k, &iter,
218                                                btree_node_is_extents(b),
219                                                &unpacked) {
220                         bch2_bkey_debugcheck(c, b, k);
221                         stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
222                 }
223
224         return stale;
225 }
226
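/*
 * GC's current position is published under the gc_pos_lock seqcount, so that
 * code checking whether GC has visited a given position (see the ordering
 * comment in bch2_gc() below) reads a consistent value; the position is only
 * allowed to move forwards.
 */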
227 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
228 {
229         write_seqcount_begin(&c->gc_pos_lock);
230         c->gc_pos = new_pos;
231         write_seqcount_end(&c->gc_pos_lock);
232 }
233
234 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
235 {
236         BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
237         __gc_pos_set(c, new_pos);
238 }
239
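/*
 * Walk one btree, marking every key and advancing the GC position as we go;
 * nodes with too many stale pointers are rewritten along the way.
 */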
240 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
241 {
242         struct btree_iter iter;
243         struct btree *b;
244         struct range_checks r;
245         unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
246         unsigned max_stale;
247         int ret = 0;
248
249         /*
250          * if expensive_debug_checks is on, run range_checks on all leaf nodes:
251          */
252         if (expensive_debug_checks(c))
253                 depth = 0;
254
255         btree_node_range_checks_init(&r, depth);
256
257         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
258                               0, depth, BTREE_ITER_PREFETCH, b) {
259                 btree_node_range_checks(c, b, &r);
260
261                 bch2_verify_btree_nr_keys(b);
262
263                 max_stale = btree_gc_mark_node(c, b);
264
265                 gc_pos_set(c, gc_pos_btree_node(b));
266
267                 if (max_stale > 32)
268                         bch2_btree_node_rewrite(c, &iter,
269                                         b->data->keys.seq,
270                                         BTREE_INSERT_USE_RESERVE|
271                                         BTREE_INSERT_GC_LOCK_HELD);
272                 else if (!btree_gc_rewrite_disabled(c) &&
273                          (btree_gc_always_rewrite(c) || max_stale > 16))
274                         bch2_btree_node_rewrite(c, &iter,
275                                         b->data->keys.seq,
276                                         BTREE_INSERT_NOWAIT|
277                                         BTREE_INSERT_GC_LOCK_HELD);
278
279                 bch2_btree_iter_cond_resched(&iter);
280         }
281         ret = bch2_btree_iter_unlock(&iter);
282         if (ret)
283                 return ret;
284
285         mutex_lock(&c->btree_root_lock);
286
287         b = c->btree_roots[btree_id].b;
288         if (!btree_node_fake(b))
289                 bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
290         gc_pos_set(c, gc_pos_btree_root(b->btree_id));
291
292         mutex_unlock(&c->btree_root_lock);
293         return 0;
294 }
295
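/* Mark a range of sectors as the given type of metadata, bucket by bucket: */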
296 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
297                                   u64 start, u64 end,
298                                   enum bch_data_type type,
299                                   unsigned flags)
300 {
301         u64 b = sector_to_bucket(ca, start);
302
303         do {
304                 unsigned sectors =
305                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
306
307                 bch2_mark_metadata_bucket(c, ca, b, type, sectors,
308                                           gc_phase(GC_PHASE_SB), flags);
309                 b++;
310                 start += sectors;
311         } while (start < end);
312 }
313
314 void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
315                               unsigned flags)
316 {
317         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
318         unsigned i;
319         u64 b;
320
321         lockdep_assert_held(&c->sb_lock);
322
323         for (i = 0; i < layout->nr_superblocks; i++) {
324                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
325
326                 if (offset == BCH_SB_SECTOR)
327                         mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
328                                               BCH_DATA_SB, flags);
329
330                 mark_metadata_sectors(c, ca, offset,
331                                       offset + (1 << layout->sb_max_size_bits),
332                                       BCH_DATA_SB, flags);
333         }
334
335         spin_lock(&c->journal.lock);
336
337         for (i = 0; i < ca->journal.nr; i++) {
338                 b = ca->journal.buckets[i];
339                 bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
340                                           ca->mi.bucket_size,
341                                           gc_phase(GC_PHASE_SB), flags);
342         }
343
344         spin_unlock(&c->journal.lock);
345 }
346
347 static void bch2_mark_superblocks(struct bch_fs *c)
348 {
349         struct bch_dev *ca;
350         unsigned i;
351
352         mutex_lock(&c->sb_lock);
353         gc_pos_set(c, gc_phase(GC_PHASE_SB));
354
355         for_each_online_member(ca, c, i)
356                 bch2_mark_dev_superblock(c, ca,
357                                          BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
358                                          BCH_BUCKET_MARK_GC_LOCK_HELD);
359         mutex_unlock(&c->sb_lock);
360 }
361
362 /* Also see bch2_pending_btree_node_free_insert_done() */
363 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
364 {
365         struct gc_pos pos = { 0 };
366         struct bch_fs_usage stats = { 0 };
367         struct btree_update *as;
368         struct pending_btree_node_free *d;
369
370         mutex_lock(&c->btree_interior_update_lock);
371         gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
372
373         for_each_pending_btree_node_free(c, as, d)
374                 if (d->index_update_done)
375                         bch2_mark_key(c, bkey_i_to_s_c(&d->key),
376                                       c->opts.btree_node_size, true, pos,
377                                       &stats, 0,
378                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
379                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
380         /*
381          * Don't apply stats - pending deletes aren't tracked in
382          * bch_alloc_stats:
383          */
384
385         mutex_unlock(&c->btree_interior_update_lock);
386 }
387
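/*
 * Buckets sitting on the allocator freelists and in open buckets aren't
 * referenced by any btree key, so walk the allocator's data structures and
 * mark them here:
 */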
388 static void bch2_mark_allocator_buckets(struct bch_fs *c)
389 {
390         struct bch_dev *ca;
391         struct open_bucket *ob;
392         size_t i, j, iter;
393         unsigned ci;
394
395         spin_lock(&c->freelist_lock);
396         gc_pos_set(c, gc_pos_alloc(c, NULL));
397
398         for_each_member_device(ca, c, ci) {
399                 fifo_for_each_entry(i, &ca->free_inc, iter)
400                         bch2_mark_alloc_bucket(c, ca, i, true,
401                                                gc_pos_alloc(c, NULL),
402                                                BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
403                                                BCH_BUCKET_MARK_GC_LOCK_HELD);
404
405
406
407                 for (j = 0; j < RESERVE_NR; j++)
408                         fifo_for_each_entry(i, &ca->free[j], iter)
409                                 bch2_mark_alloc_bucket(c, ca, i, true,
410                                                        gc_pos_alloc(c, NULL),
411                                                        BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
412                                                        BCH_BUCKET_MARK_GC_LOCK_HELD);
413         }
414
415         spin_unlock(&c->freelist_lock);
416
417         for (ob = c->open_buckets;
418              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
419              ob++) {
420                 spin_lock(&ob->lock);
421                 if (ob->valid) {
422                         gc_pos_set(c, gc_pos_alloc(c, ob));
423                         ca = bch_dev_bkey_exists(c, ob->ptr.dev);
424                         bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
425                                                gc_pos_alloc(c, ob),
426                                                BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
427                                                BCH_BUCKET_MARK_GC_LOCK_HELD);
428                 }
429                 spin_unlock(&ob->lock);
430         }
431 }
432
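/*
 * Stash the current usage statistics and zero out bucket marks, so that the
 * mark and sweep pass can recompute them from scratch:
 */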
433 static void bch2_gc_start(struct bch_fs *c)
434 {
435         struct bch_dev *ca;
436         struct bucket_array *buckets;
437         struct bucket_mark new;
438         unsigned i;
439         size_t b;
440         int cpu;
441
442         lg_global_lock(&c->usage_lock);
443
444         /*
445          * Indicates to buckets code that gc is now in progress - done under
446          * usage_lock to avoid racing with bch2_mark_key():
447          */
448         __gc_pos_set(c, GC_POS_MIN);
449
450         /* Save a copy of the existing bucket stats while we recompute them: */
451         for_each_member_device(ca, c, i) {
452                 ca->usage_cached = __bch2_dev_usage_read(ca);
453                 for_each_possible_cpu(cpu) {
454                         struct bch_dev_usage *p =
455                                 per_cpu_ptr(ca->usage_percpu, cpu);
456                         memset(p, 0, sizeof(*p));
457                 }
458         }
459
460         c->usage_cached = __bch2_fs_usage_read(c);
461         for_each_possible_cpu(cpu) {
462                 struct bch_fs_usage *p =
463                         per_cpu_ptr(c->usage_percpu, cpu);
464
465                 memset(p->s, 0, sizeof(p->s));
466         }
467
468         lg_global_unlock(&c->usage_lock);
469
470         /* Clear bucket marks: */
471         for_each_member_device(ca, c, i) {
472                 down_read(&ca->bucket_lock);
473                 buckets = bucket_array(ca);
474
475                 for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
476                         bucket_cmpxchg(buckets->b + b, new, ({
477                                 new.owned_by_allocator  = 0;
478                                 new.data_type           = 0;
479                                 new.cached_sectors      = 0;
480                                 new.dirty_sectors       = 0;
481                         }));
482                         ca->oldest_gens[b] = new.gen;
483                 }
484                 up_read(&ca->bucket_lock);
485         }
486 }
487
488 /**
489  * bch2_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
490  */
491 void bch2_gc(struct bch_fs *c)
492 {
493         struct bch_dev *ca;
494         u64 start_time = local_clock();
495         unsigned i;
496
497         /*
498          * Walk _all_ references to buckets, and recompute them:
499          *
500          * Order matters here:
501          *  - Concurrent GC relies on the fact that we have a total ordering for
502          *    everything that GC walks - see gc_will_visit_node(),
503          *    gc_will_visit_root()
504          *
505          *  - also, references move around in the course of index updates and
506          *    various other crap: everything needs to agree on the ordering
507          *    references are allowed to move around in - e.g., we're allowed to
508          *    start with a reference owned by an open_bucket (the allocator) and
509          *    move it to the btree, but not the reverse.
510          *
511          *    This is necessary to ensure that gc doesn't miss references that
512          *    move around - if references move backwards in the ordering GC
513          *    uses, GC could skip past them
514          *    uses, GC could skip past them.
515         trace_gc_start(c);
516
517         /*
518          * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
519          * gc_lock if sectors_available goes to 0:
520          */
521         bch2_recalc_sectors_available(c);
522
523         down_write(&c->gc_lock);
524         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
525                 goto out;
526
527         bch2_gc_start(c);
528
529         /* Walk btree: */
530         while (c->gc_pos.phase < (int) BTREE_ID_NR) {
531                 int ret = c->btree_roots[c->gc_pos.phase].b
532                         ? bch2_gc_btree(c, (int) c->gc_pos.phase)
533                         : 0;
534
535                 if (ret) {
536                         bch_err(c, "btree gc failed: %d", ret);
537                         set_bit(BCH_FS_GC_FAILURE, &c->flags);
538                         goto out;
539                 }
540
541                 gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
542         }
543
544         bch2_mark_superblocks(c);
545         bch2_mark_pending_btree_node_frees(c);
546         bch2_mark_allocator_buckets(c);
547
548         for_each_member_device(ca, c, i)
549                 atomic_long_set(&ca->saturated_count, 0);
550
551         /* Indicates that gc is no longer in progress: */
552         gc_pos_set(c, gc_phase(GC_PHASE_DONE));
553         c->gc_count++;
554 out:
555         up_write(&c->gc_lock);
556         trace_gc_end(c);
557         bch2_time_stats_update(&c->btree_gc_time, start_time);
558
559         /*
560          * Wake up the allocator in case it was waiting for buckets
561          * because it wasn't able to increment bucket gens
562          */
563         for_each_member_device(ca, c, i)
564                 bch2_wake_allocator(ca);
565
566         /*
567          * At startup, allocations can happen directly instead of via the
568          * allocator thread - issue wakeup in case they blocked on gc_lock:
569          */
570         closure_wake_up(&c->freelist_wait);
571 }
572
573 /* Btree coalescing */
574
575 static void recalc_packed_keys(struct btree *b)
576 {
577         struct bkey_packed *k;
578
579         memset(&b->nr, 0, sizeof(b->nr));
580
581         BUG_ON(b->nsets != 1);
582
583         for (k =  btree_bkey_first(b, b->set);
584              k != btree_bkey_last(b, b->set);
585              k = bkey_next(k))
586                 btree_keys_account_key_add(&b->nr, 0, k);
587 }
588
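/*
 * Attempt to repack the live keys from up to GC_MERGE_NODES adjacent sibling
 * nodes into one fewer node. Bails out early if the keys can't fit in one
 * fewer node, the new format doesn't pack, the keylist allocation fails, or
 * no reserve is available.
 */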
589 static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
590                                 struct btree *old_nodes[GC_MERGE_NODES])
591 {
592         struct btree *parent = btree_node_parent(iter, old_nodes[0]);
593         unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
594         unsigned blocks = btree_blocks(c) * 2 / 3;
595         struct btree *new_nodes[GC_MERGE_NODES];
596         struct btree_update *as;
597         struct keylist keylist;
598         struct bkey_format_state format_state;
599         struct bkey_format new_format;
600
601         memset(new_nodes, 0, sizeof(new_nodes));
602         bch2_keylist_init(&keylist, NULL);
603
604         /* Count keys that are not deleted */
605         for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
606                 u64s += old_nodes[i]->nr.live_u64s;
607
608         nr_old_nodes = nr_new_nodes = i;
609
610         /* Check if all keys in @old_nodes could fit in one fewer node */
611         if (nr_old_nodes <= 1 ||
612             __vstruct_blocks(struct btree_node, c->block_bits,
613                              DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
614                 return;
615
616         /* Find a format that all keys in @old_nodes can pack into */
617         bch2_bkey_format_init(&format_state);
618
619         for (i = 0; i < nr_old_nodes; i++)
620                 __bch2_btree_calc_format(&format_state, old_nodes[i]);
621
622         new_format = bch2_bkey_format_done(&format_state);
623
624         /* Check if repacking would make any nodes too big to fit */
625         for (i = 0; i < nr_old_nodes; i++)
626                 if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
627                         trace_btree_gc_coalesce_fail(c,
628                                         BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
629                         return;
630                 }
631
632         if (bch2_keylist_realloc(&keylist, NULL, 0,
633                         (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
634                 trace_btree_gc_coalesce_fail(c,
635                                 BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
636                 return;
637         }
638
639         as = bch2_btree_update_start(c, iter->btree_id,
640                         btree_update_reserve_required(c, parent) + nr_old_nodes,
641                         BTREE_INSERT_NOFAIL|
642                         BTREE_INSERT_USE_RESERVE,
643                         NULL);
644         if (IS_ERR(as)) {
645                 trace_btree_gc_coalesce_fail(c,
646                                 BTREE_GC_COALESCE_FAIL_RESERVE_GET);
647                 bch2_keylist_free(&keylist, NULL);
648                 return;
649         }
650
651         trace_btree_gc_coalesce(c, old_nodes[0]);
652
653         for (i = 0; i < nr_old_nodes; i++)
654                 bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
655
656         /* Repack everything with @new_format and sort down to one bset */
657         for (i = 0; i < nr_old_nodes; i++)
658                 new_nodes[i] =
659                         __bch2_btree_node_alloc_replacement(as, old_nodes[i],
660                                                             new_format);
661
662         /*
663          * Conceptually we concatenate the nodes together and slice them
664          * up at different boundaries.
665          */
666         for (i = nr_new_nodes - 1; i > 0; --i) {
667                 struct btree *n1 = new_nodes[i];
668                 struct btree *n2 = new_nodes[i - 1];
669
670                 struct bset *s1 = btree_bset_first(n1);
671                 struct bset *s2 = btree_bset_first(n2);
672                 struct bkey_packed *k, *last = NULL;
673
674                 /* Calculate how many keys from @n2 we could fit inside @n1 */
675                 u64s = 0;
676
677                 for (k = s2->start;
678                      k < vstruct_last(s2) &&
679                      vstruct_blocks_plus(n1->data, c->block_bits,
680                                          u64s + k->u64s) <= blocks;
681                      k = bkey_next(k)) {
682                         last = k;
683                         u64s += k->u64s;
684                 }
685
686                 if (u64s == le16_to_cpu(s2->u64s)) {
687                         /* n2 fits entirely in n1 */
688                         n1->key.k.p = n1->data->max_key = n2->data->max_key;
689
690                         memcpy_u64s(vstruct_last(s1),
691                                     s2->start,
692                                     le16_to_cpu(s2->u64s));
693                         le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
694
695                         set_btree_bset_end(n1, n1->set);
696
697                         six_unlock_write(&n2->lock);
698                         bch2_btree_node_free_never_inserted(c, n2);
699                         six_unlock_intent(&n2->lock);
700
701                         memmove(new_nodes + i - 1,
702                                 new_nodes + i,
703                                 sizeof(new_nodes[0]) * (nr_new_nodes - i));
704                         new_nodes[--nr_new_nodes] = NULL;
705                 } else if (u64s) {
706                         /* move part of n2 into n1 */
707                         n1->key.k.p = n1->data->max_key =
708                                 bkey_unpack_pos(n1, last);
709
710                         n2->data->min_key =
711                                 btree_type_successor(iter->btree_id,
712                                                      n1->data->max_key);
713
714                         memcpy_u64s(vstruct_last(s1),
715                                     s2->start, u64s);
716                         le16_add_cpu(&s1->u64s, u64s);
717
718                         memmove(s2->start,
719                                 vstruct_idx(s2, u64s),
720                                 (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
721                         s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
722
723                         set_btree_bset_end(n1, n1->set);
724                         set_btree_bset_end(n2, n2->set);
725                 }
726         }
727
728         for (i = 0; i < nr_new_nodes; i++) {
729                 struct btree *n = new_nodes[i];
730
731                 recalc_packed_keys(n);
732                 btree_node_reset_sib_u64s(n);
733
734                 bch2_btree_build_aux_trees(n);
735                 six_unlock_write(&n->lock);
736
737                 bch2_btree_node_write(c, n, SIX_LOCK_intent);
738         }
739
740         /*
741          * The keys for the old nodes get deleted. We don't want to insert keys
742          * that compare equal to the keys for the new nodes we'll also be
743          * inserting - we can't because keys on a keylist must be strictly
744          * greater than the previous keys, and we also don't need to since the
745          * key for the new node will serve the same purpose (overwriting the key
746          * for the old node).
747          */
748         for (i = 0; i < nr_old_nodes; i++) {
749                 struct bkey_i delete;
750                 unsigned j;
751
752                 for (j = 0; j < nr_new_nodes; j++)
753                         if (!bkey_cmp(old_nodes[i]->key.k.p,
754                                       new_nodes[j]->key.k.p))
755                                 goto next;
756
757                 bkey_init(&delete.k);
758                 delete.k.p = old_nodes[i]->key.k.p;
759                 bch2_keylist_add_in_order(&keylist, &delete);
760 next:
761         i = i; /* no-op: a label must be followed by a statement */
762         }
763
764         /*
765          * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
766          * does the lookup once and thus expects the keys to be in sorted order,
767          * so we have to make sure the new keys are correctly ordered with
768          * respect to the deleted keys added in the previous loop.
769          */
770         for (i = 0; i < nr_new_nodes; i++)
771                 bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
772
773         /* Insert the newly coalesced nodes */
774         bch2_btree_insert_node(as, parent, iter, &keylist);
775
776         BUG_ON(!bch2_keylist_empty(&keylist));
777
778         BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
779
780         BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));
781
782         for (i = 0; i < nr_new_nodes; i++)
783                 bch2_btree_open_bucket_put(c, new_nodes[i]);
784
785         /* Free the old nodes and update our sliding window */
786         for (i = 0; i < nr_old_nodes; i++) {
787                 bch2_btree_node_free_inmem(c, old_nodes[i], iter);
788                 six_unlock_intent(&old_nodes[i]->lock);
789
790                 /*
791                  * The index update might have triggered a split, in which
792                  * case the new nodes we just created might no longer be
793                  * sibling nodes - don't add them to the sliding window
794                  * (except the first):
795                  */
796                 if (!i) {
797                         old_nodes[i] = new_nodes[i];
798                 } else {
799                         old_nodes[i] = NULL;
800                         if (new_nodes[i])
801                                 six_unlock_intent(&new_nodes[i]->lock);
802                 }
803         }
804
805         bch2_btree_update_done(as);
806         bch2_keylist_free(&keylist, NULL);
807 }
808
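/*
 * Walk a btree keeping a sliding window of up to GC_MERGE_NODES locked
 * sibling nodes, attempting to coalesce each window as we go:
 */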
809 static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
810 {
811         struct btree_iter iter;
812         struct btree *b;
813         unsigned i;
814
815         /* Sliding window of adjacent btree nodes */
816         struct btree *merge[GC_MERGE_NODES];
817         u32 lock_seq[GC_MERGE_NODES];
818
819         /*
820          * XXX: We don't have a good way of positively matching on sibling nodes
821          * that have the same parent - this code works by handling the cases
822          * where they might not have the same parent, and is thus fragile. Ugh.
823          *
824          * Perhaps redo this to use multiple linked iterators?
825          */
826         memset(merge, 0, sizeof(merge));
827
828         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
829                               BTREE_MAX_DEPTH, 0,
830                               BTREE_ITER_PREFETCH, b) {
831                 memmove(merge + 1, merge,
832                         sizeof(merge) - sizeof(merge[0]));
833                 memmove(lock_seq + 1, lock_seq,
834                         sizeof(lock_seq) - sizeof(lock_seq[0]));
835
836                 merge[0] = b;
837
838                 for (i = 1; i < GC_MERGE_NODES; i++) {
839                         if (!merge[i] ||
840                             !six_relock_intent(&merge[i]->lock, lock_seq[i]))
841                                 break;
842
843                         if (merge[i]->level != merge[0]->level) {
844                                 six_unlock_intent(&merge[i]->lock);
845                                 break;
846                         }
847                 }
848                 memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
849
850                 bch2_coalesce_nodes(c, &iter, merge);
851
852                 for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
853                         lock_seq[i] = merge[i]->lock.state.seq;
854                         six_unlock_intent(&merge[i]->lock);
855                 }
856
857                 lock_seq[0] = merge[0]->lock.state.seq;
858
859                 if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
860                         bch2_btree_iter_unlock(&iter);
861                         return -ESHUTDOWN;
862                 }
863
864                 bch2_btree_iter_cond_resched(&iter);
865
866                 /*
867                  * If the parent node wasn't relocked, it might have been split
868                  * and the nodes in our sliding window might not have the same
869                  * parent anymore - blow away the sliding window:
870                  */
871                 if (btree_iter_node(&iter, iter.level + 1) &&
872                     !btree_node_intent_locked(&iter, iter.level + 1))
873                         memset(merge + 1, 0,
874                                (GC_MERGE_NODES - 1) * sizeof(merge[0]));
875         }
876         return bch2_btree_iter_unlock(&iter);
877 }
878
879 /**
880  * bch2_coalesce - coalesce adjacent nodes with low occupancy
881  */
882 void bch2_coalesce(struct bch_fs *c)
883 {
884         enum btree_id id;
885
886         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
887                 return;
888
889         down_read(&c->gc_lock);
890         trace_gc_coalesce_start(c);
891
892         for (id = 0; id < BTREE_ID_NR; id++) {
893                 int ret = c->btree_roots[id].b
894                         ? bch2_coalesce_btree(c, id)
895                         : 0;
896
897                 if (ret) {
898                         if (ret != -ESHUTDOWN)
899                                 bch_err(c, "btree coalescing failed: %d", ret);
900                         set_bit(BCH_FS_GC_FAILURE, &c->flags);
901                         return;
902                 }
903         }
904
905         trace_gc_coalesce_end(c);
906         up_read(&c->gc_lock);
907 }
908
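/*
 * GC thread: when periodic GC is enabled, it runs after roughly 1/16th of the
 * filesystem's capacity has been written (measured via the write IO clock);
 * it can also be kicked explicitly via c->kick_gc.
 */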
909 static int bch2_gc_thread(void *arg)
910 {
911         struct bch_fs *c = arg;
912         struct io_clock *clock = &c->io_clock[WRITE];
913         unsigned long last = atomic_long_read(&clock->now);
914         unsigned last_kick = atomic_read(&c->kick_gc);
915
916         set_freezable();
917
918         while (1) {
919                 while (1) {
920                         set_current_state(TASK_INTERRUPTIBLE);
921
922                         if (kthread_should_stop()) {
923                                 __set_current_state(TASK_RUNNING);
924                                 return 0;
925                         }
926
927                         if (atomic_read(&c->kick_gc) != last_kick)
928                                 break;
929
930                         if (c->btree_gc_periodic) {
931                                 unsigned long next = last + c->capacity / 16;
932
933                                 if (atomic_long_read(&clock->now) >= next)
934                                         break;
935
936                                 bch2_io_clock_schedule_timeout(clock, next);
937                         } else {
938                                 schedule();
939                         }
940
941                         try_to_freeze();
942                 }
943                 __set_current_state(TASK_RUNNING);
944
945                 last = atomic_long_read(&clock->now);
946                 last_kick = atomic_read(&c->kick_gc);
947
948                 bch2_gc(c);
949
950                 debug_check_no_locks_held();
951         }
952
953         return 0;
954 }
955
956 void bch2_gc_thread_stop(struct bch_fs *c)
957 {
958         set_bit(BCH_FS_GC_STOPPING, &c->flags);
959
960         if (c->gc_thread)
961                 kthread_stop(c->gc_thread);
962
963         c->gc_thread = NULL;
964         clear_bit(BCH_FS_GC_STOPPING, &c->flags);
965 }
966
967 int bch2_gc_thread_start(struct bch_fs *c)
968 {
969         struct task_struct *p;
970
971         BUG_ON(c->gc_thread);
972
973         p = kthread_create(bch2_gc_thread, c, "bcache_gc");
974         if (IS_ERR(p))
975                 return PTR_ERR(p);
976
977         c->gc_thread = p;
978         wake_up_process(c->gc_thread);
979         return 0;
980 }
981
982 /* Initial GC computes bucket marks during startup */
983
984 static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
985 {
986         struct btree_iter iter;
987         struct btree *b;
988         struct range_checks r;
989         int ret = 0;
990
991         btree_node_range_checks_init(&r, 0);
992
993         if (!c->btree_roots[id].b)
994                 return 0;
995
996         b = c->btree_roots[id].b;
997         if (!btree_node_fake(b))
998                 ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
999                                                   bkey_i_to_s_c(&b->key));
1000         if (ret)
1001                 return ret;
1002
1003         /*
1004          * We have to hit every btree node before starting journal replay, in
1005          * order for the journal seq blacklist machinery to work:
1006          */
1007         for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
1008                 btree_node_range_checks(c, b, &r);
1009
1010                 if (btree_node_has_ptrs(b)) {
1011                         struct btree_node_iter node_iter;
1012                         struct bkey unpacked;
1013                         struct bkey_s_c k;
1014
1015                         for_each_btree_node_key_unpack(b, k, &node_iter,
1016                                                        btree_node_is_extents(b),
1017                                                        &unpacked) {
1018                                 ret = bch2_btree_mark_key_initial(c,
1019                                                         btree_node_type(b), k);
1020                                 if (ret)
1021                                         goto err;
1022                         }
1023                 }
1024
1025                 bch2_btree_iter_cond_resched(&iter);
1026         }
1027 err:
1028         return bch2_btree_iter_unlock(&iter) ?: ret;
1029 }
1030
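/*
 * Initial GC walks every btree (plus the journal and superblocks) to compute
 * bucket marks before journal replay, restarting the pass if any bucket gens
 * had to be fixed up:
 */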
1031 static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1032 {
1033         unsigned iter = 0;
1034         enum btree_id id;
1035         int ret;
1036
1037         mutex_lock(&c->sb_lock);
1038         if (!bch2_sb_get_replicas(c->disk_sb)) {
1039                 if (BCH_SB_INITIALIZED(c->disk_sb))
1040                         bch_info(c, "building replicas info");
1041                 set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
1042         }
1043         mutex_unlock(&c->sb_lock);
1044 again:
1045         bch2_gc_start(c);
1046
1047         for (id = 0; id < BTREE_ID_NR; id++) {
1048                 ret = bch2_initial_gc_btree(c, id);
1049                 if (ret)
1050                         return ret;
1051         }
1052
1053         ret = bch2_journal_mark(c, journal);
1054         if (ret)
1055                 return ret;
1056
1057         if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
1058                 if (iter++ > 2) {
1059                         bch_info(c, "Unable to fix bucket gens, looping");
1060                         return -EINVAL;
1061                 }
1062
1063                 bch_info(c, "Fixed gens, restarting initial mark and sweep:");
1064                 clear_bit(BCH_FS_FIXED_GENS, &c->flags);
1065                 goto again;
1066         }
1067
1068         /*
1069          * Skip past versions that might have been used (as nonces) but
1070          * hadn't had their pointers written:
1071          */
1072         if (c->sb.encryption_type)
1073                 atomic64_add(1 << 16, &c->key_version);
1074
1075         bch2_mark_superblocks(c);
1076
1077         gc_pos_set(c, gc_phase(GC_PHASE_DONE));
1078         set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
1079
1080         return 0;
1081 }
1082
1083 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1084 {
1085         int ret;
1086
1087         down_write(&c->gc_lock);
1088         ret = __bch2_initial_gc(c, journal);
1089         up_write(&c->gc_lock);
1090
1091         return ret;
1092 }