/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc.h"
#include "bkey_methods.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "super-io.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <trace/events/bcachefs.h>

struct range_checks {
        struct range_level {
                struct bpos     min;
                struct bpos     max;
        }                       l[BTREE_MAX_DEPTH];
        unsigned                depth;
};

static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
{
        unsigned i;

        for (i = 0; i < BTREE_MAX_DEPTH; i++)
                r->l[i].min = r->l[i].max = POS_MIN;
        r->depth = depth;
}

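/*
 * As GC walks each level of the btree in key order, check that every node's
 * min_key is the successor of the previous node's max_key at that level, and
 * that an interior node's min/max keys agree with the min/max of its children.
 */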
static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
                                    struct range_checks *r)
{
        struct range_level *l = &r->l[b->level];

        struct bpos expected_min = bkey_cmp(l->min, l->max)
                ? btree_type_successor(b->btree_id, l->max)
                : l->max;

        bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
                "btree node has incorrect min key: %llu:%llu != %llu:%llu",
                b->data->min_key.inode,
                b->data->min_key.offset,
                expected_min.inode,
                expected_min.offset);

        l->max = b->data->max_key;

        if (b->level > r->depth) {
                l = &r->l[b->level - 1];

                bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
                        "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
                        b->data->min_key.inode,
                        b->data->min_key.offset,
                        l->min.inode,
                        l->min.offset);

                bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
                        "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
                        b->data->max_key.inode,
                        b->data->max_key.offset,
                        l->max.inode,
                        l->max.offset);

                if (bkey_cmp(b->data->max_key, POS_MAX))
                        l->min = l->max =
                                btree_type_successor(b->btree_id,
                                                     b->data->max_key);
        }
}

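/*
 * Update the oldest generation still referenced for each bucket this key
 * points into, and return the staleness of the most stale pointer.
 */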
u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
{
        const struct bch_extent_ptr *ptr;
        u8 max_stale = 0;

        if (bkey_extent_is_data(k.k)) {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);

                extent_for_each_ptr(e, ptr) {
                        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
                        size_t b = PTR_BUCKET_NR(ca, ptr);

                        if (gen_after(ca->oldest_gens[b], ptr->gen))
                                ca->oldest_gens[b] = ptr->gen;

                        max_stale = max(max_stale, ptr_stale(ca, ptr));
                }
        }

        return max_stale;
}

/*
 * For runtime mark and sweep:
 */
static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
                           struct bkey_s_c k, unsigned flags)
{
        struct gc_pos pos = { 0 };
        struct bch_fs_usage *stats;
        u8 ret = 0;

        preempt_disable();
        stats = this_cpu_ptr(c->usage_percpu);
        switch (type) {
        case BKEY_TYPE_BTREE:
                bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
                              0, flags|
                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                              BCH_BUCKET_MARK_GC_LOCK_HELD);
                break;
        case BKEY_TYPE_EXTENTS:
                bch2_mark_key(c, k, k.k->size, false, pos, stats,
                              0, flags|
                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                              BCH_BUCKET_MARK_GC_LOCK_HELD);
                ret = bch2_btree_key_recalc_oldest_gen(c, k);
                break;
        default:
                BUG();
        }
        preempt_enable();

        return ret;
}

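/*
 * For initial mark and sweep (at startup): make sure the superblock's
 * replicas section covers this key, repair bucket generations that are
 * missing or in the future, then mark the key.
 */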
int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                                struct bkey_s_c k)
{
        enum bch_data_type data_type = type == BKEY_TYPE_BTREE
                ? BCH_DATA_BTREE : BCH_DATA_USER;
        int ret = 0;

        switch (k.k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const struct bch_extent_ptr *ptr;

                if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
                    (!c->opts.nofsck &&
                     fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
                                 "superblock not marked as containing replicas (type %u)",
                                 data_type))) {
                        ret = bch2_check_mark_super(c, e, data_type);
                        if (ret)
                                return ret;
                }

                extent_for_each_ptr(e, ptr) {
                        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
                        struct bucket *g = PTR_BUCKET(ca, ptr);

                        if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
                                        "found ptr with missing gen in alloc btree,\n"
                                        "type %s gen %u",
                                        bch2_data_types[data_type],
                                        ptr->gen)) {
                                g->_mark.gen = ptr->gen;
                                g->_mark.gen_valid = 1;
                                set_bit(g - ca->buckets, ca->bucket_dirty);
                        }

                        if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
                                        "%s ptr gen in the future: %u > %u",
                                        bch2_data_types[data_type],
                                        ptr->gen, g->mark.gen)) {
                                g->_mark.gen = ptr->gen;
                                g->_mark.gen_valid = 1;
                                set_bit(g - ca->buckets, ca->bucket_dirty);
                                set_bit(BCH_FS_FIXED_GENS, &c->flags);
                        }
                }
                break;
        }
        }

        atomic64_set(&c->key_version,
                     max_t(u64, k.k->version.lo,
                           atomic64_read(&c->key_version)));

        bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
fsck_err:
        return ret;
}

static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
{
        enum bkey_type type = btree_node_type(b);
        struct btree_node_iter iter;
        struct bkey unpacked;
        struct bkey_s_c k;
        u8 stale = 0;

        if (btree_node_has_ptrs(b))
                for_each_btree_node_key_unpack(b, k, &iter,
                                               btree_node_is_extents(b),
                                               &unpacked) {
                        bch2_bkey_debugcheck(c, b, k);
                        stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
                }

        return stale;
}

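/*
 * GC's current position in its total ordering of everything it walks (btree
 * nodes and roots, then superblocks, pending deletes, allocator buckets) is
 * published under a seqcount, so code marking references concurrently can
 * tell whether GC has visited them yet - see gc_will_visit_node(),
 * gc_will_visit_root().
 */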
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
        write_seqcount_begin(&c->gc_pos_lock);
        c->gc_pos = new_pos;
        write_seqcount_end(&c->gc_pos_lock);
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
        BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
        __gc_pos_set(c, new_pos);
}

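/*
 * Walk all nodes of one btree in order, marking every key (and rewriting
 * nodes with too many stale pointers as we go), then mark the root.
 */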
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
{
        struct btree_iter iter;
        struct btree *b;
        struct range_checks r;
        unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
        unsigned max_stale;
        int ret = 0;

        /*
         * if expensive_debug_checks is on, run range_checks on all leaf nodes:
         */
        if (expensive_debug_checks(c))
                depth = 0;

        btree_node_range_checks_init(&r, depth);

        __for_each_btree_node(&iter, c, btree_id, POS_MIN,
                              0, depth, BTREE_ITER_PREFETCH, b) {
                btree_node_range_checks(c, b, &r);

                bch2_verify_btree_nr_keys(b);

                max_stale = btree_gc_mark_node(c, b);

                gc_pos_set(c, gc_pos_btree_node(b));

                if (max_stale > 32)
                        bch2_btree_node_rewrite(c, &iter,
                                        b->data->keys.seq,
                                        BTREE_INSERT_USE_RESERVE|
                                        BTREE_INSERT_GC_LOCK_HELD);
                else if (!btree_gc_rewrite_disabled(c) &&
                         (btree_gc_always_rewrite(c) || max_stale > 16))
                        bch2_btree_node_rewrite(c, &iter,
                                        b->data->keys.seq,
                                        BTREE_INSERT_NOWAIT|
                                        BTREE_INSERT_GC_LOCK_HELD);

                bch2_btree_iter_cond_resched(&iter);
        }
        ret = bch2_btree_iter_unlock(&iter);
        if (ret)
                return ret;

        mutex_lock(&c->btree_root_lock);

        b = c->btree_roots[btree_id].b;
        bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
        gc_pos_set(c, gc_pos_btree_root(b->btree_id));

        mutex_unlock(&c->btree_root_lock);
        return 0;
}

static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
                                  u64 start, u64 end,
                                  enum bucket_data_type type,
                                  unsigned flags)
{
        u64 b = sector_to_bucket(ca, start);

        do {
                bch2_mark_metadata_bucket(c, ca, ca->buckets + b, type,
                                          gc_phase(GC_PHASE_SB), flags);
                b++;
        } while (b < sector_to_bucket(ca, end));
}

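/*
 * Mark every superblock copy described by the superblock layout, plus this
 * device's journal buckets, so they're accounted as metadata rather than
 * free space.
 */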
void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
                              unsigned flags)
{
        struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
        unsigned i;
        u64 b;

        lockdep_assert_held(&c->sb_lock);

        for (i = 0; i < layout->nr_superblocks; i++) {
                u64 offset = le64_to_cpu(layout->sb_offset[i]);

                if (offset == BCH_SB_SECTOR)
                        mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
                                              BUCKET_SB, flags);

                mark_metadata_sectors(c, ca, offset,
                                      offset + (1 << layout->sb_max_size_bits),
                                      BUCKET_SB, flags);
        }

        spin_lock(&c->journal.lock);

        for (i = 0; i < ca->journal.nr; i++) {
                b = ca->journal.buckets[i];
                bch2_mark_metadata_bucket(c, ca, ca->buckets + b,
                                          BUCKET_JOURNAL,
                                          gc_phase(GC_PHASE_SB), flags);
        }

        spin_unlock(&c->journal.lock);
}

static void bch2_mark_superblocks(struct bch_fs *c)
{
        struct bch_dev *ca;
        unsigned i;

        mutex_lock(&c->sb_lock);
        gc_pos_set(c, gc_phase(GC_PHASE_SB));

        for_each_online_member(ca, c, i)
                bch2_mark_dev_superblock(c, ca,
                                         BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                         BCH_BUCKET_MARK_GC_LOCK_HELD);
        mutex_unlock(&c->sb_lock);
}

/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
        struct gc_pos pos = { 0 };
        struct bch_fs_usage stats = { 0 };
        struct btree_update *as;
        struct pending_btree_node_free *d;

        mutex_lock(&c->btree_interior_update_lock);
        gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));

        for_each_pending_btree_node_free(c, as, d)
                if (d->index_update_done)
                        bch2_mark_key(c, bkey_i_to_s_c(&d->key),
                                      c->opts.btree_node_size, true, pos,
                                      &stats, 0,
                                      BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                      BCH_BUCKET_MARK_GC_LOCK_HELD);
        /*
         * Don't apply stats - pending deletes aren't tracked in
         * bch_alloc_stats:
         */

        mutex_unlock(&c->btree_interior_update_lock);
}

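/*
 * Buckets sitting on the allocator's freelists and in open_buckets aren't
 * referenced from the btree; mark them as owned by the allocator so GC
 * doesn't count them as free.
 */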
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
        struct bch_dev *ca;
        struct open_bucket *ob;
        size_t i, j, iter;
        unsigned ci;

        spin_lock(&c->freelist_lock);
        gc_pos_set(c, gc_pos_alloc(c, NULL));

        for_each_member_device(ca, c, ci) {
                fifo_for_each_entry(i, &ca->free_inc, iter)
                        bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
                                               gc_pos_alloc(c, NULL),
                                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                               BCH_BUCKET_MARK_GC_LOCK_HELD);

                for (j = 0; j < RESERVE_NR; j++)
                        fifo_for_each_entry(i, &ca->free[j], iter)
                                bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true,
                                                       gc_pos_alloc(c, NULL),
                                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
        }

        spin_unlock(&c->freelist_lock);

        for (ob = c->open_buckets;
             ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
             ob++) {
                spin_lock(&ob->lock);
                if (ob->valid) {
                        gc_pos_set(c, gc_pos_alloc(c, ob));
                        ca = bch_dev_bkey_exists(c, ob->ptr.dev);
                        bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
                                               gc_pos_alloc(c, ob),
                                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                               BCH_BUCKET_MARK_GC_LOCK_HELD);
                }
                spin_unlock(&ob->lock);
        }
}

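/*
 * Stash a copy of the current usage stats and zero out the live per-cpu
 * counters and per-bucket marks, so the GC walk can recompute them from
 * scratch.
 */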
static void bch2_gc_start(struct bch_fs *c)
{
        struct bch_dev *ca;
        struct bucket *g;
        struct bucket_mark new;
        unsigned i;
        int cpu;

        lg_global_lock(&c->usage_lock);

        /*
         * Indicates to buckets code that gc is now in progress - done under
         * usage_lock to avoid racing with bch2_mark_key():
         */
        __gc_pos_set(c, GC_POS_MIN);

        /* Save a copy of the existing bucket stats while we recompute them: */
        for_each_member_device(ca, c, i) {
                ca->usage_cached = __bch2_dev_usage_read(ca);
                for_each_possible_cpu(cpu) {
                        struct bch_dev_usage *p =
                                per_cpu_ptr(ca->usage_percpu, cpu);
                        memset(p, 0, sizeof(*p));
                }
        }

        c->usage_cached = __bch2_fs_usage_read(c);
        for_each_possible_cpu(cpu) {
                struct bch_fs_usage *p =
                        per_cpu_ptr(c->usage_percpu, cpu);

                memset(p->s, 0, sizeof(p->s));
        }

        lg_global_unlock(&c->usage_lock);

        /* Clear bucket marks: */
        for_each_member_device(ca, c, i)
                for_each_bucket(g, ca) {
                        bucket_cmpxchg(g, new, ({
                                new.owned_by_allocator  = 0;
                                new.data_type           = 0;
                                new.cached_sectors      = 0;
                                new.dirty_sectors       = 0;
                        }));
                        ca->oldest_gens[g - ca->buckets] = new.gen;
                }
}

/**
 * bch2_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
 */
void bch2_gc(struct bch_fs *c)
{
        struct bch_dev *ca;
        u64 start_time = local_clock();
        unsigned i;

        /*
         * Walk _all_ references to buckets, and recompute them:
         *
         * Order matters here:
         *  - Concurrent GC relies on the fact that we have a total ordering for
         *    everything that GC walks - see gc_will_visit_node(),
         *    gc_will_visit_root()
         *
         *  - also, references move around in the course of index updates and
         *    various other crap: everything needs to agree on the ordering
         *    references are allowed to move around in - e.g., we're allowed to
         *    start with a reference owned by an open_bucket (the allocator) and
         *    move it to the btree, but not the reverse.
         *
         *    This is necessary to ensure that gc doesn't miss references that
         *    move around - if references move backwards in the ordering GC
         *    uses, GC could skip past them
         */
        trace_gc_start(c);

        /*
         * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
         * gc_lock if sectors_available goes to 0:
         */
        bch2_recalc_sectors_available(c);

        down_write(&c->gc_lock);
        if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
                goto out;

        bch2_gc_start(c);

        /* Walk btree: */
        while (c->gc_pos.phase < (int) BTREE_ID_NR) {
                int ret = c->btree_roots[c->gc_pos.phase].b
                        ? bch2_gc_btree(c, (int) c->gc_pos.phase)
                        : 0;

                if (ret) {
                        bch_err(c, "btree gc failed: %d", ret);
                        set_bit(BCH_FS_GC_FAILURE, &c->flags);
                        goto out;
                }

                gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
        }

        bch2_mark_superblocks(c);
        bch2_mark_pending_btree_node_frees(c);
        bch2_mark_allocator_buckets(c);

        for_each_member_device(ca, c, i)
                atomic_long_set(&ca->saturated_count, 0);

        /* Indicates that gc is no longer in progress: */
        gc_pos_set(c, gc_phase(GC_PHASE_DONE));
        c->gc_count++;
out:
        up_write(&c->gc_lock);
        trace_gc_end(c);
        bch2_time_stats_update(&c->btree_gc_time, start_time);

        /*
         * Wake up allocator in case it was waiting for buckets
         * because of not being able to inc gens
         */
        for_each_member_device(ca, c, i)
                bch2_wake_allocator(ca);

        /*
         * At startup, allocations can happen directly instead of via the
         * allocator thread - issue wakeup in case they blocked on gc_lock:
         */
        closure_wake_up(&c->freelist_wait);
}

/* Btree coalescing */

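/*
 * Recompute b->nr (the packed/unpacked key counts) from scratch; only valid
 * once the node has been sorted down to a single bset.
 */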
static void recalc_packed_keys(struct btree *b)
{
        struct bkey_packed *k;

        memset(&b->nr, 0, sizeof(b->nr));

        BUG_ON(b->nsets != 1);

        for (k = btree_bkey_first(b, b->set);
             k != btree_bkey_last(b, b->set);
             k = bkey_next(k))
                btree_keys_account_key_add(&b->nr, 0, k);
}

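/*
 * Given a window of up to GC_MERGE_NODES adjacent sibling nodes, check
 * whether their live keys would fit in one fewer node; if so, allocate
 * replacement nodes, redistribute the keys between them and update the
 * parent - e.g. three sparsely filled nodes may be rewritten as two.
 */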
static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                                struct btree *old_nodes[GC_MERGE_NODES])
{
        struct btree *parent = iter->nodes[old_nodes[0]->level + 1];
        unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
        unsigned blocks = btree_blocks(c) * 2 / 3;
        struct btree *new_nodes[GC_MERGE_NODES];
        struct btree_update *as;
        struct keylist keylist;
        struct bkey_format_state format_state;
        struct bkey_format new_format;

        memset(new_nodes, 0, sizeof(new_nodes));
        bch2_keylist_init(&keylist, NULL);

        /* Count keys that are not deleted */
        for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
                u64s += old_nodes[i]->nr.live_u64s;

        nr_old_nodes = nr_new_nodes = i;

        /* Check if all keys in @old_nodes could fit in one fewer node */
        if (nr_old_nodes <= 1 ||
            __vstruct_blocks(struct btree_node, c->block_bits,
                             DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
                return;

        /* Find a format that all keys in @old_nodes can pack into */
        bch2_bkey_format_init(&format_state);

        for (i = 0; i < nr_old_nodes; i++)
                __bch2_btree_calc_format(&format_state, old_nodes[i]);

        new_format = bch2_bkey_format_done(&format_state);

        /* Check if repacking would make any nodes too big to fit */
        for (i = 0; i < nr_old_nodes; i++)
                if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
                        trace_btree_gc_coalesce_fail(c,
                                        BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
                        return;
                }

        if (bch2_keylist_realloc(&keylist, NULL, 0,
                        (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
                trace_btree_gc_coalesce_fail(c,
                                BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
                return;
        }

        as = bch2_btree_update_start(c, iter->btree_id,
                        btree_update_reserve_required(c, parent) + nr_old_nodes,
                        BTREE_INSERT_NOFAIL|
                        BTREE_INSERT_USE_RESERVE,
                        NULL);
        if (IS_ERR(as)) {
                trace_btree_gc_coalesce_fail(c,
                                BTREE_GC_COALESCE_FAIL_RESERVE_GET);
                bch2_keylist_free(&keylist, NULL);
                return;
        }

        trace_btree_gc_coalesce(c, old_nodes[0]);

        for (i = 0; i < nr_old_nodes; i++)
                bch2_btree_interior_update_will_free_node(as, old_nodes[i]);

        /* Repack everything with @new_format and sort down to one bset */
        for (i = 0; i < nr_old_nodes; i++)
                new_nodes[i] =
                        __bch2_btree_node_alloc_replacement(as, old_nodes[i],
                                                            new_format);

        /*
         * Conceptually we concatenate the nodes together and slice them
         * up at different boundaries.
         */
        for (i = nr_new_nodes - 1; i > 0; --i) {
                struct btree *n1 = new_nodes[i];
                struct btree *n2 = new_nodes[i - 1];

                struct bset *s1 = btree_bset_first(n1);
                struct bset *s2 = btree_bset_first(n2);
                struct bkey_packed *k, *last = NULL;

                /* Calculate how many keys from @n2 we could fit inside @n1 */
                u64s = 0;

                for (k = s2->start;
                     k < vstruct_last(s2) &&
                     vstruct_blocks_plus(n1->data, c->block_bits,
                                         u64s + k->u64s) <= blocks;
                     k = bkey_next(k)) {
                        last = k;
                        u64s += k->u64s;
                }

                if (u64s == le16_to_cpu(s2->u64s)) {
                        /* n2 fits entirely in n1 */
                        n1->key.k.p = n1->data->max_key = n2->data->max_key;

                        memcpy_u64s(vstruct_last(s1),
                                    s2->start,
                                    le16_to_cpu(s2->u64s));
                        le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));

                        set_btree_bset_end(n1, n1->set);

                        six_unlock_write(&n2->lock);
                        bch2_btree_node_free_never_inserted(c, n2);
                        six_unlock_intent(&n2->lock);

                        memmove(new_nodes + i - 1,
                                new_nodes + i,
                                sizeof(new_nodes[0]) * (nr_new_nodes - i));
                        new_nodes[--nr_new_nodes] = NULL;
                } else if (u64s) {
                        /* move part of n2 into n1 */
                        n1->key.k.p = n1->data->max_key =
                                bkey_unpack_pos(n1, last);

                        n2->data->min_key =
                                btree_type_successor(iter->btree_id,
                                                     n1->data->max_key);

                        memcpy_u64s(vstruct_last(s1),
                                    s2->start, u64s);
                        le16_add_cpu(&s1->u64s, u64s);

                        memmove(s2->start,
                                vstruct_idx(s2, u64s),
                                (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
                        s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);

                        set_btree_bset_end(n1, n1->set);
                        set_btree_bset_end(n2, n2->set);
                }
        }

        for (i = 0; i < nr_new_nodes; i++) {
                struct btree *n = new_nodes[i];

                recalc_packed_keys(n);
                btree_node_reset_sib_u64s(n);

                bch2_btree_build_aux_trees(n);
                six_unlock_write(&n->lock);

                bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);
        }

        /*
         * The keys for the old nodes get deleted. We don't want to insert keys
         * that compare equal to the keys for the new nodes we'll also be
         * inserting - we can't because keys on a keylist must be strictly
         * greater than the previous keys, and we also don't need to since the
         * key for the new node will serve the same purpose (overwriting the key
         * for the old node).
         */
        for (i = 0; i < nr_old_nodes; i++) {
                struct bkey_i delete;
                unsigned j;

                for (j = 0; j < nr_new_nodes; j++)
                        if (!bkey_cmp(old_nodes[i]->key.k.p,
                                      new_nodes[j]->key.k.p))
                                goto next;

                bkey_init(&delete.k);
                delete.k.p = old_nodes[i]->key.k.p;
                bch2_keylist_add_in_order(&keylist, &delete);
next:
                i = i;
        }

        /*
         * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
         * does the lookup once and thus expects the keys to be in sorted order
         * so we have to make sure the new keys are correctly ordered with
         * respect to the deleted keys added in the previous loop
         */
        for (i = 0; i < nr_new_nodes; i++)
                bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);

        /* Insert the newly coalesced nodes */
        bch2_btree_insert_node(as, parent, iter, &keylist);

        BUG_ON(!bch2_keylist_empty(&keylist));

        BUG_ON(iter->nodes[old_nodes[0]->level] != old_nodes[0]);

        BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));

        for (i = 0; i < nr_new_nodes; i++)
                bch2_btree_open_bucket_put(c, new_nodes[i]);

        /* Free the old nodes and update our sliding window */
        for (i = 0; i < nr_old_nodes; i++) {
                bch2_btree_node_free_inmem(c, old_nodes[i], iter);
                six_unlock_intent(&old_nodes[i]->lock);

                /*
                 * the index update might have triggered a split, in which case
                 * the nodes we coalesced - the new nodes we just created -
                 * might not be sibling nodes anymore - don't add them to the
                 * sliding window (except the first):
                 */
                if (!i) {
                        old_nodes[i] = new_nodes[i];
                } else {
                        old_nodes[i] = NULL;
                        if (new_nodes[i])
                                six_unlock_intent(&new_nodes[i]->lock);
                }
        }

        bch2_btree_update_done(as);
        bch2_keylist_free(&keylist, NULL);
}

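/*
 * Walk one btree keeping a sliding window of the last GC_MERGE_NODES nodes
 * visited, trying to coalesce the nodes in the window at each step.
 */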
static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
{
        struct btree_iter iter;
        struct btree *b;
        unsigned i;

        /* Sliding window of adjacent btree nodes */
        struct btree *merge[GC_MERGE_NODES];
        u32 lock_seq[GC_MERGE_NODES];

        /*
         * XXX: We don't have a good way of positively matching on sibling nodes
         * that have the same parent - this code works by handling the cases
         * where they might not have the same parent, and is thus fragile. Ugh.
         *
         * Perhaps redo this to use multiple linked iterators?
         */
        memset(merge, 0, sizeof(merge));

        __for_each_btree_node(&iter, c, btree_id, POS_MIN,
                              BTREE_MAX_DEPTH, 0,
                              BTREE_ITER_PREFETCH, b) {
                memmove(merge + 1, merge,
                        sizeof(merge) - sizeof(merge[0]));
                memmove(lock_seq + 1, lock_seq,
                        sizeof(lock_seq) - sizeof(lock_seq[0]));

                merge[0] = b;

                for (i = 1; i < GC_MERGE_NODES; i++) {
                        if (!merge[i] ||
                            !six_relock_intent(&merge[i]->lock, lock_seq[i]))
                                break;

                        if (merge[i]->level != merge[0]->level) {
                                six_unlock_intent(&merge[i]->lock);
                                break;
                        }
                }
                memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));

                bch2_coalesce_nodes(c, &iter, merge);

                for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
                        lock_seq[i] = merge[i]->lock.state.seq;
                        six_unlock_intent(&merge[i]->lock);
                }

                lock_seq[0] = merge[0]->lock.state.seq;

                if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
                        bch2_btree_iter_unlock(&iter);
                        return -ESHUTDOWN;
                }

                bch2_btree_iter_cond_resched(&iter);

                /*
                 * If the parent node wasn't relocked, it might have been split
                 * and the nodes in our sliding window might not have the same
                 * parent anymore - blow away the sliding window:
                 */
                if (iter.nodes[iter.level + 1] &&
                    !btree_node_intent_locked(&iter, iter.level + 1))
                        memset(merge + 1, 0,
                               (GC_MERGE_NODES - 1) * sizeof(merge[0]));
        }
        return bch2_btree_iter_unlock(&iter);
}

/**
 * bch2_coalesce - coalesce adjacent nodes with low occupancy
 */
void bch2_coalesce(struct bch_fs *c)
{
        enum btree_id id;

        if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
                return;

        down_read(&c->gc_lock);
        trace_gc_coalesce_start(c);

        for (id = 0; id < BTREE_ID_NR; id++) {
                int ret = c->btree_roots[id].b
                        ? bch2_coalesce_btree(c, id)
                        : 0;

                if (ret) {
                        if (ret != -ESHUTDOWN)
                                bch_err(c, "btree coalescing failed: %d", ret);
                        set_bit(BCH_FS_GC_FAILURE, &c->flags);
                        up_read(&c->gc_lock);
                        return;
                }
        }

        trace_gc_coalesce_end(c);
        up_read(&c->gc_lock);
}

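/*
 * Background GC thread: runs bch2_gc() whenever it's kicked via kick_gc, or,
 * if btree_gc_periodic is set, after roughly c->capacity / 16 sectors' worth
 * of writes (measured on the write I/O clock).
 */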
static int bch2_gc_thread(void *arg)
{
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
        unsigned long last = atomic_long_read(&clock->now);
        unsigned last_kick = atomic_read(&c->kick_gc);

        set_freezable();

        while (1) {
                while (1) {
                        set_current_state(TASK_INTERRUPTIBLE);

                        if (kthread_should_stop()) {
                                __set_current_state(TASK_RUNNING);
                                return 0;
                        }

                        if (atomic_read(&c->kick_gc) != last_kick)
                                break;

                        if (c->btree_gc_periodic) {
                                unsigned long next = last + c->capacity / 16;

                                if (atomic_long_read(&clock->now) >= next)
                                        break;

                                bch2_io_clock_schedule_timeout(clock, next);
                        } else {
                                schedule();
                        }

                        try_to_freeze();
                }
                __set_current_state(TASK_RUNNING);

                last = atomic_long_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);

                bch2_gc(c);

                debug_check_no_locks_held();
        }

        return 0;
}

void bch2_gc_thread_stop(struct bch_fs *c)
{
        set_bit(BCH_FS_GC_STOPPING, &c->flags);

        if (c->gc_thread)
                kthread_stop(c->gc_thread);

        c->gc_thread = NULL;
        clear_bit(BCH_FS_GC_STOPPING, &c->flags);
}

int bch2_gc_thread_start(struct bch_fs *c)
{
        struct task_struct *p;

        BUG_ON(c->gc_thread);

        p = kthread_create(bch2_gc_thread, c, "bcache_gc");
        if (IS_ERR(p))
                return PTR_ERR(p);

        c->gc_thread = p;
        wake_up_process(c->gc_thread);
        return 0;
}

/* Initial GC computes bucket marks during startup */

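/*
 * Mark the root key, then walk every node of the btree (which is also
 * required for the journal seq blacklist machinery), marking each key with
 * bch2_btree_mark_key_initial().
 */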
static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
{
        struct btree_iter iter;
        struct btree *b;
        struct range_checks r;
        int ret = 0;

        btree_node_range_checks_init(&r, 0);

        if (!c->btree_roots[id].b)
                return 0;

        ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
                           bkey_i_to_s_c(&c->btree_roots[id].b->key));
        if (ret)
                return ret;

        /*
         * We have to hit every btree node before starting journal replay, in
         * order for the journal seq blacklist machinery to work:
         */
        for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
                btree_node_range_checks(c, b, &r);

                if (btree_node_has_ptrs(b)) {
                        struct btree_node_iter node_iter;
                        struct bkey unpacked;
                        struct bkey_s_c k;

                        for_each_btree_node_key_unpack(b, k, &node_iter,
                                                       btree_node_is_extents(b),
                                                       &unpacked) {
                                ret = bch2_btree_mark_key_initial(c,
                                                        btree_node_type(b), k);
                                if (ret)
                                        goto err;
                        }
                }

                bch2_btree_iter_cond_resched(&iter);
        }
err:
        return bch2_btree_iter_unlock(&iter) ?: ret;
}

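/*
 * Run initial mark and sweep over every btree plus the keys still sitting in
 * the journal, restarting if any bucket gens had to be repaired, then mark
 * the superblocks and flag initial GC as done.
 */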
int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
{
        unsigned iter = 0;
        enum btree_id id;
        int ret;

        mutex_lock(&c->sb_lock);
        if (!bch2_sb_get_replicas(c->disk_sb)) {
                if (BCH_SB_INITIALIZED(c->disk_sb))
                        bch_info(c, "building replicas info");
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }
        mutex_unlock(&c->sb_lock);
again:
        bch2_gc_start(c);

        for (id = 0; id < BTREE_ID_NR; id++) {
                ret = bch2_initial_gc_btree(c, id);
                if (ret)
                        return ret;
        }

        ret = bch2_journal_mark(c, journal);
        if (ret)
                return ret;

        if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
                if (iter++ > 2) {
                        bch_info(c, "Unable to fix bucket gens, looping");
                        return -EINVAL;
                }

                bch_info(c, "Fixed gens, restarting initial mark and sweep:");
                clear_bit(BCH_FS_FIXED_GENS, &c->flags);
                goto again;
        }

        /*
         * Skip past versions that might have possibly been used (as nonces),
         * but hadn't had their pointers written:
         */
        if (c->sb.encryption_type)
                atomic64_add(1 << 16, &c->key_version);

        bch2_mark_superblocks(c);

        gc_pos_set(c, gc_phase(GC_PHASE_DONE));
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);

        return 0;
}