1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  * Copyright (C) 2014 Datera Inc.
4  */
5
6 #include "bcachefs.h"
7 #include "alloc.h"
8 #include "bkey_methods.h"
9 #include "btree_locking.h"
10 #include "btree_update_interior.h"
11 #include "btree_io.h"
12 #include "btree_gc.h"
13 #include "buckets.h"
14 #include "clock.h"
15 #include "debug.h"
16 #include "error.h"
17 #include "extents.h"
18 #include "journal.h"
19 #include "keylist.h"
20 #include "move.h"
21 #include "super-io.h"
22
23 #include <linux/slab.h>
24 #include <linux/bitops.h>
25 #include <linux/freezer.h>
26 #include <linux/kthread.h>
27 #include <linux/preempt.h>
28 #include <linux/rcupdate.h>
29 #include <trace/events/bcachefs.h>
30
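/*
 * Expected key ranges at each level of the btree, maintained as we walk it in
 * order: btree_node_range_checks() uses this to verify that sibling nodes
 * cover contiguous key ranges and that interior node min/max keys match those
 * of their children.
 */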
31 struct range_checks {
32         struct range_level {
33                 struct bpos     min;
34                 struct bpos     max;
35         }                       l[BTREE_MAX_DEPTH];
36         unsigned                depth;
37 };
38
39 static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
40 {
41         unsigned i;
42
43         for (i = 0; i < BTREE_MAX_DEPTH; i++)
44                 r->l[i].min = r->l[i].max = POS_MIN;
45         r->depth = depth;
46 }
47
48 static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
49                                     struct range_checks *r)
50 {
51         struct range_level *l = &r->l[b->level];
52
53         struct bpos expected_min = bkey_cmp(l->min, l->max)
54                 ? btree_type_successor(b->btree_id, l->max)
55                 : l->max;
56
57         bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
58                 "btree node has incorrect min key: %llu:%llu != %llu:%llu",
59                 b->data->min_key.inode,
60                 b->data->min_key.offset,
61                 expected_min.inode,
62                 expected_min.offset);
63
64         l->max = b->data->max_key;
65
66         if (b->level > r->depth) {
67                 l = &r->l[b->level - 1];
68
69                 bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
70                         "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
71                         b->data->min_key.inode,
72                         b->data->min_key.offset,
73                         l->min.inode,
74                         l->min.offset);
75
76                 bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
77                         "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
78                         b->data->max_key.inode,
79                         b->data->max_key.offset,
80                         l->max.inode,
81                         l->max.offset);
82
83                 if (bkey_cmp(b->data->max_key, POS_MAX))
84                         l->min = l->max =
85                                 btree_type_successor(b->btree_id,
86                                                      b->data->max_key);
87         }
88 }
89
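/*
 * For each pointer in @k, pull the owning device's oldest_gens[] for that
 * bucket back to the pointer's gen; returns the staleness of the most stale
 * pointer, which callers use to decide whether a node is worth rewriting:
 */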
90 u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
91 {
92         const struct bch_extent_ptr *ptr;
93         u8 max_stale = 0;
94
95         if (bkey_extent_is_data(k.k)) {
96                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
97
98                 extent_for_each_ptr(e, ptr) {
99                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
100                         size_t b = PTR_BUCKET_NR(ca, ptr);
101
102                         if (gen_after(ca->oldest_gens[b], ptr->gen))
103                                 ca->oldest_gens[b] = ptr->gen;
104
105                         max_stale = max(max_stale, ptr_stale(ca, ptr));
106                 }
107         }
108
109         return max_stale;
110 }
111
112 /*
113  * For runtime mark and sweep:
114  */
115 static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
116                            struct bkey_s_c k, unsigned flags)
117 {
118         struct gc_pos pos = { 0 };
119         struct bch_fs_usage *stats;
120         u8 ret = 0;
121
122         preempt_disable();
123         stats = this_cpu_ptr(c->usage_percpu);
124         switch (type) {
125         case BKEY_TYPE_BTREE:
126                 bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
127                               0, flags|
128                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
129                               BCH_BUCKET_MARK_GC_LOCK_HELD);
130                 break;
131         case BKEY_TYPE_EXTENTS:
132                 bch2_mark_key(c, k, k.k->size, false, pos, stats,
133                               0, flags|
134                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
135                               BCH_BUCKET_MARK_GC_LOCK_HELD);
136                 ret = bch2_btree_key_recalc_oldest_gen(c, k);
137                 break;
138         default:
139                 BUG();
140         }
141         preempt_enable();
142
143         return ret;
144 }
145
146 int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
147                                 struct bkey_s_c k)
148 {
149         enum bch_data_type data_type = type == BKEY_TYPE_BTREE
150                 ? BCH_DATA_BTREE : BCH_DATA_USER;
151         int ret = 0;
152
153         if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
154             fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
155                         "superblock not marked as containing replicas (type %u)",
156                         data_type)) {
157                 ret = bch2_mark_bkey_replicas(c, data_type, k);
158                 if (ret)
159                         return ret;
160         }
161
162         switch (k.k->type) {
163         case BCH_EXTENT:
164         case BCH_EXTENT_CACHED: {
165                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
166                 const struct bch_extent_ptr *ptr;
167
168                 extent_for_each_ptr(e, ptr) {
169                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
170                         size_t b = PTR_BUCKET_NR(ca, ptr);
171                         struct bucket *g = PTR_BUCKET(ca, ptr);
172
173                         if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
174                                         "found ptr with missing gen in alloc btree,\n"
175                                         "type %s gen %u",
176                                         bch2_data_types[data_type],
177                                         ptr->gen)) {
178                                 g->_mark.gen = ptr->gen;
179                                 g->_mark.gen_valid = 1;
180                                 set_bit(b, ca->buckets_dirty);
181                         }
182
183                         if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
184                                         "%s ptr gen in the future: %u > %u",
185                                         bch2_data_types[data_type],
186                                         ptr->gen, g->mark.gen)) {
187                                 g->_mark.gen = ptr->gen;
188                                 g->_mark.gen_valid = 1;
189                                 set_bit(b, ca->buckets_dirty);
190                                 set_bit(BCH_FS_FIXED_GENS, &c->flags);
191                         }
192
193                 }
194                 break;
195         }
196         }
197
198         atomic64_set(&c->key_version,
199                      max_t(u64, k.k->version.lo,
200                            atomic64_read(&c->key_version)));
201
202         bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
203 fsck_err:
204         return ret;
205 }
206
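/*
 * Mark every key in @b, returning the maximum pointer staleness seen so the
 * caller can decide whether to rewrite the node:
 */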
207 static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
208 {
209         enum bkey_type type = btree_node_type(b);
210         struct btree_node_iter iter;
211         struct bkey unpacked;
212         struct bkey_s_c k;
213         u8 stale = 0;
214
215         if (btree_node_has_ptrs(b))
216                 for_each_btree_node_key_unpack(b, k, &iter,
217                                                btree_node_is_extents(b),
218                                                &unpacked) {
219                         bch2_bkey_debugcheck(c, b, k);
220                         stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
221                 }
222
223         return stale;
224 }
225
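/*
 * c->gc_pos tracks how far mark and sweep has progressed; it's updated under a
 * seqcount so the buckets code can reliably tell whether GC has visited a
 * given position yet (see gc_will_visit_node(), gc_will_visit_root()):
 */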
226 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
227 {
228         write_seqcount_begin(&c->gc_pos_lock);
229         c->gc_pos = new_pos;
230         write_seqcount_end(&c->gc_pos_lock);
231 }
232
233 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
234 {
235         BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
236         __gc_pos_set(c, new_pos);
237 }
238
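/*
 * Walk one btree in order, marking every key and advancing gc_pos as we go;
 * nodes with too many stale pointers get rewritten, and the root's own key is
 * marked last:
 */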
239 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
240 {
241         struct btree_iter iter;
242         struct btree *b;
243         struct range_checks r;
244         unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
245         unsigned max_stale;
246         int ret = 0;
247
248         /*
249          * if expensive_debug_checks is on, run range_checks on all leaf nodes:
250          */
251         if (expensive_debug_checks(c))
252                 depth = 0;
253
254         btree_node_range_checks_init(&r, depth);
255
256         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
257                               0, depth, BTREE_ITER_PREFETCH, b) {
258                 btree_node_range_checks(c, b, &r);
259
260                 bch2_verify_btree_nr_keys(b);
261
262                 max_stale = btree_gc_mark_node(c, b);
263
264                 gc_pos_set(c, gc_pos_btree_node(b));
265
266                 if (max_stale > 32)
267                         bch2_btree_node_rewrite(c, &iter,
268                                         b->data->keys.seq,
269                                         BTREE_INSERT_USE_RESERVE|
270                                         BTREE_INSERT_GC_LOCK_HELD);
271                 else if (!btree_gc_rewrite_disabled(c) &&
272                          (btree_gc_always_rewrite(c) || max_stale > 16))
273                         bch2_btree_node_rewrite(c, &iter,
274                                         b->data->keys.seq,
275                                         BTREE_INSERT_NOWAIT|
276                                         BTREE_INSERT_GC_LOCK_HELD);
277
278                 bch2_btree_iter_cond_resched(&iter);
279         }
280         ret = bch2_btree_iter_unlock(&iter);
281         if (ret)
282                 return ret;
283
284         mutex_lock(&c->btree_root_lock);
285
286         b = c->btree_roots[btree_id].b;
287         if (!btree_node_fake(b))
288                 bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
289         gc_pos_set(c, gc_pos_btree_root(b->btree_id));
290
291         mutex_unlock(&c->btree_root_lock);
292         return 0;
293 }
294
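/* Mark a range of sectors on @ca as @type, one bucket at a time: */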
295 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
296                                   u64 start, u64 end,
297                                   enum bch_data_type type,
298                                   unsigned flags)
299 {
300         u64 b = sector_to_bucket(ca, start);
301
302         do {
303                 unsigned sectors =
304                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
305
306                 bch2_mark_metadata_bucket(c, ca, b, type, sectors,
307                                           gc_phase(GC_PHASE_SB), flags);
308                 b++;
309                 start += sectors;
310         } while (start < end);
311 }
312
313 void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
314                               unsigned flags)
315 {
316         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
317         unsigned i;
318         u64 b;
319
320         lockdep_assert_held(&c->sb_lock);
321
322         for (i = 0; i < layout->nr_superblocks; i++) {
323                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
324
325                 if (offset == BCH_SB_SECTOR)
326                         mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
327                                               BCH_DATA_SB, flags);
328
329                 mark_metadata_sectors(c, ca, offset,
330                                       offset + (1 << layout->sb_max_size_bits),
331                                       BCH_DATA_SB, flags);
332         }
333
334         spin_lock(&c->journal.lock);
335
336         for (i = 0; i < ca->journal.nr; i++) {
337                 b = ca->journal.buckets[i];
338                 bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
339                                           ca->mi.bucket_size,
340                                           gc_phase(GC_PHASE_SB), flags);
341         }
342
343         spin_unlock(&c->journal.lock);
344 }
345
346 static void bch2_mark_superblocks(struct bch_fs *c)
347 {
348         struct bch_dev *ca;
349         unsigned i;
350
351         mutex_lock(&c->sb_lock);
352         gc_pos_set(c, gc_phase(GC_PHASE_SB));
353
354         for_each_online_member(ca, c, i)
355                 bch2_mark_dev_superblock(c, ca,
356                                          BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
357                                          BCH_BUCKET_MARK_GC_LOCK_HELD);
358         mutex_unlock(&c->sb_lock);
359 }
360
361 /* Also see bch2_pending_btree_node_free_insert_done() */
362 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
363 {
364         struct gc_pos pos = { 0 };
365         struct bch_fs_usage stats = { 0 };
366         struct btree_update *as;
367         struct pending_btree_node_free *d;
368
369         mutex_lock(&c->btree_interior_update_lock);
370         gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
371
372         for_each_pending_btree_node_free(c, as, d)
373                 if (d->index_update_done)
374                         bch2_mark_key(c, bkey_i_to_s_c(&d->key),
375                                       c->opts.btree_node_size, true, pos,
376                                       &stats, 0,
377                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
378                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
379         /*
380          * Don't apply stats - pending deletes aren't tracked in
381          * bch_fs_usage:
382          */
383
384         mutex_unlock(&c->btree_interior_update_lock);
385 }
386
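/*
 * Buckets owned by the allocator - those on the free/free_inc freelists and in
 * open_buckets - aren't referenced by any btree keys, so mark them directly:
 */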
387 static void bch2_mark_allocator_buckets(struct bch_fs *c)
388 {
389         struct bch_dev *ca;
390         struct open_bucket *ob;
391         size_t i, j, iter;
392         unsigned ci;
393
394         spin_lock(&c->freelist_lock);
395         gc_pos_set(c, gc_pos_alloc(c, NULL));
396
397         for_each_member_device(ca, c, ci) {
398                 fifo_for_each_entry(i, &ca->free_inc, iter)
399                         bch2_mark_alloc_bucket(c, ca, i, true,
400                                                gc_pos_alloc(c, NULL),
401                                                BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
402                                                BCH_BUCKET_MARK_GC_LOCK_HELD);
403
406                 for (j = 0; j < RESERVE_NR; j++)
407                         fifo_for_each_entry(i, &ca->free[j], iter)
408                                 bch2_mark_alloc_bucket(c, ca, i, true,
409                                                        gc_pos_alloc(c, NULL),
410                                                        BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
411                                                        BCH_BUCKET_MARK_GC_LOCK_HELD);
412         }
413
414         spin_unlock(&c->freelist_lock);
415
416         for (ob = c->open_buckets;
417              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
418              ob++) {
419                 spin_lock(&ob->lock);
420                 if (ob->valid) {
421                         gc_pos_set(c, gc_pos_alloc(c, ob));
422                         ca = bch_dev_bkey_exists(c, ob->ptr.dev);
423                         bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
424                                                gc_pos_alloc(c, ob),
425                                                BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
426                                                BCH_BUCKET_MARK_GC_LOCK_HELD);
427                 }
428                 spin_unlock(&ob->lock);
429         }
430 }
431
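/*
 * Save the current usage counters and clear all bucket marks (but not bucket
 * gens), so that the mark and sweep pass can recompute them from scratch:
 */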
432 static void bch2_gc_start(struct bch_fs *c)
433 {
434         struct bch_dev *ca;
435         struct bucket_array *buckets;
436         struct bucket_mark new;
437         unsigned i;
438         size_t b;
439         int cpu;
440
441         lg_global_lock(&c->usage_lock);
442
443         /*
444          * Indicates to buckets code that gc is now in progress - done under
445          * usage_lock to avoid racing with bch2_mark_key():
446          */
447         __gc_pos_set(c, GC_POS_MIN);
448
449         /* Save a copy of the existing bucket stats while we recompute them: */
450         for_each_member_device(ca, c, i) {
451                 ca->usage_cached = __bch2_dev_usage_read(ca);
452                 for_each_possible_cpu(cpu) {
453                         struct bch_dev_usage *p =
454                                 per_cpu_ptr(ca->usage_percpu, cpu);
455                         memset(p, 0, sizeof(*p));
456                 }
457         }
458
459         c->usage_cached = __bch2_fs_usage_read(c);
460         for_each_possible_cpu(cpu) {
461                 struct bch_fs_usage *p =
462                         per_cpu_ptr(c->usage_percpu, cpu);
463
464                 memset(p->s, 0, sizeof(p->s));
465         }
466
467         lg_global_unlock(&c->usage_lock);
468
469         /* Clear bucket marks: */
470         for_each_member_device(ca, c, i) {
471                 down_read(&ca->bucket_lock);
472                 buckets = bucket_array(ca);
473
474                 for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
475                         bucket_cmpxchg(buckets->b + b, new, ({
476                                 new.owned_by_allocator  = 0;
477                                 new.data_type           = 0;
478                                 new.cached_sectors      = 0;
479                                 new.dirty_sectors       = 0;
480                         }));
481                         ca->oldest_gens[b] = new.gen;
482                 }
483                 up_read(&ca->bucket_lock);
484         }
485 }
486
487 /**
488  * bch2_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
489  */
490 void bch2_gc(struct bch_fs *c)
491 {
492         struct bch_dev *ca;
493         u64 start_time = local_clock();
494         unsigned i;
495
496         /*
497          * Walk _all_ references to buckets, and recompute them:
498          *
499          * Order matters here:
500          *  - Concurrent GC relies on the fact that we have a total ordering for
501          *    everything that GC walks - see  gc_will_visit_node(),
502          *    gc_will_visit_root()
503          *
504          *  - also, references move around in the course of index updates and
505          *    various other crap: everything needs to agree on the ordering
506          *    references are allowed to move around in - e.g., we're allowed to
507          *    start with a reference owned by an open_bucket (the allocator) and
508          *    move it to the btree, but not the reverse.
509          *
510          *    This is necessary to ensure that gc doesn't miss references that
511          *    move around - if references move backwards in the ordering GC
512          *    uses, GC could skip past them
513          */
514         trace_gc_start(c);
515
516         /*
517          * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
518          * gc_lock if sectors_available goes to 0:
519          */
520         bch2_recalc_sectors_available(c);
521
522         down_write(&c->gc_lock);
523         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
524                 goto out;
525
526         bch2_gc_start(c);
527
528         /* Walk btree: */
529         while (c->gc_pos.phase < (int) BTREE_ID_NR) {
530                 int ret = c->btree_roots[c->gc_pos.phase].b
531                         ? bch2_gc_btree(c, (int) c->gc_pos.phase)
532                         : 0;
533
534                 if (ret) {
535                         bch_err(c, "btree gc failed: %d", ret);
536                         set_bit(BCH_FS_GC_FAILURE, &c->flags);
537                         goto out;
538                 }
539
540                 gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
541         }
542
543         bch2_mark_superblocks(c);
544         bch2_mark_pending_btree_node_frees(c);
545         bch2_mark_allocator_buckets(c);
546
547         for_each_member_device(ca, c, i)
548                 atomic_long_set(&ca->saturated_count, 0);
549
550         /* Indicates that gc is no longer in progress: */
551         gc_pos_set(c, gc_phase(GC_PHASE_DONE));
552         c->gc_count++;
553 out:
554         up_write(&c->gc_lock);
555         trace_gc_end(c);
556         bch2_time_stats_update(&c->btree_gc_time, start_time);
557
558         /*
559          * Wake up the allocator in case it was waiting for buckets
560          * because it couldn't increment bucket gens:
561          */
562         for_each_member_device(ca, c, i)
563                 bch2_wake_allocator(ca);
564
565         /*
566          * At startup, allocations can happen directly instead of via the
567          * allocator thread - issue wakeup in case they blocked on gc_lock:
568          */
569         closure_wake_up(&c->freelist_wait);
570 }
571
572 /* Btree coalescing */
573
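/* Recompute b->nr from scratch after coalescing has moved keys around: */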
574 static void recalc_packed_keys(struct btree *b)
575 {
576         struct bkey_packed *k;
577
578         memset(&b->nr, 0, sizeof(b->nr));
579
580         BUG_ON(b->nsets != 1);
581
582         for (k =  btree_bkey_first(b, b->set);
583              k != btree_bkey_last(b, b->set);
584              k = bkey_next(k))
585                 btree_keys_account_key_add(&b->nr, 0, k);
586 }
587
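/*
 * Given a window of adjacent sibling nodes, try to repack their keys into one
 * fewer node: bail out if the keys wouldn't fit, if repacking with a common
 * format would make a node too big, or if a reserve can't be allocated;
 * otherwise shift keys between the replacement nodes and update the parent:
 */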
588 static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
589                                 struct btree *old_nodes[GC_MERGE_NODES])
590 {
591         struct btree *parent = btree_node_parent(iter, old_nodes[0]);
592         unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
593         unsigned blocks = btree_blocks(c) * 2 / 3;
594         struct btree *new_nodes[GC_MERGE_NODES];
595         struct btree_update *as;
596         struct keylist keylist;
597         struct bkey_format_state format_state;
598         struct bkey_format new_format;
599
600         memset(new_nodes, 0, sizeof(new_nodes));
601         bch2_keylist_init(&keylist, NULL);
602
603         /* Count keys that are not deleted */
604         for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
605                 u64s += old_nodes[i]->nr.live_u64s;
606
607         nr_old_nodes = nr_new_nodes = i;
608
609         /* Check if all keys in @old_nodes could fit in one fewer node */
610         if (nr_old_nodes <= 1 ||
611             __vstruct_blocks(struct btree_node, c->block_bits,
612                              DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
613                 return;
614
615         /* Find a format that all keys in @old_nodes can pack into */
616         bch2_bkey_format_init(&format_state);
617
618         for (i = 0; i < nr_old_nodes; i++)
619                 __bch2_btree_calc_format(&format_state, old_nodes[i]);
620
621         new_format = bch2_bkey_format_done(&format_state);
622
623         /* Check if repacking would make any nodes too big to fit */
624         for (i = 0; i < nr_old_nodes; i++)
625                 if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
626                         trace_btree_gc_coalesce_fail(c,
627                                         BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
628                         return;
629                 }
630
631         if (bch2_keylist_realloc(&keylist, NULL, 0,
632                         (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
633                 trace_btree_gc_coalesce_fail(c,
634                                 BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
635                 return;
636         }
637
638         as = bch2_btree_update_start(c, iter->btree_id,
639                         btree_update_reserve_required(c, parent) + nr_old_nodes,
640                         BTREE_INSERT_NOFAIL|
641                         BTREE_INSERT_USE_RESERVE,
642                         NULL);
643         if (IS_ERR(as)) {
644                 trace_btree_gc_coalesce_fail(c,
645                                 BTREE_GC_COALESCE_FAIL_RESERVE_GET);
646                 bch2_keylist_free(&keylist, NULL);
647                 return;
648         }
649
650         trace_btree_gc_coalesce(c, old_nodes[0]);
651
652         for (i = 0; i < nr_old_nodes; i++)
653                 bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
654
655         /* Repack everything with @new_format and sort down to one bset */
656         for (i = 0; i < nr_old_nodes; i++)
657                 new_nodes[i] =
658                         __bch2_btree_node_alloc_replacement(as, old_nodes[i],
659                                                             new_format);
660
661         /*
662          * Conceptually we concatenate the nodes together and slice them
663          * up at different boundaries.
664          */
665         for (i = nr_new_nodes - 1; i > 0; --i) {
666                 struct btree *n1 = new_nodes[i];
667                 struct btree *n2 = new_nodes[i - 1];
668
669                 struct bset *s1 = btree_bset_first(n1);
670                 struct bset *s2 = btree_bset_first(n2);
671                 struct bkey_packed *k, *last = NULL;
672
673                 /* Calculate how many keys from @n2 we could fit inside @n1 */
674                 u64s = 0;
675
676                 for (k = s2->start;
677                      k < vstruct_last(s2) &&
678                      vstruct_blocks_plus(n1->data, c->block_bits,
679                                          u64s + k->u64s) <= blocks;
680                      k = bkey_next(k)) {
681                         last = k;
682                         u64s += k->u64s;
683                 }
684
685                 if (u64s == le16_to_cpu(s2->u64s)) {
686                         /* n2 fits entirely in n1 */
687                         n1->key.k.p = n1->data->max_key = n2->data->max_key;
688
689                         memcpy_u64s(vstruct_last(s1),
690                                     s2->start,
691                                     le16_to_cpu(s2->u64s));
692                         le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
693
694                         set_btree_bset_end(n1, n1->set);
695
696                         six_unlock_write(&n2->lock);
697                         bch2_btree_node_free_never_inserted(c, n2);
698                         six_unlock_intent(&n2->lock);
699
700                         memmove(new_nodes + i - 1,
701                                 new_nodes + i,
702                                 sizeof(new_nodes[0]) * (nr_new_nodes - i));
703                         new_nodes[--nr_new_nodes] = NULL;
704                 } else if (u64s) {
705                         /* move part of n2 into n1 */
706                         n1->key.k.p = n1->data->max_key =
707                                 bkey_unpack_pos(n1, last);
708
709                         n2->data->min_key =
710                                 btree_type_successor(iter->btree_id,
711                                                      n1->data->max_key);
712
713                         memcpy_u64s(vstruct_last(s1),
714                                     s2->start, u64s);
715                         le16_add_cpu(&s1->u64s, u64s);
716
717                         memmove(s2->start,
718                                 vstruct_idx(s2, u64s),
719                                 (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
720                         s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
721
722                         set_btree_bset_end(n1, n1->set);
723                         set_btree_bset_end(n2, n2->set);
724                 }
725         }
726
727         for (i = 0; i < nr_new_nodes; i++) {
728                 struct btree *n = new_nodes[i];
729
730                 recalc_packed_keys(n);
731                 btree_node_reset_sib_u64s(n);
732
733                 bch2_btree_build_aux_trees(n);
734                 six_unlock_write(&n->lock);
735
736                 bch2_btree_node_write(c, n, SIX_LOCK_intent);
737         }
738
739         /*
740          * The keys for the old nodes get deleted. We don't want to insert keys
741          * that compare equal to the keys for the new nodes we'll also be
742          * inserting - we can't because keys on a keylist must be strictly
743          * greater than the previous keys, and we also don't need to since the
744          * key for the new node will serve the same purpose (overwriting the key
745          * for the old node).
746          */
747         for (i = 0; i < nr_old_nodes; i++) {
748                 struct bkey_i delete;
749                 unsigned j;
750
751                 for (j = 0; j < nr_new_nodes; j++)
752                         if (!bkey_cmp(old_nodes[i]->key.k.p,
753                                       new_nodes[j]->key.k.p))
754                                 goto next;
755
756                 bkey_init(&delete.k);
757                 delete.k.p = old_nodes[i]->key.k.p;
758                 bch2_keylist_add_in_order(&keylist, &delete);
759 next:
760                 i = i;          /* a label must be followed by a statement */
761         }
762
763         /*
764          * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
765          * does the lookup once and thus expects the keys to be in sorted order
766          * so we have to make sure the new keys are correctly ordered with
767          * respect to the deleted keys added in the previous loop
768          */
769         for (i = 0; i < nr_new_nodes; i++)
770                 bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
771
772         /* Insert the newly coalesced nodes */
773         bch2_btree_insert_node(as, parent, iter, &keylist);
774
775         BUG_ON(!bch2_keylist_empty(&keylist));
776
777         BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
778
779         BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));
780
781         for (i = 0; i < nr_new_nodes; i++)
782                 bch2_btree_open_bucket_put(c, new_nodes[i]);
783
784         /* Free the old nodes and update our sliding window */
785         for (i = 0; i < nr_old_nodes; i++) {
786                 bch2_btree_node_free_inmem(c, old_nodes[i], iter);
787                 six_unlock_intent(&old_nodes[i]->lock);
788
789                 /*
790                  * the index update might have triggered a split, in which case
791                  * the nodes we coalesced - the new nodes we just created -
792                  * might not be sibling nodes anymore - don't add them to the
793                  * sliding window (except the first):
794                  */
795                 if (!i) {
796                         old_nodes[i] = new_nodes[i];
797                 } else {
798                         old_nodes[i] = NULL;
799                         if (new_nodes[i])
800                                 six_unlock_intent(&new_nodes[i]->lock);
801                 }
802         }
803
804         bch2_btree_update_done(as);
805         bch2_keylist_free(&keylist, NULL);
806 }
807
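/*
 * Walk one btree keeping a sliding window of up to GC_MERGE_NODES locked
 * adjacent nodes, attempting to coalesce each window as we go:
 */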
808 static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
809 {
810         struct btree_iter iter;
811         struct btree *b;
812         unsigned i;
813
814         /* Sliding window of adjacent btree nodes */
815         struct btree *merge[GC_MERGE_NODES];
816         u32 lock_seq[GC_MERGE_NODES];
817
818         /*
819          * XXX: We don't have a good way of positively matching on sibling nodes
820          * that have the same parent - this code works by handling the cases
821          * where they might not have the same parent, and is thus fragile. Ugh.
822          *
823          * Perhaps redo this to use multiple linked iterators?
824          */
825         memset(merge, 0, sizeof(merge));
826
827         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
828                               BTREE_MAX_DEPTH, 0,
829                               BTREE_ITER_PREFETCH, b) {
830                 memmove(merge + 1, merge,
831                         sizeof(merge) - sizeof(merge[0]));
832                 memmove(lock_seq + 1, lock_seq,
833                         sizeof(lock_seq) - sizeof(lock_seq[0]));
834
835                 merge[0] = b;
836
837                 for (i = 1; i < GC_MERGE_NODES; i++) {
838                         if (!merge[i] ||
839                             !six_relock_intent(&merge[i]->lock, lock_seq[i]))
840                                 break;
841
842                         if (merge[i]->level != merge[0]->level) {
843                                 six_unlock_intent(&merge[i]->lock);
844                                 break;
845                         }
846                 }
847                 memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
848
849                 bch2_coalesce_nodes(c, &iter, merge);
850
851                 for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
852                         lock_seq[i] = merge[i]->lock.state.seq;
853                         six_unlock_intent(&merge[i]->lock);
854                 }
855
856                 lock_seq[0] = merge[0]->lock.state.seq;
857
858                 if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
859                         bch2_btree_iter_unlock(&iter);
860                         return -ESHUTDOWN;
861                 }
862
863                 bch2_btree_iter_cond_resched(&iter);
864
865                 /*
866                  * If the parent node wasn't relocked, it might have been split
867                  * and the nodes in our sliding window might not have the same
868                  * parent anymore - blow away the sliding window:
869                  */
870                 if (btree_iter_node(&iter, iter.level + 1) &&
871                     !btree_node_intent_locked(&iter, iter.level + 1))
872                         memset(merge + 1, 0,
873                                (GC_MERGE_NODES - 1) * sizeof(merge[0]));
874         }
875         return bch2_btree_iter_unlock(&iter);
876 }
877
878 /**
879  * bch2_coalesce - coalesce adjacent nodes with low occupancy
880  */
881 void bch2_coalesce(struct bch_fs *c)
882 {
883         enum btree_id id;
884
885         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
886                 return;
887
888         down_read(&c->gc_lock);
889         trace_gc_coalesce_start(c);
890
891         for (id = 0; id < BTREE_ID_NR; id++) {
892                 int ret = c->btree_roots[id].b
893                         ? bch2_coalesce_btree(c, id)
894                         : 0;
895
896                 if (ret) {
897                         if (ret != -ESHUTDOWN)
898                                 bch_err(c, "btree coalescing failed: %d", ret);
899                         set_bit(BCH_FS_GC_FAILURE, &c->flags);
900                         return;
901                 }
902         }
903
904         trace_gc_coalesce_end(c);
905         up_read(&c->gc_lock);
906 }
907
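/*
 * Background GC thread: runs bch2_gc() when kicked via c->kick_gc, or - if
 * btree_gc_periodic is set - once the write io_clock has advanced by
 * c->capacity / 16 since the last run:
 */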
908 static int bch2_gc_thread(void *arg)
909 {
910         struct bch_fs *c = arg;
911         struct io_clock *clock = &c->io_clock[WRITE];
912         unsigned long last = atomic_long_read(&clock->now);
913         unsigned last_kick = atomic_read(&c->kick_gc);
914
915         set_freezable();
916
917         while (1) {
918                 while (1) {
919                         set_current_state(TASK_INTERRUPTIBLE);
920
921                         if (kthread_should_stop()) {
922                                 __set_current_state(TASK_RUNNING);
923                                 return 0;
924                         }
925
926                         if (atomic_read(&c->kick_gc) != last_kick)
927                                 break;
928
929                         if (c->btree_gc_periodic) {
930                                 unsigned long next = last + c->capacity / 16;
931
932                                 if (atomic_long_read(&clock->now) >= next)
933                                         break;
934
935                                 bch2_io_clock_schedule_timeout(clock, next);
936                         } else {
937                                 schedule();
938                         }
939
940                         try_to_freeze();
941                 }
942                 __set_current_state(TASK_RUNNING);
943
944                 last = atomic_long_read(&clock->now);
945                 last_kick = atomic_read(&c->kick_gc);
946
947                 bch2_gc(c);
948
949                 debug_check_no_locks_held();
950         }
951
952         return 0;
953 }
954
955 void bch2_gc_thread_stop(struct bch_fs *c)
956 {
957         set_bit(BCH_FS_GC_STOPPING, &c->flags);
958
959         if (c->gc_thread)
960                 kthread_stop(c->gc_thread);
961
962         c->gc_thread = NULL;
963         clear_bit(BCH_FS_GC_STOPPING, &c->flags);
964 }
965
966 int bch2_gc_thread_start(struct bch_fs *c)
967 {
968         struct task_struct *p;
969
970         BUG_ON(c->gc_thread);
971
972         p = kthread_create(bch2_gc_thread, c, "bcache_gc");
973         if (IS_ERR(p))
974                 return PTR_ERR(p);
975
976         c->gc_thread = p;
977         wake_up_process(c->gc_thread);
978         return 0;
979 }
980
981 /* Initial GC computes bucket marks during startup */
982
983 static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
984 {
985         struct btree_iter iter;
986         struct btree *b;
987         struct range_checks r;
988         int ret = 0;
989
990         btree_node_range_checks_init(&r, 0);
991
992         if (!c->btree_roots[id].b)
993                 return 0;
994
995         b = c->btree_roots[id].b;
996         if (!btree_node_fake(b))
997                 ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
998                                                   bkey_i_to_s_c(&b->key));
999         if (ret)
1000                 return ret;
1001
1002         /*
1003          * We have to hit every btree node before starting journal replay, in
1004          * order for the journal seq blacklist machinery to work:
1005          */
1006         for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
1007                 btree_node_range_checks(c, b, &r);
1008
1009                 if (btree_node_has_ptrs(b)) {
1010                         struct btree_node_iter node_iter;
1011                         struct bkey unpacked;
1012                         struct bkey_s_c k;
1013
1014                         for_each_btree_node_key_unpack(b, k, &node_iter,
1015                                                        btree_node_is_extents(b),
1016                                                        &unpacked) {
1017                                 ret = bch2_btree_mark_key_initial(c,
1018                                                         btree_node_type(b), k);
1019                                 if (ret)
1020                                         goto err;
1021                         }
1022                 }
1023
1024                 bch2_btree_iter_cond_resched(&iter);
1025         }
1026 err:
1027         return bch2_btree_iter_unlock(&iter) ?: ret;
1028 }
1029
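/*
 * Initial mark and sweep: walk every btree and the journal keys, marking
 * everything they reference; if any bucket gens had to be fixed, restart (a
 * bounded number of times) so the repaired gens are accounted for:
 */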
1030 static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1031 {
1032         unsigned iter = 0;
1033         enum btree_id id;
1034         int ret;
1035
1036         mutex_lock(&c->sb_lock);
1037         if (!bch2_sb_get_replicas(c->disk_sb)) {
1038                 if (BCH_SB_INITIALIZED(c->disk_sb))
1039                         bch_info(c, "building replicas info");
1040                 set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
1041         }
1042         mutex_unlock(&c->sb_lock);
1043 again:
1044         bch2_gc_start(c);
1045
1046         for (id = 0; id < BTREE_ID_NR; id++) {
1047                 ret = bch2_initial_gc_btree(c, id);
1048                 if (ret)
1049                         return ret;
1050         }
1051
1052         ret = bch2_journal_mark(c, journal);
1053         if (ret)
1054                 return ret;
1055
1056         if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
1057                 if (iter++ > 2) {
1058                         bch_info(c, "Unable to fix bucket gens, looping");
1059                         return -EINVAL;
1060                 }
1061
1062                 bch_info(c, "Fixed gens, restarting initial mark and sweep:");
1063                 clear_bit(BCH_FS_FIXED_GENS, &c->flags);
1064                 goto again;
1065         }
1066
1067         /*
1068          * Skip past versions that might have possibly been used (as nonces),
1069          * but hadn't had their pointers written:
1070          */
1071         if (c->sb.encryption_type)
1072                 atomic64_add(1 << 16, &c->key_version);
1073
1074         bch2_mark_superblocks(c);
1075
1076         gc_pos_set(c, gc_phase(GC_PHASE_DONE));
1077         set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
1078
1079         return 0;
1080 }
1081
1082 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1083 {
1084         int ret;
1085
1086         down_write(&c->gc_lock);
1087         ret = __bch2_initial_gc(c, journal);
1088         up_write(&c->gc_lock);
1089
1090         return ret;
1091 }