2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 * Copyright (C) 2014 Datera Inc.
7 #include "alloc_background.h"
8 #include "alloc_foreground.h"
9 #include "bkey_methods.h"
10 #include "btree_locking.h"
11 #include "btree_update_interior.h"
27 #include <linux/slab.h>
28 #include <linux/bitops.h>
29 #include <linux/freezer.h>
30 #include <linux/kthread.h>
31 #include <linux/preempt.h>
32 #include <linux/rcupdate.h>
33 #include <linux/sched/task.h>
34 #include <trace/events/bcachefs.h>
36 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
/* Publish a new GC position under the gc_pos_lock seqcount so lockless
 * readers can retry if they race with the update. */
38 write_seqcount_begin(&c->gc_pos_lock);
40 write_seqcount_end(&c->gc_pos_lock);
43 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
/* GC position must strictly advance in the global GC ordering —
 * moving backwards would let GC miss references (see bch2_gc() comment). */
45 BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
46 __gc_pos_set(c, new_pos);
49 /* range_checks - for validating min/max pos of each btree node: */
59 static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
/* Reset the per-level min/max tracking to POS_MIN before walking a btree. */
63 for (i = 0; i < BTREE_MAX_DEPTH; i++)
64 r->l[i].min = r->l[i].max = POS_MIN;
/*
 * Validate that @b's min/max keys agree with its siblings and children:
 * each node's min_key must be the successor of the previous node's max_key
 * at the same level, and an interior node's range must match the union of
 * its children's ranges. Inconsistencies are reported via
 * bch2_fs_inconsistent_on().
 */
68 static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
69 struct range_checks *r)
71 struct range_level *l = &r->l[b->level];
/* Expected min: successor of the previous sibling's max at this level. */
73 struct bpos expected_min = bkey_cmp(l->min, l->max)
74 ? btree_type_successor(b->btree_id, l->max)
77 bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
78 "btree node has incorrect min key: %llu:%llu != %llu:%llu",
79 b->data->min_key.inode,
80 b->data->min_key.offset,
84 l->max = b->data->max_key;
/* Interior node: cross-check against the accumulated child range one level down. */
86 if (b->level > r->depth) {
87 l = &r->l[b->level - 1];
89 bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
90 "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
91 b->data->min_key.inode,
92 b->data->min_key.offset,
96 bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
97 "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
98 b->data->max_key.inode,
99 b->data->max_key.offset,
/* POS_MAX has no successor — only advance for non-terminal nodes. */
103 if (bkey_cmp(b->data->max_key, POS_MAX))
105 btree_type_successor(b->btree_id,
110 /* marking of btree keys/nodes: */
/*
 * Mark a single key for GC: verify/repair per-pointer bucket generations,
 * update replicas accounting, track the max pointer staleness seen, then
 * hand the key to bch2_mark_key() for bucket accounting.
 */
112 static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
113 u8 *max_stale, bool initial)
115 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
116 const struct bch_extent_ptr *ptr;
119 (initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
/* With journal_seq_verify, key versions must not be ahead of the journal. */
123 BUG_ON(journal_seq_verify(c) &&
124 k.k->version.lo > journal_cur_seq(&c->journal));
/* Track the highest key version seen filesystem-wide. */
126 if (k.k->version.lo > atomic64_read(&c->key_version))
127 atomic64_set(&c->key_version, k.k->version.lo);
129 if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
130 fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c,
131 "superblock not marked as containing replicas (type %u)",
133 ret = bch2_mark_bkey_replicas(c, k);
/* Repair pass: fix missing or future bucket gens in both bucket arrays
 * (GC copy @g and primary copy @g2). */
138 bkey_for_each_ptr(ptrs, ptr) {
139 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
140 struct bucket *g = PTR_BUCKET(ca, ptr, true);
141 struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
143 if (mustfix_fsck_err_on(!g->gen_valid, c,
144 "found ptr with missing gen in alloc btree,\n"
146 k.k->type, ptr->gen)) {
147 g2->_mark.gen = g->_mark.gen = ptr->gen;
148 g2->_mark.dirty = g->_mark.dirty = true;
149 g2->gen_valid = g->gen_valid = true;
152 if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
153 "%u ptr gen in the future: %u > %u",
154 k.k->type, ptr->gen, g->mark.gen)) {
155 g2->_mark.gen = g->_mark.gen = ptr->gen;
156 g2->_mark.dirty = g->_mark.dirty = true;
157 g2->gen_valid = g->gen_valid = true;
/* Fixed gens mean GC must restart the mark-and-sweep pass. */
158 set_bit(BCH_FS_FIXED_GENS, &c->flags);
/* Second pass: update oldest_gen and record worst-case staleness. */
163 bkey_for_each_ptr(ptrs, ptr) {
164 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
165 struct bucket *g = PTR_BUCKET(ca, ptr, true);
167 if (gen_after(g->oldest_gen, ptr->gen))
168 g->oldest_gen = ptr->gen;
170 *max_stale = max(*max_stale, ptr_stale(ca, ptr));
173 bch2_mark_key(c, k, k.k->size, NULL, 0, flags);
/*
 * Mark every key in btree node @b, debug-checking each one first.
 * Nodes whose type doesn't need GC are skipped entirely.
 */
178 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
179 u8 *max_stale, bool initial)
181 struct btree_node_iter iter;
182 struct bkey unpacked;
188 if (!btree_node_type_needs_gc(btree_node_type(b)))
191 for_each_btree_node_key_unpack(b, k, &iter,
193 bch2_bkey_debugcheck(c, b, k);
195 ret = bch2_gc_mark_key(c, k, max_stale, initial);
/*
 * Walk one btree in key order, marking every node and finally the root.
 * Updates the global GC position as it goes, and opportunistically
 * rewrites nodes whose pointers are stale (max_stale heuristics below).
 */
203 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
204 bool initial, bool metadata_only)
206 struct btree_trans trans;
207 struct btree_iter *iter;
209 struct range_checks r;
/* Depth 1 = interior nodes only (metadata / non-GC btrees); 0 = full walk. */
210 unsigned depth = metadata_only ? 1
211 : expensive_debug_checks(c) ? 0
212 : !btree_node_type_needs_gc(btree_id) ? 1
217 bch2_trans_init(&trans, c, 0, 0);
219 gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
221 btree_node_range_checks_init(&r, depth);
223 __for_each_btree_node(&trans, iter, btree_id, POS_MIN,
224 0, depth, BTREE_ITER_PREFETCH, b) {
225 btree_node_range_checks(c, b, &r);
227 bch2_verify_btree_nr_keys(b);
229 gc_pos_set(c, gc_pos_btree_node(b));
231 ret = btree_gc_mark_node(c, b, &max_stale, initial);
237 bch2_btree_node_rewrite(c, iter,
239 BTREE_INSERT_USE_RESERVE|
241 BTREE_INSERT_GC_LOCK_HELD);
/* Rewrite nodes with very stale pointers (>16 gens) to refresh them. */
242 else if (!btree_gc_rewrite_disabled(c) &&
243 (btree_gc_always_rewrite(c) || max_stale > 16))
244 bch2_btree_node_rewrite(c, iter,
247 BTREE_INSERT_GC_LOCK_HELD);
250 bch2_trans_cond_resched(&trans);
252 ret = bch2_trans_exit(&trans) ?: ret;
/* Mark the root last, under btree_root_lock (fake roots have no key). */
256 mutex_lock(&c->btree_root_lock);
257 b = c->btree_roots[btree_id].b;
258 if (!btree_node_fake(b))
259 ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
260 &max_stale, initial);
261 gc_pos_set(c, gc_pos_btree_root(b->btree_id));
262 mutex_unlock(&c->btree_root_lock);
/* Comparator for sorting btree ids by the order GC visits them. */
267 static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
269 return (int) btree_id_to_gc_phase(l) -
270 (int) btree_id_to_gc_phase(r);
/*
 * Mark a key replayed from the journal: mark the key itself, then walk
 * the keys it overlaps in btree @id and account the overwrites.
 */
273 static int mark_journal_key(struct bch_fs *c, enum btree_id id,
274 struct bkey_i *insert)
276 struct btree_trans trans;
277 struct btree_iter *iter;
282 ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
286 bch2_trans_init(&trans, c, 0, 0);
288 for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
289 BTREE_ITER_SLOTS, k, ret) {
/* mark_lock protects the bucket accounting touched by mark_overwrite. */
290 percpu_down_read_preempt_disable(&c->mark_lock);
291 ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
293 BCH_BUCKET_MARK_NOATOMIC);
294 percpu_up_read_preempt_enable(&c->mark_lock);
300 return bch2_trans_exit(&trans) ?: ret;
/*
 * GC all btrees in GC-phase order, then mark any not-yet-replayed
 * journal keys belonging to btrees whose keys need GC.
 */
303 static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
304 bool initial, bool metadata_only)
306 enum btree_id ids[BTREE_ID_NR];
309 for (i = 0; i < BTREE_ID_NR; i++)
/* Visit btrees in the same total order gc_pos uses. */
311 bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
313 for (i = 0; i < BTREE_ID_NR; i++) {
314 enum btree_id id = ids[i];
315 enum btree_node_type type = __btree_node_type(0, id);
317 int ret = bch2_gc_btree(c, id, initial, metadata_only);
321 if (journal_keys && !metadata_only &&
322 btree_node_type_needs_gc(type)) {
323 struct journal_key *j;
326 for_each_journal_key(*journal_keys, j)
327 if (j->btree_id == id) {
328 ret = mark_journal_key(c, id, j->k);
/*
 * Mark the sector range [start, end) as metadata of @type, one bucket
 * at a time (a range may straddle bucket boundaries).
 */
338 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
340 enum bch_data_type type,
343 u64 b = sector_to_bucket(ca, start);
/* Sectors of this bucket covered by the range (clamped at @end). */
347 min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
349 bch2_mark_metadata_bucket(c, ca, b, type, sectors,
350 gc_phase(GC_PHASE_SB), flags);
353 } while (start < end);
/*
 * Mark the buckets holding a device's superblocks and journal.
 * Requires sb_lock; takes mark_lock for the accounting updates.
 */
356 void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
359 struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
364 * This conditional is kind of gross, but we may be called from the
365 * device add path, before the new device has actually been added to the
366 * running filesystem:
369 lockdep_assert_held(&c->sb_lock);
370 percpu_down_read_preempt_disable(&c->mark_lock);
/* Mark every superblock copy in the layout; the copy at BCH_SB_SECTOR
 * also implies the sectors before it are reserved. */
375 for (i = 0; i < layout->nr_superblocks; i++) {
376 u64 offset = le64_to_cpu(layout->sb_offset[i]);
378 if (offset == BCH_SB_SECTOR)
379 mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
382 mark_metadata_sectors(c, ca, offset,
383 offset + (1 << layout->sb_max_size_bits),
/* Journal buckets are whole-bucket metadata. */
387 for (i = 0; i < ca->journal.nr; i++) {
388 b = ca->journal.buckets[i];
389 bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
391 gc_phase(GC_PHASE_SB), flags);
395 percpu_up_read_preempt_enable(&c->mark_lock);
/* GC pass: mark superblock/journal buckets of every online device. */
401 static void bch2_mark_superblocks(struct bch_fs *c)
406 mutex_lock(&c->sb_lock);
407 gc_pos_set(c, gc_phase(GC_PHASE_SB));
409 for_each_online_member(ca, c, i)
410 bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
411 mutex_unlock(&c->sb_lock);
414 /* Also see bch2_pending_btree_node_free_insert_done() */
/* GC pass: account btree nodes whose index updates completed but whose
 * on-disk space hasn't been freed yet. */
415 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
417 struct btree_update *as;
418 struct pending_btree_node_free *d;
420 mutex_lock(&c->btree_interior_update_lock);
421 gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
423 for_each_pending_btree_node_free(c, as, d)
424 if (d->index_update_done)
425 bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0,
428 mutex_unlock(&c->btree_interior_update_lock);
/*
 * GC pass: mark buckets currently owned by the allocator — per-device
 * free_inc and reserve fifos, plus the filesystem's open_buckets array.
 */
431 static void bch2_mark_allocator_buckets(struct bch_fs *c)
434 struct open_bucket *ob;
438 percpu_down_read_preempt_disable(&c->mark_lock);
440 spin_lock(&c->freelist_lock);
441 gc_pos_set(c, gc_pos_alloc(c, NULL));
443 for_each_member_device(ca, c, ci) {
444 fifo_for_each_entry(i, &ca->free_inc, iter)
445 bch2_mark_alloc_bucket(c, ca, i, true,
446 gc_pos_alloc(c, NULL),
451 for (j = 0; j < RESERVE_NR; j++)
452 fifo_for_each_entry(i, &ca->free[j], iter)
453 bch2_mark_alloc_bucket(c, ca, i, true,
454 gc_pos_alloc(c, NULL),
458 spin_unlock(&c->freelist_lock);
/* Open buckets each have their own lock and their own gc position. */
460 for (ob = c->open_buckets;
461 ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
463 spin_lock(&ob->lock);
465 gc_pos_set(c, gc_pos_alloc(c, ob));
466 ca = bch_dev_bkey_exists(c, ob->ptr.dev);
467 bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
471 spin_unlock(&ob->lock);
474 percpu_up_read_preempt_enable(&c->mark_lock);
/* Free the GC shadow copies (index [1]) of stripes, bucket arrays and
 * per-device usage, plus the GC fs-usage percpu counters. */
477 static void bch2_gc_free(struct bch_fs *c)
482 genradix_free(&c->stripes[1]);
484 for_each_member_device(ca, c, i) {
485 kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
486 sizeof(struct bucket_array) +
487 ca->mi.nbuckets * sizeof(struct bucket));
488 ca->buckets[1] = NULL;
490 free_percpu(ca->usage[1]);
494 free_percpu(c->usage_gc);
/*
 * Compare the GC shadow copies ([1] / usage_gc) against the live copies
 * ([0] / usage_base) and reconcile: report mismatches via fsck_err and
 * copy the recomputed values over the live ones. The copy_* macros
 * below implement the compare-report-copy pattern for each field class.
 */
498 static int bch2_gc_done(struct bch_fs *c,
499 bool initial, bool metadata_only)
502 bool verify = !metadata_only &&
504 (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
508 #define copy_field(_f, _msg, ...) \
509 if (dst->_f != src->_f) { \
511 fsck_err(c, _msg ": got %llu, should be %llu" \
512 , ##__VA_ARGS__, dst->_f, src->_f); \
515 #define copy_stripe_field(_f, _msg, ...) \
516 if (dst->_f != src->_f) { \
518 fsck_err(c, "stripe %zu has wrong "_msg \
519 ": got %u, should be %u", \
520 dst_iter.pos, ##__VA_ARGS__, \
525 #define copy_bucket_field(_f) \
526 if (dst->b[b].mark._f != src->b[b].mark._f) { \
528 fsck_err(c, "dev %u bucket %zu has wrong " #_f \
529 ": got %u, should be %u", i, b, \
530 dst->b[b].mark._f, src->b[b].mark._f); \
531 dst->b[b]._mark._f = src->b[b].mark._f; \
532 dst->b[b]._mark.dirty = true; \
534 #define copy_dev_field(_f, _msg, ...) \
535 copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
536 #define copy_fs_field(_f, _msg, ...) \
537 copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
/* Reconcile erasure-coding stripes, rebuilding the stripes heap. */
539 if (!metadata_only) {
540 struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
541 struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
542 struct stripe *dst, *src;
545 c->ec_stripes_heap.used = 0;
547 while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
548 (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
549 BUG_ON(src_iter.pos != dst_iter.pos);
551 copy_stripe_field(alive, "alive");
552 copy_stripe_field(sectors, "sectors");
553 copy_stripe_field(algorithm, "algorithm");
554 copy_stripe_field(nr_blocks, "nr_blocks");
555 copy_stripe_field(nr_redundant, "nr_redundant");
556 copy_stripe_field(blocks_nonempty,
559 for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
560 copy_stripe_field(block_sectors[i],
561 "block_sectors[%u]", i);
564 bch2_stripes_heap_insert(c, dst, dst_iter.pos);
566 genradix_iter_advance(&dst_iter, &c->stripes[0]);
567 genradix_iter_advance(&src_iter, &c->stripes[1]);
/* Reconcile per-bucket marks for every device. */
571 for_each_member_device(ca, c, i) {
572 struct bucket_array *dst = __bucket_array(ca, 0);
573 struct bucket_array *src = __bucket_array(ca, 1);
576 for (b = 0; b < src->nbuckets; b++) {
577 copy_bucket_field(gen);
578 copy_bucket_field(data_type);
579 copy_bucket_field(owned_by_allocator);
580 copy_bucket_field(stripe);
581 copy_bucket_field(dirty_sectors);
582 copy_bucket_field(cached_sectors);
584 if (dst->b[b].oldest_gen != src->b[b].oldest_gen) {
585 dst->b[b].oldest_gen = src->b[b].oldest_gen;
586 dst->b[b]._mark.dirty = true;
/* Fold accumulators into the base counters before comparing fs usage. */
591 bch2_fs_usage_acc_to_base(c, 0);
592 bch2_fs_usage_acc_to_base(c, 1);
594 bch2_dev_usage_from_buckets(c);
597 unsigned nr = fs_usage_u64s(c);
598 struct bch_fs_usage *dst = c->usage_base;
599 struct bch_fs_usage *src = (void *)
600 bch2_acc_percpu_u64s((void *) c->usage_gc, nr);
602 copy_fs_field(hidden, "hidden");
603 copy_fs_field(btree, "btree");
605 if (!metadata_only) {
606 copy_fs_field(data, "data");
607 copy_fs_field(cached, "cached");
608 copy_fs_field(reserved, "reserved");
609 copy_fs_field(nr_inodes,"nr_inodes");
611 for (i = 0; i < BCH_REPLICAS_MAX; i++)
612 copy_fs_field(persistent_reserved[i],
613 "persistent_reserved[%i]", i);
616 for (i = 0; i < c->replicas.nr; i++) {
617 struct bch_replicas_entry *e =
618 cpu_replicas_entry(&c->replicas, i);
622 (e->data_type == BCH_DATA_USER ||
623 e->data_type == BCH_DATA_CACHED))
626 bch2_replicas_entry_to_text(&PBUF(buf), e);
628 copy_fs_field(replicas[i], "%s", buf);
633 #undef copy_dev_field
634 #undef copy_bucket_field
635 #undef copy_stripe_field
/*
 * Allocate and initialize the GC shadow state: usage_gc percpu counters,
 * per-device shadow bucket arrays and usage, seeded from the live copies.
 * On allocation failure the held device ref is dropped before returning.
 */
641 static int bch2_gc_start(struct bch_fs *c,
648 * indicate to stripe code that we need to allocate for the gc stripes
651 gc_pos_set(c, gc_phase(GC_PHASE_START));
655 c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
656 sizeof(u64), GFP_KERNEL);
660 for_each_member_device(ca, c, i) {
661 BUG_ON(ca->buckets[1]);
662 BUG_ON(ca->usage[1]);
664 ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
665 ca->mi.nbuckets * sizeof(struct bucket),
666 GFP_KERNEL|__GFP_ZERO);
667 if (!ca->buckets[1]) {
668 percpu_ref_put(&ca->ref);
672 ca->usage[1] = alloc_percpu(struct bch_dev_usage);
674 percpu_ref_put(&ca->ref);
/* Seed the shadow bucket arrays with gens from the live copies. */
679 for_each_member_device(ca, c, i) {
680 struct bucket_array *dst = __bucket_array(ca, 1);
681 struct bucket_array *src = __bucket_array(ca, 0);
684 dst->first_bucket = src->first_bucket;
685 dst->nbuckets = src->nbuckets;
687 for (b = 0; b < src->nbuckets; b++) {
688 struct bucket *d = &dst->b[b];
689 struct bucket *s = &src->b[b];
691 d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
692 d->gen_valid = s->gen_valid;
695 (s->mark.data_type == BCH_DATA_USER ||
696 s->mark.data_type == BCH_DATA_CACHED)) {
698 d->_mark.owned_by_allocator = 0;
703 return bch2_ec_mem_alloc(c, true);
707 * bch2_gc - walk _all_ references to buckets, and recompute them:
709 * Order matters here:
710 * - Concurrent GC relies on the fact that we have a total ordering for
711 * everything that GC walks - see gc_will_visit_node(),
712 * gc_will_visit_root()
714 * - also, references move around in the course of index updates and
715 * various other crap: everything needs to agree on the ordering
716 * references are allowed to move around in - e.g., we're allowed to
717 * start with a reference owned by an open_bucket (the allocator) and
718 * move it to the btree, but not the reverse.
720 * This is necessary to ensure that gc doesn't miss references that
721 * move around - if references move backwards in the ordering GC
722 * uses, GC could skip past them
724 int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
725 bool initial, bool metadata_only)
728 u64 start_time = local_clock();
729 unsigned i, iter = 0;
734 down_write(&c->gc_lock);
/* Allocate/seed GC shadow state under mark_lock. */
736 percpu_down_write(&c->mark_lock);
737 ret = bch2_gc_start(c, metadata_only);
738 percpu_up_write(&c->mark_lock);
/* Mark everything in gc_pos order: superblocks, btrees (+ journal keys),
 * pending node frees, allocator-owned buckets. */
743 bch2_mark_superblocks(c);
745 ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
749 bch2_mark_pending_btree_node_frees(c);
750 bch2_mark_allocator_buckets(c);
/* If bucket gens were repaired mid-pass the counts are suspect:
 * reset and restart the whole mark-and-sweep. */
755 (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
756 (!iter && test_restart_gc(c)))) {
758 * XXX: make sure gens we fixed got saved
761 bch_info(c, "Fixed gens, restarting mark and sweep:");
762 clear_bit(BCH_FS_FIXED_GENS, &c->flags);
763 __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
765 percpu_down_write(&c->mark_lock);
767 percpu_up_write(&c->mark_lock);
772 bch_info(c, "Unable to fix bucket gens, looping");
/* Block the journal while reconciling GC results with live state. */
777 bch2_journal_block(&c->journal);
779 percpu_down_write(&c->mark_lock);
780 ret = bch2_gc_done(c, initial, metadata_only);
782 bch2_journal_unblock(&c->journal);
784 percpu_down_write(&c->mark_lock);
787 /* Indicates that gc is no longer in progress: */
788 __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
791 percpu_up_write(&c->mark_lock);
793 up_write(&c->gc_lock);
796 bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
799 * Wake up allocator in case it was waiting for buckets
800 * because of not being able to inc gens
802 for_each_member_device(ca, c, i)
803 bch2_wake_allocator(ca);
806 * At startup, allocations can happen directly instead of via the
807 * allocator thread - issue wakeup in case they blocked on gc_lock:
809 closure_wake_up(&c->freelist_wait);
813 /* Btree coalescing */
/* Recount b->nr key-count statistics from scratch by walking the
 * (single) bset of a freshly built node. */
815 static void recalc_packed_keys(struct btree *b)
817 struct bset *i = btree_bset_first(b);
818 struct bkey_packed *k;
820 memset(&b->nr, 0, sizeof(b->nr));
822 BUG_ON(b->nsets != 1);
824 vstruct_for_each(i, k)
825 btree_keys_account_key_add(&b->nr, 0, k);
/*
 * Try to merge the sliding window of adjacent sibling nodes @old_nodes
 * into fewer nodes. Bails out (leaving old_nodes untouched) if the keys
 * won't fit in one fewer node, if no common format exists, or if the
 * keylist/update reservation can't be obtained.
 */
828 static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
829 struct btree *old_nodes[GC_MERGE_NODES])
831 struct btree *parent = btree_node_parent(iter, old_nodes[0]);
832 unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
/* Target fill: only coalesce if result nodes stay under 2/3 full. */
833 unsigned blocks = btree_blocks(c) * 2 / 3;
834 struct btree *new_nodes[GC_MERGE_NODES];
835 struct btree_update *as;
836 struct keylist keylist;
837 struct bkey_format_state format_state;
838 struct bkey_format new_format;
840 memset(new_nodes, 0, sizeof(new_nodes));
841 bch2_keylist_init(&keylist, NULL);
843 /* Count keys that are not deleted */
844 for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
845 u64s += old_nodes[i]->nr.live_u64s;
847 nr_old_nodes = nr_new_nodes = i;
849 /* Check if all keys in @old_nodes could fit in one fewer node */
850 if (nr_old_nodes <= 1 ||
851 __vstruct_blocks(struct btree_node, c->block_bits,
852 DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
855 /* Find a format that all keys in @old_nodes can pack into */
856 bch2_bkey_format_init(&format_state);
858 for (i = 0; i < nr_old_nodes; i++)
859 __bch2_btree_calc_format(&format_state, old_nodes[i]);
861 new_format = bch2_bkey_format_done(&format_state);
863 /* Check if repacking would make any nodes too big to fit */
864 for (i = 0; i < nr_old_nodes; i++)
865 if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
866 trace_btree_gc_coalesce_fail(c,
867 BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
871 if (bch2_keylist_realloc(&keylist, NULL, 0,
872 (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
873 trace_btree_gc_coalesce_fail(c,
874 BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
878 as = bch2_btree_update_start(c, iter->btree_id,
879 btree_update_reserve_required(c, parent) + nr_old_nodes,
881 BTREE_INSERT_USE_RESERVE,
884 trace_btree_gc_coalesce_fail(c,
885 BTREE_GC_COALESCE_FAIL_RESERVE_GET);
886 bch2_keylist_free(&keylist, NULL);
890 trace_btree_gc_coalesce(c, old_nodes[0]);
892 for (i = 0; i < nr_old_nodes; i++)
893 bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
895 /* Repack everything with @new_format and sort down to one bset */
896 for (i = 0; i < nr_old_nodes; i++)
898 __bch2_btree_node_alloc_replacement(as, old_nodes[i],
902 * Conceptually we concatenate the nodes together and slice them
903 * up at different boundaries.
905 for (i = nr_new_nodes - 1; i > 0; --i) {
906 struct btree *n1 = new_nodes[i];
907 struct btree *n2 = new_nodes[i - 1];
909 struct bset *s1 = btree_bset_first(n1);
910 struct bset *s2 = btree_bset_first(n2);
911 struct bkey_packed *k, *last = NULL;
913 /* Calculate how many keys from @n2 we could fit inside @n1 */
917 k < vstruct_last(s2) &&
918 vstruct_blocks_plus(n1->data, c->block_bits,
919 u64s + k->u64s) <= blocks;
925 if (u64s == le16_to_cpu(s2->u64s)) {
926 /* n2 fits entirely in n1 */
927 n1->key.k.p = n1->data->max_key = n2->data->max_key;
929 memcpy_u64s(vstruct_last(s1),
931 le16_to_cpu(s2->u64s));
932 le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
934 set_btree_bset_end(n1, n1->set);
/* n2 is now empty: free it and drop it from the window. */
936 six_unlock_write(&n2->lock);
937 bch2_btree_node_free_never_inserted(c, n2);
938 six_unlock_intent(&n2->lock);
940 memmove(new_nodes + i - 1,
942 sizeof(new_nodes[0]) * (nr_new_nodes - i));
943 new_nodes[--nr_new_nodes] = NULL;
945 /* move part of n2 into n1 */
946 n1->key.k.p = n1->data->max_key =
947 bkey_unpack_pos(n1, last);
950 btree_type_successor(iter->btree_id,
953 memcpy_u64s(vstruct_last(s1),
955 le16_add_cpu(&s1->u64s, u64s);
958 vstruct_idx(s2, u64s),
959 (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
960 s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
962 set_btree_bset_end(n1, n1->set);
963 set_btree_bset_end(n2, n2->set);
/* Finalize each surviving new node and write it out. */
967 for (i = 0; i < nr_new_nodes; i++) {
968 struct btree *n = new_nodes[i];
970 recalc_packed_keys(n);
971 btree_node_reset_sib_u64s(n);
973 bch2_btree_build_aux_trees(n);
974 six_unlock_write(&n->lock);
976 bch2_btree_node_write(c, n, SIX_LOCK_intent);
980 * The keys for the old nodes get deleted. We don't want to insert keys
981 * that compare equal to the keys for the new nodes we'll also be
982 * inserting - we can't because keys on a keylist must be strictly
983 * greater than the previous keys, and we also don't need to since the
984 * key for the new node will serve the same purpose (overwriting the key
987 for (i = 0; i < nr_old_nodes; i++) {
988 struct bkey_i delete;
991 for (j = 0; j < nr_new_nodes; j++)
992 if (!bkey_cmp(old_nodes[i]->key.k.p,
993 new_nodes[j]->key.k.p))
996 bkey_init(&delete.k);
997 delete.k.p = old_nodes[i]->key.k.p;
998 bch2_keylist_add_in_order(&keylist, &delete);
1004 * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
1005 * does the lookup once and thus expects the keys to be in sorted order
1006 * so we have to make sure the new keys are correctly ordered with
1007 * respect to the deleted keys added in the previous loop
1009 for (i = 0; i < nr_new_nodes; i++)
1010 bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
1012 /* Insert the newly coalesced nodes */
1013 bch2_btree_insert_node(as, parent, iter, &keylist, 0);
1015 BUG_ON(!bch2_keylist_empty(&keylist));
1017 BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
1019 bch2_btree_iter_node_replace(iter, new_nodes[0]);
1021 for (i = 0; i < nr_new_nodes; i++)
1022 bch2_open_buckets_put(c, &new_nodes[i]->ob);
1024 /* Free the old nodes and update our sliding window */
1025 for (i = 0; i < nr_old_nodes; i++) {
1026 bch2_btree_node_free_inmem(c, old_nodes[i], iter);
1029 * the index update might have triggered a split, in which case
1030 * the nodes we coalesced - the new nodes we just created -
1031 * might not be sibling nodes anymore - don't add them to the
1032 * sliding window (except the first):
1035 old_nodes[i] = new_nodes[i];
1037 old_nodes[i] = NULL;
1039 six_unlock_intent(&new_nodes[i]->lock);
1043 bch2_btree_update_done(as);
1044 bch2_keylist_free(&keylist, NULL);
/*
 * Walk one btree maintaining a sliding window of GC_MERGE_NODES adjacent
 * nodes (plus their lock sequence numbers so they can be relocked), and
 * feed each window to bch2_coalesce_nodes().
 */
1047 static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
1049 struct btree_trans trans;
1050 struct btree_iter *iter;
1052 bool kthread = (current->flags & PF_KTHREAD) != 0;
1055 /* Sliding window of adjacent btree nodes */
1056 struct btree *merge[GC_MERGE_NODES];
1057 u32 lock_seq[GC_MERGE_NODES];
1059 bch2_trans_init(&trans, c, 0, 0);
1062 * XXX: We don't have a good way of positively matching on sibling nodes
1063 * that have the same parent - this code works by handling the cases
1064 * where they might not have the same parent, and is thus fragile. Ugh.
1066 * Perhaps redo this to use multiple linked iterators?
1068 memset(merge, 0, sizeof(merge));
1070 __for_each_btree_node(&trans, iter, btree_id, POS_MIN,
1072 BTREE_ITER_PREFETCH, b) {
/* Shift the window: newest node goes in slot 0. */
1073 memmove(merge + 1, merge,
1074 sizeof(merge) - sizeof(merge[0]));
1075 memmove(lock_seq + 1, lock_seq,
1076 sizeof(lock_seq) - sizeof(lock_seq[0]));
/* Relock older window entries; drop any that changed or differ in level. */
1080 for (i = 1; i < GC_MERGE_NODES; i++) {
1082 !six_relock_intent(&merge[i]->lock, lock_seq[i]))
1085 if (merge[i]->level != merge[0]->level) {
1086 six_unlock_intent(&merge[i]->lock);
1090 memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
1092 bch2_coalesce_nodes(c, iter, merge);
1094 for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
1095 lock_seq[i] = merge[i]->lock.state.seq;
1096 six_unlock_intent(&merge[i]->lock);
1099 lock_seq[0] = merge[0]->lock.state.seq;
1101 if (kthread && kthread_should_stop()) {
1102 bch2_trans_exit(&trans);
1106 bch2_trans_cond_resched(&trans);
1109 * If the parent node wasn't relocked, it might have been split
1110 * and the nodes in our sliding window might not have the same
1111 * parent anymore - blow away the sliding window:
1113 if (btree_iter_node(iter, iter->level + 1) &&
1114 !btree_node_intent_locked(iter, iter->level + 1))
1115 memset(merge + 1, 0,
1116 (GC_MERGE_NODES - 1) * sizeof(merge[0]));
1118 return bch2_trans_exit(&trans);
1122 * bch_coalesce - coalesce adjacent nodes with low occupancy
/* Run btree coalescing across all btrees, under a read lock on gc_lock.
 * -ESHUTDOWN (thread stopping) is not treated as an error. */
1124 void bch2_coalesce(struct bch_fs *c)
1128 down_read(&c->gc_lock);
1129 trace_gc_coalesce_start(c);
1131 for (id = 0; id < BTREE_ID_NR; id++) {
1132 int ret = c->btree_roots[id].b
1133 ? bch2_coalesce_btree(c, id)
1137 if (ret != -ESHUTDOWN)
1138 bch_err(c, "btree coalescing failed: %d", ret);
1143 trace_gc_coalesce_end(c);
1144 up_read(&c->gc_lock);
/*
 * Background GC kthread: sleeps until either kicked (kick_gc counter
 * changes) or, when periodic GC is enabled, the write io_clock advances
 * by capacity/16 — then runs a full bch2_gc() pass.
 */
1147 static int bch2_gc_thread(void *arg)
1149 struct bch_fs *c = arg;
1150 struct io_clock *clock = &c->io_clock[WRITE];
1151 unsigned long last = atomic_long_read(&clock->now);
1152 unsigned last_kick = atomic_read(&c->kick_gc);
1159 set_current_state(TASK_INTERRUPTIBLE);
1161 if (kthread_should_stop()) {
1162 __set_current_state(TASK_RUNNING);
1166 if (atomic_read(&c->kick_gc) != last_kick)
1169 if (c->btree_gc_periodic) {
1170 unsigned long next = last + c->capacity / 16;
1172 if (atomic_long_read(&clock->now) >= next)
1175 bch2_io_clock_schedule_timeout(clock, next);
1182 __set_current_state(TASK_RUNNING);
/* Record the state that triggered this run before starting GC. */
1184 last = atomic_long_read(&clock->now);
1185 last_kick = atomic_read(&c->kick_gc);
1187 ret = bch2_gc(c, NULL, false, false);
1189 bch_err(c, "btree gc failed: %i", ret);
1191 debug_check_no_locks_held();
/* Detach and stop the background GC thread. */
1197 void bch2_gc_thread_stop(struct bch_fs *c)
1199 struct task_struct *p;
1202 c->gc_thread = NULL;
1210 int bch2_gc_thread_start(struct bch_fs *c)
1212 struct task_struct *p;
1214 BUG_ON(c->gc_thread);
1216 p = kthread_create(bch2_gc_thread, c, "bch_gc");