1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  * Copyright (C) 2014 Datera Inc.
4  */
5
6 #include "bcachefs.h"
7 #include "alloc.h"
8 #include "bkey_methods.h"
9 #include "btree_locking.h"
10 #include "btree_update_interior.h"
11 #include "btree_io.h"
12 #include "btree_gc.h"
13 #include "buckets.h"
14 #include "clock.h"
15 #include "debug.h"
16 #include "error.h"
17 #include "extents.h"
18 #include "journal.h"
19 #include "keylist.h"
20 #include "move.h"
21 #include "super-io.h"
22
23 #include <linux/slab.h>
24 #include <linux/bitops.h>
25 #include <linux/freezer.h>
26 #include <linux/kthread.h>
27 #include <linux/preempt.h>
28 #include <linux/rcupdate.h>
29 #include <trace/events/bcachefs.h>
30
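/*
 * Sanity checks performed while walking the btree during GC: for each level we
 * track the range of keys seen so far, so we can check that node min/max keys
 * are contiguous and consistent with their child nodes.
 */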
31 struct range_checks {
32         struct range_level {
33                 struct bpos     min;
34                 struct bpos     max;
35         }                       l[BTREE_MAX_DEPTH];
36         unsigned                depth;
37 };
38
39 static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
40 {
41         unsigned i;
42
43         for (i = 0; i < BTREE_MAX_DEPTH; i++)
44                 r->l[i].min = r->l[i].max = POS_MIN;
45         r->depth = depth;
46 }
47
48 static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
49                                     struct range_checks *r)
50 {
51         struct range_level *l = &r->l[b->level];
52
53         struct bpos expected_min = bkey_cmp(l->min, l->max)
54                 ? btree_type_successor(b->btree_id, l->max)
55                 : l->max;
56
57         bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
58                 "btree node has incorrect min key: %llu:%llu != %llu:%llu",
59                 b->data->min_key.inode,
60                 b->data->min_key.offset,
61                 expected_min.inode,
62                 expected_min.offset);
63
64         l->max = b->data->max_key;
65
66         if (b->level > r->depth) {
67                 l = &r->l[b->level - 1];
68
69                 bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
70                         "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
71                         b->data->min_key.inode,
72                         b->data->min_key.offset,
73                         l->min.inode,
74                         l->min.offset);
75
76                 bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
77                         "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
78                         b->data->max_key.inode,
79                         b->data->max_key.offset,
80                         l->max.inode,
81                         l->max.offset);
82
83                 if (bkey_cmp(b->data->max_key, POS_MAX))
84                         l->min = l->max =
85                                 btree_type_successor(b->btree_id,
86                                                      b->data->max_key);
87         }
88 }
89
90 u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
91 {
92         const struct bch_extent_ptr *ptr;
93         u8 max_stale = 0;
94
95         if (bkey_extent_is_data(k.k)) {
96                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
97
98                 extent_for_each_ptr(e, ptr) {
99                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
100                         size_t b = PTR_BUCKET_NR(ca, ptr);
101
102                         if (gen_after(ca->oldest_gens[b], ptr->gen))
103                                 ca->oldest_gens[b] = ptr->gen;
104
105                         max_stale = max(max_stale, ptr_stale(ca, ptr));
106                 }
107         }
108
109         return max_stale;
110 }
111
112 /*
113  * For runtime mark and sweep:
114  */
115 static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
116                            struct bkey_s_c k, unsigned flags)
117 {
118         struct gc_pos pos = { 0 };
119         struct bch_fs_usage *stats;
120         u8 ret = 0;
121
122         preempt_disable();
123         stats = this_cpu_ptr(c->usage_percpu);
124         switch (type) {
125         case BKEY_TYPE_BTREE:
126                 bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
127                               0, flags|
128                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
129                               BCH_BUCKET_MARK_GC_LOCK_HELD);
130                 break;
131         case BKEY_TYPE_EXTENTS:
132                 bch2_mark_key(c, k, k.k->size, false, pos, stats,
133                               0, flags|
134                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
135                               BCH_BUCKET_MARK_GC_LOCK_HELD);
136                 ret = bch2_btree_key_recalc_oldest_gen(c, k);
137                 break;
138         default:
139                 BUG();
140         }
141         preempt_enable();
142
143         return ret;
144 }
145
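/*
 * Initial mark, run at startup before journal replay: make sure the superblock
 * lists replicas for this key's devices, repair bucket gens that are missing
 * or behind the gens of the pointers referencing them, then mark the key.
 */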
146 int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
147                                 struct bkey_s_c k)
148 {
149         enum bch_data_type data_type = type == BKEY_TYPE_BTREE
150                 ? BCH_DATA_BTREE : BCH_DATA_USER;
151         struct bch_devs_list devs = bch2_bkey_devs(k);
152         int ret = 0;
153
154         if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
155             fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c,
156                         "superblock not marked as containing replicas (type %u)",
157                         data_type)) {
158                 ret = bch2_check_mark_super(c, data_type, devs);
159                 if (ret)
160                         return ret;
161         }
162
163         switch (k.k->type) {
164         case BCH_EXTENT:
165         case BCH_EXTENT_CACHED: {
166                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
167                 const struct bch_extent_ptr *ptr;
168
169                 extent_for_each_ptr(e, ptr) {
170                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
171                         size_t b = PTR_BUCKET_NR(ca, ptr);
172                         struct bucket *g = PTR_BUCKET(ca, ptr);
173
174                         if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
175                                         "found ptr with missing gen in alloc btree,\n"
176                                         "type %s gen %u",
177                                         bch2_data_types[data_type],
178                                         ptr->gen)) {
179                                 g->_mark.gen = ptr->gen;
180                                 g->_mark.gen_valid = 1;
181                                 set_bit(b, ca->buckets_dirty);
182                         }
183
184                         if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
185                                         "%s ptr gen in the future: %u > %u",
186                                         bch2_data_types[data_type],
187                                         ptr->gen, g->mark.gen)) {
188                                 g->_mark.gen = ptr->gen;
189                                 g->_mark.gen_valid = 1;
190                                 set_bit(b, ca->buckets_dirty);
191                                 set_bit(BCH_FS_FIXED_GENS, &c->flags);
192                         }
193
194                 }
195                 break;
196         }
197         }
198
199         atomic64_set(&c->key_version,
200                      max_t(u64, k.k->version.lo,
201                            atomic64_read(&c->key_version)));
202
203         bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
204 fsck_err:
205         return ret;
206 }
207
208 static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
209 {
210         enum bkey_type type = btree_node_type(b);
211         struct btree_node_iter iter;
212         struct bkey unpacked;
213         struct bkey_s_c k;
214         u8 stale = 0;
215
216         if (btree_node_has_ptrs(b))
217                 for_each_btree_node_key_unpack(b, k, &iter,
218                                                btree_node_is_extents(b),
219                                                &unpacked) {
220                         bch2_bkey_debugcheck(c, b, k);
221                         stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
222                 }
223
224         return stale;
225 }
226
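/*
 * GC's current position is published under the gc_pos_lock seqcount, so that
 * code checking whether GC has visited a given position (see the ordering
 * comment in bch2_gc() below) reads a consistent value; the position is only
 * allowed to move forwards.
 */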
227 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
228 {
229         write_seqcount_begin(&c->gc_pos_lock);
230         c->gc_pos = new_pos;
231         write_seqcount_end(&c->gc_pos_lock);
232 }
233
234 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
235 {
236         BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
237         __gc_pos_set(c, new_pos);
238 }
239
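/*
 * Walk one btree, marking every key and advancing the GC position as we go;
 * nodes with too many stale pointers are rewritten along the way.
 */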
240 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
241 {
242         struct btree_iter iter;
243         struct btree *b;
244         struct range_checks r;
245         unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
246         unsigned max_stale;
247         int ret = 0;
248
249         /*
250          * if expensive_debug_checks is on, run range_checks on all leaf nodes:
251          */
252         if (expensive_debug_checks(c))
253                 depth = 0;
254
255         btree_node_range_checks_init(&r, depth);
256
257         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
258                               0, depth, BTREE_ITER_PREFETCH, b) {
259                 btree_node_range_checks(c, b, &r);
260
261                 bch2_verify_btree_nr_keys(b);
262
263                 max_stale = btree_gc_mark_node(c, b);
264
265                 gc_pos_set(c, gc_pos_btree_node(b));
266
267                 if (max_stale > 32)
268                         bch2_btree_node_rewrite(c, &iter,
269                                         b->data->keys.seq,
270                                         BTREE_INSERT_USE_RESERVE|
271                                         BTREE_INSERT_GC_LOCK_HELD);
272                 else if (!btree_gc_rewrite_disabled(c) &&
273                          (btree_gc_always_rewrite(c) || max_stale > 16))
274                         bch2_btree_node_rewrite(c, &iter,
275                                         b->data->keys.seq,
276                                         BTREE_INSERT_NOWAIT|
277                                         BTREE_INSERT_GC_LOCK_HELD);
278
279                 bch2_btree_iter_cond_resched(&iter);
280         }
281         ret = bch2_btree_iter_unlock(&iter);
282         if (ret)
283                 return ret;
284
285         mutex_lock(&c->btree_root_lock);
286
287         b = c->btree_roots[btree_id].b;
288         if (!btree_node_fake(b))
289                 bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
290         gc_pos_set(c, gc_pos_btree_root(b->btree_id));
291
292         mutex_unlock(&c->btree_root_lock);
293         return 0;
294 }
295
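/* Mark a range of sectors as the given type of metadata, bucket by bucket: */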
296 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
297                                   u64 start, u64 end,
298                                   enum bch_data_type type,
299                                   unsigned flags)
300 {
301         u64 b = sector_to_bucket(ca, start);
302
303         do {
304                 unsigned sectors =
305                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
306
307                 bch2_mark_metadata_bucket(c, ca, b, type, sectors,
308                                           gc_phase(GC_PHASE_SB), flags);
309                 b++;
310                 start += sectors;
311         } while (start < end);
312 }
313
314 void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
315                               unsigned flags)
316 {
317         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
318         unsigned i;
319         u64 b;
320
321         lockdep_assert_held(&c->sb_lock);
322
323         for (i = 0; i < layout->nr_superblocks; i++) {
324                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
325
326                 if (offset == BCH_SB_SECTOR)
327                         mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
328                                               BCH_DATA_SB, flags);
329
330                 mark_metadata_sectors(c, ca, offset,
331                                       offset + (1 << layout->sb_max_size_bits),
332                                       BCH_DATA_SB, flags);
333         }
334
335         spin_lock(&c->journal.lock);
336
337         for (i = 0; i < ca->journal.nr; i++) {
338                 b = ca->journal.buckets[i];
339                 bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
340                                           ca->mi.bucket_size,
341                                           gc_phase(GC_PHASE_SB), flags);
342         }
343
344         spin_unlock(&c->journal.lock);
345 }
346
347 static void bch2_mark_superblocks(struct bch_fs *c)
348 {
349         struct bch_dev *ca;
350         unsigned i;
351
352         mutex_lock(&c->sb_lock);
353         gc_pos_set(c, gc_phase(GC_PHASE_SB));
354
355         for_each_online_member(ca, c, i)
356                 bch2_mark_dev_superblock(c, ca,
357                                          BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
358                                          BCH_BUCKET_MARK_GC_LOCK_HELD);
359         mutex_unlock(&c->sb_lock);
360 }
361
362 /* Also see bch2_pending_btree_node_free_insert_done() */
363 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
364 {
365         struct gc_pos pos = { 0 };
366         struct bch_fs_usage stats = { 0 };
367         struct btree_update *as;
368         struct pending_btree_node_free *d;
369
370         mutex_lock(&c->btree_interior_update_lock);
371         gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
372
373         for_each_pending_btree_node_free(c, as, d)
374                 if (d->index_update_done)
375                         bch2_mark_key(c, bkey_i_to_s_c(&d->key),
376                                       c->opts.btree_node_size, true, pos,
377                                       &stats, 0,
378                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
379                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
380         /*
381          * Don't apply stats - pending deletes aren't tracked in
382          * bch_alloc_stats:
383          */
384
385         mutex_unlock(&c->btree_interior_update_lock);
386 }
387
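/*
 * Buckets sitting on the allocator freelists and in open buckets aren't
 * referenced by any btree key, so walk the allocator's data structures and
 * mark them here:
 */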
388 static void bch2_mark_allocator_buckets(struct bch_fs *c)
389 {
390         struct bch_dev *ca;
391         struct open_bucket *ob;
392         size_t i, j, iter;
393         unsigned ci;
394
395         spin_lock(&c->freelist_lock);
396         gc_pos_set(c, gc_pos_alloc(c, NULL));
397
398         for_each_member_device(ca, c, ci) {
399                 fifo_for_each_entry(i, &ca->free_inc, iter)
400                         bch2_mark_alloc_bucket(c, ca, i, true,
401                                                gc_pos_alloc(c, NULL),
402                                                BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
403                                                BCH_BUCKET_MARK_GC_LOCK_HELD);
404
405
406
407                 for (j = 0; j < RESERVE_NR; j++)
408                         fifo_for_each_entry(i, &ca->free[j], iter)
409                                 bch2_mark_alloc_bucket(c, ca, i, true,
410                                                        gc_pos_alloc(c, NULL),
411                                                        BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
412                                                        BCH_BUCKET_MARK_GC_LOCK_HELD);
413         }
414
415         spin_unlock(&c->freelist_lock);
416
417         for (ob = c->open_buckets;
418              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
419              ob++) {
420                 spin_lock(&ob->lock);
421                 if (ob->valid) {
422                         gc_pos_set(c, gc_pos_alloc(c, ob));
423                         ca = bch_dev_bkey_exists(c, ob->ptr.dev);
424                         bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
425                                                gc_pos_alloc(c, ob),
426                                                BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
427                                                BCH_BUCKET_MARK_GC_LOCK_HELD);
428                 }
429                 spin_unlock(&ob->lock);
430         }
431 }
432
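/*
 * Stash the current usage statistics and zero out bucket marks, so that the
 * mark and sweep pass can recompute them from scratch:
 */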
433 static void bch2_gc_start(struct bch_fs *c)
434 {
435         struct bch_dev *ca;
436         struct bucket_array *buckets;
437         struct bucket_mark new;
438         unsigned i;
439         size_t b;
440         int cpu;
441
442         lg_global_lock(&c->usage_lock);
443
444         /*
445          * Indicates to buckets code that gc is now in progress - done under
446          * usage_lock to avoid racing with bch2_mark_key():
447          */
448         __gc_pos_set(c, GC_POS_MIN);
449
450         /* Save a copy of the existing bucket stats while we recompute them: */
451         for_each_member_device(ca, c, i) {
452                 ca->usage_cached = __bch2_dev_usage_read(ca);
453                 for_each_possible_cpu(cpu) {
454                         struct bch_dev_usage *p =
455                                 per_cpu_ptr(ca->usage_percpu, cpu);
456                         memset(p, 0, sizeof(*p));
457                 }
458         }
459
460         c->usage_cached = __bch2_fs_usage_read(c);
461         for_each_possible_cpu(cpu) {
462                 struct bch_fs_usage *p =
463                         per_cpu_ptr(c->usage_percpu, cpu);
464
465                 memset(p->s, 0, sizeof(p->s));
466         }
467
468         lg_global_unlock(&c->usage_lock);
469
470         /* Clear bucket marks: */
471         for_each_member_device(ca, c, i) {
472                 down_read(&ca->bucket_lock);
473                 buckets = bucket_array(ca);
474
475                 for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
476                         bucket_cmpxchg(buckets->b + b, new, ({
477                                 new.owned_by_allocator  = 0;
478                                 new.data_type           = 0;
479                                 new.cached_sectors      = 0;
480                                 new.dirty_sectors       = 0;
481                         }));
482                         ca->oldest_gens[b] = new.gen;
483                 }
484                 up_read(&ca->bucket_lock);
485         }
486 }
487
488 /**
489  * bch2_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
490  */
491 void bch2_gc(struct bch_fs *c)
492 {
493         struct bch_dev *ca;
494         u64 start_time = local_clock();
495         unsigned i;
496
497         /*
498          * Walk _all_ references to buckets, and recompute them:
499          *
500          * Order matters here:
501          *  - Concurrent GC relies on the fact that we have a total ordering for
502          *    everything that GC walks - see gc_will_visit_node(),
503          *    gc_will_visit_root()
504          *
505          *  - also, references move around in the course of index updates and
506          *    various other crap: everything needs to agree on the ordering
507          *    references are allowed to move around in - e.g., we're allowed to
508          *    start with a reference owned by an open_bucket (the allocator) and
509          *    move it to the btree, but not the reverse.
510          *
511          *    This is necessary to ensure that gc doesn't miss references that
512          *    move around - if references move backwards in the ordering GC
513          *    uses, GC could skip past them
514          *    uses, GC could skip past them.
515         trace_gc_start(c);
516
517         /*
518          * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
519          * gc_lock if sectors_available goes to 0:
520          */
521         bch2_recalc_sectors_available(c);
522
523         down_write(&c->gc_lock);
524         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
525                 goto out;
526
527         bch2_gc_start(c);
528
529         /* Walk btree: */
530         while (c->gc_pos.phase < (int) BTREE_ID_NR) {
531                 int ret = c->btree_roots[c->gc_pos.phase].b
532                         ? bch2_gc_btree(c, (int) c->gc_pos.phase)
533                         : 0;
534
535                 if (ret) {
536                         bch_err(c, "btree gc failed: %d", ret);
537                         set_bit(BCH_FS_GC_FAILURE, &c->flags);
538                         goto out;
539                 }
540
541                 gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
542         }
543
544         bch2_mark_superblocks(c);
545         bch2_mark_pending_btree_node_frees(c);
546         bch2_mark_allocator_buckets(c);
547
548         for_each_member_device(ca, c, i)
549                 atomic_long_set(&ca->saturated_count, 0);
550
551         /* Indicates that gc is no longer in progress: */
552         gc_pos_set(c, gc_phase(GC_PHASE_DONE));
553         c->gc_count++;
554 out:
555         up_write(&c->gc_lock);
556         trace_gc_end(c);
557         bch2_time_stats_update(&c->btree_gc_time, start_time);
558
559         /*
560          * Wake up the allocator in case it was waiting for buckets
561          * because it wasn't able to increment bucket gens
562          */
563         for_each_member_device(ca, c, i)
564                 bch2_wake_allocator(ca);
565
566         /*
567          * At startup, allocations can happen directly instead of via the
568          * allocator thread - issue wakeup in case they blocked on gc_lock:
569          */
570         closure_wake_up(&c->freelist_wait);
571 }
572
573 /* Btree coalescing */
574
575 static void recalc_packed_keys(struct btree *b)
576 {
577         struct bkey_packed *k;
578
579         memset(&b->nr, 0, sizeof(b->nr));
580
581         BUG_ON(b->nsets != 1);
582
583         for (k =  btree_bkey_first(b, b->set);
584              k != btree_bkey_last(b, b->set);
585              k = bkey_next(k))
586                 btree_keys_account_key_add(&b->nr, 0, k);
587 }
588
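/*
 * Attempt to repack the live keys from up to GC_MERGE_NODES adjacent sibling
 * nodes into one fewer node. Bails out early if the keys can't fit in one
 * fewer node, the new format doesn't pack, the keylist allocation fails, or
 * no reserve is available.
 */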
589 static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
590                                 struct btree *old_nodes[GC_MERGE_NODES])
591 {
592         struct btree *parent = btree_node_parent(iter, old_nodes[0]);
593         unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
594         unsigned blocks = btree_blocks(c) * 2 / 3;
595         struct btree *new_nodes[GC_MERGE_NODES];
596         struct btree_update *as;
597         struct keylist keylist;
598         struct bkey_format_state format_state;
599         struct bkey_format new_format;
600
601         memset(new_nodes, 0, sizeof(new_nodes));
602         bch2_keylist_init(&keylist, NULL);
603
604         /* Count keys that are not deleted */
605         for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
606                 u64s += old_nodes[i]->nr.live_u64s;
607
608         nr_old_nodes = nr_new_nodes = i;
609
610         /* Check if all keys in @old_nodes could fit in one fewer node */
611         if (nr_old_nodes <= 1 ||
612             __vstruct_blocks(struct btree_node, c->block_bits,
613                              DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
614                 return;
615
616         /* Find a format that all keys in @old_nodes can pack into */
617         bch2_bkey_format_init(&format_state);
618
619         for (i = 0; i < nr_old_nodes; i++)
620                 __bch2_btree_calc_format(&format_state, old_nodes[i]);
621
622         new_format = bch2_bkey_format_done(&format_state);
623
624         /* Check if repacking would make any nodes too big to fit */
625         for (i = 0; i < nr_old_nodes; i++)
626                 if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
627                         trace_btree_gc_coalesce_fail(c,
628                                         BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
629                         return;
630                 }
631
632         if (bch2_keylist_realloc(&keylist, NULL, 0,
633                         (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
634                 trace_btree_gc_coalesce_fail(c,
635                                 BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
636                 return;
637         }
638
639         as = bch2_btree_update_start(c, iter->btree_id,
640                         btree_update_reserve_required(c, parent) + nr_old_nodes,
641                         BTREE_INSERT_NOFAIL|
642                         BTREE_INSERT_USE_RESERVE,
643                         NULL);
644         if (IS_ERR(as)) {
645                 trace_btree_gc_coalesce_fail(c,
646                                 BTREE_GC_COALESCE_FAIL_RESERVE_GET);
647                 bch2_keylist_free(&keylist, NULL);
648                 return;
649         }
650
651         trace_btree_gc_coalesce(c, old_nodes[0]);
652
653         for (i = 0; i < nr_old_nodes; i++)
654                 bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
655
656         /* Repack everything with @new_format and sort down to one bset */
657         for (i = 0; i < nr_old_nodes; i++)
658                 new_nodes[i] =
659                         __bch2_btree_node_alloc_replacement(as, old_nodes[i],
660                                                             new_format);
661
662         /*
663          * Conceptually we concatenate the nodes together and slice them
664          * up at different boundaries.
665          */
666         for (i = nr_new_nodes - 1; i > 0; --i) {
667                 struct btree *n1 = new_nodes[i];
668                 struct btree *n2 = new_nodes[i - 1];
669
670                 struct bset *s1 = btree_bset_first(n1);
671                 struct bset *s2 = btree_bset_first(n2);
672                 struct bkey_packed *k, *last = NULL;
673
674                 /* Calculate how many keys from @n2 we could fit inside @n1 */
675                 u64s = 0;
676
677                 for (k = s2->start;
678                      k < vstruct_last(s2) &&
679                      vstruct_blocks_plus(n1->data, c->block_bits,
680                                          u64s + k->u64s) <= blocks;
681                      k = bkey_next(k)) {
682                         last = k;
683                         u64s += k->u64s;
684                 }
685
686                 if (u64s == le16_to_cpu(s2->u64s)) {
687                         /* n2 fits entirely in n1 */
688                         n1->key.k.p = n1->data->max_key = n2->data->max_key;
689
690                         memcpy_u64s(vstruct_last(s1),
691                                     s2->start,
692                                     le16_to_cpu(s2->u64s));
693                         le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
694
695                         set_btree_bset_end(n1, n1->set);
696
697                         six_unlock_write(&n2->lock);
698                         bch2_btree_node_free_never_inserted(c, n2);
699                         six_unlock_intent(&n2->lock);
700
701                         memmove(new_nodes + i - 1,
702                                 new_nodes + i,
703                                 sizeof(new_nodes[0]) * (nr_new_nodes - i));
704                         new_nodes[--nr_new_nodes] = NULL;
705                 } else if (u64s) {
706                         /* move part of n2 into n1 */
707                         n1->key.k.p = n1->data->max_key =
708                                 bkey_unpack_pos(n1, last);
709
710                         n2->data->min_key =
711                                 btree_type_successor(iter->btree_id,
712                                                      n1->data->max_key);
713
714                         memcpy_u64s(vstruct_last(s1),
715                                     s2->start, u64s);
716                         le16_add_cpu(&s1->u64s, u64s);
717
718                         memmove(s2->start,
719                                 vstruct_idx(s2, u64s),
720                                 (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
721                         s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
722
723                         set_btree_bset_end(n1, n1->set);
724                         set_btree_bset_end(n2, n2->set);
725                 }
726         }
727
728         for (i = 0; i < nr_new_nodes; i++) {
729                 struct btree *n = new_nodes[i];
730
731                 recalc_packed_keys(n);
732                 btree_node_reset_sib_u64s(n);
733
734                 bch2_btree_build_aux_trees(n);
735                 six_unlock_write(&n->lock);
736
737                 bch2_btree_node_write(c, n, SIX_LOCK_intent);
738         }
739
740         /*
741          * The keys for the old nodes get deleted. We don't want to insert keys
742          * that compare equal to the keys for the new nodes we'll also be
743          * inserting - we can't because keys on a keylist must be strictly
744          * greater than the previous keys, and we also don't need to since the
745          * key for the new node will serve the same purpose (overwriting the key
746          * for the old node).
747          */
748         for (i = 0; i < nr_old_nodes; i++) {
749                 struct bkey_i delete;
750                 unsigned j;
751
752                 for (j = 0; j < nr_new_nodes; j++)
753                         if (!bkey_cmp(old_nodes[i]->key.k.p,
754                                       new_nodes[j]->key.k.p))
755                                 goto next;
756
757                 bkey_init(&delete.k);
758                 delete.k.p = old_nodes[i]->key.k.p;
759                 bch2_keylist_add_in_order(&keylist, &delete);
760 next:
761         i = i; /* no-op: a label must be followed by a statement */
762         }
763
764         /*
765          * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
766          * does the lookup once and thus expects the keys to be in sorted order,
767          * so we have to make sure the new keys are correctly ordered with
768          * respect to the deleted keys added in the previous loop.
769          */
770         for (i = 0; i < nr_new_nodes; i++)
771                 bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
772
773         /* Insert the newly coalesced nodes */
774         bch2_btree_insert_node(as, parent, iter, &keylist);
775
776         BUG_ON(!bch2_keylist_empty(&keylist));
777
778         BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
779
780         BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));
781
782         for (i = 0; i < nr_new_nodes; i++)
783                 bch2_btree_open_bucket_put(c, new_nodes[i]);
784
785         /* Free the old nodes and update our sliding window */
786         for (i = 0; i < nr_old_nodes; i++) {
787                 bch2_btree_node_free_inmem(c, old_nodes[i], iter);
788                 six_unlock_intent(&old_nodes[i]->lock);
789
790                 /*
791                  * The index update might have triggered a split, in which
792                  * case the new nodes we just created might no longer be
793                  * sibling nodes - don't add them to the sliding window
794                  * (except the first):
795                  */
796                 if (!i) {
797                         old_nodes[i] = new_nodes[i];
798                 } else {
799                         old_nodes[i] = NULL;
800                         if (new_nodes[i])
801                                 six_unlock_intent(&new_nodes[i]->lock);
802                 }
803         }
804
805         bch2_btree_update_done(as);
806         bch2_keylist_free(&keylist, NULL);
807 }
808
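/*
 * Walk a btree keeping a sliding window of up to GC_MERGE_NODES locked
 * sibling nodes, attempting to coalesce each window as we go:
 */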
809 static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
810 {
811         struct btree_iter iter;
812         struct btree *b;
813         unsigned i;
814
815         /* Sliding window of adjacent btree nodes */
816         struct btree *merge[GC_MERGE_NODES];
817         u32 lock_seq[GC_MERGE_NODES];
818
819         /*
820          * XXX: We don't have a good way of positively matching on sibling nodes
821          * that have the same parent - this code works by handling the cases
822          * where they might not have the same parent, and is thus fragile. Ugh.
823          *
824          * Perhaps redo this to use multiple linked iterators?
825          */
826         memset(merge, 0, sizeof(merge));
827
828         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
829                               BTREE_MAX_DEPTH, 0,
830                               BTREE_ITER_PREFETCH, b) {
831                 memmove(merge + 1, merge,
832                         sizeof(merge) - sizeof(merge[0]));
833                 memmove(lock_seq + 1, lock_seq,
834                         sizeof(lock_seq) - sizeof(lock_seq[0]));
835
836                 merge[0] = b;
837
838                 for (i = 1; i < GC_MERGE_NODES; i++) {
839                         if (!merge[i] ||
840                             !six_relock_intent(&merge[i]->lock, lock_seq[i]))
841                                 break;
842
843                         if (merge[i]->level != merge[0]->level) {
844                                 six_unlock_intent(&merge[i]->lock);
845                                 break;
846                         }
847                 }
848                 memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
849
850                 bch2_coalesce_nodes(c, &iter, merge);
851
852                 for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
853                         lock_seq[i] = merge[i]->lock.state.seq;
854                         six_unlock_intent(&merge[i]->lock);
855                 }
856
857                 lock_seq[0] = merge[0]->lock.state.seq;
858
859                 if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
860                         bch2_btree_iter_unlock(&iter);
861                         return -ESHUTDOWN;
862                 }
863
864                 bch2_btree_iter_cond_resched(&iter);
865
866                 /*
867                  * If the parent node wasn't relocked, it might have been split
868                  * and the nodes in our sliding window might not have the same
869                  * parent anymore - blow away the sliding window:
870                  */
871                 if (btree_iter_node(&iter, iter.level + 1) &&
872                     !btree_node_intent_locked(&iter, iter.level + 1))
873                         memset(merge + 1, 0,
874                                (GC_MERGE_NODES - 1) * sizeof(merge[0]));
875         }
876         return bch2_btree_iter_unlock(&iter);
877 }
878
879 /**
880  * bch2_coalesce - coalesce adjacent nodes with low occupancy
881  */
882 void bch2_coalesce(struct bch_fs *c)
883 {
884         enum btree_id id;
885
886         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
887                 return;
888
889         down_read(&c->gc_lock);
890         trace_gc_coalesce_start(c);
891
892         for (id = 0; id < BTREE_ID_NR; id++) {
893                 int ret = c->btree_roots[id].b
894                         ? bch2_coalesce_btree(c, id)
895                         : 0;
896
897                 if (ret) {
898                         if (ret != -ESHUTDOWN)
899                                 bch_err(c, "btree coalescing failed: %d", ret);
900                         set_bit(BCH_FS_GC_FAILURE, &c->flags);
901                         return;
902                 }
903         }
904
905         trace_gc_coalesce_end(c);
906         up_read(&c->gc_lock);
907 }
908
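/*
 * GC thread: when periodic GC is enabled, it runs after roughly 1/16th of the
 * filesystem's capacity has been written (measured via the write IO clock);
 * it can also be kicked explicitly via c->kick_gc.
 */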
909 static int bch2_gc_thread(void *arg)
910 {
911         struct bch_fs *c = arg;
912         struct io_clock *clock = &c->io_clock[WRITE];
913         unsigned long last = atomic_long_read(&clock->now);
914         unsigned last_kick = atomic_read(&c->kick_gc);
915
916         set_freezable();
917
918         while (1) {
919                 while (1) {
920                         set_current_state(TASK_INTERRUPTIBLE);
921
922                         if (kthread_should_stop()) {
923                                 __set_current_state(TASK_RUNNING);
924                                 return 0;
925                         }
926
927                         if (atomic_read(&c->kick_gc) != last_kick)
928                                 break;
929
930                         if (c->btree_gc_periodic) {
931                                 unsigned long next = last + c->capacity / 16;
932
933                                 if (atomic_long_read(&clock->now) >= next)
934                                         break;
935
936                                 bch2_io_clock_schedule_timeout(clock, next);
937                         } else {
938                                 schedule();
939                         }
940
941                         try_to_freeze();
942                 }
943                 __set_current_state(TASK_RUNNING);
944
945                 last = atomic_long_read(&clock->now);
946                 last_kick = atomic_read(&c->kick_gc);
947
948                 bch2_gc(c);
949
950                 debug_check_no_locks_held();
951         }
952
953         return 0;
954 }
955
956 void bch2_gc_thread_stop(struct bch_fs *c)
957 {
958         set_bit(BCH_FS_GC_STOPPING, &c->flags);
959
960         if (c->gc_thread)
961                 kthread_stop(c->gc_thread);
962
963         c->gc_thread = NULL;
964         clear_bit(BCH_FS_GC_STOPPING, &c->flags);
965 }
966
967 int bch2_gc_thread_start(struct bch_fs *c)
968 {
969         struct task_struct *p;
970
971         BUG_ON(c->gc_thread);
972
973         p = kthread_create(bch2_gc_thread, c, "bcache_gc");
974         if (IS_ERR(p))
975                 return PTR_ERR(p);
976
977         c->gc_thread = p;
978         wake_up_process(c->gc_thread);
979         return 0;
980 }
981
982 /* Initial GC computes bucket marks during startup */
983
984 static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
985 {
986         struct btree_iter iter;
987         struct btree *b;
988         struct range_checks r;
989         int ret = 0;
990
991         btree_node_range_checks_init(&r, 0);
992
993         if (!c->btree_roots[id].b)
994                 return 0;
995
996         b = c->btree_roots[id].b;
997         if (!btree_node_fake(b))
998                 ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
999                                                   bkey_i_to_s_c(&b->key));
1000         if (ret)
1001                 return ret;
1002
1003         /*
1004          * We have to hit every btree node before starting journal replay, in
1005          * order for the journal seq blacklist machinery to work:
1006          */
1007         for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
1008                 btree_node_range_checks(c, b, &r);
1009
1010                 if (btree_node_has_ptrs(b)) {
1011                         struct btree_node_iter node_iter;
1012                         struct bkey unpacked;
1013                         struct bkey_s_c k;
1014
1015                         for_each_btree_node_key_unpack(b, k, &node_iter,
1016                                                        btree_node_is_extents(b),
1017                                                        &unpacked) {
1018                                 ret = bch2_btree_mark_key_initial(c,
1019                                                         btree_node_type(b), k);
1020                                 if (ret)
1021                                         goto err;
1022                         }
1023                 }
1024
1025                 bch2_btree_iter_cond_resched(&iter);
1026         }
1027 err:
1028         return bch2_btree_iter_unlock(&iter) ?: ret;
1029 }
1030
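/*
 * Initial GC walks every btree (plus the journal and superblocks) to compute
 * bucket marks before journal replay, restarting the pass if any bucket gens
 * had to be fixed up:
 */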
1031 static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1032 {
1033         unsigned iter = 0;
1034         enum btree_id id;
1035         int ret;
1036
1037         mutex_lock(&c->sb_lock);
1038         if (!bch2_sb_get_replicas(c->disk_sb)) {
1039                 if (BCH_SB_INITIALIZED(c->disk_sb))
1040                         bch_info(c, "building replicas info");
1041                 set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
1042         }
1043         mutex_unlock(&c->sb_lock);
1044 again:
1045         bch2_gc_start(c);
1046
1047         for (id = 0; id < BTREE_ID_NR; id++) {
1048                 ret = bch2_initial_gc_btree(c, id);
1049                 if (ret)
1050                         return ret;
1051         }
1052
1053         ret = bch2_journal_mark(c, journal);
1054         if (ret)
1055                 return ret;
1056
1057         if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
1058                 if (iter++ > 2) {
1059                         bch_info(c, "Unable to fix bucket gens, looping");
1060                         return -EINVAL;
1061                 }
1062
1063                 bch_info(c, "Fixed gens, restarting initial mark and sweep:");
1064                 clear_bit(BCH_FS_FIXED_GENS, &c->flags);
1065                 goto again;
1066         }
1067
1068         /*
1069          * Skip past versions that might have been used (as nonces) but
1070          * hadn't had their pointers written:
1071          */
1072         if (c->sb.encryption_type)
1073                 atomic64_add(1 << 16, &c->key_version);
1074
1075         bch2_mark_superblocks(c);
1076
1077         gc_pos_set(c, gc_phase(GC_PHASE_DONE));
1078         set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
1079
1080         return 0;
1081 }
1082
1083 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1084 {
1085         int ret;
1086
1087         down_write(&c->gc_lock);
1088         ret = __bch2_initial_gc(c, journal);
1089         up_write(&c->gc_lock);
1090
1091         return ret;
1092 }