1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  * Copyright (C) 2014 Datera Inc.
4  */
5
6 #include "bcachefs.h"
7 #include "alloc.h"
8 #include "bkey_methods.h"
9 #include "btree_locking.h"
10 #include "btree_update_interior.h"
11 #include "btree_io.h"
12 #include "btree_gc.h"
13 #include "buckets.h"
14 #include "clock.h"
15 #include "debug.h"
16 #include "error.h"
17 #include "extents.h"
18 #include "journal.h"
19 #include "keylist.h"
20 #include "move.h"
21 #include "super-io.h"
22
23 #include <linux/slab.h>
24 #include <linux/bitops.h>
25 #include <linux/freezer.h>
26 #include <linux/kthread.h>
27 #include <linux/preempt.h>
28 #include <linux/rcupdate.h>
29 #include <trace/events/bcachefs.h>
30
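/*
 * Expected key ranges at each level of the btree, maintained as we walk it in
 * order: btree_node_range_checks() uses this to verify that sibling nodes
 * cover contiguous key ranges and that interior node min/max keys match those
 * of their children.
 */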
31 struct range_checks {
32         struct range_level {
33                 struct bpos     min;
34                 struct bpos     max;
35         }                       l[BTREE_MAX_DEPTH];
36         unsigned                depth;
37 };
38
39 static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
40 {
41         unsigned i;
42
43         for (i = 0; i < BTREE_MAX_DEPTH; i++)
44                 r->l[i].min = r->l[i].max = POS_MIN;
45         r->depth = depth;
46 }
47
48 static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
49                                     struct range_checks *r)
50 {
51         struct range_level *l = &r->l[b->level];
52
53         struct bpos expected_min = bkey_cmp(l->min, l->max)
54                 ? btree_type_successor(b->btree_id, l->max)
55                 : l->max;
56
57         bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
58                 "btree node has incorrect min key: %llu:%llu != %llu:%llu",
59                 b->data->min_key.inode,
60                 b->data->min_key.offset,
61                 expected_min.inode,
62                 expected_min.offset);
63
64         l->max = b->data->max_key;
65
66         if (b->level > r->depth) {
67                 l = &r->l[b->level - 1];
68
69                 bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
70                         "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
71                         b->data->min_key.inode,
72                         b->data->min_key.offset,
73                         l->min.inode,
74                         l->min.offset);
75
76                 bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
77                         "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
78                         b->data->max_key.inode,
79                         b->data->max_key.offset,
80                         l->max.inode,
81                         l->max.offset);
82
83                 if (bkey_cmp(b->data->max_key, POS_MAX))
84                         l->min = l->max =
85                                 btree_type_successor(b->btree_id,
86                                                      b->data->max_key);
87         }
88 }
89
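/*
 * For each pointer in @k, pull the owning device's oldest_gens[] for that
 * bucket back to the pointer's gen; returns the staleness of the most stale
 * pointer, which callers use to decide whether a node is worth rewriting:
 */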
90 u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
91 {
92         const struct bch_extent_ptr *ptr;
93         u8 max_stale = 0;
94
95         if (bkey_extent_is_data(k.k)) {
96                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
97
98                 extent_for_each_ptr(e, ptr) {
99                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
100                         size_t b = PTR_BUCKET_NR(ca, ptr);
101
102                         if (gen_after(ca->oldest_gens[b], ptr->gen))
103                                 ca->oldest_gens[b] = ptr->gen;
104
105                         max_stale = max(max_stale, ptr_stale(ca, ptr));
106                 }
107         }
108
109         return max_stale;
110 }
111
112 /*
113  * For runtime mark and sweep:
114  */
115 static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
116                            struct bkey_s_c k, unsigned flags)
117 {
118         struct gc_pos pos = { 0 };
119         struct bch_fs_usage *stats;
120         u8 ret = 0;
121
122         preempt_disable();
123         stats = this_cpu_ptr(c->usage_percpu);
124         switch (type) {
125         case BKEY_TYPE_BTREE:
126                 bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats,
127                               0, flags|
128                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
129                               BCH_BUCKET_MARK_GC_LOCK_HELD);
130                 break;
131         case BKEY_TYPE_EXTENTS:
132                 bch2_mark_key(c, k, k.k->size, false, pos, stats,
133                               0, flags|
134                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
135                               BCH_BUCKET_MARK_GC_LOCK_HELD);
136                 ret = bch2_btree_key_recalc_oldest_gen(c, k);
137                 break;
138         default:
139                 BUG();
140         }
141         preempt_enable();
142
143         return ret;
144 }
145
146 int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
147                                 struct bkey_s_c k)
148 {
149         enum bch_data_type data_type = type == BKEY_TYPE_BTREE
150                 ? BCH_DATA_BTREE : BCH_DATA_USER;
151         int ret = 0;
152
153         if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
154             fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
155                         "superblock not marked as containing replicas (type %u)",
156                         data_type)) {
157                 ret = bch2_mark_bkey_replicas(c, data_type, k);
158                 if (ret)
159                         return ret;
160         }
161
162         switch (k.k->type) {
163         case BCH_EXTENT:
164         case BCH_EXTENT_CACHED: {
165                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
166                 const struct bch_extent_ptr *ptr;
167
168                 extent_for_each_ptr(e, ptr) {
169                         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
170                         size_t b = PTR_BUCKET_NR(ca, ptr);
171                         struct bucket *g = PTR_BUCKET(ca, ptr);
172
173                         if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
174                                         "found ptr with missing gen in alloc btree,\n"
175                                         "type %s gen %u",
176                                         bch2_data_types[data_type],
177                                         ptr->gen)) {
178                                 g->_mark.gen = ptr->gen;
179                                 g->_mark.gen_valid = 1;
180                                 set_bit(b, ca->buckets_dirty);
181                         }
182
183                         if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
184                                         "%s ptr gen in the future: %u > %u",
185                                         bch2_data_types[data_type],
186                                         ptr->gen, g->mark.gen)) {
187                                 g->_mark.gen = ptr->gen;
188                                 g->_mark.gen_valid = 1;
189                                 set_bit(b, ca->buckets_dirty);
190                                 set_bit(BCH_FS_FIXED_GENS, &c->flags);
191                         }
192
193                 }
194                 break;
195         }
196         }
197
198         atomic64_set(&c->key_version,
199                      max_t(u64, k.k->version.lo,
200                            atomic64_read(&c->key_version)));
201
202         bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
203 fsck_err:
204         return ret;
205 }
206
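/*
 * Mark every key in @b, returning the maximum pointer staleness seen so the
 * caller can decide whether to rewrite the node:
 */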
207 static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
208 {
209         enum bkey_type type = btree_node_type(b);
210         struct btree_node_iter iter;
211         struct bkey unpacked;
212         struct bkey_s_c k;
213         u8 stale = 0;
214
215         if (btree_node_has_ptrs(b))
216                 for_each_btree_node_key_unpack(b, k, &iter,
217                                                btree_node_is_extents(b),
218                                                &unpacked) {
219                         bch2_bkey_debugcheck(c, b, k);
220                         stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
221                 }
222
223         return stale;
224 }
225
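/*
 * c->gc_pos tracks how far mark and sweep has progressed; it's updated under a
 * seqcount so the buckets code can reliably tell whether GC has visited a
 * given position yet (see gc_will_visit_node(), gc_will_visit_root()):
 */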
226 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
227 {
228         write_seqcount_begin(&c->gc_pos_lock);
229         c->gc_pos = new_pos;
230         write_seqcount_end(&c->gc_pos_lock);
231 }
232
233 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
234 {
235         BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
236         __gc_pos_set(c, new_pos);
237 }
238
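/*
 * Walk one btree in order, marking every key and advancing gc_pos as we go;
 * nodes with too many stale pointers get rewritten, and the root's own key is
 * marked last:
 */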
239 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
240 {
241         struct btree_iter iter;
242         struct btree *b;
243         struct range_checks r;
244         unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
245         unsigned max_stale;
246         int ret = 0;
247
248         /*
249          * if expensive_debug_checks is on, run range_checks on all leaf nodes:
250          */
251         if (expensive_debug_checks(c))
252                 depth = 0;
253
254         btree_node_range_checks_init(&r, depth);
255
256         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
257                               0, depth, BTREE_ITER_PREFETCH, b) {
258                 btree_node_range_checks(c, b, &r);
259
260                 bch2_verify_btree_nr_keys(b);
261
262                 max_stale = btree_gc_mark_node(c, b);
263
264                 gc_pos_set(c, gc_pos_btree_node(b));
265
266                 if (max_stale > 32)
267                         bch2_btree_node_rewrite(c, &iter,
268                                         b->data->keys.seq,
269                                         BTREE_INSERT_USE_RESERVE|
270                                         BTREE_INSERT_GC_LOCK_HELD);
271                 else if (!btree_gc_rewrite_disabled(c) &&
272                          (btree_gc_always_rewrite(c) || max_stale > 16))
273                         bch2_btree_node_rewrite(c, &iter,
274                                         b->data->keys.seq,
275                                         BTREE_INSERT_NOWAIT|
276                                         BTREE_INSERT_GC_LOCK_HELD);
277
278                 bch2_btree_iter_cond_resched(&iter);
279         }
280         ret = bch2_btree_iter_unlock(&iter);
281         if (ret)
282                 return ret;
283
284         mutex_lock(&c->btree_root_lock);
285
286         b = c->btree_roots[btree_id].b;
287         if (!btree_node_fake(b))
288                 bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
289         gc_pos_set(c, gc_pos_btree_root(b->btree_id));
290
291         mutex_unlock(&c->btree_root_lock);
292         return 0;
293 }
294
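/* Mark a range of sectors on @ca as @type, one bucket at a time: */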
295 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
296                                   u64 start, u64 end,
297                                   enum bch_data_type type,
298                                   unsigned flags)
299 {
300         u64 b = sector_to_bucket(ca, start);
301
302         do {
303                 unsigned sectors =
304                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
305
306                 bch2_mark_metadata_bucket(c, ca, b, type, sectors,
307                                           gc_phase(GC_PHASE_SB), flags);
308                 b++;
309                 start += sectors;
310         } while (start < end);
311 }
312
313 void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
314                               unsigned flags)
315 {
316         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
317         unsigned i;
318         u64 b;
319
320         lockdep_assert_held(&c->sb_lock);
321
322         for (i = 0; i < layout->nr_superblocks; i++) {
323                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
324
325                 if (offset == BCH_SB_SECTOR)
326                         mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
327                                               BCH_DATA_SB, flags);
328
329                 mark_metadata_sectors(c, ca, offset,
330                                       offset + (1 << layout->sb_max_size_bits),
331                                       BCH_DATA_SB, flags);
332         }
333
334         spin_lock(&c->journal.lock);
335
336         for (i = 0; i < ca->journal.nr; i++) {
337                 b = ca->journal.buckets[i];
338                 bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
339                                           ca->mi.bucket_size,
340                                           gc_phase(GC_PHASE_SB), flags);
341         }
342
343         spin_unlock(&c->journal.lock);
344 }
345
346 static void bch2_mark_superblocks(struct bch_fs *c)
347 {
348         struct bch_dev *ca;
349         unsigned i;
350
351         mutex_lock(&c->sb_lock);
352         gc_pos_set(c, gc_phase(GC_PHASE_SB));
353
354         for_each_online_member(ca, c, i)
355                 bch2_mark_dev_superblock(c, ca,
356                                          BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
357                                          BCH_BUCKET_MARK_GC_LOCK_HELD);
358         mutex_unlock(&c->sb_lock);
359 }
360
361 /* Also see bch2_pending_btree_node_free_insert_done() */
362 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
363 {
364         struct gc_pos pos = { 0 };
365         struct bch_fs_usage stats = { 0 };
366         struct btree_update *as;
367         struct pending_btree_node_free *d;
368
369         mutex_lock(&c->btree_interior_update_lock);
370         gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
371
372         for_each_pending_btree_node_free(c, as, d)
373                 if (d->index_update_done)
374                         bch2_mark_key(c, bkey_i_to_s_c(&d->key),
375                                       c->opts.btree_node_size, true, pos,
376                                       &stats, 0,
377                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
378                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
379         /*
380          * Don't apply stats - pending deletes aren't tracked in
381          * bch_fs_usage:
382          */
383
384         mutex_unlock(&c->btree_interior_update_lock);
385 }
386
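/*
 * Buckets owned by the allocator - those on the free/free_inc freelists and in
 * open_buckets - aren't referenced by any btree keys, so mark them directly:
 */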
387 static void bch2_mark_allocator_buckets(struct bch_fs *c)
388 {
389         struct bch_dev *ca;
390         struct open_bucket *ob;
391         size_t i, j, iter;
392         unsigned ci;
393
394         spin_lock(&c->freelist_lock);
395         gc_pos_set(c, gc_pos_alloc(c, NULL));
396
397         for_each_member_device(ca, c, ci) {
398                 fifo_for_each_entry(i, &ca->free_inc, iter)
399                         bch2_mark_alloc_bucket(c, ca, i, true,
400                                                gc_pos_alloc(c, NULL),
401                                                BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
402                                                BCH_BUCKET_MARK_GC_LOCK_HELD);
403
406                 for (j = 0; j < RESERVE_NR; j++)
407                         fifo_for_each_entry(i, &ca->free[j], iter)
408                                 bch2_mark_alloc_bucket(c, ca, i, true,
409                                                        gc_pos_alloc(c, NULL),
410                                                        BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
411                                                        BCH_BUCKET_MARK_GC_LOCK_HELD);
412         }
413
414         spin_unlock(&c->freelist_lock);
415
416         for (ob = c->open_buckets;
417              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
418              ob++) {
419                 spin_lock(&ob->lock);
420                 if (ob->valid) {
421                         gc_pos_set(c, gc_pos_alloc(c, ob));
422                         ca = bch_dev_bkey_exists(c, ob->ptr.dev);
423                         bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
424                                                gc_pos_alloc(c, ob),
425                                                BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
426                                                BCH_BUCKET_MARK_GC_LOCK_HELD);
427                 }
428                 spin_unlock(&ob->lock);
429         }
430 }
431
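/*
 * Save the current usage counters and clear all bucket marks (but not bucket
 * gens), so that the mark and sweep pass can recompute them from scratch:
 */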
432 static void bch2_gc_start(struct bch_fs *c)
433 {
434         struct bch_dev *ca;
435         struct bucket_array *buckets;
436         struct bucket_mark new;
437         unsigned i;
438         size_t b;
439         int cpu;
440
441         lg_global_lock(&c->usage_lock);
442
443         /*
444          * Indicates to buckets code that gc is now in progress - done under
445          * usage_lock to avoid racing with bch2_mark_key():
446          */
447         __gc_pos_set(c, GC_POS_MIN);
448
449         /* Save a copy of the existing bucket stats while we recompute them: */
450         for_each_member_device(ca, c, i) {
451                 ca->usage_cached = __bch2_dev_usage_read(ca);
452                 for_each_possible_cpu(cpu) {
453                         struct bch_dev_usage *p =
454                                 per_cpu_ptr(ca->usage_percpu, cpu);
455                         memset(p, 0, sizeof(*p));
456                 }
457         }
458
459         c->usage_cached = __bch2_fs_usage_read(c);
460         for_each_possible_cpu(cpu) {
461                 struct bch_fs_usage *p =
462                         per_cpu_ptr(c->usage_percpu, cpu);
463
464                 memset(p->s, 0, sizeof(p->s));
465         }
466
467         lg_global_unlock(&c->usage_lock);
468
469         /* Clear bucket marks: */
470         for_each_member_device(ca, c, i) {
471                 down_read(&ca->bucket_lock);
472                 buckets = bucket_array(ca);
473
474                 for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
475                         bucket_cmpxchg(buckets->b + b, new, ({
476                                 new.owned_by_allocator  = 0;
477                                 new.data_type           = 0;
478                                 new.cached_sectors      = 0;
479                                 new.dirty_sectors       = 0;
480                         }));
481                         ca->oldest_gens[b] = new.gen;
482                 }
483                 up_read(&ca->bucket_lock);
484         }
485 }
486
487 /**
488  * bch2_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
489  */
490 void bch2_gc(struct bch_fs *c)
491 {
492         struct bch_dev *ca;
493         u64 start_time = local_clock();
494         unsigned i;
495
496         /*
497          * Walk _all_ references to buckets, and recompute them:
498          *
499          * Order matters here:
500          *  - Concurrent GC relies on the fact that we have a total ordering for
501          *    everything that GC walks - see  gc_will_visit_node(),
502          *    gc_will_visit_root()
503          *
504          *  - also, references move around in the course of index updates and
505          *    various other crap: everything needs to agree on the ordering
506          *    references are allowed to move around in - e.g., we're allowed to
507          *    start with a reference owned by an open_bucket (the allocator) and
508          *    move it to the btree, but not the reverse.
509          *
510          *    This is necessary to ensure that gc doesn't miss references that
511          *    move around - if references move backwards in the ordering GC
512          *    uses, GC could skip past them
513          */
514         trace_gc_start(c);
515
516         /*
517          * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
518          * gc_lock if sectors_available goes to 0:
519          */
520         bch2_recalc_sectors_available(c);
521
522         down_write(&c->gc_lock);
523         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
524                 goto out;
525
526         bch2_gc_start(c);
527
528         /* Walk btree: */
529         while (c->gc_pos.phase < (int) BTREE_ID_NR) {
530                 int ret = c->btree_roots[c->gc_pos.phase].b
531                         ? bch2_gc_btree(c, (int) c->gc_pos.phase)
532                         : 0;
533
534                 if (ret) {
535                         bch_err(c, "btree gc failed: %d", ret);
536                         set_bit(BCH_FS_GC_FAILURE, &c->flags);
537                         goto out;
538                 }
539
540                 gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
541         }
542
543         bch2_mark_superblocks(c);
544         bch2_mark_pending_btree_node_frees(c);
545         bch2_mark_allocator_buckets(c);
546
547         for_each_member_device(ca, c, i)
548                 atomic_long_set(&ca->saturated_count, 0);
549
550         /* Indicates that gc is no longer in progress: */
551         gc_pos_set(c, gc_phase(GC_PHASE_DONE));
552         c->gc_count++;
553 out:
554         up_write(&c->gc_lock);
555         trace_gc_end(c);
556         bch2_time_stats_update(&c->btree_gc_time, start_time);
557
558         /*
559          * Wake up the allocator in case it was waiting for buckets
560          * because it couldn't increment bucket gens:
561          */
562         for_each_member_device(ca, c, i)
563                 bch2_wake_allocator(ca);
564
565         /*
566          * At startup, allocations can happen directly instead of via the
567          * allocator thread - issue wakeup in case they blocked on gc_lock:
568          */
569         closure_wake_up(&c->freelist_wait);
570 }
571
572 /* Btree coalescing */
573
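/* Recompute b->nr from scratch after coalescing has moved keys around: */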
574 static void recalc_packed_keys(struct btree *b)
575 {
576         struct bkey_packed *k;
577
578         memset(&b->nr, 0, sizeof(b->nr));
579
580         BUG_ON(b->nsets != 1);
581
582         for (k =  btree_bkey_first(b, b->set);
583              k != btree_bkey_last(b, b->set);
584              k = bkey_next(k))
585                 btree_keys_account_key_add(&b->nr, 0, k);
586 }
587
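/*
 * Given a window of adjacent sibling nodes, try to repack their keys into one
 * fewer node: bail out if the keys wouldn't fit, if repacking with a common
 * format would make a node too big, or if a reserve can't be allocated;
 * otherwise shift keys between the replacement nodes and update the parent:
 */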
588 static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
589                                 struct btree *old_nodes[GC_MERGE_NODES])
590 {
591         struct btree *parent = btree_node_parent(iter, old_nodes[0]);
592         unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
593         unsigned blocks = btree_blocks(c) * 2 / 3;
594         struct btree *new_nodes[GC_MERGE_NODES];
595         struct btree_update *as;
596         struct keylist keylist;
597         struct bkey_format_state format_state;
598         struct bkey_format new_format;
599
600         memset(new_nodes, 0, sizeof(new_nodes));
601         bch2_keylist_init(&keylist, NULL);
602
603         /* Count keys that are not deleted */
604         for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
605                 u64s += old_nodes[i]->nr.live_u64s;
606
607         nr_old_nodes = nr_new_nodes = i;
608
609         /* Check if all keys in @old_nodes could fit in one fewer node */
610         if (nr_old_nodes <= 1 ||
611             __vstruct_blocks(struct btree_node, c->block_bits,
612                              DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
613                 return;
614
615         /* Find a format that all keys in @old_nodes can pack into */
616         bch2_bkey_format_init(&format_state);
617
618         for (i = 0; i < nr_old_nodes; i++)
619                 __bch2_btree_calc_format(&format_state, old_nodes[i]);
620
621         new_format = bch2_bkey_format_done(&format_state);
622
623         /* Check if repacking would make any nodes too big to fit */
624         for (i = 0; i < nr_old_nodes; i++)
625                 if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
626                         trace_btree_gc_coalesce_fail(c,
627                                         BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
628                         return;
629                 }
630
631         if (bch2_keylist_realloc(&keylist, NULL, 0,
632                         (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
633                 trace_btree_gc_coalesce_fail(c,
634                                 BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
635                 return;
636         }
637
638         as = bch2_btree_update_start(c, iter->btree_id,
639                         btree_update_reserve_required(c, parent) + nr_old_nodes,
640                         BTREE_INSERT_NOFAIL|
641                         BTREE_INSERT_USE_RESERVE,
642                         NULL);
643         if (IS_ERR(as)) {
644                 trace_btree_gc_coalesce_fail(c,
645                                 BTREE_GC_COALESCE_FAIL_RESERVE_GET);
646                 bch2_keylist_free(&keylist, NULL);
647                 return;
648         }
649
650         trace_btree_gc_coalesce(c, old_nodes[0]);
651
652         for (i = 0; i < nr_old_nodes; i++)
653                 bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
654
655         /* Repack everything with @new_format and sort down to one bset */
656         for (i = 0; i < nr_old_nodes; i++)
657                 new_nodes[i] =
658                         __bch2_btree_node_alloc_replacement(as, old_nodes[i],
659                                                             new_format);
660
661         /*
662          * Conceptually we concatenate the nodes together and slice them
663          * up at different boundaries.
664          */
665         for (i = nr_new_nodes - 1; i > 0; --i) {
666                 struct btree *n1 = new_nodes[i];
667                 struct btree *n2 = new_nodes[i - 1];
668
669                 struct bset *s1 = btree_bset_first(n1);
670                 struct bset *s2 = btree_bset_first(n2);
671                 struct bkey_packed *k, *last = NULL;
672
673                 /* Calculate how many keys from @n2 we could fit inside @n1 */
674                 u64s = 0;
675
676                 for (k = s2->start;
677                      k < vstruct_last(s2) &&
678                      vstruct_blocks_plus(n1->data, c->block_bits,
679                                          u64s + k->u64s) <= blocks;
680                      k = bkey_next(k)) {
681                         last = k;
682                         u64s += k->u64s;
683                 }
684
685                 if (u64s == le16_to_cpu(s2->u64s)) {
686                         /* n2 fits entirely in n1 */
687                         n1->key.k.p = n1->data->max_key = n2->data->max_key;
688
689                         memcpy_u64s(vstruct_last(s1),
690                                     s2->start,
691                                     le16_to_cpu(s2->u64s));
692                         le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
693
694                         set_btree_bset_end(n1, n1->set);
695
696                         six_unlock_write(&n2->lock);
697                         bch2_btree_node_free_never_inserted(c, n2);
698                         six_unlock_intent(&n2->lock);
699
700                         memmove(new_nodes + i - 1,
701                                 new_nodes + i,
702                                 sizeof(new_nodes[0]) * (nr_new_nodes - i));
703                         new_nodes[--nr_new_nodes] = NULL;
704                 } else if (u64s) {
705                         /* move part of n2 into n1 */
706                         n1->key.k.p = n1->data->max_key =
707                                 bkey_unpack_pos(n1, last);
708
709                         n2->data->min_key =
710                                 btree_type_successor(iter->btree_id,
711                                                      n1->data->max_key);
712
713                         memcpy_u64s(vstruct_last(s1),
714                                     s2->start, u64s);
715                         le16_add_cpu(&s1->u64s, u64s);
716
717                         memmove(s2->start,
718                                 vstruct_idx(s2, u64s),
719                                 (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
720                         s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
721
722                         set_btree_bset_end(n1, n1->set);
723                         set_btree_bset_end(n2, n2->set);
724                 }
725         }
726
727         for (i = 0; i < nr_new_nodes; i++) {
728                 struct btree *n = new_nodes[i];
729
730                 recalc_packed_keys(n);
731                 btree_node_reset_sib_u64s(n);
732
733                 bch2_btree_build_aux_trees(n);
734                 six_unlock_write(&n->lock);
735
736                 bch2_btree_node_write(c, n, SIX_LOCK_intent);
737         }
738
739         /*
740          * The keys for the old nodes get deleted. We don't want to insert keys
741          * that compare equal to the keys for the new nodes we'll also be
742          * inserting - we can't because keys on a keylist must be strictly
743          * greater than the previous keys, and we also don't need to since the
744          * key for the new node will serve the same purpose (overwriting the key
745          * for the old node).
746          */
747         for (i = 0; i < nr_old_nodes; i++) {
748                 struct bkey_i delete;
749                 unsigned j;
750
751                 for (j = 0; j < nr_new_nodes; j++)
752                         if (!bkey_cmp(old_nodes[i]->key.k.p,
753                                       new_nodes[j]->key.k.p))
754                                 goto next;
755
756                 bkey_init(&delete.k);
757                 delete.k.p = old_nodes[i]->key.k.p;
758                 bch2_keylist_add_in_order(&keylist, &delete);
759 next:
760                 i = i;          /* a label must be followed by a statement */
761         }
762
763         /*
764          * Keys for the new nodes get inserted: bch2_btree_insert_keys() only
765          * does the lookup once and thus expects the keys to be in sorted order
766          * so we have to make sure the new keys are correctly ordered with
767          * respect to the deleted keys added in the previous loop
768          */
769         for (i = 0; i < nr_new_nodes; i++)
770                 bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
771
772         /* Insert the newly coalesced nodes */
773         bch2_btree_insert_node(as, parent, iter, &keylist);
774
775         BUG_ON(!bch2_keylist_empty(&keylist));
776
777         BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
778
779         BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));
780
781         for (i = 0; i < nr_new_nodes; i++)
782                 bch2_btree_open_bucket_put(c, new_nodes[i]);
783
784         /* Free the old nodes and update our sliding window */
785         for (i = 0; i < nr_old_nodes; i++) {
786                 bch2_btree_node_free_inmem(c, old_nodes[i], iter);
787                 six_unlock_intent(&old_nodes[i]->lock);
788
789                 /*
790                  * the index update might have triggered a split, in which case
791                  * the nodes we coalesced - the new nodes we just created -
792                  * might not be sibling nodes anymore - don't add them to the
793                  * sliding window (except the first):
794                  */
795                 if (!i) {
796                         old_nodes[i] = new_nodes[i];
797                 } else {
798                         old_nodes[i] = NULL;
799                         if (new_nodes[i])
800                                 six_unlock_intent(&new_nodes[i]->lock);
801                 }
802         }
803
804         bch2_btree_update_done(as);
805         bch2_keylist_free(&keylist, NULL);
806 }
807
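/*
 * Walk one btree keeping a sliding window of up to GC_MERGE_NODES locked
 * adjacent nodes, attempting to coalesce each window as we go:
 */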
808 static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
809 {
810         struct btree_iter iter;
811         struct btree *b;
812         unsigned i;
813
814         /* Sliding window of adjacent btree nodes */
815         struct btree *merge[GC_MERGE_NODES];
816         u32 lock_seq[GC_MERGE_NODES];
817
818         /*
819          * XXX: We don't have a good way of positively matching on sibling nodes
820          * that have the same parent - this code works by handling the cases
821          * where they might not have the same parent, and is thus fragile. Ugh.
822          *
823          * Perhaps redo this to use multiple linked iterators?
824          */
825         memset(merge, 0, sizeof(merge));
826
827         __for_each_btree_node(&iter, c, btree_id, POS_MIN,
828                               BTREE_MAX_DEPTH, 0,
829                               BTREE_ITER_PREFETCH, b) {
830                 memmove(merge + 1, merge,
831                         sizeof(merge) - sizeof(merge[0]));
832                 memmove(lock_seq + 1, lock_seq,
833                         sizeof(lock_seq) - sizeof(lock_seq[0]));
834
835                 merge[0] = b;
836
837                 for (i = 1; i < GC_MERGE_NODES; i++) {
838                         if (!merge[i] ||
839                             !six_relock_intent(&merge[i]->lock, lock_seq[i]))
840                                 break;
841
842                         if (merge[i]->level != merge[0]->level) {
843                                 six_unlock_intent(&merge[i]->lock);
844                                 break;
845                         }
846                 }
847                 memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
848
849                 bch2_coalesce_nodes(c, &iter, merge);
850
851                 for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
852                         lock_seq[i] = merge[i]->lock.state.seq;
853                         six_unlock_intent(&merge[i]->lock);
854                 }
855
856                 lock_seq[0] = merge[0]->lock.state.seq;
857
858                 if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
859                         bch2_btree_iter_unlock(&iter);
860                         return -ESHUTDOWN;
861                 }
862
863                 bch2_btree_iter_cond_resched(&iter);
864
865                 /*
866                  * If the parent node wasn't relocked, it might have been split
867                  * and the nodes in our sliding window might not have the same
868                  * parent anymore - blow away the sliding window:
869                  */
870                 if (btree_iter_node(&iter, iter.level + 1) &&
871                     !btree_node_intent_locked(&iter, iter.level + 1))
872                         memset(merge + 1, 0,
873                                (GC_MERGE_NODES - 1) * sizeof(merge[0]));
874         }
875         return bch2_btree_iter_unlock(&iter);
876 }
877
878 /**
879  * bch2_coalesce - coalesce adjacent nodes with low occupancy
880  */
881 void bch2_coalesce(struct bch_fs *c)
882 {
883         enum btree_id id;
884
885         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
886                 return;
887
888         down_read(&c->gc_lock);
889         trace_gc_coalesce_start(c);
890
891         for (id = 0; id < BTREE_ID_NR; id++) {
892                 int ret = c->btree_roots[id].b
893                         ? bch2_coalesce_btree(c, id)
894                         : 0;
895
896                 if (ret) {
897                         if (ret != -ESHUTDOWN)
898                                 bch_err(c, "btree coalescing failed: %d", ret);
899                         set_bit(BCH_FS_GC_FAILURE, &c->flags);
900                         return;
901                 }
902         }
903
904         trace_gc_coalesce_end(c);
905         up_read(&c->gc_lock);
906 }
907
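/*
 * Background GC thread: runs bch2_gc() when kicked via c->kick_gc, or - if
 * btree_gc_periodic is set - once the write io_clock has advanced by
 * c->capacity / 16 since the last run:
 */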
908 static int bch2_gc_thread(void *arg)
909 {
910         struct bch_fs *c = arg;
911         struct io_clock *clock = &c->io_clock[WRITE];
912         unsigned long last = atomic_long_read(&clock->now);
913         unsigned last_kick = atomic_read(&c->kick_gc);
914
915         set_freezable();
916
917         while (1) {
918                 while (1) {
919                         set_current_state(TASK_INTERRUPTIBLE);
920
921                         if (kthread_should_stop()) {
922                                 __set_current_state(TASK_RUNNING);
923                                 return 0;
924                         }
925
926                         if (atomic_read(&c->kick_gc) != last_kick)
927                                 break;
928
929                         if (c->btree_gc_periodic) {
930                                 unsigned long next = last + c->capacity / 16;
931
932                                 if (atomic_long_read(&clock->now) >= next)
933                                         break;
934
935                                 bch2_io_clock_schedule_timeout(clock, next);
936                         } else {
937                                 schedule();
938                         }
939
940                         try_to_freeze();
941                 }
942                 __set_current_state(TASK_RUNNING);
943
944                 last = atomic_long_read(&clock->now);
945                 last_kick = atomic_read(&c->kick_gc);
946
947                 bch2_gc(c);
948
949                 debug_check_no_locks_held();
950         }
951
952         return 0;
953 }
954
955 void bch2_gc_thread_stop(struct bch_fs *c)
956 {
957         set_bit(BCH_FS_GC_STOPPING, &c->flags);
958
959         if (c->gc_thread)
960                 kthread_stop(c->gc_thread);
961
962         c->gc_thread = NULL;
963         clear_bit(BCH_FS_GC_STOPPING, &c->flags);
964 }
965
966 int bch2_gc_thread_start(struct bch_fs *c)
967 {
968         struct task_struct *p;
969
970         BUG_ON(c->gc_thread);
971
972         p = kthread_create(bch2_gc_thread, c, "bcache_gc");
973         if (IS_ERR(p))
974                 return PTR_ERR(p);
975
976         c->gc_thread = p;
977         wake_up_process(c->gc_thread);
978         return 0;
979 }
980
981 /* Initial GC computes bucket marks during startup */
982
983 static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
984 {
985         struct btree_iter iter;
986         struct btree *b;
987         struct range_checks r;
988         int ret = 0;
989
990         btree_node_range_checks_init(&r, 0);
991
992         if (!c->btree_roots[id].b)
993                 return 0;
994
995         b = c->btree_roots[id].b;
996         if (!btree_node_fake(b))
997                 ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE,
998                                                   bkey_i_to_s_c(&b->key));
999         if (ret)
1000                 return ret;
1001
1002         /*
1003          * We have to hit every btree node before starting journal replay, in
1004          * order for the journal seq blacklist machinery to work:
1005          */
1006         for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
1007                 btree_node_range_checks(c, b, &r);
1008
1009                 if (btree_node_has_ptrs(b)) {
1010                         struct btree_node_iter node_iter;
1011                         struct bkey unpacked;
1012                         struct bkey_s_c k;
1013
1014                         for_each_btree_node_key_unpack(b, k, &node_iter,
1015                                                        btree_node_is_extents(b),
1016                                                        &unpacked) {
1017                                 ret = bch2_btree_mark_key_initial(c,
1018                                                         btree_node_type(b), k);
1019                                 if (ret)
1020                                         goto err;
1021                         }
1022                 }
1023
1024                 bch2_btree_iter_cond_resched(&iter);
1025         }
1026 err:
1027         return bch2_btree_iter_unlock(&iter) ?: ret;
1028 }
1029
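/*
 * Initial mark and sweep: walk every btree and the journal keys, marking
 * everything they reference; if any bucket gens had to be fixed, restart (a
 * bounded number of times) so the repaired gens are accounted for:
 */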
1030 static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1031 {
1032         unsigned iter = 0;
1033         enum btree_id id;
1034         int ret;
1035
1036         mutex_lock(&c->sb_lock);
1037         if (!bch2_sb_get_replicas(c->disk_sb)) {
1038                 if (BCH_SB_INITIALIZED(c->disk_sb))
1039                         bch_info(c, "building replicas info");
1040                 set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
1041         }
1042         mutex_unlock(&c->sb_lock);
1043 again:
1044         bch2_gc_start(c);
1045
1046         for (id = 0; id < BTREE_ID_NR; id++) {
1047                 ret = bch2_initial_gc_btree(c, id);
1048                 if (ret)
1049                         return ret;
1050         }
1051
1052         ret = bch2_journal_mark(c, journal);
1053         if (ret)
1054                 return ret;
1055
1056         if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
1057                 if (iter++ > 2) {
1058                         bch_info(c, "Unable to fix bucket gens, looping");
1059                         return -EINVAL;
1060                 }
1061
1062                 bch_info(c, "Fixed gens, restarting initial mark and sweep:");
1063                 clear_bit(BCH_FS_FIXED_GENS, &c->flags);
1064                 goto again;
1065         }
1066
1067         /*
1068          * Skip past versions that might have possibly been used (as nonces),
1069          * but hadn't had their pointers written:
1070          */
1071         if (c->sb.encryption_type)
1072                 atomic64_add(1 << 16, &c->key_version);
1073
1074         bch2_mark_superblocks(c);
1075
1076         gc_pos_set(c, gc_phase(GC_PHASE_DONE));
1077         set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
1078
1079         return 0;
1080 }
1081
1082 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
1083 {
1084         int ret;
1085
1086         down_write(&c->gc_lock);
1087         ret = __bch2_initial_gc(c, journal);
1088         up_write(&c->gc_lock);
1089
1090         return ret;
1091 }