/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 *
 * Bucket states:
 * - free bucket: mark == 0
 *   The bucket contains no data and will not be read
 *
 * - allocator bucket: owned_by_allocator == 1
 *   The bucket is on a free list, or it is an open bucket
 *
 * - cached bucket: owned_by_allocator == 0 &&
 *                  dirty_sectors == 0 &&
 *                  cached_sectors > 0
 *   The bucket contains data but may be safely discarded as there are
 *   enough replicas of the data on other cache devices, or it has been
 *   written back to the backing device
 *
 * - dirty bucket: owned_by_allocator == 0 &&
 *                 dirty_sectors > 0
 *   The bucket contains data that we must not discard (either the only copy,
 *   or one of the 'main copies' for data requiring multiple replicas)
 *
 * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
 *   This is a btree node, journal or gen/prio bucket
 *
 * Lifecycle:
 *
 * bucket invalidated => bucket on freelist => open bucket =>
 *     [dirty bucket =>] cached bucket => bucket invalidated => ...
 *
 * Note that cache promotion can skip the dirty bucket step, as data
 * is copied from a deeper tier to a shallower tier, onto a cached
 * bucket.
 * Note also that a cached bucket can spontaneously become dirty --
 * see below.
 *
 * Only a traversal of the key space can determine whether a bucket is
 * truly dirty or cached.
 *
 * Transitions:
 *
 * - free => allocator: bucket was invalidated
 * - cached => allocator: bucket was invalidated
 *
 * - allocator => dirty: open bucket was filled up
 * - allocator => cached: open bucket was filled up
 * - allocator => metadata: metadata was allocated
 *
 * - dirty => cached: dirty sectors were copied to a deeper tier
 * - dirty => free: dirty sectors were overwritten or moved (copy gc)
 * - cached => free: cached sectors were overwritten
 *
 * - metadata => free: metadata was freed
 *
 * Oddities:
 * - cached => dirty: a device was removed so formerly replicated data
 *                    is no longer sufficiently replicated
 * - free => cached: cannot happen
 * - free => dirty: cannot happen
 * - free => metadata: cannot happen
 */
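
/*
 * Bucket marks are updated with cmpxchg loops (see bucket_cmpxchg(), and
 * bucket_data_cmpxchg() below); mark changes that affect accounting are
 * then reflected in the percpu usage counters via bch2_dev_usage_update()
 * and bch2_fs_usage_apply().
 */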

#include "bcachefs.h"
#include "alloc.h"
#include "btree_gc.h"
#include "buckets.h"
#include "error.h"
#include "movinggc.h"

#include <linux/preempt.h>
#include <trace/events/bcachefs.h>

#ifdef DEBUG_BUCKETS

#define lg_local_lock   lg_global_lock
#define lg_local_unlock lg_global_unlock

static void bch2_fs_stats_verify(struct bch_fs *c)
{
        struct bch_fs_usage stats =
                __bch2_fs_usage_read(c);
        unsigned i;

        for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
                if ((s64) stats.s[i].data[S_META] < 0)
                        panic("replicas %u meta underflow: %lli\n",
                              i + 1, stats.s[i].data[S_META]);

                if ((s64) stats.s[i].data[S_DIRTY] < 0)
                        panic("replicas %u dirty underflow: %lli\n",
                              i + 1, stats.s[i].data[S_DIRTY]);

                if ((s64) stats.s[i].persistent_reserved < 0)
                        panic("replicas %u reserved underflow: %lli\n",
                              i + 1, stats.s[i].persistent_reserved);
        }

        if ((s64) stats.online_reserved < 0)
                panic("sectors_online_reserved underflow: %lli\n",
                      stats.online_reserved);
}

static void bch2_dev_stats_verify(struct bch_dev *ca)
{
        struct bch_dev_usage stats =
                __bch2_dev_usage_read(ca);
        u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
        unsigned i;

        for (i = 0; i < ARRAY_SIZE(stats.buckets); i++)
                BUG_ON(stats.buckets[i]         > n);
        BUG_ON(stats.buckets_alloc              > n);
        BUG_ON(stats.buckets_unavailable        > n);
}

static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
{
        if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
                u64 used = __bch2_fs_sectors_used(c);
                u64 cached = 0;
                u64 avail = atomic64_read(&c->sectors_available);
                int cpu;

                for_each_possible_cpu(cpu)
                        cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;

                if (used + avail + cached > c->capacity)
                        panic("used %llu avail %llu cached %llu capacity %llu\n",
                              used, avail, cached, c->capacity);
        }
}

#else

static void bch2_fs_stats_verify(struct bch_fs *c) {}
static void bch2_dev_stats_verify(struct bch_dev *ca) {}
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}

#endif

/*
 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
 * wraparound:
 */
void bch2_bucket_seq_cleanup(struct bch_fs *c)
{
        u16 last_seq_ondisk = c->journal.last_seq_ondisk;
        struct bch_dev *ca;
        struct bucket_array *buckets;
        struct bucket *g;
        struct bucket_mark m;
        unsigned i;

        for_each_member_device(ca, c, i) {
                down_read(&ca->bucket_lock);
                buckets = bucket_array(ca);

                for_each_bucket(g, buckets) {
                        bucket_cmpxchg(g, m, ({
                                if (!m.journal_seq_valid ||
                                    bucket_needs_journal_commit(m, last_seq_ondisk))
                                        break;

                                m.journal_seq_valid = 0;
                        }));
                }
                up_read(&ca->bucket_lock);
        }
}

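/*
 * Usage statistics are kept in percpu counters: bch2_usage_add() adds one
 * set of counters into another, treating the structs as flat arrays of u64s;
 * bch2_usage_read_raw() sums the percpu copies into a single struct; and
 * bch2_usage_read_cached() returns the live percpu totals when gc isn't
 * running (GC_PHASE_DONE), falling back to the cached copy while gc is in
 * progress and retrying under the gc_pos_lock seqcount if gc changes phase
 * mid-read.
 */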
#define bch2_usage_add(_acc, _stats)                                    \
do {                                                                    \
        typeof(_acc) _a = (_acc), _s = (_stats);                        \
        unsigned i;                                                     \
                                                                        \
        for (i = 0; i < sizeof(*_a) / sizeof(u64); i++)                 \
                ((u64 *) (_a))[i] += ((u64 *) (_s))[i];                 \
} while (0)

#define bch2_usage_read_raw(_stats)                                     \
({                                                                      \
        typeof(*this_cpu_ptr(_stats)) _acc;                             \
        int cpu;                                                        \
                                                                        \
        memset(&_acc, 0, sizeof(_acc));                                 \
                                                                        \
        for_each_possible_cpu(cpu)                                      \
                bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu));      \
                                                                        \
        _acc;                                                           \
})

#define bch2_usage_read_cached(_c, _cached, _uncached)                  \
({                                                                      \
        typeof(_cached) _ret;                                           \
        unsigned _seq;                                                  \
                                                                        \
        do {                                                            \
                _seq = read_seqcount_begin(&(_c)->gc_pos_lock);         \
                _ret = (_c)->gc_pos.phase == GC_PHASE_DONE              \
                        ? bch2_usage_read_raw(_uncached)                \
                        : (_cached);                                    \
        } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq));        \
                                                                        \
        _ret;                                                           \
})

struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
{
        return bch2_usage_read_raw(ca->usage_percpu);
}

struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
        return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
}

struct bch_fs_usage
__bch2_fs_usage_read(struct bch_fs *c)
{
        return bch2_usage_read_raw(c->usage_percpu);
}

struct bch_fs_usage
bch2_fs_usage_read(struct bch_fs *c)
{
        return bch2_usage_read_cached(c,
                                     c->usage_cached,
                                     c->usage_percpu);
}

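/*
 * stats.s[i] holds sectors stored with i + 1 replicas, so each entry is
 * weighted by (i + 1) to get the number of sectors actually consumed on
 * disk -- e.g. 8 sectors stored twice-replicated account for 16 sectors of
 * sum.data. online_reserved is counted once, on top of persistent_reserved.
 */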
struct fs_usage_sum {
        u64     data;
        u64     reserved;
};

static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
{
        struct fs_usage_sum sum = { 0 };
        unsigned i;

        for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
                sum.data += (stats.s[i].data[S_META] +
                             stats.s[i].data[S_DIRTY]) * (i + 1);
                sum.reserved += stats.s[i].persistent_reserved * (i + 1);
        }

        sum.reserved += stats.online_reserved;
        return sum;
}

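/*
 * With RESERVE_FACTOR == 6, reserve_factor() inflates reserved sectors by
 * roughly 1/64th (rounded up) and avail_factor() is its approximate inverse,
 * scaling by 64/65 -- e.g. reserve_factor(128) == 130 and
 * avail_factor(130) == 128 -- presumably so that used space is overestimated
 * and free space underestimated, erring on the conservative side.
 */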
#define RESERVE_FACTOR  6

static u64 reserve_factor(u64 r)
{
        return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}

static u64 avail_factor(u64 r)
{
        return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}

u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
{
        struct fs_usage_sum sum = __fs_usage_sum(stats);

        return sum.data + reserve_factor(sum.reserved);
}

u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
{
        return min(c->capacity, __bch2_fs_sectors_used(c, stats));
}

u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
{
        return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
}

static inline int is_unavailable_bucket(struct bucket_mark m)
{
        return !is_available_bucket(m);
}

static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
        return m.cached_sectors && !m.dirty_sectors
                ? BCH_DATA_CACHED
                : m.data_type;
}

static bool bucket_became_unavailable(struct bch_fs *c,
                                      struct bucket_mark old,
                                      struct bucket_mark new)
{
        return is_available_bucket(old) &&
               !is_available_bucket(new) &&
               c && c->gc_pos.phase == GC_PHASE_DONE;
}

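/*
 * Fold a caller-accumulated delta (@stats) into the percpu filesystem usage
 * counters, transferring any newly added sectors out of the caller's disk
 * reservation. online_reserved is always applied (it isn't subject to gc);
 * the rest of the delta is skipped when gc_will_visit() says gc will count
 * those references itself. @stats is zeroed afterwards.
 */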
void bch2_fs_usage_apply(struct bch_fs *c,
                        struct bch_fs_usage *stats,
                        struct disk_reservation *disk_res,
                        struct gc_pos gc_pos)
{
        struct fs_usage_sum sum = __fs_usage_sum(*stats);
        s64 added = sum.data + sum.reserved;

        /*
         * Not allowed to reduce sectors_available except by getting a
         * reservation:
         */
        BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));

        if (added > 0) {
                disk_res->sectors       -= added;
                stats->online_reserved  -= added;
        }

        lg_local_lock(&c->usage_lock);
        /* online_reserved not subject to gc: */
        this_cpu_ptr(c->usage_percpu)->online_reserved +=
                stats->online_reserved;
        stats->online_reserved = 0;

        if (!gc_will_visit(c, gc_pos))
                bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);

        bch2_fs_stats_verify(c);
        lg_local_unlock(&c->usage_lock);

        memset(stats, 0, sizeof(*stats));
}

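/*
 * Update the per-device usage counters for a bucket whose mark changed from
 * @old to @new: bucket counts per data type, allocator-owned and unavailable
 * bucket counts, and per-type sector counts. Also wakes the allocator if the
 * bucket just became available. Must be called with usage_lock held.
 */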
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                                  struct bucket_mark old, struct bucket_mark new)
{
        struct bch_dev_usage *dev_usage;

        lockdep_assert_held(&c->usage_lock);

        bch2_fs_inconsistent_on(old.data_type && new.data_type &&
                        old.data_type != new.data_type, c,
                        "different types of data in same bucket: %u, %u",
                        old.data_type, new.data_type);

        dev_usage = this_cpu_ptr(ca->usage_percpu);

        dev_usage->buckets[bucket_type(old)]--;
        dev_usage->buckets[bucket_type(new)]++;

        dev_usage->buckets_alloc +=
                (int) new.owned_by_allocator - (int) old.owned_by_allocator;
        dev_usage->buckets_unavailable +=
                is_unavailable_bucket(new) - is_unavailable_bucket(old);

        dev_usage->sectors[old.data_type] -= old.dirty_sectors;
        dev_usage->sectors[new.data_type] += new.dirty_sectors;
        dev_usage->sectors[BCH_DATA_CACHED] +=
                (int) new.cached_sectors - (int) old.cached_sectors;

        if (!is_available_bucket(old) && is_available_bucket(new))
                bch2_wake_allocator(ca);

        bch2_dev_stats_verify(ca);
}

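/*
 * bucket_data_cmpxchg() is a thin wrapper around bucket_cmpxchg() that also
 * feeds the old -> new transition into bch2_dev_usage_update(). Typical use,
 * as in bch2_mark_alloc_bucket() below:
 *
 *      old = bucket_data_cmpxchg(c, ca, g, new, ({
 *              new.owned_by_allocator  = owned_by_allocator;
 *      }));
 *
 * The expression argument edits @new, and the cmpxchg loop retries until the
 * mark is updated atomically.
 */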
#define bucket_data_cmpxchg(c, ca, g, new, expr)                \
({                                                              \
        struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
                                                                \
        bch2_dev_usage_update(c, ca, _old, new);                \
        _old;                                                   \
})

bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, struct bucket_mark *old)
{
        struct bucket *g;
        struct bucket_mark new;

        lg_local_lock(&c->usage_lock);
        g = bucket(ca, b);

        *old = bucket_data_cmpxchg(c, ca, g, new, ({
                if (!is_available_bucket(new)) {
                        lg_local_unlock(&c->usage_lock);
                        return false;
                }

                new.owned_by_allocator  = 1;
                new.data_type           = 0;
                new.cached_sectors      = 0;
                new.dirty_sectors       = 0;
                new.gen++;
        }));
        lg_local_unlock(&c->usage_lock);

        if (!old->owned_by_allocator && old->cached_sectors)
                trace_invalidate(ca, bucket_to_sector(ca, b),
                                 old->cached_sectors);
        return true;
}

void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, bool owned_by_allocator,
                            struct gc_pos pos, unsigned flags)
{
        struct bucket *g;
        struct bucket_mark old, new;

        lg_local_lock(&c->usage_lock);
        g = bucket(ca, b);

        if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
            gc_will_visit(c, pos)) {
                lg_local_unlock(&c->usage_lock);
                return;
        }

        old = bucket_data_cmpxchg(c, ca, g, new, ({
                new.owned_by_allocator  = owned_by_allocator;
        }));
        lg_local_unlock(&c->usage_lock);

        BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
               c->gc_pos.phase == GC_PHASE_DONE);
}

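/*
 * Saturating add: bucket sector counts are narrow fields, so instead of
 * overflowing they are clamped at @max (with a tracepoint); enough
 * accumulated saturation eventually wakes the gc thread to recompute
 * accurate counts (see bch2_mark_pointer()).
 */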
#define saturated_add(ca, dst, src, max)                        \
do {                                                            \
        BUG_ON((int) (dst) + (src) < 0);                        \
        if ((dst) == (max))                                     \
                ;                                               \
        else if ((dst) + (src) <= (max))                        \
                dst += (src);                                   \
        else {                                                  \
                dst = (max);                                    \
                trace_sectors_saturated(ca);                    \
        }                                                       \
} while (0)

void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                               size_t b, enum bch_data_type type,
                               unsigned sectors, struct gc_pos pos,
                               unsigned flags)
{
        struct bucket *g;
        struct bucket_mark old, new;

        BUG_ON(!type);

        lg_local_lock(&c->usage_lock);
        g = bucket(ca, b);

        if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
            gc_will_visit(c, pos)) {
                lg_local_unlock(&c->usage_lock);
                return;
        }

        old = bucket_data_cmpxchg(c, ca, g, new, ({
                saturated_add(ca, new.dirty_sectors, sectors,
                              GC_MAX_SECTORS_USED);
                new.data_type           = type;
        }));
        lg_local_unlock(&c->usage_lock);

        BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
               bucket_became_unavailable(c, old, new));
}

/* Reverting this until the copygc + compression issue is fixed: */

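/*
 * Convert @sectors of uncompressed extent data into the number of sectors
 * occupied on disk, scaling by the extent's compression ratio and never
 * returning 0 for a nonzero input.
 */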
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
{
        if (!sectors)
                return 0;

        return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
                                    crc.uncompressed_size));
}

/*
 * Checking against gc's position has to be done here, inside the cmpxchg()
 * loop, to avoid racing with the start of gc clearing all the marks - GC does
 * that with the gc pos seqlock held.
 */
static void bch2_mark_pointer(struct bch_fs *c,
                              struct bkey_s_c_extent e,
                              const struct bch_extent_ptr *ptr,
                              struct bch_extent_crc_unpacked crc,
                              s64 sectors, enum s_alloc type,
                              struct bch_fs_usage *stats,
                              u64 journal_seq, unsigned flags)
{
        struct bucket_mark old, new;
        unsigned saturated;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        struct bucket *g = PTR_BUCKET(ca, ptr);
        enum bch_data_type data_type = type == S_META
                ? BCH_DATA_BTREE : BCH_DATA_USER;
        u64 v;

        if (crc.compression_type) {
                unsigned old_sectors, new_sectors;

                if (sectors > 0) {
                        old_sectors = 0;
                        new_sectors = sectors;
                } else {
                        old_sectors = e.k->size;
                        new_sectors = e.k->size + sectors;
                }

                sectors = -__disk_sectors(crc, old_sectors)
                          +__disk_sectors(crc, new_sectors);
        }

        if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
                if (journal_seq)
                        bucket_cmpxchg(g, new, ({
                                new.journal_seq_valid   = 1;
                                new.journal_seq         = journal_seq;
                        }));

                return;
        }

        v = READ_ONCE(g->_mark.counter);
        do {
                new.counter = old.counter = v;
                saturated = 0;

                /*
                 * Check this after reading bucket mark to guard against
                 * the allocator invalidating a bucket after we've already
                 * checked the gen
                 */
                if (gen_after(new.gen, ptr->gen)) {
                        BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
                        EBUG_ON(!ptr->cached &&
                                test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
                        return;
                }

                if (!ptr->cached &&
                    new.dirty_sectors == GC_MAX_SECTORS_USED &&
                    sectors < 0)
                        saturated = -sectors;

                if (ptr->cached)
                        saturated_add(ca, new.cached_sectors, sectors,
                                      GC_MAX_SECTORS_USED);
                else
                        saturated_add(ca, new.dirty_sectors, sectors,
                                      GC_MAX_SECTORS_USED);

                if (!new.dirty_sectors &&
                    !new.cached_sectors) {
                        new.data_type   = 0;

                        if (journal_seq) {
                                new.journal_seq_valid = 1;
                                new.journal_seq = journal_seq;
                        }
                } else {
                        new.data_type = data_type;
                }

                if (flags & BCH_BUCKET_MARK_NOATOMIC) {
                        g->_mark = new;
                        break;
                }
        } while ((v = cmpxchg(&g->_mark.counter,
                              old.counter,
                              new.counter)) != old.counter);

        bch2_dev_usage_update(c, ca, old, new);

        BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
               bucket_became_unavailable(c, old, new));

        if (saturated &&
            atomic_long_add_return(saturated,
                                   &ca->saturated_count) >=
            bucket_to_sector(ca, ca->free_inc.size)) {
                if (c->gc_thread) {
                        trace_gc_sectors_saturated(c);
                        wake_up_process(c->gc_thread);
                }
        }
}

void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
                   s64 sectors, bool metadata,
                   struct gc_pos pos,
                   struct bch_fs_usage *stats,
                   u64 journal_seq, unsigned flags)
{
        /*
         * synchronization w.r.t. GC:
         *
         * Normally, bucket sector counts/marks are updated on the fly, as
         * references are added/removed from the btree, the lists of buckets the
         * allocator owns, other metadata buckets, etc.
         *
         * When GC is in progress and going to mark this reference, we do _not_
         * mark this reference here, to avoid double counting - GC will count it
         * when it gets to it.
         *
         * To know whether we should mark a given reference (GC either isn't
         * running, or has already marked references at this position) we
         * construct a total order for everything GC walks. Then, we can simply
         * compare the position of the reference we're marking - @pos - with
         * GC's current position. If GC is going to mark this reference, GC's
         * current position will be less than @pos; if GC's current position is
         * greater than @pos GC has either already walked this position, or
         * isn't running.
         *
         * To avoid racing with GC's position changing, we have to deal with
         *  - GC's position being set to GC_POS_MIN when GC starts:
         *    usage_lock guards against this
         *  - GC's position overtaking @pos: we guard against this with
         *    whatever lock protects the data structure the reference lives in
         *    (e.g. the btree node lock, or the relevant allocator lock).
         */

        lg_local_lock(&c->usage_lock);
        if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
            gc_will_visit(c, pos))
                flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;

        switch (k.k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const struct bch_extent_ptr *ptr;
                struct bch_extent_crc_unpacked crc;
                enum s_alloc type = metadata ? S_META : S_DIRTY;
                unsigned replicas = 0;

                BUG_ON(metadata && bkey_extent_is_cached(e.k));
                BUG_ON(!sectors);

                extent_for_each_ptr_crc(e, ptr, crc) {
                        bch2_mark_pointer(c, e, ptr, crc, sectors, type,
                                          stats, journal_seq, flags);
                        replicas += !ptr->cached;
                }

                if (replicas) {
                        BUG_ON(replicas - 1 >= ARRAY_SIZE(stats->s));
                        stats->s[replicas - 1].data[type] += sectors;
                }
                break;
        }
        case BCH_RESERVATION: {
                struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);

                if (r.v->nr_replicas) {
                        BUG_ON(r.v->nr_replicas - 1 >= ARRAY_SIZE(stats->s));
                        stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
                }
                break;
        }
        }
        lg_local_unlock(&c->usage_lock);
}

/* Disk reservations: */

static u64 __recalc_sectors_available(struct bch_fs *c)
{
        int cpu;

        for_each_possible_cpu(cpu)
                per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;

        return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
}

/* Used by gc when it's starting: */
void bch2_recalc_sectors_available(struct bch_fs *c)
{
        lg_global_lock(&c->usage_lock);
        atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
        lg_global_unlock(&c->usage_lock);
}

void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
        lg_local_lock(&c->usage_lock);
        this_cpu_sub(c->usage_percpu->online_reserved,
                     res->sectors);

        bch2_fs_stats_verify(c);
        lg_local_unlock(&c->usage_lock);

        res->sectors = 0;
}

#define SECTORS_CACHE   1024

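/*
 * Reserve @sectors for future writes. The fast path takes them from this
 * cpu's available_cache, refilled from the global sectors_available counter
 * in batches of SECTORS_CACHE; if that looks exhausted we fall back to
 * recalculating free space under the global usage lock (and gc_lock),
 * returning -ENOSPC if there genuinely isn't room and
 * BCH_DISK_RESERVATION_NOFAIL isn't set.
 */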
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
                              unsigned sectors, int flags)
{
        struct bch_fs_usage *stats;
        u64 old, v, get;
        s64 sectors_available;
        int ret;

        lg_local_lock(&c->usage_lock);
        stats = this_cpu_ptr(c->usage_percpu);

        if (sectors <= stats->available_cache)
                goto out;

        v = atomic64_read(&c->sectors_available);
        do {
                old = v;
                get = min((u64) sectors + SECTORS_CACHE, old);

                if (get < sectors) {
                        lg_local_unlock(&c->usage_lock);
                        goto recalculate;
                }
        } while ((v = atomic64_cmpxchg(&c->sectors_available,
                                       old, old - get)) != old);

        stats->available_cache  += get;

out:
        stats->available_cache  -= sectors;
        stats->online_reserved  += sectors;
        res->sectors            += sectors;

        bch2_disk_reservations_verify(c, flags);
        bch2_fs_stats_verify(c);
        lg_local_unlock(&c->usage_lock);
        return 0;

recalculate:
        /*
         * GC recalculates sectors_available when it starts, so that hopefully
         * we don't normally end up blocking here:
         */

        /*
         * Piss fuck, we can be called from extent_insert_fixup() with btree
         * locks held:
         */

        if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
                if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
                        down_read(&c->gc_lock);
                else if (!down_read_trylock(&c->gc_lock))
                        return -EINTR;
        }
        lg_global_lock(&c->usage_lock);

        sectors_available = __recalc_sectors_available(c);

        if (sectors <= sectors_available ||
            (flags & BCH_DISK_RESERVATION_NOFAIL)) {
                atomic64_set(&c->sectors_available,
                             max_t(s64, 0, sectors_available - sectors));
                stats->online_reserved  += sectors;
                res->sectors            += sectors;
                ret = 0;

                bch2_disk_reservations_verify(c, flags);
        } else {
                atomic64_set(&c->sectors_available, sectors_available);
                ret = -ENOSPC;
        }

        bch2_fs_stats_verify(c);
        lg_global_unlock(&c->usage_lock);
        if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
                up_read(&c->gc_lock);

        return ret;
}

/* Startup/shutdown: */

static void buckets_free_rcu(struct rcu_head *rcu)
{
        struct bucket_array *buckets =
                container_of(rcu, struct bucket_array, rcu);

        kvpfree(buckets,
                sizeof(struct bucket_array) +
                buckets->nbuckets * sizeof(struct bucket));
}

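/*
 * Resize (or initially allocate) a device's bucket arrays and allocator
 * fifos/heaps: the new arrays are allocated up front, copygc is stopped, the
 * old contents are copied across under gc_lock/bucket_lock/usage_lock, the
 * pointers are swapped, and the old bucket array is freed via RCU since
 * readers access it under rcu_read_lock().
 */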
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
        struct bucket_array *buckets = NULL, *old_buckets = NULL;
        unsigned long *buckets_dirty = NULL;
        u8 *oldest_gens = NULL;
        alloc_fifo      free[RESERVE_NR];
        alloc_fifo      free_inc;
        alloc_heap      alloc_heap;
        copygc_heap     copygc_heap;

        size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
                             ca->mi.bucket_size / c->opts.btree_node_size);
        /* XXX: these should be tunable */
        size_t reserve_none     = max_t(size_t, 4, ca->mi.nbuckets >> 9);
        size_t copygc_reserve   = max_t(size_t, 16, ca->mi.nbuckets >> 7);
        size_t free_inc_reserve = copygc_reserve / 2;
        bool resize = ca->buckets != NULL,
             start_copygc = ca->copygc_thread != NULL;
        int ret = -ENOMEM;
        unsigned i;

        memset(&free,           0, sizeof(free));
        memset(&free_inc,       0, sizeof(free_inc));
        memset(&alloc_heap,     0, sizeof(alloc_heap));
        memset(&copygc_heap,    0, sizeof(copygc_heap));

        if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
                                            nbuckets * sizeof(struct bucket),
                                            GFP_KERNEL|__GFP_ZERO)) ||
            !(oldest_gens       = kvpmalloc(nbuckets * sizeof(u8),
                                            GFP_KERNEL|__GFP_ZERO)) ||
            !(buckets_dirty     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
                                            sizeof(unsigned long),
                                            GFP_KERNEL|__GFP_ZERO)) ||
            !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_MOVINGGC],
                       copygc_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
            !init_fifo(&free_inc,       free_inc_reserve, GFP_KERNEL) ||
            !init_heap(&alloc_heap,     free_inc_reserve, GFP_KERNEL) ||
            !init_heap(&copygc_heap,    copygc_reserve, GFP_KERNEL))
                goto err;

        buckets->first_bucket   = ca->mi.first_bucket;
        buckets->nbuckets       = nbuckets;

        bch2_copygc_stop(ca);

        down_write(&c->gc_lock);
        down_write(&ca->bucket_lock);
        lg_global_lock(&c->usage_lock);

        old_buckets = bucket_array(ca);

        if (resize) {
                size_t n = min(buckets->nbuckets, old_buckets->nbuckets);

                memcpy(buckets->b,
                       old_buckets->b,
                       n * sizeof(struct bucket));
                memcpy(oldest_gens,
                       ca->oldest_gens,
                       n * sizeof(u8));
                memcpy(buckets_dirty,
                       ca->buckets_dirty,
                       BITS_TO_LONGS(n) * sizeof(unsigned long));
        }

        rcu_assign_pointer(ca->buckets, buckets);
        buckets = old_buckets;

        swap(ca->oldest_gens, oldest_gens);
        swap(ca->buckets_dirty, buckets_dirty);

        lg_global_unlock(&c->usage_lock);

        spin_lock(&c->freelist_lock);
        for (i = 0; i < RESERVE_NR; i++) {
                fifo_move(&free[i], &ca->free[i]);
                swap(ca->free[i], free[i]);
        }
        fifo_move(&free_inc, &ca->free_inc);
        swap(ca->free_inc, free_inc);
        spin_unlock(&c->freelist_lock);

        /* with gc lock held, alloc_heap can't be in use: */
        swap(ca->alloc_heap, alloc_heap);

        /* and we shut down copygc: */
        swap(ca->copygc_heap, copygc_heap);

        nbuckets = ca->mi.nbuckets;

        up_write(&ca->bucket_lock);
        up_write(&c->gc_lock);

        if (start_copygc &&
            bch2_copygc_start(c, ca))
                bch_err(ca, "error restarting copygc thread");

        ret = 0;
err:
        free_heap(&copygc_heap);
        free_heap(&alloc_heap);
        free_fifo(&free_inc);
        for (i = 0; i < RESERVE_NR; i++)
                free_fifo(&free[i]);
        kvpfree(buckets_dirty,
                BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
        kvpfree(oldest_gens,
                nbuckets * sizeof(u8));
        if (buckets)
                call_rcu(&old_buckets->rcu, buckets_free_rcu);

        return ret;
}

void bch2_dev_buckets_free(struct bch_dev *ca)
{
        unsigned i;

        free_heap(&ca->copygc_heap);
        free_heap(&ca->alloc_heap);
        free_fifo(&ca->free_inc);
        for (i = 0; i < RESERVE_NR; i++)
                free_fifo(&ca->free[i]);
        kvpfree(ca->buckets_dirty,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
        kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
        kvpfree(ca->buckets,     sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));

        free_percpu(ca->usage_percpu);
}

int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
        if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
                return -ENOMEM;

        return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}