git.sesse.net Git - bcachefs-tools-debian/blob - libbcache/buckets.c

   1 /*
   2  * Code for manipulating bucket marks for garbage collection.
   3  *
   4  * Copyright 2014 Datera, Inc.
   5  *
   6  * Bucket states:
   7  * - free bucket: mark == 0
   8  *   The bucket contains no data and will not be read
   9  *
  10  * - allocator bucket: owned_by_allocator == 1
  11  *   The bucket is on a free list, or it is an open bucket
  12  *
  13  * - cached bucket: owned_by_allocator == 0 &&
  14  *                  dirty_sectors == 0 &&
  15  *                  cached_sectors > 0
  16  *   The bucket contains data but may be safely discarded as there are
  17  *   enough replicas of the data on other cache devices, or it has been
  18  *   written back to the backing device
  19  *
  20  * - dirty bucket: owned_by_allocator == 0 &&
  21  *                 dirty_sectors > 0
  22  *   The bucket contains data that we must not discard (either only copy,
  23  *   or one of the 'main copies' for data requiring multiple replicas)
  24  *
  25  * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
  26  *   This is a btree node, journal or gen/prio bucket
  27  *
  28  * Lifecycle:
  29  *
  30  * bucket invalidated => bucket on freelist => open bucket =>
  31  *     [dirty bucket =>] cached bucket => bucket invalidated => ...
  32  *
  33  * Note that cache promotion can skip the dirty bucket step, as data
  34  * is copied from a deeper tier to a shallower tier, onto a cached
  35  * bucket.
  36  * Note also that a cached bucket can spontaneously become dirty --
  37  * see below.
  38  *
  39  * Only a traversal of the key space can determine whether a bucket is
  40  * truly dirty or cached.
  41  *
  42  * Transitions:
  43  *
  44  * - free => allocator: bucket was invalidated
  45  * - cached => allocator: bucket was invalidated
  46  *
  47  * - allocator => dirty: open bucket was filled up
  48  * - allocator => cached: open bucket was filled up
  49  * - allocator => metadata: metadata was allocated
  50  *
  51  * - dirty => cached: dirty sectors were copied to a deeper tier
  52  * - dirty => free: dirty sectors were overwritten or moved (copy gc)
  53  * - cached => free: cached sectors were overwritten
  54  *
  55  * - metadata => free: metadata was freed
  56  *
  57  * Oddities:
  58  * - cached => dirty: a device was removed so formerly replicated data
  59  *                    is no longer sufficiently replicated
  60  * - free => cached: cannot happen
  61  * - free => dirty: cannot happen
  62  * - free => metadata: cannot happen
  63  */
  64
  65 #include "bcache.h"
  66 #include "alloc.h"
  67 #include "btree_gc.h"
  68 #include "buckets.h"
  69
  70 #include <linux/preempt.h>
  71 #include <trace/events/bcache.h>
  72
  73 #ifdef DEBUG_BUCKETS
  74
  75 #define lg_local_lock   lg_global_lock
  76 #define lg_local_unlock lg_global_unlock
  77
  78 static void bch_fs_stats_verify(struct cache_set *c)
  79 {
  80         struct bucket_stats_cache_set stats =
  81                 __bch_bucket_stats_read_cache_set(c);
  82
  83         if ((s64) stats.sectors_dirty < 0)
  84                 panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty);
  85
  86         if ((s64) stats.sectors_cached < 0)
  87                 panic("sectors_cached underflow: %lli\n", stats.sectors_cached);
  88
  89         if ((s64) stats.sectors_meta < 0)
  90                 panic("sectors_meta underflow: %lli\n", stats.sectors_meta);
  91
  92         if ((s64) stats.sectors_persistent_reserved < 0)
  93                 panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved);
  94
  95         if ((s64) stats.sectors_online_reserved < 0)
  96                 panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved);
  97 }
  98
  99 #else
 100
 101 static void bch_fs_stats_verify(struct cache_set *c) {}
 102
 103 #endif
 104
 105 void bch_bucket_seq_cleanup(struct cache_set *c)
 106 {
 107         u16 last_seq_ondisk = c->journal.last_seq_ondisk;
 108         struct cache *ca;
 109         struct bucket *g;
 110         struct bucket_mark m;
 111         unsigned i;
 112
 113         for_each_cache(ca, c, i)
 114                 for_each_bucket(g, ca) {
 115                         bucket_cmpxchg(g, m, ({
 116                                 if (!m.wait_on_journal ||
 117                                     ((s16) last_seq_ondisk -
 118                                      (s16) m.journal_seq < 0))
 119                                         break;
 120
 121                                 m.wait_on_journal = 0;
 122                         }));
 123                 }
 124 }
 125
 126 #define bucket_stats_add(_acc, _stats)                                  \
 127 do {                                                                    \
 128         typeof(_acc) _a = (_acc), _s = (_stats);                        \
 129         unsigned i;                                                     \
 130                                                                         \
 131         for (i = 0; i < sizeof(*_a) / sizeof(u64); i++)                 \
 132                 ((u64 *) (_a))[i] += ((u64 *) (_s))[i];                 \
 133 } while (0)
 134
 135 #define bucket_stats_read_raw(_stats)                                   \
 136 ({                                                                      \
 137         typeof(*this_cpu_ptr(_stats)) _acc = { 0 };                     \
 138         int cpu;                                                        \
 139                                                                         \
 140         for_each_possible_cpu(cpu)                                      \
 141                 bucket_stats_add(&_acc, per_cpu_ptr((_stats), cpu));    \
 142                                                                         \
 143         _acc;                                                           \
 144 })
 145
 146 #define bucket_stats_read_cached(_c, _cached, _uncached)                \
 147 ({                                                                      \
 148         typeof(_cached) _ret;                                           \
 149         unsigned _seq;                                                  \
 150                                                                         \
 151         do {                                                            \
 152                 _seq = read_seqcount_begin(&(_c)->gc_pos_lock);         \
 153                 _ret = (_c)->gc_pos.phase == GC_PHASE_DONE              \
 154                         ? bucket_stats_read_raw(_uncached)              \
 155                         : (_cached);                                    \
 156         } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq));        \
 157                                                                         \
 158         _ret;                                                           \
 159 })
 160
 161 struct bucket_stats_cache __bch_bucket_stats_read_cache(struct cache *ca)
 162 {
 163         return bucket_stats_read_raw(ca->bucket_stats_percpu);
 164 }
 165
 166 struct bucket_stats_cache bch_bucket_stats_read_cache(struct cache *ca)
 167 {
 168         return bucket_stats_read_cached(ca->set,
 169                                 ca->bucket_stats_cached,
 170                                 ca->bucket_stats_percpu);
 171 }
 172
 173 struct bucket_stats_cache_set
 174 __bch_bucket_stats_read_cache_set(struct cache_set *c)
 175 {
 176         return bucket_stats_read_raw(c->bucket_stats_percpu);
 177 }
 178
 179 struct bucket_stats_cache_set
 180 bch_bucket_stats_read_cache_set(struct cache_set *c)
 181 {
 182         return bucket_stats_read_cached(c,
 183                                 c->bucket_stats_cached,
 184                                 c->bucket_stats_percpu);
 185 }
 186
 187 static inline int is_meta_bucket(struct bucket_mark m)
 188 {
 189         return !m.owned_by_allocator && m.is_metadata;
 190 }
 191
 192 static inline int is_dirty_bucket(struct bucket_mark m)
 193 {
 194         return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors;
 195 }
 196
 197 static inline int is_cached_bucket(struct bucket_mark m)
 198 {
 199         return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors;
 200 }
 201
 202 void bch_fs_stats_apply(struct cache_set *c,
 203                         struct bucket_stats_cache_set *stats,
 204                         struct disk_reservation *disk_res,
 205                         struct gc_pos gc_pos)
 206 {
 207         s64 added =
 208                 stats->s[S_COMPRESSED][S_META] +
 209                 stats->s[S_COMPRESSED][S_DIRTY] +
 210                 stats->persistent_reserved +
 211                 stats->online_reserved;
 212
 213         /*
 214          * Not allowed to reduce sectors_available except by getting a
 215          * reservation:
 216          */
 217         BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
 218
 219         if (added > 0) {
 220                 disk_res->sectors       -= added;
 221                 stats->online_reserved  -= added;
 222         }
 223
 224         lg_local_lock(&c->bucket_stats_lock);
 225         /* online_reserved not subject to gc: */
 226         this_cpu_ptr(c->bucket_stats_percpu)->online_reserved +=
 227                 stats->online_reserved;
 228         stats->online_reserved = 0;
 229
 230         if (!gc_will_visit(c, gc_pos))
 231                 bucket_stats_add(this_cpu_ptr(c->bucket_stats_percpu), stats);
 232
 233         bch_fs_stats_verify(c);
 234         lg_local_unlock(&c->bucket_stats_lock);
 235
 236         memset(stats, 0, sizeof(*stats));
 237 }
 238
 239 static void bucket_stats_update(struct cache *ca,
 240                         struct bucket_mark old, struct bucket_mark new,
 241                         bool may_make_unavailable,
 242                         struct bucket_stats_cache_set *bch_alloc_stats)
 243 {
 244         struct cache_set *c = ca->set;
 245         struct bucket_stats_cache *cache_stats;
 246
 247         BUG_ON(!may_make_unavailable &&
 248                is_available_bucket(old) &&
 249                !is_available_bucket(new) &&
 250                c->gc_pos.phase == GC_PHASE_DONE);
 251
 252         if (bch_alloc_stats) {
 253                 bch_alloc_stats->s[S_COMPRESSED][S_CACHED] +=
 254                         (int) new.cached_sectors - (int) old.cached_sectors;
 255
 256                 bch_alloc_stats->s[S_COMPRESSED]
 257                         [old.is_metadata ? S_META : S_DIRTY] -=
 258                         old.dirty_sectors;
 259
 260                 bch_alloc_stats->s[S_COMPRESSED]
 261                         [new.is_metadata ? S_META : S_DIRTY] +=
 262                         new.dirty_sectors;
 263         }
 264
 265         preempt_disable();
 266         cache_stats = this_cpu_ptr(ca->bucket_stats_percpu);
 267
 268         cache_stats->sectors_cached +=
 269                 (int) new.cached_sectors - (int) old.cached_sectors;
 270
 271         if (old.is_metadata)
 272                 cache_stats->sectors_meta -= old.dirty_sectors;
 273         else
 274                 cache_stats->sectors_dirty -= old.dirty_sectors;
 275
 276         if (new.is_metadata)
 277                 cache_stats->sectors_meta += new.dirty_sectors;
 278         else
 279                 cache_stats->sectors_dirty += new.dirty_sectors;
 280
 281         cache_stats->buckets_alloc +=
 282                 (int) new.owned_by_allocator - (int) old.owned_by_allocator;
 283
 284         cache_stats->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old);
 285         cache_stats->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
 286         cache_stats->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old);
 287         preempt_enable();
 288
 289         if (!is_available_bucket(old) && is_available_bucket(new))
 290                 bch_wake_allocator(ca);
 291 }
 292
 293 void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
 294 {
 295         struct bucket_stats_cache_set stats = { 0 };
 296         struct bucket_mark old, new;
 297
 298         old = bucket_cmpxchg(g, new, ({
 299                 new.owned_by_allocator  = 1;
 300                 new.is_metadata         = 0;
 301                 new.cached_sectors      = 0;
 302                 new.dirty_sectors       = 0;
 303                 new.copygc              = 0;
 304                 new.gen++;
 305         }));
 306
 307         BUG_ON(old.dirty_sectors);
 308
 309         bucket_stats_update(ca, old, new, true, &stats);
 310
 311         /*
 312          * Ick:
 313          *
 314          * Only stats.sectors_cached should be nonzero: this is important
 315          * because in this path we modify bch_alloc_stats based on how the
 316          * bucket_mark was modified, and the sector counts in bucket_mark are
 317          * subject to (saturating) overflow - and if they did overflow, the
 318          * cache set stats will now be off. We can tolerate this for
 319          * sectors_cached, but not anything else:
 320          */
 321         stats.s[S_COMPRESSED][S_CACHED] = 0;
 322         stats.s[S_UNCOMPRESSED][S_CACHED] = 0;
 323         BUG_ON(!bch_is_zero(&stats, sizeof(stats)));
 324
 325         if (!old.owned_by_allocator && old.cached_sectors)
 326                 trace_bcache_invalidate(ca, g - ca->buckets,
 327                                         old.cached_sectors);
 328 }
 329
 330 void bch_mark_free_bucket(struct cache *ca, struct bucket *g)
 331 {
 332         struct bucket_stats_cache_set stats = { 0 };
 333         struct bucket_mark old, new;
 334
 335         old = bucket_cmpxchg(g, new, ({
 336                 new.owned_by_allocator  = 0;
 337                 new.is_metadata         = 0;
 338                 new.cached_sectors      = 0;
 339                 new.dirty_sectors       = 0;
 340         }));
 341
 342         bucket_stats_update(ca, old, new, false, &stats);
 343 }
 344
 345 void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g,
 346                            bool owned_by_allocator)
 347 {
 348         struct bucket_stats_cache_set stats = { 0 };
 349         struct bucket_mark old, new;
 350
 351         old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator);
 352
 353         bucket_stats_update(ca, old, new, true, &stats);
 354 }
 355
 356 void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g,
 357                               bool may_make_unavailable)
 358 {
 359         struct bucket_stats_cache_set stats = { 0 };
 360         struct bucket_mark old, new;
 361
 362         old = bucket_cmpxchg(g, new, ({
 363                 new.is_metadata = 1;
 364                 new.had_metadata = 1;
 365         }));
 366
 367         BUG_ON(old.cached_sectors);
 368         BUG_ON(old.dirty_sectors);
 369
 370         bucket_stats_update(ca, old, new, may_make_unavailable, &stats);
 371 }
 372
 373 #define saturated_add(ca, dst, src, max)                        \
 374 do {                                                            \
 375         BUG_ON((int) (dst) + (src) < 0);                        \
 376         if ((dst) == (max))                                     \
 377                 ;                                               \
 378         else if ((dst) + (src) <= (max))                        \
 379                 dst += (src);                                   \
 380         else {                                                  \
 381                 dst = (max);                                    \
 382                 trace_bcache_sectors_saturated(ca);             \
 383         }                                                       \
 384 } while (0)
 385
 386 #if 0
 387 /* Reverting this until the copygc + compression issue is fixed: */
 388
 389 static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
 390 {
 391         return crc_compression_type(crc)
 392                 ? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc)
 393                 : sectors;
 394 }
 395
 396 static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
 397 {
 398         return crc_compression_type(crc)
 399                 ? min_t(unsigned, crc_compressed_size(crc), sectors)
 400                 : sectors;
 401 }
 402 #else
 403 static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
 404 {
 405         return sectors;
 406 }
 407
 408 static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
 409 {
 410         return sectors;
 411 }
 412 #endif
 413
 414 /*
 415  * Checking against gc's position has to be done here, inside the cmpxchg()
 416  * loop, to avoid racing with the start of gc clearing all the marks - GC does
 417  * that with the gc pos seqlock held.
 418  */
 419 static void bch_mark_pointer(struct cache_set *c,
 420                              struct bkey_s_c_extent e,
 421                              struct cache *ca,
 422                              const union bch_extent_crc *crc,
 423                              const struct bch_extent_ptr *ptr,
 424                              s64 sectors, enum s_alloc type,
 425                              bool may_make_unavailable,
 426                              struct bucket_stats_cache_set *stats,
 427                              bool gc_will_visit, u64 journal_seq)
 428 {
 429         struct bucket_mark old, new;
 430         unsigned saturated;
 431         struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
 432         u64 v = READ_ONCE(g->_mark.counter);
 433         unsigned old_sectors, new_sectors;
 434         int disk_sectors, compressed_sectors;
 435
 436         if (sectors > 0) {
 437                 old_sectors = 0;
 438                 new_sectors = sectors;
 439         } else {
 440                 old_sectors = e.k->size;
 441                 new_sectors = e.k->size + sectors;
 442         }
 443
 444         disk_sectors = -__disk_sectors(crc, old_sectors)
 445                 + __disk_sectors(crc, new_sectors);
 446         compressed_sectors = -__compressed_sectors(crc, old_sectors)
 447                 + __compressed_sectors(crc, new_sectors);
 448
 449         if (gc_will_visit) {
 450                 if (journal_seq)
 451                         bucket_cmpxchg(g, new, new.journal_seq = journal_seq);
 452
 453                 goto out;
 454         }
 455
 456         do {
 457                 new.counter = old.counter = v;
 458                 saturated = 0;
 459
 460                 /*
 461                  * Check this after reading bucket mark to guard against
 462                  * the allocator invalidating a bucket after we've already
 463                  * checked the gen
 464                  */
 465                 if (gen_after(old.gen, ptr->gen)) {
 466                         EBUG_ON(type != S_CACHED &&
 467                                 test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
 468                         return;
 469                 }
 470
 471                 EBUG_ON(type != S_CACHED &&
 472                         !may_make_unavailable &&
 473                         is_available_bucket(old) &&
 474                         test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
 475
 476                 if (type != S_CACHED &&
 477                     new.dirty_sectors == GC_MAX_SECTORS_USED &&
 478                     disk_sectors < 0)
 479                         saturated = -disk_sectors;
 480
 481                 if (type == S_CACHED)
 482                         saturated_add(ca, new.cached_sectors, disk_sectors,
 483                                       GC_MAX_SECTORS_USED);
 484                 else
 485                         saturated_add(ca, new.dirty_sectors, disk_sectors,
 486                                       GC_MAX_SECTORS_USED);
 487
 488                 if (!new.dirty_sectors &&
 489                     !new.cached_sectors) {
 490                         new.is_metadata = false;
 491
 492                         if (journal_seq) {
 493                                 new.wait_on_journal = true;
 494                                 new.journal_seq = journal_seq;
 495                         }
 496                 } else {
 497                         new.is_metadata = (type == S_META);
 498                 }
 499
 500                 new.had_metadata |= new.is_metadata;
 501         } while ((v = cmpxchg(&g->_mark.counter,
 502                               old.counter,
 503                               new.counter)) != old.counter);
 504
 505         bucket_stats_update(ca, old, new, may_make_unavailable, NULL);
 506
 507         if (saturated &&
 508             atomic_long_add_return(saturated,
 509                                    &ca->saturated_count) >=
 510             ca->free_inc.size << ca->bucket_bits) {
 511                 if (c->gc_thread) {
 512                         trace_bcache_gc_sectors_saturated(c);
 513                         wake_up_process(c->gc_thread);
 514                 }
 515         }
 516 out:
 517         stats->s[S_COMPRESSED][type]    += compressed_sectors;
 518         stats->s[S_UNCOMPRESSED][type]  += sectors;
 519 }
 520
 521 static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e,
 522                             s64 sectors, bool metadata,
 523                             bool may_make_unavailable,
 524                             struct bucket_stats_cache_set *stats,
 525                             bool gc_will_visit, u64 journal_seq)
 526 {
 527         const struct bch_extent_ptr *ptr;
 528         const union bch_extent_crc *crc;
 529         struct cache *ca;
 530         enum s_alloc type = metadata ? S_META : S_DIRTY;
 531
 532         BUG_ON(metadata && bkey_extent_is_cached(e.k));
 533         BUG_ON(!sectors);
 534
 535         rcu_read_lock();
 536         extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
 537                 trace_bcache_mark_bucket(ca, e.k, ptr, sectors, !ptr->cached);
 538
 539                 bch_mark_pointer(c, e, ca, crc, ptr, sectors,
 540                                  ptr->cached ? S_CACHED : type,
 541                                  may_make_unavailable,
 542                                  stats, gc_will_visit, journal_seq);
 543         }
 544         rcu_read_unlock();
 545 }
 546
 547 static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k,
 548                            s64 sectors, bool metadata,
 549                            bool may_make_unavailable,
 550                            struct bucket_stats_cache_set *stats,
 551                            bool gc_will_visit, u64 journal_seq)
 552 {
 553         switch (k.k->type) {
 554         case BCH_EXTENT:
 555         case BCH_EXTENT_CACHED:
 556                 bch_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
 557                                 may_make_unavailable, stats,
 558                                 gc_will_visit, journal_seq);
 559                 break;
 560         case BCH_RESERVATION: {
 561                 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
 562
 563                 stats->persistent_reserved += r.v->nr_replicas * sectors;
 564                 break;
 565         }
 566         }
 567 }
 568
 569 void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
 570                        s64 sectors, bool metadata,
 571                        struct bucket_stats_cache_set *stats)
 572 {
 573         __bch_mark_key(c, k, sectors, metadata, true, stats, false, 0);
 574 }
 575
 576 void bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
 577                      s64 sectors, bool metadata)
 578 {
 579         struct bucket_stats_cache_set stats = { 0 };
 580
 581         __bch_gc_mark_key(c, k, sectors, metadata, &stats);
 582
 583         preempt_disable();
 584         bucket_stats_add(this_cpu_ptr(c->bucket_stats_percpu), &stats);
 585         preempt_enable();
 586 }
 587
 588 void bch_mark_key(struct cache_set *c, struct bkey_s_c k,
 589                   s64 sectors, bool metadata, struct gc_pos gc_pos,
 590                   struct bucket_stats_cache_set *stats, u64 journal_seq)
 591 {
 592         /*
 593          * synchronization w.r.t. GC:
 594          *
 595          * Normally, bucket sector counts/marks are updated on the fly, as
 596          * references are added/removed from the btree, the lists of buckets the
 597          * allocator owns, other metadata buckets, etc.
 598          *
 599          * When GC is in progress and going to mark this reference, we do _not_
 600          * mark this reference here, to avoid double counting - GC will count it
 601          * when it gets to it.
 602          *
 603          * To know whether we should mark a given reference (GC either isn't
 604          * running, or has already marked references at this position) we
 605          * construct a total order for everything GC walks. Then, we can simply
 606          * compare the position of the reference we're marking - @gc_pos - with
 607          * GC's current position. If GC is going to mark this reference, GC's
 608          * current position will be less than @gc_pos; if GC's current position
 609          * is greater than @gc_pos GC has either already walked this position,
 610          * or isn't running.
 611          *
 612          * To avoid racing with GC's position changing, we have to deal with
 613          *  - GC's position being set to GC_POS_MIN when GC starts:
 614          *    bucket_stats_lock guards against this
 615          *  - GC's position overtaking @gc_pos: we guard against this with
 616          *    whatever lock protects the data structure the reference lives in
 617          *    (e.g. the btree node lock, or the relevant allocator lock).
 618          */
 619         lg_local_lock(&c->bucket_stats_lock);
 620         __bch_mark_key(c, k, sectors, metadata, false, stats,
 621                        gc_will_visit(c, gc_pos), journal_seq);
 622
 623         bch_fs_stats_verify(c);
 624         lg_local_unlock(&c->bucket_stats_lock);
 625 }
 626
 627 static u64 __recalc_sectors_available(struct cache_set *c)
 628 {
 629         return c->capacity - bch_fs_sectors_used(c);
 630 }
 631
 632 /* Used by gc when it's starting: */
 633 void bch_recalc_sectors_available(struct cache_set *c)
 634 {
 635         int cpu;
 636
 637         lg_global_lock(&c->bucket_stats_lock);
 638
 639         for_each_possible_cpu(cpu)
 640                 per_cpu_ptr(c->bucket_stats_percpu, cpu)->available_cache = 0;
 641
 642         atomic64_set(&c->sectors_available,
 643                      __recalc_sectors_available(c));
 644
 645         lg_global_unlock(&c->bucket_stats_lock);
 646 }
 647
 648 void bch_disk_reservation_put(struct cache_set *c,
 649                               struct disk_reservation *res)
 650 {
 651         if (res->sectors) {
 652                 lg_local_lock(&c->bucket_stats_lock);
 653                 this_cpu_sub(c->bucket_stats_percpu->online_reserved,
 654                              res->sectors);
 655
 656                 bch_fs_stats_verify(c);
 657                 lg_local_unlock(&c->bucket_stats_lock);
 658
 659                 res->sectors = 0;
 660         }
 661 }
 662
 663 #define SECTORS_CACHE   1024
 664
 665 int bch_disk_reservation_add(struct cache_set *c,
 666                              struct disk_reservation *res,
 667                              unsigned sectors, int flags)
 668 {
 669         struct bucket_stats_cache_set *stats;
 670         u64 old, new, v;
 671         s64 sectors_available;
 672         int ret;
 673
 674         sectors *= res->nr_replicas;
 675
 676         lg_local_lock(&c->bucket_stats_lock);
 677         stats = this_cpu_ptr(c->bucket_stats_percpu);
 678
 679         if (sectors >= stats->available_cache)
 680                 goto out;
 681
 682         v = atomic64_read(&c->sectors_available);
 683         do {
 684                 old = v;
 685                 if (old < sectors) {
 686                         lg_local_unlock(&c->bucket_stats_lock);
 687                         goto recalculate;
 688                 }
 689
 690                 new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
 691         } while ((v = atomic64_cmpxchg(&c->sectors_available,
 692                                        old, new)) != old);
 693
 694         stats->available_cache  += old - new;
 695 out:
 696         stats->available_cache  -= sectors;
 697         stats->online_reserved  += sectors;
 698         res->sectors            += sectors;
 699
 700         bch_fs_stats_verify(c);
 701         lg_local_unlock(&c->bucket_stats_lock);
 702         return 0;
 703
 704 recalculate:
 705         /*
 706          * GC recalculates sectors_available when it starts, so that hopefully
 707          * we don't normally end up blocking here:
 708          */
 709
 710         /*
 711          * Piss fuck, we can be called from extent_insert_fixup() with btree
 712          * locks held:
 713          */
 714
 715         if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
 716                 if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
 717                         down_read(&c->gc_lock);
 718                 else if (!down_read_trylock(&c->gc_lock))
 719                         return -EINTR;
 720         }
 721         lg_global_lock(&c->bucket_stats_lock);
 722
 723         sectors_available = __recalc_sectors_available(c);
 724
 725         if (sectors <= sectors_available ||
 726             (flags & BCH_DISK_RESERVATION_NOFAIL)) {
 727                 atomic64_set(&c->sectors_available,
 728                              max_t(s64, 0, sectors_available - sectors));
 729                 stats->online_reserved  += sectors;
 730                 res->sectors            += sectors;
 731                 ret = 0;
 732         } else {
 733                 atomic64_set(&c->sectors_available, sectors_available);
 734                 ret = -ENOSPC;
 735         }
 736
 737         bch_fs_stats_verify(c);
 738         lg_global_unlock(&c->bucket_stats_lock);
 739         if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
 740                 up_read(&c->gc_lock);
 741
 742         return ret;
 743 }
 744
 745 int bch_disk_reservation_get(struct cache_set *c,
 746                              struct disk_reservation *res,
 747                              unsigned sectors, int flags)
 748 {
 749         res->sectors = 0;
 750         res->gen = c->capacity_gen;
 751         res->nr_replicas = (flags & BCH_DISK_RESERVATION_METADATA)
 752                 ? c->opts.metadata_replicas
 753                 : c->opts.data_replicas;
 754
 755         return bch_disk_reservation_add(c, res, sectors, flags);
 756 }