/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 *
 * Bucket states:
 * - free bucket: mark == 0
 *   The bucket contains no data and will not be read
 *
 * - allocator bucket: owned_by_allocator == 1
 *   The bucket is on a free list, or it is an open bucket
 *
 * - cached bucket: owned_by_allocator == 0 &&
 *   dirty_sectors == 0 &&
 *   cached_sectors > 0
 *   The bucket contains data but may be safely discarded as there are
 *   enough replicas of the data on other cache devices, or it has been
 *   written back to the backing device
 *
 * - dirty bucket: owned_by_allocator == 0 &&
 *   dirty_sectors > 0
 *   The bucket contains data that we must not discard (either only copy,
 *   or one of the 'main copies' for data requiring multiple replicas)
 *
 * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
 *   This is a btree node, journal or gen/prio bucket
 *
 * Lifecycle:
 *
 * bucket invalidated => bucket on freelist => open bucket =>
 *     [dirty bucket =>] cached bucket => bucket invalidated => ...
 *
 * Note that cache promotion can skip the dirty bucket step, as data
 * is copied from a deeper tier to a shallower tier, onto a cached
 * bucket.
 * Note also that a cached bucket can spontaneously become dirty --
 * see the pathological transitions below.
 *
 * Only a traversal of the key space can determine whether a bucket is
 * truly dirty or cached.
 *
 * Transitions:
 *
 * - free => allocator: bucket was invalidated
 * - cached => allocator: bucket was invalidated
 *
 * - allocator => dirty: open bucket was filled up
 * - allocator => cached: open bucket was filled up
 * - allocator => metadata: metadata was allocated
 *
 * - dirty => cached: dirty sectors were copied to a deeper tier
 * - dirty => free: dirty sectors were overwritten or moved (copy gc)
 * - cached => free: cached sectors were overwritten
 *
 * - metadata => free: metadata was freed
 *
 * pathological:
 * - cached => dirty: a device was removed so formerly replicated data
 *   is no longer sufficiently replicated
 * - free => cached: cannot happen
 * - free => dirty: cannot happen
 * - free => metadata: cannot happen
 */

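/*
 * The predicates further down (is_meta_bucket(), is_dirty_bucket(),
 * is_cached_bucket()) express this classification in terms of the
 * struct bucket_mark fields, and bch_dev_usage_update() uses them to keep
 * the per-device bucket counts in sync whenever a mark changes.
 */
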
#include "bcache.h"
#include "alloc.h"
#include "btree_gc.h"
#include "buckets.h"
#include "error.h"

#include <linux/preempt.h>
#include <trace/events/bcache.h>

#ifdef DEBUG_BUCKETS

#define lg_local_lock		lg_global_lock
#define lg_local_unlock		lg_global_unlock

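/*
 * With DEBUG_BUCKETS, "local" acquisitions of usage_lock are promoted to
 * global ones, so that bch_fs_stats_verify() below always runs with all
 * writers excluded and sees a consistent sum of the per-cpu usage counters.
 */
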
static void bch_fs_stats_verify(struct bch_fs *c)
{
	struct bch_fs_usage stats =
		__bch_fs_usage_read(c);

	if ((s64) stats.sectors_dirty < 0)
		panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty);

	if ((s64) stats.sectors_cached < 0)
		panic("sectors_cached underflow: %lli\n", stats.sectors_cached);

	if ((s64) stats.sectors_meta < 0)
		panic("sectors_meta underflow: %lli\n", stats.sectors_meta);

	if ((s64) stats.sectors_persistent_reserved < 0)
		panic("sectors_persistent_reserved underflow: %lli\n",
		      stats.sectors_persistent_reserved);

	if ((s64) stats.sectors_online_reserved < 0)
		panic("sectors_online_reserved underflow: %lli\n",
		      stats.sectors_online_reserved);
}

#else

static void bch_fs_stats_verify(struct bch_fs *c) {}

#endif

/*
 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
 * wraparound:
 */
void bch_bucket_seq_cleanup(struct bch_fs *c)
{
	u16 last_seq_ondisk = c->journal.last_seq_ondisk;
	struct bch_dev *ca;
	struct bucket *g;
	struct bucket_mark m;
	unsigned i;

	for_each_member_device(ca, c, i)
		for_each_bucket(g, ca) {
			bucket_cmpxchg(g, m, ({
				if (!m.journal_seq_valid ||
				    bucket_needs_journal_commit(m, last_seq_ondisk))
					break;

				m.journal_seq_valid = 0;
			}));
		}
}

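/*
 * The usage structs are accumulated as flat arrays of u64 counters:
 * bch_usage_add() sums them field by field, and bch_usage_read_raw() uses it
 * to add up the per-cpu copies.
 */
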
#define bch_usage_add(_acc, _stats)					\
do {									\
	typeof(_acc) _a = (_acc), _s = (_stats);			\
	unsigned i;							\
									\
	for (i = 0; i < sizeof(*_a) / sizeof(u64); i++)			\
		((u64 *) (_a))[i] += ((u64 *) (_s))[i];			\
} while (0)

#define bch_usage_read_raw(_stats)					\
({									\
	typeof(*this_cpu_ptr(_stats)) _acc = { 0 };			\
	int cpu;							\
									\
	for_each_possible_cpu(cpu)					\
		bch_usage_add(&_acc, per_cpu_ptr((_stats), cpu));	\
									\
	_acc;								\
})

#define bch_usage_read_cached(_c, _cached, _uncached)			\
({									\
	typeof(_cached) _ret;						\
	unsigned _seq;							\
									\
	do {								\
		_seq = read_seqcount_begin(&(_c)->gc_pos_lock);		\
		_ret = (_c)->gc_pos.phase == GC_PHASE_DONE		\
			? bch_usage_read_raw(_uncached)			\
			: (_cached);					\
	} while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq));	\
									\
	_ret;								\
})

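/*
 * bch_usage_read_cached() returns the freshly summed per-cpu counters once GC
 * has finished (GC_PHASE_DONE), and the cached summary while GC is still
 * running; the read retries under the gc_pos seqlock so a phase change midway
 * through is not missed.
 */
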
struct bch_dev_usage __bch_dev_usage_read(struct bch_dev *ca)
{
	return bch_usage_read_raw(ca->usage_percpu);
}

struct bch_dev_usage bch_dev_usage_read(struct bch_dev *ca)
{
	return bch_usage_read_cached(ca->fs,
				     ca->usage_cached,
				     ca->usage_percpu);
}

struct bch_fs_usage
__bch_fs_usage_read(struct bch_fs *c)
{
	return bch_usage_read_raw(c->usage_percpu);
}

struct bch_fs_usage
bch_fs_usage_read(struct bch_fs *c)
{
	return bch_usage_read_cached(c,
				     c->usage_cached,
				     c->usage_percpu);
}

static inline int is_meta_bucket(struct bucket_mark m)
{
	return m.data_type != BUCKET_DATA;
}

static inline int is_dirty_bucket(struct bucket_mark m)
{
	return m.data_type == BUCKET_DATA && !!m.dirty_sectors;
}

static inline int is_cached_bucket(struct bucket_mark m)
{
	return m.data_type == BUCKET_DATA &&
		!m.dirty_sectors && !!m.cached_sectors;
}

static inline enum s_alloc bucket_type(struct bucket_mark m)
{
	return is_meta_bucket(m) ? S_META : S_DIRTY;
}

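/*
 * bucket_became_unavailable() flags a mark update that takes a bucket from
 * available (reusable by the allocator) to unavailable after GC has completed
 * its pass; callers BUG_ON() this unless they pass may_make_unavailable.
 */
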
static bool bucket_became_unavailable(struct bch_fs *c,
				      struct bucket_mark old,
				      struct bucket_mark new)
{
	return is_available_bucket(old) &&
	       !is_available_bucket(new) &&
	       c && c->gc_pos.phase == GC_PHASE_DONE;
}

void bch_fs_usage_apply(struct bch_fs *c,
			struct bch_fs_usage *stats,
			struct disk_reservation *disk_res,
			struct gc_pos gc_pos)
{
	s64 added =
		stats->s[S_COMPRESSED][S_META] +
		stats->s[S_COMPRESSED][S_DIRTY] +
		stats->persistent_reserved +
		stats->online_reserved;

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));

	if (added > 0) {
		disk_res->sectors	-= added;
		stats->online_reserved	-= added;
	}

	lg_local_lock(&c->usage_lock);
	/* online_reserved not subject to gc: */
	this_cpu_ptr(c->usage_percpu)->online_reserved +=
		stats->online_reserved;
	stats->online_reserved = 0;

	if (!gc_will_visit(c, gc_pos))
		bch_usage_add(this_cpu_ptr(c->usage_percpu), stats);

	bch_fs_stats_verify(c);
	lg_local_unlock(&c->usage_lock);

	memset(stats, 0, sizeof(*stats));
}

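/*
 * bch_fs_usage_update() folds a single bucket's mark change into the
 * filesystem-wide counters: cached sectors get their own slot, while dirty
 * sectors are accounted under the bucket's type (S_DIRTY or S_META).
 */
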
static void bch_fs_usage_update(struct bch_fs_usage *fs_usage,
				struct bucket_mark old, struct bucket_mark new)
{
	fs_usage->s[S_COMPRESSED][S_CACHED] +=
		(int) new.cached_sectors - (int) old.cached_sectors;
	fs_usage->s[S_COMPRESSED][bucket_type(old)] -=
		old.dirty_sectors;
	fs_usage->s[S_COMPRESSED][bucket_type(new)] +=
		new.dirty_sectors;
}

static void bch_dev_usage_update(struct bch_dev *ca,
				 struct bucket_mark old, struct bucket_mark new)
{
	struct bch_fs *c = ca->fs;
	struct bch_dev_usage *dev_usage;

	bch_fs_inconsistent_on(old.data_type && new.data_type &&
			       old.data_type != new.data_type, c,
			       "different types of metadata in same bucket: %u, %u",
			       old.data_type, new.data_type);

	preempt_disable();
	dev_usage = this_cpu_ptr(ca->usage_percpu);

	dev_usage->sectors[S_CACHED] +=
		(int) new.cached_sectors - (int) old.cached_sectors;

	dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
	dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;

	dev_usage->buckets_alloc +=
		(int) new.owned_by_allocator - (int) old.owned_by_allocator;

	dev_usage->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old);
	dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
	dev_usage->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old);
	preempt_enable();

	if (!is_available_bucket(old) && is_available_bucket(new))
		bch_wake_allocator(ca);
}

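/*
 * bucket_data_cmpxchg() is bucket_cmpxchg() plus bookkeeping: after updating
 * the mark it feeds the old and new marks to bch_dev_usage_update() and
 * evaluates to the old mark.
 */
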
#define bucket_data_cmpxchg(ca, g, new, expr)			\
({								\
	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);\
								\
	bch_dev_usage_update(ca, _old, new);			\
	_old;							\
})

void bch_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
{
	struct bch_fs_usage stats = { 0 };
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(ca, g, new, ({
		new.owned_by_allocator	= 1;
		new.had_metadata	= 0;
		new.data_type		= 0;
		new.cached_sectors	= 0;
		new.dirty_sectors	= 0;
		new.gen++;
	}));

	/* XXX: we're not actually updating fs usage's cached sectors... */
	bch_fs_usage_update(&stats, old, new);

	if (!old.owned_by_allocator && old.cached_sectors)
		trace_bcache_invalidate(ca, g - ca->buckets,
					old.cached_sectors);
}

void bch_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
{
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(ca, g, new, ({
		new.owned_by_allocator	= 0;
		new.data_type		= 0;
		new.cached_sectors	= 0;
		new.dirty_sectors	= 0;
	}));

	BUG_ON(bucket_became_unavailable(ca->fs, old, new));
}

void bch_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
			   bool owned_by_allocator)
{
	struct bucket_mark new;

	bucket_data_cmpxchg(ca, g, new, ({
		new.owned_by_allocator = owned_by_allocator;
	}));
}

void bch_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
			      enum bucket_data_type type,
			      bool may_make_unavailable)
{
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(ca, g, new, ({
		new.data_type	 = type;
		new.had_metadata = 1;
	}));

	BUG_ON(old.cached_sectors);
	BUG_ON(old.dirty_sectors);
	BUG_ON(!may_make_unavailable &&
	       bucket_became_unavailable(ca->fs, old, new));
}

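/*
 * saturated_add() clamps a bucket's sector count at @max (GC_MAX_SECTORS_USED
 * at the callers) instead of letting it overflow, and emits a tracepoint when
 * a bucket saturates; bch_mark_pointer() below tallies saturated sectors and
 * wakes the GC thread once the total grows large enough.
 */
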
#define saturated_add(ca, dst, src, max)			\
do {								\
	BUG_ON((int) (dst) + (src) < 0);			\
	if ((dst) == (max))					\
		;						\
	else if ((dst) + (src) <= (max))			\
		dst += (src);					\
	else {							\
		dst = (max);					\
		trace_bcache_sectors_saturated(ca);		\
	}							\
} while (0)

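/*
 * Compression-aware sector accounting is currently disabled (see the comment
 * below): the live __disk_sectors()/__compressed_sectors() stubs just return
 * the logical sector count unchanged.
 */
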
#if 0
/* Reverting this until the copygc + compression issue is fixed: */

static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
	return crc_compression_type(crc)
		? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc)
		: sectors;
}

static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
	return crc_compression_type(crc)
		? min_t(unsigned, crc_compressed_size(crc), sectors)
		: sectors;
}
#else
static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
	return sectors;
}

static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
	return sectors;
}
#endif

/*
 * Checking against gc's position has to be done here, inside the cmpxchg()
 * loop, to avoid racing with the start of gc clearing all the marks - GC does
 * that with the gc pos seqlock held.
 */
static void bch_mark_pointer(struct bch_fs *c,
			     struct bkey_s_c_extent e,
			     const union bch_extent_crc *crc,
			     const struct bch_extent_ptr *ptr,
			     s64 sectors, enum s_alloc type,
			     bool may_make_unavailable,
			     struct bch_fs_usage *stats,
			     bool gc_will_visit, u64 journal_seq)
{
	struct bucket_mark old, new;
	unsigned saturated;
	struct bch_dev *ca = c->devs[ptr->dev];
	struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
	unsigned old_sectors, new_sectors;
	int disk_sectors, compressed_sectors;

	if (sectors > 0) {
		old_sectors = 0;
		new_sectors = sectors;
	} else {
		old_sectors = e.k->size;
		new_sectors = e.k->size + sectors;
	}

	disk_sectors = -__disk_sectors(crc, old_sectors)
		+ __disk_sectors(crc, new_sectors);
	compressed_sectors = -__compressed_sectors(crc, old_sectors)
		+ __compressed_sectors(crc, new_sectors);

	if (gc_will_visit) {
		if (journal_seq)
			bucket_cmpxchg(g, new, new.journal_seq = journal_seq);

		goto out;
	}

	old = bucket_data_cmpxchg(ca, g, new, ({
		saturated = 0;

		/*
		 * Check this after reading bucket mark to guard against
		 * the allocator invalidating a bucket after we've already
		 * checked the gen
		 */
		if (gen_after(new.gen, ptr->gen)) {
			EBUG_ON(type != S_CACHED &&
				test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
			return;
		}

		EBUG_ON(type != S_CACHED &&
			!may_make_unavailable &&
			is_available_bucket(new) &&
			test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));

		if (type != S_CACHED &&
		    new.dirty_sectors == GC_MAX_SECTORS_USED &&
		    disk_sectors < 0)
			saturated = -disk_sectors;

		if (type == S_CACHED)
			saturated_add(ca, new.cached_sectors, disk_sectors,
				      GC_MAX_SECTORS_USED);
		else
			saturated_add(ca, new.dirty_sectors, disk_sectors,
				      GC_MAX_SECTORS_USED);

		if (!new.dirty_sectors &&
		    !new.cached_sectors) {
			new.data_type = 0;

			if (journal_seq) {
				new.journal_seq_valid = 1;
				new.journal_seq = journal_seq;
			}
		} else {
			new.data_type = type == S_META
				? BUCKET_BTREE : BUCKET_DATA;
		}

		new.had_metadata |= is_meta_bucket(new);
	}));

	BUG_ON(!may_make_unavailable &&
	       bucket_became_unavailable(c, old, new));

	if (saturated &&
	    atomic_long_add_return(saturated,
				   &ca->saturated_count) >=
	    ca->free_inc.size << ca->bucket_bits) {
		if (c->gc_thread) {
			trace_bcache_gc_sectors_saturated(c);
			wake_up_process(c->gc_thread);
		}
	}
out:
	stats->s[S_COMPRESSED][type]	+= compressed_sectors;
	stats->s[S_UNCOMPRESSED][type]	+= sectors;
}

static void bch_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
			    s64 sectors, bool metadata,
			    bool may_make_unavailable,
			    struct bch_fs_usage *stats,
			    bool gc_will_visit, u64 journal_seq)
{
	const struct bch_extent_ptr *ptr;
	const union bch_extent_crc *crc;
	enum s_alloc type = metadata ? S_META : S_DIRTY;

	BUG_ON(metadata && bkey_extent_is_cached(e.k));

	extent_for_each_ptr_crc(e, ptr, crc)
		bch_mark_pointer(c, e, crc, ptr, sectors,
				 ptr->cached ? S_CACHED : type,
				 may_make_unavailable,
				 stats, gc_will_visit, journal_seq);
}

static void __bch_mark_key(struct bch_fs *c, struct bkey_s_c k,
			   s64 sectors, bool metadata,
			   bool may_make_unavailable,
			   struct bch_fs_usage *stats,
			   bool gc_will_visit, u64 journal_seq)
{
	switch (k.k->type) {
	case BCH_EXTENT:
	case BCH_EXTENT_CACHED:
		bch_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
				may_make_unavailable, stats,
				gc_will_visit, journal_seq);
		break;
	case BCH_RESERVATION: {
		struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);

		stats->persistent_reserved += r.v->nr_replicas * sectors;
		break;
	}
	}
}

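/*
 * GC-side marking: __bch_gc_mark_key() marks a key with may_make_unavailable
 * set and without the gc_will_visit check, since it is GC itself doing the
 * counting.
 */
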
void __bch_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
		       s64 sectors, bool metadata,
		       struct bch_fs_usage *stats)
{
	__bch_mark_key(c, k, sectors, metadata, true, stats, false, 0);
}

void bch_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
		     s64 sectors, bool metadata)
{
	struct bch_fs_usage stats = { 0 };

	__bch_gc_mark_key(c, k, sectors, metadata, &stats);

	preempt_disable();
	bch_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
	preempt_enable();
}

void bch_mark_key(struct bch_fs *c, struct bkey_s_c k,
		  s64 sectors, bool metadata, struct gc_pos gc_pos,
		  struct bch_fs_usage *stats, u64 journal_seq)
{
	/*
	 * synchronization w.r.t. GC:
	 *
	 * Normally, bucket sector counts/marks are updated on the fly, as
	 * references are added/removed from the btree, the lists of buckets the
	 * allocator owns, other metadata buckets, etc.
	 *
	 * When GC is in progress and going to mark this reference, we do _not_
	 * mark this reference here, to avoid double counting - GC will count it
	 * when it gets to it.
	 *
	 * To know whether we should mark a given reference (GC either isn't
	 * running, or has already marked references at this position) we
	 * construct a total order for everything GC walks. Then, we can simply
	 * compare the position of the reference we're marking - @gc_pos - with
	 * GC's current position. If GC is going to mark this reference, GC's
	 * current position will be less than @gc_pos; if GC's current position
	 * is greater than @gc_pos GC has either already walked this position,
	 * or isn't running.
	 *
	 * To avoid racing with GC's position changing, we have to deal with
	 *  - GC's position being set to GC_POS_MIN when GC starts:
	 *    usage_lock guards against this
	 *  - GC's position overtaking @gc_pos: we guard against this with
	 *    whatever lock protects the data structure the reference lives in
	 *    (e.g. the btree node lock, or the relevant allocator lock).
	 */
	lg_local_lock(&c->usage_lock);
	__bch_mark_key(c, k, sectors, metadata, false, stats,
		       gc_will_visit(c, gc_pos), journal_seq);

	bch_fs_stats_verify(c);
	lg_local_unlock(&c->usage_lock);
}

static u64 __recalc_sectors_available(struct bch_fs *c)
{
	return c->capacity - bch_fs_sectors_used(c);
}

/* Used by gc when it's starting: */
void bch_recalc_sectors_available(struct bch_fs *c)
{
	int cpu;

	lg_global_lock(&c->usage_lock);

	for_each_possible_cpu(cpu)
		per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;

	atomic64_set(&c->sectors_available,
		     __recalc_sectors_available(c));

	lg_global_unlock(&c->usage_lock);
}

void bch_disk_reservation_put(struct bch_fs *c,
			      struct disk_reservation *res)
{
	lg_local_lock(&c->usage_lock);
	this_cpu_sub(c->usage_percpu->online_reserved,
		     res->sectors);

	bch_fs_stats_verify(c);
	lg_local_unlock(&c->usage_lock);

	res->sectors = 0;
}

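/*
 * Each cpu keeps a small cache of reserved sectors (stats->available_cache)
 * carved out of c->sectors_available; SECTORS_CACHE is the extra amount pulled
 * in per refill, so most reservations never touch the shared atomic counter.
 */
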
#define SECTORS_CACHE	1024

int bch_disk_reservation_add(struct bch_fs *c,
			     struct disk_reservation *res,
			     unsigned sectors, int flags)
{
	struct bch_fs_usage *stats;
	u64 old, new, v;
	s64 sectors_available;
	int ret;

	sectors *= res->nr_replicas;

	lg_local_lock(&c->usage_lock);
	stats = this_cpu_ptr(c->usage_percpu);

	if (sectors >= stats->available_cache) {
		v = atomic64_read(&c->sectors_available);
		do {
			old = v;
			if (old < sectors) {
				lg_local_unlock(&c->usage_lock);
				goto recalculate;
			}

			new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
		} while ((v = atomic64_cmpxchg(&c->sectors_available,
					       old, new)) != old);

		stats->available_cache += old - new;
	}

	stats->available_cache	-= sectors;
	stats->online_reserved	+= sectors;
	res->sectors		+= sectors;

	bch_fs_stats_verify(c);
	lg_local_unlock(&c->usage_lock);
	return 0;

recalculate:
	/*
	 * GC recalculates sectors_available when it starts, so that hopefully
	 * we don't normally end up blocking here:
	 */

	/*
	 * Piss fuck, we can be called from extent_insert_fixup() with btree
	 * locks held:
	 */
	if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
		if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
			down_read(&c->gc_lock);
		else if (!down_read_trylock(&c->gc_lock))
			return -EINTR;
	}
	lg_global_lock(&c->usage_lock);

	sectors_available = __recalc_sectors_available(c);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		stats->online_reserved	+= sectors;
		res->sectors		+= sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -ENOSPC;
	}

	bch_fs_stats_verify(c);
	lg_global_unlock(&c->usage_lock);
	if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
		up_read(&c->gc_lock);

	return ret;
}

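/*
 * bch_disk_reservation_get() initializes a reservation - picking the replica
 * count from the metadata or data options - and then reserves the sectors via
 * bch_disk_reservation_add().
 */
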
int bch_disk_reservation_get(struct bch_fs *c,
			     struct disk_reservation *res,
			     unsigned sectors, int flags)
{
	res->sectors = 0;
	res->gen = c->capacity_gen;
	res->nr_replicas = (flags & BCH_DISK_RESERVATION_METADATA)
		? c->opts.metadata_replicas
		: c->opts.data_replicas;

	return bch_disk_reservation_add(c, res, sectors, flags);
}