1 /*
2  * Code for manipulating bucket marks for garbage collection.
3  *
4  * Copyright 2014 Datera, Inc.
5  *
6  * Bucket states:
7  * - free bucket: mark == 0
8  *   The bucket contains no data and will not be read
9  *
10  * - allocator bucket: owned_by_allocator == 1
11  *   The bucket is on a free list, or it is an open bucket
12  *
13  * - cached bucket: owned_by_allocator == 0 &&
14  *                  dirty_sectors == 0 &&
15  *                  cached_sectors > 0
16  *   The bucket contains data but may be safely discarded as there are
17  *   enough replicas of the data on other cache devices, or it has been
18  *   written back to the backing device
19  *
20  * - dirty bucket: owned_by_allocator == 0 &&
21  *                 dirty_sectors > 0
22  * The bucket contains data that we must not discard (either the only copy,
23  *   or one of the 'main copies' for data requiring multiple replicas)
24  *
25  * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
26  *   This is a btree node, journal or gen/prio bucket
27  *
28  * Lifecycle:
29  *
30  * bucket invalidated => bucket on freelist => open bucket =>
31  *     [dirty bucket =>] cached bucket => bucket invalidated => ...
32  *
33  * Note that cache promotion can skip the dirty bucket step, as data
34  * is copied from a deeper tier to a shallower tier, onto a cached
35  * bucket.
36  * Note also that a cached bucket can spontaneously become dirty --
37  * see below.
38  *
39  * Only a traversal of the key space can determine whether a bucket is
40  * truly dirty or cached.
41  *
42  * Transitions:
43  *
44  * - free => allocator: bucket was invalidated
45  * - cached => allocator: bucket was invalidated
46  *
47  * - allocator => dirty: open bucket was filled up
48  * - allocator => cached: open bucket was filled up
49  * - allocator => metadata: metadata was allocated
50  *
51  * - dirty => cached: dirty sectors were copied to a deeper tier
52  * - dirty => free: dirty sectors were overwritten or moved (copy gc)
53  * - cached => free: cached sectors were overwritten
54  *
55  * - metadata => free: metadata was freed
56  *
57  * Oddities:
58  * - cached => dirty: a device was removed so formerly replicated data
59  *                    is no longer sufficiently replicated
60  * - free => cached: cannot happen
61  * - free => dirty: cannot happen
62  * - free => metadata: cannot happen
63  */
64
65 #include "bcachefs.h"
66 #include "alloc_background.h"
67 #include "bset.h"
68 #include "btree_gc.h"
69 #include "btree_update.h"
70 #include "buckets.h"
71 #include "ec.h"
72 #include "error.h"
73 #include "movinggc.h"
74 #include "replicas.h"
75
76 #include <linux/preempt.h>
77 #include <trace/events/bcachefs.h>
78
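/*
 * Illustrative sketch, not part of the upstream file: one way to classify a
 * bucket into the states described in the comment at the top of this file,
 * using only fields of struct bucket_mark that are used elsewhere in this
 * file.  The enum and function names are made up for the example.
 */
enum example_bucket_state {
	EXAMPLE_BUCKET_FREE,		/* mark == 0 */
	EXAMPLE_BUCKET_ALLOCATOR,	/* on a freelist, or an open bucket */
	EXAMPLE_BUCKET_METADATA,	/* btree node, journal, superblock */
	EXAMPLE_BUCKET_DIRTY,		/* data we must not discard */
	EXAMPLE_BUCKET_CACHED,		/* data we may safely discard */
};

static inline enum example_bucket_state
example_bucket_state(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return EXAMPLE_BUCKET_ALLOCATOR;
	if (m.data_type == BCH_DATA_SB ||
	    m.data_type == BCH_DATA_JOURNAL ||
	    m.data_type == BCH_DATA_BTREE)
		return EXAMPLE_BUCKET_METADATA;
	if (m.dirty_sectors)
		return EXAMPLE_BUCKET_DIRTY;
	if (m.cached_sectors)
		return EXAMPLE_BUCKET_CACHED;
	return EXAMPLE_BUCKET_FREE;
}
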
79 /*
80  * Clear journal_seq_valid for buckets for which it's not needed, to prevent
81  * wraparound:
82  */
83 void bch2_bucket_seq_cleanup(struct bch_fs *c)
84 {
85         u64 journal_seq = atomic64_read(&c->journal.seq);
86         u16 last_seq_ondisk = c->journal.last_seq_ondisk;
87         struct bch_dev *ca;
88         struct bucket_array *buckets;
89         struct bucket *g;
90         struct bucket_mark m;
91         unsigned i;
92
93         if (journal_seq - c->last_bucket_seq_cleanup <
94             (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
95                 return;
96
97         c->last_bucket_seq_cleanup = journal_seq;
98
99         for_each_member_device(ca, c, i) {
100                 down_read(&ca->bucket_lock);
101                 buckets = bucket_array(ca);
102
103                 for_each_bucket(g, buckets) {
104                         bucket_cmpxchg(g, m, ({
105                                 if (!m.journal_seq_valid ||
106                                     bucket_needs_journal_commit(m, last_seq_ondisk))
107                                         break;
108
109                                 m.journal_seq_valid = 0;
110                         }));
111                 }
112                 up_read(&ca->bucket_lock);
113         }
114 }
115
116 #define bch2_usage_read_raw(_stats)                                     \
117 ({                                                                      \
118         typeof(*this_cpu_ptr(_stats)) _acc;                             \
119                                                                         \
120         memset(&_acc, 0, sizeof(_acc));                                 \
121         acc_u64s_percpu((u64 *) &_acc,                                  \
122                         (u64 __percpu *) _stats,                        \
123                         sizeof(_acc) / sizeof(u64));                    \
124                                                                         \
125         _acc;                                                           \
126 })
127
128 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
129 {
130         return bch2_usage_read_raw(ca->usage[0]);
131 }
132
133 struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
134 {
135         struct bch_fs_usage *ret;
136         unsigned nr = READ_ONCE(c->replicas.nr);
137 retry:
138         ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
139         if (unlikely(!ret))
140                 return NULL;
141
142         percpu_down_read_preempt_disable(&c->mark_lock);
143
144         if (unlikely(nr < c->replicas.nr)) {
145                 nr = c->replicas.nr;
146                 percpu_up_read_preempt_enable(&c->mark_lock);
147                 kfree(ret);
148                 goto retry;
149         }
150
151         acc_u64s_percpu((u64 *) ret,
152                         (u64 __percpu *) c->usage[0],
153                         sizeof(*ret) / sizeof(u64) + nr);
154         percpu_up_read_preempt_enable(&c->mark_lock);
155
156         return ret;
157 }
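
/*
 * Illustrative usage sketch, not part of the upstream file: the struct
 * returned by bch2_fs_usage_read() is heap allocated, so the caller owns it
 * and must kfree() it.  The summary fields printed here are the ones updated
 * elsewhere in this file; the function name and message format are made up.
 */
static inline void example_log_fs_usage(struct bch_fs *c)
{
	struct bch_fs_usage *u = bch2_fs_usage_read(c);

	if (!u)
		return;		/* allocation failed */

	pr_info("data %llu reserved %llu online_reserved %llu\n",
		(unsigned long long) u->s.data,
		(unsigned long long) u->s.reserved,
		(unsigned long long) u->s.online_reserved);

	kfree(u);
}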
158
159 #define RESERVE_FACTOR  6
160
161 static u64 reserve_factor(u64 r)
162 {
163         return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
164 }
165
166 static u64 avail_factor(u64 r)
167 {
168         return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
169 }
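
/*
 * Worked example (illustrative): with RESERVE_FACTOR == 6 these two helpers
 * convert between "sectors of data" and "sectors of raw space including a
 * ~1/64 slack reservation".  reserve_factor(128) = 128 + 128/64 = 130, and
 * avail_factor() is the approximate inverse: avail_factor(130) =
 * (130 * 64) / 65 = 128.
 */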
170
171 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
172 {
173         return min(fs_usage.s.hidden +
174                    fs_usage.s.data +
175                    reserve_factor(fs_usage.s.reserved +
176                                   fs_usage.s.online_reserved),
177                    c->capacity);
178 }
179
180 struct bch_fs_usage_short
181 bch2_fs_usage_read_short(struct bch_fs *c)
182 {
183         struct bch_fs_usage_summarized usage =
184                 bch2_usage_read_raw(&c->usage[0]->s);
185         struct bch_fs_usage_short ret;
186
187         ret.capacity    = READ_ONCE(c->capacity) - usage.hidden;
188         ret.used        = min(ret.capacity, usage.data +
189                               reserve_factor(usage.reserved +
190                                              usage.online_reserved));
191         ret.nr_inodes   = usage.nr_inodes;
192
193         return ret;
194 }
195
196 static inline int is_unavailable_bucket(struct bucket_mark m)
197 {
198         return !is_available_bucket(m);
199 }
200
201 static inline int is_fragmented_bucket(struct bucket_mark m,
202                                        struct bch_dev *ca)
203 {
204         if (!m.owned_by_allocator &&
205             m.data_type == BCH_DATA_USER &&
206             bucket_sectors_used(m))
207                 return max_t(int, 0, (int) ca->mi.bucket_size -
208                              bucket_sectors_used(m));
209         return 0;
210 }
211
212 static inline enum bch_data_type bucket_type(struct bucket_mark m)
213 {
214         return m.cached_sectors && !m.dirty_sectors
215                 ? BCH_DATA_CACHED
216                 : m.data_type;
217 }
218
219 static bool bucket_became_unavailable(struct bucket_mark old,
220                                       struct bucket_mark new)
221 {
222         return is_available_bucket(old) &&
223                !is_available_bucket(new);
224 }
225
226 void bch2_fs_usage_apply(struct bch_fs *c,
227                          struct bch_fs_usage *fs_usage,
228                          struct disk_reservation *disk_res,
229                          struct gc_pos gc_pos)
230 {
231         s64 added = fs_usage->s.data + fs_usage->s.reserved;
232         s64 should_not_have_added;
233
234         percpu_rwsem_assert_held(&c->mark_lock);
235
236         /*
237          * Not allowed to reduce sectors_available except by getting a
238          * reservation:
239          */
240         should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
241         if (WARN_ONCE(should_not_have_added > 0,
242                       "disk usage increased without a reservation")) {
243                 atomic64_sub(should_not_have_added, &c->sectors_available);
244                 added -= should_not_have_added;
245         }
246
247         if (added > 0) {
248                 disk_res->sectors               -= added;
249                 fs_usage->s.online_reserved     -= added;
250         }
251
252         acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
253                  (u64 *) fs_usage,
254                  sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
255
256         if (gc_visited(c, gc_pos)) {
257                 BUG_ON(!c->usage[1]);
258                 acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
259                          (u64 *) fs_usage,
260                          sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
261         }
262 }
263
264 static inline void account_bucket(struct bch_fs_usage *fs_usage,
265                                   struct bch_dev_usage *dev_usage,
266                                   enum bch_data_type type,
267                                   int nr, s64 size)
268 {
269         if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
270                 fs_usage->s.hidden      += size;
271
272         dev_usage->buckets[type]        += nr;
273 }
274
275 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
276                                   struct bch_fs_usage *fs_usage,
277                                   struct bucket_mark old, struct bucket_mark new,
278                                   bool gc)
279 {
280         struct bch_dev_usage *dev_usage;
281
282         percpu_rwsem_assert_held(&c->mark_lock);
283
284         bch2_fs_inconsistent_on(old.data_type && new.data_type &&
285                                 old.data_type != new.data_type, c,
286                 "different types of data in same bucket: %s, %s",
287                 bch2_data_types[old.data_type],
288                 bch2_data_types[new.data_type]);
289
290         dev_usage = this_cpu_ptr(ca->usage[gc]);
291
292         if (bucket_type(old))
293                 account_bucket(fs_usage, dev_usage, bucket_type(old),
294                                -1, -ca->mi.bucket_size);
295
296         if (bucket_type(new))
297                 account_bucket(fs_usage, dev_usage, bucket_type(new),
298                                1, ca->mi.bucket_size);
299
300         dev_usage->buckets_alloc +=
301                 (int) new.owned_by_allocator - (int) old.owned_by_allocator;
302         dev_usage->buckets_ec +=
303                 (int) new.stripe - (int) old.stripe;
304         dev_usage->buckets_unavailable +=
305                 is_unavailable_bucket(new) - is_unavailable_bucket(old);
306
307         dev_usage->sectors[old.data_type] -= old.dirty_sectors;
308         dev_usage->sectors[new.data_type] += new.dirty_sectors;
309         dev_usage->sectors[BCH_DATA_CACHED] +=
310                 (int) new.cached_sectors - (int) old.cached_sectors;
311         dev_usage->sectors_fragmented +=
312                 is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
313
314         if (!is_available_bucket(old) && is_available_bucket(new))
315                 bch2_wake_allocator(ca);
316 }
317
318 void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
319 {
320         struct bucket_mark old = { .v.counter = 0 };
321         struct bch_fs_usage *fs_usage;
322         struct bucket_array *buckets;
323         struct bucket *g;
324
325         percpu_down_read_preempt_disable(&c->mark_lock);
326         fs_usage = this_cpu_ptr(c->usage[0]);
327         buckets = bucket_array(ca);
328
329         for_each_bucket(g, buckets)
330                 if (g->mark.data_type)
331                         bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
332         percpu_up_read_preempt_enable(&c->mark_lock);
333 }
334
335 #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr)      \
336 ({                                                              \
337         struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
338                                                                 \
339         bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc);  \
340         _old;                                                   \
341 })
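
/*
 * Illustrative sketch, not part of the upstream file: the lockless update
 * pattern behind bucket_cmpxchg()/bucket_data_cmpxchg().  struct bucket_mark
 * packs into a single u64, so an update reads the mark, modifies a local
 * copy and publishes it with a compare-and-swap, retrying on contention -
 * bch2_mark_pointer() below open codes the same loop.  The function name and
 * the particular modification are made up for the example.
 */
static inline struct bucket_mark example_bucket_set_gen(struct bucket *g,
							u8 gen)
{
	struct bucket_mark old, new;
	u64 v = atomic64_read(&g->_mark.v);

	do {
		new.v.counter = old.v.counter = v;
		new.gen = gen;
	} while ((v = atomic64_cmpxchg(&g->_mark.v,
				       old.v.counter,
				       new.v.counter)) != old.v.counter);

	return old;
}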
342
343 static inline void update_replicas(struct bch_fs *c,
344                                    struct bch_fs_usage *fs_usage,
345                                    struct bch_replicas_entry *r,
346                                    s64 sectors)
347 {
348         int idx = bch2_replicas_entry_idx(c, r);
349
350         BUG_ON(idx < 0);
351         BUG_ON(!sectors);
352
353         if (r->data_type == BCH_DATA_CACHED)
354                 fs_usage->s.cached      += sectors;
355         else
356                 fs_usage->s.data        += sectors;
357         fs_usage->data[idx]             += sectors;
358 }
359
360 static inline void update_cached_sectors(struct bch_fs *c,
361                                          struct bch_fs_usage *fs_usage,
362                                          unsigned dev, s64 sectors)
363 {
364         struct bch_replicas_padded r;
365
366         r.e.data_type   = BCH_DATA_CACHED;
367         r.e.nr_devs     = 1;
368         r.e.nr_required = 1;
369         r.e.devs[0]     = dev;
370
371         update_replicas(c, fs_usage, &r.e, sectors);
372 }
373
374 static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
375                                      size_t b, struct bucket_mark *old,
376                                      bool gc)
377 {
378         struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
379         struct bucket *g = __bucket(ca, b, gc);
380         struct bucket_mark new;
381
382         *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
383                 BUG_ON(!is_available_bucket(new));
384
385                 new.owned_by_allocator  = 1;
386                 new.data_type           = 0;
387                 new.cached_sectors      = 0;
388                 new.dirty_sectors       = 0;
389                 new.gen++;
390         }));
391
392         if (old->cached_sectors)
393                 update_cached_sectors(c, fs_usage, ca->dev_idx,
394                                       -old->cached_sectors);
395 }
396
397 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
398                             size_t b, struct bucket_mark *old)
399 {
400         percpu_rwsem_assert_held(&c->mark_lock);
401
402         __bch2_invalidate_bucket(c, ca, b, old, false);
403
404         if (!old->owned_by_allocator && old->cached_sectors)
405                 trace_invalidate(ca, bucket_to_sector(ca, b),
406                                  old->cached_sectors);
407 }
408
409 static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
410                                      size_t b, bool owned_by_allocator,
411                                      bool gc)
412 {
413         struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
414         struct bucket *g = __bucket(ca, b, gc);
415         struct bucket_mark old, new;
416
417         old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
418                 new.owned_by_allocator  = owned_by_allocator;
419         }));
420
421         BUG_ON(!gc &&
422                !owned_by_allocator && !old.owned_by_allocator);
423 }
424
425 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
426                             size_t b, bool owned_by_allocator,
427                             struct gc_pos pos, unsigned flags)
428 {
429         percpu_rwsem_assert_held(&c->mark_lock);
430
431         if (!(flags & BCH_BUCKET_MARK_GC))
432                 __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
433
434         if ((flags & BCH_BUCKET_MARK_GC) ||
435             gc_visited(c, pos))
436                 __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
437 }
438
439 #define checked_add(a, b)                                       \
440 do {                                                            \
441         unsigned _res = (unsigned) (a) + (b);                   \
442         (a) = _res;                                             \
443         BUG_ON((a) != _res);                                    \
444 } while (0)
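
/*
 * Note on checked_add() (comment added for clarity): the sector counts in
 * struct bucket_mark are bitfields narrower than unsigned, since the whole
 * mark packs into a u64, so the assignment (a) = _res can silently truncate.
 * Recomputing the sum at full width and comparing after the assignment turns
 * that truncation into a BUG_ON().
 */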
445
446 static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
447                                         size_t b, enum bch_data_type type,
448                                         unsigned sectors, bool gc)
449 {
450         struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
451         struct bucket *g = __bucket(ca, b, gc);
452         struct bucket_mark new;
453
454         BUG_ON(type != BCH_DATA_SB &&
455                type != BCH_DATA_JOURNAL);
456
457         bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
458                 new.data_type   = type;
459                 checked_add(new.dirty_sectors, sectors);
460         }));
461 }
462
463 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
464                                size_t b, enum bch_data_type type,
465                                unsigned sectors, struct gc_pos pos,
466                                unsigned flags)
467 {
468         BUG_ON(type != BCH_DATA_SB &&
469                type != BCH_DATA_JOURNAL);
470
471         if (likely(c)) {
472                 percpu_rwsem_assert_held(&c->mark_lock);
473
474                 if (!(flags & BCH_BUCKET_MARK_GC))
475                         __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
476                                                     false);
477                 if ((flags & BCH_BUCKET_MARK_GC) ||
478                     gc_visited(c, pos))
479                         __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
480                                                     true);
481         } else {
482                 struct bucket *g;
483                 struct bucket_mark old, new;
484
485                 rcu_read_lock();
486
487                 g = bucket(ca, b);
488                 old = bucket_cmpxchg(g, new, ({
489                         new.data_type = type;
490                         checked_add(new.dirty_sectors, sectors);
491                 }));
492
493                 rcu_read_unlock();
494         }
495 }
496
497 static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
498                                   s64 delta)
499 {
500         if (delta > 0) {
501                 /*
502                  * marking a new extent, which _will have size_ @delta
503                  *
504                  * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
505                  * case, we haven't actually created the key we'll be inserting
506                  * yet (for the split) - so we don't want to be using
507                  * k->size/crc.live_size here:
508                  */
509                 return __ptr_disk_sectors(p, delta);
510         } else {
511                 BUG_ON(-delta > p.crc.live_size);
512
513                 return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
514                         (s64) ptr_disk_sectors(p);
515         }
516 }
517
518 /*
519  * Checking against gc's position has to be done here, inside the cmpxchg()
520  * loop, to avoid racing with the start of gc clearing all the marks - GC does
521  * that with the gc pos seqlock held.
522  */
523 static void bch2_mark_pointer(struct bch_fs *c,
524                               struct extent_ptr_decoded p,
525                               s64 sectors, enum bch_data_type data_type,
526                               struct bch_fs_usage *fs_usage,
527                               unsigned journal_seq, unsigned flags,
528                               bool gc)
529 {
530         struct bucket_mark old, new;
531         struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
532         size_t b = PTR_BUCKET_NR(ca, &p.ptr);
533         struct bucket *g = __bucket(ca, b, gc);
534         u64 v;
535
536         v = atomic64_read(&g->_mark.v);
537         do {
538                 new.v.counter = old.v.counter = v;
539
540                 /*
541                  * Check this after reading bucket mark to guard against
542                  * the allocator invalidating a bucket after we've already
543                  * checked the gen
544                  */
545                 if (gen_after(new.gen, p.ptr.gen)) {
546                         BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
547                         EBUG_ON(!p.ptr.cached &&
548                                 test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
549                         return;
550                 }
551
552                 if (!p.ptr.cached)
553                         checked_add(new.dirty_sectors, sectors);
554                 else
555                         checked_add(new.cached_sectors, sectors);
556
557                 if (!new.dirty_sectors &&
558                     !new.cached_sectors) {
559                         new.data_type   = 0;
560
561                         if (journal_seq) {
562                                 new.journal_seq_valid = 1;
563                                 new.journal_seq = journal_seq;
564                         }
565                 } else {
566                         new.data_type = data_type;
567                 }
568
569                 if (flags & BCH_BUCKET_MARK_NOATOMIC) {
570                         g->_mark = new;
571                         break;
572                 }
573         } while ((v = atomic64_cmpxchg(&g->_mark.v,
574                               old.v.counter,
575                               new.v.counter)) != old.v.counter);
576
577         bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
578
579         BUG_ON(!gc && bucket_became_unavailable(old, new));
580 }
581
582 static int bch2_mark_stripe_ptr(struct bch_fs *c,
583                                 struct bch_extent_stripe_ptr p,
584                                 enum bch_data_type data_type,
585                                 struct bch_fs_usage *fs_usage,
586                                 s64 sectors, unsigned flags,
587                                 bool gc)
588 {
589         struct stripe *m;
590         unsigned old, new, nr_data;
591         int blocks_nonempty_delta;
592         s64 parity_sectors;
593
594         m = genradix_ptr(&c->stripes[gc], p.idx);
595
596         if (!m || !m->alive) {
597                 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
598                                     (u64) p.idx);
599                 return -1;
600         }
601
602         BUG_ON(m->r.e.data_type != data_type);
603
604         nr_data = m->nr_blocks - m->nr_redundant;
605
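	/*
	 * Each stripe has nr_redundant parity blocks covering nr_data data
	 * blocks, so charge parity proportionally, rounded up.  Worked
	 * example (illustrative): marking 8 data sectors in a 4+2 stripe
	 * also accounts DIV_ROUND_UP(8 * 2, 4) = 4 parity sectors.
	 */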
606         parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
607
608         if (sectors < 0)
609                 parity_sectors = -parity_sectors;
610         sectors += parity_sectors;
611
612         new = atomic_add_return(sectors, &m->block_sectors[p.block]);
613         old = new - sectors;
614
615         blocks_nonempty_delta = (int) !!new - (int) !!old;
616         if (!blocks_nonempty_delta)
617                 return 0;
618
619         atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
620
621         BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
622
623         if (!gc)
624                 bch2_stripes_heap_update(c, m, p.idx);
625
626         update_replicas(c, fs_usage, &m->r.e, sectors);
627
628         return 0;
629 }
630
631 static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
632                             s64 sectors,
633                             enum bch_data_type data_type,
634                             struct bch_fs_usage *fs_usage,
635                             unsigned journal_seq, unsigned flags,
636                             bool gc)
637 {
638         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
639         const union bch_extent_entry *entry;
640         struct extent_ptr_decoded p;
641         struct bch_replicas_padded r;
642         s64 dirty_sectors = 0;
643         unsigned i;
644         int ret;
645
646         r.e.data_type   = data_type;
647         r.e.nr_devs     = 0;
648         r.e.nr_required = 1;
649
650         BUG_ON(!sectors);
651
652         bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
653                 s64 disk_sectors = data_type == BCH_DATA_BTREE
654                         ? sectors
655                         : ptr_disk_sectors_delta(p, sectors);
656
657                 bch2_mark_pointer(c, p, disk_sectors, data_type,
658                                   fs_usage, journal_seq, flags, gc);
659
660                 if (p.ptr.cached) {
661                         update_cached_sectors(c, fs_usage, p.ptr.dev,
662                                               disk_sectors);
663                 } else if (!p.ec_nr) {
664                         dirty_sectors          += disk_sectors;
665                         r.e.devs[r.e.nr_devs++] = p.ptr.dev;
666                 } else {
667                         for (i = 0; i < p.ec_nr; i++) {
668                                 ret = bch2_mark_stripe_ptr(c, p.ec[i],
669                                                 data_type, fs_usage,
670                                                 disk_sectors, flags, gc);
671                                 if (ret)
672                                         return ret;
673                         }
674
675                         r.e.nr_required = 0;
676                 }
677         }
678
679         if (dirty_sectors)
680                 update_replicas(c, fs_usage, &r.e, dirty_sectors);
681
682         return 0;
683 }
684
685 static void bucket_set_stripe(struct bch_fs *c,
686                               const struct bch_stripe *v,
687                               bool enabled,
688                               struct bch_fs_usage *fs_usage,
689                               u64 journal_seq,
690                               bool gc)
691 {
692         unsigned i;
693
694         for (i = 0; i < v->nr_blocks; i++) {
695                 const struct bch_extent_ptr *ptr = v->ptrs + i;
696                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
697                 size_t b = PTR_BUCKET_NR(ca, ptr);
698                 struct bucket *g = __bucket(ca, b, gc);
699                 struct bucket_mark new, old;
700
701                 BUG_ON(ptr_stale(ca, ptr));
702
703                 old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
704                         new.stripe                      = enabled;
705                         if (journal_seq) {
706                                 new.journal_seq_valid   = 1;
707                                 new.journal_seq         = journal_seq;
708                         }
709                 }));
710
711                 BUG_ON(old.stripe == enabled);
712         }
713 }
714
715 static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
716                             bool inserting,
717                             struct bch_fs_usage *fs_usage,
718                             u64 journal_seq, unsigned flags,
719                             bool gc)
720 {
721         struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
722         size_t idx = s.k->p.offset;
723         struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
724         unsigned i;
725
726         if (!m || (!inserting && !m->alive)) {
727                 bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
728                                     idx);
729                 return -1;
730         }
731
732         if (inserting && m->alive) {
733                 bch_err_ratelimited(c, "error marking stripe %zu: already exists",
734                                     idx);
735                 return -1;
736         }
737
738         BUG_ON(atomic_read(&m->blocks_nonempty));
739
740         for (i = 0; i < EC_STRIPE_MAX; i++)
741                 BUG_ON(atomic_read(&m->block_sectors[i]));
742
743         if (inserting) {
744                 m->sectors      = le16_to_cpu(s.v->sectors);
745                 m->algorithm    = s.v->algorithm;
746                 m->nr_blocks    = s.v->nr_blocks;
747                 m->nr_redundant = s.v->nr_redundant;
748
749                 memset(&m->r, 0, sizeof(m->r));
750
751                 m->r.e.data_type        = BCH_DATA_USER;
752                 m->r.e.nr_devs          = s.v->nr_blocks;
753                 m->r.e.nr_required      = s.v->nr_blocks - s.v->nr_redundant;
754
755                 for (i = 0; i < s.v->nr_blocks; i++)
756                         m->r.e.devs[i] = s.v->ptrs[i].dev;
757         }
758
759         /*
760          * XXX: account for stripes somehow here
761          */
762 #if 0
763         update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
764 #endif
765
766         if (!gc) {
767                 if (inserting)
768                         bch2_stripes_heap_insert(c, m, idx);
769                 else
770                         bch2_stripes_heap_del(c, m, idx);
771         } else {
772                 m->alive = inserting;
773         }
774
775         bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
776         return 0;
777 }
778
779 static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
780                            bool inserting, s64 sectors,
781                            struct bch_fs_usage *fs_usage,
782                            unsigned journal_seq, unsigned flags,
783                            bool gc)
784 {
785         int ret = 0;
786
787         switch (k.k->type) {
788         case KEY_TYPE_btree_ptr:
789                 ret = bch2_mark_extent(c, k, inserting
790                                        ?  c->opts.btree_node_size
791                                        : -c->opts.btree_node_size,
792                                        BCH_DATA_BTREE,
793                                        fs_usage, journal_seq, flags, gc);
794                 break;
795         case KEY_TYPE_extent:
796                 ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
797                                        fs_usage, journal_seq, flags, gc);
798                 break;
799         case KEY_TYPE_stripe:
800                 ret = bch2_mark_stripe(c, k, inserting,
801                                        fs_usage, journal_seq, flags, gc);
802                 break;
803         case KEY_TYPE_alloc:
804                 if (inserting)
805                         fs_usage->s.nr_inodes++;
806                 else
807                         fs_usage->s.nr_inodes--;
808                 break;
809         case KEY_TYPE_reservation: {
810                 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
811
812                 sectors *= replicas;
813                 replicas = clamp_t(unsigned, replicas, 1,
814                                    ARRAY_SIZE(fs_usage->persistent_reserved));
815
816                 fs_usage->s.reserved                            += sectors;
817                 fs_usage->persistent_reserved[replicas - 1]     += sectors;
818                 break;
819         }
820         default:
821                 break;
822         }
823
824         return ret;
825 }
826
827 int bch2_mark_key_locked(struct bch_fs *c,
828                    struct bkey_s_c k,
829                    bool inserting, s64 sectors,
830                    struct gc_pos pos,
831                    struct bch_fs_usage *fs_usage,
832                    u64 journal_seq, unsigned flags)
833 {
834         int ret;
835
836         if (!(flags & BCH_BUCKET_MARK_GC)) {
837                 ret = __bch2_mark_key(c, k, inserting, sectors,
838                                       fs_usage ?: this_cpu_ptr(c->usage[0]),
839                                       journal_seq, flags, false);
840                 if (ret)
841                         return ret;
842         }
843
844         if ((flags & BCH_BUCKET_MARK_GC) ||
845             gc_visited(c, pos)) {
846                 ret = __bch2_mark_key(c, k, inserting, sectors,
847                                       this_cpu_ptr(c->usage[1]),
848                                       journal_seq, flags, true);
849                 if (ret)
850                         return ret;
851         }
852
853         return 0;
854 }
855
856 int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
857                   bool inserting, s64 sectors,
858                   struct gc_pos pos,
859                   struct bch_fs_usage *fs_usage,
860                   u64 journal_seq, unsigned flags)
861 {
862         int ret;
863
864         percpu_down_read_preempt_disable(&c->mark_lock);
865         ret = bch2_mark_key_locked(c, k, inserting, sectors,
866                                    pos, fs_usage, journal_seq, flags);
867         percpu_up_read_preempt_enable(&c->mark_lock);
868
869         return ret;
870 }
871
872 void bch2_mark_update(struct btree_insert *trans,
873                       struct btree_insert_entry *insert)
874 {
875         struct bch_fs           *c = trans->c;
876         struct btree_iter       *iter = insert->iter;
877         struct btree            *b = iter->l[0].b;
878         struct btree_node_iter  node_iter = iter->l[0].iter;
879         struct bch_fs_usage     *fs_usage;
880         struct gc_pos           pos = gc_pos_btree_node(b);
881         struct bkey_packed      *_k;
882
883         if (!btree_node_type_needs_gc(iter->btree_id))
884                 return;
885
886         percpu_down_read_preempt_disable(&c->mark_lock);
887         fs_usage = bch2_fs_usage_get_scratch(c);
888
889         if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
890                 bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
891                         bpos_min(insert->k->k.p, b->key.k.p).offset -
892                         bkey_start_offset(&insert->k->k),
893                         pos, fs_usage, trans->journal_res.seq, 0);
894
895         while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
896                                                       KEY_TYPE_discard))) {
897                 struct bkey             unpacked;
898                 struct bkey_s_c         k;
899                 s64                     sectors = 0;
900
901                 k = bkey_disassemble(b, _k, &unpacked);
902
903                 if (btree_node_is_extents(b)
904                     ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
905                     : bkey_cmp(insert->k->k.p, k.k->p))
906                         break;
907
908                 if (btree_node_is_extents(b)) {
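			/*
			 * @sectors is the change to the accounted size of
			 * the existing key @k caused by @insert; by the end
			 * of the switch it is always negative (see the
			 * BUG_ON below).  The MIDDLE case also accounts the
			 * surviving back fragment with a separate positive
			 * mark.  Worked example (illustrative): if @k covers
			 * [0, 100) and @insert covers [20, 60), MIDDLE marks
			 * +40 for the fragment [60, 100), then -80 for @k,
			 * a net change of -40.
			 */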
909                         switch (bch2_extent_overlap(&insert->k->k, k.k)) {
910                         case BCH_EXTENT_OVERLAP_ALL:
911                                 sectors = -((s64) k.k->size);
912                                 break;
913                         case BCH_EXTENT_OVERLAP_BACK:
914                                 sectors = bkey_start_offset(&insert->k->k) -
915                                         k.k->p.offset;
916                                 break;
917                         case BCH_EXTENT_OVERLAP_FRONT:
918                                 sectors = bkey_start_offset(k.k) -
919                                         insert->k->k.p.offset;
920                                 break;
921                         case BCH_EXTENT_OVERLAP_MIDDLE:
922                                 sectors = k.k->p.offset - insert->k->k.p.offset;
923                                 BUG_ON(sectors <= 0);
924
925                                 bch2_mark_key_locked(c, k, true, sectors,
926                                         pos, fs_usage, trans->journal_res.seq, 0);
927
928                                 sectors = bkey_start_offset(&insert->k->k) -
929                                         k.k->p.offset;
930                                 break;
931                         }
932
933                         BUG_ON(sectors >= 0);
934                 }
935
936                 bch2_mark_key_locked(c, k, false, sectors,
937                         pos, fs_usage, trans->journal_res.seq, 0);
938
939                 bch2_btree_node_iter_advance(&node_iter, b);
940         }
941
942         bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos);
943
944         percpu_up_read_preempt_enable(&c->mark_lock);
945 }
946
947 /* Disk reservations: */
948
949 static u64 bch2_recalc_sectors_available(struct bch_fs *c)
950 {
951         int cpu;
952
953         for_each_possible_cpu(cpu)
954                 per_cpu_ptr(c->pcpu, cpu)->sectors_available = 0;
955
956         return avail_factor(bch2_fs_sectors_free(c));
957 }
958
959 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
960 {
961         percpu_down_read_preempt_disable(&c->mark_lock);
962         this_cpu_sub(c->usage[0]->s.online_reserved,
963                      res->sectors);
964         percpu_up_read_preempt_enable(&c->mark_lock);
965
966         res->sectors = 0;
967 }
968
969 #define SECTORS_CACHE   1024
970
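/*
 * Reservation fast path, in outline: each CPU keeps a small local pool
 * (pcpu->sectors_available), refilled from the shared atomic
 * c->sectors_available by grabbing up to SECTORS_CACHE sectors beyond what
 * was asked for, so most reservations never touch the shared counter.  Only
 * when the shared pool looks exhausted do we fall back to recalculating it
 * under mark_lock (and possibly gc_lock) below.
 */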
971 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
972                               unsigned sectors, int flags)
973 {
974         struct bch_fs_pcpu *pcpu;
975         u64 old, v, get;
976         s64 sectors_available;
977         int ret;
978
979         percpu_down_read_preempt_disable(&c->mark_lock);
980         pcpu = this_cpu_ptr(c->pcpu);
981
982         if (sectors <= pcpu->sectors_available)
983                 goto out;
984
985         v = atomic64_read(&c->sectors_available);
986         do {
987                 old = v;
988                 get = min((u64) sectors + SECTORS_CACHE, old);
989
990                 if (get < sectors) {
991                         percpu_up_read_preempt_enable(&c->mark_lock);
992                         goto recalculate;
993                 }
994         } while ((v = atomic64_cmpxchg(&c->sectors_available,
995                                        old, old - get)) != old);
996
997         pcpu->sectors_available         += get;
998
999 out:
1000         pcpu->sectors_available         -= sectors;
1001         this_cpu_add(c->usage[0]->s.online_reserved, sectors);
1002         res->sectors                    += sectors;
1003
1004         percpu_up_read_preempt_enable(&c->mark_lock);
1005         return 0;
1006
1007 recalculate:
1008         /*
1009          * GC recalculates sectors_available when it starts, so that hopefully
1010          * we don't normally end up blocking here:
1011          */
1012
1013         /*
1014          * Unfortunately, we can be called from extent_insert_fixup() with btree
1015          * locks held:
1016          */
1017
1018         if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
1019                 if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
1020                         down_read(&c->gc_lock);
1021                 else if (!down_read_trylock(&c->gc_lock))
1022                         return -EINTR;
1023         }
1024
1025         percpu_down_write(&c->mark_lock);
1026         sectors_available = bch2_recalc_sectors_available(c);
1027
1028         if (sectors <= sectors_available ||
1029             (flags & BCH_DISK_RESERVATION_NOFAIL)) {
1030                 atomic64_set(&c->sectors_available,
1031                              max_t(s64, 0, sectors_available - sectors));
1032                 this_cpu_add(c->usage[0]->s.online_reserved, sectors);
1033                 res->sectors                    += sectors;
1034                 ret = 0;
1035         } else {
1036                 atomic64_set(&c->sectors_available, sectors_available);
1037                 ret = -ENOSPC;
1038         }
1039
1040         percpu_up_write(&c->mark_lock);
1041
1042         if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
1043                 up_read(&c->gc_lock);
1044
1045         return ret;
1046 }
1047
1048 /* Startup/shutdown: */
1049
1050 static void buckets_free_rcu(struct rcu_head *rcu)
1051 {
1052         struct bucket_array *buckets =
1053                 container_of(rcu, struct bucket_array, rcu);
1054
1055         kvpfree(buckets,
1056                 sizeof(struct bucket_array) +
1057                 buckets->nbuckets * sizeof(struct bucket));
1058 }
1059
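/*
 * Resizing, in outline: allocate the new arrays up front, stop copygc, then
 * (for an actual resize) take gc_lock, the device's bucket_lock and
 * mark_lock for write while the old contents are copied over and the new
 * bucket_array is published with rcu_assign_pointer().  The old array is
 * freed via call_rcu() once readers are done with it; everything else is
 * handed over with swap()/fifo_move(), and the displaced versions are freed
 * through the common err: path at the bottom.
 */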
1060 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1061 {
1062         struct bucket_array *buckets = NULL, *old_buckets = NULL;
1063         unsigned long *buckets_nouse = NULL;
1064         unsigned long *buckets_written = NULL;
1065         u8 *oldest_gens = NULL;
1066         alloc_fifo      free[RESERVE_NR];
1067         alloc_fifo      free_inc;
1068         alloc_heap      alloc_heap;
1069         copygc_heap     copygc_heap;
1070
1071         size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
1072                              ca->mi.bucket_size / c->opts.btree_node_size);
1073         /* XXX: these should be tunable */
1074         size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
1075         size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 7);
1076         size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
1077                                       btree_reserve * 2);
1078         bool resize = ca->buckets[0] != NULL,
1079              start_copygc = ca->copygc_thread != NULL;
1080         int ret = -ENOMEM;
1081         unsigned i;
1082
1083         memset(&free,           0, sizeof(free));
1084         memset(&free_inc,       0, sizeof(free_inc));
1085         memset(&alloc_heap,     0, sizeof(alloc_heap));
1086         memset(&copygc_heap,    0, sizeof(copygc_heap));
1087
1088         if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
1089                                             nbuckets * sizeof(struct bucket),
1090                                             GFP_KERNEL|__GFP_ZERO)) ||
1091             !(oldest_gens       = kvpmalloc(nbuckets * sizeof(u8),
1092                                             GFP_KERNEL|__GFP_ZERO)) ||
1093             !(buckets_nouse     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1094                                             sizeof(unsigned long),
1095                                             GFP_KERNEL|__GFP_ZERO)) ||
1096             !(buckets_written   = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1097                                             sizeof(unsigned long),
1098                                             GFP_KERNEL|__GFP_ZERO)) ||
1099             !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
1100             !init_fifo(&free[RESERVE_MOVINGGC],
1101                        copygc_reserve, GFP_KERNEL) ||
1102             !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1103             !init_fifo(&free_inc,       free_inc_nr, GFP_KERNEL) ||
1104             !init_heap(&alloc_heap,     ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
1105             !init_heap(&copygc_heap,    copygc_reserve, GFP_KERNEL))
1106                 goto err;
1107
1108         buckets->first_bucket   = ca->mi.first_bucket;
1109         buckets->nbuckets       = nbuckets;
1110
1111         bch2_copygc_stop(ca);
1112
1113         if (resize) {
1114                 down_write(&c->gc_lock);
1115                 down_write(&ca->bucket_lock);
1116                 percpu_down_write(&c->mark_lock);
1117         }
1118
1119         old_buckets = bucket_array(ca);
1120
1121         if (resize) {
1122                 size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
1123
1124                 memcpy(buckets->b,
1125                        old_buckets->b,
1126                        n * sizeof(struct bucket));
1127                 memcpy(oldest_gens,
1128                        ca->oldest_gens,
1129                        n * sizeof(u8));
1130                 memcpy(buckets_nouse,
1131                        ca->buckets_nouse,
1132                        BITS_TO_LONGS(n) * sizeof(unsigned long));
1133                 memcpy(buckets_written,
1134                        ca->buckets_written,
1135                        BITS_TO_LONGS(n) * sizeof(unsigned long));
1136         }
1137
1138         rcu_assign_pointer(ca->buckets[0], buckets);
1139         buckets = old_buckets;
1140
1141         swap(ca->oldest_gens, oldest_gens);
1142         swap(ca->buckets_nouse, buckets_nouse);
1143         swap(ca->buckets_written, buckets_written);
1144
1145         if (resize)
1146                 percpu_up_write(&c->mark_lock);
1147
1148         spin_lock(&c->freelist_lock);
1149         for (i = 0; i < RESERVE_NR; i++) {
1150                 fifo_move(&free[i], &ca->free[i]);
1151                 swap(ca->free[i], free[i]);
1152         }
1153         fifo_move(&free_inc, &ca->free_inc);
1154         swap(ca->free_inc, free_inc);
1155         spin_unlock(&c->freelist_lock);
1156
1157         /* with gc lock held, alloc_heap can't be in use: */
1158         swap(ca->alloc_heap, alloc_heap);
1159
1160         /* and we shut down copygc: */
1161         swap(ca->copygc_heap, copygc_heap);
1162
1163         nbuckets = ca->mi.nbuckets;
1164
1165         if (resize) {
1166                 up_write(&ca->bucket_lock);
1167                 up_write(&c->gc_lock);
1168         }
1169
1170         if (start_copygc &&
1171             bch2_copygc_start(c, ca))
1172                 bch_err(ca, "error restarting copygc thread");
1173
1174         ret = 0;
1175 err:
1176         free_heap(&copygc_heap);
1177         free_heap(&alloc_heap);
1178         free_fifo(&free_inc);
1179         for (i = 0; i < RESERVE_NR; i++)
1180                 free_fifo(&free[i]);
1181         kvpfree(buckets_nouse,
1182                 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1183         kvpfree(buckets_written,
1184                 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1185         kvpfree(oldest_gens,
1186                 nbuckets * sizeof(u8));
1187         if (buckets)
1188                 call_rcu(&old_buckets->rcu, buckets_free_rcu);
1189
1190         return ret;
1191 }
1192
1193 void bch2_dev_buckets_free(struct bch_dev *ca)
1194 {
1195         unsigned i;
1196
1197         free_heap(&ca->copygc_heap);
1198         free_heap(&ca->alloc_heap);
1199         free_fifo(&ca->free_inc);
1200         for (i = 0; i < RESERVE_NR; i++)
1201                 free_fifo(&ca->free[i]);
1202         kvpfree(ca->buckets_written,
1203                 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1204         kvpfree(ca->buckets_nouse,
1205                 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1206         kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
1207         kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
1208                 sizeof(struct bucket_array) +
1209                 ca->mi.nbuckets * sizeof(struct bucket));
1210
1211         free_percpu(ca->usage[0]);
1212 }
1213
1214 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
1215 {
1216         if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
1217                 return -ENOMEM;
1218
1219         return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
1220 }