1 /*
2  * Primary bucket allocation code
3  *
4  * Copyright 2012 Google, Inc.
5  *
6  * Allocation in bcache is done in terms of buckets:
7  *
8  * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
9  * btree pointers - they must match for the pointer to be considered valid.
10  *
11  * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
12  * bucket simply by incrementing its gen.
13  *
14  * The gens (along with the priorities; it's really the gens that are important,
15  * but the code is named as if it's the priorities) are written in an arbitrary
16  * list of buckets on disk, with a pointer to them in the journal header.
17  *
18  * When we invalidate a bucket, we have to write its new gen to disk and wait
19  * for that write to complete before we use it - otherwise after a crash we
20  * could have pointers that appeared to be good but pointed to data that had
21  * been overwritten.
22  *
23  * Since the gens and priorities are all stored contiguously on disk, we can
24  * batch this up: We fill up the free_inc list with freshly invalidated buckets,
25  * call prio_write(), and when prio_write() finishes we pull buckets off the
26  * free_inc list and optionally discard them.
27  *
28  * free_inc isn't the only freelist - if it was, we'd often have to sleep while
29  * priorities and gens were being written before we could allocate. c->free is a
30  * smaller freelist, and buckets on that list are always ready to be used.
31  *
32  * If we've got discards enabled, that happens when a bucket moves from the
33  * free_inc list to the free list.
34  *
35  * It's important to ensure that gens don't wrap around - with respect to
36  * either the oldest gen in the btree or the gen on disk. This is quite
37  * difficult to do in practice, but we explicitly guard against it anyway - if
38  * a bucket is in danger of wrapping around we simply skip invalidating it that
39  * time around, and we garbage collect or rewrite the priorities sooner than we
40  * would have otherwise.
41  *
42  * bch2_bucket_alloc() allocates a single bucket from a specific device.
43  *
44  * bch2_bucket_alloc_set() allocates one or more buckets from different devices
45  * in a given filesystem.
46  *
47  * invalidate_buckets() drives all the processes described above. It's called
48  * from bch2_bucket_alloc() and a few other places that need to make sure free
49  * buckets are ready.
50  *
51  * invalidate_buckets_(lru|fifo|random)() find buckets that are available to
52  * be invalidated, and then invalidate them and stick them on the free_inc
53  * list - in lru, fifo or random order.
54  */
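
/*
 * A rough sketch of the gen check described above, for illustration only -
 * the real check is ptr_stale() (used by verify_not_stale() further down),
 * which also deals with corner cases this sketch ignores:
 *
 *	bool ptr_is_valid(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
 *	{
 *		return PTR_BUCKET(ca, ptr)->mark.gen == ptr->gen;
 *	}
 *
 * Incrementing a bucket's gen therefore invalidates, in one step, every
 * pointer written while the bucket had its old gen.
 */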
55
56 #include "bcachefs.h"
57 #include "alloc.h"
58 #include "btree_update.h"
59 #include "buckets.h"
60 #include "checksum.h"
61 #include "clock.h"
62 #include "debug.h"
63 #include "error.h"
64 #include "extents.h"
65 #include "io.h"
66 #include "journal.h"
67 #include "super-io.h"
68
69 #include <linux/blkdev.h>
70 #include <linux/kthread.h>
71 #include <linux/math64.h>
72 #include <linux/random.h>
73 #include <linux/rcupdate.h>
74 #include <linux/sched/task.h>
75 #include <linux/sort.h>
76 #include <trace/events/bcachefs.h>
77
78 static void bch2_recalc_min_prio(struct bch_dev *, int);
79
80 /* Allocation groups: */
81
82 void bch2_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
83 {
84         unsigned i;
85
86         spin_lock(&grp->lock);
87
88         for (i = 0; i < grp->nr; i++)
89                 if (grp->d[i].dev == ca) {
90                         grp->nr--;
91                         memmove(&grp->d[i],
92                                 &grp->d[i + 1],
93                                 (grp->nr - i) * sizeof(grp->d[0]));
94                         break;
95                 }
96
97         spin_unlock(&grp->lock);
98 }
99
100 void bch2_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
101 {
102         unsigned i;
103
104         spin_lock(&grp->lock);
105         for (i = 0; i < grp->nr; i++)
106                 if (grp->d[i].dev == ca)
107                         goto out;
108
109         BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
110
111         grp->d[grp->nr++].dev = ca;
112 out:
113         spin_unlock(&grp->lock);
114 }
115
116 /* Ratelimiting/PD controllers */
117
118 static void pd_controllers_update(struct work_struct *work)
119 {
120         struct bch_fs *c = container_of(to_delayed_work(work),
121                                            struct bch_fs,
122                                            pd_controllers_update);
123         struct bch_dev *ca;
124         unsigned i, iter;
125
126         /* All units are in bytes */
127         u64 faster_tiers_size   = 0;
128         u64 faster_tiers_dirty  = 0;
129
130         u64 fastest_tier_size   = 0;
131         u64 fastest_tier_free   = 0;
132         u64 copygc_can_free     = 0;
133
134         rcu_read_lock();
135         for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
136                 bch2_pd_controller_update(&c->tiers[i].pd,
137                                 div_u64(faster_tiers_size *
138                                         c->tiering_percent, 100),
139                                 faster_tiers_dirty,
140                                 -1);
141
142                 spin_lock(&c->tiers[i].devs.lock);
143                 group_for_each_dev(ca, &c->tiers[i].devs, iter) {
144                         struct bch_dev_usage stats = bch2_dev_usage_read(ca);
145                         unsigned bucket_bits = ca->bucket_bits + 9;
146
147                         u64 size = (ca->mi.nbuckets -
148                                     ca->mi.first_bucket) << bucket_bits;
149                         u64 dirty = stats.buckets_dirty << bucket_bits;
150                         u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
151                         /*
152                          * Bytes of internal fragmentation, which can be
153                          * reclaimed by copy GC
154                          */
155                         s64 fragmented = ((stats.buckets_dirty +
156                                            stats.buckets_cached) <<
157                                           bucket_bits) -
158                                 ((stats.sectors[S_DIRTY] +
159                                   stats.sectors[S_CACHED]) << 9);
160
161                         fragmented = max(0LL, fragmented);
162
163                         bch2_pd_controller_update(&ca->moving_gc_pd,
164                                                  free, fragmented, -1);
165
166                         faster_tiers_size               += size;
167                         faster_tiers_dirty              += dirty;
168
169                         if (!c->fastest_tier ||
170                             c->fastest_tier == &c->tiers[i]) {
171                                 fastest_tier_size       += size;
172                                 fastest_tier_free       += free;
173                         }
174
175                         copygc_can_free                 += fragmented;
176                 }
177                 spin_unlock(&c->tiers[i].devs.lock);
178         }
179
180         rcu_read_unlock();
181
182         /*
183          * Throttle foreground writes if tier 0 is running out of free buckets,
184          * and either tiering or copygc can free up space.
185          *
186          * Target will be small if there isn't any work to do - we don't want to
187          * throttle foreground writes if we currently have all the free space
188          * we're ever going to have.
189          *
190          * Otherwise, if there's work to do, try to keep 20% of tier0 available
191          * for foreground writes.
192          */
193         if (c->fastest_tier)
194                 copygc_can_free = U64_MAX;
195
196         bch2_pd_controller_update(&c->foreground_write_pd,
197                                  min(copygc_can_free,
198                                      div_u64(fastest_tier_size *
199                                              c->foreground_target_percent,
200                                              100)),
201                                  fastest_tier_free,
202                                  -1);
203
204         schedule_delayed_work(&c->pd_controllers_update,
205                               c->pd_controllers_update_seconds * HZ);
206 }
207
208 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
209 {
210         unsigned bytes = offsetof(struct bch_alloc, data);
211
212         if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
213                 bytes += 2;
214         if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
215                 bytes += 2;
216
217         return DIV_ROUND_UP(bytes, sizeof(u64));
218 }
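
/*
 * Worked example (assuming the fixed part of struct bch_alloc - the fields
 * bitmask plus the gen - is 2 bytes): with both the read and write time
 * fields present, bytes = 2 + 2 + 2 = 6, which rounds up to a single u64 of
 * value; with neither field present it is likewise 1 u64.
 */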
219
220 static const char *bch2_alloc_invalid(const struct bch_fs *c,
221                                       struct bkey_s_c k)
222 {
223         if (k.k->p.inode >= c->sb.nr_devices ||
224             !c->devs[k.k->p.inode])
225                 return "invalid device";
226
227         switch (k.k->type) {
228         case BCH_ALLOC: {
229                 struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
230
231                 if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
232                         return "incorrect value size";
233                 break;
234         }
235         default:
236                 return "invalid type";
237         }
238
239         return NULL;
240 }
241
242 static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
243                                size_t size, struct bkey_s_c k)
244 {
245         buf[0] = '\0';
246
247         switch (k.k->type) {
248         case BCH_ALLOC:
249                 break;
250         }
251 }
252
253 const struct bkey_ops bch2_bkey_alloc_ops = {
254         .key_invalid    = bch2_alloc_invalid,
255         .val_to_text    = bch2_alloc_to_text,
256 };
257
258 static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
259 {
260         unsigned v;
261
262         switch (bytes) {
263         case 1:
264                 v = **p;
265                 break;
266         case 2:
267                 v = le16_to_cpup((void *) *p);
268                 break;
269         case 4:
270                 v = le32_to_cpup((void *) *p);
271                 break;
272         default:
273                 BUG();
274         }
275
276         *p += bytes;
277         return v;
278 }
279
280 static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v)
281 {
282         switch (bytes) {
283         case 1:
284                 **p = v;
285                 break;
286         case 2:
287                 *((__le16 *) *p) = cpu_to_le16(v);
288                 break;
289         case 4:
290                 *((__le32 *) *p) = cpu_to_le32(v);
291                 break;
292         default:
293                 BUG();
294         }
295
296         *p += bytes;
297 }
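
/*
 * Together these helpers treat an alloc key's value as a packed little-endian
 * stream: optional fields are appended to bch_alloc.data in the order of
 * their bits in .fields. E.g. a value carrying both timestamps is laid out,
 * roughly:
 *
 *	.fields = READ_TIME|WRITE_TIME, .gen, u16 read prio, u16 write prio
 */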
298
299 static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
300 {
301         struct bch_dev *ca;
302         struct bkey_s_c_alloc a;
303         struct bucket_mark new;
304         struct bucket *g;
305         const u8 *d;
306
307         if (k.k->type != BCH_ALLOC)
308                 return;
309
310         a = bkey_s_c_to_alloc(k);
311         ca = c->devs[a.k->p.inode];
312
313         if (a.k->p.offset >= ca->mi.nbuckets)
314                 return;
315
316         g = ca->buckets + a.k->p.offset;
317         bucket_cmpxchg(g, new, ({
318                 new.gen = a.v->gen;
319                 new.gen_valid = 1;
320         }));
321
322         d = a.v->data;
323         if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
324                 g->prio[READ] = get_alloc_field(&d, 2);
325         if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
326                 g->prio[WRITE] = get_alloc_field(&d, 2);
327 }
328
329 int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
330 {
331         struct journal_replay *r;
332         struct btree_iter iter;
333         struct bkey_s_c k;
334         int ret;
335
336         if (!c->btree_roots[BTREE_ID_ALLOC].b)
337                 return 0;
338
339         for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
340                 bch2_alloc_read_key(c, k);
341                 bch2_btree_iter_cond_resched(&iter);
342         }
343
344         ret = bch2_btree_iter_unlock(&iter);
345         if (ret)
346                 return ret;
347
348         list_for_each_entry(r, journal_replay_list, list) {
349                 struct bkey_i *k, *n;
350                 struct jset_entry *entry;
351
352                 for_each_jset_key(k, n, entry, &r->j)
353                         if (entry->btree_id == BTREE_ID_ALLOC)
354                                 bch2_alloc_read_key(c, bkey_i_to_s_c(k));
355         }
356
357         return 0;
358 }
359
360 static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
361                                   struct bucket *g, struct btree_iter *iter,
362                                   u64 *journal_seq)
363 {
364         struct bucket_mark m;
365         __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
366         struct bkey_i_alloc *a;
367         u8 *d;
368         int ret;
369
370         bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets));
371
372         do {
373                 ret = bch2_btree_iter_traverse(iter);
374                 if (ret)
375                         break;
376
377                 /* read mark under btree node lock: */
378                 m = READ_ONCE(g->mark);
379                 a = bkey_alloc_init(&alloc_key.k);
380                 a->k.p          = iter->pos;
381                 a->v.fields     = 0;
382                 a->v.gen        = m.gen;
383                 set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
384
385                 d = a->v.data;
386                 if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
387                         put_alloc_field(&d, 2, g->prio[READ]);
388                 if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
389                         put_alloc_field(&d, 2, g->prio[WRITE]);
390
391                 bch2_btree_iter_set_pos(iter, a->k.p);
392                 ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
393                                            BTREE_INSERT_ATOMIC|
394                                            BTREE_INSERT_NOFAIL|
395                                            BTREE_INSERT_USE_RESERVE|
396                                            BTREE_INSERT_USE_ALLOC_RESERVE|
397                                            BTREE_INSERT_NOWAIT,
398                                            BTREE_INSERT_ENTRY(iter, &a->k_i));
399                 bch2_btree_iter_cond_resched(iter);
400         } while (ret == -EINTR);
401
402         return ret;
403 }
404
405 int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
406 {
407         struct bch_dev *ca;
408         struct bucket *g;
409         struct btree_iter iter;
410         int ret;
411
412         if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
413                 return 0;
414
415         ca = c->devs[pos.inode];
416
417         if (pos.offset >= ca->mi.nbuckets)
418                 return 0;
419
420         g = ca->buckets + pos.offset;
421
422         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
423                              BTREE_ITER_INTENT);
424
425         ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL);
426         bch2_btree_iter_unlock(&iter);
427         return ret;
428 }
429
430 int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
431 {
432         struct btree_iter iter;
433         struct bucket *g;
434         int ret = 0;
435
436         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
437                              BTREE_ITER_INTENT);
438
439         for_each_bucket(g, ca) {
440                 ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq);
441                 if (ret)
442                         break;
443         }
444
445         bch2_btree_iter_unlock(&iter);
446         return ret;
447 }
448
449 #define BUCKET_GC_GEN_MAX       96U
450
451 /**
452  * wait_buckets_available - wait on reclaimable buckets
453  *
454  * If there aren't enough available buckets to fill up free_inc, wait until
455  * there are.
456  */
457 static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
458 {
459         unsigned long gc_count = c->gc_count;
460         int ret = 0;
461
462         while (1) {
463                 set_current_state(TASK_INTERRUPTIBLE);
464                 if (kthread_should_stop()) {
465                         ret = -1;
466                         break;
467                 }
468
469                 if (gc_count != c->gc_count)
470                         ca->inc_gen_really_needs_gc = 0;
471
472                 if ((ssize_t) (dev_buckets_available(ca) -
473                                ca->inc_gen_really_needs_gc) >=
474                     (ssize_t) fifo_free(&ca->free_inc))
475                         break;
476
477                 up_read(&c->gc_lock);
478                 schedule();
479                 try_to_freeze();
480                 down_read(&c->gc_lock);
481         }
482
483         __set_current_state(TASK_RUNNING);
484         return ret;
485 }
486
487 static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket)
488 {
489         if (expensive_debug_checks(ca->fs)) {
490                 size_t iter;
491                 long i;
492                 unsigned j;
493
494                 for (j = 0; j < RESERVE_NR; j++)
495                         fifo_for_each_entry(i, &ca->free[j], iter)
496                                 BUG_ON(i == bucket);
497                 fifo_for_each_entry(i, &ca->free_inc, iter)
498                         BUG_ON(i == bucket);
499         }
500 }
501
502 /* Bucket heap / gen */
503
504 static void bch2_recalc_min_prio(struct bch_dev *ca, int rw)
505 {
506         struct bch_fs *c = ca->fs;
507         struct prio_clock *clock = &c->prio_clock[rw];
508         struct bucket *g;
509         u16 max_delta = 1;
510         unsigned i;
511
512         lockdep_assert_held(&c->bucket_lock);
513
514         /* Determine min prio for this particular cache */
515         for_each_bucket(g, ca)
516                 max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
517
518         ca->min_prio[rw] = clock->hand - max_delta;
519
520         /*
521          * This may possibly increase the min prio for the whole cache, check
522          * that as well.
523          */
524         max_delta = 1;
525
526         for_each_member_device(ca, c, i)
527                 max_delta = max(max_delta,
528                                 (u16) (clock->hand - ca->min_prio[rw]));
529
530         clock->min_prio = clock->hand - max_delta;
531 }
532
533 static void bch2_rescale_prios(struct bch_fs *c, int rw)
534 {
535         struct prio_clock *clock = &c->prio_clock[rw];
536         struct bch_dev *ca;
537         struct bucket *g;
538         unsigned i;
539
540         trace_rescale_prios(c);
541
542         for_each_member_device(ca, c, i) {
543                 for_each_bucket(g, ca)
544                         g->prio[rw] = clock->hand -
545                                 (clock->hand - g->prio[rw]) / 2;
546
547                 bch2_recalc_min_prio(ca, rw);
548         }
549 }
550
551 static void bch2_inc_clock_hand(struct io_timer *timer)
552 {
553         struct prio_clock *clock = container_of(timer,
554                                         struct prio_clock, rescale);
555         struct bch_fs *c = container_of(clock,
556                                 struct bch_fs, prio_clock[clock->rw]);
557         u64 capacity;
558
559         mutex_lock(&c->bucket_lock);
560
561         clock->hand++;
562
563         /* if clock cannot be advanced more, rescale prio */
564         if (clock->hand == (u16) (clock->min_prio - 1))
565                 bch2_rescale_prios(c, clock->rw);
566
567         mutex_unlock(&c->bucket_lock);
568
569         capacity = READ_ONCE(c->capacity);
570
571         if (!capacity)
572                 return;
573
574         /*
575          * we only advance the clock hand again after 0.1% of the filesystem
576          * capacity has been read or written to
577          *
578          * XXX: we shouldn't really be going off of the capacity of devices in
579          * RW mode (that will be 0 when we're RO, yet we can still service
580          * reads)
581          */
582         timer->expire += capacity >> 10;
583
584         bch2_io_timer_add(&c->io_clock[clock->rw], timer);
585 }
586
587 static void bch2_prio_timer_init(struct bch_fs *c, int rw)
588 {
589         struct prio_clock *clock = &c->prio_clock[rw];
590         struct io_timer *timer = &clock->rescale;
591
592         clock->rw       = rw;
593         timer->fn       = bch2_inc_clock_hand;
594         timer->expire   = c->capacity >> 10;
595 }
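
/*
 * Illustrative numbers (assuming capacity is in 512 byte sectors): on a 2TB
 * filesystem, capacity >> 10 is roughly 4 million sectors, so the prio clock
 * hand advances about once per 2GB of reads or writes - i.e. per ~0.1% of
 * capacity, matching the comment in bch2_inc_clock_hand().
 */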
596
597 /*
598  * Background allocation thread: scans for buckets to be invalidated,
599  * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
600  * then optionally issues discard commands to the newly free buckets, then puts
601  * them on the various freelists.
602  */
603
604 static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g)
605 {
606         return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX;
607 }
608
609 static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
610                                        struct bucket_mark mark)
611 {
612         if (!is_available_bucket(mark))
613                 return false;
614
615         if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX / 2)
616                 ca->inc_gen_needs_gc++;
617
618         if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX)
619                 ca->inc_gen_really_needs_gc++;
620
621         return can_inc_bucket_gen(ca, g);
622 }
623
624 static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
625 {
626         struct bch_fs *c = ca->fs;
627         struct bucket_mark m;
628
629         spin_lock(&ca->freelist_lock);
630         if (!bch2_invalidate_bucket(ca, g, &m)) {
631                 spin_unlock(&ca->freelist_lock);
632                 return;
633         }
634
635         verify_not_on_freelist(ca, g - ca->buckets);
636         BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
637         spin_unlock(&ca->freelist_lock);
638
639         g->prio[READ] = c->prio_clock[READ].hand;
640         g->prio[WRITE] = c->prio_clock[WRITE].hand;
641
642         if (m.cached_sectors) {
643                 ca->allocator_invalidating_data = true;
644         } else if (m.journal_seq_valid) {
645                 u64 journal_seq = atomic64_read(&c->journal.seq);
646                 u64 bucket_seq  = journal_seq;
647
648                 bucket_seq &= ~((u64) U16_MAX);
649                 bucket_seq |= m.journal_seq;
650
651                 if (bucket_seq > journal_seq)
652                         bucket_seq -= 1 << 16;
653
654                 ca->allocator_journal_seq_flush =
655                         max(ca->allocator_journal_seq_flush, bucket_seq);
656         }
657 }
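
/*
 * Example of the journal seq reconstruction above (invented numbers): the
 * bucket mark only stores the low 16 bits of the seq, so with
 * journal_seq = 0x12345 and m.journal_seq = 0xfff0 we splice to get 0x1fff0;
 * that's ahead of the journal, so we subtract 1 << 16 and flush up to 0xfff0,
 * the most recent seq with those low bits that isn't in the future.
 */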
658
659 /*
660  * Determines what order we're going to reuse buckets, smallest
661  * bucket_sort_key() first.
662  *
664  * - We take into account the read prio of the bucket, which gives us an
665  *   indication of how hot the data is -- we scale the prio so that the prio
666  *   farthest from the clock is worth 1/8th of the closest.
667  *
668  * - The number of sectors of cached data in the bucket, which gives us an
669  *   indication of the cost in cache misses this eviction will cause.
670  *
671  * - If hotness * sectors used compares equal, we pick the bucket with the
672  *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
673  *   number repeatedly forces us to run mark and sweep gc to avoid generation
674  *   number wraparound.
675  */
676
677 static unsigned long bucket_sort_key(struct bch_dev *ca,
678                                      struct bucket *g,
679                                      struct bucket_mark m)
680 {
681         /*
682          * Time since last read, scaled to [0, 8) where larger value indicates
683          * more recently read data:
684          */
685         unsigned long hotness =
686                 (g->prio[READ]                  - ca->min_prio[READ]) * 7 /
687                 (ca->fs->prio_clock[READ].hand  - ca->min_prio[READ]);
688
689         /* How much we want to keep the data in this bucket: */
690         unsigned long data_wantness =
691                 (hotness + 1) * bucket_sectors_used(m);
692
693         unsigned long needs_journal_commit =
694                     bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk);
695
696         return  (data_wantness << 9) |
697                 (needs_journal_commit << 8) |
698                 bucket_gc_gen(ca, g);
699 }
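
/*
 * Example of the packing above (invented numbers): a bucket with hotness 3,
 * 100 cached sectors, no journal commit needed and gc gen 5 gets the key
 * ((3 + 1) * 100) << 9 | 0 << 8 | 5 = 204805. Since we reuse smallest keys
 * first, cold or empty buckets win, then buckets already flushed to the
 * journal, with gc gen as the final tiebreaker.
 */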
700
701 static inline int bucket_alloc_cmp(alloc_heap *h,
702                                    struct alloc_heap_entry l,
703                                    struct alloc_heap_entry r)
704 {
705         return (l.key > r.key) - (l.key < r.key);
706 }
707
708 static void invalidate_buckets_lru(struct bch_dev *ca)
709 {
710         struct alloc_heap_entry e;
711         struct bucket *g;
712
713         ca->alloc_heap.used = 0;
714
715         mutex_lock(&ca->fs->bucket_lock);
716         bch2_recalc_min_prio(ca, READ);
717         bch2_recalc_min_prio(ca, WRITE);
718
719         /*
720          * Find buckets with lowest read priority, by building a maxheap sorted
721          * by read priority and repeatedly replacing the maximum element until
722          * all buckets have been visited.
723          */
724         for_each_bucket(g, ca) {
725                 struct bucket_mark m = READ_ONCE(g->mark);
726
727                 if (!bch2_can_invalidate_bucket(ca, g, m))
728                         continue;
729
730                 e = (struct alloc_heap_entry) {
731                         .bucket = g - ca->buckets,
732                         .key    = bucket_sort_key(ca, g, m)
733                 };
734
735                 heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
736         }
737
738         heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
739
740         /*
741          * If we run out of buckets to invalidate, bch2_allocator_thread() will
742          * kick stuff and retry us
743          */
744         while (!fifo_full(&ca->free_inc) &&
745                heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
746                 bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]);
747
748         mutex_unlock(&ca->fs->bucket_lock);
749 }
750
751 static void invalidate_buckets_fifo(struct bch_dev *ca)
752 {
753         struct bucket_mark m;
754         struct bucket *g;
755         size_t checked = 0;
756
757         while (!fifo_full(&ca->free_inc)) {
758                 if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
759                     ca->fifo_last_bucket >= ca->mi.nbuckets)
760                         ca->fifo_last_bucket = ca->mi.first_bucket;
761
762                 g = ca->buckets + ca->fifo_last_bucket++;
763                 m = READ_ONCE(g->mark);
764
765                 if (bch2_can_invalidate_bucket(ca, g, m))
766                         bch2_invalidate_one_bucket(ca, g);
767
768                 if (++checked >= ca->mi.nbuckets)
769                         return;
770         }
771 }
772
773 static void invalidate_buckets_random(struct bch_dev *ca)
774 {
775         struct bucket_mark m;
776         struct bucket *g;
777         size_t checked = 0;
778
779         while (!fifo_full(&ca->free_inc)) {
780                 size_t n = bch2_rand_range(ca->mi.nbuckets -
781                                           ca->mi.first_bucket) +
782                         ca->mi.first_bucket;
783
784                 g = ca->buckets + n;
785                 m = READ_ONCE(g->mark);
786
787                 if (bch2_can_invalidate_bucket(ca, g, m))
788                         bch2_invalidate_one_bucket(ca, g);
789
790                 if (++checked >= ca->mi.nbuckets / 2)
791                         return;
792         }
793 }
794
795 static void invalidate_buckets(struct bch_dev *ca)
796 {
797         ca->inc_gen_needs_gc                    = 0;
798         ca->inc_gen_really_needs_gc             = 0;
799
800         switch (ca->mi.replacement) {
801         case CACHE_REPLACEMENT_LRU:
802                 invalidate_buckets_lru(ca);
803                 break;
804         case CACHE_REPLACEMENT_FIFO:
805                 invalidate_buckets_fifo(ca);
806                 break;
807         case CACHE_REPLACEMENT_RANDOM:
808                 invalidate_buckets_random(ca);
809                 break;
810         }
811 }
812
813 static int size_t_cmp(const void *_l, const void *_r)
814 {
815         const size_t *l = _l, *r = _r;
816
817         return (*l > *r) - (*l < *r);
818 }
819
820 static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
821                                     u64 *journal_seq)
822 {
823         struct btree_iter iter;
824         unsigned nr_invalidated = 0;
825         size_t b, i;
826         int ret = 0;
827
828         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
829                              BTREE_ITER_INTENT);
830
831         fifo_for_each_entry(b, &ca->free_inc, i) {
832                 ret = __bch2_alloc_write_key(c, ca, ca->buckets + b,
833                                              &iter, journal_seq);
834                 if (ret)
835                         break;
836
837                 nr_invalidated++;
838         }
839
840         bch2_btree_iter_unlock(&iter);
841         return nr_invalidated ?: ret;
842 }
843
844 /*
845  * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
846  * then add it to the freelist, waiting until there's room if necessary:
847  */
848 static void discard_invalidated_bucket(struct bch_dev *ca, long bucket)
849 {
850         if (ca->mi.discard &&
851             blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
852                 blkdev_issue_discard(ca->disk_sb.bdev,
853                                      bucket_to_sector(ca, bucket),
854                                      ca->mi.bucket_size, GFP_NOIO, 0);
855
856         while (1) {
857                 bool pushed = false;
858                 unsigned i;
859
860                 set_current_state(TASK_INTERRUPTIBLE);
861
862                 /*
863                  * Don't remove from free_inc until after it's added to
864                  * freelist, so gc can find it:
865                  */
866                 spin_lock(&ca->freelist_lock);
867                 for (i = 0; i < RESERVE_NR; i++)
868                         if (fifo_push(&ca->free[i], bucket)) {
869                                 fifo_pop(&ca->free_inc, bucket);
870                                 closure_wake_up(&ca->fs->freelist_wait);
871                                 pushed = true;
872                                 break;
873                         }
874                 spin_unlock(&ca->freelist_lock);
875
876                 if (pushed)
877                         break;
878
879                 if (kthread_should_stop())
880                         break;
881
882                 schedule();
883                 try_to_freeze();
884         }
885
886         __set_current_state(TASK_RUNNING);
887 }
888
889 /**
890  * bch2_allocator_thread - move buckets from free_inc to reserves
891  *
892  * The free_inc FIFO is populated by invalidate_buckets(), and
893  * the reserves are depleted by bucket allocation. When we run out
894  * of free_inc, try to invalidate some buckets and write out
895  * prios and gens.
896  */
897 static int bch2_allocator_thread(void *arg)
898 {
899         struct bch_dev *ca = arg;
900         struct bch_fs *c = ca->fs;
901         u64 journal_seq;
902         size_t bucket;
903         int ret;
904
905         set_freezable();
906
907         while (1) {
908                 while (1) {
909                         while (ca->nr_invalidated) {
910                                 BUG_ON(fifo_empty(&ca->free_inc));
911
912                                 bucket = fifo_peek(&ca->free_inc);
913                                 discard_invalidated_bucket(ca, bucket);
914                                 if (kthread_should_stop())
915                                         goto out;
916                                 --ca->nr_invalidated;
917                         }
918
919                         if (fifo_empty(&ca->free_inc))
920                                 break;
921
922                         journal_seq = 0;
923                         ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
924                         if (ret < 0)
925                                 goto out;
926
927                         ca->nr_invalidated = ret;
928
929                         if (ca->nr_invalidated == fifo_used(&ca->free_inc))
930                                 ca->alloc_thread_started = true;
931
932                         if (ca->allocator_invalidating_data)
933                                 bch2_journal_flush_seq(&c->journal, journal_seq);
934                         else if (ca->allocator_journal_seq_flush)
935                                 bch2_journal_flush_seq(&c->journal,
936                                                        ca->allocator_journal_seq_flush);
937                 }
938
939                 /* Reset front/back so we can easily sort fifo entries later: */
940                 ca->free_inc.front = ca->free_inc.back  = 0;
941                 ca->allocator_journal_seq_flush         = 0;
942                 ca->allocator_invalidating_data         = false;
943
944                 down_read(&c->gc_lock);
945                 if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
946                         up_read(&c->gc_lock);
947                         goto out;
948                 }
949
950                 while (1) {
951                         /*
952                          * Find some buckets that we can invalidate, either
953                          * they're completely unused, or only contain clean data
954                          * that's been written back to the backing device or
955                          * another cache tier
956                          */
957
958                         invalidate_buckets(ca);
959                         trace_alloc_batch(ca, fifo_used(&ca->free_inc),
960                                           ca->free_inc.size);
961
962                         if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
963                              (!fifo_full(&ca->free_inc) &&
964                               ca->inc_gen_really_needs_gc >=
965                               fifo_free(&ca->free_inc))) &&
966                             c->gc_thread) {
967                                 atomic_inc(&c->kick_gc);
968                                 wake_up_process(c->gc_thread);
969                         }
970
971                         if (fifo_full(&ca->free_inc))
972                                 break;
973
974                         if (wait_buckets_available(c, ca)) {
975                                 up_read(&c->gc_lock);
976                                 goto out;
977                         }
978                 }
979                 up_read(&c->gc_lock);
980
981                 BUG_ON(ca->free_inc.front);
982
983                 spin_lock(&ca->freelist_lock);
984                 sort(ca->free_inc.data,
985                      ca->free_inc.back,
986                      sizeof(ca->free_inc.data[0]),
987                      size_t_cmp, NULL);
988                 spin_unlock(&ca->freelist_lock);
989
990                 /*
991                  * free_inc is now full of newly-invalidated buckets: next,
992                  * write out the new bucket gens:
993                  */
994         }
995 out:
996         /*
997          * Avoid a race with bch2_usage_update() trying to wake us up after
998          * we've exited:
999          */
1000         synchronize_rcu();
1001         return 0;
1002 }
1003
1004 /* Allocation */
1005
1006 static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
1007 {
1008         struct bucket *g;
1009         long r = -1;
1010
1011         if (!down_read_trylock(&c->gc_lock))
1012                 return r;
1013
1014         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
1015                 goto out;
1016
1017         for_each_bucket(g, ca)
1018                 if (!g->mark.touched_this_mount &&
1019                     is_available_bucket(g->mark) &&
1020                     bch2_mark_alloc_bucket_startup(ca, g)) {
1021                         r = g - ca->buckets;
1022                         break;
1023                 }
1024 out:
1025         up_read(&c->gc_lock);
1026         return r;
1027 }
1028
1029 /**
1030  * bch2_bucket_alloc - allocate a single bucket from a specific device
1031  *
1032  * Returns index of bucket on success, -1 on failure
1033  */
1034 long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
1035                        enum alloc_reserve reserve)
1036 {
1037         size_t r;
1038
1039         spin_lock(&ca->freelist_lock);
1040         if (likely(fifo_pop(&ca->free[RESERVE_NONE], r)))
1041                 goto out;
1042
1043         switch (reserve) {
1044         case RESERVE_ALLOC:
1045                 if (fifo_pop(&ca->free[RESERVE_BTREE], r))
1046                         goto out;
1047                 break;
1048         case RESERVE_BTREE:
1049                 if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
1050                     ca->free[RESERVE_BTREE].size &&
1051                     fifo_pop(&ca->free[RESERVE_BTREE], r))
1052                         goto out;
1053                 break;
1054         case RESERVE_MOVINGGC:
1055                 if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r))
1056                         goto out;
1057                 break;
1058         default:
1059                 break;
1060         }
1061
1062         spin_unlock(&ca->freelist_lock);
1063
1064         if (unlikely(!ca->alloc_thread_started) &&
1065             (r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
1066                 verify_not_on_freelist(ca, r);
1067                 goto out2;
1068         }
1069
1070         trace_bucket_alloc_fail(ca, reserve);
1071         return -1;
1072 out:
1073         verify_not_on_freelist(ca, r);
1074         spin_unlock(&ca->freelist_lock);
1075
1076         bch2_wake_allocator(ca);
1077 out2:
1078         ca->buckets[r].prio[READ]       = c->prio_clock[READ].hand;
1079         ca->buckets[r].prio[WRITE]      = c->prio_clock[WRITE].hand;
1080
1081         trace_bucket_alloc(ca, reserve);
1082         return r;
1083 }
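
/*
 * Sketch of a typical caller (error handling elided; RESERVE_NONE stands in
 * for whatever reserve the caller actually needs):
 *
 *	long b = bch2_bucket_alloc(c, ca, RESERVE_NONE);
 *	if (b < 0)
 *		return -ENOSPC;	// or wait on c->freelist_wait and retry
 *	// bucket_to_sector(ca, b) gives the first sector of the new bucket
 */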
1084
1085 enum bucket_alloc_ret {
1086         ALLOC_SUCCESS,
1087         NO_DEVICES,             /* -EROFS */
1088         FREELIST_EMPTY,         /* Allocator thread not keeping up */
1089 };
1090
1091 static void recalc_alloc_group_weights(struct bch_fs *c,
1092                                        struct dev_group *devs)
1093 {
1094         struct bch_dev *ca;
1095         u64 available_buckets = 1; /* avoid a divide by zero... */
1096         unsigned i;
1097
1098         for (i = 0; i < devs->nr; i++) {
1099                 ca = devs->d[i].dev;
1100
1101                 devs->d[i].weight = dev_buckets_free(ca);
1102                 available_buckets += devs->d[i].weight;
1103         }
1104
1105         for (i = 0; i < devs->nr; i++) {
1106                 const unsigned min_weight = U32_MAX >> 4;
1107                 const unsigned max_weight = U32_MAX;
1108
1109                 devs->d[i].weight =
1110                         min_weight +
1111                         div64_u64(devs->d[i].weight *
1112                                   devs->nr *
1113                                   (max_weight - min_weight),
1114                                   available_buckets);
1115                 devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
1116         }
1117 }
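
/*
 * Example (invented numbers): with two devices in a group holding 300 and 100
 * free buckets, the weights come out roughly proportional to free space,
 * scaled into [U32_MAX / 16, U32_MAX] and clamped; the device with fewer free
 * buckets is then skipped by the get_random_int() > weight test in
 * bch2_bucket_alloc_group() more often, biasing allocation toward the device
 * with more free space.
 */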
1118
1119 static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
1120                                                     struct open_bucket *ob,
1121                                                     enum alloc_reserve reserve,
1122                                                     unsigned nr_replicas,
1123                                                     struct dev_group *devs,
1124                                                     long *devs_used)
1125 {
1126         enum bucket_alloc_ret ret;
1127         unsigned fail_idx = -1, i;
1128         unsigned available = 0;
1129
1130         BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
1131
1132         if (ob->nr_ptrs >= nr_replicas)
1133                 return ALLOC_SUCCESS;
1134
1135         spin_lock(&devs->lock);
1136
1137         for (i = 0; i < devs->nr; i++)
1138                 available += !test_bit(devs->d[i].dev->dev_idx,
1139                                        devs_used);
1140
1141         recalc_alloc_group_weights(c, devs);
1142
1143         i = devs->cur_device;
1144
1145         while (ob->nr_ptrs < nr_replicas) {
1146                 struct bch_dev *ca;
1147                 long bucket;
1148
1149                 if (!available) {
1150                         ret = NO_DEVICES;
1151                         goto err;
1152                 }
1153
1154                 i++;
1155                 i %= devs->nr;
1156
1157                 ret = FREELIST_EMPTY;
1158                 if (i == fail_idx)
1159                         goto err;
1160
1161                 ca = devs->d[i].dev;
1162
1163                 if (test_bit(ca->dev_idx, devs_used))
1164                         continue;
1165
1166                 if (fail_idx == -1 &&
1167                     get_random_int() > devs->d[i].weight)
1168                         continue;
1169
1170                 bucket = bch2_bucket_alloc(c, ca, reserve);
1171                 if (bucket < 0) {
1172                         if (fail_idx == -1)
1173                                 fail_idx = i;
1174                         continue;
1175                 }
1176
1177                 /*
1178                  * open_bucket_add_buckets expects new pointers at the head of
1179                  * the list:
1180                  */
1181                 memmove(&ob->ptrs[1],
1182                         &ob->ptrs[0],
1183                         ob->nr_ptrs * sizeof(ob->ptrs[0]));
1184                 memmove(&ob->ptr_offset[1],
1185                         &ob->ptr_offset[0],
1186                         ob->nr_ptrs * sizeof(ob->ptr_offset[0]));
1187                 ob->nr_ptrs++;
1188                 ob->ptrs[0] = (struct bch_extent_ptr) {
1189                         .gen    = ca->buckets[bucket].mark.gen,
1190                         .offset = bucket_to_sector(ca, bucket),
1191                         .dev    = ca->dev_idx,
1192                 };
1193                 ob->ptr_offset[0] = 0;
1194
1195                 __set_bit(ca->dev_idx, devs_used);
1196                 available--;
1197                 devs->cur_device = i;
1198         }
1199
1200         ret = ALLOC_SUCCESS;
1201 err:
1202         EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
1203         spin_unlock(&devs->lock);
1204         return ret;
1205 }
1206
1207 static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
1208                                                     struct write_point *wp,
1209                                                     struct open_bucket *ob,
1210                                                     unsigned nr_replicas,
1211                                                     enum alloc_reserve reserve,
1212                                                     long *devs_used)
1213 {
1214         struct bch_tier *tier;
1215         /*
1216          * this should implement policy - for a given type of allocation, decide
1217          * which devices to allocate from:
1218          *
1219          * XXX: switch off wp->type and do something more intelligent here
1220          */
1221         if (wp->group)
1222                 return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
1223                                               wp->group, devs_used);
1224
1225         /* foreground writes: prefer fastest tier: */
1226         tier = READ_ONCE(c->fastest_tier);
1227         if (tier)
1228                 bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
1229                                        &tier->devs, devs_used);
1230
1231         return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
1232                                       &c->all_devs, devs_used);
1233 }
1234
1235 static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
1236                                 struct open_bucket *ob, unsigned nr_replicas,
1237                                 enum alloc_reserve reserve, long *devs_used,
1238                                 struct closure *cl)
1239 {
1240         bool waiting = false;
1241
1242         while (1) {
1243                 switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
1244                                                reserve, devs_used)) {
1245                 case ALLOC_SUCCESS:
1246                         if (waiting)
1247                                 closure_wake_up(&c->freelist_wait);
1248
1249                         return 0;
1250
1251                 case NO_DEVICES:
1252                         if (waiting)
1253                                 closure_wake_up(&c->freelist_wait);
1254                         return -EROFS;
1255
1256                 case FREELIST_EMPTY:
1257                         if (!cl || waiting)
1258                                 trace_freelist_empty_fail(c,
1259                                                         reserve, cl);
1260
1261                         if (!cl)
1262                                 return -ENOSPC;
1263
1264                         if (waiting)
1265                                 return -EAGAIN;
1266
1267                         /* Retry allocation after adding ourself to waitlist: */
1268                         closure_wait(&c->freelist_wait, cl);
1269                         waiting = true;
1270                         break;
1271                 default:
1272                         BUG();
1273                 }
1274         }
1275 }
1276
1277 /* Open buckets: */
1278
1279 /*
1280  * Open buckets represent one or more buckets (on multiple devices) that are
1281  * currently being allocated from. They serve two purposes:
1282  *
1283  *  - They track buckets that have been partially allocated, allowing for
1284  *    sub-bucket sized allocations - they're used by the sector allocator below
1285  *
1286  *  - They provide a reference to the buckets they own that mark and sweep GC
1287  *    can find, until the new allocation has a pointer to it inserted into the
1288  *    btree
1289  *
1290  * When allocating some space with the sector allocator, the allocation comes
1291  * with a reference to an open bucket - the caller is required to put that
1292  * reference _after_ doing the index update that makes its allocation reachable.
1293  */
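
/*
 * Putting the pieces together, a write roughly does the following (sketch -
 * the real write path also handles -EAGAIN/-ENOSPC and partial allocations):
 *
 *	ob = bch2_alloc_sectors_start(c, wp, nr_replicas,
 *				      nr_replicas_required, reserve, cl);
 *	bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, sectors);
 *	bch2_alloc_sectors_done(c, wp, ob);
 *	// ... do the data write and the index update, then:
 *	bch2_open_bucket_put(c, ob);
 */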
1294
1295 static void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
1296 {
1297         const struct bch_extent_ptr *ptr;
1298
1299         lockdep_assert_held(&c->open_buckets_lock);
1300
1301         open_bucket_for_each_ptr(ob, ptr) {
1302                 struct bch_dev *ca = c->devs[ptr->dev];
1303
1304                 bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false);
1305         }
1306
1307         ob->nr_ptrs = 0;
1308
1309         list_move(&ob->list, &c->open_buckets_free);
1310         c->open_buckets_nr_free++;
1311         closure_wake_up(&c->open_buckets_wait);
1312 }
1313
1314 void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *b)
1315 {
1316         if (atomic_dec_and_test(&b->pin)) {
1317                 spin_lock(&c->open_buckets_lock);
1318                 __bch2_open_bucket_put(c, b);
1319                 spin_unlock(&c->open_buckets_lock);
1320         }
1321 }
1322
1323 static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c,
1324                                                unsigned nr_reserved,
1325                                                struct closure *cl)
1326 {
1327         struct open_bucket *ret;
1328
1329         spin_lock(&c->open_buckets_lock);
1330
1331         if (c->open_buckets_nr_free > nr_reserved) {
1332                 BUG_ON(list_empty(&c->open_buckets_free));
1333                 ret = list_first_entry(&c->open_buckets_free,
1334                                        struct open_bucket, list);
1335                 list_move(&ret->list, &c->open_buckets_open);
1336                 BUG_ON(ret->nr_ptrs);
1337
1338                 atomic_set(&ret->pin, 1); /* XXX */
1339                 ret->has_full_ptrs      = false;
1340
1341                 c->open_buckets_nr_free--;
1342                 trace_open_bucket_alloc(c, cl);
1343         } else {
1344                 trace_open_bucket_alloc_fail(c, cl);
1345
1346                 if (cl) {
1347                         closure_wait(&c->open_buckets_wait, cl);
1348                         ret = ERR_PTR(-EAGAIN);
1349                 } else
1350                         ret = ERR_PTR(-ENOSPC);
1351         }
1352
1353         spin_unlock(&c->open_buckets_lock);
1354
1355         return ret;
1356 }
1357
1358 static unsigned ob_ptr_sectors_free(struct bch_fs *c,
1359                                     struct open_bucket *ob,
1360                                     struct bch_extent_ptr *ptr)
1361 {
1362         struct bch_dev *ca = c->devs[ptr->dev];
1363         unsigned i = ptr - ob->ptrs;
1364         unsigned bucket_size = ca->mi.bucket_size;
1365         unsigned used = (ptr->offset & (bucket_size - 1)) +
1366                 ob->ptr_offset[i];
1367
1368         BUG_ON(used > bucket_size);
1369
1370         return bucket_size - used;
1371 }
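
/*
 * e.g. (made-up geometry): with 1024 sector buckets, a pointer that starts
 * 256 sectors into its bucket and from which 128 sectors have already been
 * handed out via ptr_offset has 1024 - (256 + 128) = 640 sectors left.
 */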
1372
1373 static unsigned open_bucket_sectors_free(struct bch_fs *c,
1374                                          struct open_bucket *ob,
1375                                          unsigned nr_replicas)
1376 {
1377         unsigned i, sectors_free = UINT_MAX;
1378
1379         for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++)
1380                 sectors_free = min(sectors_free,
1381                                    ob_ptr_sectors_free(c, ob, &ob->ptrs[i]));
1382
1383         return sectors_free != UINT_MAX ? sectors_free : 0;
1384 }
1385
1386 static void open_bucket_copy_unused_ptrs(struct bch_fs *c,
1387                                          struct open_bucket *new,
1388                                          struct open_bucket *old)
1389 {
1390         unsigned i;
1391
1392         for (i = 0; i < old->nr_ptrs; i++)
1393                 if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) {
1394                         struct bch_extent_ptr tmp = old->ptrs[i];
1395
1396                         tmp.offset += old->ptr_offset[i];
1397                         new->ptrs[new->nr_ptrs] = tmp;
1398                         new->ptr_offset[new->nr_ptrs] = 0;
1399                         new->nr_ptrs++;
1400                 }
1401 }
1402
1403 static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob)
1404 {
1405 #ifdef CONFIG_BCACHEFS_DEBUG
1406         const struct bch_extent_ptr *ptr;
1407
1408         open_bucket_for_each_ptr(ob, ptr) {
1409                 struct bch_dev *ca = c->devs[ptr->dev];
1410
1411                 BUG_ON(ptr_stale(ca, ptr));
1412         }
1413 #endif
1414 }
1415
1416 /* Sector allocator */
1417
1418 static struct open_bucket *lock_writepoint(struct bch_fs *c,
1419                                            struct write_point *wp)
1420 {
1421         struct open_bucket *ob;
1422
1423         while ((ob = ACCESS_ONCE(wp->b))) {
1424                 mutex_lock(&ob->lock);
1425                 if (wp->b == ob)
1426                         break;
1427
1428                 mutex_unlock(&ob->lock);
1429         }
1430
1431         return ob;
1432 }
1433
1434 static int open_bucket_add_buckets(struct bch_fs *c,
1435                                    struct write_point *wp,
1436                                    struct open_bucket *ob,
1437                                    unsigned nr_replicas,
1438                                    unsigned nr_replicas_required,
1439                                    enum alloc_reserve reserve,
1440                                    struct closure *cl)
1441 {
1442         long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
1443         unsigned i;
1444         int ret;
1445
1446         /*
1447          * We might be allocating pointers to add to an existing extent
1448          * (tiering/copygc/migration) - if so, some of the pointers in our
1449          * existing open bucket might duplicate devices we already have. This is
1450          * moderately annoying.
1451          */
1452
1453         /* Short circuit all the fun stuff if possible: */
1454         if (ob->nr_ptrs >= nr_replicas)
1455                 return 0;
1456
1457         memset(devs_used, 0, sizeof(devs_used));
1458
1459         for (i = 0; i < ob->nr_ptrs; i++)
1460                 __set_bit(ob->ptrs[i].dev, devs_used);
1461
1462         ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
1463                                    reserve, devs_used, cl);
1464
1465         if (ret == -EROFS &&
1466             ob->nr_ptrs >= nr_replicas_required)
1467                 ret = 0;
1468
1469         return ret;
1470 }
1471
1472 /*
1473  * Get us an open_bucket we can allocate from, return with it locked:
1474  */
1475 struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c,
1476                                              struct write_point *wp,
1477                                              unsigned nr_replicas,
1478                                              unsigned nr_replicas_required,
1479                                              enum alloc_reserve reserve,
1480                                              struct closure *cl)
1481 {
1482         struct open_bucket *ob;
1483         unsigned open_buckets_reserved = wp == &c->btree_write_point
1484                 ? 0 : BTREE_NODE_RESERVE;
1485         int ret;
1486
1487         BUG_ON(!nr_replicas);
1488 retry:
1489         ob = lock_writepoint(c, wp);
1490
1491         /*
1492          * If ob->sectors_free == 0, one or more of the buckets ob points to is
1493          * full. We can't drop pointers from an open bucket - garbage collection
1494          * still needs to find them; instead, we must allocate a new open bucket
1495          * and copy any pointers to non-full buckets into the new open bucket.
1496          */
1497         if (!ob || ob->has_full_ptrs) {
1498                 struct open_bucket *new_ob;
1499
1500                 new_ob = bch2_open_bucket_get(c, open_buckets_reserved, cl);
1501                 if (IS_ERR(new_ob))
1502                         return new_ob;
1503
1504                 mutex_lock(&new_ob->lock);
1505
1506                 /*
1507                  * We point the write point at the open_bucket before doing the
1508                  * allocation to avoid a race with shutdown:
1509                  */
1510                 if (race_fault() ||
1511                     cmpxchg(&wp->b, ob, new_ob) != ob) {
1512                         /* We raced: */
1513                         mutex_unlock(&new_ob->lock);
1514                         bch2_open_bucket_put(c, new_ob);
1515
1516                         if (ob)
1517                                 mutex_unlock(&ob->lock);
1518                         goto retry;
1519                 }
1520
1521                 if (ob) {
1522                         open_bucket_copy_unused_ptrs(c, new_ob, ob);
1523                         mutex_unlock(&ob->lock);
1524                         bch2_open_bucket_put(c, ob);
1525                 }
1526
1527                 ob = new_ob;
1528         }
1529
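        /*
         * Make sure @ob has pointers to at least @nr_replicas_required
         * (ideally @nr_replicas) devices:
         */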
1530         ret = open_bucket_add_buckets(c, wp, ob, nr_replicas,
1531                                       nr_replicas_required,
1532                                       reserve, cl);
1533         if (ret) {
1534                 mutex_unlock(&ob->lock);
1535                 return ERR_PTR(ret);
1536         }
1537
1538         ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
1539
1540         BUG_ON(!ob->sectors_free);
1541         verify_not_stale(c, ob);
1542
1543         return ob;
1544 }
1545
1546 /*
1547  * Append pointers to the space we just allocated to @e, and mark @sectors space
1548  * as allocated out of @ob
1549  */
1550 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
1551                                     unsigned nr_replicas, struct open_bucket *ob,
1552                                     unsigned sectors)
1553 {
1554         struct bch_extent_ptr tmp;
1555         bool has_data = false;
1556         unsigned i;
1557
1558         /*
1559          * We're keeping any existing pointers @e has, and appending new pointers:
1560          * __bch2_write() will only write to the pointers we add here:
1561          */
1562
1563         BUG_ON(sectors > ob->sectors_free);
1564
1565         /* didn't use all the ptrs: */
1566         if (nr_replicas < ob->nr_ptrs)
1567                 has_data = true;
1568
1569         for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) {
1570                 EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
1571
1572                 tmp = ob->ptrs[i];
1573                 tmp.cached = bkey_extent_is_cached(&e->k);
1574                 tmp.offset += ob->ptr_offset[i];
1575                 extent_ptr_append(e, tmp);
1576
1577                 ob->ptr_offset[i] += sectors;
1578
1579                 this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors);
1580         }
1581 }
1582
1583 /*
1584  * Finish an allocation from @ob: mark any now-full buckets; pin @ob and leave
1585  * it on @wp if it still has free space, otherwise detach it from @wp.
1586  */
1587 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp,
1588                             struct open_bucket *ob)
1589 {
1590         bool has_data = false;
1591         unsigned i;
1592
1593         for (i = 0; i < ob->nr_ptrs; i++) {
1594                 if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i]))
1595                         ob->has_full_ptrs = true;
1596                 else
1597                         has_data = true;
1598         }
1599
1600         if (likely(has_data))
1601                 atomic_inc(&ob->pin);
1602         else
1603                 BUG_ON(xchg(&wp->b, NULL) != ob);
1604
1605         mutex_unlock(&ob->lock);
1606 }
1607
1608 /*
1609  * Allocates some space to write to, points @e at the newly allocated space,
1610  * and updates @e's size and offset (to point to the end of the newly
1611  * allocated space).
1612  *
1613  * May allocate fewer sectors than requested; @e's size indicates how many
1614  * sectors were actually allocated.
1615  *
1616  * Return codes (via ERR_PTR):
1617  * - -EAGAIN: closure was added to waitlist
1618  * - -ENOSPC: out of space and no closure provided
1619  *
1620  * @c  - filesystem.
1621  * @wp - write point to use for allocating sectors.
1622  * @e  - extent key to return the allocated space information.
1623  * @cl - closure to wait for a bucket
1624  */
1625 struct open_bucket *bch2_alloc_sectors(struct bch_fs *c,
1626                                        struct write_point *wp,
1627                                        struct bkey_i_extent *e,
1628                                        unsigned nr_replicas,
1629                                        unsigned nr_replicas_required,
1630                                        enum alloc_reserve reserve,
1631                                        struct closure *cl)
1632 {
1633         struct open_bucket *ob;
1634
1635         ob = bch2_alloc_sectors_start(c, wp, nr_replicas,
1636                                      nr_replicas_required,
1637                                      reserve, cl);
1638         if (IS_ERR_OR_NULL(ob))
1639                 return ob;
1640
1641         if (e->k.size > ob->sectors_free)
1642                 bch2_key_resize(&e->k, ob->sectors_free);
1643
1644         bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size);
1645
1646         bch2_alloc_sectors_done(c, wp, ob);
1647
1648         return ob;
1649 }
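
/*
 * Sketch of a typical caller (illustrative only, not lifted from a real
 * caller; the write point and reserve chosen here are arbitrary):
 *
 *	ob = bch2_alloc_sectors(c, &c->write_points[0], e, nr_replicas,
 *				nr_replicas, RESERVE_NONE, cl);
 *	if (IS_ERR_OR_NULL(ob))
 *		... handle -EAGAIN/-ENOSPC ...
 *
 *	then submit the write to the pointers appended to @e, and release @ob
 *	with bch2_open_bucket_put() once the write completes.
 */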
1650
1651 /* Startup/shutdown (ro/rw): */
1652
1653 void bch2_recalc_capacity(struct bch_fs *c)
1654 {
1655         struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
1656         struct bch_dev *ca;
1657         u64 total_capacity, capacity = 0, reserved_sectors = 0;
1658         unsigned long ra_pages = 0;
1659         unsigned i, j;
1660
1661         for_each_online_member(ca, c, i) {
1662                 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
1663
1664                 ra_pages += bdi->ra_pages;
1665         }
1666
1667         c->bdi.ra_pages = ra_pages;
1668
1669         /* Find fastest, slowest tiers with devices: */
1670
1671         for (tier = c->tiers;
1672              tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
1673                 if (!tier->devs.nr)
1674                         continue;
1675                 if (!fastest_tier)
1676                         fastest_tier = tier;
1677                 slowest_tier = tier;
1678         }
1679
1680         c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
1681
1682         if (!fastest_tier)
1683                 goto set_capacity;
1684
1685         c->promote_write_point.group = &fastest_tier->devs;
1686
1687         /*
1688          * Capacity of the filesystem is the capacity of all the devices in the
1689          * slowest (highest) tier - we don't include lower tier devices.
1690          */
1691         spin_lock(&slowest_tier->devs.lock);
1692         group_for_each_dev(ca, &slowest_tier->devs, i) {
1693                 size_t reserve = 0;
1694
1695                 /*
1696                  * We need to reserve buckets (from the number
1697                  * of currently available buckets) against
1698                  * foreground writes so that mainly copygc can
1699                  * make forward progress.
1700                  *
1701                  * We need enough to refill the various reserves
1702                  * from scratch - copygc will use its entire
1703          * reserve all at once, then run again when
1704                  * its reserve is refilled (from the formerly
1705                  * available buckets).
1706                  *
1707                  * This reserve is just used when considering if
1708                  * allocations for foreground writes must wait -
1709                  * not -ENOSPC calculations.
1710                  */
1711                 for (j = 0; j < RESERVE_NONE; j++)
1712                         reserve += ca->free[j].size;
1713
1714                 reserve += ca->free_inc.size;
1715
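                /* one bucket per foreground write point: */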
1716                 reserve += ARRAY_SIZE(c->write_points);
1717
1718                 if (ca->mi.tier)
1719                         reserve += 1;   /* tiering write point */
1720                 reserve += 1;           /* btree write point */
1721
1722                 reserved_sectors += reserve << ca->bucket_bits;
1723
1724                 capacity += (ca->mi.nbuckets -
1725                              ca->mi.first_bucket) <<
1726                         ca->bucket_bits;
1727         }
1728         spin_unlock(&slowest_tier->devs.lock);
1729 set_capacity:
1730         total_capacity = capacity;
1731
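        /*
         * Usable capacity is the smaller of (100 - gc_reserve_percent)% of
         * the raw capacity and the raw capacity less the reserves computed
         * above:
         */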
1732         capacity *= (100 - c->opts.gc_reserve_percent);
1733         capacity = div64_u64(capacity, 100);
1734
1735         BUG_ON(reserved_sectors > total_capacity);
1736
1737         capacity = min(capacity, total_capacity - reserved_sectors);
1738
1739         c->capacity = capacity;
1740
1741         if (c->capacity) {
1742                 bch2_io_timer_add(&c->io_clock[READ],
1743                                  &c->prio_clock[READ].rescale);
1744                 bch2_io_timer_add(&c->io_clock[WRITE],
1745                                  &c->prio_clock[WRITE].rescale);
1746         } else {
1747                 bch2_io_timer_del(&c->io_clock[READ],
1748                                  &c->prio_clock[READ].rescale);
1749                 bch2_io_timer_del(&c->io_clock[WRITE],
1750                                  &c->prio_clock[WRITE].rescale);
1751         }
1752
1753         /* Wake up in case someone was waiting for buckets */
1754         closure_wake_up(&c->freelist_wait);
1755 }
1756
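/*
 * If @wp's open bucket has a pointer to @ca, detach the open bucket from the
 * write point and drop the write point's reference to it:
 */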
1757 static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
1758                                   struct write_point *wp)
1759 {
1760         struct open_bucket *ob;
1761         struct bch_extent_ptr *ptr;
1762
1763         ob = lock_writepoint(c, wp);
1764         if (!ob)
1765                 return;
1766
1767         for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
1768                 if (ptr->dev == ca->dev_idx)
1769                         goto found;
1770
1771         mutex_unlock(&ob->lock);
1772         return;
1773 found:
1774         BUG_ON(xchg(&wp->b, NULL) != ob);
1775         mutex_unlock(&ob->lock);
1776
1777         /* Drop writepoint's ref: */
1778         bch2_open_bucket_put(c, ob);
1779 }
1780
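/*
 * Returns true if any pinned open bucket still has a pointer to @ca - i.e.
 * there may still be writes in flight to this device:
 */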
1781 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
1782 {
1783         struct bch_extent_ptr *ptr;
1784         struct open_bucket *ob;
1785
1786         for (ob = c->open_buckets;
1787              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1788              ob++)
1789                 if (atomic_read(&ob->pin)) {
1790                         mutex_lock(&ob->lock);
1791                         for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
1792                                 if (ptr->dev == ca->dev_idx) {
1793                                         mutex_unlock(&ob->lock);
1794                                         return true;
1795                                 }
1796                         mutex_unlock(&ob->lock);
1797                 }
1798
1799         return false;
1800 }
1801
1802 /* device goes ro: */
1803 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
1804 {
1805         struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
1806         struct closure cl;
1807         unsigned i;
1808
1809         BUG_ON(ca->alloc_thread);
1810
1811         closure_init_stack(&cl);
1812
1813         /* First, remove device from allocation groups: */
1814
1815         bch2_dev_group_remove(&c->journal.devs, ca);
1816         bch2_dev_group_remove(tier, ca);
1817         bch2_dev_group_remove(&c->all_devs, ca);
1818
1819         /*
1820          * Capacity is calculated based off of devices in allocation groups:
1821          */
1822         bch2_recalc_capacity(c);
1823
1824         /* Next, close write points that point to this device... */
1825         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1826                 bch2_stop_write_point(c, ca, &c->write_points[i]);
1827
1828         bch2_stop_write_point(c, ca, &ca->copygc_write_point);
1829         bch2_stop_write_point(c, ca, &c->promote_write_point);
1830         bch2_stop_write_point(c, ca, &ca->tiering_write_point);
1831         bch2_stop_write_point(c, ca, &c->migration_write_point);
1832         bch2_stop_write_point(c, ca, &c->btree_write_point);
1833
1834         mutex_lock(&c->btree_reserve_cache_lock);
1835         while (c->btree_reserve_cache_nr) {
1836                 struct btree_alloc *a =
1837                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1838
1839                 bch2_open_bucket_put(c, a->ob);
1840         }
1841         mutex_unlock(&c->btree_reserve_cache_lock);
1842
1843         /*
1844          * Wake up threads that were blocked on allocation, so they notice this
1845          * device can no longer be allocated from and the capacity has changed:
1846          */
1847         closure_wake_up(&c->freelist_wait);
1848
1849         /*
1850          * journal_res_get() can block waiting for free space in the journal -
1851          * it needs to notice there may not be devices to allocate from anymore:
1852          */
1853         wake_up(&c->journal.wait);
1854
1855         /* Now wait for any in flight writes: */
1856
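        /*
         * Add ourselves to the waitlist before checking, so that a wakeup
         * between the check and closure_sync() can't be missed:
         */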
1857         while (1) {
1858                 closure_wait(&c->open_buckets_wait, &cl);
1859
1860                 if (!bch2_dev_has_open_write_point(c, ca)) {
1861                         closure_wake_up(&c->open_buckets_wait);
1862                         break;
1863                 }
1864
1865                 closure_sync(&cl);
1866         }
1867 }
1868
1869 /* device goes rw: */
1870 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
1871 {
1872         struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
1873         struct bch_sb_field_journal *journal_buckets;
1874         bool has_journal;
1875
1876         bch2_dev_group_add(&c->all_devs, ca);
1877         bch2_dev_group_add(tier, ca);
1878
1879         mutex_lock(&c->sb_lock);
1880         journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb);
1881         has_journal = bch2_nr_journal_buckets(journal_buckets) >=
1882                 BCH_JOURNAL_BUCKETS_MIN;
1883         mutex_unlock(&c->sb_lock);
1884
1885         if (has_journal)
1886                 bch2_dev_group_add(&c->journal.devs, ca);
1887 }
1888
1889 /* stop allocator thread: */
1890 void bch2_dev_allocator_stop(struct bch_dev *ca)
1891 {
1892         struct task_struct *p = ca->alloc_thread;
1893
1894         ca->alloc_thread = NULL;
1895         smp_wmb();
1896
1897         /*
1898          * We need an rcu barrier between setting ca->alloc_thread = NULL and
1899          * the thread shutting down to avoid a race with bch2_usage_update() -
1900          * the allocator thread itself does a synchronize_rcu() on exit.
1901          *
1902          * XXX: it would be better to have the rcu barrier be asynchronous
1903          * instead of blocking us here
1904          */
1905         if (p)
1906                 kthread_stop(p);
1907 }
1908
1909 /* start allocator thread: */
1910 int bch2_dev_allocator_start(struct bch_dev *ca)
1911 {
1912         struct task_struct *p;
1913
1914         /*
1915          * allocator thread already started?
1916          */
1917         if (ca->alloc_thread)
1918                 return 0;
1919
1920         p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator");
1921         if (IS_ERR(p))
1922                 return PTR_ERR(p);
1923
1924         ca->alloc_thread = p;
1925         return 0;
1926 }
1927
1928 void bch2_fs_allocator_init(struct bch_fs *c)
1929 {
1930         unsigned i;
1931
1932         INIT_LIST_HEAD(&c->open_buckets_open);
1933         INIT_LIST_HEAD(&c->open_buckets_free);
1934         spin_lock_init(&c->open_buckets_lock);
1935         bch2_prio_timer_init(c, READ);
1936         bch2_prio_timer_init(c, WRITE);
1937
1938         /* open bucket 0 is a sentinel NULL: */
1939         mutex_init(&c->open_buckets[0].lock);
1940         INIT_LIST_HEAD(&c->open_buckets[0].list);
1941
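        /* the remaining open buckets all start out on the freelist: */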
1942         for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) {
1943                 mutex_init(&c->open_buckets[i].lock);
1944                 c->open_buckets_nr_free++;
1945                 list_add(&c->open_buckets[i].list, &c->open_buckets_free);
1946         }
1947
1948         spin_lock_init(&c->all_devs.lock);
1949
1950         for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
1951                 spin_lock_init(&c->tiers[i].devs.lock);
1952
1953         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1954                 c->write_points[i].throttle = true;
1955
1956         c->pd_controllers_update_seconds = 5;
1957         INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
1958
1959         spin_lock_init(&c->foreground_write_pd_lock);
1960         bch2_pd_controller_init(&c->foreground_write_pd);
1961         /*
1962          * We do not want the write rate to have an effect on the computed
1963          * rate:
1964          *
1965          * We do not call bch2_ratelimit_delay() at all if the write rate
1966          * exceeds 1GB/s. In this case, the PD controller will think we are
1967          * not "keeping up" and not change the rate.
1968          */
1969         c->foreground_write_pd.backpressure = 0;
1970         init_timer(&c->foreground_write_wakeup);
1971
1972         c->foreground_write_wakeup.data = (unsigned long) c;
1973         c->foreground_write_wakeup.function = bch2_wake_delayed_writes;
1974 }