git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/alloc_background.c

   1 #include "bcachefs.h"
   2 #include "alloc_background.h"
   3 #include "alloc_foreground.h"
   4 #include "btree_cache.h"
   5 #include "btree_io.h"
   6 #include "btree_update.h"
   7 #include "btree_update_interior.h"
   8 #include "btree_gc.h"
   9 #include "buckets.h"
  10 #include "clock.h"
  11 #include "debug.h"
  12 #include "ec.h"
  13 #include "error.h"
  14 #include "journal_io.h"
  15
  16 #include <linux/kthread.h>
  17 #include <linux/math64.h>
  18 #include <linux/random.h>
  19 #include <linux/rculist.h>
  20 #include <linux/rcupdate.h>
  21 #include <linux/sched/task.h>
  22 #include <linux/sort.h>
  23 #include <trace/events/bcachefs.h>
  24
  25 static const char * const bch2_alloc_field_names[] = {
  26 #define x(name, bytes) #name,
  27         BCH_ALLOC_FIELDS()
  28 #undef x
  29         NULL
  30 };
  31
  32 static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
  33
  34 /* Ratelimiting/PD controllers */
  35
  36 static void pd_controllers_update(struct work_struct *work)
  37 {
  38         struct bch_fs *c = container_of(to_delayed_work(work),
  39                                            struct bch_fs,
  40                                            pd_controllers_update);
  41         struct bch_dev *ca;
  42         unsigned i;
  43
  44         for_each_member_device(ca, c, i) {
  45                 struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
  46
  47                 u64 free = bucket_to_sector(ca,
  48                                 __dev_buckets_free(ca, stats)) << 9;
  49                 /*
  50                  * Bytes of internal fragmentation, which can be
  51                  * reclaimed by copy GC
  52                  */
  53                 s64 fragmented = (bucket_to_sector(ca,
  54                                         stats.buckets[BCH_DATA_USER] +
  55                                         stats.buckets[BCH_DATA_CACHED]) -
  56                                   (stats.sectors[BCH_DATA_USER] +
  57                                    stats.sectors[BCH_DATA_CACHED])) << 9;
  58
  59                 fragmented = max(0LL, fragmented);
  60
  61                 bch2_pd_controller_update(&ca->copygc_pd,
  62                                          free, fragmented, -1);
  63         }
  64
  65         schedule_delayed_work(&c->pd_controllers_update,
  66                               c->pd_controllers_update_seconds * HZ);
  67 }
  68
  69 /* Persistent alloc info: */
  70
  71 static inline u64 get_alloc_field(const struct bch_alloc *a,
  72                                   const void **p, unsigned field)
  73 {
  74         unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
  75         u64 v;
  76
  77         if (!(a->fields & (1 << field)))
  78                 return 0;
  79
  80         switch (bytes) {
  81         case 1:
  82                 v = *((const u8 *) *p);
  83                 break;
  84         case 2:
  85                 v = le16_to_cpup(*p);
  86                 break;
  87         case 4:
  88                 v = le32_to_cpup(*p);
  89                 break;
  90         case 8:
  91                 v = le64_to_cpup(*p);
  92                 break;
  93         default:
  94                 BUG();
  95         }
  96
  97         *p += bytes;
  98         return v;
  99 }
 100
 101 static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
 102                                    unsigned field, u64 v)
 103 {
 104         unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
 105
 106         if (!v)
 107                 return;
 108
 109         a->v.fields |= 1 << field;
 110
 111         switch (bytes) {
 112         case 1:
 113                 *((u8 *) *p) = v;
 114                 break;
 115         case 2:
 116                 *((__le16 *) *p) = cpu_to_le16(v);
 117                 break;
 118         case 4:
 119                 *((__le32 *) *p) = cpu_to_le32(v);
 120                 break;
 121         case 8:
 122                 *((__le64 *) *p) = cpu_to_le64(v);
 123                 break;
 124         default:
 125                 BUG();
 126         }
 127
 128         *p += bytes;
 129 }
 130
 131 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 132 {
 133         unsigned i, bytes = offsetof(struct bch_alloc, data);
 134
 135         for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
 136                 if (a->fields & (1 << i))
 137                         bytes += BCH_ALLOC_FIELD_BYTES[i];
 138
 139         return DIV_ROUND_UP(bytes, sizeof(u64));
 140 }
 141
 142 const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
 143 {
 144         struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 145
 146         if (k.k->p.inode >= c->sb.nr_devices ||
 147             !c->devs[k.k->p.inode])
 148                 return "invalid device";
 149
 150         /* allow for unknown fields */
 151         if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
 152                 return "incorrect value size";
 153
 154         return NULL;
 155 }
 156
 157 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 158                         struct bkey_s_c k)
 159 {
 160         struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 161         const void *d = a.v->data;
 162         unsigned i;
 163
 164         pr_buf(out, "gen %u", a.v->gen);
 165
 166         for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
 167                 if (a.v->fields & (1 << i))
 168                         pr_buf(out, " %s %llu",
 169                                bch2_alloc_field_names[i],
 170                                get_alloc_field(a.v, &d, i));
 171 }
 172
 173 static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
 174 {
 175         const void *d = a->data;
 176         unsigned idx = 0;
 177
 178         g->_mark.gen            = a->gen;
 179         g->gen_valid            = 1;
 180         g->io_time[READ]        = get_alloc_field(a, &d, idx++);
 181         g->io_time[WRITE]       = get_alloc_field(a, &d, idx++);
 182         g->_mark.data_type      = get_alloc_field(a, &d, idx++);
 183         g->_mark.dirty_sectors  = get_alloc_field(a, &d, idx++);
 184         g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
 185 }
 186
 187 static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
 188                               struct bucket_mark m)
 189 {
 190         unsigned idx = 0;
 191         void *d = a->v.data;
 192
 193         a->v.fields     = 0;
 194         a->v.gen        = m.gen;
 195
 196         d = a->v.data;
 197         put_alloc_field(a, &d, idx++, g->io_time[READ]);
 198         put_alloc_field(a, &d, idx++, g->io_time[WRITE]);
 199         put_alloc_field(a, &d, idx++, m.data_type);
 200         put_alloc_field(a, &d, idx++, m.dirty_sectors);
 201         put_alloc_field(a, &d, idx++, m.cached_sectors);
 202
 203         set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
 204 }
 205
 206 static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
 207 {
 208         struct bch_dev *ca;
 209         struct bkey_s_c_alloc a;
 210
 211         if (k.k->type != KEY_TYPE_alloc)
 212                 return;
 213
 214         a = bkey_s_c_to_alloc(k);
 215         ca = bch_dev_bkey_exists(c, a.k->p.inode);
 216
 217         if (a.k->p.offset >= ca->mi.nbuckets)
 218                 return;
 219
 220         percpu_down_read_preempt_disable(&c->mark_lock);
 221         __alloc_read_key(bucket(ca, a.k->p.offset), a.v);
 222         percpu_up_read_preempt_enable(&c->mark_lock);
 223 }
 224
 225 int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
 226 {
 227         struct journal_replay *r;
 228         struct btree_iter iter;
 229         struct bkey_s_c k;
 230         struct bch_dev *ca;
 231         unsigned i;
 232         int ret;
 233
 234         for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
 235                 bch2_alloc_read_key(c, k);
 236                 bch2_btree_iter_cond_resched(&iter);
 237         }
 238
 239         ret = bch2_btree_iter_unlock(&iter);
 240         if (ret)
 241                 return ret;
 242
 243         list_for_each_entry(r, journal_replay_list, list) {
 244                 struct bkey_i *k, *n;
 245                 struct jset_entry *entry;
 246
 247                 for_each_jset_key(k, n, entry, &r->j)
 248                         if (entry->btree_id == BTREE_ID_ALLOC)
 249                                 bch2_alloc_read_key(c, bkey_i_to_s_c(k));
 250         }
 251
 252         for_each_member_device(ca, c, i)
 253                 bch2_dev_usage_from_buckets(c, ca);
 254
 255         mutex_lock(&c->bucket_clock[READ].lock);
 256         for_each_member_device(ca, c, i) {
 257                 down_read(&ca->bucket_lock);
 258                 bch2_recalc_oldest_io(c, ca, READ);
 259                 up_read(&ca->bucket_lock);
 260         }
 261         mutex_unlock(&c->bucket_clock[READ].lock);
 262
 263         mutex_lock(&c->bucket_clock[WRITE].lock);
 264         for_each_member_device(ca, c, i) {
 265                 down_read(&ca->bucket_lock);
 266                 bch2_recalc_oldest_io(c, ca, WRITE);
 267                 up_read(&ca->bucket_lock);
 268         }
 269         mutex_unlock(&c->bucket_clock[WRITE].lock);
 270
 271         return 0;
 272 }
 273
 274 static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 275                                   size_t b, struct btree_iter *iter,
 276                                   u64 *journal_seq, unsigned flags)
 277 {
 278 #if 0
 279         __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
 280 #else
 281         /* hack: */
 282         __BKEY_PADDED(k, 8) alloc_key;
 283 #endif
 284         struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
 285         struct bucket *g;
 286         struct bucket_mark m, new;
 287         int ret;
 288
 289         BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
 290
 291         a->k.p = POS(ca->dev_idx, b);
 292
 293         bch2_btree_iter_set_pos(iter, a->k.p);
 294
 295         ret = bch2_btree_iter_traverse(iter);
 296         if (ret)
 297                 return ret;
 298
 299         percpu_down_read_preempt_disable(&c->mark_lock);
 300         g = bucket(ca, b);
 301         m = READ_ONCE(g->mark);
 302
 303         if (!m.dirty) {
 304                 percpu_up_read_preempt_enable(&c->mark_lock);
 305                 return 0;
 306         }
 307
 308         __alloc_write_key(a, g, m);
 309         percpu_up_read_preempt_enable(&c->mark_lock);
 310
 311         bch2_btree_iter_cond_resched(iter);
 312
 313         ret = bch2_btree_insert_at(c, NULL, journal_seq,
 314                                    BTREE_INSERT_NOCHECK_RW|
 315                                    BTREE_INSERT_NOFAIL|
 316                                    BTREE_INSERT_USE_RESERVE|
 317                                    BTREE_INSERT_USE_ALLOC_RESERVE|
 318                                    flags,
 319                                    BTREE_INSERT_ENTRY(iter, &a->k_i));
 320         if (ret)
 321                 return ret;
 322
 323         new = m;
 324         new.dirty = false;
 325         atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
 326
 327         if (ca->buckets_written)
 328                 set_bit(b, ca->buckets_written);
 329
 330         return 0;
 331 }
 332
 333 int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
 334 {
 335         struct bch_dev *ca;
 336         struct btree_iter iter;
 337         int ret;
 338
 339         if (k->k.p.inode >= c->sb.nr_devices ||
 340             !c->devs[k->k.p.inode])
 341                 return 0;
 342
 343         ca = bch_dev_bkey_exists(c, k->k.p.inode);
 344
 345         if (k->k.p.offset >= ca->mi.nbuckets)
 346                 return 0;
 347
 348         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
 349                              BTREE_ITER_INTENT);
 350
 351         ret = bch2_btree_iter_traverse(&iter);
 352         if (ret)
 353                 goto err;
 354
 355         /* check buckets_written with btree node locked: */
 356
 357         ret = test_bit(k->k.p.offset, ca->buckets_written)
 358                 ? 0
 359                 : bch2_btree_insert_at(c, NULL, NULL,
 360                                        BTREE_INSERT_NOFAIL|
 361                                        BTREE_INSERT_JOURNAL_REPLAY,
 362                                        BTREE_INSERT_ENTRY(&iter, k));
 363 err:
 364         bch2_btree_iter_unlock(&iter);
 365         return ret;
 366 }
 367
 368 int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote)
 369 {
 370         struct bch_dev *ca;
 371         unsigned i;
 372         int ret = 0;
 373
 374         *wrote = false;
 375
 376         for_each_rw_member(ca, c, i) {
 377                 struct btree_iter iter;
 378                 struct bucket_array *buckets;
 379                 size_t b;
 380
 381                 bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
 382                                      BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 383
 384                 down_read(&ca->bucket_lock);
 385                 buckets = bucket_array(ca);
 386
 387                 for (b = buckets->first_bucket;
 388                      b < buckets->nbuckets;
 389                      b++) {
 390                         if (!buckets->b[b].mark.dirty)
 391                                 continue;
 392
 393                         ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL,
 394                                                      nowait
 395                                                      ? BTREE_INSERT_NOWAIT
 396                                                      : 0);
 397                         if (ret)
 398                                 break;
 399
 400                         *wrote = true;
 401                 }
 402                 up_read(&ca->bucket_lock);
 403                 bch2_btree_iter_unlock(&iter);
 404
 405                 if (ret) {
 406                         percpu_ref_put(&ca->io_ref);
 407                         break;
 408                 }
 409         }
 410
 411         return ret;
 412 }
 413
 414 /* Bucket IO clocks: */
 415
 416 static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
 417 {
 418         struct bucket_clock *clock = &c->bucket_clock[rw];
 419         struct bucket_array *buckets = bucket_array(ca);
 420         struct bucket *g;
 421         u16 max_last_io = 0;
 422         unsigned i;
 423
 424         lockdep_assert_held(&c->bucket_clock[rw].lock);
 425
 426         /* Recalculate max_last_io for this device: */
 427         for_each_bucket(g, buckets)
 428                 max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
 429
 430         ca->max_last_bucket_io[rw] = max_last_io;
 431
 432         /* Recalculate global max_last_io: */
 433         max_last_io = 0;
 434
 435         for_each_member_device(ca, c, i)
 436                 max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
 437
 438         clock->max_last_io = max_last_io;
 439 }
 440
 441 static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
 442 {
 443         struct bucket_clock *clock = &c->bucket_clock[rw];
 444         struct bucket_array *buckets;
 445         struct bch_dev *ca;
 446         struct bucket *g;
 447         unsigned i;
 448
 449         trace_rescale_prios(c);
 450
 451         for_each_member_device(ca, c, i) {
 452                 down_read(&ca->bucket_lock);
 453                 buckets = bucket_array(ca);
 454
 455                 for_each_bucket(g, buckets)
 456                         g->io_time[rw] = clock->hand -
 457                         bucket_last_io(c, g, rw) / 2;
 458
 459                 bch2_recalc_oldest_io(c, ca, rw);
 460
 461                 up_read(&ca->bucket_lock);
 462         }
 463 }
 464
 465 static inline u64 bucket_clock_freq(u64 capacity)
 466 {
 467         return max(capacity >> 10, 2028ULL);
 468 }
 469
 470 static void bch2_inc_clock_hand(struct io_timer *timer)
 471 {
 472         struct bucket_clock *clock = container_of(timer,
 473                                                 struct bucket_clock, rescale);
 474         struct bch_fs *c = container_of(clock,
 475                                         struct bch_fs, bucket_clock[clock->rw]);
 476         struct bch_dev *ca;
 477         u64 capacity;
 478         unsigned i;
 479
 480         mutex_lock(&clock->lock);
 481
 482         /* if clock cannot be advanced more, rescale prio */
 483         if (clock->max_last_io >= U16_MAX - 2)
 484                 bch2_rescale_bucket_io_times(c, clock->rw);
 485
 486         BUG_ON(clock->max_last_io >= U16_MAX - 2);
 487
 488         for_each_member_device(ca, c, i)
 489                 ca->max_last_bucket_io[clock->rw]++;
 490         clock->max_last_io++;
 491         clock->hand++;
 492
 493         mutex_unlock(&clock->lock);
 494
 495         capacity = READ_ONCE(c->capacity);
 496
 497         if (!capacity)
 498                 return;
 499
 500         /*
 501          * we only increment when 0.1% of the filesystem capacity has been read
 502          * or written too, this determines if it's time
 503          *
 504          * XXX: we shouldn't really be going off of the capacity of devices in
 505          * RW mode (that will be 0 when we're RO, yet we can still service
 506          * reads)
 507          */
 508         timer->expire += bucket_clock_freq(capacity);
 509
 510         bch2_io_timer_add(&c->io_clock[clock->rw], timer);
 511 }
 512
 513 static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
 514 {
 515         struct bucket_clock *clock = &c->bucket_clock[rw];
 516
 517         clock->hand             = 1;
 518         clock->rw               = rw;
 519         clock->rescale.fn       = bch2_inc_clock_hand;
 520         clock->rescale.expire   = bucket_clock_freq(c->capacity);
 521         mutex_init(&clock->lock);
 522 }
 523
 524 /* Background allocator thread: */
 525
 526 /*
 527  * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
 528  * (marking them as invalidated on disk), then optionally issues discard
 529  * commands to the newly free buckets, then puts them on the various freelists.
 530  */
 531
 532 #define BUCKET_GC_GEN_MAX       96U
 533
 534 /**
 535  * wait_buckets_available - wait on reclaimable buckets
 536  *
 537  * If there aren't enough available buckets to fill up free_inc, wait until
 538  * there are.
 539  */
 540 static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
 541 {
 542         unsigned long gc_count = c->gc_count;
 543         int ret = 0;
 544
 545         while (1) {
 546                 set_current_state(TASK_INTERRUPTIBLE);
 547                 if (kthread_should_stop()) {
 548                         ret = 1;
 549                         break;
 550                 }
 551
 552                 if (gc_count != c->gc_count)
 553                         ca->inc_gen_really_needs_gc = 0;
 554
 555                 if ((ssize_t) (dev_buckets_available(c, ca) -
 556                                ca->inc_gen_really_needs_gc) >=
 557                     (ssize_t) fifo_free(&ca->free_inc))
 558                         break;
 559
 560                 up_read(&c->gc_lock);
 561                 schedule();
 562                 try_to_freeze();
 563                 down_read(&c->gc_lock);
 564         }
 565
 566         __set_current_state(TASK_RUNNING);
 567         return ret;
 568 }
 569
 570 static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
 571                                        size_t bucket,
 572                                        struct bucket_mark mark)
 573 {
 574         u8 gc_gen;
 575
 576         if (!is_available_bucket(mark))
 577                 return false;
 578
 579         if (ca->buckets_nouse &&
 580             test_bit(bucket, ca->buckets_nouse))
 581                 return false;
 582
 583         gc_gen = bucket_gc_gen(ca, bucket);
 584
 585         if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
 586                 ca->inc_gen_needs_gc++;
 587
 588         if (gc_gen >= BUCKET_GC_GEN_MAX)
 589                 ca->inc_gen_really_needs_gc++;
 590
 591         return gc_gen < BUCKET_GC_GEN_MAX;
 592 }
 593
 594 /*
 595  * Determines what order we're going to reuse buckets, smallest bucket_key()
 596  * first.
 597  *
 598  *
 599  * - We take into account the read prio of the bucket, which gives us an
 600  *   indication of how hot the data is -- we scale the prio so that the prio
 601  *   farthest from the clock is worth 1/8th of the closest.
 602  *
 603  * - The number of sectors of cached data in the bucket, which gives us an
 604  *   indication of the cost in cache misses this eviction will cause.
 605  *
 606  * - If hotness * sectors used compares equal, we pick the bucket with the
 607  *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
 608  *   number repeatedly forces us to run mark and sweep gc to avoid generation
 609  *   number wraparound.
 610  */
 611
 612 static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
 613                                      size_t b, struct bucket_mark m)
 614 {
 615         unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
 616         unsigned max_last_io = ca->max_last_bucket_io[READ];
 617
 618         /*
 619          * Time since last read, scaled to [0, 8) where larger value indicates
 620          * more recently read data:
 621          */
 622         unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
 623
 624         /* How much we want to keep the data in this bucket: */
 625         unsigned long data_wantness =
 626                 (hotness + 1) * bucket_sectors_used(m);
 627
 628         unsigned long needs_journal_commit =
 629                 bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
 630
 631         return  (data_wantness << 9) |
 632                 (needs_journal_commit << 8) |
 633                 (bucket_gc_gen(ca, b) / 16);
 634 }
 635
 636 static inline int bucket_alloc_cmp(alloc_heap *h,
 637                                    struct alloc_heap_entry l,
 638                                    struct alloc_heap_entry r)
 639 {
 640         return (l.key > r.key) - (l.key < r.key) ?:
 641                 (l.nr < r.nr)  - (l.nr  > r.nr) ?:
 642                 (l.bucket > r.bucket) - (l.bucket < r.bucket);
 643 }
 644
 645 static inline int bucket_idx_cmp(const void *_l, const void *_r)
 646 {
 647         const struct alloc_heap_entry *l = _l, *r = _r;
 648
 649         return (l->bucket > r->bucket) - (l->bucket < r->bucket);
 650 }
 651
 652 static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 653 {
 654         struct bucket_array *buckets;
 655         struct alloc_heap_entry e = { 0 };
 656         size_t b, i, nr = 0;
 657
 658         ca->alloc_heap.used = 0;
 659
 660         mutex_lock(&c->bucket_clock[READ].lock);
 661         down_read(&ca->bucket_lock);
 662
 663         buckets = bucket_array(ca);
 664
 665         bch2_recalc_oldest_io(c, ca, READ);
 666
 667         /*
 668          * Find buckets with lowest read priority, by building a maxheap sorted
 669          * by read priority and repeatedly replacing the maximum element until
 670          * all buckets have been visited.
 671          */
 672         for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
 673                 struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
 674                 unsigned long key = bucket_sort_key(c, ca, b, m);
 675
 676                 if (!bch2_can_invalidate_bucket(ca, b, m))
 677                         continue;
 678
 679                 if (e.nr && e.bucket + e.nr == b && e.key == key) {
 680                         e.nr++;
 681                 } else {
 682                         if (e.nr)
 683                                 heap_add_or_replace(&ca->alloc_heap, e,
 684                                         -bucket_alloc_cmp, NULL);
 685
 686                         e = (struct alloc_heap_entry) {
 687                                 .bucket = b,
 688                                 .nr     = 1,
 689                                 .key    = key,
 690                         };
 691                 }
 692
 693                 cond_resched();
 694         }
 695
 696         if (e.nr)
 697                 heap_add_or_replace(&ca->alloc_heap, e,
 698                                 -bucket_alloc_cmp, NULL);
 699
 700         for (i = 0; i < ca->alloc_heap.used; i++)
 701                 nr += ca->alloc_heap.data[i].nr;
 702
 703         while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
 704                 nr -= ca->alloc_heap.data[0].nr;
 705                 heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
 706         }
 707
 708         up_read(&ca->bucket_lock);
 709         mutex_unlock(&c->bucket_clock[READ].lock);
 710 }
 711
 712 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
 713 {
 714         struct bucket_array *buckets = bucket_array(ca);
 715         struct bucket_mark m;
 716         size_t b, start;
 717
 718         if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
 719             ca->fifo_last_bucket >= ca->mi.nbuckets)
 720                 ca->fifo_last_bucket = ca->mi.first_bucket;
 721
 722         start = ca->fifo_last_bucket;
 723
 724         do {
 725                 ca->fifo_last_bucket++;
 726                 if (ca->fifo_last_bucket == ca->mi.nbuckets)
 727                         ca->fifo_last_bucket = ca->mi.first_bucket;
 728
 729                 b = ca->fifo_last_bucket;
 730                 m = READ_ONCE(buckets->b[b].mark);
 731
 732                 if (bch2_can_invalidate_bucket(ca, b, m)) {
 733                         struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
 734
 735                         heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
 736                         if (heap_full(&ca->alloc_heap))
 737                                 break;
 738                 }
 739
 740                 cond_resched();
 741         } while (ca->fifo_last_bucket != start);
 742 }
 743
 744 static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
 745 {
 746         struct bucket_array *buckets = bucket_array(ca);
 747         struct bucket_mark m;
 748         size_t checked, i;
 749
 750         for (checked = 0;
 751              checked < ca->mi.nbuckets / 2;
 752              checked++) {
 753                 size_t b = bch2_rand_range(ca->mi.nbuckets -
 754                                            ca->mi.first_bucket) +
 755                         ca->mi.first_bucket;
 756
 757                 m = READ_ONCE(buckets->b[b].mark);
 758
 759                 if (bch2_can_invalidate_bucket(ca, b, m)) {
 760                         struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
 761
 762                         heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
 763                         if (heap_full(&ca->alloc_heap))
 764                                 break;
 765                 }
 766
 767                 cond_resched();
 768         }
 769
 770         sort(ca->alloc_heap.data,
 771              ca->alloc_heap.used,
 772              sizeof(ca->alloc_heap.data[0]),
 773              bucket_idx_cmp, NULL);
 774
 775         /* remove duplicates: */
 776         for (i = 0; i + 1 < ca->alloc_heap.used; i++)
 777                 if (ca->alloc_heap.data[i].bucket ==
 778                     ca->alloc_heap.data[i + 1].bucket)
 779                         ca->alloc_heap.data[i].nr = 0;
 780 }
 781
 782 static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 783 {
 784         size_t i, nr = 0;
 785
 786         ca->inc_gen_needs_gc                    = 0;
 787
 788         switch (ca->mi.replacement) {
 789         case CACHE_REPLACEMENT_LRU:
 790                 find_reclaimable_buckets_lru(c, ca);
 791                 break;
 792         case CACHE_REPLACEMENT_FIFO:
 793                 find_reclaimable_buckets_fifo(c, ca);
 794                 break;
 795         case CACHE_REPLACEMENT_RANDOM:
 796                 find_reclaimable_buckets_random(c, ca);
 797                 break;
 798         }
 799
 800         heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
 801
 802         for (i = 0; i < ca->alloc_heap.used; i++)
 803                 nr += ca->alloc_heap.data[i].nr;
 804
 805         return nr;
 806 }
 807
 808 static inline long next_alloc_bucket(struct bch_dev *ca)
 809 {
 810         struct alloc_heap_entry e, *top = ca->alloc_heap.data;
 811
 812         while (ca->alloc_heap.used) {
 813                 if (top->nr) {
 814                         size_t b = top->bucket;
 815
 816                         top->bucket++;
 817                         top->nr--;
 818                         return b;
 819                 }
 820
 821                 heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
 822         }
 823
 824         return -1;
 825 }
 826
 827 static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
 828                                        size_t bucket, u64 *flush_seq)
 829 {
 830         struct bucket_mark m;
 831
 832         percpu_down_read_preempt_disable(&c->mark_lock);
 833         spin_lock(&c->freelist_lock);
 834
 835         bch2_invalidate_bucket(c, ca, bucket, &m);
 836
 837         verify_not_on_freelist(c, ca, bucket);
 838         BUG_ON(!fifo_push(&ca->free_inc, bucket));
 839
 840         spin_unlock(&c->freelist_lock);
 841
 842         bucket_io_clock_reset(c, ca, bucket, READ);
 843         bucket_io_clock_reset(c, ca, bucket, WRITE);
 844
 845         percpu_up_read_preempt_enable(&c->mark_lock);
 846
 847         if (m.journal_seq_valid) {
 848                 u64 journal_seq = atomic64_read(&c->journal.seq);
 849                 u64 bucket_seq  = journal_seq;
 850
 851                 bucket_seq &= ~((u64) U16_MAX);
 852                 bucket_seq |= m.journal_seq;
 853
 854                 if (bucket_seq > journal_seq)
 855                         bucket_seq -= 1 << 16;
 856
 857                 *flush_seq = max(*flush_seq, bucket_seq);
 858         }
 859
 860         return m.cached_sectors != 0;
 861 }
 862
 863 /*
 864  * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
 865  */
 866 static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
 867 {
 868         struct btree_iter iter;
 869         u64 journal_seq = 0;
 870         int ret = 0;
 871         long b;
 872
 873         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
 874                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 875
 876         /* Only use nowait if we've already invalidated at least one bucket: */
 877         while (!ret &&
 878                !fifo_full(&ca->free_inc) &&
 879                (b = next_alloc_bucket(ca)) >= 0) {
 880                 bool must_flush =
 881                         bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
 882
 883                 ret = __bch2_alloc_write_key(c, ca, b, &iter,
 884                                 must_flush ? &journal_seq : NULL,
 885                                 !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
 886         }
 887
 888         bch2_btree_iter_unlock(&iter);
 889
 890         /* If we used NOWAIT, don't return the error: */
 891         if (!fifo_empty(&ca->free_inc))
 892                 ret = 0;
 893         if (ret) {
 894                 bch_err(ca, "error invalidating buckets: %i", ret);
 895                 return ret;
 896         }
 897
 898         if (journal_seq)
 899                 ret = bch2_journal_flush_seq(&c->journal, journal_seq);
 900         if (ret) {
 901                 bch_err(ca, "journal error: %i", ret);
 902                 return ret;
 903         }
 904
 905         return 0;
 906 }
 907
 908 static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
 909 {
 910         unsigned i;
 911         int ret = 0;
 912
 913         while (1) {
 914                 set_current_state(TASK_INTERRUPTIBLE);
 915
 916                 spin_lock(&c->freelist_lock);
 917                 for (i = 0; i < RESERVE_NR; i++)
 918                         if (fifo_push(&ca->free[i], bucket)) {
 919                                 fifo_pop(&ca->free_inc, bucket);
 920
 921                                 closure_wake_up(&c->freelist_wait);
 922                                 ca->allocator_blocked_full = false;
 923
 924                                 spin_unlock(&c->freelist_lock);
 925                                 goto out;
 926                         }
 927
 928                 if (!ca->allocator_blocked_full) {
 929                         ca->allocator_blocked_full = true;
 930                         closure_wake_up(&c->freelist_wait);
 931                 }
 932
 933                 spin_unlock(&c->freelist_lock);
 934
 935                 if ((current->flags & PF_KTHREAD) &&
 936                     kthread_should_stop()) {
 937                         ret = 1;
 938                         break;
 939                 }
 940
 941                 schedule();
 942                 try_to_freeze();
 943         }
 944 out:
 945         __set_current_state(TASK_RUNNING);
 946         return ret;
 947 }
 948
 949 /*
 950  * Pulls buckets off free_inc, discards them (if enabled), then adds them to
 951  * freelists, waiting until there's room if necessary:
 952  */
 953 static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
 954 {
 955         while (!fifo_empty(&ca->free_inc)) {
 956                 size_t bucket = fifo_peek(&ca->free_inc);
 957
 958                 if (ca->mi.discard &&
 959                     blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
 960                         blkdev_issue_discard(ca->disk_sb.bdev,
 961                                              bucket_to_sector(ca, bucket),
 962                                              ca->mi.bucket_size, GFP_NOIO, 0);
 963
 964                 if (push_invalidated_bucket(c, ca, bucket))
 965                         return 1;
 966         }
 967
 968         return 0;
 969 }
 970
 971 /**
 972  * bch_allocator_thread - move buckets from free_inc to reserves
 973  *
 974  * The free_inc FIFO is populated by find_reclaimable_buckets(), and
 975  * the reserves are depleted by bucket allocation. When we run out
 976  * of free_inc, try to invalidate some buckets and write out
 977  * prios and gens.
 978  */
 979 static int bch2_allocator_thread(void *arg)
 980 {
 981         struct bch_dev *ca = arg;
 982         struct bch_fs *c = ca->fs;
 983         size_t nr;
 984         int ret;
 985
 986         set_freezable();
 987
 988         while (1) {
 989                 cond_resched();
 990
 991                 pr_debug("discarding %zu invalidated buckets",
 992                          fifo_used(&ca->free_inc));
 993
 994                 ret = discard_invalidated_buckets(c, ca);
 995                 if (ret)
 996                         goto stop;
 997
 998                 down_read(&c->gc_lock);
 999
1000                 ret = bch2_invalidate_buckets(c, ca);
1001                 if (ret) {
1002                         up_read(&c->gc_lock);
1003                         goto stop;
1004                 }
1005
1006                 if (!fifo_empty(&ca->free_inc)) {
1007                         up_read(&c->gc_lock);
1008                         continue;
1009                 }
1010
1011                 pr_debug("free_inc now empty");
1012
1013                 do {
1014                         /*
1015                          * Find some buckets that we can invalidate, either
1016                          * they're completely unused, or only contain clean data
1017                          * that's been written back to the backing device or
1018                          * another cache tier
1019                          */
1020
1021                         pr_debug("scanning for reclaimable buckets");
1022
1023                         nr = find_reclaimable_buckets(c, ca);
1024
1025                         pr_debug("found %zu buckets", nr);
1026
1027                         trace_alloc_batch(ca, nr, ca->alloc_heap.size);
1028
1029                         if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
1030                              ca->inc_gen_really_needs_gc) &&
1031                             c->gc_thread) {
1032                                 atomic_inc(&c->kick_gc);
1033                                 wake_up_process(c->gc_thread);
1034                         }
1035
1036                         /*
1037                          * If we found any buckets, we have to invalidate them
1038                          * before we scan for more - but if we didn't find very
1039                          * many we may want to wait on more buckets being
1040                          * available so we don't spin:
1041                          */
1042                         if (!nr ||
1043                             (nr < ALLOC_SCAN_BATCH(ca) &&
1044                              !fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
1045                                 ca->allocator_blocked = true;
1046                                 closure_wake_up(&c->freelist_wait);
1047
1048                                 ret = wait_buckets_available(c, ca);
1049                                 if (ret) {
1050                                         up_read(&c->gc_lock);
1051                                         goto stop;
1052                                 }
1053                         }
1054                 } while (!nr);
1055
1056                 ca->allocator_blocked = false;
1057                 up_read(&c->gc_lock);
1058
1059                 pr_debug("%zu buckets to invalidate", nr);
1060
1061                 /*
1062                  * alloc_heap is now full of newly-invalidated buckets: next,
1063                  * write out the new bucket gens:
1064                  */
1065         }
1066
1067 stop:
1068         pr_debug("alloc thread stopping (ret %i)", ret);
1069         return 0;
1070 }
1071
1072 /* Startup/shutdown (ro/rw): */
1073
1074 void bch2_recalc_capacity(struct bch_fs *c)
1075 {
1076         struct bch_dev *ca;
1077         u64 capacity = 0, reserved_sectors = 0, gc_reserve;
1078         unsigned bucket_size_max = 0;
1079         unsigned long ra_pages = 0;
1080         unsigned i, j;
1081
1082         lockdep_assert_held(&c->state_lock);
1083
1084         for_each_online_member(ca, c, i) {
1085                 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
1086
1087                 ra_pages += bdi->ra_pages;
1088         }
1089
1090         bch2_set_ra_pages(c, ra_pages);
1091
1092         for_each_rw_member(ca, c, i) {
1093                 u64 dev_reserve = 0;
1094
1095                 /*
1096                  * We need to reserve buckets (from the number
1097                  * of currently available buckets) against
1098                  * foreground writes so that mainly copygc can
1099                  * make forward progress.
1100                  *
1101                  * We need enough to refill the various reserves
1102                  * from scratch - copygc will use its entire
1103                  * reserve all at once, then run against when
1104                  * its reserve is refilled (from the formerly
1105                  * available buckets).
1106                  *
1107                  * This reserve is just used when considering if
1108                  * allocations for foreground writes must wait -
1109                  * not -ENOSPC calculations.
1110                  */
1111                 for (j = 0; j < RESERVE_NONE; j++)
1112                         dev_reserve += ca->free[j].size;
1113
1114                 dev_reserve += 1;       /* btree write point */
1115                 dev_reserve += 1;       /* copygc write point */
1116                 dev_reserve += 1;       /* rebalance write point */
1117
1118                 dev_reserve *= ca->mi.bucket_size;
1119
1120                 ca->copygc_threshold = dev_reserve;
1121
1122                 capacity += bucket_to_sector(ca, ca->mi.nbuckets -
1123                                              ca->mi.first_bucket);
1124
1125                 reserved_sectors += dev_reserve * 2;
1126
1127                 bucket_size_max = max_t(unsigned, bucket_size_max,
1128                                         ca->mi.bucket_size);
1129         }
1130
1131         gc_reserve = c->opts.gc_reserve_bytes
1132                 ? c->opts.gc_reserve_bytes >> 9
1133                 : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
1134
1135         reserved_sectors = max(gc_reserve, reserved_sectors);
1136
1137         reserved_sectors = min(reserved_sectors, capacity);
1138
1139         c->capacity = capacity - reserved_sectors;
1140
1141         c->bucket_size_max = bucket_size_max;
1142
1143         if (c->capacity) {
1144                 bch2_io_timer_add(&c->io_clock[READ],
1145                                  &c->bucket_clock[READ].rescale);
1146                 bch2_io_timer_add(&c->io_clock[WRITE],
1147                                  &c->bucket_clock[WRITE].rescale);
1148         } else {
1149                 bch2_io_timer_del(&c->io_clock[READ],
1150                                  &c->bucket_clock[READ].rescale);
1151                 bch2_io_timer_del(&c->io_clock[WRITE],
1152                                  &c->bucket_clock[WRITE].rescale);
1153         }
1154
1155         /* Wake up case someone was waiting for buckets */
1156         closure_wake_up(&c->freelist_wait);
1157 }
1158
1159 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
1160 {
1161         struct open_bucket *ob;
1162         bool ret = false;
1163
1164         for (ob = c->open_buckets;
1165              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1166              ob++) {
1167                 spin_lock(&ob->lock);
1168                 if (ob->valid && !ob->on_partial_list &&
1169                     ob->ptr.dev == ca->dev_idx)
1170                         ret = true;
1171                 spin_unlock(&ob->lock);
1172         }
1173
1174         return ret;
1175 }
1176
1177 /* device goes ro: */
1178 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
1179 {
1180         unsigned i;
1181
1182         BUG_ON(ca->alloc_thread);
1183
1184         /* First, remove device from allocation groups: */
1185
1186         for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
1187                 clear_bit(ca->dev_idx, c->rw_devs[i].d);
1188
1189         /*
1190          * Capacity is calculated based off of devices in allocation groups:
1191          */
1192         bch2_recalc_capacity(c);
1193
1194         /* Next, close write points that point to this device... */
1195         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1196                 bch2_writepoint_stop(c, ca, &c->write_points[i]);
1197
1198         bch2_writepoint_stop(c, ca, &ca->copygc_write_point);
1199         bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
1200         bch2_writepoint_stop(c, ca, &c->btree_write_point);
1201
1202         mutex_lock(&c->btree_reserve_cache_lock);
1203         while (c->btree_reserve_cache_nr) {
1204                 struct btree_alloc *a =
1205                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1206
1207                 bch2_open_buckets_put(c, &a->ob);
1208         }
1209         mutex_unlock(&c->btree_reserve_cache_lock);
1210
1211         while (1) {
1212                 struct open_bucket *ob;
1213
1214                 spin_lock(&c->freelist_lock);
1215                 if (!ca->open_buckets_partial_nr) {
1216                         spin_unlock(&c->freelist_lock);
1217                         break;
1218                 }
1219                 ob = c->open_buckets +
1220                         ca->open_buckets_partial[--ca->open_buckets_partial_nr];
1221                 ob->on_partial_list = false;
1222                 spin_unlock(&c->freelist_lock);
1223
1224                 bch2_open_bucket_put(c, ob);
1225         }
1226
1227         bch2_ec_stop_dev(c, ca);
1228
1229         /*
1230          * Wake up threads that were blocked on allocation, so they can notice
1231          * the device can no longer be removed and the capacity has changed:
1232          */
1233         closure_wake_up(&c->freelist_wait);
1234
1235         /*
1236          * journal_res_get() can block waiting for free space in the journal -
1237          * it needs to notice there may not be devices to allocate from anymore:
1238          */
1239         wake_up(&c->journal.wait);
1240
1241         /* Now wait for any in flight writes: */
1242
1243         closure_wait_event(&c->open_buckets_wait,
1244                            !bch2_dev_has_open_write_point(c, ca));
1245 }
1246
1247 /* device goes rw: */
1248 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
1249 {
1250         unsigned i;
1251
1252         for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
1253                 if (ca->mi.data_allowed & (1 << i))
1254                         set_bit(ca->dev_idx, c->rw_devs[i].d);
1255 }
1256
1257 void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
1258 {
1259         closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
1260 }
1261
1262 /* stop allocator thread: */
1263 void bch2_dev_allocator_stop(struct bch_dev *ca)
1264 {
1265         struct task_struct *p;
1266
1267         p = rcu_dereference_protected(ca->alloc_thread, 1);
1268         ca->alloc_thread = NULL;
1269
1270         /*
1271          * We need an rcu barrier between setting ca->alloc_thread = NULL and
1272          * the thread shutting down to avoid bch2_wake_allocator() racing:
1273          *
1274          * XXX: it would be better to have the rcu barrier be asynchronous
1275          * instead of blocking us here
1276          */
1277         synchronize_rcu();
1278
1279         if (p) {
1280                 kthread_stop(p);
1281                 put_task_struct(p);
1282         }
1283 }
1284
1285 /* start allocator thread: */
1286 int bch2_dev_allocator_start(struct bch_dev *ca)
1287 {
1288         struct task_struct *p;
1289
1290         /*
1291          * allocator thread already started?
1292          */
1293         if (ca->alloc_thread)
1294                 return 0;
1295
1296         p = kthread_create(bch2_allocator_thread, ca,
1297                            "bch_alloc[%s]", ca->name);
1298         if (IS_ERR(p))
1299                 return PTR_ERR(p);
1300
1301         get_task_struct(p);
1302         rcu_assign_pointer(ca->alloc_thread, p);
1303         wake_up_process(p);
1304         return 0;
1305 }
1306
1307 static void flush_held_btree_writes(struct bch_fs *c)
1308 {
1309         struct bucket_table *tbl;
1310         struct rhash_head *pos;
1311         struct btree *b;
1312         bool nodes_blocked;
1313         size_t i;
1314         struct closure cl;
1315
1316         closure_init_stack(&cl);
1317
1318         clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
1319 again:
1320         pr_debug("flushing dirty btree nodes");
1321         cond_resched();
1322         closure_wait(&c->btree_interior_update_wait, &cl);
1323
1324         nodes_blocked = false;
1325
1326         rcu_read_lock();
1327         for_each_cached_btree(b, c, tbl, i, pos)
1328                 if (btree_node_need_write(b)) {
1329                         if (btree_node_may_write(b)) {
1330                                 rcu_read_unlock();
1331                                 btree_node_lock_type(c, b, SIX_LOCK_read);
1332                                 bch2_btree_node_write(c, b, SIX_LOCK_read);
1333                                 six_unlock_read(&b->lock);
1334                                 goto again;
1335                         } else {
1336                                 nodes_blocked = true;
1337                         }
1338                 }
1339         rcu_read_unlock();
1340
1341         if (c->btree_roots_dirty)
1342                 bch2_journal_meta(&c->journal);
1343
1344         if (nodes_blocked) {
1345                 closure_sync(&cl);
1346                 goto again;
1347         }
1348
1349         closure_wake_up(&c->btree_interior_update_wait);
1350         closure_sync(&cl);
1351
1352         closure_wait_event(&c->btree_interior_update_wait,
1353                            !bch2_btree_interior_updates_nr_pending(c));
1354 }
1355
1356 static void allocator_start_issue_discards(struct bch_fs *c)
1357 {
1358         struct bch_dev *ca;
1359         unsigned dev_iter;
1360         size_t bu;
1361
1362         for_each_rw_member(ca, c, dev_iter)
1363                 while (fifo_pop(&ca->free_inc, bu))
1364                         blkdev_issue_discard(ca->disk_sb.bdev,
1365                                              bucket_to_sector(ca, bu),
1366                                              ca->mi.bucket_size, GFP_NOIO, 0);
1367 }
1368
1369 static int resize_free_inc(struct bch_dev *ca)
1370 {
1371         alloc_fifo free_inc;
1372
1373         if (!fifo_full(&ca->free_inc))
1374                 return 0;
1375
1376         if (!init_fifo(&free_inc,
1377                        ca->free_inc.size * 2,
1378                        GFP_KERNEL))
1379                 return -ENOMEM;
1380
1381         fifo_move(&free_inc, &ca->free_inc);
1382         swap(free_inc, ca->free_inc);
1383         free_fifo(&free_inc);
1384         return 0;
1385 }
1386
1387 static int __bch2_fs_allocator_start(struct bch_fs *c)
1388 {
1389         struct bch_dev *ca;
1390         unsigned dev_iter;
1391         u64 journal_seq = 0;
1392         long bu;
1393         int ret = 0;
1394
1395         if (test_alloc_startup(c))
1396                 goto not_enough;
1397
1398         /* Scan for buckets that are already invalidated: */
1399         for_each_rw_member(ca, c, dev_iter) {
1400                 struct bucket_array *buckets;
1401                 struct bucket_mark m;
1402
1403                 down_read(&ca->bucket_lock);
1404                 percpu_down_read_preempt_disable(&c->mark_lock);
1405
1406                 buckets = bucket_array(ca);
1407
1408                 for (bu = buckets->first_bucket;
1409                      bu < buckets->nbuckets; bu++) {
1410                         m = READ_ONCE(buckets->b[bu].mark);
1411
1412                         if (!buckets->b[bu].gen_valid ||
1413                             !test_bit(bu, ca->buckets_nouse) ||
1414                             !is_available_bucket(m) ||
1415                             m.cached_sectors)
1416                                 continue;
1417
1418                         bch2_mark_alloc_bucket(c, ca, bu, true,
1419                                         gc_pos_alloc(c, NULL), 0);
1420
1421                         fifo_push(&ca->free_inc, bu);
1422
1423                         discard_invalidated_buckets(c, ca);
1424
1425                         if (fifo_full(&ca->free[RESERVE_BTREE]))
1426                                 break;
1427                 }
1428                 percpu_up_read_preempt_enable(&c->mark_lock);
1429                 up_read(&ca->bucket_lock);
1430         }
1431
1432         /* did we find enough buckets? */
1433         for_each_rw_member(ca, c, dev_iter)
1434                 if (!fifo_full(&ca->free[RESERVE_BTREE])) {
1435                         percpu_ref_put(&ca->io_ref);
1436                         goto not_enough;
1437                 }
1438
1439         return 0;
1440 not_enough:
1441         pr_debug("not enough empty buckets; scanning for reclaimable buckets");
1442
1443         /*
1444          * We're moving buckets to freelists _before_ they've been marked as
1445          * invalidated on disk - we have to so that we can allocate new btree
1446          * nodes to mark them as invalidated on disk.
1447          *
1448          * However, we can't _write_ to any of these buckets yet - they might
1449          * have cached data in them, which is live until they're marked as
1450          * invalidated on disk:
1451          */
1452         set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
1453
1454         while (1) {
1455                 bool wrote = false;
1456
1457                 for_each_rw_member(ca, c, dev_iter) {
1458                         find_reclaimable_buckets(c, ca);
1459
1460                         while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
1461                                (bu = next_alloc_bucket(ca)) >= 0) {
1462                                 ret = resize_free_inc(ca);
1463                                 if (ret) {
1464                                         percpu_ref_put(&ca->io_ref);
1465                                         return ret;
1466                                 }
1467
1468                                 bch2_invalidate_one_bucket(c, ca, bu,
1469                                                            &journal_seq);
1470
1471                                 fifo_push(&ca->free[RESERVE_BTREE], bu);
1472                                 bucket_set_dirty(ca, bu);
1473                         }
1474                 }
1475
1476                 pr_debug("done scanning for reclaimable buckets");
1477
1478                 /*
1479                  * XXX: it's possible for this to deadlock waiting on journal reclaim,
1480                  * since we're holding btree writes. What then?
1481                  */
1482                 ret = bch2_alloc_write(c, true, &wrote);
1483
1484                 /*
1485                  * If bch2_alloc_write() did anything, it may have used some
1486                  * buckets, and we need the RESERVE_BTREE freelist full - so we
1487                  * need to loop and scan again.
1488                  * And if it errored, it may have been because there weren't
1489                  * enough buckets, so just scan and loop again as long as it
1490                  * made some progress:
1491                  */
1492                 if (!wrote && ret)
1493                         return ret;
1494                 if (!wrote && !ret)
1495                         break;
1496         }
1497
1498         pr_debug("flushing journal");
1499
1500         ret = bch2_journal_flush(&c->journal);
1501         if (ret)
1502                 return ret;
1503
1504         pr_debug("issuing discards");
1505         allocator_start_issue_discards(c);
1506
1507         set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
1508
1509         /* now flush dirty btree nodes: */
1510         flush_held_btree_writes(c);
1511
1512         return 0;
1513 }
1514
1515 int bch2_fs_allocator_start(struct bch_fs *c)
1516 {
1517         struct bch_dev *ca;
1518         unsigned i;
1519         bool wrote;
1520         int ret;
1521
1522         down_read(&c->gc_lock);
1523         ret = __bch2_fs_allocator_start(c);
1524         up_read(&c->gc_lock);
1525
1526         if (ret)
1527                 return ret;
1528
1529         for_each_rw_member(ca, c, i) {
1530                 ret = bch2_dev_allocator_start(ca);
1531                 if (ret) {
1532                         percpu_ref_put(&ca->io_ref);
1533                         return ret;
1534                 }
1535         }
1536
1537         return bch2_alloc_write(c, false, &wrote);
1538 }
1539
1540 void bch2_fs_allocator_background_init(struct bch_fs *c)
1541 {
1542         spin_lock_init(&c->freelist_lock);
1543         bch2_bucket_clock_init(c, READ);
1544         bch2_bucket_clock_init(c, WRITE);
1545
1546         c->pd_controllers_update_seconds = 5;
1547         INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
1548 }