bcachefs-tools-debian: libbcachefs/alloc_background.c
Update bcachefs sources to 99750eab4d bcachefs: Persist stripe blocks_used
1 #include "bcachefs.h"
2 #include "alloc_background.h"
3 #include "alloc_foreground.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_update.h"
7 #include "btree_update_interior.h"
8 #include "btree_gc.h"
9 #include "buckets.h"
10 #include "clock.h"
11 #include "debug.h"
12 #include "ec.h"
13 #include "error.h"
14 #include "journal_io.h"
15
16 #include <linux/kthread.h>
17 #include <linux/math64.h>
18 #include <linux/random.h>
19 #include <linux/rculist.h>
20 #include <linux/rcupdate.h>
21 #include <linux/sched/task.h>
22 #include <linux/sort.h>
23 #include <trace/events/bcachefs.h>
24
25 static const char * const bch2_alloc_field_names[] = {
26 #define x(name, bytes) #name,
27         BCH_ALLOC_FIELDS()
28 #undef x
29         NULL
30 };
31
32 static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
33
34 /* Ratelimiting/PD controllers */
35
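/*
 * Feed each device's copygc PD controller with the current free space and
 * the number of bytes reclaimable by copygc (internal fragmentation), then
 * reschedule ourselves:
 */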
36 static void pd_controllers_update(struct work_struct *work)
37 {
38         struct bch_fs *c = container_of(to_delayed_work(work),
39                                            struct bch_fs,
40                                            pd_controllers_update);
41         struct bch_dev *ca;
42         unsigned i;
43
44         for_each_member_device(ca, c, i) {
45                 struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
46
47                 u64 free = bucket_to_sector(ca,
48                                 __dev_buckets_free(ca, stats)) << 9;
49                 /*
50                  * Bytes of internal fragmentation, which can be
51                  * reclaimed by copy GC
52                  */
53                 s64 fragmented = (bucket_to_sector(ca,
54                                         stats.buckets[BCH_DATA_USER] +
55                                         stats.buckets[BCH_DATA_CACHED]) -
56                                   (stats.sectors[BCH_DATA_USER] +
57                                    stats.sectors[BCH_DATA_CACHED])) << 9;
58
59                 fragmented = max(0LL, fragmented);
60
61                 bch2_pd_controller_update(&ca->copygc_pd,
62                                          free, fragmented, -1);
63         }
64
65         schedule_delayed_work(&c->pd_controllers_update,
66                               c->pd_controllers_update_seconds * HZ);
67 }
68
69 /* Persistent alloc info: */
70
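/*
 * struct bch_alloc values use a variable width encoding: each field in
 * BCH_ALLOC_FIELDS() is stored only if it is nonzero, at the width given by
 * BCH_ALLOC_FIELD_BYTES[], with the set of present fields recorded in the
 * ->fields bitmap. get_alloc_field()/put_alloc_field() walk the fields in
 * declaration order, so readers and writers must agree on that order:
 */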
71 static inline u64 get_alloc_field(const struct bch_alloc *a,
72                                   const void **p, unsigned field)
73 {
74         unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
75         u64 v;
76
77         if (!(a->fields & (1 << field)))
78                 return 0;
79
80         switch (bytes) {
81         case 1:
82                 v = *((const u8 *) *p);
83                 break;
84         case 2:
85                 v = le16_to_cpup(*p);
86                 break;
87         case 4:
88                 v = le32_to_cpup(*p);
89                 break;
90         case 8:
91                 v = le64_to_cpup(*p);
92                 break;
93         default:
94                 BUG();
95         }
96
97         *p += bytes;
98         return v;
99 }
100
101 static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
102                                    unsigned field, u64 v)
103 {
104         unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
105
106         if (!v)
107                 return;
108
109         a->v.fields |= 1 << field;
110
111         switch (bytes) {
112         case 1:
113                 *((u8 *) *p) = v;
114                 break;
115         case 2:
116                 *((__le16 *) *p) = cpu_to_le16(v);
117                 break;
118         case 4:
119                 *((__le32 *) *p) = cpu_to_le32(v);
120                 break;
121         case 8:
122                 *((__le64 *) *p) = cpu_to_le64(v);
123                 break;
124         default:
125                 BUG();
126         }
127
128         *p += bytes;
129 }
130
131 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
132 {
133         unsigned i, bytes = offsetof(struct bch_alloc, data);
134
135         for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
136                 if (a->fields & (1 << i))
137                         bytes += BCH_ALLOC_FIELD_BYTES[i];
138
139         return DIV_ROUND_UP(bytes, sizeof(u64));
140 }
141
142 const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
143 {
144         struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
145
146         if (k.k->p.inode >= c->sb.nr_devices ||
147             !c->devs[k.k->p.inode])
148                 return "invalid device";
149
150         /* allow for unknown fields */
151         if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
152                 return "incorrect value size";
153
154         return NULL;
155 }
156
157 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
158                         struct bkey_s_c k)
159 {
160         struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
161         const void *d = a.v->data;
162         unsigned i;
163
164         pr_buf(out, "gen %u", a.v->gen);
165
166         for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
167                 if (a.v->fields & (1 << i))
168                         pr_buf(out, " %s %llu",
169                                bch2_alloc_field_names[i],
170                                get_alloc_field(a.v, &d, i));
171 }
172
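/*
 * Convert between the packed on disk representation and the in memory
 * struct bucket/bucket_mark; the field order here must match
 * BCH_ALLOC_FIELDS():
 */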
173 static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
174 {
175         const void *d = a->data;
176         unsigned idx = 0;
177
178         g->_mark.gen            = a->gen;
179         g->gen_valid            = 1;
180         g->io_time[READ]        = get_alloc_field(a, &d, idx++);
181         g->io_time[WRITE]       = get_alloc_field(a, &d, idx++);
182         g->_mark.data_type      = get_alloc_field(a, &d, idx++);
183         g->_mark.dirty_sectors  = get_alloc_field(a, &d, idx++);
184         g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
185 }
186
187 static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
188                               struct bucket_mark m)
189 {
190         unsigned idx = 0;
191         void *d = a->v.data;
192
193         a->v.fields     = 0;
194         a->v.gen        = m.gen;
195
196         d = a->v.data;
197         put_alloc_field(a, &d, idx++, g->io_time[READ]);
198         put_alloc_field(a, &d, idx++, g->io_time[WRITE]);
199         put_alloc_field(a, &d, idx++, m.data_type);
200         put_alloc_field(a, &d, idx++, m.dirty_sectors);
201         put_alloc_field(a, &d, idx++, m.cached_sectors);
202
203         set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
204 }
205
206 static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
207 {
208         struct bch_dev *ca;
209         struct bkey_s_c_alloc a;
210
211         if (k.k->type != KEY_TYPE_alloc)
212                 return;
213
214         a = bkey_s_c_to_alloc(k);
215         ca = bch_dev_bkey_exists(c, a.k->p.inode);
216
217         if (a.k->p.offset >= ca->mi.nbuckets)
218                 return;
219
220         percpu_down_read_preempt_disable(&c->mark_lock);
221         __alloc_read_key(bucket(ca, a.k->p.offset), a.v);
222         percpu_up_read_preempt_enable(&c->mark_lock);
223 }
224
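/*
 * At startup, read the alloc btree (and any alloc keys still sitting in the
 * journal) into the in memory bucket arrays, then recompute device usage and
 * the oldest bucket IO times for the read and write clocks:
 */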
225 int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
226 {
227         struct journal_replay *r;
228         struct btree_iter iter;
229         struct bkey_s_c k;
230         struct bch_dev *ca;
231         unsigned i;
232         int ret;
233
234         for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
235                 bch2_alloc_read_key(c, k);
236                 bch2_btree_iter_cond_resched(&iter);
237         }
238
239         ret = bch2_btree_iter_unlock(&iter);
240         if (ret)
241                 return ret;
242
243         list_for_each_entry(r, journal_replay_list, list) {
244                 struct bkey_i *k, *n;
245                 struct jset_entry *entry;
246
247                 for_each_jset_key(k, n, entry, &r->j)
248                         if (entry->btree_id == BTREE_ID_ALLOC)
249                                 bch2_alloc_read_key(c, bkey_i_to_s_c(k));
250         }
251
252         for_each_member_device(ca, c, i)
253                 bch2_dev_usage_from_buckets(c, ca);
254
255         mutex_lock(&c->bucket_clock[READ].lock);
256         for_each_member_device(ca, c, i) {
257                 down_read(&ca->bucket_lock);
258                 bch2_recalc_oldest_io(c, ca, READ);
259                 up_read(&ca->bucket_lock);
260         }
261         mutex_unlock(&c->bucket_clock[READ].lock);
262
263         mutex_lock(&c->bucket_clock[WRITE].lock);
264         for_each_member_device(ca, c, i) {
265                 down_read(&ca->bucket_lock);
266                 bch2_recalc_oldest_io(c, ca, WRITE);
267                 up_read(&ca->bucket_lock);
268         }
269         mutex_unlock(&c->bucket_clock[WRITE].lock);
270
271         return 0;
272 }
273
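/*
 * Write a single bucket's in memory mark out as an alloc key, then clear the
 * bucket's dirty flag (provided the mark hasn't changed under us) and record
 * the bucket in ca->buckets_written:
 */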
274 static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
275                                   size_t b, struct btree_iter *iter,
276                                   u64 *journal_seq, unsigned flags)
277 {
278 #if 0
279         __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
280 #else
281         /* hack: */
282         __BKEY_PADDED(k, 8) alloc_key;
283 #endif
284         struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
285         struct bucket *g;
286         struct bucket_mark m, new;
287         int ret;
288
289         BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
290
291         a->k.p = POS(ca->dev_idx, b);
292
293         bch2_btree_iter_set_pos(iter, a->k.p);
294
295         ret = bch2_btree_iter_traverse(iter);
296         if (ret)
297                 return ret;
298
299         percpu_down_read_preempt_disable(&c->mark_lock);
300         g = bucket(ca, b);
301         m = READ_ONCE(g->mark);
302
303         if (!m.dirty) {
304                 percpu_up_read_preempt_enable(&c->mark_lock);
305                 return 0;
306         }
307
308         __alloc_write_key(a, g, m);
309         percpu_up_read_preempt_enable(&c->mark_lock);
310
311         bch2_btree_iter_cond_resched(iter);
312
313         ret = bch2_btree_insert_at(c, NULL, journal_seq,
314                                    BTREE_INSERT_NOCHECK_RW|
315                                    BTREE_INSERT_NOFAIL|
316                                    BTREE_INSERT_USE_RESERVE|
317                                    BTREE_INSERT_USE_ALLOC_RESERVE|
318                                    flags,
319                                    BTREE_INSERT_ENTRY(iter, &a->k_i));
320         if (ret)
321                 return ret;
322
323         new = m;
324         new.dirty = false;
325         atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
326
327         if (ca->buckets_written)
328                 set_bit(b, ca->buckets_written);
329
330         return 0;
331 }
332
333 int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
334 {
335         struct bch_dev *ca;
336         struct btree_iter iter;
337         int ret;
338
339         if (k->k.p.inode >= c->sb.nr_devices ||
340             !c->devs[k->k.p.inode])
341                 return 0;
342
343         ca = bch_dev_bkey_exists(c, k->k.p.inode);
344
345         if (k->k.p.offset >= ca->mi.nbuckets)
346                 return 0;
347
348         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
349                              BTREE_ITER_INTENT);
350
351         ret = bch2_btree_iter_traverse(&iter);
352         if (ret)
353                 goto err;
354
355         /* check buckets_written with btree node locked: */
356
357         ret = test_bit(k->k.p.offset, ca->buckets_written)
358                 ? 0
359                 : bch2_btree_insert_at(c, NULL, NULL,
360                                        BTREE_INSERT_NOFAIL|
361                                        BTREE_INSERT_JOURNAL_REPLAY,
362                                        BTREE_INSERT_ENTRY(&iter, k));
363 err:
364         bch2_btree_iter_unlock(&iter);
365         return ret;
366 }
367
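/*
 * Walk every bucket on every rw member device and write out the buckets
 * whose in memory marks are dirty; *wrote reports whether anything was
 * written:
 */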
368 int bch2_alloc_write(struct bch_fs *c, bool nowait, bool *wrote)
369 {
370         struct bch_dev *ca;
371         unsigned i;
372         int ret = 0;
373
374         *wrote = false;
375
376         for_each_rw_member(ca, c, i) {
377                 struct btree_iter iter;
378                 struct bucket_array *buckets;
379                 size_t b;
380
381                 bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
382                                      BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
383
384                 down_read(&ca->bucket_lock);
385                 buckets = bucket_array(ca);
386
387                 for (b = buckets->first_bucket;
388                      b < buckets->nbuckets;
389                      b++) {
390                         if (!buckets->b[b].mark.dirty)
391                                 continue;
392
393                         ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL,
394                                                      nowait
395                                                      ? BTREE_INSERT_NOWAIT
396                                                      : 0);
397                         if (ret)
398                                 break;
399
400                         *wrote = true;
401                 }
402                 up_read(&ca->bucket_lock);
403                 bch2_btree_iter_unlock(&iter);
404
405                 if (ret) {
406                         percpu_ref_put(&ca->io_ref);
407                         break;
408                 }
409         }
410
411         return ret;
412 }
413
414 /* Bucket IO clocks: */
415
416 static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
417 {
418         struct bucket_clock *clock = &c->bucket_clock[rw];
419         struct bucket_array *buckets = bucket_array(ca);
420         struct bucket *g;
421         u16 max_last_io = 0;
422         unsigned i;
423
424         lockdep_assert_held(&c->bucket_clock[rw].lock);
425
426         /* Recalculate max_last_io for this device: */
427         for_each_bucket(g, buckets)
428                 max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
429
430         ca->max_last_bucket_io[rw] = max_last_io;
431
432         /* Recalculate global max_last_io: */
433         max_last_io = 0;
434
435         for_each_member_device(ca, c, i)
436                 max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
437
438         clock->max_last_io = max_last_io;
439 }
440
441 static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
442 {
443         struct bucket_clock *clock = &c->bucket_clock[rw];
444         struct bucket_array *buckets;
445         struct bch_dev *ca;
446         struct bucket *g;
447         unsigned i;
448
449         trace_rescale_prios(c);
450
451         for_each_member_device(ca, c, i) {
452                 down_read(&ca->bucket_lock);
453                 buckets = bucket_array(ca);
454
455                 for_each_bucket(g, buckets)
456                         g->io_time[rw] = clock->hand -
457                         bucket_last_io(c, g, rw) / 2;
458
459                 bch2_recalc_oldest_io(c, ca, rw);
460
461                 up_read(&ca->bucket_lock);
462         }
463 }
464
465 static inline u64 bucket_clock_freq(u64 capacity)
466 {
467         return max(capacity >> 10, 2028ULL);
468 }
469
470 static void bch2_inc_clock_hand(struct io_timer *timer)
471 {
472         struct bucket_clock *clock = container_of(timer,
473                                                 struct bucket_clock, rescale);
474         struct bch_fs *c = container_of(clock,
475                                         struct bch_fs, bucket_clock[clock->rw]);
476         struct bch_dev *ca;
477         u64 capacity;
478         unsigned i;
479
480         mutex_lock(&clock->lock);
481
482         /* if clock cannot be advanced more, rescale prio */
483         if (clock->max_last_io >= U16_MAX - 2)
484                 bch2_rescale_bucket_io_times(c, clock->rw);
485
486         BUG_ON(clock->max_last_io >= U16_MAX - 2);
487
488         for_each_member_device(ca, c, i)
489                 ca->max_last_bucket_io[clock->rw]++;
490         clock->max_last_io++;
491         clock->hand++;
492
493         mutex_unlock(&clock->lock);
494
495         capacity = READ_ONCE(c->capacity);
496
497         if (!capacity)
498                 return;
499
500         /*
501          * we only increment when 0.1% of the filesystem capacity has been read
502          * or written to; this determines how often the clock hand is incremented
503          *
504          * XXX: we shouldn't really be going off of the capacity of devices in
505          * RW mode (that will be 0 when we're RO, yet we can still service
506          * reads)
507          */
508         timer->expire += bucket_clock_freq(capacity);
509
510         bch2_io_timer_add(&c->io_clock[clock->rw], timer);
511 }
512
513 static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
514 {
515         struct bucket_clock *clock = &c->bucket_clock[rw];
516
517         clock->hand             = 1;
518         clock->rw               = rw;
519         clock->rescale.fn       = bch2_inc_clock_hand;
520         clock->rescale.expire   = bucket_clock_freq(c->capacity);
521         mutex_init(&clock->lock);
522 }
523
524 /* Background allocator thread: */
525
526 /*
527  * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
528  * (marking them as invalidated on disk), then optionally issues discard
529  * commands to the newly free buckets, then puts them on the various freelists.
530  */
531
532 #define BUCKET_GC_GEN_MAX       96U
533
534 /**
535  * wait_buckets_available - wait on reclaimable buckets
536  *
537  * If there aren't enough available buckets to fill up free_inc, wait until
538  * there are.
539  */
540 static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
541 {
542         unsigned long gc_count = c->gc_count;
543         int ret = 0;
544
545         while (1) {
546                 set_current_state(TASK_INTERRUPTIBLE);
547                 if (kthread_should_stop()) {
548                         ret = 1;
549                         break;
550                 }
551
552                 if (gc_count != c->gc_count)
553                         ca->inc_gen_really_needs_gc = 0;
554
555                 if ((ssize_t) (dev_buckets_available(c, ca) -
556                                ca->inc_gen_really_needs_gc) >=
557                     (ssize_t) fifo_free(&ca->free_inc))
558                         break;
559
560                 up_read(&c->gc_lock);
561                 schedule();
562                 try_to_freeze();
563                 down_read(&c->gc_lock);
564         }
565
566         __set_current_state(TASK_RUNNING);
567         return ret;
568 }
569
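/*
 * A bucket may be invalidated if it holds no dirty data and isn't marked
 * nouse; as a side effect, count how many buckets are approaching the gc gen
 * limit so the allocator knows when to kick gc:
 */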
570 static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
571                                        size_t bucket,
572                                        struct bucket_mark mark)
573 {
574         u8 gc_gen;
575
576         if (!is_available_bucket(mark))
577                 return false;
578
579         if (ca->buckets_nouse &&
580             test_bit(bucket, ca->buckets_nouse))
581                 return false;
582
583         gc_gen = bucket_gc_gen(ca, bucket);
584
585         if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
586                 ca->inc_gen_needs_gc++;
587
588         if (gc_gen >= BUCKET_GC_GEN_MAX)
589                 ca->inc_gen_really_needs_gc++;
590
591         return gc_gen < BUCKET_GC_GEN_MAX;
592 }
593
594 /*
595  * Determines what order we're going to reuse buckets, smallest
596  * bucket_sort_key() first.
597  *
598  *
599  * - We take into account the read prio of the bucket, which gives us an
600  *   indication of how hot the data is -- we scale the prio so that the prio
601  *   farthest from the clock is worth 1/8th of the closest.
602  *
603  * - The number of sectors of cached data in the bucket, which gives us an
604  *   indication of the cost in cache misses this eviction will cause.
605  *
606  * - If hotness * sectors used compares equal, we pick the bucket with the
607  *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
608  *   number repeatedly forces us to run mark and sweep gc to avoid generation
609  *   number wraparound.
610  */
611
612 static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
613                                      size_t b, struct bucket_mark m)
614 {
615         unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
616         unsigned max_last_io = ca->max_last_bucket_io[READ];
617
618         /*
619          * Time since last read, scaled to [0, 8) where larger value indicates
620          * more recently read data:
621          */
622         unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
623
624         /* How much we want to keep the data in this bucket: */
625         unsigned long data_wantness =
626                 (hotness + 1) * bucket_sectors_used(m);
627
628         unsigned long needs_journal_commit =
629                 bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
630
631         return  (data_wantness << 9) |
632                 (needs_journal_commit << 8) |
633                 (bucket_gc_gen(ca, b) / 16);
634 }
635
636 static inline int bucket_alloc_cmp(alloc_heap *h,
637                                    struct alloc_heap_entry l,
638                                    struct alloc_heap_entry r)
639 {
640         return (l.key > r.key) - (l.key < r.key) ?:
641                 (l.nr < r.nr)  - (l.nr  > r.nr) ?:
642                 (l.bucket > r.bucket) - (l.bucket < r.bucket);
643 }
644
645 static inline int bucket_idx_cmp(const void *_l, const void *_r)
646 {
647         const struct alloc_heap_entry *l = _l, *r = _r;
648
649         return (l->bucket > r->bucket) - (l->bucket < r->bucket);
650 }
651
652 static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
653 {
654         struct bucket_array *buckets;
655         struct alloc_heap_entry e = { 0 };
656         size_t b, i, nr = 0;
657
658         ca->alloc_heap.used = 0;
659
660         mutex_lock(&c->bucket_clock[READ].lock);
661         down_read(&ca->bucket_lock);
662
663         buckets = bucket_array(ca);
664
665         bch2_recalc_oldest_io(c, ca, READ);
666
667         /*
668          * Find buckets with lowest read priority, by building a maxheap sorted
669          * by read priority and repeatedly replacing the maximum element until
670          * all buckets have been visited.
671          */
672         for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
673                 struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
674                 unsigned long key = bucket_sort_key(c, ca, b, m);
675
676                 if (!bch2_can_invalidate_bucket(ca, b, m))
677                         continue;
678
679                 if (e.nr && e.bucket + e.nr == b && e.key == key) {
680                         e.nr++;
681                 } else {
682                         if (e.nr)
683                                 heap_add_or_replace(&ca->alloc_heap, e,
684                                         -bucket_alloc_cmp, NULL);
685
686                         e = (struct alloc_heap_entry) {
687                                 .bucket = b,
688                                 .nr     = 1,
689                                 .key    = key,
690                         };
691                 }
692
693                 cond_resched();
694         }
695
696         if (e.nr)
697                 heap_add_or_replace(&ca->alloc_heap, e,
698                                 -bucket_alloc_cmp, NULL);
699
700         for (i = 0; i < ca->alloc_heap.used; i++)
701                 nr += ca->alloc_heap.data[i].nr;
702
703         while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
704                 nr -= ca->alloc_heap.data[0].nr;
705                 heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
706         }
707
708         up_read(&ca->bucket_lock);
709         mutex_unlock(&c->bucket_clock[READ].lock);
710 }
711
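/* FIFO replacement: walk the buckets in order, resuming where we left off: */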
712 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
713 {
714         struct bucket_array *buckets = bucket_array(ca);
715         struct bucket_mark m;
716         size_t b, start;
717
718         if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
719             ca->fifo_last_bucket >= ca->mi.nbuckets)
720                 ca->fifo_last_bucket = ca->mi.first_bucket;
721
722         start = ca->fifo_last_bucket;
723
724         do {
725                 ca->fifo_last_bucket++;
726                 if (ca->fifo_last_bucket == ca->mi.nbuckets)
727                         ca->fifo_last_bucket = ca->mi.first_bucket;
728
729                 b = ca->fifo_last_bucket;
730                 m = READ_ONCE(buckets->b[b].mark);
731
732                 if (bch2_can_invalidate_bucket(ca, b, m)) {
733                         struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
734
735                         heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
736                         if (heap_full(&ca->alloc_heap))
737                                 break;
738                 }
739
740                 cond_resched();
741         } while (ca->fifo_last_bucket != start);
742 }
743
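/*
 * Random replacement: sample up to half the device's buckets at random, then
 * sort by bucket index and drop duplicates:
 */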
744 static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
745 {
746         struct bucket_array *buckets = bucket_array(ca);
747         struct bucket_mark m;
748         size_t checked, i;
749
750         for (checked = 0;
751              checked < ca->mi.nbuckets / 2;
752              checked++) {
753                 size_t b = bch2_rand_range(ca->mi.nbuckets -
754                                            ca->mi.first_bucket) +
755                         ca->mi.first_bucket;
756
757                 m = READ_ONCE(buckets->b[b].mark);
758
759                 if (bch2_can_invalidate_bucket(ca, b, m)) {
760                         struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
761
762                         heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
763                         if (heap_full(&ca->alloc_heap))
764                                 break;
765                 }
766
767                 cond_resched();
768         }
769
770         sort(ca->alloc_heap.data,
771              ca->alloc_heap.used,
772              sizeof(ca->alloc_heap.data[0]),
773              bucket_idx_cmp, NULL);
774
775         /* remove duplicates: */
776         for (i = 0; i + 1 < ca->alloc_heap.used; i++)
777                 if (ca->alloc_heap.data[i].bucket ==
778                     ca->alloc_heap.data[i + 1].bucket)
779                         ca->alloc_heap.data[i].nr = 0;
780 }
781
782 static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
783 {
784         size_t i, nr = 0;
785
786         ca->inc_gen_needs_gc                    = 0;
787
788         switch (ca->mi.replacement) {
789         case CACHE_REPLACEMENT_LRU:
790                 find_reclaimable_buckets_lru(c, ca);
791                 break;
792         case CACHE_REPLACEMENT_FIFO:
793                 find_reclaimable_buckets_fifo(c, ca);
794                 break;
795         case CACHE_REPLACEMENT_RANDOM:
796                 find_reclaimable_buckets_random(c, ca);
797                 break;
798         }
799
800         heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
801
802         for (i = 0; i < ca->alloc_heap.used; i++)
803                 nr += ca->alloc_heap.data[i].nr;
804
805         return nr;
806 }
807
808 static inline long next_alloc_bucket(struct bch_dev *ca)
809 {
810         struct alloc_heap_entry e, *top = ca->alloc_heap.data;
811
812         while (ca->alloc_heap.used) {
813                 if (top->nr) {
814                         size_t b = top->bucket;
815
816                         top->bucket++;
817                         top->nr--;
818                         return b;
819                 }
820
821                 heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
822         }
823
824         return -1;
825 }
826
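/*
 * Invalidate a single bucket and push it onto free_inc. If the bucket's mark
 * still carries a journal sequence number, reconstruct the full 64 bit
 * sequence number from the low 16 bits stored in the mark, so the caller
 * knows how far the journal must be flushed before the bucket can be reused.
 * Returns true if the bucket contained cached data:
 */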
827 static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
828                                        size_t bucket, u64 *flush_seq)
829 {
830         struct bucket_mark m;
831
832         percpu_down_read_preempt_disable(&c->mark_lock);
833         spin_lock(&c->freelist_lock);
834
835         bch2_invalidate_bucket(c, ca, bucket, &m);
836
837         verify_not_on_freelist(c, ca, bucket);
838         BUG_ON(!fifo_push(&ca->free_inc, bucket));
839
840         spin_unlock(&c->freelist_lock);
841
842         bucket_io_clock_reset(c, ca, bucket, READ);
843         bucket_io_clock_reset(c, ca, bucket, WRITE);
844
845         percpu_up_read_preempt_enable(&c->mark_lock);
846
847         if (m.journal_seq_valid) {
848                 u64 journal_seq = atomic64_read(&c->journal.seq);
849                 u64 bucket_seq  = journal_seq;
850
851                 bucket_seq &= ~((u64) U16_MAX);
852                 bucket_seq |= m.journal_seq;
853
854                 if (bucket_seq > journal_seq)
855                         bucket_seq -= 1 << 16;
856
857                 *flush_seq = max(*flush_seq, bucket_seq);
858         }
859
860         return m.cached_sectors != 0;
861 }
862
863 /*
864  * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
865  */
866 static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
867 {
868         struct btree_iter iter;
869         u64 journal_seq = 0;
870         int ret = 0;
871         long b;
872
873         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
874                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
875
876         /* Only use nowait if we've already invalidated at least one bucket: */
877         while (!ret &&
878                !fifo_full(&ca->free_inc) &&
879                (b = next_alloc_bucket(ca)) >= 0) {
880                 bool must_flush =
881                         bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
882
883                 ret = __bch2_alloc_write_key(c, ca, b, &iter,
884                                 must_flush ? &journal_seq : NULL,
885                                 !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
886         }
887
888         bch2_btree_iter_unlock(&iter);
889
890         /* If we used NOWAIT, don't return the error: */
891         if (!fifo_empty(&ca->free_inc))
892                 ret = 0;
893         if (ret) {
894                 bch_err(ca, "error invalidating buckets: %i", ret);
895                 return ret;
896         }
897
898         if (journal_seq)
899                 ret = bch2_journal_flush_seq(&c->journal, journal_seq);
900         if (ret) {
901                 bch_err(ca, "journal error: %i", ret);
902                 return ret;
903         }
904
905         return 0;
906 }
907
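/*
 * Move one bucket from free_inc to the first freelist with room, sleeping
 * until space opens up; allocator_blocked_full tells
 * bch2_dev_allocator_quiesce() that every freelist is currently full:
 */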
908 static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
909 {
910         unsigned i;
911         int ret = 0;
912
913         while (1) {
914                 set_current_state(TASK_INTERRUPTIBLE);
915
916                 spin_lock(&c->freelist_lock);
917                 for (i = 0; i < RESERVE_NR; i++)
918                         if (fifo_push(&ca->free[i], bucket)) {
919                                 fifo_pop(&ca->free_inc, bucket);
920
921                                 closure_wake_up(&c->freelist_wait);
922                                 ca->allocator_blocked_full = false;
923
924                                 spin_unlock(&c->freelist_lock);
925                                 goto out;
926                         }
927
928                 if (!ca->allocator_blocked_full) {
929                         ca->allocator_blocked_full = true;
930                         closure_wake_up(&c->freelist_wait);
931                 }
932
933                 spin_unlock(&c->freelist_lock);
934
935                 if ((current->flags & PF_KTHREAD) &&
936                     kthread_should_stop()) {
937                         ret = 1;
938                         break;
939                 }
940
941                 schedule();
942                 try_to_freeze();
943         }
944 out:
945         __set_current_state(TASK_RUNNING);
946         return ret;
947 }
948
949 /*
950  * Pulls buckets off free_inc, discards them (if enabled), then adds them to
951  * freelists, waiting until there's room if necessary:
952  */
953 static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
954 {
955         while (!fifo_empty(&ca->free_inc)) {
956                 size_t bucket = fifo_peek(&ca->free_inc);
957
958                 if (ca->mi.discard &&
959                     blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
960                         blkdev_issue_discard(ca->disk_sb.bdev,
961                                              bucket_to_sector(ca, bucket),
962                                              ca->mi.bucket_size, GFP_NOIO, 0);
963
964                 if (push_invalidated_bucket(c, ca, bucket))
965                         return 1;
966         }
967
968         return 0;
969 }
970
971 /**
972  * bch2_allocator_thread - move buckets from free_inc to reserves
973  *
974  * The free_inc FIFO is populated by find_reclaimable_buckets(), and
975  * the reserves are depleted by bucket allocation. When we run out
976  * of free_inc, try to invalidate some buckets and write out
977  * prios and gens.
978  */
979 static int bch2_allocator_thread(void *arg)
980 {
981         struct bch_dev *ca = arg;
982         struct bch_fs *c = ca->fs;
983         size_t nr;
984         int ret;
985
986         set_freezable();
987
988         while (1) {
989                 cond_resched();
990
991                 pr_debug("discarding %zu invalidated buckets",
992                          fifo_used(&ca->free_inc));
993
994                 ret = discard_invalidated_buckets(c, ca);
995                 if (ret)
996                         goto stop;
997
998                 down_read(&c->gc_lock);
999
1000                 ret = bch2_invalidate_buckets(c, ca);
1001                 if (ret) {
1002                         up_read(&c->gc_lock);
1003                         goto stop;
1004                 }
1005
1006                 if (!fifo_empty(&ca->free_inc)) {
1007                         up_read(&c->gc_lock);
1008                         continue;
1009                 }
1010
1011                 pr_debug("free_inc now empty");
1012
1013                 do {
1014                         /*
1015                          * Find some buckets that we can invalidate, either
1016                          * they're completely unused, or only contain clean data
1017                          * that's been written back to the backing device or
1018                          * another cache tier
1019                          */
1020
1021                         pr_debug("scanning for reclaimable buckets");
1022
1023                         nr = find_reclaimable_buckets(c, ca);
1024
1025                         pr_debug("found %zu buckets", nr);
1026
1027                         trace_alloc_batch(ca, nr, ca->alloc_heap.size);
1028
1029                         if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
1030                              ca->inc_gen_really_needs_gc) &&
1031                             c->gc_thread) {
1032                                 atomic_inc(&c->kick_gc);
1033                                 wake_up_process(c->gc_thread);
1034                         }
1035
1036                         /*
1037                          * If we found any buckets, we have to invalidate them
1038                          * before we scan for more - but if we didn't find very
1039                          * many we may want to wait on more buckets being
1040                          * available so we don't spin:
1041                          */
1042                         if (!nr ||
1043                             (nr < ALLOC_SCAN_BATCH(ca) &&
1044                              !fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
1045                                 ca->allocator_blocked = true;
1046                                 closure_wake_up(&c->freelist_wait);
1047
1048                                 ret = wait_buckets_available(c, ca);
1049                                 if (ret) {
1050                                         up_read(&c->gc_lock);
1051                                         goto stop;
1052                                 }
1053                         }
1054                 } while (!nr);
1055
1056                 ca->allocator_blocked = false;
1057                 up_read(&c->gc_lock);
1058
1059                 pr_debug("%zu buckets to invalidate", nr);
1060
1061                 /*
1062                  * alloc_heap is now full of newly-invalidated buckets: next,
1063                  * write out the new bucket gens:
1064                  */
1065         }
1066
1067 stop:
1068         pr_debug("alloc thread stopping (ret %i)", ret);
1069         return 0;
1070 }
1071
1072 /* Startup/shutdown (ro/rw): */
1073
1074 void bch2_recalc_capacity(struct bch_fs *c)
1075 {
1076         struct bch_dev *ca;
1077         u64 capacity = 0, reserved_sectors = 0, gc_reserve;
1078         unsigned bucket_size_max = 0;
1079         unsigned long ra_pages = 0;
1080         unsigned i, j;
1081
1082         lockdep_assert_held(&c->state_lock);
1083
1084         for_each_online_member(ca, c, i) {
1085                 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
1086
1087                 ra_pages += bdi->ra_pages;
1088         }
1089
1090         bch2_set_ra_pages(c, ra_pages);
1091
1092         for_each_rw_member(ca, c, i) {
1093                 u64 dev_reserve = 0;
1094
1095                 /*
1096                  * We need to reserve buckets (from the number
1097                  * of currently available buckets) against
1098                  * foreground writes so that mainly copygc can
1099                  * make forward progress.
1100                  *
1101                  * We need enough to refill the various reserves
1102                  * from scratch - copygc will use its entire
1103                  * reserve all at once, then run again when
1104                  * its reserve is refilled (from the formerly
1105                  * available buckets).
1106                  *
1107                  * This reserve is just used when considering if
1108                  * allocations for foreground writes must wait -
1109                  * not -ENOSPC calculations.
1110                  */
1111                 for (j = 0; j < RESERVE_NONE; j++)
1112                         dev_reserve += ca->free[j].size;
1113
1114                 dev_reserve += 1;       /* btree write point */
1115                 dev_reserve += 1;       /* copygc write point */
1116                 dev_reserve += 1;       /* rebalance write point */
1117
1118                 dev_reserve *= ca->mi.bucket_size;
1119
1120                 ca->copygc_threshold = dev_reserve;
1121
1122                 capacity += bucket_to_sector(ca, ca->mi.nbuckets -
1123                                              ca->mi.first_bucket);
1124
1125                 reserved_sectors += dev_reserve * 2;
1126
1127                 bucket_size_max = max_t(unsigned, bucket_size_max,
1128                                         ca->mi.bucket_size);
1129         }
1130
1131         gc_reserve = c->opts.gc_reserve_bytes
1132                 ? c->opts.gc_reserve_bytes >> 9
1133                 : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
1134
1135         reserved_sectors = max(gc_reserve, reserved_sectors);
1136
1137         reserved_sectors = min(reserved_sectors, capacity);
1138
1139         c->capacity = capacity - reserved_sectors;
1140
1141         c->bucket_size_max = bucket_size_max;
1142
1143         if (c->capacity) {
1144                 bch2_io_timer_add(&c->io_clock[READ],
1145                                  &c->bucket_clock[READ].rescale);
1146                 bch2_io_timer_add(&c->io_clock[WRITE],
1147                                  &c->bucket_clock[WRITE].rescale);
1148         } else {
1149                 bch2_io_timer_del(&c->io_clock[READ],
1150                                  &c->bucket_clock[READ].rescale);
1151                 bch2_io_timer_del(&c->io_clock[WRITE],
1152                                  &c->bucket_clock[WRITE].rescale);
1153         }
1154
1155         /* Wake up in case someone was waiting for buckets */
1156         closure_wake_up(&c->freelist_wait);
1157 }
1158
1159 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
1160 {
1161         struct open_bucket *ob;
1162         bool ret = false;
1163
1164         for (ob = c->open_buckets;
1165              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1166              ob++) {
1167                 spin_lock(&ob->lock);
1168                 if (ob->valid && !ob->on_partial_list &&
1169                     ob->ptr.dev == ca->dev_idx)
1170                         ret = true;
1171                 spin_unlock(&ob->lock);
1172         }
1173
1174         return ret;
1175 }
1176
1177 /* device goes ro: */
1178 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
1179 {
1180         unsigned i;
1181
1182         BUG_ON(ca->alloc_thread);
1183
1184         /* First, remove device from allocation groups: */
1185
1186         for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
1187                 clear_bit(ca->dev_idx, c->rw_devs[i].d);
1188
1189         /*
1190          * Capacity is calculated based off of devices in allocation groups:
1191          */
1192         bch2_recalc_capacity(c);
1193
1194         /* Next, close write points that point to this device... */
1195         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1196                 bch2_writepoint_stop(c, ca, &c->write_points[i]);
1197
1198         bch2_writepoint_stop(c, ca, &ca->copygc_write_point);
1199         bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
1200         bch2_writepoint_stop(c, ca, &c->btree_write_point);
1201
1202         mutex_lock(&c->btree_reserve_cache_lock);
1203         while (c->btree_reserve_cache_nr) {
1204                 struct btree_alloc *a =
1205                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1206
1207                 bch2_open_buckets_put(c, &a->ob);
1208         }
1209         mutex_unlock(&c->btree_reserve_cache_lock);
1210
1211         while (1) {
1212                 struct open_bucket *ob;
1213
1214                 spin_lock(&c->freelist_lock);
1215                 if (!ca->open_buckets_partial_nr) {
1216                         spin_unlock(&c->freelist_lock);
1217                         break;
1218                 }
1219                 ob = c->open_buckets +
1220                         ca->open_buckets_partial[--ca->open_buckets_partial_nr];
1221                 ob->on_partial_list = false;
1222                 spin_unlock(&c->freelist_lock);
1223
1224                 bch2_open_bucket_put(c, ob);
1225         }
1226
1227         bch2_ec_stop_dev(c, ca);
1228
1229         /*
1230          * Wake up threads that were blocked on allocation, so they can notice
1231          * the device can no longer be removed and the capacity has changed:
1232          */
1233         closure_wake_up(&c->freelist_wait);
1234
1235         /*
1236          * journal_res_get() can block waiting for free space in the journal -
1237          * it needs to notice there may not be devices to allocate from anymore:
1238          */
1239         wake_up(&c->journal.wait);
1240
1241         /* Now wait for any in flight writes: */
1242
1243         closure_wait_event(&c->open_buckets_wait,
1244                            !bch2_dev_has_open_write_point(c, ca));
1245 }
1246
1247 /* device goes rw: */
1248 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
1249 {
1250         unsigned i;
1251
1252         for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
1253                 if (ca->mi.data_allowed & (1 << i))
1254                         set_bit(ca->dev_idx, c->rw_devs[i].d);
1255 }
1256
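/* Wait until the allocator thread has filled every freelist: */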
1257 void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
1258 {
1259         closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
1260 }
1261
1262 /* stop allocator thread: */
1263 void bch2_dev_allocator_stop(struct bch_dev *ca)
1264 {
1265         struct task_struct *p;
1266
1267         p = rcu_dereference_protected(ca->alloc_thread, 1);
1268         ca->alloc_thread = NULL;
1269
1270         /*
1271          * We need an rcu barrier between setting ca->alloc_thread = NULL and
1272          * the thread shutting down to avoid bch2_wake_allocator() racing:
1273          *
1274          * XXX: it would be better to have the rcu barrier be asynchronous
1275          * instead of blocking us here
1276          */
1277         synchronize_rcu();
1278
1279         if (p) {
1280                 kthread_stop(p);
1281                 put_task_struct(p);
1282         }
1283 }
1284
1285 /* start allocator thread: */
1286 int bch2_dev_allocator_start(struct bch_dev *ca)
1287 {
1288         struct task_struct *p;
1289
1290         /*
1291          * allocator thread already started?
1292          */
1293         if (ca->alloc_thread)
1294                 return 0;
1295
1296         p = kthread_create(bch2_allocator_thread, ca,
1297                            "bch_alloc[%s]", ca->name);
1298         if (IS_ERR(p))
1299                 return PTR_ERR(p);
1300
1301         get_task_struct(p);
1302         rcu_assign_pointer(ca->alloc_thread, p);
1303         wake_up_process(p);
1304         return 0;
1305 }
1306
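/*
 * Release btree node writes that were held back while the allocator was
 * starting up: repeatedly walk the btree node cache writing out dirty nodes
 * until no interior updates are left pending:
 */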
1307 static void flush_held_btree_writes(struct bch_fs *c)
1308 {
1309         struct bucket_table *tbl;
1310         struct rhash_head *pos;
1311         struct btree *b;
1312         bool nodes_blocked;
1313         size_t i;
1314         struct closure cl;
1315
1316         closure_init_stack(&cl);
1317
1318         clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
1319 again:
1320         pr_debug("flushing dirty btree nodes");
1321         cond_resched();
1322         closure_wait(&c->btree_interior_update_wait, &cl);
1323
1324         nodes_blocked = false;
1325
1326         rcu_read_lock();
1327         for_each_cached_btree(b, c, tbl, i, pos)
1328                 if (btree_node_need_write(b)) {
1329                         if (btree_node_may_write(b)) {
1330                                 rcu_read_unlock();
1331                                 btree_node_lock_type(c, b, SIX_LOCK_read);
1332                                 bch2_btree_node_write(c, b, SIX_LOCK_read);
1333                                 six_unlock_read(&b->lock);
1334                                 goto again;
1335                         } else {
1336                                 nodes_blocked = true;
1337                         }
1338                 }
1339         rcu_read_unlock();
1340
1341         if (c->btree_roots_dirty)
1342                 bch2_journal_meta(&c->journal);
1343
1344         if (nodes_blocked) {
1345                 closure_sync(&cl);
1346                 goto again;
1347         }
1348
1349         closure_wake_up(&c->btree_interior_update_wait);
1350         closure_sync(&cl);
1351
1352         closure_wait_event(&c->btree_interior_update_wait,
1353                            !bch2_btree_interior_updates_nr_pending(c));
1354 }
1355
1356 static void allocator_start_issue_discards(struct bch_fs *c)
1357 {
1358         struct bch_dev *ca;
1359         unsigned dev_iter;
1360         size_t bu;
1361
1362         for_each_rw_member(ca, c, dev_iter)
1363                 while (fifo_pop(&ca->free_inc, bu))
1364                         blkdev_issue_discard(ca->disk_sb.bdev,
1365                                              bucket_to_sector(ca, bu),
1366                                              ca->mi.bucket_size, GFP_NOIO, 0);
1367 }
1368
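/*
 * During startup we may need more room in free_inc than was originally
 * allocated; double the fifo's size when it fills up:
 */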
1369 static int resize_free_inc(struct bch_dev *ca)
1370 {
1371         alloc_fifo free_inc;
1372
1373         if (!fifo_full(&ca->free_inc))
1374                 return 0;
1375
1376         if (!init_fifo(&free_inc,
1377                        ca->free_inc.size * 2,
1378                        GFP_KERNEL))
1379                 return -ENOMEM;
1380
1381         fifo_move(&free_inc, &ca->free_inc);
1382         swap(free_inc, ca->free_inc);
1383         free_fifo(&free_inc);
1384         return 0;
1385 }
1386
1387 static int __bch2_fs_allocator_start(struct bch_fs *c)
1388 {
1389         struct bch_dev *ca;
1390         unsigned dev_iter;
1391         u64 journal_seq = 0;
1392         long bu;
1393         int ret = 0;
1394
1395         if (test_alloc_startup(c))
1396                 goto not_enough;
1397
1398         /* Scan for buckets that are already invalidated: */
1399         for_each_rw_member(ca, c, dev_iter) {
1400                 struct bucket_array *buckets;
1401                 struct bucket_mark m;
1402
1403                 down_read(&ca->bucket_lock);
1404                 percpu_down_read_preempt_disable(&c->mark_lock);
1405
1406                 buckets = bucket_array(ca);
1407
1408                 for (bu = buckets->first_bucket;
1409                      bu < buckets->nbuckets; bu++) {
1410                         m = READ_ONCE(buckets->b[bu].mark);
1411
1412                         if (!buckets->b[bu].gen_valid ||
1413                             !test_bit(bu, ca->buckets_nouse) ||
1414                             !is_available_bucket(m) ||
1415                             m.cached_sectors)
1416                                 continue;
1417
1418                         bch2_mark_alloc_bucket(c, ca, bu, true,
1419                                         gc_pos_alloc(c, NULL), 0);
1420
1421                         fifo_push(&ca->free_inc, bu);
1422
1423                         discard_invalidated_buckets(c, ca);
1424
1425                         if (fifo_full(&ca->free[RESERVE_BTREE]))
1426                                 break;
1427                 }
1428                 percpu_up_read_preempt_enable(&c->mark_lock);
1429                 up_read(&ca->bucket_lock);
1430         }
1431
1432         /* did we find enough buckets? */
1433         for_each_rw_member(ca, c, dev_iter)
1434                 if (!fifo_full(&ca->free[RESERVE_BTREE])) {
1435                         percpu_ref_put(&ca->io_ref);
1436                         goto not_enough;
1437                 }
1438
1439         return 0;
1440 not_enough:
1441         pr_debug("not enough empty buckets; scanning for reclaimable buckets");
1442
1443         /*
1444          * We're moving buckets to freelists _before_ they've been marked as
1445          * invalidated on disk - we have to so that we can allocate new btree
1446          * nodes to mark them as invalidated on disk.
1447          *
1448          * However, we can't _write_ to any of these buckets yet - they might
1449          * have cached data in them, which is live until they're marked as
1450          * invalidated on disk:
1451          */
1452         set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
1453
1454         while (1) {
1455                 bool wrote = false;
1456
1457                 for_each_rw_member(ca, c, dev_iter) {
1458                         find_reclaimable_buckets(c, ca);
1459
1460                         while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
1461                                (bu = next_alloc_bucket(ca)) >= 0) {
1462                                 ret = resize_free_inc(ca);
1463                                 if (ret) {
1464                                         percpu_ref_put(&ca->io_ref);
1465                                         return ret;
1466                                 }
1467
1468                                 bch2_invalidate_one_bucket(c, ca, bu,
1469                                                            &journal_seq);
1470
1471                                 fifo_push(&ca->free[RESERVE_BTREE], bu);
1472                                 bucket_set_dirty(ca, bu);
1473                         }
1474                 }
1475
1476                 pr_debug("done scanning for reclaimable buckets");
1477
1478                 /*
1479                  * XXX: it's possible for this to deadlock waiting on journal reclaim,
1480                  * since we're holding btree writes. What then?
1481                  */
1482                 ret = bch2_alloc_write(c, true, &wrote);
1483
1484                 /*
1485                  * If bch2_alloc_write() did anything, it may have used some
1486                  * buckets, and we need the RESERVE_BTREE freelist full - so we
1487                  * need to loop and scan again.
1488                  * And if it errored, it may have been because there weren't
1489                  * enough buckets, so just scan and loop again as long as it
1490                  * made some progress:
1491                  */
1492                 if (!wrote && ret)
1493                         return ret;
1494                 if (!wrote && !ret)
1495                         break;
1496         }
1497
1498         pr_debug("flushing journal");
1499
1500         ret = bch2_journal_flush(&c->journal);
1501         if (ret)
1502                 return ret;
1503
1504         pr_debug("issuing discards");
1505         allocator_start_issue_discards(c);
1506
1507         set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
1508
1509         /* now flush dirty btree nodes: */
1510         flush_held_btree_writes(c);
1511
1512         return 0;
1513 }
1514
1515 int bch2_fs_allocator_start(struct bch_fs *c)
1516 {
1517         struct bch_dev *ca;
1518         unsigned i;
1519         bool wrote;
1520         int ret;
1521
1522         down_read(&c->gc_lock);
1523         ret = __bch2_fs_allocator_start(c);
1524         up_read(&c->gc_lock);
1525
1526         if (ret)
1527                 return ret;
1528
1529         for_each_rw_member(ca, c, i) {
1530                 ret = bch2_dev_allocator_start(ca);
1531                 if (ret) {
1532                         percpu_ref_put(&ca->io_ref);
1533                         return ret;
1534                 }
1535         }
1536
1537         return bch2_alloc_write(c, false, &wrote);
1538 }
1539
1540 void bch2_fs_allocator_background_init(struct bch_fs *c)
1541 {
1542         spin_lock_init(&c->freelist_lock);
1543         bch2_bucket_clock_init(c, READ);
1544         bch2_bucket_clock_init(c, WRITE);
1545
1546         c->pd_controllers_update_seconds = 5;
1547         INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
1548 }