1 /*
2  * Primary bucket allocation code
3  *
4  * Copyright 2012 Google, Inc.
5  *
6  * Allocation in bcache is done in terms of buckets:
7  *
8  * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
9  * btree pointers - they must match for the pointer to be considered valid.
10  *
11  * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
12  * bucket simply by incrementing its gen.
13  *
14  * The gens (along with the priorities; it's really the gens that are important,
15  * but the code is named as if it's the priorities) are written in an arbitrary
16  * list of buckets on disk, with a pointer to them in the journal header.
17  *
18  * When we invalidate a bucket, we have to write its new gen to disk and wait
19  * for that write to complete before we use it - otherwise after a crash we
20  * could have pointers that appeared to be good but pointed to data that had
21  * been overwritten.
22  *
23  * Since the gens and priorities are all stored contiguously on disk, we can
24  * batch this up: We fill up the free_inc list with freshly invalidated buckets,
25  * call prio_write(), and when prio_write() finishes we pull buckets off the
26  * free_inc list and optionally discard them.
27  *
28  * free_inc isn't the only freelist - if it was, we'd often have to sleep while
29  * priorities and gens were being written before we could allocate. c->free is a
30  * smaller freelist, and buckets on that list are always ready to be used.
31  *
32  * If we've got discards enabled, that happens when a bucket moves from the
33  * free_inc list to the free list.
34  *
35  * It's important to ensure that gens don't wrap around - with respect to
36  * either the oldest gen in the btree or the gen on disk. This is quite
37  * difficult to do in practice, but we explicitly guard against it anyway - if
38  * a bucket is in danger of wrapping around we simply skip invalidating it that
39  * time around, and we garbage collect or rewrite the priorities sooner than we
40  * would have otherwise.
41  *
42  * bch2_bucket_alloc() allocates a single bucket from a specific device.
43  *
44  * bch2_bucket_alloc_set() allocates one or more buckets from different devices
45  * in a given filesystem.
46  *
47  * invalidate_buckets() drives all the processes described above. It's called
48  * from bch2_bucket_alloc() and a few other places that need to make sure free
49  * buckets are ready.
50  *
51  * invalidate_buckets_(lru|fifo|random)() find buckets that are available to
52  * be invalidated, and then invalidate them and stick them on the free_inc
53  * list - in lru, fifo or random order.
54  */
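
/*
 * A rough sketch of the gen check described above, for illustration only -
 * the real check is ptr_stale() (used by verify_not_stale() further down),
 * which also deals with corner cases this sketch ignores:
 *
 *	bool ptr_is_valid(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
 *	{
 *		return PTR_BUCKET(ca, ptr)->mark.gen == ptr->gen;
 *	}
 *
 * Incrementing a bucket's gen therefore invalidates, in one step, every
 * pointer written while the bucket had its old gen.
 */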
55
56 #include "bcachefs.h"
57 #include "alloc.h"
58 #include "btree_update.h"
59 #include "buckets.h"
60 #include "checksum.h"
61 #include "clock.h"
62 #include "debug.h"
63 #include "error.h"
64 #include "extents.h"
65 #include "io.h"
66 #include "journal.h"
67 #include "super-io.h"
68
69 #include <linux/blkdev.h>
70 #include <linux/kthread.h>
71 #include <linux/math64.h>
72 #include <linux/random.h>
73 #include <linux/rcupdate.h>
74 #include <linux/sched/task.h>
75 #include <linux/sort.h>
76 #include <trace/events/bcachefs.h>
77
78 static void bch2_recalc_min_prio(struct bch_dev *, int);
79
80 /* Allocation groups: */
81
82 void bch2_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
83 {
84         unsigned i;
85
86         spin_lock(&grp->lock);
87
88         for (i = 0; i < grp->nr; i++)
89                 if (grp->d[i].dev == ca) {
90                         grp->nr--;
91                         memmove(&grp->d[i],
92                                 &grp->d[i + 1],
93                                 (grp->nr - i) * sizeof(grp->d[0]));
94                         break;
95                 }
96
97         spin_unlock(&grp->lock);
98 }
99
100 void bch2_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
101 {
102         unsigned i;
103
104         spin_lock(&grp->lock);
105         for (i = 0; i < grp->nr; i++)
106                 if (grp->d[i].dev == ca)
107                         goto out;
108
109         BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
110
111         grp->d[grp->nr++].dev = ca;
112 out:
113         spin_unlock(&grp->lock);
114 }
115
116 /* Ratelimiting/PD controllers */
117
118 static void pd_controllers_update(struct work_struct *work)
119 {
120         struct bch_fs *c = container_of(to_delayed_work(work),
121                                            struct bch_fs,
122                                            pd_controllers_update);
123         struct bch_dev *ca;
124         unsigned i, iter;
125
126         /* All units are in bytes */
127         u64 faster_tiers_size   = 0;
128         u64 faster_tiers_dirty  = 0;
129
130         u64 fastest_tier_size   = 0;
131         u64 fastest_tier_free   = 0;
132         u64 copygc_can_free     = 0;
133
134         rcu_read_lock();
135         for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
136                 bch2_pd_controller_update(&c->tiers[i].pd,
137                                 div_u64(faster_tiers_size *
138                                         c->tiering_percent, 100),
139                                 faster_tiers_dirty,
140                                 -1);
141
142                 spin_lock(&c->tiers[i].devs.lock);
143                 group_for_each_dev(ca, &c->tiers[i].devs, iter) {
144                         struct bch_dev_usage stats = bch2_dev_usage_read(ca);
145                         unsigned bucket_bits = ca->bucket_bits + 9;
146
147                         u64 size = (ca->mi.nbuckets -
148                                     ca->mi.first_bucket) << bucket_bits;
149                         u64 dirty = stats.buckets_dirty << bucket_bits;
150                         u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
151                         /*
152                          * Bytes of internal fragmentation, which can be
153                          * reclaimed by copy GC
154                          */
155                         s64 fragmented = ((stats.buckets_dirty +
156                                            stats.buckets_cached) <<
157                                           bucket_bits) -
158                                 ((stats.sectors[S_DIRTY] +
159                                   stats.sectors[S_CACHED]) << 9);
160
161                         fragmented = max(0LL, fragmented);
162
163                         bch2_pd_controller_update(&ca->moving_gc_pd,
164                                                  free, fragmented, -1);
165
166                         faster_tiers_size               += size;
167                         faster_tiers_dirty              += dirty;
168
169                         if (!c->fastest_tier ||
170                             c->fastest_tier == &c->tiers[i]) {
171                                 fastest_tier_size       += size;
172                                 fastest_tier_free       += free;
173                         }
174
175                         copygc_can_free                 += fragmented;
176                 }
177                 spin_unlock(&c->tiers[i].devs.lock);
178         }
179
180         rcu_read_unlock();
181
182         /*
183          * Throttle foreground writes if tier 0 is running out of free buckets,
184          * and either tiering or copygc can free up space.
185          *
186          * Target will be small if there isn't any work to do - we don't want to
187          * throttle foreground writes if we currently have all the free space
188          * we're ever going to have.
189          *
190          * Otherwise, if there's work to do, try to keep 20% of tier0 available
191          * for foreground writes.
192          */
193         if (c->fastest_tier)
194                 copygc_can_free = U64_MAX;
195
196         bch2_pd_controller_update(&c->foreground_write_pd,
197                                  min(copygc_can_free,
198                                      div_u64(fastest_tier_size *
199                                              c->foreground_target_percent,
200                                              100)),
201                                  fastest_tier_free,
202                                  -1);
203
204         schedule_delayed_work(&c->pd_controllers_update,
205                               c->pd_controllers_update_seconds * HZ);
206 }
207
208 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
209 {
210         unsigned bytes = offsetof(struct bch_alloc, data);
211
212         if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
213                 bytes += 2;
214         if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
215                 bytes += 2;
216
217         return DIV_ROUND_UP(bytes, sizeof(u64));
218 }
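
/*
 * Worked example (assuming the fixed part of struct bch_alloc - the fields
 * bitmask plus the gen - is 2 bytes): with both the read and write time
 * fields present, bytes = 2 + 2 + 2 = 6, which rounds up to a single u64 of
 * value; with neither field present it is likewise 1 u64.
 */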
219
220 static const char *bch2_alloc_invalid(const struct bch_fs *c,
221                                       struct bkey_s_c k)
222 {
223         if (k.k->p.inode >= c->sb.nr_devices ||
224             !c->devs[k.k->p.inode])
225                 return "invalid device";
226
227         switch (k.k->type) {
228         case BCH_ALLOC: {
229                 struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
230
231                 if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
232                         return "incorrect value size";
233                 break;
234         }
235         default:
236                 return "invalid type";
237         }
238
239         return NULL;
240 }
241
242 static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
243                                size_t size, struct bkey_s_c k)
244 {
245         buf[0] = '\0';
246
247         switch (k.k->type) {
248         case BCH_ALLOC:
249                 break;
250         }
251 }
252
253 const struct bkey_ops bch2_bkey_alloc_ops = {
254         .key_invalid    = bch2_alloc_invalid,
255         .val_to_text    = bch2_alloc_to_text,
256 };
257
258 static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
259 {
260         unsigned v;
261
262         switch (bytes) {
263         case 1:
264                 v = **p;
265                 break;
266         case 2:
267                 v = le16_to_cpup((void *) *p);
268                 break;
269         case 4:
270                 v = le32_to_cpup((void *) *p);
271                 break;
272         default:
273                 BUG();
274         }
275
276         *p += bytes;
277         return v;
278 }
279
280 static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v)
281 {
282         switch (bytes) {
283         case 1:
284                 **p = v;
285                 break;
286         case 2:
287                 *((__le16 *) *p) = cpu_to_le16(v);
288                 break;
289         case 4:
290                 *((__le32 *) *p) = cpu_to_le32(v);
291                 break;
292         default:
293                 BUG();
294         }
295
296         *p += bytes;
297 }
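
/*
 * Together these helpers treat an alloc key's value as a packed little-endian
 * stream: optional fields are appended to bch_alloc.data in the order of
 * their bits in .fields. E.g. a value carrying both timestamps is laid out,
 * roughly:
 *
 *	.fields = READ_TIME|WRITE_TIME, .gen, u16 read prio, u16 write prio
 */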
298
299 static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
300 {
301         struct bch_dev *ca;
302         struct bkey_s_c_alloc a;
303         struct bucket_mark new;
304         struct bucket *g;
305         const u8 *d;
306
307         if (k.k->type != BCH_ALLOC)
308                 return;
309
310         a = bkey_s_c_to_alloc(k);
311         ca = c->devs[a.k->p.inode];
312
313         if (a.k->p.offset >= ca->mi.nbuckets)
314                 return;
315
316         g = ca->buckets + a.k->p.offset;
317         bucket_cmpxchg(g, new, ({
318                 new.gen = a.v->gen;
319                 new.gen_valid = 1;
320         }));
321
322         d = a.v->data;
323         if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
324                 g->prio[READ] = get_alloc_field(&d, 2);
325         if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
326                 g->prio[WRITE] = get_alloc_field(&d, 2);
327 }
328
329 int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
330 {
331         struct journal_replay *r;
332         struct btree_iter iter;
333         struct bkey_s_c k;
334         int ret;
335
336         if (!c->btree_roots[BTREE_ID_ALLOC].b)
337                 return 0;
338
339         for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) {
340                 bch2_alloc_read_key(c, k);
341                 bch2_btree_iter_cond_resched(&iter);
342         }
343
344         ret = bch2_btree_iter_unlock(&iter);
345         if (ret)
346                 return ret;
347
348         list_for_each_entry(r, journal_replay_list, list) {
349                 struct bkey_i *k, *n;
350                 struct jset_entry *entry;
351
352                 for_each_jset_key(k, n, entry, &r->j)
353                         if (entry->btree_id == BTREE_ID_ALLOC)
354                                 bch2_alloc_read_key(c, bkey_i_to_s_c(k));
355         }
356
357         return 0;
358 }
359
360 static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
361                                   struct bucket *g, struct btree_iter *iter,
362                                   u64 *journal_seq)
363 {
364         struct bucket_mark m;
365         __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
366         struct bkey_i_alloc *a;
367         u8 *d;
368         int ret;
369
370         bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets));
371
372         do {
373                 ret = bch2_btree_iter_traverse(iter);
374                 if (ret)
375                         break;
376
377                 /* read mark under btree node lock: */
378                 m = READ_ONCE(g->mark);
379                 a = bkey_alloc_init(&alloc_key.k);
380                 a->k.p          = iter->pos;
381                 a->v.fields     = 0;
382                 a->v.gen        = m.gen;
383                 set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
384
385                 d = a->v.data;
386                 if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
387                         put_alloc_field(&d, 2, g->prio[READ]);
388                 if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
389                         put_alloc_field(&d, 2, g->prio[WRITE]);
390
391                 bch2_btree_iter_set_pos(iter, a->k.p);
392                 ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
393                                            BTREE_INSERT_ATOMIC|
394                                            BTREE_INSERT_NOFAIL|
395                                            BTREE_INSERT_USE_RESERVE|
396                                            BTREE_INSERT_USE_ALLOC_RESERVE|
397                                            BTREE_INSERT_NOWAIT,
398                                            BTREE_INSERT_ENTRY(iter, &a->k_i));
399                 bch2_btree_iter_cond_resched(iter);
400         } while (ret == -EINTR);
401
402         return ret;
403 }
404
405 int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
406 {
407         struct bch_dev *ca;
408         struct bucket *g;
409         struct btree_iter iter;
410         int ret;
411
412         if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
413                 return 0;
414
415         ca = c->devs[pos.inode];
416
417         if (pos.offset >= ca->mi.nbuckets)
418                 return 0;
419
420         g = ca->buckets + pos.offset;
421
422         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
423                              BTREE_ITER_INTENT);
424
425         ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL);
426         bch2_btree_iter_unlock(&iter);
427         return ret;
428 }
429
430 int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq)
431 {
432         struct btree_iter iter;
433         struct bucket *g;
434         int ret = 0;
435
436         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
437                              BTREE_ITER_INTENT);
438
439         for_each_bucket(g, ca) {
440                 ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq);
441                 if (ret)
442                         break;
443         }
444
445         bch2_btree_iter_unlock(&iter);
446         return ret;
447 }
448
449 #define BUCKET_GC_GEN_MAX       96U
450
451 /**
452  * wait_buckets_available - wait on reclaimable buckets
453  *
454  * If there aren't enough available buckets to fill up free_inc, wait until
455  * there are.
456  */
457 static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
458 {
459         unsigned long gc_count = c->gc_count;
460         int ret = 0;
461
462         while (1) {
463                 set_current_state(TASK_INTERRUPTIBLE);
464                 if (kthread_should_stop()) {
465                         ret = -1;
466                         break;
467                 }
468
469                 if (gc_count != c->gc_count)
470                         ca->inc_gen_really_needs_gc = 0;
471
472                 if ((ssize_t) (dev_buckets_available(ca) -
473                                ca->inc_gen_really_needs_gc) >=
474                     (ssize_t) fifo_free(&ca->free_inc))
475                         break;
476
477                 up_read(&c->gc_lock);
478                 schedule();
479                 try_to_freeze();
480                 down_read(&c->gc_lock);
481         }
482
483         __set_current_state(TASK_RUNNING);
484         return ret;
485 }
486
487 static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket)
488 {
489         if (expensive_debug_checks(ca->fs)) {
490                 size_t iter;
491                 long i;
492                 unsigned j;
493
494                 for (j = 0; j < RESERVE_NR; j++)
495                         fifo_for_each_entry(i, &ca->free[j], iter)
496                                 BUG_ON(i == bucket);
497                 fifo_for_each_entry(i, &ca->free_inc, iter)
498                         BUG_ON(i == bucket);
499         }
500 }
501
502 /* Bucket heap / gen */
503
504 static void bch2_recalc_min_prio(struct bch_dev *ca, int rw)
505 {
506         struct bch_fs *c = ca->fs;
507         struct prio_clock *clock = &c->prio_clock[rw];
508         struct bucket *g;
509         u16 max_delta = 1;
510         unsigned i;
511
512         lockdep_assert_held(&c->bucket_lock);
513
514         /* Determine min prio for this particular cache */
515         for_each_bucket(g, ca)
516                 max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
517
518         ca->min_prio[rw] = clock->hand - max_delta;
519
520         /*
521          * This may possibly increase the min prio for the whole cache, check
522          * that as well.
523          */
524         max_delta = 1;
525
526         for_each_member_device(ca, c, i)
527                 max_delta = max(max_delta,
528                                 (u16) (clock->hand - ca->min_prio[rw]));
529
530         clock->min_prio = clock->hand - max_delta;
531 }
532
533 static void bch2_rescale_prios(struct bch_fs *c, int rw)
534 {
535         struct prio_clock *clock = &c->prio_clock[rw];
536         struct bch_dev *ca;
537         struct bucket *g;
538         unsigned i;
539
540         trace_rescale_prios(c);
541
542         for_each_member_device(ca, c, i) {
543                 for_each_bucket(g, ca)
544                         g->prio[rw] = clock->hand -
545                                 (clock->hand - g->prio[rw]) / 2;
546
547                 bch2_recalc_min_prio(ca, rw);
548         }
549 }
550
551 static void bch2_inc_clock_hand(struct io_timer *timer)
552 {
553         struct prio_clock *clock = container_of(timer,
554                                         struct prio_clock, rescale);
555         struct bch_fs *c = container_of(clock,
556                                 struct bch_fs, prio_clock[clock->rw]);
557         u64 capacity;
558
559         mutex_lock(&c->bucket_lock);
560
561         clock->hand++;
562
563         /* if clock cannot be advanced more, rescale prio */
564         if (clock->hand == (u16) (clock->min_prio - 1))
565                 bch2_rescale_prios(c, clock->rw);
566
567         mutex_unlock(&c->bucket_lock);
568
569         capacity = READ_ONCE(c->capacity);
570
571         if (!capacity)
572                 return;
573
574         /*
575          * we only advance the clock hand again after 0.1% of the filesystem
576          * capacity has been read or written to
577          *
578          * XXX: we shouldn't really be going off of the capacity of devices in
579          * RW mode (that will be 0 when we're RO, yet we can still service
580          * reads)
581          */
582         timer->expire += capacity >> 10;
583
584         bch2_io_timer_add(&c->io_clock[clock->rw], timer);
585 }
586
587 static void bch2_prio_timer_init(struct bch_fs *c, int rw)
588 {
589         struct prio_clock *clock = &c->prio_clock[rw];
590         struct io_timer *timer = &clock->rescale;
591
592         clock->rw       = rw;
593         timer->fn       = bch2_inc_clock_hand;
594         timer->expire   = c->capacity >> 10;
595 }
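
/*
 * Illustrative numbers (assuming capacity is in 512 byte sectors): on a 2TB
 * filesystem, capacity >> 10 is roughly 4 million sectors, so the prio clock
 * hand advances about once per 2GB of reads or writes - i.e. per ~0.1% of
 * capacity, matching the comment in bch2_inc_clock_hand().
 */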
596
597 /*
598  * Background allocation thread: scans for buckets to be invalidated,
599  * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
600  * then optionally issues discard commands to the newly free buckets, then puts
601  * them on the various freelists.
602  */
603
604 static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g)
605 {
606         return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX;
607 }
608
609 static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
610                                        struct bucket_mark mark)
611 {
612         if (!is_available_bucket(mark))
613                 return false;
614
615         if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX / 2)
616                 ca->inc_gen_needs_gc++;
617
618         if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX)
619                 ca->inc_gen_really_needs_gc++;
620
621         return can_inc_bucket_gen(ca, g);
622 }
623
624 static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
625 {
626         struct bch_fs *c = ca->fs;
627         struct bucket_mark m;
628
629         spin_lock(&ca->freelist_lock);
630         if (!bch2_invalidate_bucket(ca, g, &m)) {
631                 spin_unlock(&ca->freelist_lock);
632                 return;
633         }
634
635         verify_not_on_freelist(ca, g - ca->buckets);
636         BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
637         spin_unlock(&ca->freelist_lock);
638
639         g->prio[READ] = c->prio_clock[READ].hand;
640         g->prio[WRITE] = c->prio_clock[WRITE].hand;
641
642         if (m.cached_sectors) {
643                 ca->allocator_invalidating_data = true;
644         } else if (m.journal_seq_valid) {
645                 u64 journal_seq = atomic64_read(&c->journal.seq);
646                 u64 bucket_seq  = journal_seq;
647
648                 bucket_seq &= ~((u64) U16_MAX);
649                 bucket_seq |= m.journal_seq;
650
651                 if (bucket_seq > journal_seq)
652                         bucket_seq -= 1 << 16;
653
654                 ca->allocator_journal_seq_flush =
655                         max(ca->allocator_journal_seq_flush, bucket_seq);
656         }
657 }
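
/*
 * Example of the journal seq reconstruction above (invented numbers): the
 * bucket mark only stores the low 16 bits of the seq, so with
 * journal_seq = 0x12345 and m.journal_seq = 0xfff0 we splice to get 0x1fff0;
 * that's ahead of the journal, so we subtract 1 << 16 and flush up to 0xfff0,
 * the most recent seq with those low bits that isn't in the future.
 */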
658
659 /*
660  * Determines what order we're going to reuse buckets, smallest
661  * bucket_sort_key() first.
662  *
664  * - We take into account the read prio of the bucket, which gives us an
665  *   indication of how hot the data is -- we scale the prio so that the prio
666  *   farthest from the clock is worth 1/8th of the closest.
667  *
668  * - The number of sectors of cached data in the bucket, which gives us an
669  *   indication of the cost in cache misses this eviction will cause.
670  *
671  * - If hotness * sectors used compares equal, we pick the bucket with the
672  *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
673  *   number repeatedly forces us to run mark and sweep gc to avoid generation
674  *   number wraparound.
675  */
676
677 static unsigned long bucket_sort_key(struct bch_dev *ca,
678                                      struct bucket *g,
679                                      struct bucket_mark m)
680 {
681         /*
682          * Time since last read, scaled to [0, 8) where larger value indicates
683          * more recently read data:
684          */
685         unsigned long hotness =
686                 (g->prio[READ]                  - ca->min_prio[READ]) * 7 /
687                 (ca->fs->prio_clock[READ].hand  - ca->min_prio[READ]);
688
689         /* How much we want to keep the data in this bucket: */
690         unsigned long data_wantness =
691                 (hotness + 1) * bucket_sectors_used(m);
692
693         unsigned long needs_journal_commit =
694                     bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk);
695
696         return  (data_wantness << 9) |
697                 (needs_journal_commit << 8) |
698                 bucket_gc_gen(ca, g);
699 }
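
/*
 * Example of the packing above (invented numbers): a bucket with hotness 3,
 * 100 cached sectors, no journal commit needed and gc gen 5 gets the key
 * ((3 + 1) * 100) << 9 | 0 << 8 | 5 = 204805. Since we reuse smallest keys
 * first, cold or empty buckets win, then buckets already flushed to the
 * journal, with gc gen as the final tiebreaker.
 */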
700
701 static inline int bucket_alloc_cmp(alloc_heap *h,
702                                    struct alloc_heap_entry l,
703                                    struct alloc_heap_entry r)
704 {
705         return (l.key > r.key) - (l.key < r.key);
706 }
707
708 static void invalidate_buckets_lru(struct bch_dev *ca)
709 {
710         struct alloc_heap_entry e;
711         struct bucket *g;
712
713         ca->alloc_heap.used = 0;
714
715         mutex_lock(&ca->fs->bucket_lock);
716         bch2_recalc_min_prio(ca, READ);
717         bch2_recalc_min_prio(ca, WRITE);
718
719         /*
720          * Find buckets with lowest read priority, by building a maxheap sorted
721          * by read priority and repeatedly replacing the maximum element until
722          * all buckets have been visited.
723          */
724         for_each_bucket(g, ca) {
725                 struct bucket_mark m = READ_ONCE(g->mark);
726
727                 if (!bch2_can_invalidate_bucket(ca, g, m))
728                         continue;
729
730                 e = (struct alloc_heap_entry) {
731                         .bucket = g - ca->buckets,
732                         .key    = bucket_sort_key(ca, g, m)
733                 };
734
735                 heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
736         }
737
738         heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
739
740         /*
741          * If we run out of buckets to invalidate, bch2_allocator_thread() will
742          * kick stuff and retry us
743          */
744         while (!fifo_full(&ca->free_inc) &&
745                heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
746                 bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]);
747
748         mutex_unlock(&ca->fs->bucket_lock);
749 }
750
751 static void invalidate_buckets_fifo(struct bch_dev *ca)
752 {
753         struct bucket_mark m;
754         struct bucket *g;
755         size_t checked = 0;
756
757         while (!fifo_full(&ca->free_inc)) {
758                 if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
759                     ca->fifo_last_bucket >= ca->mi.nbuckets)
760                         ca->fifo_last_bucket = ca->mi.first_bucket;
761
762                 g = ca->buckets + ca->fifo_last_bucket++;
763                 m = READ_ONCE(g->mark);
764
765                 if (bch2_can_invalidate_bucket(ca, g, m))
766                         bch2_invalidate_one_bucket(ca, g);
767
768                 if (++checked >= ca->mi.nbuckets)
769                         return;
770         }
771 }
772
773 static void invalidate_buckets_random(struct bch_dev *ca)
774 {
775         struct bucket_mark m;
776         struct bucket *g;
777         size_t checked = 0;
778
779         while (!fifo_full(&ca->free_inc)) {
780                 size_t n = bch2_rand_range(ca->mi.nbuckets -
781                                           ca->mi.first_bucket) +
782                         ca->mi.first_bucket;
783
784                 g = ca->buckets + n;
785                 m = READ_ONCE(g->mark);
786
787                 if (bch2_can_invalidate_bucket(ca, g, m))
788                         bch2_invalidate_one_bucket(ca, g);
789
790                 if (++checked >= ca->mi.nbuckets / 2)
791                         return;
792         }
793 }
794
795 static void invalidate_buckets(struct bch_dev *ca)
796 {
797         ca->inc_gen_needs_gc                    = 0;
798         ca->inc_gen_really_needs_gc             = 0;
799
800         switch (ca->mi.replacement) {
801         case CACHE_REPLACEMENT_LRU:
802                 invalidate_buckets_lru(ca);
803                 break;
804         case CACHE_REPLACEMENT_FIFO:
805                 invalidate_buckets_fifo(ca);
806                 break;
807         case CACHE_REPLACEMENT_RANDOM:
808                 invalidate_buckets_random(ca);
809                 break;
810         }
811 }
812
813 static int size_t_cmp(const void *_l, const void *_r)
814 {
815         const size_t *l = _l, *r = _r;
816
817         return (*l > *r) - (*l < *r);
818 }
819
820 static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
821                                     u64 *journal_seq)
822 {
823         struct btree_iter iter;
824         unsigned nr_invalidated = 0;
825         size_t b, i;
826         int ret = 0;
827
828         bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
829                              BTREE_ITER_INTENT);
830
831         fifo_for_each_entry(b, &ca->free_inc, i) {
832                 ret = __bch2_alloc_write_key(c, ca, ca->buckets + b,
833                                              &iter, journal_seq);
834                 if (ret)
835                         break;
836
837                 nr_invalidated++;
838         }
839
840         bch2_btree_iter_unlock(&iter);
841         return nr_invalidated ?: ret;
842 }
843
844 /*
845  * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
846  * then add it to the freelist, waiting until there's room if necessary:
847  */
848 static void discard_invalidated_bucket(struct bch_dev *ca, long bucket)
849 {
850         if (ca->mi.discard &&
851             blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
852                 blkdev_issue_discard(ca->disk_sb.bdev,
853                                      bucket_to_sector(ca, bucket),
854                                      ca->mi.bucket_size, GFP_NOIO, 0);
855
856         while (1) {
857                 bool pushed = false;
858                 unsigned i;
859
860                 set_current_state(TASK_INTERRUPTIBLE);
861
862                 /*
863                  * Don't remove from free_inc until after it's added to
864                  * freelist, so gc can find it:
865                  */
866                 spin_lock(&ca->freelist_lock);
867                 for (i = 0; i < RESERVE_NR; i++)
868                         if (fifo_push(&ca->free[i], bucket)) {
869                                 fifo_pop(&ca->free_inc, bucket);
870                                 closure_wake_up(&ca->fs->freelist_wait);
871                                 pushed = true;
872                                 break;
873                         }
874                 spin_unlock(&ca->freelist_lock);
875
876                 if (pushed)
877                         break;
878
879                 if (kthread_should_stop())
880                         break;
881
882                 schedule();
883                 try_to_freeze();
884         }
885
886         __set_current_state(TASK_RUNNING);
887 }
888
889 /**
890  * bch2_allocator_thread - move buckets from free_inc to reserves
891  *
892  * The free_inc FIFO is populated by invalidate_buckets(), and
893  * the reserves are depleted by bucket allocation. When we run out
894  * of free_inc, try to invalidate some buckets and write out
895  * prios and gens.
896  */
897 static int bch2_allocator_thread(void *arg)
898 {
899         struct bch_dev *ca = arg;
900         struct bch_fs *c = ca->fs;
901         u64 journal_seq;
902         size_t bucket;
903         int ret;
904
905         set_freezable();
906
907         while (1) {
908                 while (1) {
909                         while (ca->nr_invalidated) {
910                                 BUG_ON(fifo_empty(&ca->free_inc));
911
912                                 bucket = fifo_peek(&ca->free_inc);
913                                 discard_invalidated_bucket(ca, bucket);
914                                 if (kthread_should_stop())
915                                         goto out;
916                                 --ca->nr_invalidated;
917                         }
918
919                         if (fifo_empty(&ca->free_inc))
920                                 break;
921
922                         journal_seq = 0;
923                         ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
924                         if (ret < 0)
925                                 goto out;
926
927                         ca->nr_invalidated = ret;
928
929                         if (ca->nr_invalidated == fifo_used(&ca->free_inc))
930                                 ca->alloc_thread_started = true;
931
932                         if (ca->allocator_invalidating_data)
933                                 bch2_journal_flush_seq(&c->journal, journal_seq);
934                         else if (ca->allocator_journal_seq_flush)
935                                 bch2_journal_flush_seq(&c->journal,
936                                                        ca->allocator_journal_seq_flush);
937                 }
938
939                 /* Reset front/back so we can easily sort fifo entries later: */
940                 ca->free_inc.front = ca->free_inc.back  = 0;
941                 ca->allocator_journal_seq_flush         = 0;
942                 ca->allocator_invalidating_data         = false;
943
944                 down_read(&c->gc_lock);
945                 if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
946                         up_read(&c->gc_lock);
947                         goto out;
948                 }
949
950                 while (1) {
951                         /*
952                          * Find some buckets that we can invalidate, either
953                          * they're completely unused, or only contain clean data
954                          * that's been written back to the backing device or
955                          * another cache tier
956                          */
957
958                         invalidate_buckets(ca);
959                         trace_alloc_batch(ca, fifo_used(&ca->free_inc),
960                                           ca->free_inc.size);
961
962                         if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
963                              (!fifo_full(&ca->free_inc) &&
964                               ca->inc_gen_really_needs_gc >=
965                               fifo_free(&ca->free_inc))) &&
966                             c->gc_thread) {
967                                 atomic_inc(&c->kick_gc);
968                                 wake_up_process(c->gc_thread);
969                         }
970
971                         if (fifo_full(&ca->free_inc))
972                                 break;
973
974                         if (wait_buckets_available(c, ca)) {
975                                 up_read(&c->gc_lock);
976                                 goto out;
977                         }
978                 }
979                 up_read(&c->gc_lock);
980
981                 BUG_ON(ca->free_inc.front);
982
983                 spin_lock(&ca->freelist_lock);
984                 sort(ca->free_inc.data,
985                      ca->free_inc.back,
986                      sizeof(ca->free_inc.data[0]),
987                      size_t_cmp, NULL);
988                 spin_unlock(&ca->freelist_lock);
989
990                 /*
991                  * free_inc is now full of newly-invalidated buckets: next,
992                  * write out the new bucket gens:
993                  */
994         }
995 out:
996         /*
997          * Avoid a race with bch2_usage_update() trying to wake us up after
998          * we've exited:
999          */
1000         synchronize_rcu();
1001         return 0;
1002 }
1003
1004 /* Allocation */
1005
1006 static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
1007 {
1008         struct bucket *g;
1009         long r = -1;
1010
1011         if (!down_read_trylock(&c->gc_lock))
1012                 return r;
1013
1014         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
1015                 goto out;
1016
1017         for_each_bucket(g, ca)
1018                 if (!g->mark.touched_this_mount &&
1019                     is_available_bucket(g->mark) &&
1020                     bch2_mark_alloc_bucket_startup(ca, g)) {
1021                         r = g - ca->buckets;
1022                         break;
1023                 }
1024 out:
1025         up_read(&c->gc_lock);
1026         return r;
1027 }
1028
1029 /**
1030  * bch2_bucket_alloc - allocate a single bucket from a specific device
1031  *
1032  * Returns index of bucket on success, -1 on failure
1033  */
1034 long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
1035                        enum alloc_reserve reserve)
1036 {
1037         size_t r;
1038
1039         spin_lock(&ca->freelist_lock);
1040         if (likely(fifo_pop(&ca->free[RESERVE_NONE], r)))
1041                 goto out;
1042
1043         switch (reserve) {
1044         case RESERVE_ALLOC:
1045                 if (fifo_pop(&ca->free[RESERVE_BTREE], r))
1046                         goto out;
1047                 break;
1048         case RESERVE_BTREE:
1049                 if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
1050                     ca->free[RESERVE_BTREE].size &&
1051                     fifo_pop(&ca->free[RESERVE_BTREE], r))
1052                         goto out;
1053                 break;
1054         case RESERVE_MOVINGGC:
1055                 if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r))
1056                         goto out;
1057                 break;
1058         default:
1059                 break;
1060         }
1061
1062         spin_unlock(&ca->freelist_lock);
1063
1064         if (unlikely(!ca->alloc_thread_started) &&
1065             (r = bch2_bucket_alloc_startup(c, ca)) >= 0) {
1066                 verify_not_on_freelist(ca, r);
1067                 goto out2;
1068         }
1069
1070         trace_bucket_alloc_fail(ca, reserve);
1071         return -1;
1072 out:
1073         verify_not_on_freelist(ca, r);
1074         spin_unlock(&ca->freelist_lock);
1075
1076         bch2_wake_allocator(ca);
1077 out2:
1078         ca->buckets[r].prio[READ]       = c->prio_clock[READ].hand;
1079         ca->buckets[r].prio[WRITE]      = c->prio_clock[WRITE].hand;
1080
1081         trace_bucket_alloc(ca, reserve);
1082         return r;
1083 }
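
/*
 * Sketch of a typical caller (error handling elided; RESERVE_NONE stands in
 * for whatever reserve the caller actually needs):
 *
 *	long b = bch2_bucket_alloc(c, ca, RESERVE_NONE);
 *	if (b < 0)
 *		return -ENOSPC;	// or wait on c->freelist_wait and retry
 *	// bucket_to_sector(ca, b) gives the first sector of the new bucket
 */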
1084
1085 enum bucket_alloc_ret {
1086         ALLOC_SUCCESS,
1087         NO_DEVICES,             /* -EROFS */
1088         FREELIST_EMPTY,         /* Allocator thread not keeping up */
1089 };
1090
1091 static void recalc_alloc_group_weights(struct bch_fs *c,
1092                                        struct dev_group *devs)
1093 {
1094         struct bch_dev *ca;
1095         u64 available_buckets = 1; /* avoid a divide by zero... */
1096         unsigned i;
1097
1098         for (i = 0; i < devs->nr; i++) {
1099                 ca = devs->d[i].dev;
1100
1101                 devs->d[i].weight = dev_buckets_free(ca);
1102                 available_buckets += devs->d[i].weight;
1103         }
1104
1105         for (i = 0; i < devs->nr; i++) {
1106                 const unsigned min_weight = U32_MAX >> 4;
1107                 const unsigned max_weight = U32_MAX;
1108
1109                 devs->d[i].weight =
1110                         min_weight +
1111                         div64_u64(devs->d[i].weight *
1112                                   devs->nr *
1113                                   (max_weight - min_weight),
1114                                   available_buckets);
1115                 devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
1116         }
1117 }
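
/*
 * Example (invented numbers): with two devices in a group holding 300 and 100
 * free buckets, the weights come out roughly proportional to free space,
 * scaled into [U32_MAX / 16, U32_MAX] and clamped; the device with fewer free
 * buckets is then skipped by the get_random_int() > weight test in
 * bch2_bucket_alloc_group() more often, biasing allocation toward the device
 * with more free space.
 */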
1118
1119 static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c,
1120                                                     struct open_bucket *ob,
1121                                                     enum alloc_reserve reserve,
1122                                                     unsigned nr_replicas,
1123                                                     struct dev_group *devs,
1124                                                     long *devs_used)
1125 {
1126         enum bucket_alloc_ret ret;
1127         unsigned fail_idx = -1, i;
1128         unsigned available = 0;
1129
1130         BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
1131
1132         if (ob->nr_ptrs >= nr_replicas)
1133                 return ALLOC_SUCCESS;
1134
1135         spin_lock(&devs->lock);
1136
1137         for (i = 0; i < devs->nr; i++)
1138                 available += !test_bit(devs->d[i].dev->dev_idx,
1139                                        devs_used);
1140
1141         recalc_alloc_group_weights(c, devs);
1142
1143         i = devs->cur_device;
1144
1145         while (ob->nr_ptrs < nr_replicas) {
1146                 struct bch_dev *ca;
1147                 long bucket;
1148
1149                 if (!available) {
1150                         ret = NO_DEVICES;
1151                         goto err;
1152                 }
1153
1154                 i++;
1155                 i %= devs->nr;
1156
1157                 ret = FREELIST_EMPTY;
1158                 if (i == fail_idx)
1159                         goto err;
1160
1161                 ca = devs->d[i].dev;
1162
1163                 if (test_bit(ca->dev_idx, devs_used))
1164                         continue;
1165
1166                 if (fail_idx == -1 &&
1167                     get_random_int() > devs->d[i].weight)
1168                         continue;
1169
1170                 bucket = bch2_bucket_alloc(c, ca, reserve);
1171                 if (bucket < 0) {
1172                         if (fail_idx == -1)
1173                                 fail_idx = i;
1174                         continue;
1175                 }
1176
1177                 /*
1178                  * open_bucket_add_buckets expects new pointers at the head of
1179                  * the list:
1180                  */
1181                 memmove(&ob->ptrs[1],
1182                         &ob->ptrs[0],
1183                         ob->nr_ptrs * sizeof(ob->ptrs[0]));
1184                 memmove(&ob->ptr_offset[1],
1185                         &ob->ptr_offset[0],
1186                         ob->nr_ptrs * sizeof(ob->ptr_offset[0]));
1187                 ob->nr_ptrs++;
1188                 ob->ptrs[0] = (struct bch_extent_ptr) {
1189                         .gen    = ca->buckets[bucket].mark.gen,
1190                         .offset = bucket_to_sector(ca, bucket),
1191                         .dev    = ca->dev_idx,
1192                 };
1193                 ob->ptr_offset[0] = 0;
1194
1195                 __set_bit(ca->dev_idx, devs_used);
1196                 available--;
1197                 devs->cur_device = i;
1198         }
1199
1200         ret = ALLOC_SUCCESS;
1201 err:
1202         EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
1203         spin_unlock(&devs->lock);
1204         return ret;
1205 }
1206
1207 static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
1208                                                     struct write_point *wp,
1209                                                     struct open_bucket *ob,
1210                                                     unsigned nr_replicas,
1211                                                     enum alloc_reserve reserve,
1212                                                     long *devs_used)
1213 {
1214         struct bch_tier *tier;
1215         /*
1216          * this should implement policy - for a given type of allocation, decide
1217          * which devices to allocate from:
1218          *
1219          * XXX: switch off wp->type and do something more intelligent here
1220          */
1221         if (wp->group)
1222                 return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
1223                                               wp->group, devs_used);
1224
1225         /* foreground writes: prefer fastest tier: */
1226         tier = READ_ONCE(c->fastest_tier);
1227         if (tier)
1228                 bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
1229                                        &tier->devs, devs_used);
1230
1231         return bch2_bucket_alloc_group(c, ob, reserve, nr_replicas,
1232                                       &c->all_devs, devs_used);
1233 }
1234
1235 static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
1236                                 struct open_bucket *ob, unsigned nr_replicas,
1237                                 enum alloc_reserve reserve, long *devs_used,
1238                                 struct closure *cl)
1239 {
1240         bool waiting = false;
1241
1242         while (1) {
1243                 switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
1244                                                reserve, devs_used)) {
1245                 case ALLOC_SUCCESS:
1246                         if (waiting)
1247                                 closure_wake_up(&c->freelist_wait);
1248
1249                         return 0;
1250
1251                 case NO_DEVICES:
1252                         if (waiting)
1253                                 closure_wake_up(&c->freelist_wait);
1254                         return -EROFS;
1255
1256                 case FREELIST_EMPTY:
1257                         if (!cl || waiting)
1258                                 trace_freelist_empty_fail(c,
1259                                                         reserve, cl);
1260
1261                         if (!cl)
1262                                 return -ENOSPC;
1263
1264                         if (waiting)
1265                                 return -EAGAIN;
1266
1267                         /* Retry allocation after adding ourself to waitlist: */
1268                         closure_wait(&c->freelist_wait, cl);
1269                         waiting = true;
1270                         break;
1271                 default:
1272                         BUG();
1273                 }
1274         }
1275 }
1276
1277 /* Open buckets: */
1278
1279 /*
1280  * Open buckets represent one or more buckets (on multiple devices) that are
1281  * currently being allocated from. They serve two purposes:
1282  *
1283  *  - They track buckets that have been partially allocated, allowing for
1284  *    sub-bucket sized allocations - they're used by the sector allocator below
1285  *
1286  *  - They provide a reference to the buckets they own that mark and sweep GC
1287  *    can find, until the new allocation has a pointer to it inserted into the
1288  *    btree
1289  *
1290  * When allocating some space with the sector allocator, the allocation comes
1291  * with a reference to an open bucket - the caller is required to put that
1292  * reference _after_ doing the index update that makes its allocation reachable.
1293  */
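
/*
 * Putting the pieces together, a write roughly does the following (sketch -
 * the real write path also handles -EAGAIN/-ENOSPC and partial allocations):
 *
 *	ob = bch2_alloc_sectors_start(c, wp, nr_replicas,
 *				      nr_replicas_required, reserve, cl);
 *	bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, sectors);
 *	bch2_alloc_sectors_done(c, wp, ob);
 *	// ... do the data write and the index update, then:
 *	bch2_open_bucket_put(c, ob);
 */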
1294
1295 static void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
1296 {
1297         const struct bch_extent_ptr *ptr;
1298
1299         lockdep_assert_held(&c->open_buckets_lock);
1300
1301         open_bucket_for_each_ptr(ob, ptr) {
1302                 struct bch_dev *ca = c->devs[ptr->dev];
1303
1304                 bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false);
1305         }
1306
1307         ob->nr_ptrs = 0;
1308
1309         list_move(&ob->list, &c->open_buckets_free);
1310         c->open_buckets_nr_free++;
1311         closure_wake_up(&c->open_buckets_wait);
1312 }
1313
1314 void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *b)
1315 {
1316         if (atomic_dec_and_test(&b->pin)) {
1317                 spin_lock(&c->open_buckets_lock);
1318                 __bch2_open_bucket_put(c, b);
1319                 spin_unlock(&c->open_buckets_lock);
1320         }
1321 }
1322
1323 static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c,
1324                                                unsigned nr_reserved,
1325                                                struct closure *cl)
1326 {
1327         struct open_bucket *ret;
1328
1329         spin_lock(&c->open_buckets_lock);
1330
1331         if (c->open_buckets_nr_free > nr_reserved) {
1332                 BUG_ON(list_empty(&c->open_buckets_free));
1333                 ret = list_first_entry(&c->open_buckets_free,
1334                                        struct open_bucket, list);
1335                 list_move(&ret->list, &c->open_buckets_open);
1336                 BUG_ON(ret->nr_ptrs);
1337
1338                 atomic_set(&ret->pin, 1); /* XXX */
1339                 ret->has_full_ptrs      = false;
1340
1341                 c->open_buckets_nr_free--;
1342                 trace_open_bucket_alloc(c, cl);
1343         } else {
1344                 trace_open_bucket_alloc_fail(c, cl);
1345
1346                 if (cl) {
1347                         closure_wait(&c->open_buckets_wait, cl);
1348                         ret = ERR_PTR(-EAGAIN);
1349                 } else
1350                         ret = ERR_PTR(-ENOSPC);
1351         }
1352
1353         spin_unlock(&c->open_buckets_lock);
1354
1355         return ret;
1356 }
1357
1358 static unsigned ob_ptr_sectors_free(struct bch_fs *c,
1359                                     struct open_bucket *ob,
1360                                     struct bch_extent_ptr *ptr)
1361 {
1362         struct bch_dev *ca = c->devs[ptr->dev];
1363         unsigned i = ptr - ob->ptrs;
1364         unsigned bucket_size = ca->mi.bucket_size;
1365         unsigned used = (ptr->offset & (bucket_size - 1)) +
1366                 ob->ptr_offset[i];
1367
1368         BUG_ON(used > bucket_size);
1369
1370         return bucket_size - used;
1371 }
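
/*
 * e.g. (made-up geometry): with 1024 sector buckets, a pointer that starts
 * 256 sectors into its bucket and from which 128 sectors have already been
 * handed out via ptr_offset has 1024 - (256 + 128) = 640 sectors left.
 */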
1372
1373 static unsigned open_bucket_sectors_free(struct bch_fs *c,
1374                                          struct open_bucket *ob,
1375                                          unsigned nr_replicas)
1376 {
1377         unsigned i, sectors_free = UINT_MAX;
1378
1379         for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++)
1380                 sectors_free = min(sectors_free,
1381                                    ob_ptr_sectors_free(c, ob, &ob->ptrs[i]));
1382
1383         return sectors_free != UINT_MAX ? sectors_free : 0;
1384 }
1385
1386 static void open_bucket_copy_unused_ptrs(struct bch_fs *c,
1387                                          struct open_bucket *new,
1388                                          struct open_bucket *old)
1389 {
1390         unsigned i;
1391
1392         for (i = 0; i < old->nr_ptrs; i++)
1393                 if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) {
1394                         struct bch_extent_ptr tmp = old->ptrs[i];
1395
1396                         tmp.offset += old->ptr_offset[i];
1397                         new->ptrs[new->nr_ptrs] = tmp;
1398                         new->ptr_offset[new->nr_ptrs] = 0;
1399                         new->nr_ptrs++;
1400                 }
1401 }
1402
1403 static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob)
1404 {
1405 #ifdef CONFIG_BCACHEFS_DEBUG
1406         const struct bch_extent_ptr *ptr;
1407
1408         open_bucket_for_each_ptr(ob, ptr) {
1409                 struct bch_dev *ca = c->devs[ptr->dev];
1410
1411                 BUG_ON(ptr_stale(ca, ptr));
1412         }
1413 #endif
1414 }
1415
1416 /* Sector allocator */
1417
1418 static struct open_bucket *lock_writepoint(struct bch_fs *c,
1419                                            struct write_point *wp)
1420 {
1421         struct open_bucket *ob;
1422
1423         while ((ob = ACCESS_ONCE(wp->b))) {
1424                 mutex_lock(&ob->lock);
1425                 if (wp->b == ob)
1426                         break;
1427
1428                 mutex_unlock(&ob->lock);
1429         }
1430
1431         return ob;
1432 }
1433
1434 static int open_bucket_add_buckets(struct bch_fs *c,
1435                                    struct write_point *wp,
1436                                    struct open_bucket *ob,
1437                                    unsigned nr_replicas,
1438                                    unsigned nr_replicas_required,
1439                                    enum alloc_reserve reserve,
1440                                    struct closure *cl)
1441 {
1442         long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
1443         unsigned i;
1444         int ret;
1445
1446         /*
1447          * We might be allocating pointers to add to an existing extent
1448          * (tiering/copygc/migration) - if so, some of the pointers in our
1449          * existing open bucket might duplicate devices we already have. This is
1450          * moderately annoying.
1451          */
1452
1453         /* Short circuit all the fun stuff if possible: */
1454         if (ob->nr_ptrs >= nr_replicas)
1455                 return 0;
1456
1457         memset(devs_used, 0, sizeof(devs_used));
1458
1459         for (i = 0; i < ob->nr_ptrs; i++)
1460                 __set_bit(ob->ptrs[i].dev, devs_used);
1461
1462         ret = bch2_bucket_alloc_set(c, wp, ob, nr_replicas,
1463                                    reserve, devs_used, cl);
1464
1465         if (ret == -EROFS &&
1466             ob->nr_ptrs >= nr_replicas_required)
1467                 ret = 0;
1468
1469         return ret;
1470 }
1471
1472 /*
1473  * Get us an open_bucket we can allocate from, return with it locked:
1474  */
1475 struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c,
1476                                              struct write_point *wp,
1477                                              unsigned nr_replicas,
1478                                              unsigned nr_replicas_required,
1479                                              enum alloc_reserve reserve,
1480                                              struct closure *cl)
1481 {
1482         struct open_bucket *ob;
1483         unsigned open_buckets_reserved = wp == &c->btree_write_point
1484                 ? 0 : BTREE_NODE_RESERVE;
1485         int ret;
1486
1487         BUG_ON(!nr_replicas);
1488 retry:
1489         ob = lock_writepoint(c, wp);
1490
1491         /*
1492          * If ob->sectors_free == 0, one or more of the buckets ob points to is
1493          * full. We can't drop pointers from an open bucket - garbage collection
1494          * still needs to find them; instead, we must allocate a new open bucket
1495          * and copy any pointers to non-full buckets into the new open bucket.
1496          */
1497         if (!ob || ob->has_full_ptrs) {
1498                 struct open_bucket *new_ob;
1499
1500                 new_ob = bch2_open_bucket_get(c, open_buckets_reserved, cl);
1501                 if (IS_ERR(new_ob))
1502                         return new_ob;
1503
1504                 mutex_lock(&new_ob->lock);
1505
1506                 /*
1507                  * We point the write point at the open_bucket before doing the
1508                  * allocation to avoid a race with shutdown:
1509                  */
1510                 if (race_fault() ||
1511                     cmpxchg(&wp->b, ob, new_ob) != ob) {
1512                         /* We raced: */
1513                         mutex_unlock(&new_ob->lock);
1514                         bch2_open_bucket_put(c, new_ob);
1515
1516                         if (ob)
1517                                 mutex_unlock(&ob->lock);
1518                         goto retry;
1519                 }
1520
1521                 if (ob) {
1522                         open_bucket_copy_unused_ptrs(c, new_ob, ob);
1523                         mutex_unlock(&ob->lock);
1524                         bch2_open_bucket_put(c, ob);
1525                 }
1526
1527                 ob = new_ob;
1528         }
1529
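        /*
         * Make sure @ob has pointers to at least @nr_replicas_required
         * (ideally @nr_replicas) devices:
         */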
1530         ret = open_bucket_add_buckets(c, wp, ob, nr_replicas,
1531                                       nr_replicas_required,
1532                                       reserve, cl);
1533         if (ret) {
1534                 mutex_unlock(&ob->lock);
1535                 return ERR_PTR(ret);
1536         }
1537
1538         ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
1539
1540         BUG_ON(!ob->sectors_free);
1541         verify_not_stale(c, ob);
1542
1543         return ob;
1544 }
1545
1546 /*
1547  * Append pointers to the space we just allocated to @e, and mark @sectors space
1548  * as allocated out of @ob
1549  */
1550 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
1551                                     unsigned nr_replicas, struct open_bucket *ob,
1552                                     unsigned sectors)
1553 {
1554         struct bch_extent_ptr tmp;
1555         bool has_data = false;
1556         unsigned i;
1557
1558         /*
1559          * We're keeping any existing pointers @e has, and appending new pointers:
1560          * __bch2_write() will only write to the pointers we add here:
1561          */
1562
1563         BUG_ON(sectors > ob->sectors_free);
1564
1565         /* didn't use all the ptrs: */
1566         if (nr_replicas < ob->nr_ptrs)
1567                 has_data = true;
1568
1569         for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) {
1570                 EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
1571
1572                 tmp = ob->ptrs[i];
1573                 tmp.cached = bkey_extent_is_cached(&e->k);
1574                 tmp.offset += ob->ptr_offset[i];
1575                 extent_ptr_append(e, tmp);
1576
1577                 ob->ptr_offset[i] += sectors;
1578
1579                 this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors);
1580         }
1581 }
1582
1583 /*
1584  * Finish an allocation from @ob: mark any now-full buckets; pin @ob and leave
1585  * it on @wp if it still has free space, otherwise detach it from @wp.
1586  */
1587 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp,
1588                             struct open_bucket *ob)
1589 {
1590         bool has_data = false;
1591         unsigned i;
1592
1593         for (i = 0; i < ob->nr_ptrs; i++) {
1594                 if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i]))
1595                         ob->has_full_ptrs = true;
1596                 else
1597                         has_data = true;
1598         }
1599
1600         if (likely(has_data))
1601                 atomic_inc(&ob->pin);
1602         else
1603                 BUG_ON(xchg(&wp->b, NULL) != ob);
1604
1605         mutex_unlock(&ob->lock);
1606 }
1607
1608 /*
1609  * Allocates some space to write to, points @e at the newly allocated space,
1610  * and updates @e's size and offset (to point to the end of the newly
1611  * allocated space).
1612  *
1613  * May allocate fewer sectors than requested; @e's size indicates how many
1614  * sectors were actually allocated.
1615  *
1616  * Return codes (via ERR_PTR):
1617  * - -EAGAIN: closure was added to waitlist
1618  * - -ENOSPC: out of space and no closure provided
1619  *
1620  * @c  - filesystem.
1621  * @wp - write point to use for allocating sectors.
1622  * @e  - extent key to return the allocated space information.
1623  * @cl - closure to wait for a bucket
1624  */
1625 struct open_bucket *bch2_alloc_sectors(struct bch_fs *c,
1626                                        struct write_point *wp,
1627                                        struct bkey_i_extent *e,
1628                                        unsigned nr_replicas,
1629                                        unsigned nr_replicas_required,
1630                                        enum alloc_reserve reserve,
1631                                        struct closure *cl)
1632 {
1633         struct open_bucket *ob;
1634
1635         ob = bch2_alloc_sectors_start(c, wp, nr_replicas,
1636                                      nr_replicas_required,
1637                                      reserve, cl);
1638         if (IS_ERR_OR_NULL(ob))
1639                 return ob;
1640
1641         if (e->k.size > ob->sectors_free)
1642                 bch2_key_resize(&e->k, ob->sectors_free);
1643
1644         bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size);
1645
1646         bch2_alloc_sectors_done(c, wp, ob);
1647
1648         return ob;
1649 }
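
/*
 * Sketch of a typical caller (illustrative only, not lifted from a real
 * caller; the write point and reserve chosen here are arbitrary):
 *
 *	ob = bch2_alloc_sectors(c, &c->write_points[0], e, nr_replicas,
 *				nr_replicas, RESERVE_NONE, cl);
 *	if (IS_ERR_OR_NULL(ob))
 *		... handle -EAGAIN/-ENOSPC ...
 *
 *	then submit the write to the pointers appended to @e, and release @ob
 *	with bch2_open_bucket_put() once the write completes.
 */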
1650
1651 /* Startup/shutdown (ro/rw): */
1652
1653 void bch2_recalc_capacity(struct bch_fs *c)
1654 {
1655         struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
1656         struct bch_dev *ca;
1657         u64 total_capacity, capacity = 0, reserved_sectors = 0;
1658         unsigned long ra_pages = 0;
1659         unsigned i, j;
1660
1661         for_each_online_member(ca, c, i) {
1662                 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
1663
1664                 ra_pages += bdi->ra_pages;
1665         }
1666
1667         c->bdi.ra_pages = ra_pages;
1668
1669         /* Find fastest, slowest tiers with devices: */
1670
1671         for (tier = c->tiers;
1672              tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
1673                 if (!tier->devs.nr)
1674                         continue;
1675                 if (!fastest_tier)
1676                         fastest_tier = tier;
1677                 slowest_tier = tier;
1678         }
1679
1680         c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
1681
1682         if (!fastest_tier)
1683                 goto set_capacity;
1684
1685         c->promote_write_point.group = &fastest_tier->devs;
1686
1687         /*
1688          * Capacity of the filesystem is the capacity of all the devices in the
1689          * slowest (highest) tier - we don't include lower tier devices.
1690          */
1691         spin_lock(&slowest_tier->devs.lock);
1692         group_for_each_dev(ca, &slowest_tier->devs, i) {
1693                 size_t reserve = 0;
1694
1695                 /*
1696                  * We need to reserve buckets (from the number
1697                  * of currently available buckets) against
1698                  * foreground writes so that mainly copygc can
1699                  * make forward progress.
1700                  *
1701                  * We need enough to refill the various reserves
1702                  * from scratch - copygc will use its entire
1703          * reserve all at once, then run again when
1704                  * its reserve is refilled (from the formerly
1705                  * available buckets).
1706                  *
1707                  * This reserve is just used when considering if
1708                  * allocations for foreground writes must wait -
1709                  * not -ENOSPC calculations.
1710                  */
1711                 for (j = 0; j < RESERVE_NONE; j++)
1712                         reserve += ca->free[j].size;
1713
1714                 reserve += ca->free_inc.size;
1715
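                /* one bucket per foreground write point: */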
1716                 reserve += ARRAY_SIZE(c->write_points);
1717
1718                 if (ca->mi.tier)
1719                         reserve += 1;   /* tiering write point */
1720                 reserve += 1;           /* btree write point */
1721
1722                 reserved_sectors += reserve << ca->bucket_bits;
1723
1724                 capacity += (ca->mi.nbuckets -
1725                              ca->mi.first_bucket) <<
1726                         ca->bucket_bits;
1727         }
1728         spin_unlock(&slowest_tier->devs.lock);
1729 set_capacity:
1730         total_capacity = capacity;
1731
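        /*
         * Usable capacity is the smaller of (100 - gc_reserve_percent)% of
         * the raw capacity and the raw capacity less the reserves computed
         * above:
         */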
1732         capacity *= (100 - c->opts.gc_reserve_percent);
1733         capacity = div64_u64(capacity, 100);
1734
1735         BUG_ON(reserved_sectors > total_capacity);
1736
1737         capacity = min(capacity, total_capacity - reserved_sectors);
1738
1739         c->capacity = capacity;
1740
1741         if (c->capacity) {
1742                 bch2_io_timer_add(&c->io_clock[READ],
1743                                  &c->prio_clock[READ].rescale);
1744                 bch2_io_timer_add(&c->io_clock[WRITE],
1745                                  &c->prio_clock[WRITE].rescale);
1746         } else {
1747                 bch2_io_timer_del(&c->io_clock[READ],
1748                                  &c->prio_clock[READ].rescale);
1749                 bch2_io_timer_del(&c->io_clock[WRITE],
1750                                  &c->prio_clock[WRITE].rescale);
1751         }
1752
1753         /* Wake up in case someone was waiting for buckets */
1754         closure_wake_up(&c->freelist_wait);
1755 }
1756
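/*
 * If @wp's open bucket has a pointer to @ca, detach the open bucket from the
 * write point and drop the write point's reference to it:
 */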
1757 static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
1758                                   struct write_point *wp)
1759 {
1760         struct open_bucket *ob;
1761         struct bch_extent_ptr *ptr;
1762
1763         ob = lock_writepoint(c, wp);
1764         if (!ob)
1765                 return;
1766
1767         for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
1768                 if (ptr->dev == ca->dev_idx)
1769                         goto found;
1770
1771         mutex_unlock(&ob->lock);
1772         return;
1773 found:
1774         BUG_ON(xchg(&wp->b, NULL) != ob);
1775         mutex_unlock(&ob->lock);
1776
1777         /* Drop writepoint's ref: */
1778         bch2_open_bucket_put(c, ob);
1779 }
1780
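/*
 * Returns true if any pinned open bucket still has a pointer to @ca - i.e.
 * there may still be writes in flight to this device:
 */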
1781 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
1782 {
1783         struct bch_extent_ptr *ptr;
1784         struct open_bucket *ob;
1785
1786         for (ob = c->open_buckets;
1787              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1788              ob++)
1789                 if (atomic_read(&ob->pin)) {
1790                         mutex_lock(&ob->lock);
1791                         for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
1792                                 if (ptr->dev == ca->dev_idx) {
1793                                         mutex_unlock(&ob->lock);
1794                                         return true;
1795                                 }
1796                         mutex_unlock(&ob->lock);
1797                 }
1798
1799         return false;
1800 }
1801
1802 /* device goes ro: */
1803 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
1804 {
1805         struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
1806         struct closure cl;
1807         unsigned i;
1808
1809         BUG_ON(ca->alloc_thread);
1810
1811         closure_init_stack(&cl);
1812
1813         /* First, remove device from allocation groups: */
1814
1815         bch2_dev_group_remove(&c->journal.devs, ca);
1816         bch2_dev_group_remove(tier, ca);
1817         bch2_dev_group_remove(&c->all_devs, ca);
1818
1819         /*
1820          * Capacity is calculated based off of devices in allocation groups:
1821          */
1822         bch2_recalc_capacity(c);
1823
1824         /* Next, close write points that point to this device... */
1825         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1826                 bch2_stop_write_point(c, ca, &c->write_points[i]);
1827
1828         bch2_stop_write_point(c, ca, &ca->copygc_write_point);
1829         bch2_stop_write_point(c, ca, &c->promote_write_point);
1830         bch2_stop_write_point(c, ca, &ca->tiering_write_point);
1831         bch2_stop_write_point(c, ca, &c->migration_write_point);
1832         bch2_stop_write_point(c, ca, &c->btree_write_point);
1833
1834         mutex_lock(&c->btree_reserve_cache_lock);
1835         while (c->btree_reserve_cache_nr) {
1836                 struct btree_alloc *a =
1837                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1838
1839                 bch2_open_bucket_put(c, a->ob);
1840         }
1841         mutex_unlock(&c->btree_reserve_cache_lock);
1842
1843         /*
1844          * Wake up threads that were blocked on allocation, so they notice this
1845          * device can no longer be allocated from and the capacity has changed:
1846          */
1847         closure_wake_up(&c->freelist_wait);
1848
1849         /*
1850          * journal_res_get() can block waiting for free space in the journal -
1851          * it needs to notice there may not be devices to allocate from anymore:
1852          */
1853         wake_up(&c->journal.wait);
1854
1855         /* Now wait for any in flight writes: */
1856
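        /*
         * Add ourselves to the waitlist before checking, so that a wakeup
         * between the check and closure_sync() can't be missed:
         */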
1857         while (1) {
1858                 closure_wait(&c->open_buckets_wait, &cl);
1859
1860                 if (!bch2_dev_has_open_write_point(c, ca)) {
1861                         closure_wake_up(&c->open_buckets_wait);
1862                         break;
1863                 }
1864
1865                 closure_sync(&cl);
1866         }
1867 }
1868
1869 /* device goes rw: */
1870 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
1871 {
1872         struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
1873         struct bch_sb_field_journal *journal_buckets;
1874         bool has_journal;
1875
1876         bch2_dev_group_add(&c->all_devs, ca);
1877         bch2_dev_group_add(tier, ca);
1878
1879         mutex_lock(&c->sb_lock);
1880         journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb);
1881         has_journal = bch2_nr_journal_buckets(journal_buckets) >=
1882                 BCH_JOURNAL_BUCKETS_MIN;
1883         mutex_unlock(&c->sb_lock);
1884
1885         if (has_journal)
1886                 bch2_dev_group_add(&c->journal.devs, ca);
1887 }
1888
1889 /* stop allocator thread: */
1890 void bch2_dev_allocator_stop(struct bch_dev *ca)
1891 {
1892         struct task_struct *p = ca->alloc_thread;
1893
1894         ca->alloc_thread = NULL;
1895         smp_wmb();
1896
1897         /*
1898          * We need an rcu barrier between setting ca->alloc_thread = NULL and
1899          * the thread shutting down to avoid a race with bch2_usage_update() -
1900          * the allocator thread itself does a synchronize_rcu() on exit.
1901          *
1902          * XXX: it would be better to have the rcu barrier be asynchronous
1903          * instead of blocking us here
1904          */
1905         if (p)
1906                 kthread_stop(p);
1907 }
1908
1909 /* start allocator thread: */
1910 int bch2_dev_allocator_start(struct bch_dev *ca)
1911 {
1912         struct task_struct *p;
1913
1914         /*
1915          * allocator thread already started?
1916          */
1917         if (ca->alloc_thread)
1918                 return 0;
1919
1920         p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator");
1921         if (IS_ERR(p))
1922                 return PTR_ERR(p);
1923
1924         ca->alloc_thread = p;
1925         return 0;
1926 }
1927
1928 void bch2_fs_allocator_init(struct bch_fs *c)
1929 {
1930         unsigned i;
1931
1932         INIT_LIST_HEAD(&c->open_buckets_open);
1933         INIT_LIST_HEAD(&c->open_buckets_free);
1934         spin_lock_init(&c->open_buckets_lock);
1935         bch2_prio_timer_init(c, READ);
1936         bch2_prio_timer_init(c, WRITE);
1937
1938         /* open bucket 0 is a sentinel NULL: */
1939         mutex_init(&c->open_buckets[0].lock);
1940         INIT_LIST_HEAD(&c->open_buckets[0].list);
1941
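        /* the remaining open buckets all start out on the freelist: */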
1942         for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) {
1943                 mutex_init(&c->open_buckets[i].lock);
1944                 c->open_buckets_nr_free++;
1945                 list_add(&c->open_buckets[i].list, &c->open_buckets_free);
1946         }
1947
1948         spin_lock_init(&c->all_devs.lock);
1949
1950         for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
1951                 spin_lock_init(&c->tiers[i].devs.lock);
1952
1953         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1954                 c->write_points[i].throttle = true;
1955
1956         c->pd_controllers_update_seconds = 5;
1957         INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
1958
1959         spin_lock_init(&c->foreground_write_pd_lock);
1960         bch2_pd_controller_init(&c->foreground_write_pd);
1961         /*
1962          * We do not want the write rate to have an effect on the computed
1963          * rate:
1964          *
1965          * We do not call bch2_ratelimit_delay() at all if the write rate
1966          * exceeds 1GB/s. In this case, the PD controller will think we are
1967          * not "keeping up" and not change the rate.
1968          */
1969         c->foreground_write_pd.backpressure = 0;
1970         init_timer(&c->foreground_write_wakeup);
1971
1972         c->foreground_write_wakeup.data = (unsigned long) c;
1973         c->foreground_write_wakeup.function = bch2_wake_delayed_writes;
1974 }