1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2012 Google, Inc.
4  *
5  * Foreground allocator code: allocate buckets from freelist, and allocate in
6  * sector granularity from writepoints.
7  *
8  * bch2_bucket_alloc() allocates a single bucket from a specific device.
9  *
10  * bch2_bucket_alloc_set() allocates one or more buckets from different devices
11  * in a given filesystem.
12  */
13
14 #include "bcachefs.h"
15 #include "alloc_background.h"
16 #include "alloc_foreground.h"
17 #include "btree_iter.h"
18 #include "btree_update.h"
19 #include "btree_gc.h"
20 #include "buckets.h"
21 #include "buckets_waiting_for_journal.h"
22 #include "clock.h"
23 #include "debug.h"
24 #include "disk_groups.h"
25 #include "ec.h"
26 #include "error.h"
27 #include "io.h"
28 #include "journal.h"
29
30 #include <linux/math64.h>
31 #include <linux/rculist.h>
32 #include <linux/rcupdate.h>
33 #include <trace/events/bcachefs.h>
34
35 const char * const bch2_alloc_reserves[] = {
36 #define x(t) #t,
37         BCH_ALLOC_RESERVES()
38 #undef x
39         NULL
40 };
41
42 /*
43  * Open buckets represent buckets that are currently being allocated from.  They
44  * serve two purposes:
45  *
46  *  - They track buckets that have been partially allocated, allowing for
47  *    sub-bucket sized allocations - they're used by the sector allocator below
48  *
49  *  - They provide a reference to the buckets they own that mark and sweep GC
50  *    can find, until the new allocation has a pointer to it inserted into the
51  *    btree
52  *
53  * When allocating some space with the sector allocator, the allocation comes
54  * with a reference to an open bucket - the caller is required to put that
55  * reference _after_ doing the index update that makes its allocation reachable.
56  */
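/*
 * A minimal sketch of that ordering, assuming a caller that allocated a
 * bucket directly with bch2_bucket_alloc() below (editor's illustration, not
 * part of the upstream code):
 *
 *	ob = bch2_bucket_alloc(c, ca, RESERVE_none, false, cl);
 *	if (IS_ERR(ob))
 *		return PTR_ERR(ob);
 *
 *	// ... write to the bucket, then do the btree update that inserts a
 *	// key pointing at it; until that update, GC finds the bucket through
 *	// the open_bucket reference ...
 *
 *	bch2_open_bucket_put(c, ob);	// drop the reference only now
 */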
57
58 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
59 {
60         open_bucket_idx_t idx = ob - c->open_buckets;
61         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
62
63         ob->hash = *slot;
64         *slot = idx;
65 }
66
67 static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
68 {
69         open_bucket_idx_t idx = ob - c->open_buckets;
70         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
71
72         while (*slot != idx) {
73                 BUG_ON(!*slot);
74                 slot = &c->open_buckets[*slot].hash;
75         }
76
77         *slot = ob->hash;
78         ob->hash = 0;
79 }
80
81 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
82 {
83         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
84
85         if (ob->ec) {
86                 bch2_ec_bucket_written(c, ob);
87                 return;
88         }
89
90         percpu_down_read(&c->mark_lock);
91         spin_lock(&ob->lock);
92
93         ob->valid = false;
94         ob->data_type = 0;
95
96         spin_unlock(&ob->lock);
97         percpu_up_read(&c->mark_lock);
98
99         spin_lock(&c->freelist_lock);
100         bch2_open_bucket_hash_remove(c, ob);
101
102         ob->freelist = c->open_buckets_freelist;
103         c->open_buckets_freelist = ob - c->open_buckets;
104
105         c->open_buckets_nr_free++;
106         ca->nr_open_buckets--;
107         spin_unlock(&c->freelist_lock);
108
109         closure_wake_up(&c->open_buckets_wait);
110 }
111
112 void bch2_open_bucket_write_error(struct bch_fs *c,
113                                   struct open_buckets *obs,
114                                   unsigned dev)
115 {
116         struct open_bucket *ob;
117         unsigned i;
118
119         open_bucket_for_each(c, obs, ob, i)
120                 if (ob->dev == dev && ob->ec)
121                         bch2_ec_bucket_cancel(c, ob);
122 }
123
124 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
125 {
126         struct open_bucket *ob;
127
128         BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
129
130         ob = c->open_buckets + c->open_buckets_freelist;
131         c->open_buckets_freelist = ob->freelist;
132         atomic_set(&ob->pin, 1);
133         ob->data_type = 0;
134
135         c->open_buckets_nr_free--;
136         return ob;
137 }
138
139 static void open_bucket_free_unused(struct bch_fs *c,
140                                     struct write_point *wp,
141                                     struct open_bucket *ob)
142 {
143         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
144         bool may_realloc = wp->data_type == BCH_DATA_user;
145
146         BUG_ON(ca->open_buckets_partial_nr >
147                ARRAY_SIZE(ca->open_buckets_partial));
148
149         if (ca->open_buckets_partial_nr <
150             ARRAY_SIZE(ca->open_buckets_partial) &&
151             may_realloc) {
152                 spin_lock(&c->freelist_lock);
153                 ob->on_partial_list = true;
154                 ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
155                         ob - c->open_buckets;
156                 spin_unlock(&c->freelist_lock);
157
158                 closure_wake_up(&c->open_buckets_wait);
159                 closure_wake_up(&c->freelist_wait);
160         } else {
161                 bch2_open_bucket_put(c, ob);
162         }
163 }
164
165 /* _only_ for allocating the journal on a new device: */
166 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
167 {
168         while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
169                 u64 b = ca->new_fs_bucket_idx++;
170
171                 if (!is_superblock_bucket(ca, b) &&
172                     (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
173                         return b;
174         }
175
176         return -1;
177 }
178
179 static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
180 {
181         switch (reserve) {
182         case RESERVE_btree:
183         case RESERVE_btree_movinggc:
184                 return 0;
185         case RESERVE_movinggc:
186                 return OPEN_BUCKETS_COUNT / 4;
187         default:
188                 return OPEN_BUCKETS_COUNT / 2;
189         }
190 }
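/*
 * Editor's note on the thresholds above (an interpretation, not upstream
 * documentation): btree and btree_movinggc allocations may consume every
 * open bucket, movinggc fails once a quarter or fewer of OPEN_BUCKETS_COUNT
 * remain free, and everything else fails once half or fewer remain.  The
 * spare capacity this leaves means btree updates and copygc can still make
 * forward progress while normal foreground writes are blocked waiting on
 * c->open_buckets_wait.
 */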
191
192 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
193                                               u64 bucket,
194                                               enum alloc_reserve reserve,
195                                               struct bch_alloc_v4 *a,
196                                               u64 *skipped_open,
197                                               u64 *skipped_need_journal_commit,
198                                               u64 *skipped_nouse,
199                                               struct closure *cl)
200 {
201         struct open_bucket *ob;
202
203         if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
204                 (*skipped_nouse)++;
205                 return NULL;
206         }
207
208         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
209                 (*skipped_open)++;
210                 return NULL;
211         }
212
213         if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
214                         c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
215                 (*skipped_need_journal_commit)++;
216                 return NULL;
217         }
218
219         spin_lock(&c->freelist_lock);
220
221         if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
222                 if (cl)
223                         closure_wait(&c->open_buckets_wait, cl);
224
225                 if (!c->blocked_allocate_open_bucket)
226                         c->blocked_allocate_open_bucket = local_clock();
227
228                 spin_unlock(&c->freelist_lock);
229                 return ERR_PTR(-OPEN_BUCKETS_EMPTY);
230         }
231
232         /* Recheck under lock: */
233         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
234                 spin_unlock(&c->freelist_lock);
235                 (*skipped_open)++;
236                 return NULL;
237         }
238
239         ob = bch2_open_bucket_alloc(c);
240
241         spin_lock(&ob->lock);
242
243         ob->valid       = true;
244         ob->sectors_free = ca->mi.bucket_size;
245         ob->alloc_reserve = reserve;
246         ob->dev         = ca->dev_idx;
247         ob->gen         = a->gen;
248         ob->bucket      = bucket;
249         spin_unlock(&ob->lock);
250
251         ca->nr_open_buckets++;
252         bch2_open_bucket_hash_add(c, ob);
253
254         if (c->blocked_allocate_open_bucket) {
255                 bch2_time_stats_update(
256                         &c->times[BCH_TIME_blocked_allocate_open_bucket],
257                         c->blocked_allocate_open_bucket);
258                 c->blocked_allocate_open_bucket = 0;
259         }
260
261         if (c->blocked_allocate) {
262                 bch2_time_stats_update(
263                         &c->times[BCH_TIME_blocked_allocate],
264                         c->blocked_allocate);
265                 c->blocked_allocate = 0;
266         }
267
268         spin_unlock(&c->freelist_lock);
269
270         trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]);
271         return ob;
272 }
273
274 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
275                                             enum alloc_reserve reserve, u64 free_entry,
276                                             u64 *skipped_open,
277                                             u64 *skipped_need_journal_commit,
278                                             u64 *skipped_nouse,
279                                             struct closure *cl)
280 {
281         struct bch_fs *c = trans->c;
282         struct btree_iter iter;
283         struct bkey_s_c k;
284         struct open_bucket *ob;
285         struct bch_alloc_v4 a;
286         u64 b = free_entry & ~(~0ULL << 56);
287         unsigned genbits = free_entry >> 56;
288         struct printbuf buf = PRINTBUF;
289         int ret;
290
291         bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
292         k = bch2_btree_iter_peek_slot(&iter);
293         ret = bkey_err(k);
294         if (ret) {
295                 ob = ERR_PTR(ret);
296                 goto err;
297         }
298
299         bch2_alloc_to_v4(k, &a);
300
301         if (bch2_fs_inconsistent_on(a.data_type != BCH_DATA_free, c,
302                         "non free bucket in freespace btree (state %s)\n"
303                         "  %s\n"
304                         "  at %llu (genbits %u)",
305                         bch2_data_types[a.data_type],
306                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
307                         free_entry, genbits)) {
308                 ob = ERR_PTR(-EIO);
309                 goto err;
310         }
311
312         if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c,
313                         "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
314                         "  %s",
315                         genbits, alloc_freespace_genbits(a) >> 56,
316                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
317                 ob = ERR_PTR(-EIO);
318                 goto err;
319         }
320
321         if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c,
322                         "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)",
323                         b, ca->mi.first_bucket, ca->mi.nbuckets)) {
324                 ob = ERR_PTR(-EIO);
325                 goto err;
326         }
327
328         ob = __try_alloc_bucket(c, ca, b, reserve, &a,
329                                 skipped_open,
330                                 skipped_need_journal_commit,
331                                 skipped_nouse,
332                                 cl);
333 err:
334         bch2_trans_iter_exit(trans, &iter);
335         printbuf_exit(&buf);
336         return ob;
337 }
338
339 static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
340                                                     enum alloc_reserve reserve)
341 {
342         struct open_bucket *ob;
343         int i;
344
345         spin_lock(&c->freelist_lock);
346
347         for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
348                 ob = c->open_buckets + ca->open_buckets_partial[i];
349
350                 if (reserve <= ob->alloc_reserve) {
351                         array_remove_item(ca->open_buckets_partial,
352                                           ca->open_buckets_partial_nr,
353                                           i);
354                         ob->on_partial_list = false;
355                         ob->alloc_reserve = reserve;
356                         spin_unlock(&c->freelist_lock);
357                         return ob;
358                 }
359         }
360
361         spin_unlock(&c->freelist_lock);
362         return NULL;
363 }
364
365 /*
366  * This path is for before the freespace btree is initialized:
367  *
368  * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
369  * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
370  */
371 static noinline struct open_bucket *
372 bch2_bucket_alloc_trans_early(struct btree_trans *trans,
373                               struct bch_dev *ca,
374                               enum alloc_reserve reserve,
375                               u64 *cur_bucket,
376                               u64 *buckets_seen,
377                               u64 *skipped_open,
378                               u64 *skipped_need_journal_commit,
379                               u64 *skipped_nouse,
380                               struct closure *cl)
381 {
382         struct btree_iter iter;
383         struct bkey_s_c k;
384         struct open_bucket *ob = NULL;
385         int ret;
386
387         *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket);
388         *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx);
389
390         for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket),
391                            BTREE_ITER_SLOTS, k, ret) {
392                 struct bch_alloc_v4 a;
393
394                 if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
395                         break;
396
397                 if (ca->new_fs_bucket_idx &&
398                     is_superblock_bucket(ca, k.k->p.offset))
399                         continue;
400
401                 bch2_alloc_to_v4(k, &a);
402
403                 if (a.data_type != BCH_DATA_free)
404                         continue;
405
406                 (*buckets_seen)++;
407
408                 ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
409                                         skipped_open,
410                                         skipped_need_journal_commit,
411                                         skipped_nouse,
412                                         cl);
413                 if (ob)
414                         break;
415         }
416         bch2_trans_iter_exit(trans, &iter);
417
418         *cur_bucket = iter.pos.offset;
419
420         return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
421 }
422
423 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
424                                                    struct bch_dev *ca,
425                                                    enum alloc_reserve reserve,
426                                                    u64 *cur_bucket,
427                                                    u64 *buckets_seen,
428                                                    u64 *skipped_open,
429                                                    u64 *skipped_need_journal_commit,
430                                                    u64 *skipped_nouse,
431                                                    struct closure *cl)
432 {
433         struct btree_iter iter;
434         struct bkey_s_c k;
435         struct open_bucket *ob = NULL;
436         int ret;
437
438         if (unlikely(!ca->mi.freespace_initialized))
439                 return bch2_bucket_alloc_trans_early(trans, ca, reserve,
440                                                      cur_bucket,
441                                                      buckets_seen,
442                                                      skipped_open,
443                                                      skipped_need_journal_commit,
444                                                      skipped_nouse,
445                                                      cl);
446
447         BUG_ON(ca->new_fs_bucket_idx);
448
449         for_each_btree_key(trans, iter, BTREE_ID_freespace,
450                            POS(ca->dev_idx, *cur_bucket), 0, k, ret) {
451                 if (k.k->p.inode != ca->dev_idx)
452                         break;
453
454                 for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k));
455                      *cur_bucket != k.k->p.offset && !ob;
456                      (*cur_bucket)++) {
457                         if (btree_trans_too_many_iters(trans)) {
458                                 ob = ERR_PTR(-EINTR);
459                                 break;
460                         }
461
462                         (*buckets_seen)++;
463
464                         ob = try_alloc_bucket(trans, ca, reserve,
465                                               *cur_bucket,
466                                               skipped_open,
467                                               skipped_need_journal_commit,
468                                               skipped_nouse,
469                                               cl);
470                 }
471                 if (ob)
472                         break;
473         }
474         bch2_trans_iter_exit(trans, &iter);
475
476         return ob ?: ERR_PTR(ret);
477 }
478
479 /**
480  * bch2_bucket_alloc - allocate a single bucket from a specific device
481  *
482  * Returns a pointer to the open_bucket on success, or an ERR_PTR() on failure
483  */
484 struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
485                                       enum alloc_reserve reserve,
486                                       bool may_alloc_partial,
487                                       struct closure *cl)
488 {
489         struct open_bucket *ob = NULL;
490         struct bch_dev_usage usage;
491         u64 avail;
492         u64 cur_bucket = 0;
493         u64 buckets_seen = 0;
494         u64 skipped_open = 0;
495         u64 skipped_need_journal_commit = 0;
496         u64 skipped_nouse = 0;
497         bool waiting = false;
498         int ret;
499 again:
500         usage = bch2_dev_usage_read(ca);
501         avail = __dev_buckets_available(ca, usage, reserve);
502
503         if (usage.d[BCH_DATA_need_discard].buckets > avail)
504                 bch2_do_discards(c);
505
506         if (usage.d[BCH_DATA_need_gc_gens].buckets > avail)
507                 bch2_do_gc_gens(c);
508
509         if (should_invalidate_buckets(ca, usage))
510                 bch2_do_invalidates(c);
511
512         if (!avail) {
513                 if (cl && !waiting) {
514                         closure_wait(&c->freelist_wait, cl);
515                         waiting = true;
516                         goto again;
517                 }
518
519                 if (!c->blocked_allocate)
520                         c->blocked_allocate = local_clock();
521
522                 ob = ERR_PTR(-FREELIST_EMPTY);
523                 goto err;
524         }
525
526         if (waiting)
527                 closure_wake_up(&c->freelist_wait);
528
529         if (may_alloc_partial) {
530                 ob = try_alloc_partial_bucket(c, ca, reserve);
531                 if (ob)
532                         return ob;
533         }
534
535         ret = bch2_trans_do(c, NULL, NULL, 0,
536                         PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
537                                                         &cur_bucket,
538                                                         &buckets_seen,
539                                                         &skipped_open,
540                                                         &skipped_need_journal_commit,
541                                                         &skipped_nouse,
542                                                         cl)));
543
544         if (skipped_need_journal_commit * 2 > avail)
545                 bch2_journal_flush_async(&c->journal, NULL);
546 err:
547         if (!ob)
548                 ob = ERR_PTR(ret ?: -FREELIST_EMPTY);
549
550         if (IS_ERR(ob)) {
551                 trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail,
552                                         buckets_seen,
553                                         skipped_open,
554                                         skipped_need_journal_commit,
555                                         skipped_nouse,
556                                         cl == NULL, PTR_ERR(ob));
557                 atomic_long_inc(&c->bucket_alloc_fail);
558         }
559
560         return ob;
561 }
562
563 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
564                             unsigned l, unsigned r)
565 {
566         return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
567                 (stripe->next_alloc[l] < stripe->next_alloc[r]));
568 }
569
570 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
571
572 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
573                                           struct dev_stripe_state *stripe,
574                                           struct bch_devs_mask *devs)
575 {
576         struct dev_alloc_list ret = { .nr = 0 };
577         unsigned i;
578
579         for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
580                 ret.devs[ret.nr++] = i;
581
582         bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
583         return ret;
584 }
585
586 void bch2_dev_stripe_increment(struct bch_dev *ca,
587                                struct dev_stripe_state *stripe)
588 {
589         u64 *v = stripe->next_alloc + ca->dev_idx;
590         u64 free_space = dev_buckets_available(ca, RESERVE_none);
591         u64 free_space_inv = free_space
592                 ? div64_u64(1ULL << 48, free_space)
593                 : 1ULL << 48;
594         u64 scale = *v / 4;
595
596         if (*v + free_space_inv >= *v)
597                 *v += free_space_inv;
598         else
599                 *v = U64_MAX;
600
601         for (v = stripe->next_alloc;
602              v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
603                 *v = *v < scale ? 0 : *v - scale;
604 }
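/*
 * Worked example for the weighting above (editor's sketch): next_alloc is
 * bumped by 2^48 / free_space, so a device with 1,000 free buckets gains
 * ~2.8e11 per allocation while one with 10,000 free buckets gains ~2.8e10.
 * bch2_dev_alloc_list() sorts devices by ascending next_alloc, so the
 * emptier device's counter grows ten times faster and it drifts toward the
 * back of the allocation order, biasing new allocations toward devices with
 * more free space.  The final loop then subtracts a quarter of the updated
 * counter from every device (clamping at zero), which keeps the values
 * bounded while largely preserving their relative order.
 */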
605
606 #define BUCKET_MAY_ALLOC_PARTIAL        (1 << 0)
607 #define BUCKET_ALLOC_USE_DURABILITY     (1 << 1)
608
609 static void add_new_bucket(struct bch_fs *c,
610                            struct open_buckets *ptrs,
611                            struct bch_devs_mask *devs_may_alloc,
612                            unsigned *nr_effective,
613                            bool *have_cache,
614                            unsigned flags,
615                            struct open_bucket *ob)
616 {
617         unsigned durability =
618                 bch_dev_bkey_exists(c, ob->dev)->mi.durability;
619
620         __clear_bit(ob->dev, devs_may_alloc->d);
621         *nr_effective   += (flags & BUCKET_ALLOC_USE_DURABILITY)
622                 ? durability : 1;
623         *have_cache     |= !durability;
624
625         ob_push(c, ptrs, ob);
626 }
627
628 int bch2_bucket_alloc_set(struct bch_fs *c,
629                       struct open_buckets *ptrs,
630                       struct dev_stripe_state *stripe,
631                       struct bch_devs_mask *devs_may_alloc,
632                       unsigned nr_replicas,
633                       unsigned *nr_effective,
634                       bool *have_cache,
635                       enum alloc_reserve reserve,
636                       unsigned flags,
637                       struct closure *cl)
638 {
639         struct dev_alloc_list devs_sorted =
640                 bch2_dev_alloc_list(c, stripe, devs_may_alloc);
641         unsigned dev;
642         struct bch_dev *ca;
643         int ret = -INSUFFICIENT_DEVICES;
644         unsigned i;
645
646         BUG_ON(*nr_effective >= nr_replicas);
647
648         for (i = 0; i < devs_sorted.nr; i++) {
649                 struct open_bucket *ob;
650
651                 dev = devs_sorted.devs[i];
652
653                 rcu_read_lock();
654                 ca = rcu_dereference(c->devs[dev]);
655                 if (ca)
656                         percpu_ref_get(&ca->ref);
657                 rcu_read_unlock();
658
659                 if (!ca)
660                         continue;
661
662                 if (!ca->mi.durability && *have_cache) {
663                         percpu_ref_put(&ca->ref);
664                         continue;
665                 }
666
667                 ob = bch2_bucket_alloc(c, ca, reserve,
668                                 flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
669                 if (!IS_ERR(ob))
670                         bch2_dev_stripe_increment(ca, stripe);
671                 percpu_ref_put(&ca->ref);
672
673                 if (IS_ERR(ob)) {
674                         ret = PTR_ERR(ob);
675
676                         if (cl)
677                                 break;
678                         continue;
679                 }
680
681                 add_new_bucket(c, ptrs, devs_may_alloc,
682                                nr_effective, have_cache, flags, ob);
683
684                 if (*nr_effective >= nr_replicas) {
685                         ret = 0;
686                         break;
687                 }
688         }
689
690         return ret;
691 }
692
693 /* Allocate from stripes: */
694
695 /*
696  * if we can't allocate a new stripe because there are already too many
697  * partially filled stripes, force allocating from an existing stripe even when
698  * it's to a device we don't want:
699  */
700
701 static int bucket_alloc_from_stripe(struct bch_fs *c,
702                          struct open_buckets *ptrs,
703                          struct write_point *wp,
704                          struct bch_devs_mask *devs_may_alloc,
705                          u16 target,
706                          unsigned erasure_code,
707                          unsigned nr_replicas,
708                          unsigned *nr_effective,
709                          bool *have_cache,
710                          unsigned flags,
711                          struct closure *cl)
712 {
713         struct dev_alloc_list devs_sorted;
714         struct ec_stripe_head *h;
715         struct open_bucket *ob;
716         struct bch_dev *ca;
717         unsigned i, ec_idx;
718
719         if (!erasure_code)
720                 return 0;
721
722         if (nr_replicas < 2)
723                 return 0;
724
725         if (ec_open_bucket(c, ptrs))
726                 return 0;
727
728         h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1,
729                                     wp == &c->copygc_write_point,
730                                     cl);
731         if (IS_ERR(h))
732                 return -PTR_ERR(h);
733         if (!h)
734                 return 0;
735
736         devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
737
738         for (i = 0; i < devs_sorted.nr; i++)
739                 for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
740                         if (!h->s->blocks[ec_idx])
741                                 continue;
742
743                         ob = c->open_buckets + h->s->blocks[ec_idx];
744                         if (ob->dev == devs_sorted.devs[i] &&
745                             !test_and_set_bit(ec_idx, h->s->blocks_allocated))
746                                 goto got_bucket;
747                 }
748         goto out_put_head;
749 got_bucket:
750         ca = bch_dev_bkey_exists(c, ob->dev);
751
752         ob->ec_idx      = ec_idx;
753         ob->ec          = h->s;
754
755         add_new_bucket(c, ptrs, devs_may_alloc,
756                        nr_effective, have_cache, flags, ob);
757         atomic_inc(&h->s->pin);
758 out_put_head:
759         bch2_ec_stripe_head_put(c, h);
760         return 0;
761 }
762
763 /* Sector allocator */
764
765 static void get_buckets_from_writepoint(struct bch_fs *c,
766                                         struct open_buckets *ptrs,
767                                         struct write_point *wp,
768                                         struct bch_devs_mask *devs_may_alloc,
769                                         unsigned nr_replicas,
770                                         unsigned *nr_effective,
771                                         bool *have_cache,
772                                         unsigned flags,
773                                         bool need_ec)
774 {
775         struct open_buckets ptrs_skip = { .nr = 0 };
776         struct open_bucket *ob;
777         unsigned i;
778
779         open_bucket_for_each(c, &wp->ptrs, ob, i) {
780                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
781
782                 if (*nr_effective < nr_replicas &&
783                     test_bit(ob->dev, devs_may_alloc->d) &&
784                     (ca->mi.durability ||
785                      (wp->data_type == BCH_DATA_user && !*have_cache)) &&
786                     (ob->ec || !need_ec)) {
787                         add_new_bucket(c, ptrs, devs_may_alloc,
788                                        nr_effective, have_cache,
789                                        flags, ob);
790                 } else {
791                         ob_push(c, &ptrs_skip, ob);
792                 }
793         }
794         wp->ptrs = ptrs_skip;
795 }
796
797 static int open_bucket_add_buckets(struct bch_fs *c,
798                         struct open_buckets *ptrs,
799                         struct write_point *wp,
800                         struct bch_devs_list *devs_have,
801                         u16 target,
802                         unsigned erasure_code,
803                         unsigned nr_replicas,
804                         unsigned *nr_effective,
805                         bool *have_cache,
806                         enum alloc_reserve reserve,
807                         unsigned flags,
808                         struct closure *_cl)
809 {
810         struct bch_devs_mask devs;
811         struct open_bucket *ob;
812         struct closure *cl = NULL;
813         int ret;
814         unsigned i;
815
816         rcu_read_lock();
817         devs = target_rw_devs(c, wp->data_type, target);
818         rcu_read_unlock();
819
820         /* Don't allocate from devices we already have pointers to: */
821         for (i = 0; i < devs_have->nr; i++)
822                 __clear_bit(devs_have->devs[i], devs.d);
823
824         open_bucket_for_each(c, ptrs, ob, i)
825                 __clear_bit(ob->dev, devs.d);
826
827         if (erasure_code) {
828                 if (!ec_open_bucket(c, ptrs)) {
829                         get_buckets_from_writepoint(c, ptrs, wp, &devs,
830                                                     nr_replicas, nr_effective,
831                                                     have_cache, flags, true);
832                         if (*nr_effective >= nr_replicas)
833                                 return 0;
834                 }
835
836                 if (!ec_open_bucket(c, ptrs)) {
837                         ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs,
838                                                  target, erasure_code,
839                                                  nr_replicas, nr_effective,
840                                                  have_cache, flags, _cl);
841                         if (ret == -FREELIST_EMPTY ||
842                             ret == -OPEN_BUCKETS_EMPTY)
843                                 return ret;
844                         if (*nr_effective >= nr_replicas)
845                                 return 0;
846                 }
847         }
848
849         get_buckets_from_writepoint(c, ptrs, wp, &devs,
850                                     nr_replicas, nr_effective,
851                                     have_cache, flags, false);
852         if (*nr_effective >= nr_replicas)
853                 return 0;
854
855 retry_blocking:
856         /*
857          * Try nonblocking first, so that if one device is full we'll try from
858          * other devices:
859          */
860         ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
861                                 nr_replicas, nr_effective, have_cache,
862                                 reserve, flags, cl);
863         if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
864                 cl = _cl;
865                 goto retry_blocking;
866         }
867
868         return ret;
869 }
870
871 void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
872                                 struct open_buckets *obs)
873 {
874         struct open_buckets ptrs = { .nr = 0 };
875         struct open_bucket *ob, *ob2;
876         unsigned i, j;
877
878         open_bucket_for_each(c, obs, ob, i) {
879                 bool drop = !ca || ob->dev == ca->dev_idx;
880
881                 if (!drop && ob->ec) {
882                         mutex_lock(&ob->ec->lock);
883                         for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
884                                 if (!ob->ec->blocks[j])
885                                         continue;
886
887                                 ob2 = c->open_buckets + ob->ec->blocks[j];
888                                 drop |= ob2->dev == ca->dev_idx;
889                         }
890                         mutex_unlock(&ob->ec->lock);
891                 }
892
893                 if (drop)
894                         bch2_open_bucket_put(c, ob);
895                 else
896                         ob_push(c, &ptrs, ob);
897         }
898
899         *obs = ptrs;
900 }
901
902 void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
903                           struct write_point *wp)
904 {
905         mutex_lock(&wp->lock);
906         bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
907         mutex_unlock(&wp->lock);
908 }
909
910 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
911                                                  unsigned long write_point)
912 {
913         unsigned hash =
914                 hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
915
916         return &c->write_points_hash[hash];
917 }
918
919 static struct write_point *__writepoint_find(struct hlist_head *head,
920                                              unsigned long write_point)
921 {
922         struct write_point *wp;
923
924         rcu_read_lock();
925         hlist_for_each_entry_rcu(wp, head, node)
926                 if (wp->write_point == write_point)
927                         goto out;
928         wp = NULL;
929 out:
930         rcu_read_unlock();
931         return wp;
932 }
933
934 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
935 {
936         u64 stranded    = c->write_points_nr * c->bucket_size_max;
937         u64 free        = bch2_fs_usage_read_short(c).free;
938
939         return stranded * factor > free;
940 }
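/*
 * Editor's note (interpretation, assuming bucket_size_max and the free
 * counter are both in sectors): each write point can strand up to roughly
 * one maximum-size bucket of partially written space, which is what the
 * "stranded" estimate above captures.  try_increase_writepoints() passes a
 * factor of 32 and try_decrease_writepoints() a factor of 8, so we stop
 * adding write points once stranded space could exceed 1/32 of free space
 * but only start tearing them down once it exceeds 1/8 - the gap between
 * the two thresholds provides hysteresis.
 */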
941
942 static bool try_increase_writepoints(struct bch_fs *c)
943 {
944         struct write_point *wp;
945
946         if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
947             too_many_writepoints(c, 32))
948                 return false;
949
950         wp = c->write_points + c->write_points_nr++;
951         hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
952         return true;
953 }
954
955 static bool try_decrease_writepoints(struct bch_fs *c,
956                                      unsigned old_nr)
957 {
958         struct write_point *wp;
959
960         mutex_lock(&c->write_points_hash_lock);
961         if (c->write_points_nr < old_nr) {
962                 mutex_unlock(&c->write_points_hash_lock);
963                 return true;
964         }
965
966         if (c->write_points_nr == 1 ||
967             !too_many_writepoints(c, 8)) {
968                 mutex_unlock(&c->write_points_hash_lock);
969                 return false;
970         }
971
972         wp = c->write_points + --c->write_points_nr;
973
974         hlist_del_rcu(&wp->node);
975         mutex_unlock(&c->write_points_hash_lock);
976
977         bch2_writepoint_stop(c, NULL, wp);
978         return true;
979 }
980
981 static struct write_point *writepoint_find(struct bch_fs *c,
982                                            unsigned long write_point)
983 {
984         struct write_point *wp, *oldest;
985         struct hlist_head *head;
986
987         if (!(write_point & 1UL)) {
988                 wp = (struct write_point *) write_point;
989                 mutex_lock(&wp->lock);
990                 return wp;
991         }
992
993         head = writepoint_hash(c, write_point);
994 restart_find:
995         wp = __writepoint_find(head, write_point);
996         if (wp) {
997 lock_wp:
998                 mutex_lock(&wp->lock);
999                 if (wp->write_point == write_point)
1000                         goto out;
1001                 mutex_unlock(&wp->lock);
1002                 goto restart_find;
1003         }
1004 restart_find_oldest:
1005         oldest = NULL;
1006         for (wp = c->write_points;
1007              wp < c->write_points + c->write_points_nr; wp++)
1008                 if (!oldest || time_before64(wp->last_used, oldest->last_used))
1009                         oldest = wp;
1010
1011         mutex_lock(&oldest->lock);
1012         mutex_lock(&c->write_points_hash_lock);
1013         if (oldest >= c->write_points + c->write_points_nr ||
1014             try_increase_writepoints(c)) {
1015                 mutex_unlock(&c->write_points_hash_lock);
1016                 mutex_unlock(&oldest->lock);
1017                 goto restart_find_oldest;
1018         }
1019
1020         wp = __writepoint_find(head, write_point);
1021         if (wp && wp != oldest) {
1022                 mutex_unlock(&c->write_points_hash_lock);
1023                 mutex_unlock(&oldest->lock);
1024                 goto lock_wp;
1025         }
1026
1027         wp = oldest;
1028         hlist_del_rcu(&wp->node);
1029         wp->write_point = write_point;
1030         hlist_add_head_rcu(&wp->node, head);
1031         mutex_unlock(&c->write_points_hash_lock);
1032 out:
1033         wp->last_used = sched_clock();
1034         return wp;
1035 }
1036
1037 /*
1038  * Get us an open_bucket we can allocate from, return with it locked:
1039  */
1040 struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
1041                                 unsigned target,
1042                                 unsigned erasure_code,
1043                                 struct write_point_specifier write_point,
1044                                 struct bch_devs_list *devs_have,
1045                                 unsigned nr_replicas,
1046                                 unsigned nr_replicas_required,
1047                                 enum alloc_reserve reserve,
1048                                 unsigned flags,
1049                                 struct closure *cl)
1050 {
1051         struct write_point *wp;
1052         struct open_bucket *ob;
1053         struct open_buckets ptrs;
1054         unsigned nr_effective, write_points_nr;
1055         unsigned ob_flags = 0;
1056         bool have_cache;
1057         int ret;
1058         int i;
1059
1060         if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
1061                 ob_flags |= BUCKET_ALLOC_USE_DURABILITY;
1062
1063         BUG_ON(!nr_replicas || !nr_replicas_required);
1064 retry:
1065         ptrs.nr         = 0;
1066         nr_effective    = 0;
1067         write_points_nr = c->write_points_nr;
1068         have_cache      = false;
1069
1070         wp = writepoint_find(c, write_point.v);
1071
1072         if (wp->data_type == BCH_DATA_user)
1073                 ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
1074
1075         /* metadata may not allocate on cache devices: */
1076         if (wp->data_type != BCH_DATA_user)
1077                 have_cache = true;
1078
1079         if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1080                 ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
1081                                               target, erasure_code,
1082                                               nr_replicas, &nr_effective,
1083                                               &have_cache, reserve,
1084                                               ob_flags, cl);
1085         } else {
1086                 ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
1087                                               target, erasure_code,
1088                                               nr_replicas, &nr_effective,
1089                                               &have_cache, reserve,
1090                                               ob_flags, NULL);
1091                 if (!ret)
1092                         goto alloc_done;
1093
1094                 ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
1095                                               0, erasure_code,
1096                                               nr_replicas, &nr_effective,
1097                                               &have_cache, reserve,
1098                                               ob_flags, cl);
1099         }
1100 alloc_done:
1101         BUG_ON(!ret && nr_effective < nr_replicas);
1102
1103         if (erasure_code && !ec_open_bucket(c, &ptrs))
1104                 pr_debug("failed to get ec bucket: ret %u", ret);
1105
1106         if (ret == -INSUFFICIENT_DEVICES &&
1107             nr_effective >= nr_replicas_required)
1108                 ret = 0;
1109
1110         if (ret)
1111                 goto err;
1112
1113         /* Free buckets we didn't use: */
1114         open_bucket_for_each(c, &wp->ptrs, ob, i)
1115                 open_bucket_free_unused(c, wp, ob);
1116
1117         wp->ptrs = ptrs;
1118
1119         wp->sectors_free = UINT_MAX;
1120
1121         open_bucket_for_each(c, &wp->ptrs, ob, i)
1122                 wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1123
1124         BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1125
1126         return wp;
1127 err:
1128         open_bucket_for_each(c, &wp->ptrs, ob, i)
1129                 if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1130                         ob_push(c, &ptrs, ob);
1131                 else
1132                         open_bucket_free_unused(c, wp, ob);
1133         wp->ptrs = ptrs;
1134
1135         mutex_unlock(&wp->lock);
1136
1137         if (ret == -FREELIST_EMPTY &&
1138             try_decrease_writepoints(c, write_points_nr))
1139                 goto retry;
1140
1141         switch (ret) {
1142         case -OPEN_BUCKETS_EMPTY:
1143         case -FREELIST_EMPTY:
1144                 return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
1145         case -INSUFFICIENT_DEVICES:
1146                 return ERR_PTR(-EROFS);
1147         default:
1148                 return ERR_PTR(ret);
1149         }
1150 }
1151
1152 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1153 {
1154         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1155
1156         return (struct bch_extent_ptr) {
1157                 .type   = 1 << BCH_EXTENT_ENTRY_ptr,
1158                 .gen    = ob->gen,
1159                 .dev    = ob->dev,
1160                 .offset = bucket_to_sector(ca, ob->bucket) +
1161                         ca->mi.bucket_size -
1162                         ob->sectors_free,
1163         };
1164 }
1165
1166 /*
1167  * Append pointers to the space we just allocated to @k, and mark @sectors space
1168  * as allocated out of @ob
1169  */
1170 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1171                                     struct bkey_i *k, unsigned sectors,
1172                                     bool cached)
1173
1174 {
1175         struct open_bucket *ob;
1176         unsigned i;
1177
1178         BUG_ON(sectors > wp->sectors_free);
1179         wp->sectors_free -= sectors;
1180
1181         open_bucket_for_each(c, &wp->ptrs, ob, i) {
1182                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1183                 struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
1184
1185                 ptr.cached = cached ||
1186                         (!ca->mi.durability &&
1187                          wp->data_type == BCH_DATA_user);
1188
1189                 bch2_bkey_append_ptr(k, ptr);
1190
1191                 BUG_ON(sectors > ob->sectors_free);
1192                 ob->sectors_free -= sectors;
1193         }
1194 }
1195
1196 /*
1197  * Finished writing to the write point: put the open buckets that are now
1198  * full, keep the rest for the next allocation, and unlock the write point
1199  */
1200 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1201 {
1202         struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
1203         struct open_bucket *ob;
1204         unsigned i;
1205
1206         open_bucket_for_each(c, &wp->ptrs, ob, i)
1207                 ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
1208         wp->ptrs = keep;
1209
1210         mutex_unlock(&wp->lock);
1211
1212         bch2_open_buckets_put(c, &ptrs);
1213 }
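/*
 * Putting the sector allocator pieces together - an editor's sketch of how a
 * write path might drive the three functions above (illustrative only, not
 * upstream code; the local variables stand in for the caller's values):
 *
 *	wp = bch2_alloc_sectors_start(c, target, 0, write_point, devs_have,
 *				      nr_replicas, nr_replicas_required,
 *				      RESERVE_none, flags, cl);
 *	if (IS_ERR(wp))
 *		return PTR_ERR(wp);
 *
 *	sectors = min(sectors, wp->sectors_free);
 *	bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);
 *	bch2_alloc_sectors_done(c, wp);		// unlocks the write point
 *
 * Per the comment at the top of this file, the open_bucket references backing
 * the allocation must only be put after the btree update that makes @k
 * reachable.
 */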
1214
1215 static inline void writepoint_init(struct write_point *wp,
1216                                    enum bch_data_type type)
1217 {
1218         mutex_init(&wp->lock);
1219         wp->data_type = type;
1220 }
1221
1222 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1223 {
1224         struct open_bucket *ob;
1225         struct write_point *wp;
1226
1227         mutex_init(&c->write_points_hash_lock);
1228         c->write_points_nr = ARRAY_SIZE(c->write_points);
1229
1230         /* open bucket 0 is a sentinel NULL: */
1231         spin_lock_init(&c->open_buckets[0].lock);
1232
1233         for (ob = c->open_buckets + 1;
1234              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1235                 spin_lock_init(&ob->lock);
1236                 c->open_buckets_nr_free++;
1237
1238                 ob->freelist = c->open_buckets_freelist;
1239                 c->open_buckets_freelist = ob - c->open_buckets;
1240         }
1241
1242         writepoint_init(&c->btree_write_point,          BCH_DATA_btree);
1243         writepoint_init(&c->rebalance_write_point,      BCH_DATA_user);
1244         writepoint_init(&c->copygc_write_point,         BCH_DATA_user);
1245
1246         for (wp = c->write_points;
1247              wp < c->write_points + c->write_points_nr; wp++) {
1248                 writepoint_init(wp, BCH_DATA_user);
1249
1250                 wp->last_used   = sched_clock();
1251                 wp->write_point = (unsigned long) wp;
1252                 hlist_add_head_rcu(&wp->node,
1253                                    writepoint_hash(c, wp->write_point));
1254         }
1255 }
1256
1257 void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1258 {
1259         struct open_bucket *ob;
1260
1261         for (ob = c->open_buckets;
1262              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1263              ob++) {
1264                 spin_lock(&ob->lock);
1265                 if (ob->valid && !ob->on_partial_list) {
1266                         pr_buf(out, "%zu ref %u type %s %u:%llu:%u\n",
1267                                ob - c->open_buckets,
1268                                atomic_read(&ob->pin),
1269                                bch2_data_types[ob->data_type],
1270                                ob->dev, ob->bucket, ob->gen);
1271                 }
1272                 spin_unlock(&ob->lock);
1273         }
1274 }