1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2012 Google, Inc.
4  *
5  * Foreground allocator code: allocate buckets from freelist, and allocate at
6  * sector granularity from writepoints.
7  *
8  * bch2_bucket_alloc() allocates a single bucket from a specific device.
9  *
10  * bch2_bucket_alloc_set_trans() allocates one or more buckets from different devices
11  * in a given filesystem.
12  */
13
14 #include "bcachefs.h"
15 #include "alloc_background.h"
16 #include "alloc_foreground.h"
17 #include "backpointers.h"
18 #include "btree_iter.h"
19 #include "btree_update.h"
20 #include "btree_gc.h"
21 #include "buckets.h"
22 #include "buckets_waiting_for_journal.h"
23 #include "clock.h"
24 #include "debug.h"
25 #include "disk_groups.h"
26 #include "ec.h"
27 #include "error.h"
28 #include "io.h"
29 #include "journal.h"
30 #include "movinggc.h"
31 #include "nocow_locking.h"
32 #include "trace.h"
33
34 #include <linux/math64.h>
35 #include <linux/rculist.h>
36 #include <linux/rcupdate.h>
37
38 const char * const bch2_alloc_reserves[] = {
39 #define x(t) #t,
40         BCH_ALLOC_RESERVES()
41 #undef x
42         NULL
43 };
44
45 /*
46  * Open buckets represent a bucket that's currently being allocated from.  They
47  * serve two purposes:
48  *
49  *  - They track buckets that have been partially allocated, allowing for
50  *    sub-bucket sized allocations - they're used by the sector allocator below
51  *
52  *  - They provide a reference to the buckets they own that mark and sweep GC
53  *    can find, until the new allocation has a pointer to it inserted into the
54  *    btree
55  *
56  * When allocating some space with the sector allocator, the allocation comes
57  * with a reference to an open bucket - the caller is required to put that
58  * reference _after_ doing the index update that makes its allocation reachable.
59  */
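
/*
 * A minimal sketch of that discipline using the single bucket interface
 * (c, ca and cl are assumed to be provided by the caller):
 *
 *        struct open_bucket *ob = bch2_bucket_alloc(c, ca, RESERVE_none, cl);
 *
 *        if (IS_ERR(ob))
 *                return PTR_ERR(ob);
 *
 *        // write to the bucket, then do the index update that makes the new
 *        // allocation reachable...
 *
 *        bch2_open_bucket_put(c, ob);
 */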
60
61 void bch2_reset_alloc_cursors(struct bch_fs *c)
62 {
63         struct bch_dev *ca;
64         unsigned i;
65
66         rcu_read_lock();
67         for_each_member_device_rcu(ca, c, i, NULL)
68                 ca->alloc_cursor = 0;
69         rcu_read_unlock();
70 }
71
72 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
73 {
74         open_bucket_idx_t idx = ob - c->open_buckets;
75         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
76
77         ob->hash = *slot;
78         *slot = idx;
79 }
80
81 static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
82 {
83         open_bucket_idx_t idx = ob - c->open_buckets;
84         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
85
86         while (*slot != idx) {
87                 BUG_ON(!*slot);
88                 slot = &c->open_buckets[*slot].hash;
89         }
90
91         *slot = ob->hash;
92         ob->hash = 0;
93 }
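
/*
 * Lookups walk the same chain; a minimal sketch of how a bucket is found
 * (the real helper, bch2_bucket_is_open(), lives in the header - open bucket
 * index 0 is the sentinel that terminates the chain):
 *
 *        open_bucket_idx_t i = *open_bucket_hashslot(c, dev, bucket);
 *
 *        while (i) {
 *                struct open_bucket *ob = c->open_buckets + i;
 *
 *                if (ob->dev == dev && ob->bucket == bucket)
 *                        return true;
 *                i = ob->hash;
 *        }
 *        return false;
 */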
94
95 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
96 {
97         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
98
99         if (ob->ec) {
100                 ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
101                 return;
102         }
103
104         percpu_down_read(&c->mark_lock);
105         spin_lock(&ob->lock);
106
107         ob->valid = false;
108         ob->data_type = 0;
109
110         spin_unlock(&ob->lock);
111         percpu_up_read(&c->mark_lock);
112
113         spin_lock(&c->freelist_lock);
114         bch2_open_bucket_hash_remove(c, ob);
115
116         ob->freelist = c->open_buckets_freelist;
117         c->open_buckets_freelist = ob - c->open_buckets;
118
119         c->open_buckets_nr_free++;
120         ca->nr_open_buckets--;
121         spin_unlock(&c->freelist_lock);
122
123         closure_wake_up(&c->open_buckets_wait);
124 }
125
126 void bch2_open_bucket_write_error(struct bch_fs *c,
127                                   struct open_buckets *obs,
128                                   unsigned dev)
129 {
130         struct open_bucket *ob;
131         unsigned i;
132
133         open_bucket_for_each(c, obs, ob, i)
134                 if (ob->dev == dev && ob->ec)
135                         bch2_ec_bucket_cancel(c, ob);
136 }
137
138 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
139 {
140         struct open_bucket *ob;
141
142         BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
143
144         ob = c->open_buckets + c->open_buckets_freelist;
145         c->open_buckets_freelist = ob->freelist;
146         atomic_set(&ob->pin, 1);
147         ob->data_type = 0;
148
149         c->open_buckets_nr_free--;
150         return ob;
151 }
152
153 static void open_bucket_free_unused(struct bch_fs *c,
154                                     struct write_point *wp,
155                                     struct open_bucket *ob)
156 {
157         BUG_ON(c->open_buckets_partial_nr >=
158                ARRAY_SIZE(c->open_buckets_partial));
159
160         spin_lock(&c->freelist_lock);
161         ob->on_partial_list = true;
162         c->open_buckets_partial[c->open_buckets_partial_nr++] =
163                 ob - c->open_buckets;
164         spin_unlock(&c->freelist_lock);
165
166         closure_wake_up(&c->open_buckets_wait);
167         closure_wake_up(&c->freelist_wait);
168 }
169
170 /* _only_ for allocating the journal on a new device: */
171 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
172 {
173         while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
174                 u64 b = ca->new_fs_bucket_idx++;
175
176                 if (!is_superblock_bucket(ca, b) &&
177                     (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
178                         return b;
179         }
180
181         return -1;
182 }
183
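/*
 * Number of open buckets that must remain free for this reserve to allocate
 * one: the btree reserves may use every open bucket, movinggc must leave a
 * quarter of them free, and everything else must leave half free.
 */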
184 static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
185 {
186         switch (reserve) {
187         case RESERVE_btree:
188         case RESERVE_btree_movinggc:
189                 return 0;
190         case RESERVE_movinggc:
191                 return OPEN_BUCKETS_COUNT / 4;
192         default:
193                 return OPEN_BUCKETS_COUNT / 2;
194         }
195 }
196
197 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
198                                               u64 bucket,
199                                               enum alloc_reserve reserve,
200                                               const struct bch_alloc_v4 *a,
201                                               struct bucket_alloc_state *s,
202                                               struct closure *cl)
203 {
204         struct open_bucket *ob;
205
206         if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
207                 s->skipped_nouse++;
208                 return NULL;
209         }
210
211         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
212                 s->skipped_open++;
213                 return NULL;
214         }
215
216         if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
217                         c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
218                 s->skipped_need_journal_commit++;
219                 return NULL;
220         }
221
222         if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
223                 s->skipped_nocow++;
224                 return NULL;
225         }
226
227         spin_lock(&c->freelist_lock);
228
229         if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
230                 if (cl)
231                         closure_wait(&c->open_buckets_wait, cl);
232
233                 if (!c->blocked_allocate_open_bucket)
234                         c->blocked_allocate_open_bucket = local_clock();
235
236                 spin_unlock(&c->freelist_lock);
237                 return ERR_PTR(-BCH_ERR_open_buckets_empty);
238         }
239
240         /* Recheck under lock: */
241         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
242                 spin_unlock(&c->freelist_lock);
243                 s->skipped_open++;
244                 return NULL;
245         }
246
247         ob = bch2_open_bucket_alloc(c);
248
249         spin_lock(&ob->lock);
250
251         ob->valid       = true;
252         ob->sectors_free = ca->mi.bucket_size;
253         ob->dev         = ca->dev_idx;
254         ob->gen         = a->gen;
255         ob->bucket      = bucket;
256         spin_unlock(&ob->lock);
257
258         ca->nr_open_buckets++;
259         bch2_open_bucket_hash_add(c, ob);
260
261         if (c->blocked_allocate_open_bucket) {
262                 bch2_time_stats_update(
263                         &c->times[BCH_TIME_blocked_allocate_open_bucket],
264                         c->blocked_allocate_open_bucket);
265                 c->blocked_allocate_open_bucket = 0;
266         }
267
268         if (c->blocked_allocate) {
269                 bch2_time_stats_update(
270                         &c->times[BCH_TIME_blocked_allocate],
271                         c->blocked_allocate);
272                 c->blocked_allocate = 0;
273         }
274
275         spin_unlock(&c->freelist_lock);
276         return ob;
277 }
278
279 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
280                                             enum alloc_reserve reserve, u64 free_entry,
281                                             struct bucket_alloc_state *s,
282                                             struct bkey_s_c freespace_k,
283                                             struct closure *cl)
284 {
285         struct bch_fs *c = trans->c;
286         struct btree_iter iter = { NULL };
287         struct bkey_s_c k;
288         struct open_bucket *ob;
289         struct bch_alloc_v4 a_convert;
290         const struct bch_alloc_v4 *a;
291         u64 b = free_entry & ~(~0ULL << 56);
292         unsigned genbits = free_entry >> 56;
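        /*
         * free_entry packs the bucket number into its low 56 bits and the
         * freespace generation bits into the top 8; e.g. a free_entry of
         * 0x03000000000004d2 decodes to genbits 3, bucket 1234.
         */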
293         struct printbuf buf = PRINTBUF;
294         int ret;
295
296         if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
297                 prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
298                        "  freespace key ",
299                         ca->mi.first_bucket, ca->mi.nbuckets);
300                 bch2_bkey_val_to_text(&buf, c, freespace_k);
301                 bch2_trans_inconsistent(trans, "%s", buf.buf);
302                 ob = ERR_PTR(-EIO);
303                 goto err;
304         }
305
306         k = bch2_bkey_get_iter(trans, &iter,
307                                BTREE_ID_alloc, POS(ca->dev_idx, b),
308                                BTREE_ITER_CACHED);
309         ret = bkey_err(k);
310         if (ret) {
311                 ob = ERR_PTR(ret);
312                 goto err;
313         }
314
315         a = bch2_alloc_to_v4(k, &a_convert);
316
317         if (a->data_type != BCH_DATA_free) {
318                 if (!test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
319                         ob = NULL;
320                         goto err;
321                 }
322
323                 prt_printf(&buf, "non free bucket in freespace btree\n"
324                        "  freespace key ");
325                 bch2_bkey_val_to_text(&buf, c, freespace_k);
326                 prt_printf(&buf, "\n  ");
327                 bch2_bkey_val_to_text(&buf, c, k);
328                 bch2_trans_inconsistent(trans, "%s", buf.buf);
329                 ob = ERR_PTR(-EIO);
330                 goto err;
331         }
332
333         if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
334             test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
335                 prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
336                        "  freespace key ",
337                        genbits, alloc_freespace_genbits(*a) >> 56);
338                 bch2_bkey_val_to_text(&buf, c, freespace_k);
339                 prt_printf(&buf, "\n  ");
340                 bch2_bkey_val_to_text(&buf, c, k);
341                 bch2_trans_inconsistent(trans, "%s", buf.buf);
342                 ob = ERR_PTR(-EIO);
343                 goto err;
344
345         }
346
347         if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
348                 struct bch_backpointer bp;
349                 struct bpos bp_pos = POS_MIN;
350
351                 ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
352                                                 &bp_pos, &bp,
353                                                 BTREE_ITER_NOPRESERVE);
354                 if (ret) {
355                         ob = ERR_PTR(ret);
356                         goto err;
357                 }
358
359                 if (!bkey_eq(bp_pos, POS_MAX)) {
360                         /*
361                          * Bucket may have data in it - we don't call
362                          * bch2_trans_inconsistent() because fsck hasn't
363                          * finished yet
364                          */
365                         ob = NULL;
366                         goto err;
367                 }
368         }
369
370         ob = __try_alloc_bucket(c, ca, b, reserve, a, s, cl);
371         if (!ob)
372                 iter.path->preserve = false;
373 err:
374         set_btree_iter_dontneed(&iter);
375         bch2_trans_iter_exit(trans, &iter);
376         printbuf_exit(&buf);
377         return ob;
378 }
379
380 /*
381  * This path is for before the freespace btree is initialized:
382  *
383  * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
384  * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
385  */
386 static noinline struct open_bucket *
387 bch2_bucket_alloc_early(struct btree_trans *trans,
388                         struct bch_dev *ca,
389                         enum alloc_reserve reserve,
390                         struct bucket_alloc_state *s,
391                         struct closure *cl)
392 {
393         struct btree_iter iter;
394         struct bkey_s_c k;
395         struct open_bucket *ob = NULL;
396         u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
397         u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
398         int ret;
399 again:
400         for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
401                            BTREE_ITER_SLOTS, k, ret) {
402                 struct bch_alloc_v4 a_convert;
403                 const struct bch_alloc_v4 *a;
404
405                 if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
406                         break;
407
408                 if (ca->new_fs_bucket_idx &&
409                     is_superblock_bucket(ca, k.k->p.offset))
410                         continue;
411
412                 a = bch2_alloc_to_v4(k, &a_convert);
413
414                 if (a->data_type != BCH_DATA_free)
415                         continue;
416
417                 s->buckets_seen++;
418
419                 ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, a, s, cl);
420                 if (ob)
421                         break;
422         }
423         bch2_trans_iter_exit(trans, &iter);
424
425         ca->alloc_cursor = alloc_cursor;
426
427         if (!ob && ret)
428                 ob = ERR_PTR(ret);
429
430         if (!ob && alloc_cursor > alloc_start) {
431                 alloc_cursor = alloc_start;
432                 goto again;
433         }
434
435         return ob;
436 }
437
438 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
439                                                    struct bch_dev *ca,
440                                                    enum alloc_reserve reserve,
441                                                    struct bucket_alloc_state *s,
442                                                    struct closure *cl)
443 {
444         struct btree_iter iter;
445         struct bkey_s_c k;
446         struct open_bucket *ob = NULL;
447         u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
448         u64 alloc_cursor = alloc_start;
449         int ret;
450
451         BUG_ON(ca->new_fs_bucket_idx);
452 again:
453         for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
454                                      POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
455                 if (k.k->p.inode != ca->dev_idx)
456                         break;
457
458                 for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
459                      alloc_cursor < k.k->p.offset;
460                      alloc_cursor++) {
461                         ret = btree_trans_too_many_iters(trans);
462                         if (ret) {
463                                 ob = ERR_PTR(ret);
464                                 break;
465                         }
466
467                         s->buckets_seen++;
468
469                         ob = try_alloc_bucket(trans, ca, reserve,
470                                               alloc_cursor, s, k, cl);
471                         if (ob) {
472                                 iter.path->preserve = false;
473                                 break;
474                         }
475                 }
476
477                 if (ob || ret)
478                         break;
479         }
480         bch2_trans_iter_exit(trans, &iter);
481
482         ca->alloc_cursor = alloc_cursor;
483
484         if (!ob && ret)
485                 ob = ERR_PTR(ret);
486
487         if (!ob && alloc_start > ca->mi.first_bucket) {
488                 alloc_cursor = alloc_start = ca->mi.first_bucket;
489                 goto again;
490         }
491
492         return ob;
493 }
494
495 /**
496  * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
497  *
498  * Returns an open_bucket on success, or an ERR_PTR() on failure
499  */
500 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
501                                       struct bch_dev *ca,
502                                       enum alloc_reserve reserve,
503                                       struct closure *cl,
504                                       struct bch_dev_usage *usage)
505 {
506         struct bch_fs *c = trans->c;
507         struct open_bucket *ob = NULL;
508         bool freespace = READ_ONCE(ca->mi.freespace_initialized);
509         u64 avail;
510         struct bucket_alloc_state s = { 0 };
511         bool waiting = false;
512 again:
513         bch2_dev_usage_read_fast(ca, usage);
514         avail = dev_buckets_free(ca, *usage, reserve);
515
516         if (usage->d[BCH_DATA_need_discard].buckets > avail)
517                 bch2_do_discards(c);
518
519         if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
520                 bch2_do_gc_gens(c);
521
522         if (should_invalidate_buckets(ca, *usage))
523                 bch2_do_invalidates(c);
524
525         if (!avail) {
526                 if (cl && !waiting) {
527                         closure_wait(&c->freelist_wait, cl);
528                         waiting = true;
529                         goto again;
530                 }
531
532                 if (!c->blocked_allocate)
533                         c->blocked_allocate = local_clock();
534
535                 ob = ERR_PTR(-BCH_ERR_freelist_empty);
536                 goto err;
537         }
538
539         if (waiting)
540                 closure_wake_up(&c->freelist_wait);
541 alloc:
542         ob = likely(freespace)
543                 ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl)
544                 : bch2_bucket_alloc_early(trans, ca, reserve, &s, cl);
545
546         if (s.skipped_need_journal_commit * 2 > avail)
547                 bch2_journal_flush_async(&c->journal, NULL);
548
549         if (!ob && freespace && !test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
550                 freespace = false;
551                 goto alloc;
552         }
553 err:
554         if (!ob)
555                 ob = ERR_PTR(-BCH_ERR_no_buckets_found);
556
557         if (!IS_ERR(ob))
558                 trace_and_count(c, bucket_alloc, ca,
559                                 bch2_alloc_reserves[reserve],
560                                 ob->bucket,
561                                 usage->d[BCH_DATA_free].buckets,
562                                 avail,
563                                 bch2_copygc_wait_amount(c),
564                                 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
565                                 &s,
566                                 cl == NULL,
567                                 "");
568         else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
569                 trace_and_count(c, bucket_alloc_fail, ca,
570                                 bch2_alloc_reserves[reserve],
571                                 0,
572                                 usage->d[BCH_DATA_free].buckets,
573                                 avail,
574                                 bch2_copygc_wait_amount(c),
575                                 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
576                                 &s,
577                                 cl == NULL,
578                                 bch2_err_str(PTR_ERR(ob)));
579
580         return ob;
581 }
582
583 struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
584                                       enum alloc_reserve reserve,
585                                       struct closure *cl)
586 {
587         struct bch_dev_usage usage;
588         struct open_bucket *ob;
589
590         bch2_trans_do(c, NULL, NULL, 0,
591                       PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
592                                                         cl, &usage)));
593         return ob;
594 }
595
596 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
597                             unsigned l, unsigned r)
598 {
599         return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
600                 (stripe->next_alloc[l] < stripe->next_alloc[r]));
601 }
602
603 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
604
605 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
606                                           struct dev_stripe_state *stripe,
607                                           struct bch_devs_mask *devs)
608 {
609         struct dev_alloc_list ret = { .nr = 0 };
610         unsigned i;
611
612         for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
613                 ret.devs[ret.nr++] = i;
614
615         bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
616         return ret;
617 }
618
619 static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
620                                struct dev_stripe_state *stripe,
621                                struct bch_dev_usage *usage)
622 {
623         u64 *v = stripe->next_alloc + ca->dev_idx;
624         u64 free_space = dev_buckets_available(ca, RESERVE_none);
625         u64 free_space_inv = free_space
626                 ? div64_u64(1ULL << 48, free_space)
627                 : 1ULL << 48;
628         u64 scale = *v / 4;
629
630         if (*v + free_space_inv >= *v)
631                 *v += free_space_inv;
632         else
633                 *v = U64_MAX;
634
635         for (v = stripe->next_alloc;
636              v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
637                 *v = *v < scale ? 0 : *v - scale;
638 }
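
/*
 * Rough illustration of the heuristic above: with two devices that have 200
 * and 100 free buckets, their next_alloc keys grow by (1 << 48) / 200 and
 * (1 << 48) / 100 per allocation respectively, so the fuller device's key
 * climbs twice as fast and it sorts to the front of bch2_dev_alloc_list()
 * half as often; allocations end up spread roughly in proportion to free
 * space. The final loop rescales every key downward so they stay bounded
 * while preserving their relative order.
 */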
639
640 void bch2_dev_stripe_increment(struct bch_dev *ca,
641                                struct dev_stripe_state *stripe)
642 {
643         struct bch_dev_usage usage;
644
645         bch2_dev_usage_read_fast(ca, &usage);
646         bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
647 }
648
649 static int add_new_bucket(struct bch_fs *c,
650                            struct open_buckets *ptrs,
651                            struct bch_devs_mask *devs_may_alloc,
652                            unsigned nr_replicas,
653                            unsigned *nr_effective,
654                            bool *have_cache,
655                            unsigned flags,
656                            struct open_bucket *ob)
657 {
658         unsigned durability =
659                 bch_dev_bkey_exists(c, ob->dev)->mi.durability;
660
661         BUG_ON(*nr_effective >= nr_replicas);
662         BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
663
664         __clear_bit(ob->dev, devs_may_alloc->d);
665         *nr_effective   += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
666                 ? durability : 1;
667         *have_cache     |= !durability;
668
669         ob_push(c, ptrs, ob);
670
671         if (*nr_effective >= nr_replicas)
672                 return 1;
673         if (ob->ec)
674                 return 1;
675         return 0;
676 }
677
678 int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
679                       struct open_buckets *ptrs,
680                       struct dev_stripe_state *stripe,
681                       struct bch_devs_mask *devs_may_alloc,
682                       unsigned nr_replicas,
683                       unsigned *nr_effective,
684                       bool *have_cache,
685                       unsigned flags,
686                       enum bch_data_type data_type,
687                       enum alloc_reserve reserve,
688                       struct closure *cl)
689 {
690         struct bch_fs *c = trans->c;
691         struct dev_alloc_list devs_sorted =
692                 bch2_dev_alloc_list(c, stripe, devs_may_alloc);
693         unsigned dev;
694         struct bch_dev *ca;
695         int ret = -BCH_ERR_insufficient_devices;
696         unsigned i;
697
698         BUG_ON(*nr_effective >= nr_replicas);
699
700         for (i = 0; i < devs_sorted.nr; i++) {
701                 struct bch_dev_usage usage;
702                 struct open_bucket *ob;
703
704                 dev = devs_sorted.devs[i];
705
706                 rcu_read_lock();
707                 ca = rcu_dereference(c->devs[dev]);
708                 if (ca)
709                         percpu_ref_get(&ca->ref);
710                 rcu_read_unlock();
711
712                 if (!ca)
713                         continue;
714
715                 if (!ca->mi.durability && *have_cache) {
716                         percpu_ref_put(&ca->ref);
717                         continue;
718                 }
719
720                 ob = bch2_bucket_alloc_trans(trans, ca, reserve, cl, &usage);
721                 if (!IS_ERR(ob))
722                         bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
723                 percpu_ref_put(&ca->ref);
724
725                 if (IS_ERR(ob)) {
726                         ret = PTR_ERR(ob);
727                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
728                                 break;
729                         continue;
730                 }
731
732                 ob->data_type = data_type;
733
734                 if (add_new_bucket(c, ptrs, devs_may_alloc,
735                                    nr_replicas, nr_effective,
736                                    have_cache, flags, ob)) {
737                         ret = 0;
738                         break;
739                 }
740         }
741
742         return ret;
743 }
744
745 /* Allocate from stripes: */
746
747 /*
748  * if we can't allocate a new stripe because there are already too many
749  * partially filled stripes, force allocating from an existing stripe even when
750  * it's to a device we don't want:
751  */
752
753 static int bucket_alloc_from_stripe(struct btree_trans *trans,
754                          struct open_buckets *ptrs,
755                          struct write_point *wp,
756                          struct bch_devs_mask *devs_may_alloc,
757                          u16 target,
758                          unsigned nr_replicas,
759                          unsigned *nr_effective,
760                          bool *have_cache,
761                          enum alloc_reserve reserve,
762                          unsigned flags,
763                          struct closure *cl)
764 {
765         struct bch_fs *c = trans->c;
766         struct dev_alloc_list devs_sorted;
767         struct ec_stripe_head *h;
768         struct open_bucket *ob;
769         struct bch_dev *ca;
770         unsigned i, ec_idx;
771         int ret = 0;
772
773         if (nr_replicas < 2)
774                 return 0;
775
776         if (ec_open_bucket(c, ptrs))
777                 return 0;
778
779         h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, reserve, cl);
780         if (IS_ERR(h))
781                 return PTR_ERR(h);
782         if (!h)
783                 return 0;
784
785         devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
786
787         for (i = 0; i < devs_sorted.nr; i++)
788                 for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
789                         if (!h->s->blocks[ec_idx])
790                                 continue;
791
792                         ob = c->open_buckets + h->s->blocks[ec_idx];
793                         if (ob->dev == devs_sorted.devs[i] &&
794                             !test_and_set_bit(ec_idx, h->s->blocks_allocated))
795                                 goto got_bucket;
796                 }
797         goto out_put_head;
798 got_bucket:
799         ca = bch_dev_bkey_exists(c, ob->dev);
800
801         ob->ec_idx      = ec_idx;
802         ob->ec          = h->s;
803         ec_stripe_new_get(h->s, STRIPE_REF_io);
804
805         ret = add_new_bucket(c, ptrs, devs_may_alloc,
806                              nr_replicas, nr_effective,
807                              have_cache, flags, ob);
808 out_put_head:
809         bch2_ec_stripe_head_put(c, h);
810         return ret;
811 }
812
813 /* Sector allocator */
814
815 static bool want_bucket(struct bch_fs *c,
816                         struct write_point *wp,
817                         struct bch_devs_mask *devs_may_alloc,
818                         bool *have_cache, bool ec,
819                         struct open_bucket *ob)
820 {
821         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
822
823         if (!test_bit(ob->dev, devs_may_alloc->d))
824                 return false;
825
826         if (ob->data_type != wp->data_type)
827                 return false;
828
829         if (!ca->mi.durability &&
830             (wp->data_type == BCH_DATA_btree || ec || *have_cache))
831                 return false;
832
833         if (ec != (ob->ec != NULL))
834                 return false;
835
836         return true;
837 }
838
839 static int bucket_alloc_set_writepoint(struct bch_fs *c,
840                                        struct open_buckets *ptrs,
841                                        struct write_point *wp,
842                                        struct bch_devs_mask *devs_may_alloc,
843                                        unsigned nr_replicas,
844                                        unsigned *nr_effective,
845                                        bool *have_cache,
846                                        bool ec, unsigned flags)
847 {
848         struct open_buckets ptrs_skip = { .nr = 0 };
849         struct open_bucket *ob;
850         unsigned i;
851         int ret = 0;
852
853         open_bucket_for_each(c, &wp->ptrs, ob, i) {
854                 if (!ret && want_bucket(c, wp, devs_may_alloc,
855                                         have_cache, ec, ob))
856                         ret = add_new_bucket(c, ptrs, devs_may_alloc,
857                                        nr_replicas, nr_effective,
858                                        have_cache, flags, ob);
859                 else
860                         ob_push(c, &ptrs_skip, ob);
861         }
862         wp->ptrs = ptrs_skip;
863
864         return ret;
865 }
866
867 static int bucket_alloc_set_partial(struct bch_fs *c,
868                                     struct open_buckets *ptrs,
869                                     struct write_point *wp,
870                                     struct bch_devs_mask *devs_may_alloc,
871                                     unsigned nr_replicas,
872                                     unsigned *nr_effective,
873                                     bool *have_cache, bool ec,
874                                     enum alloc_reserve reserve,
875                                     unsigned flags)
876 {
877         int i, ret = 0;
878
879         if (!c->open_buckets_partial_nr)
880                 return 0;
881
882         spin_lock(&c->freelist_lock);
883
884         if (!c->open_buckets_partial_nr)
885                 goto unlock;
886
887         for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
888                 struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
889
890                 if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
891                         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
892                         struct bch_dev_usage usage;
893                         u64 avail;
894
895                         bch2_dev_usage_read_fast(ca, &usage);
896                         avail = dev_buckets_free(ca, usage, reserve);
897                         if (!avail)
898                                 continue;
899
900                         array_remove_item(c->open_buckets_partial,
901                                           c->open_buckets_partial_nr,
902                                           i);
903                         ob->on_partial_list = false;
904
905                         ret = add_new_bucket(c, ptrs, devs_may_alloc,
906                                              nr_replicas, nr_effective,
907                                              have_cache, flags, ob);
908                         if (ret)
909                                 break;
910                 }
911         }
912 unlock:
913         spin_unlock(&c->freelist_lock);
914         return ret;
915 }
916
917 static int __open_bucket_add_buckets(struct btree_trans *trans,
918                         struct open_buckets *ptrs,
919                         struct write_point *wp,
920                         struct bch_devs_list *devs_have,
921                         u16 target,
922                         bool erasure_code,
923                         unsigned nr_replicas,
924                         unsigned *nr_effective,
925                         bool *have_cache,
926                         enum alloc_reserve reserve,
927                         unsigned flags,
928                         struct closure *_cl)
929 {
930         struct bch_fs *c = trans->c;
931         struct bch_devs_mask devs;
932         struct open_bucket *ob;
933         struct closure *cl = NULL;
934         unsigned i;
935         int ret;
936
937         rcu_read_lock();
938         devs = target_rw_devs(c, wp->data_type, target);
939         rcu_read_unlock();
940
941         /* Don't allocate from devices we already have pointers to: */
942         for (i = 0; i < devs_have->nr; i++)
943                 __clear_bit(devs_have->devs[i], devs.d);
944
945         open_bucket_for_each(c, ptrs, ob, i)
946                 __clear_bit(ob->dev, devs.d);
947
948         if (erasure_code && ec_open_bucket(c, ptrs))
949                 return 0;
950
951         ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
952                                  nr_replicas, nr_effective,
953                                  have_cache, erasure_code, flags);
954         if (ret)
955                 return ret;
956
957         ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
958                                  nr_replicas, nr_effective,
959                                  have_cache, erasure_code, reserve, flags);
960         if (ret)
961                 return ret;
962
963         if (erasure_code) {
964                 ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
965                                          target,
966                                          nr_replicas, nr_effective,
967                                          have_cache,
968                                          reserve, flags, _cl);
969         } else {
970 retry_blocking:
971                 /*
972                  * Try nonblocking first, so that if one device is full we'll try from
973                  * other devices:
974                  */
975                 ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
976                                         nr_replicas, nr_effective, have_cache,
977                                         flags, wp->data_type, reserve, cl);
978                 if (ret &&
979                     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
980                     !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
981                     !cl && _cl) {
982                         cl = _cl;
983                         goto retry_blocking;
984                 }
985
986         }
987
988         return ret;
989 }
990
991 static int open_bucket_add_buckets(struct btree_trans *trans,
992                         struct open_buckets *ptrs,
993                         struct write_point *wp,
994                         struct bch_devs_list *devs_have,
995                         u16 target,
996                         unsigned erasure_code,
997                         unsigned nr_replicas,
998                         unsigned *nr_effective,
999                         bool *have_cache,
1000                         enum alloc_reserve reserve,
1001                         unsigned flags,
1002                         struct closure *cl)
1003 {
1004         int ret;
1005
1006         if (erasure_code) {
1007                 ret = __open_bucket_add_buckets(trans, ptrs, wp,
1008                                 devs_have, target, erasure_code,
1009                                 nr_replicas, nr_effective, have_cache,
1010                                 reserve, flags, cl);
1011                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1012                     bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
1013                     bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
1014                     bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1015                         return ret;
1016                 if (*nr_effective >= nr_replicas)
1017                         return 0;
1018         }
1019
1020         ret = __open_bucket_add_buckets(trans, ptrs, wp,
1021                         devs_have, target, false,
1022                         nr_replicas, nr_effective, have_cache,
1023                         reserve, flags, cl);
1024         return ret < 0 ? ret : 0;
1025 }
1026
1027 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
1028                                struct bch_dev *ca, bool ec)
1029 {
1030         if (ec) {
1031                 return ob->ec != NULL;
1032         } else if (ca) {
1033                 bool drop = ob->dev == ca->dev_idx;
1034                 struct open_bucket *ob2;
1035                 unsigned i;
1036
1037                 if (!drop && ob->ec) {
1038                         mutex_lock(&ob->ec->lock);
1039                         for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) {
1040                                 if (!ob->ec->blocks[i])
1041                                         continue;
1042
1043                                 ob2 = c->open_buckets + ob->ec->blocks[i];
1044                                 drop |= ob2->dev == ca->dev_idx;
1045                         }
1046                         mutex_unlock(&ob->ec->lock);
1047                 }
1048
1049                 return drop;
1050         } else {
1051                 return true;
1052         }
1053 }
1054
1055 static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1056                                  bool ec, struct write_point *wp)
1057 {
1058         struct open_buckets ptrs = { .nr = 0 };
1059         struct open_bucket *ob;
1060         unsigned i;
1061
1062         mutex_lock(&wp->lock);
1063         open_bucket_for_each(c, &wp->ptrs, ob, i)
1064                 if (should_drop_bucket(ob, c, ca, ec))
1065                         bch2_open_bucket_put(c, ob);
1066                 else
1067                         ob_push(c, &ptrs, ob);
1068         wp->ptrs = ptrs;
1069         mutex_unlock(&wp->lock);
1070 }
1071
1072 void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1073                             bool ec)
1074 {
1075         unsigned i;
1076
1077         /* Next, close write points that point to this device... */
1078         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1079                 bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1080
1081         bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1082         bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1083         bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1084
1085         mutex_lock(&c->btree_reserve_cache_lock);
1086         while (c->btree_reserve_cache_nr) {
1087                 struct btree_alloc *a =
1088                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1089
1090                 bch2_open_buckets_put(c, &a->ob);
1091         }
1092         mutex_unlock(&c->btree_reserve_cache_lock);
1093
1094         spin_lock(&c->freelist_lock);
1095         i = 0;
1096         while (i < c->open_buckets_partial_nr) {
1097                 struct open_bucket *ob =
1098                         c->open_buckets + c->open_buckets_partial[i];
1099
1100                 if (should_drop_bucket(ob, c, ca, ec)) {
1101                         --c->open_buckets_partial_nr;
1102                         swap(c->open_buckets_partial[i],
1103                              c->open_buckets_partial[c->open_buckets_partial_nr]);
1104                         ob->on_partial_list = false;
1105                         spin_unlock(&c->freelist_lock);
1106                         bch2_open_bucket_put(c, ob);
1107                         spin_lock(&c->freelist_lock);
1108                 } else {
1109                         i++;
1110                 }
1111         }
1112         spin_unlock(&c->freelist_lock);
1113
1114         bch2_ec_stop_dev(c, ca);
1115 }
1116
1117 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1118                                                  unsigned long write_point)
1119 {
1120         unsigned hash =
1121                 hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1122
1123         return &c->write_points_hash[hash];
1124 }
1125
1126 static struct write_point *__writepoint_find(struct hlist_head *head,
1127                                              unsigned long write_point)
1128 {
1129         struct write_point *wp;
1130
1131         rcu_read_lock();
1132         hlist_for_each_entry_rcu(wp, head, node)
1133                 if (wp->write_point == write_point)
1134                         goto out;
1135         wp = NULL;
1136 out:
1137         rcu_read_unlock();
1138         return wp;
1139 }
1140
1141 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1142 {
1143         u64 stranded    = c->write_points_nr * c->bucket_size_max;
1144         u64 free        = bch2_fs_usage_read_short(c).free;
1145
1146         return stranded * factor > free;
1147 }
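
/*
 * The two callers below use different factors, which gives some hysteresis:
 * try_increase_writepoints() stops adding write points once the space they
 * could strand exceeds 1/32 of free space, while try_decrease_writepoints()
 * only starts tearing them down once it exceeds 1/8.
 */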
1148
1149 static bool try_increase_writepoints(struct bch_fs *c)
1150 {
1151         struct write_point *wp;
1152
1153         if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1154             too_many_writepoints(c, 32))
1155                 return false;
1156
1157         wp = c->write_points + c->write_points_nr++;
1158         hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1159         return true;
1160 }
1161
1162 static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr)
1163 {
1164         struct write_point *wp;
1165
1166         mutex_lock(&c->write_points_hash_lock);
1167         if (c->write_points_nr < old_nr) {
1168                 mutex_unlock(&c->write_points_hash_lock);
1169                 return true;
1170         }
1171
1172         if (c->write_points_nr == 1 ||
1173             !too_many_writepoints(c, 8)) {
1174                 mutex_unlock(&c->write_points_hash_lock);
1175                 return false;
1176         }
1177
1178         wp = c->write_points + --c->write_points_nr;
1179
1180         hlist_del_rcu(&wp->node);
1181         mutex_unlock(&c->write_points_hash_lock);
1182
1183         bch2_writepoint_stop(c, NULL, false, wp);
1184         return true;
1185 }
1186
1187 static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
1188                                   struct mutex *lock)
1189 {
1190         if (!mutex_trylock(lock)) {
1191                 bch2_trans_unlock(trans);
1192                 mutex_lock(lock);
1193         }
1194 }
1195
1196 static struct write_point *writepoint_find(struct btree_trans *trans,
1197                                            unsigned long write_point)
1198 {
1199         struct bch_fs *c = trans->c;
1200         struct write_point *wp, *oldest;
1201         struct hlist_head *head;
1202
1203         if (!(write_point & 1UL)) {
1204                 wp = (struct write_point *) write_point;
1205                 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1206                 return wp;
1207         }
1208
1209         head = writepoint_hash(c, write_point);
1210 restart_find:
1211         wp = __writepoint_find(head, write_point);
1212         if (wp) {
1213 lock_wp:
1214                 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1215                 if (wp->write_point == write_point)
1216                         goto out;
1217                 mutex_unlock(&wp->lock);
1218                 goto restart_find;
1219         }
1220 restart_find_oldest:
1221         oldest = NULL;
1222         for (wp = c->write_points;
1223              wp < c->write_points + c->write_points_nr; wp++)
1224                 if (!oldest || time_before64(wp->last_used, oldest->last_used))
1225                         oldest = wp;
1226
1227         bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1228         bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1229         if (oldest >= c->write_points + c->write_points_nr ||
1230             try_increase_writepoints(c)) {
1231                 mutex_unlock(&c->write_points_hash_lock);
1232                 mutex_unlock(&oldest->lock);
1233                 goto restart_find_oldest;
1234         }
1235
1236         wp = __writepoint_find(head, write_point);
1237         if (wp && wp != oldest) {
1238                 mutex_unlock(&c->write_points_hash_lock);
1239                 mutex_unlock(&oldest->lock);
1240                 goto lock_wp;
1241         }
1242
1243         wp = oldest;
1244         hlist_del_rcu(&wp->node);
1245         wp->write_point = write_point;
1246         hlist_add_head_rcu(&wp->node, head);
1247         mutex_unlock(&c->write_points_hash_lock);
1248 out:
1249         wp->last_used = local_clock();
1250         return wp;
1251 }
1252
1253 /*
1254  * Get us an open_bucket we can allocate from, return with it locked:
1255  */
1256 int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1257                              unsigned target,
1258                              unsigned erasure_code,
1259                              struct write_point_specifier write_point,
1260                              struct bch_devs_list *devs_have,
1261                              unsigned nr_replicas,
1262                              unsigned nr_replicas_required,
1263                              enum alloc_reserve reserve,
1264                              unsigned flags,
1265                              struct closure *cl,
1266                              struct write_point **wp_ret)
1267 {
1268         struct bch_fs *c = trans->c;
1269         struct write_point *wp;
1270         struct open_bucket *ob;
1271         struct open_buckets ptrs;
1272         unsigned nr_effective, write_points_nr;
1273         bool have_cache;
1274         int ret;
1275         int i;
1276
1277         BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
1278
1279         BUG_ON(!nr_replicas || !nr_replicas_required);
1280 retry:
1281         ptrs.nr         = 0;
1282         nr_effective    = 0;
1283         write_points_nr = c->write_points_nr;
1284         have_cache      = false;
1285
1286         *wp_ret = wp = writepoint_find(trans, write_point.v);
1287
1288         /* metadata may not allocate on cache devices: */
1289         if (wp->data_type != BCH_DATA_user)
1290                 have_cache = true;
1291
1292         if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1293                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1294                                               target, erasure_code,
1295                                               nr_replicas, &nr_effective,
1296                                               &have_cache, reserve,
1297                                               flags, NULL);
1298                 if (!ret ||
1299                     bch2_err_matches(ret, BCH_ERR_transaction_restart))
1300                         goto alloc_done;
1301
1302                 /* Don't retry from all devices if we're out of open buckets: */
1303                 if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1304                         goto allocate_blocking;
1305
1306                 /*
1307                  * Only try to allocate cache (durability = 0 devices) from the
1308                  * specified target:
1309                  */
1310                 have_cache = true;
1311
1312                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1313                                               0, erasure_code,
1314                                               nr_replicas, &nr_effective,
1315                                               &have_cache, reserve,
1316                                               flags, cl);
1317         } else {
1318 allocate_blocking:
1319                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1320                                               target, erasure_code,
1321                                               nr_replicas, &nr_effective,
1322                                               &have_cache, reserve,
1323                                               flags, cl);
1324         }
1325 alloc_done:
1326         BUG_ON(!ret && nr_effective < nr_replicas);
1327
1328         if (erasure_code && !ec_open_bucket(c, &ptrs))
1329                 pr_debug("failed to get ec bucket: ret %i", ret);
1330
1331         if (ret == -BCH_ERR_insufficient_devices &&
1332             nr_effective >= nr_replicas_required)
1333                 ret = 0;
1334
1335         if (ret)
1336                 goto err;
1337
1338         /* Free buckets we didn't use: */
1339         open_bucket_for_each(c, &wp->ptrs, ob, i)
1340                 open_bucket_free_unused(c, wp, ob);
1341
1342         wp->ptrs = ptrs;
1343
1344         wp->sectors_free = UINT_MAX;
1345
1346         open_bucket_for_each(c, &wp->ptrs, ob, i)
1347                 wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1348
1349         BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1350
1351         return 0;
1352 err:
1353         open_bucket_for_each(c, &wp->ptrs, ob, i)
1354                 if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1355                         ob_push(c, &ptrs, ob);
1356                 else
1357                         open_bucket_free_unused(c, wp, ob);
1358         wp->ptrs = ptrs;
1359
1360         mutex_unlock(&wp->lock);
1361
1362         if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1363             try_decrease_writepoints(c, write_points_nr))
1364                 goto retry;
1365
1366         if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
1367             bch2_err_matches(ret, BCH_ERR_freelist_empty))
1368                 return cl
1369                         ? -BCH_ERR_bucket_alloc_blocked
1370                         : -BCH_ERR_ENOSPC_bucket_alloc;
1371
1372         return ret;
1373 }
1374
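/*
 * Build an extent pointer to the next unwritten sector of @ob: the bucket's
 * first sector plus however much of the bucket has already been handed out
 * (ca->mi.bucket_size - ob->sectors_free).
 */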
1375 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1376 {
1377         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1378
1379         return (struct bch_extent_ptr) {
1380                 .type   = 1 << BCH_EXTENT_ENTRY_ptr,
1381                 .gen    = ob->gen,
1382                 .dev    = ob->dev,
1383                 .offset = bucket_to_sector(ca, ob->bucket) +
1384                         ca->mi.bucket_size -
1385                         ob->sectors_free,
1386         };
1387 }
1388
1389 /*
1390  * Append pointers to the space we just allocated to @k, and mark @sectors space
1391  * as allocated out of @ob
1392  */
1393 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1394                                     struct bkey_i *k, unsigned sectors,
1395                                     bool cached)
1396 {
1397         bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
1398 }
1399
1400 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1401 {
1402         bch2_alloc_sectors_done_inlined(c, wp);
1403 }
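
/*
 * Illustrative sketch of a write path built on the helpers above, assuming a
 * caller-provided transaction, bkey @k, sector count, devs_have list and an
 * open_buckets list @obs for holding references (bch2_open_bucket_get() and
 * bch2_open_buckets_put() are in the header):
 *
 *        struct write_point *wp;
 *        int ret;
 *
 *        ret = bch2_alloc_sectors_start_trans(trans, target, false, write_point,
 *                                             &devs_have, nr_replicas, nr_replicas,
 *                                             RESERVE_none, 0, cl, &wp);
 *        if (ret)
 *                return ret;
 *
 *        sectors = min(sectors, wp->sectors_free);
 *
 *        bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);
 *        bch2_open_bucket_get(c, wp, &obs);
 *        bch2_alloc_sectors_done(c, wp);
 *
 *        // write the data, then do the index update that makes @k reachable,
 *        // and only then drop the open bucket references:
 *
 *        bch2_open_buckets_put(c, &obs);
 */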
1404
1405 static inline void writepoint_init(struct write_point *wp,
1406                                    enum bch_data_type type)
1407 {
1408         mutex_init(&wp->lock);
1409         wp->data_type = type;
1410
1411         INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1412         INIT_LIST_HEAD(&wp->writes);
1413         spin_lock_init(&wp->writes_lock);
1414 }
1415
1416 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1417 {
1418         struct open_bucket *ob;
1419         struct write_point *wp;
1420
1421         mutex_init(&c->write_points_hash_lock);
1422         c->write_points_nr = ARRAY_SIZE(c->write_points);
1423
1424         /* open bucket 0 is a sentinel NULL: */
1425         spin_lock_init(&c->open_buckets[0].lock);
1426
1427         for (ob = c->open_buckets + 1;
1428              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1429                 spin_lock_init(&ob->lock);
1430                 c->open_buckets_nr_free++;
1431
1432                 ob->freelist = c->open_buckets_freelist;
1433                 c->open_buckets_freelist = ob - c->open_buckets;
1434         }
1435
1436         writepoint_init(&c->btree_write_point,          BCH_DATA_btree);
1437         writepoint_init(&c->rebalance_write_point,      BCH_DATA_user);
1438         writepoint_init(&c->copygc_write_point,         BCH_DATA_user);
1439
1440         for (wp = c->write_points;
1441              wp < c->write_points + c->write_points_nr; wp++) {
1442                 writepoint_init(wp, BCH_DATA_user);
1443
1444                 wp->last_used   = local_clock();
1445                 wp->write_point = (unsigned long) wp;
1446                 hlist_add_head_rcu(&wp->node,
1447                                    writepoint_hash(c, wp->write_point));
1448         }
1449 }
1450
1451 static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1452 {
1453         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1454         unsigned data_type = ob->data_type;
1455         barrier(); /* READ_ONCE() doesn't work on bitfields */
1456
1457         prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
1458                    ob - c->open_buckets,
1459                    atomic_read(&ob->pin),
1460                    data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
1461                    ob->dev, ob->bucket, ob->gen,
1462                    ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1463         if (ob->ec)
1464                 prt_printf(out, " ec idx %llu", ob->ec->idx);
1465         if (ob->on_partial_list)
1466                 prt_str(out, " partial");
1467         prt_newline(out);
1468 }
1469
1470 void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1471 {
1472         struct open_bucket *ob;
1473
1474         out->atomic++;
1475
1476         for (ob = c->open_buckets;
1477              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1478              ob++) {
1479                 spin_lock(&ob->lock);
1480                 if (ob->valid && !ob->on_partial_list)
1481                         bch2_open_bucket_to_text(out, c, ob);
1482                 spin_unlock(&ob->lock);
1483         }
1484
1485         --out->atomic;
1486 }
1487
1488 void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1489 {
1490         unsigned i;
1491
1492         out->atomic++;
1493         spin_lock(&c->freelist_lock);
1494
1495         for (i = 0; i < c->open_buckets_partial_nr; i++)
1496                 bch2_open_bucket_to_text(out, c,
1497                                 c->open_buckets + c->open_buckets_partial[i]);
1498
1499         spin_unlock(&c->freelist_lock);
1500         --out->atomic;
1501 }
1502
1503 static const char * const bch2_write_point_states[] = {
1504 #define x(n)    #n,
1505         WRITE_POINT_STATES()
1506 #undef x
1507         NULL
1508 };
1509
1510 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1511 {
1512         struct write_point *wp;
1513         unsigned i;
1514
1515         for (wp = c->write_points;
1516              wp < c->write_points + ARRAY_SIZE(c->write_points);
1517              wp++) {
1518                 prt_printf(out, "%lu: ", wp->write_point);
1519                 prt_human_readable_u64(out, wp->sectors_allocated);
1520
1521                 prt_printf(out, " last wrote: ");
1522                 bch2_pr_time_units(out, sched_clock() - wp->last_used);
1523
1524                 for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1525                         prt_printf(out, " %s: ", bch2_write_point_states[i]);
1526                         bch2_pr_time_units(out, wp->time[i]);
1527                 }
1528
1529                 prt_newline(out);
1530         }
1531 }