1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2012 Google, Inc.
4  *
5  * Foreground allocator code: allocate buckets from freelist, and allocate in
6  * sector granularity from writepoints.
7  *
8  * bch2_bucket_alloc() allocates a single bucket from a specific device.
9  *
10  * bch2_bucket_alloc_set_trans() allocates one or more buckets from different devices
11  * in a given filesystem.
12  */
13
14 #include "bcachefs.h"
15 #include "alloc_background.h"
16 #include "alloc_foreground.h"
17 #include "backpointers.h"
18 #include "btree_iter.h"
19 #include "btree_update.h"
20 #include "btree_gc.h"
21 #include "buckets.h"
22 #include "buckets_waiting_for_journal.h"
23 #include "clock.h"
24 #include "debug.h"
25 #include "disk_groups.h"
26 #include "ec.h"
27 #include "error.h"
28 #include "io_write.h"
29 #include "journal.h"
30 #include "movinggc.h"
31 #include "nocow_locking.h"
32 #include "trace.h"
33
34 #include <linux/math64.h>
35 #include <linux/rculist.h>
36 #include <linux/rcupdate.h>
37
38 static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
39                                            struct mutex *lock)
40 {
41         if (!mutex_trylock(lock)) {
42                 bch2_trans_unlock(trans);
43                 mutex_lock(lock);
44         }
45 }
46
47 const char * const bch2_watermarks[] = {
48 #define x(t) #t,
49         BCH_WATERMARKS()
50 #undef x
51         NULL
52 };
53
54 /*
55  * Open buckets represent a bucket that's currently being allocated from.  They
56  * serve two purposes:
57  *
58  *  - They track buckets that have been partially allocated, allowing for
59  *    sub-bucket sized allocations - they're used by the sector allocator below
60  *
61  *  - They provide a reference to the buckets they own that mark and sweep GC
62  *    can find, until the new allocation has a pointer to it inserted into the
63  *    btree
64  *
65  * When allocating some space with the sector allocator, the allocation comes
66  * with a reference to an open bucket - the caller is required to put that
67  * reference _after_ doing the index update that makes its allocation reachable.
68  */
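
/*
 * A minimal sketch of that reference discipline, using the single bucket
 * allocator defined later in this file (illustrative only - not called from
 * here; error handling and the actual write/index update are elided):
 *
 *	struct open_bucket *ob =
 *		bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
 *	if (IS_ERR(ob))
 *		return PTR_ERR(ob);
 *
 *	(write to the bucket, then insert the new pointer into the btree)
 *
 *	bch2_open_bucket_put(c, ob);	<- only after the index update
 */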
69
70 void bch2_reset_alloc_cursors(struct bch_fs *c)
71 {
72         struct bch_dev *ca;
73         unsigned i;
74
75         rcu_read_lock();
76         for_each_member_device_rcu(ca, c, i, NULL)
77                 ca->alloc_cursor = 0;
78         rcu_read_unlock();
79 }
80
81 static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
82 {
83         open_bucket_idx_t idx = ob - c->open_buckets;
84         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
85
86         ob->hash = *slot;
87         *slot = idx;
88 }
89
90 static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
91 {
92         open_bucket_idx_t idx = ob - c->open_buckets;
93         open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
94
95         while (*slot != idx) {
96                 BUG_ON(!*slot);
97                 slot = &c->open_buckets[*slot].hash;
98         }
99
100         *slot = ob->hash;
101         ob->hash = 0;
102 }
103
104 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
105 {
106         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
107
108         if (ob->ec) {
109                 ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
110                 return;
111         }
112
113         percpu_down_read(&c->mark_lock);
114         spin_lock(&ob->lock);
115
116         ob->valid = false;
117         ob->data_type = 0;
118
119         spin_unlock(&ob->lock);
120         percpu_up_read(&c->mark_lock);
121
122         spin_lock(&c->freelist_lock);
123         bch2_open_bucket_hash_remove(c, ob);
124
125         ob->freelist = c->open_buckets_freelist;
126         c->open_buckets_freelist = ob - c->open_buckets;
127
128         c->open_buckets_nr_free++;
129         ca->nr_open_buckets--;
130         spin_unlock(&c->freelist_lock);
131
132         closure_wake_up(&c->open_buckets_wait);
133 }
134
135 void bch2_open_bucket_write_error(struct bch_fs *c,
136                                   struct open_buckets *obs,
137                                   unsigned dev)
138 {
139         struct open_bucket *ob;
140         unsigned i;
141
142         open_bucket_for_each(c, obs, ob, i)
143                 if (ob->dev == dev && ob->ec)
144                         bch2_ec_bucket_cancel(c, ob);
145 }
146
147 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
148 {
149         struct open_bucket *ob;
150
151         BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
152
153         ob = c->open_buckets + c->open_buckets_freelist;
154         c->open_buckets_freelist = ob->freelist;
155         atomic_set(&ob->pin, 1);
156         ob->data_type = 0;
157
158         c->open_buckets_nr_free--;
159         return ob;
160 }
161
162 static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
163 {
164         BUG_ON(c->open_buckets_partial_nr >=
165                ARRAY_SIZE(c->open_buckets_partial));
166
167         spin_lock(&c->freelist_lock);
168         ob->on_partial_list = true;
169         c->open_buckets_partial[c->open_buckets_partial_nr++] =
170                 ob - c->open_buckets;
171         spin_unlock(&c->freelist_lock);
172
173         closure_wake_up(&c->open_buckets_wait);
174         closure_wake_up(&c->freelist_wait);
175 }
176
177 /* _only_ for allocating the journal on a new device: */
178 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
179 {
180         while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
181                 u64 b = ca->new_fs_bucket_idx++;
182
183                 if (!is_superblock_bucket(ca, b) &&
184                     (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
185                         return b;
186         }
187
188         return -1;
189 }
190
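/*
 * Worked example (the real OPEN_BUCKETS_COUNT lives in bcachefs.h; 1024 is
 * only an assumed value for illustration): reclaim keeps allocating until no
 * open buckets remain; btree and btree_copygc allocations start failing once
 * 256 or fewer open buckets remain free; copygc once ~341 or fewer remain;
 * everything else once half (512) remain.  __try_alloc_bucket() below
 * compares open_buckets_nr_free against this reserve.
 */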
191 static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
192 {
193         switch (watermark) {
194         case BCH_WATERMARK_reclaim:
195                 return 0;
196         case BCH_WATERMARK_btree:
197         case BCH_WATERMARK_btree_copygc:
198                 return OPEN_BUCKETS_COUNT / 4;
199         case BCH_WATERMARK_copygc:
200                 return OPEN_BUCKETS_COUNT / 3;
201         default:
202                 return OPEN_BUCKETS_COUNT / 2;
203         }
204 }
205
206 static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
207                                               u64 bucket,
208                                               enum bch_watermark watermark,
209                                               const struct bch_alloc_v4 *a,
210                                               struct bucket_alloc_state *s,
211                                               struct closure *cl)
212 {
213         struct open_bucket *ob;
214
215         if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
216                 s->skipped_nouse++;
217                 return NULL;
218         }
219
220         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
221                 s->skipped_open++;
222                 return NULL;
223         }
224
225         if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
226                         c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
227                 s->skipped_need_journal_commit++;
228                 return NULL;
229         }
230
231         if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
232                 s->skipped_nocow++;
233                 return NULL;
234         }
235
236         spin_lock(&c->freelist_lock);
237
238         if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
239                 if (cl)
240                         closure_wait(&c->open_buckets_wait, cl);
241
242                 track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
243                                    &c->blocked_allocate_open_bucket, true);
244                 spin_unlock(&c->freelist_lock);
245                 return ERR_PTR(-BCH_ERR_open_buckets_empty);
246         }
247
248         /* Recheck under lock: */
249         if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
250                 spin_unlock(&c->freelist_lock);
251                 s->skipped_open++;
252                 return NULL;
253         }
254
255         ob = bch2_open_bucket_alloc(c);
256
257         spin_lock(&ob->lock);
258
259         ob->valid       = true;
260         ob->sectors_free = ca->mi.bucket_size;
261         ob->dev         = ca->dev_idx;
262         ob->gen         = a->gen;
263         ob->bucket      = bucket;
264         spin_unlock(&ob->lock);
265
266         ca->nr_open_buckets++;
267         bch2_open_bucket_hash_add(c, ob);
268
269         track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
270                            &c->blocked_allocate_open_bucket, false);
271
272         track_event_change(&c->times[BCH_TIME_blocked_allocate],
273                            &c->blocked_allocate, false);
274
275         spin_unlock(&c->freelist_lock);
276         return ob;
277 }
278
279 static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
280                                             enum bch_watermark watermark, u64 free_entry,
281                                             struct bucket_alloc_state *s,
282                                             struct bkey_s_c freespace_k,
283                                             struct closure *cl)
284 {
285         struct bch_fs *c = trans->c;
286         struct btree_iter iter = { NULL };
287         struct bkey_s_c k;
288         struct open_bucket *ob;
289         struct bch_alloc_v4 a_convert;
290         const struct bch_alloc_v4 *a;
291         u64 b = free_entry & ~(~0ULL << 56);
292         unsigned genbits = free_entry >> 56;
293         struct printbuf buf = PRINTBUF;
294         int ret;
295
296         if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
297                 prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
298                        "  freespace key ",
299                         ca->mi.first_bucket, ca->mi.nbuckets);
300                 bch2_bkey_val_to_text(&buf, c, freespace_k);
301                 bch2_trans_inconsistent(trans, "%s", buf.buf);
302                 ob = ERR_PTR(-EIO);
303                 goto err;
304         }
305
306         k = bch2_bkey_get_iter(trans, &iter,
307                                BTREE_ID_alloc, POS(ca->dev_idx, b),
308                                BTREE_ITER_CACHED);
309         ret = bkey_err(k);
310         if (ret) {
311                 ob = ERR_PTR(ret);
312                 goto err;
313         }
314
315         a = bch2_alloc_to_v4(k, &a_convert);
316
317         if (a->data_type != BCH_DATA_free) {
318                 if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
319                         ob = NULL;
320                         goto err;
321                 }
322
323                 prt_printf(&buf, "non free bucket in freespace btree\n"
324                        "  freespace key ");
325                 bch2_bkey_val_to_text(&buf, c, freespace_k);
326                 prt_printf(&buf, "\n  ");
327                 bch2_bkey_val_to_text(&buf, c, k);
328                 bch2_trans_inconsistent(trans, "%s", buf.buf);
329                 ob = ERR_PTR(-EIO);
330                 goto err;
331         }
332
333         if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
334             c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
335                 prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
336                        "  freespace key ",
337                        genbits, alloc_freespace_genbits(*a) >> 56);
338                 bch2_bkey_val_to_text(&buf, c, freespace_k);
339                 prt_printf(&buf, "\n  ");
340                 bch2_bkey_val_to_text(&buf, c, k);
341                 bch2_trans_inconsistent(trans, "%s", buf.buf);
342                 ob = ERR_PTR(-EIO);
343                 goto err;
344         }
345
346         if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
347                 struct bch_backpointer bp;
348                 struct bpos bp_pos = POS_MIN;
349
350                 ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
351                                                 &bp_pos, &bp,
352                                                 BTREE_ITER_NOPRESERVE);
353                 if (ret) {
354                         ob = ERR_PTR(ret);
355                         goto err;
356                 }
357
358                 if (!bkey_eq(bp_pos, POS_MAX)) {
359                         /*
360                          * Bucket may have data in it - we don't call
361                          * bch2_trans_inconsistent() because fsck hasn't
362                          * finished yet
363                          */
364                         ob = NULL;
365                         goto err;
366                 }
367         }
368
369         ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
370         if (!ob)
371                 set_btree_iter_dontneed(&iter);
372 err:
373         if (iter.path)
374                 set_btree_iter_dontneed(&iter);
375         bch2_trans_iter_exit(trans, &iter);
376         printbuf_exit(&buf);
377         return ob;
378 }
379
380 /*
381  * This path is used before the freespace btree is initialized:
382  *
383  * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
384  * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
385  */
386 static noinline struct open_bucket *
387 bch2_bucket_alloc_early(struct btree_trans *trans,
388                         struct bch_dev *ca,
389                         enum bch_watermark watermark,
390                         struct bucket_alloc_state *s,
391                         struct closure *cl)
392 {
393         struct btree_iter iter, citer;
394         struct bkey_s_c k, ck;
395         struct open_bucket *ob = NULL;
396         u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
397         u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
398         u64 alloc_cursor = alloc_start;
399         int ret;
400
401         /*
402          * Scan with an uncached iterator to avoid polluting the key cache. An
403          * uncached iter will return a cached key if one exists, but if not
404          * there is no other underlying protection for the associated key cache
405          * slot. To avoid racing bucket allocations, look up the cached key slot
406          * of any likely allocation candidate before attempting to proceed with
407          * the allocation. This provides proper exclusion on the associated
408          * bucket.
409          */
410 again:
411         for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
412                            BTREE_ITER_SLOTS, k, ret) {
413                 struct bch_alloc_v4 a_convert;
414                 const struct bch_alloc_v4 *a;
415
416                 if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
417                         break;
418
419                 if (ca->new_fs_bucket_idx &&
420                     is_superblock_bucket(ca, k.k->p.offset))
421                         continue;
422
423                 a = bch2_alloc_to_v4(k, &a_convert);
424                 if (a->data_type != BCH_DATA_free)
425                         continue;
426
427                 /* now check the cached key to serialize concurrent allocs of the bucket */
428                 ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
429                 ret = bkey_err(ck);
430                 if (ret)
431                         break;
432
433                 a = bch2_alloc_to_v4(ck, &a_convert);
434                 if (a->data_type != BCH_DATA_free)
435                         goto next;
436
437                 s->buckets_seen++;
438
439                 ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
440 next:
441                 set_btree_iter_dontneed(&citer);
442                 bch2_trans_iter_exit(trans, &citer);
443                 if (ob)
444                         break;
445         }
446         bch2_trans_iter_exit(trans, &iter);
447
448         alloc_cursor = iter.pos.offset;
449         ca->alloc_cursor = alloc_cursor;
450
451         if (!ob && ret)
452                 ob = ERR_PTR(ret);
453
454         if (!ob && alloc_start > first_bucket) {
455                 alloc_cursor = alloc_start = first_bucket;
456                 goto again;
457         }
458
459         return ob;
460 }
461
462 static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
463                                                    struct bch_dev *ca,
464                                                    enum bch_watermark watermark,
465                                                    struct bucket_alloc_state *s,
466                                                    struct closure *cl)
467 {
468         struct btree_iter iter;
469         struct bkey_s_c k;
470         struct open_bucket *ob = NULL;
471         u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
472         u64 alloc_cursor = alloc_start;
473         int ret;
474
475         BUG_ON(ca->new_fs_bucket_idx);
476 again:
477         for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
478                                      POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
479                 if (k.k->p.inode != ca->dev_idx)
480                         break;
481
482                 for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
483                      alloc_cursor < k.k->p.offset;
484                      alloc_cursor++) {
485                         ret = btree_trans_too_many_iters(trans);
486                         if (ret) {
487                                 ob = ERR_PTR(ret);
488                                 break;
489                         }
490
491                         s->buckets_seen++;
492
493                         ob = try_alloc_bucket(trans, ca, watermark,
494                                               alloc_cursor, s, k, cl);
495                         if (ob) {
496                                 set_btree_iter_dontneed(&iter);
497                                 break;
498                         }
499                 }
500
501                 if (ob || ret)
502                         break;
503         }
504         bch2_trans_iter_exit(trans, &iter);
505
506         ca->alloc_cursor = alloc_cursor;
507
508         if (!ob && ret)
509                 ob = ERR_PTR(ret);
510
511         if (!ob && alloc_start > ca->mi.first_bucket) {
512                 alloc_cursor = alloc_start = ca->mi.first_bucket;
513                 goto again;
514         }
515
516         return ob;
517 }
518
519 /**
520  * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
521  * @trans:      transaction object
522  * @ca:         device to allocate from
523  * @watermark:  how important is this allocation?
524  * @cl:         if not NULL, closure to wait on if no buckets are available
525  * @usage:      also filled in with the current device usage
526  *
527  * Returns:     an open_bucket on success, or an ERR_PTR() on failure.
528  */
529 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
530                                       struct bch_dev *ca,
531                                       enum bch_watermark watermark,
532                                       struct closure *cl,
533                                       struct bch_dev_usage *usage)
534 {
535         struct bch_fs *c = trans->c;
536         struct open_bucket *ob = NULL;
537         bool freespace = READ_ONCE(ca->mi.freespace_initialized);
538         u64 avail;
539         struct bucket_alloc_state s = { 0 };
540         bool waiting = false;
541 again:
542         bch2_dev_usage_read_fast(ca, usage);
543         avail = dev_buckets_free(ca, *usage, watermark);
544
545         if (usage->d[BCH_DATA_need_discard].buckets > avail)
546                 bch2_do_discards(c);
547
548         if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
549                 bch2_do_gc_gens(c);
550
551         if (should_invalidate_buckets(ca, *usage))
552                 bch2_do_invalidates(c);
553
554         if (!avail) {
555                 if (cl && !waiting) {
556                         closure_wait(&c->freelist_wait, cl);
557                         waiting = true;
558                         goto again;
559                 }
560
561                 track_event_change(&c->times[BCH_TIME_blocked_allocate],
562                                    &c->blocked_allocate, true);
563
564                 ob = ERR_PTR(-BCH_ERR_freelist_empty);
565                 goto err;
566         }
567
568         if (waiting)
569                 closure_wake_up(&c->freelist_wait);
570 alloc:
571         ob = likely(freespace)
572                 ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
573                 : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
574
575         if (s.skipped_need_journal_commit * 2 > avail)
576                 bch2_journal_flush_async(&c->journal, NULL);
577
578         if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
579                 freespace = false;
580                 goto alloc;
581         }
582 err:
583         if (!ob)
584                 ob = ERR_PTR(-BCH_ERR_no_buckets_found);
585
586         if (!IS_ERR(ob))
587                 trace_and_count(c, bucket_alloc, ca,
588                                 bch2_watermarks[watermark],
589                                 ob->bucket,
590                                 usage->d[BCH_DATA_free].buckets,
591                                 avail,
592                                 bch2_copygc_wait_amount(c),
593                                 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
594                                 &s,
595                                 cl == NULL,
596                                 "");
597         else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
598                 trace_and_count(c, bucket_alloc_fail, ca,
599                                 bch2_watermarks[watermark],
600                                 0,
601                                 usage->d[BCH_DATA_free].buckets,
602                                 avail,
603                                 bch2_copygc_wait_amount(c),
604                                 c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
605                                 &s,
606                                 cl == NULL,
607                                 bch2_err_str(PTR_ERR(ob)));
608
609         return ob;
610 }
611
612 struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
613                                       enum bch_watermark watermark,
614                                       struct closure *cl)
615 {
616         struct bch_dev_usage usage;
617         struct open_bucket *ob;
618
619         bch2_trans_do(c, NULL, NULL, 0,
620                       PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
621                                                         cl, &usage)));
622         return ob;
623 }
624
625 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
626                             unsigned l, unsigned r)
627 {
628         return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
629                 (stripe->next_alloc[l] < stripe->next_alloc[r]));
630 }
631
632 #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
633
634 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
635                                           struct dev_stripe_state *stripe,
636                                           struct bch_devs_mask *devs)
637 {
638         struct dev_alloc_list ret = { .nr = 0 };
639         unsigned i;
640
641         for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
642                 ret.devs[ret.nr++] = i;
643
644         bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
645         return ret;
646 }
647
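/*
 * Free-space-weighted device rotation, as implemented below: each allocation
 * adds roughly 2^48 / free_space to the chosen device's next_alloc counter
 * (saturating at U64_MAX), then every device's counter is decayed by a
 * quarter of this device's previous value so the counters stay bounded.
 * bch2_dev_alloc_list() above sorts devices by next_alloc, so devices with
 * more free space accumulate more slowly and get picked more often.
 */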
648 static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
649                                struct dev_stripe_state *stripe,
650                                struct bch_dev_usage *usage)
651 {
652         u64 *v = stripe->next_alloc + ca->dev_idx;
653         u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
654         u64 free_space_inv = free_space
655                 ? div64_u64(1ULL << 48, free_space)
656                 : 1ULL << 48;
657         u64 scale = *v / 4;
658
659         if (*v + free_space_inv >= *v)
660                 *v += free_space_inv;
661         else
662                 *v = U64_MAX;
663
664         for (v = stripe->next_alloc;
665              v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
666                 *v = *v < scale ? 0 : *v - scale;
667 }
668
669 void bch2_dev_stripe_increment(struct bch_dev *ca,
670                                struct dev_stripe_state *stripe)
671 {
672         struct bch_dev_usage usage;
673
674         bch2_dev_usage_read_fast(ca, &usage);
675         bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
676 }
677
678 static int add_new_bucket(struct bch_fs *c,
679                            struct open_buckets *ptrs,
680                            struct bch_devs_mask *devs_may_alloc,
681                            unsigned nr_replicas,
682                            unsigned *nr_effective,
683                            bool *have_cache,
684                            unsigned flags,
685                            struct open_bucket *ob)
686 {
687         unsigned durability =
688                 bch_dev_bkey_exists(c, ob->dev)->mi.durability;
689
690         BUG_ON(*nr_effective >= nr_replicas);
691
692         __clear_bit(ob->dev, devs_may_alloc->d);
693         *nr_effective   += durability;
694         *have_cache     |= !durability;
695
696         ob_push(c, ptrs, ob);
697
698         if (*nr_effective >= nr_replicas)
699                 return 1;
700         if (ob->ec)
701                 return 1;
702         return 0;
703 }
704
705 int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
706                       struct open_buckets *ptrs,
707                       struct dev_stripe_state *stripe,
708                       struct bch_devs_mask *devs_may_alloc,
709                       unsigned nr_replicas,
710                       unsigned *nr_effective,
711                       bool *have_cache,
712                       unsigned flags,
713                       enum bch_data_type data_type,
714                       enum bch_watermark watermark,
715                       struct closure *cl)
716 {
717         struct bch_fs *c = trans->c;
718         struct dev_alloc_list devs_sorted =
719                 bch2_dev_alloc_list(c, stripe, devs_may_alloc);
720         unsigned dev;
721         struct bch_dev *ca;
722         int ret = -BCH_ERR_insufficient_devices;
723         unsigned i;
724
725         BUG_ON(*nr_effective >= nr_replicas);
726
727         for (i = 0; i < devs_sorted.nr; i++) {
728                 struct bch_dev_usage usage;
729                 struct open_bucket *ob;
730
731                 dev = devs_sorted.devs[i];
732
733                 rcu_read_lock();
734                 ca = rcu_dereference(c->devs[dev]);
735                 if (ca)
736                         percpu_ref_get(&ca->ref);
737                 rcu_read_unlock();
738
739                 if (!ca)
740                         continue;
741
742                 if (!ca->mi.durability && *have_cache) {
743                         percpu_ref_put(&ca->ref);
744                         continue;
745                 }
746
747                 ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
748                 if (!IS_ERR(ob))
749                         bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
750                 percpu_ref_put(&ca->ref);
751
752                 if (IS_ERR(ob)) {
753                         ret = PTR_ERR(ob);
754                         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
755                                 break;
756                         continue;
757                 }
758
759                 ob->data_type = data_type;
760
761                 if (add_new_bucket(c, ptrs, devs_may_alloc,
762                                    nr_replicas, nr_effective,
763                                    have_cache, flags, ob)) {
764                         ret = 0;
765                         break;
766                 }
767         }
768
769         return ret;
770 }
771
772 /* Allocate from stripes: */
773
774 /*
775  * if we can't allocate a new stripe because there are already too many
776  * partially filled stripes, force allocating from an existing stripe even when
777  * it's to a device we don't want:
778  */
779
780 static int bucket_alloc_from_stripe(struct btree_trans *trans,
781                          struct open_buckets *ptrs,
782                          struct write_point *wp,
783                          struct bch_devs_mask *devs_may_alloc,
784                          u16 target,
785                          unsigned nr_replicas,
786                          unsigned *nr_effective,
787                          bool *have_cache,
788                          enum bch_watermark watermark,
789                          unsigned flags,
790                          struct closure *cl)
791 {
792         struct bch_fs *c = trans->c;
793         struct dev_alloc_list devs_sorted;
794         struct ec_stripe_head *h;
795         struct open_bucket *ob;
796         unsigned i, ec_idx;
797         int ret = 0;
798
799         if (nr_replicas < 2)
800                 return 0;
801
802         if (ec_open_bucket(c, ptrs))
803                 return 0;
804
805         h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
806         if (IS_ERR(h))
807                 return PTR_ERR(h);
808         if (!h)
809                 return 0;
810
811         devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
812
813         for (i = 0; i < devs_sorted.nr; i++)
814                 for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
815                         if (!h->s->blocks[ec_idx])
816                                 continue;
817
818                         ob = c->open_buckets + h->s->blocks[ec_idx];
819                         if (ob->dev == devs_sorted.devs[i] &&
820                             !test_and_set_bit(ec_idx, h->s->blocks_allocated))
821                                 goto got_bucket;
822                 }
823         goto out_put_head;
824 got_bucket:
825         ob->ec_idx      = ec_idx;
826         ob->ec          = h->s;
827         ec_stripe_new_get(h->s, STRIPE_REF_io);
828
829         ret = add_new_bucket(c, ptrs, devs_may_alloc,
830                              nr_replicas, nr_effective,
831                              have_cache, flags, ob);
832 out_put_head:
833         bch2_ec_stripe_head_put(c, h);
834         return ret;
835 }
836
837 /* Sector allocator */
838
839 static bool want_bucket(struct bch_fs *c,
840                         struct write_point *wp,
841                         struct bch_devs_mask *devs_may_alloc,
842                         bool *have_cache, bool ec,
843                         struct open_bucket *ob)
844 {
845         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
846
847         if (!test_bit(ob->dev, devs_may_alloc->d))
848                 return false;
849
850         if (ob->data_type != wp->data_type)
851                 return false;
852
853         if (!ca->mi.durability &&
854             (wp->data_type == BCH_DATA_btree || ec || *have_cache))
855                 return false;
856
857         if (ec != (ob->ec != NULL))
858                 return false;
859
860         return true;
861 }
862
863 static int bucket_alloc_set_writepoint(struct bch_fs *c,
864                                        struct open_buckets *ptrs,
865                                        struct write_point *wp,
866                                        struct bch_devs_mask *devs_may_alloc,
867                                        unsigned nr_replicas,
868                                        unsigned *nr_effective,
869                                        bool *have_cache,
870                                        bool ec, unsigned flags)
871 {
872         struct open_buckets ptrs_skip = { .nr = 0 };
873         struct open_bucket *ob;
874         unsigned i;
875         int ret = 0;
876
877         open_bucket_for_each(c, &wp->ptrs, ob, i) {
878                 if (!ret && want_bucket(c, wp, devs_may_alloc,
879                                         have_cache, ec, ob))
880                         ret = add_new_bucket(c, ptrs, devs_may_alloc,
881                                        nr_replicas, nr_effective,
882                                        have_cache, flags, ob);
883                 else
884                         ob_push(c, &ptrs_skip, ob);
885         }
886         wp->ptrs = ptrs_skip;
887
888         return ret;
889 }
890
891 static int bucket_alloc_set_partial(struct bch_fs *c,
892                                     struct open_buckets *ptrs,
893                                     struct write_point *wp,
894                                     struct bch_devs_mask *devs_may_alloc,
895                                     unsigned nr_replicas,
896                                     unsigned *nr_effective,
897                                     bool *have_cache, bool ec,
898                                     enum bch_watermark watermark,
899                                     unsigned flags)
900 {
901         int i, ret = 0;
902
903         if (!c->open_buckets_partial_nr)
904                 return 0;
905
906         spin_lock(&c->freelist_lock);
907
908         if (!c->open_buckets_partial_nr)
909                 goto unlock;
910
911         for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
912                 struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
913
914                 if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
915                         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
916                         struct bch_dev_usage usage;
917                         u64 avail;
918
919                         bch2_dev_usage_read_fast(ca, &usage);
920                         avail = dev_buckets_free(ca, usage, watermark);
921                         if (!avail)
922                                 continue;
923
924                         array_remove_item(c->open_buckets_partial,
925                                           c->open_buckets_partial_nr,
926                                           i);
927                         ob->on_partial_list = false;
928
929                         ret = add_new_bucket(c, ptrs, devs_may_alloc,
930                                              nr_replicas, nr_effective,
931                                              have_cache, flags, ob);
932                         if (ret)
933                                 break;
934                 }
935         }
936 unlock:
937         spin_unlock(&c->freelist_lock);
938         return ret;
939 }
940
941 static int __open_bucket_add_buckets(struct btree_trans *trans,
942                         struct open_buckets *ptrs,
943                         struct write_point *wp,
944                         struct bch_devs_list *devs_have,
945                         u16 target,
946                         bool erasure_code,
947                         unsigned nr_replicas,
948                         unsigned *nr_effective,
949                         bool *have_cache,
950                         enum bch_watermark watermark,
951                         unsigned flags,
952                         struct closure *_cl)
953 {
954         struct bch_fs *c = trans->c;
955         struct bch_devs_mask devs;
956         struct open_bucket *ob;
957         struct closure *cl = NULL;
958         unsigned i;
959         int ret;
960
961         devs = target_rw_devs(c, wp->data_type, target);
962
963         /* Don't allocate from devices we already have pointers to: */
964         for (i = 0; i < devs_have->nr; i++)
965                 __clear_bit(devs_have->devs[i], devs.d);
966
967         open_bucket_for_each(c, ptrs, ob, i)
968                 __clear_bit(ob->dev, devs.d);
969
970         if (erasure_code && ec_open_bucket(c, ptrs))
971                 return 0;
972
973         ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
974                                  nr_replicas, nr_effective,
975                                  have_cache, erasure_code, flags);
976         if (ret)
977                 return ret;
978
979         ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
980                                  nr_replicas, nr_effective,
981                                  have_cache, erasure_code, watermark, flags);
982         if (ret)
983                 return ret;
984
985         if (erasure_code) {
986                 ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
987                                          target,
988                                          nr_replicas, nr_effective,
989                                          have_cache,
990                                          watermark, flags, _cl);
991         } else {
992 retry_blocking:
993                 /*
994                  * Try nonblocking first, so that if one device is full we'll try from
995                  * other devices:
996                  */
997                 ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
998                                         nr_replicas, nr_effective, have_cache,
999                                         flags, wp->data_type, watermark, cl);
1000                 if (ret &&
1001                     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
1002                     !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
1003                     !cl && _cl) {
1004                         cl = _cl;
1005                         goto retry_blocking;
1006                 }
1007         }
1008
1009         return ret;
1010 }
1011
1012 static int open_bucket_add_buckets(struct btree_trans *trans,
1013                         struct open_buckets *ptrs,
1014                         struct write_point *wp,
1015                         struct bch_devs_list *devs_have,
1016                         u16 target,
1017                         unsigned erasure_code,
1018                         unsigned nr_replicas,
1019                         unsigned *nr_effective,
1020                         bool *have_cache,
1021                         enum bch_watermark watermark,
1022                         unsigned flags,
1023                         struct closure *cl)
1024 {
1025         int ret;
1026
1027         if (erasure_code) {
1028                 ret = __open_bucket_add_buckets(trans, ptrs, wp,
1029                                 devs_have, target, erasure_code,
1030                                 nr_replicas, nr_effective, have_cache,
1031                                 watermark, flags, cl);
1032                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1033                     bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
1034                     bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
1035                     bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1036                         return ret;
1037                 if (*nr_effective >= nr_replicas)
1038                         return 0;
1039         }
1040
1041         ret = __open_bucket_add_buckets(trans, ptrs, wp,
1042                         devs_have, target, false,
1043                         nr_replicas, nr_effective, have_cache,
1044                         watermark, flags, cl);
1045         return ret < 0 ? ret : 0;
1046 }
1047
1048 /**
1049  * should_drop_bucket - check if this open_bucket should go away
1050  * @ob:         open_bucket to predicate on
1051  * @c:          filesystem handle
1052  * @ca:         if set, we're killing buckets for a particular device
1053  * @ec:         if true, we're shutting down erasure coding and killing all ec
1054  *              open_buckets; if neither @ca nor @ec is given, all
1055  *              open_buckets are dropped
1056  * Returns: true if we should kill this open_bucket
1057  *
1058  * We're killing open_buckets because we're shutting down a device, erasure
1059  * coding, or the entire filesystem - check if this open_bucket matches:
1060  */
1061 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
1062                                struct bch_dev *ca, bool ec)
1063 {
1064         if (ec) {
1065                 return ob->ec != NULL;
1066         } else if (ca) {
1067                 bool drop = ob->dev == ca->dev_idx;
1068                 struct open_bucket *ob2;
1069                 unsigned i;
1070
1071                 if (!drop && ob->ec) {
1072                         unsigned nr_blocks;
1073
1074                         mutex_lock(&ob->ec->lock);
1075                         nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
1076
1077                         for (i = 0; i < nr_blocks; i++) {
1078                                 if (!ob->ec->blocks[i])
1079                                         continue;
1080
1081                                 ob2 = c->open_buckets + ob->ec->blocks[i];
1082                                 drop |= ob2->dev == ca->dev_idx;
1083                         }
1084                         mutex_unlock(&ob->ec->lock);
1085                 }
1086
1087                 return drop;
1088         } else {
1089                 return true;
1090         }
1091 }
1092
1093 static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
1094                                  bool ec, struct write_point *wp)
1095 {
1096         struct open_buckets ptrs = { .nr = 0 };
1097         struct open_bucket *ob;
1098         unsigned i;
1099
1100         mutex_lock(&wp->lock);
1101         open_bucket_for_each(c, &wp->ptrs, ob, i)
1102                 if (should_drop_bucket(ob, c, ca, ec))
1103                         bch2_open_bucket_put(c, ob);
1104                 else
1105                         ob_push(c, &ptrs, ob);
1106         wp->ptrs = ptrs;
1107         mutex_unlock(&wp->lock);
1108 }
1109
1110 void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
1111                             bool ec)
1112 {
1113         unsigned i;
1114
1115         /* Close write points that point to this device: */
1116         for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
1117                 bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
1118
1119         bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
1120         bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
1121         bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
1122
1123         mutex_lock(&c->btree_reserve_cache_lock);
1124         while (c->btree_reserve_cache_nr) {
1125                 struct btree_alloc *a =
1126                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
1127
1128                 bch2_open_buckets_put(c, &a->ob);
1129         }
1130         mutex_unlock(&c->btree_reserve_cache_lock);
1131
1132         spin_lock(&c->freelist_lock);
1133         i = 0;
1134         while (i < c->open_buckets_partial_nr) {
1135                 struct open_bucket *ob =
1136                         c->open_buckets + c->open_buckets_partial[i];
1137
1138                 if (should_drop_bucket(ob, c, ca, ec)) {
1139                         --c->open_buckets_partial_nr;
1140                         swap(c->open_buckets_partial[i],
1141                              c->open_buckets_partial[c->open_buckets_partial_nr]);
1142                         ob->on_partial_list = false;
1143                         spin_unlock(&c->freelist_lock);
1144                         bch2_open_bucket_put(c, ob);
1145                         spin_lock(&c->freelist_lock);
1146                 } else {
1147                         i++;
1148                 }
1149         }
1150         spin_unlock(&c->freelist_lock);
1151
1152         bch2_ec_stop_dev(c, ca);
1153 }
1154
1155 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
1156                                                  unsigned long write_point)
1157 {
1158         unsigned hash =
1159                 hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
1160
1161         return &c->write_points_hash[hash];
1162 }
1163
1164 static struct write_point *__writepoint_find(struct hlist_head *head,
1165                                              unsigned long write_point)
1166 {
1167         struct write_point *wp;
1168
1169         rcu_read_lock();
1170         hlist_for_each_entry_rcu(wp, head, node)
1171                 if (wp->write_point == write_point)
1172                         goto out;
1173         wp = NULL;
1174 out:
1175         rcu_read_unlock();
1176         return wp;
1177 }
1178
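/*
 * Write points pin partially filled buckets, so too many of them can strand
 * free space.  The heuristic below treats each write point as potentially
 * stranding up to bucket_size_max worth of space, and says "too many" once
 * that total, multiplied by @factor, would exceed the filesystem's free
 * space.
 */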
1179 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
1180 {
1181         u64 stranded    = c->write_points_nr * c->bucket_size_max;
1182         u64 free        = bch2_fs_usage_read_short(c).free;
1183
1184         return stranded * factor > free;
1185 }
1186
1187 static bool try_increase_writepoints(struct bch_fs *c)
1188 {
1189         struct write_point *wp;
1190
1191         if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
1192             too_many_writepoints(c, 32))
1193                 return false;
1194
1195         wp = c->write_points + c->write_points_nr++;
1196         hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
1197         return true;
1198 }
1199
1200 static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
1201 {
1202         struct bch_fs *c = trans->c;
1203         struct write_point *wp;
1204         struct open_bucket *ob;
1205         unsigned i;
1206
1207         mutex_lock(&c->write_points_hash_lock);
1208         if (c->write_points_nr < old_nr) {
1209                 mutex_unlock(&c->write_points_hash_lock);
1210                 return true;
1211         }
1212
1213         if (c->write_points_nr == 1 ||
1214             !too_many_writepoints(c, 8)) {
1215                 mutex_unlock(&c->write_points_hash_lock);
1216                 return false;
1217         }
1218
1219         wp = c->write_points + --c->write_points_nr;
1220
1221         hlist_del_rcu(&wp->node);
1222         mutex_unlock(&c->write_points_hash_lock);
1223
1224         bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1225         open_bucket_for_each(c, &wp->ptrs, ob, i)
1226                 open_bucket_free_unused(c, ob);
1227         wp->ptrs.nr = 0;
1228         mutex_unlock(&wp->lock);
1229         return true;
1230 }
1231
1232 static struct write_point *writepoint_find(struct btree_trans *trans,
1233                                            unsigned long write_point)
1234 {
1235         struct bch_fs *c = trans->c;
1236         struct write_point *wp, *oldest;
1237         struct hlist_head *head;
1238
1239         if (!(write_point & 1UL)) {
1240                 wp = (struct write_point *) write_point;
1241                 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1242                 return wp;
1243         }
1244
1245         head = writepoint_hash(c, write_point);
1246 restart_find:
1247         wp = __writepoint_find(head, write_point);
1248         if (wp) {
1249 lock_wp:
1250                 bch2_trans_mutex_lock_norelock(trans, &wp->lock);
1251                 if (wp->write_point == write_point)
1252                         goto out;
1253                 mutex_unlock(&wp->lock);
1254                 goto restart_find;
1255         }
1256 restart_find_oldest:
1257         oldest = NULL;
1258         for (wp = c->write_points;
1259              wp < c->write_points + c->write_points_nr; wp++)
1260                 if (!oldest || time_before64(wp->last_used, oldest->last_used))
1261                         oldest = wp;
1262
1263         bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
1264         bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
1265         if (oldest >= c->write_points + c->write_points_nr ||
1266             try_increase_writepoints(c)) {
1267                 mutex_unlock(&c->write_points_hash_lock);
1268                 mutex_unlock(&oldest->lock);
1269                 goto restart_find_oldest;
1270         }
1271
1272         wp = __writepoint_find(head, write_point);
1273         if (wp && wp != oldest) {
1274                 mutex_unlock(&c->write_points_hash_lock);
1275                 mutex_unlock(&oldest->lock);
1276                 goto lock_wp;
1277         }
1278
1279         wp = oldest;
1280         hlist_del_rcu(&wp->node);
1281         wp->write_point = write_point;
1282         hlist_add_head_rcu(&wp->node, head);
1283         mutex_unlock(&c->write_points_hash_lock);
1284 out:
1285         wp->last_used = local_clock();
1286         return wp;
1287 }
1288
1289 static noinline void
1290 deallocate_extra_replicas(struct bch_fs *c,
1291                           struct open_buckets *ptrs,
1292                           struct open_buckets *ptrs_no_use,
1293                           unsigned extra_replicas)
1294 {
1295         struct open_buckets ptrs2 = { 0 };
1296         struct open_bucket *ob;
1297         unsigned i;
1298
1299         open_bucket_for_each(c, ptrs, ob, i) {
1300                 unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
1301
1302                 if (d && d <= extra_replicas) {
1303                         extra_replicas -= d;
1304                         ob_push(c, ptrs_no_use, ob);
1305                 } else {
1306                         ob_push(c, &ptrs2, ob);
1307                 }
1308         }
1309
1310         *ptrs = ptrs2;
1311 }
1312
1313 /*
1314  * Get us an open_bucket we can allocate from, return with it locked:
1315  */
1316 int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
1317                              unsigned target,
1318                              unsigned erasure_code,
1319                              struct write_point_specifier write_point,
1320                              struct bch_devs_list *devs_have,
1321                              unsigned nr_replicas,
1322                              unsigned nr_replicas_required,
1323                              enum bch_watermark watermark,
1324                              unsigned flags,
1325                              struct closure *cl,
1326                              struct write_point **wp_ret)
1327 {
1328         struct bch_fs *c = trans->c;
1329         struct write_point *wp;
1330         struct open_bucket *ob;
1331         struct open_buckets ptrs;
1332         unsigned nr_effective, write_points_nr;
1333         bool have_cache;
1334         int ret;
1335         int i;
1336
1337         if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
1338                 erasure_code = false;
1339
1340         BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
1341
1342         BUG_ON(!nr_replicas || !nr_replicas_required);
1343 retry:
1344         ptrs.nr         = 0;
1345         nr_effective    = 0;
1346         write_points_nr = c->write_points_nr;
1347         have_cache      = false;
1348
1349         *wp_ret = wp = writepoint_find(trans, write_point.v);
1350
1351         /* metadata may not allocate on cache devices: */
1352         if (wp->data_type != BCH_DATA_user)
1353                 have_cache = true;
1354
1355         if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
1356                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1357                                               target, erasure_code,
1358                                               nr_replicas, &nr_effective,
1359                                               &have_cache, watermark,
1360                                               flags, NULL);
1361                 if (!ret ||
1362                     bch2_err_matches(ret, BCH_ERR_transaction_restart))
1363                         goto alloc_done;
1364
1365                 /* Don't retry from all devices if we're out of open buckets: */
1366                 if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
1367                         goto allocate_blocking;
1368
1369                 /*
1370                  * Only try to allocate cache (durability = 0 devices) from the
1371                  * specified target:
1372                  */
1373                 have_cache = true;
1374
1375                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1376                                               0, erasure_code,
1377                                               nr_replicas, &nr_effective,
1378                                               &have_cache, watermark,
1379                                               flags, cl);
1380         } else {
1381 allocate_blocking:
1382                 ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
1383                                               target, erasure_code,
1384                                               nr_replicas, &nr_effective,
1385                                               &have_cache, watermark,
1386                                               flags, cl);
1387         }
1388 alloc_done:
1389         BUG_ON(!ret && nr_effective < nr_replicas);
1390
1391         if (erasure_code && !ec_open_bucket(c, &ptrs))
1392                 pr_debug("failed to get ec bucket: ret %i", ret);
1393
1394         if (ret == -BCH_ERR_insufficient_devices &&
1395             nr_effective >= nr_replicas_required)
1396                 ret = 0;
1397
1398         if (ret)
1399                 goto err;
1400
1401         if (nr_effective > nr_replicas)
1402                 deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
1403
1404         /* Free buckets we didn't use: */
1405         open_bucket_for_each(c, &wp->ptrs, ob, i)
1406                 open_bucket_free_unused(c, ob);
1407
1408         wp->ptrs = ptrs;
1409
1410         wp->sectors_free = UINT_MAX;
1411
1412         open_bucket_for_each(c, &wp->ptrs, ob, i)
1413                 wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
1414
1415         BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
1416
1417         return 0;
1418 err:
1419         open_bucket_for_each(c, &wp->ptrs, ob, i)
1420                 if (ptrs.nr < ARRAY_SIZE(ptrs.v))
1421                         ob_push(c, &ptrs, ob);
1422                 else
1423                         open_bucket_free_unused(c, ob);
1424         wp->ptrs = ptrs;
1425
1426         mutex_unlock(&wp->lock);
1427
1428         if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
1429             try_decrease_writepoints(trans, write_points_nr))
1430                 goto retry;
1431
1432         if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
1433             bch2_err_matches(ret, BCH_ERR_freelist_empty))
1434                 return cl
1435                         ? -BCH_ERR_bucket_alloc_blocked
1436                         : -BCH_ERR_ENOSPC_bucket_alloc;
1437
1438         return ret;
1439 }
1440
1441 struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
1442 {
1443         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1444
1445         return (struct bch_extent_ptr) {
1446                 .type   = 1 << BCH_EXTENT_ENTRY_ptr,
1447                 .gen    = ob->gen,
1448                 .dev    = ob->dev,
1449                 .offset = bucket_to_sector(ca, ob->bucket) +
1450                         ca->mi.bucket_size -
1451                         ob->sectors_free,
1452         };
1453 }
1454
1455 /*
1456  * Append pointers to the space we just allocated to @k, and mark @sectors
1457  * space as allocated out of @wp
1458  */
1459 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
1460                                     struct bkey_i *k, unsigned sectors,
1461                                     bool cached)
1462 {
1463         bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
1464 }
1465
1466 void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
1467 {
1468         bch2_alloc_sectors_done_inlined(c, wp);
1469 }
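
/*
 * Illustrative sketch of the sector allocator flow, roughly mirroring how the
 * write path uses these helpers ('op' is a stand-in for caller state; error
 * handling and the data write itself are elided; bch2_open_bucket_get() is
 * declared in alloc_foreground.h):
 *
 *	struct write_point *wp;
 *	int ret = bch2_alloc_sectors_start_trans(trans, op->target, false,
 *				op->write_point, &op->devs_have,
 *				op->nr_replicas, op->nr_replicas_required,
 *				op->watermark, op->flags, &op->cl, &wp);
 *	if (ret)
 *		return ret;
 *
 *	sectors = min(sectors, wp->sectors_free);
 *	bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);
 *	bch2_open_bucket_get(c, wp, &op->open_buckets);
 *	bch2_alloc_sectors_done(c, wp);
 *
 *	(write the data, insert the new extent into the btree)
 *
 *	bch2_open_buckets_put(c, &op->open_buckets);
 */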
1470
1471 static inline void writepoint_init(struct write_point *wp,
1472                                    enum bch_data_type type)
1473 {
1474         mutex_init(&wp->lock);
1475         wp->data_type = type;
1476
1477         INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
1478         INIT_LIST_HEAD(&wp->writes);
1479         spin_lock_init(&wp->writes_lock);
1480 }
1481
1482 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
1483 {
1484         struct open_bucket *ob;
1485         struct write_point *wp;
1486
1487         mutex_init(&c->write_points_hash_lock);
1488         c->write_points_nr = ARRAY_SIZE(c->write_points);
1489
1490         /* open bucket 0 is a sentinel NULL: */
1491         spin_lock_init(&c->open_buckets[0].lock);
1492
1493         for (ob = c->open_buckets + 1;
1494              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
1495                 spin_lock_init(&ob->lock);
1496                 c->open_buckets_nr_free++;
1497
1498                 ob->freelist = c->open_buckets_freelist;
1499                 c->open_buckets_freelist = ob - c->open_buckets;
1500         }
1501
1502         writepoint_init(&c->btree_write_point,          BCH_DATA_btree);
1503         writepoint_init(&c->rebalance_write_point,      BCH_DATA_user);
1504         writepoint_init(&c->copygc_write_point,         BCH_DATA_user);
1505
1506         for (wp = c->write_points;
1507              wp < c->write_points + c->write_points_nr; wp++) {
1508                 writepoint_init(wp, BCH_DATA_user);
1509
1510                 wp->last_used   = local_clock();
1511                 wp->write_point = (unsigned long) wp;
1512                 hlist_add_head_rcu(&wp->node,
1513                                    writepoint_hash(c, wp->write_point));
1514         }
1515 }
1516
1517 static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
1518 {
1519         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
1520         unsigned data_type = ob->data_type;
1521         barrier(); /* READ_ONCE() doesn't work on bitfields */
1522
1523         prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
1524                    ob - c->open_buckets,
1525                    atomic_read(&ob->pin),
1526                    data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
1527                    ob->dev, ob->bucket, ob->gen,
1528                    ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
1529         if (ob->ec)
1530                 prt_printf(out, " ec idx %llu", ob->ec->idx);
1531         if (ob->on_partial_list)
1532                 prt_str(out, " partial");
1533         prt_newline(out);
1534 }
1535
1536 void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
1537 {
1538         struct open_bucket *ob;
1539
1540         out->atomic++;
1541
1542         for (ob = c->open_buckets;
1543              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
1544              ob++) {
1545                 spin_lock(&ob->lock);
1546                 if (ob->valid && !ob->on_partial_list)
1547                         bch2_open_bucket_to_text(out, c, ob);
1548                 spin_unlock(&ob->lock);
1549         }
1550
1551         --out->atomic;
1552 }
1553
1554 void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
1555 {
1556         unsigned i;
1557
1558         out->atomic++;
1559         spin_lock(&c->freelist_lock);
1560
1561         for (i = 0; i < c->open_buckets_partial_nr; i++)
1562                 bch2_open_bucket_to_text(out, c,
1563                                 c->open_buckets + c->open_buckets_partial[i]);
1564
1565         spin_unlock(&c->freelist_lock);
1566         --out->atomic;
1567 }
1568
1569 static const char * const bch2_write_point_states[] = {
1570 #define x(n)    #n,
1571         WRITE_POINT_STATES()
1572 #undef x
1573         NULL
1574 };
1575
1576 static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
1577                                      struct write_point *wp)
1578 {
1579         struct open_bucket *ob;
1580         unsigned i;
1581
1582         prt_printf(out, "%lu: ", wp->write_point);
1583         prt_human_readable_u64(out, wp->sectors_allocated);
1584
1585         prt_printf(out, " last wrote: ");
1586         bch2_pr_time_units(out, sched_clock() - wp->last_used);
1587
1588         for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
1589                 prt_printf(out, " %s: ", bch2_write_point_states[i]);
1590                 bch2_pr_time_units(out, wp->time[i]);
1591         }
1592
1593         prt_newline(out);
1594
1595         printbuf_indent_add(out, 2);
1596         open_bucket_for_each(c, &wp->ptrs, ob, i)
1597                 bch2_open_bucket_to_text(out, c, ob);
1598         printbuf_indent_sub(out, 2);
1599 }
1600
1601 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
1602 {
1603         struct write_point *wp;
1604
1605         prt_str(out, "Foreground write points\n");
1606         for (wp = c->write_points;
1607              wp < c->write_points + ARRAY_SIZE(c->write_points);
1608              wp++)
1609                 bch2_write_point_to_text(out, c, wp);
1610
1611         prt_str(out, "Copygc write point\n");
1612         bch2_write_point_to_text(out, c, &c->copygc_write_point);
1613
1614         prt_str(out, "Rebalance write point\n");
1615         bch2_write_point_to_text(out, c, &c->rebalance_write_point);
1616
1617         prt_str(out, "Btree write point\n");
1618         bch2_write_point_to_text(out, c, &c->btree_write_point);
1619 }