// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <trace/events/bcachefs.h>

/* Persistent alloc info: */

static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
        BCH_ALLOC_FIELDS_V1()
#undef x
};

const char * const bch2_bucket_states[] = {
        "free",
        "need gc gens",
        "need discard",
        "cached",
        "dirty",
        NULL
};

struct bkey_alloc_unpacked {
        u64             journal_seq;
        u64             bucket;
        u8              dev;
        u8              gen;
        u8              oldest_gen;
        u8              data_type;
        bool            need_discard:1;
        bool            need_inc_gen:1;
#define x(_name, _bits) u##_bits _name;
        BCH_ALLOC_FIELDS_V2()
#undef  x
};

static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
                                     const void **p, unsigned field)
{
        unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
        u64 v;

        if (!(a->fields & (1 << field)))
                return 0;

        switch (bytes) {
        case 1:
                v = *((const u8 *) *p);
                break;
        case 2:
                v = le16_to_cpup(*p);
                break;
        case 4:
                v = le32_to_cpup(*p);
                break;
        case 8:
                v = le64_to_cpup(*p);
                break;
        default:
                BUG();
        }

        *p += bytes;
        return v;
}

static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
                                      unsigned field, u64 v)
{
        unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];

        if (!v)
                return;

        a->v.fields |= 1 << field;

        switch (bytes) {
        case 1:
                *((u8 *) *p) = v;
                break;
        case 2:
                *((__le16 *) *p) = cpu_to_le16(v);
                break;
        case 4:
                *((__le32 *) *p) = cpu_to_le32(v);
                break;
        case 8:
                *((__le64 *) *p) = cpu_to_le64(v);
                break;
        default:
                BUG();
        }

        *p += bytes;
}

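/*
 * v1 alloc keys: each field is a fixed-width little-endian integer,
 * present only when the corresponding bit in a->fields is set; absent
 * fields unpack as 0:
 */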
static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
                                 struct bkey_s_c k)
{
        const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
        const void *d = in->data;
        unsigned idx = 0;

        out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
        BCH_ALLOC_FIELDS_V1()
#undef  x
}

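/*
 * v2 alloc keys store their fields as varints; fields that weren't
 * written unpack as 0, and a decoded value that doesn't fit in the
 * unpacked field is reported as an error:
 */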
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
                                struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
        const u8 *in = a.v->data;
        const u8 *end = bkey_val_end(a);
        unsigned fieldnr = 0;
        int ret;
        u64 v;

        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;

#define x(_name, _bits)                                                 \
        if (fieldnr < a.v->nr_fields) {                                 \
                ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
        } else {                                                        \
                v = 0;                                                  \
        }                                                               \
        out->_name = v;                                                 \
        if (v != out->_name)                                            \
                return -1;                                              \
        fieldnr++;

        BCH_ALLOC_FIELDS_V2()
#undef  x
        return 0;
}

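/* v3 alloc keys: as v2, plus journal_seq and the need_discard/need_inc_gen flags: */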
static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
                                struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
        const u8 *in = a.v->data;
        const u8 *end = bkey_val_end(a);
        unsigned fieldnr = 0;
        int ret;
        u64 v;

        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;
        out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
        out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
        out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)                                                 \
        if (fieldnr < a.v->nr_fields) {                                 \
                ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
        } else {                                                        \
                v = 0;                                                  \
        }                                                               \
        out->_name = v;                                                 \
        if (v != out->_name)                                            \
                return -1;                                              \
        fieldnr++;

        BCH_ALLOC_FIELDS_V2()
#undef  x
        return 0;
}

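/* Unpack a pre-v4 alloc key (v1-v3) into struct bkey_alloc_unpacked: */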
static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
        struct bkey_alloc_unpacked ret = {
                .dev    = k.k->p.inode,
                .bucket = k.k->p.offset,
                .gen    = 0,
        };

        switch (k.k->type) {
        case KEY_TYPE_alloc:
                bch2_alloc_unpack_v1(&ret, k);
                break;
        case KEY_TYPE_alloc_v2:
                bch2_alloc_unpack_v2(&ret, k);
                break;
        case KEY_TYPE_alloc_v3:
                bch2_alloc_unpack_v3(&ret, k);
                break;
        }

        return ret;
}

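/*
 * Convert an alloc key of any version to the current in-memory
 * representation, struct bch_alloc_v4:
 */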
void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
        if (k.k->type == KEY_TYPE_alloc_v4) {
                *out = *bkey_s_c_to_alloc_v4(k).v;
        } else {
                struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

                *out = (struct bch_alloc_v4) {
                        .journal_seq            = u.journal_seq,
                        .flags                  = u.need_discard,
                        .gen                    = u.gen,
                        .oldest_gen             = u.oldest_gen,
                        .data_type              = u.data_type,
                        .stripe_redundancy      = u.stripe_redundancy,
                        .dirty_sectors          = u.dirty_sectors,
                        .cached_sectors         = u.cached_sectors,
                        .io_time[READ]          = u.read_time,
                        .io_time[WRITE]         = u.write_time,
                        .stripe                 = u.stripe,
                };
        }
}

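/*
 * Return a mutable, transaction-scoped copy of an alloc key, upgraded to
 * v4 if necessary; memory comes from bch2_trans_kmalloc(), so the caller
 * must check for an error pointer:
 */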
struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
        struct bkey_i_alloc_v4 *ret;

        if (k.k->type == KEY_TYPE_alloc_v4) {
                ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
                if (!IS_ERR(ret))
                        bkey_reassemble(&ret->k_i, k);
        } else {
                ret = bch2_trans_kmalloc(trans, sizeof(*ret));
                if (!IS_ERR(ret)) {
                        bkey_alloc_v4_init(&ret->k_i);
                        ret->k.p = k.k->p;
                        bch2_alloc_to_v4(k, &ret->v);
                }
        }
        return ret;
}

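/*
 * Look up the alloc key at @pos and return a mutable v4 copy for
 * updating; on success @iter is left pointing at the key and must be
 * exited by the caller:
 */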
struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
                              struct bpos pos)
{
        struct bkey_s_c k;
        struct bkey_i_alloc_v4 *a;
        int ret;

        bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
                             BTREE_ITER_WITH_UPDATES|
                             BTREE_ITER_CACHED|
                             BTREE_ITER_INTENT);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret) {
                bch2_trans_iter_exit(trans, iter);
                return ERR_PTR(ret);
        }

        a = bch2_alloc_to_v4_mut(trans, k);
        if (IS_ERR(a))
                bch2_trans_iter_exit(trans, iter);
        return a;
}

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
        unsigned i, bytes = offsetof(struct bch_alloc, data);

        for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
                if (a->fields & (1 << i))
                        bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

        return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          int rw, struct printbuf *err)
{
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

        /* allow for unknown fields */
        if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
                pr_buf(err, "incorrect value size (%zu < %u)",
                       bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
                return -EINVAL;
        }

        return 0;
}

int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          int rw, struct printbuf *err)
{
        struct bkey_alloc_unpacked u;

        if (bch2_alloc_unpack_v2(&u, k)) {
                pr_buf(err, "unpack error");
                return -EINVAL;
        }

        return 0;
}

int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          int rw, struct printbuf *err)
{
        struct bkey_alloc_unpacked u;

        if (bch2_alloc_unpack_v3(&u, k)) {
                pr_buf(err, "unpack error");
                return -EINVAL;
        }

        return 0;
}

int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          int rw, struct printbuf *err)
{
        if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) {
                pr_buf(err, "bad val size (%zu != %zu)",
                       bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4));
                return -EINVAL;
        }

        return 0;
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
        struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;

        a->journal_seq          = swab64(a->journal_seq);
        a->flags                = swab32(a->flags);
        a->dirty_sectors        = swab32(a->dirty_sectors);
        a->cached_sectors       = swab32(a->cached_sectors);
        a->io_time[0]           = swab64(a->io_time[0]);
        a->io_time[1]           = swab64(a->io_time[1]);
        a->stripe               = swab32(a->stripe);
        a->nr_external_backpointers = swab32(a->nr_external_backpointers);
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
        struct bch_alloc_v4 a;

        bch2_alloc_to_v4(k, &a);

        pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu",
               a.gen, a.oldest_gen, bch2_data_types[a.data_type],
               a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a));
        pr_buf(out, " dirty_sectors %u",        a.dirty_sectors);
        pr_buf(out, " cached_sectors %u",       a.cached_sectors);
        pr_buf(out, " stripe %u",               a.stripe);
        pr_buf(out, " stripe_redundancy %u",    a.stripe_redundancy);
        pr_buf(out, " read_time %llu",          a.io_time[READ]);
        pr_buf(out, " write_time %llu",         a.io_time[WRITE]);
}

int bch2_alloc_read(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_alloc_v4 a;
        struct bch_dev *ca;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ca = bch_dev_bkey_exists(c, k.k->p.inode);
                bch2_alloc_to_v4(k, &a);

                *bucket_gen(ca, k.k->p.offset) = a.gen;
        }
        bch2_trans_iter_exit(&trans, &iter);

        bch2_trans_exit(&trans);

        if (ret)
                bch_err(c, "error reading alloc info: %i", ret);

        return ret;
}

/* Free space/discard btree: */

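/*
 * Add or remove a bucket's entry in the freespace or need_discard btree,
 * depending on its state; buckets in other states have no index entry.
 * When freespace is initialized, the old key is checked against the
 * expected type and an inconsistency is flagged on mismatch:
 */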
static int bch2_bucket_do_index(struct btree_trans *trans,
                                struct bkey_s_c alloc_k,
                                struct bch_alloc_v4 a,
                                bool set)
{
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
        struct btree_iter iter;
        struct bkey_s_c old;
        struct bkey_i *k;
        enum bucket_state state = bucket_state(a);
        enum btree_id btree;
        enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
        enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
        struct printbuf buf = PRINTBUF;
        int ret;

        if (state != BUCKET_free &&
            state != BUCKET_need_discard)
                return 0;

        k = bch2_trans_kmalloc(trans, sizeof(*k));
        if (IS_ERR(k))
                return PTR_ERR(k);

        bkey_init(&k->k);
        k->k.type = new_type;

        switch (state) {
        case BUCKET_free:
                btree = BTREE_ID_freespace;
                k->k.p = alloc_freespace_pos(alloc_k.k->p, a);
                bch2_key_resize(&k->k, 1);
                break;
        case BUCKET_need_discard:
                btree = BTREE_ID_need_discard;
                k->k.p = alloc_k.k->p;
                break;
        default:
                return 0;
        }

        bch2_trans_iter_init(trans, &iter, btree,
                             bkey_start_pos(&k->k),
                             BTREE_ITER_INTENT);
        old = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(old);
        if (ret)
                goto err;

        if (ca->mi.freespace_initialized &&
            bch2_fs_inconsistent_on(old.k->type != old_type, c,
                        "incorrect key when %s %s btree (got %s should be %s)\n"
                        "  for %s",
                        set ? "setting" : "clearing",
                        bch2_btree_ids[btree],
                        bch2_bkey_types[old.k->type],
                        bch2_bkey_types[old_type],
                        (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                ret = -EIO;
                goto err;
        }

        ret = bch2_trans_update(trans, &iter, k, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        printbuf_exit(&buf);
        return ret;
}

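/*
 * Transactional trigger for alloc keys: bumps io_time and sets
 * NEED_DISCARD/NEED_INC_GEN when new data is written to a bucket,
 * increments the gen when a bucket empties, and keeps the freespace,
 * need_discard and lru btrees in sync with the new alloc state:
 */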
int bch2_trans_mark_alloc(struct btree_trans *trans,
                          struct bkey_s_c old, struct bkey_i *new,
                          unsigned flags)
{
        struct bch_fs *c = trans->c;
        struct bch_alloc_v4 old_a, *new_a;
        u64 old_lru, new_lru;
        int ret = 0;

        /*
         * Deletion only happens in the device removal path, with
         * BTREE_TRIGGER_NORUN:
         */
        BUG_ON(new->k.type != KEY_TYPE_alloc_v4);

        bch2_alloc_to_v4(old, &old_a);
        new_a = &bkey_i_to_alloc_v4(new)->v;

        if (new_a->dirty_sectors > old_a.dirty_sectors ||
            new_a->cached_sectors > old_a.cached_sectors) {
                new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
                new_a->io_time[WRITE] = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
                SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
                SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
        }

        if (old_a.data_type && !new_a->data_type &&
            old_a.gen == new_a->gen &&
            !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
                new_a->gen++;
                SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
        }

        if (bucket_state(old_a) != bucket_state(*new_a) ||
            (bucket_state(*new_a) == BUCKET_free &&
             alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) {
                ret =   bch2_bucket_do_index(trans, old, old_a, false) ?:
                        bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true);
                if (ret)
                        return ret;
        }

        old_lru = alloc_lru_idx(old_a);
        new_lru = alloc_lru_idx(*new_a);

        if (old_lru != new_lru) {
                ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
                                      old_lru, &new_lru);
                if (ret)
                        return ret;

                if (new_lru && new_a->io_time[READ] != new_lru)
                        new_a->io_time[READ] = new_lru;
        }

        return 0;
}

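/*
 * fsck: verify that an alloc key has matching entries in the
 * need_discard and freespace btrees, creating or correcting them if not:
 */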
static int bch2_check_alloc_key(struct btree_trans *trans,
                                struct btree_iter *alloc_iter)
{
        struct bch_fs *c = trans->c;
        struct bch_dev *ca;
        struct btree_iter discard_iter, freespace_iter;
        struct bch_alloc_v4 a;
        unsigned discard_key_type, freespace_key_type;
        struct bkey_s_c alloc_k, k;
        struct printbuf buf = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
        int ret;

        alloc_k = bch2_btree_iter_peek(alloc_iter);
        if (!alloc_k.k)
                return 0;

        ret = bkey_err(alloc_k);
        if (ret)
                return ret;

        if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
                        "alloc key for invalid device or bucket"))
                return bch2_btree_delete_at(trans, alloc_iter, 0);

        ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
        if (!ca->mi.freespace_initialized)
                return 0;

        bch2_alloc_to_v4(alloc_k, &a);

        discard_key_type = bucket_state(a) == BUCKET_need_discard
                ? KEY_TYPE_set : 0;
        freespace_key_type = bucket_state(a) == BUCKET_free
                ? KEY_TYPE_set : 0;

        bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard,
                             alloc_k.k->p, 0);
        bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace,
                             alloc_freespace_pos(alloc_k.k->p, a), 0);

        k = bch2_btree_iter_peek_slot(&discard_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (fsck_err_on(k.k->type != discard_key_type, c,
                        "incorrect key in need_discard btree (got %s should be %s)\n"
                        "  %s",
                        bch2_bkey_types[k.k->type],
                        bch2_bkey_types[discard_key_type],
                        (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                struct bkey_i *update =
                        bch2_trans_kmalloc(trans, sizeof(*update));

                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.type  = discard_key_type;
                update->k.p     = discard_iter.pos;

                ret = bch2_trans_update(trans, &discard_iter, update, 0);
                if (ret)
                        goto err;
        }

        k = bch2_btree_iter_peek_slot(&freespace_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (fsck_err_on(k.k->type != freespace_key_type, c,
                        "incorrect key in freespace btree (got %s should be %s)\n"
                        "  %s",
                        bch2_bkey_types[k.k->type],
                        bch2_bkey_types[freespace_key_type],
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                struct bkey_i *update =
                        bch2_trans_kmalloc(trans, sizeof(*update));

                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.type  = freespace_key_type;
                update->k.p     = freespace_iter.pos;
                bch2_key_resize(&update->k, 1);

                ret = bch2_trans_update(trans, &freespace_iter, update, 0);
                if (ret)
                        goto err;
        }
err:
fsck_err:
        bch2_trans_iter_exit(trans, &freespace_iter);
        bch2_trans_iter_exit(trans, &discard_iter);
        printbuf_exit(&buf2);
        printbuf_exit(&buf);
        return ret;
}

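/*
 * fsck, the other direction: verify that a need_discard or freespace key
 * refers to a bucket in the corresponding state, deleting it if not;
 * freespace keys encode generation bits in the high bits of the offset:
 */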
static int bch2_check_discard_freespace_key(struct btree_trans *trans,
                                            struct btree_iter *iter)
{
        struct bch_fs *c = trans->c;
        struct btree_iter alloc_iter;
        struct bkey_s_c k, freespace_k;
        struct bch_alloc_v4 a;
        u64 genbits;
        struct bpos pos;
        struct bkey_i *update;
        enum bucket_state state = iter->btree_id == BTREE_ID_need_discard
                ? BUCKET_need_discard
                : BUCKET_free;
        struct printbuf buf = PRINTBUF;
        int ret;

        freespace_k = bch2_btree_iter_peek(iter);
        if (!freespace_k.k)
                return 1;

        ret = bkey_err(freespace_k);
        if (ret)
                return ret;

        pos = iter->pos;
        pos.offset &= ~(~0ULL << 56);
        genbits = iter->pos.offset & (~0ULL << 56);

        bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);

        if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
                        "%llu:%llu set in %s btree but device or bucket does not exist",
                        pos.inode, pos.offset,
                        bch2_btree_ids[iter->btree_id]))
                goto delete;

        k = bch2_btree_iter_peek_slot(&alloc_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        bch2_alloc_to_v4(k, &a);

        if (fsck_err_on(bucket_state(a) != state ||
                        (state == BUCKET_free &&
                         genbits != alloc_freespace_genbits(a)), c,
                        "%s\n  incorrectly set in %s index (free %u, genbits %llu should be %llu)",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
                        bch2_btree_ids[iter->btree_id],
                        bucket_state(a) == state,
                        genbits >> 56, alloc_freespace_genbits(a) >> 56))
                goto delete;
out:
err:
fsck_err:
        bch2_trans_iter_exit(trans, &alloc_iter);
        printbuf_exit(&buf);
        return ret;
delete:
        if (iter->btree_id == BTREE_ID_freespace) {
                /* should probably add a helper for deleting extents */
                update = bch2_trans_kmalloc(trans, sizeof(*update));
                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.p = iter->pos;
                bch2_key_resize(&update->k, 1);

                ret = bch2_trans_update(trans, iter, update, 0);
        } else {
                ret = bch2_btree_delete_at(trans, iter, 0);
        }
        goto out;
}

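/* Walk the alloc, need_discard and freespace btrees, checking both directions: */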
int bch2_check_alloc_info(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_check_alloc_key(&trans, &iter));
                if (ret)
                        break;
        }
        bch2_trans_iter_exit(&trans, &iter);

        if (ret)
                goto err;

        bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN,
                             BTREE_ITER_PREFETCH);
        while (1) {
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_check_discard_freespace_key(&trans, &iter));
                if (ret)
                        break;

                bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
        }
        bch2_trans_iter_exit(&trans, &iter);

        if (ret)
                goto err;

        bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN,
                             BTREE_ITER_PREFETCH);
        while (1) {
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_check_discard_freespace_key(&trans, &iter));
                if (ret)
                        break;

                bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
        }
        bch2_trans_iter_exit(&trans, &iter);
err:
        bch2_trans_exit(&trans);
        return ret < 0 ? ret : 0;
}

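/*
 * fsck: every cached bucket should have an lru entry keyed by its read
 * time; fix up missing or mismatched entries:
 */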
static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
                                       struct btree_iter *alloc_iter)
{
        struct bch_fs *c = trans->c;
        struct btree_iter lru_iter;
        struct bch_alloc_v4 a;
        struct bkey_s_c alloc_k, k;
        struct printbuf buf = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
        int ret;

        alloc_k = bch2_btree_iter_peek(alloc_iter);
        if (!alloc_k.k)
                return 0;

        ret = bkey_err(alloc_k);
        if (ret)
                return ret;

        bch2_alloc_to_v4(alloc_k, &a);

        if (bucket_state(a) != BUCKET_cached)
                return 0;

        bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
                             POS(alloc_k.k->p.inode, a.io_time[READ]), 0);

        k = bch2_btree_iter_peek_slot(&lru_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (fsck_err_on(!a.io_time[READ], c,
                        "cached bucket with read_time 0\n"
                        "  %s",
                (printbuf_reset(&buf),
                 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
            fsck_err_on(k.k->type != KEY_TYPE_lru ||
                        le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c,
                        "incorrect/missing lru entry\n"
                        "  %s\n"
                        "  %s",
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
                        (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
                u64 read_time = a.io_time[READ];

                if (!a.io_time[READ])
                        a.io_time[READ] = atomic64_read(&c->io_clock[READ].now);

                ret = bch2_lru_change(trans,
                                      alloc_k.k->p.inode,
                                      alloc_k.k->p.offset,
                                      0, &a.io_time[READ]);
                if (ret)
                        goto err;

                if (a.io_time[READ] != read_time) {
                        struct bkey_i_alloc_v4 *a_mut =
                                bch2_alloc_to_v4_mut(trans, alloc_k);
                        ret = PTR_ERR_OR_ZERO(a_mut);
                        if (ret)
                                goto err;

                        a_mut->v.io_time[READ] = a.io_time[READ];
                        ret = bch2_trans_update(trans, alloc_iter,
                                                &a_mut->k_i, BTREE_TRIGGER_NORUN);
                        if (ret)
                                goto err;
                }
        }
err:
fsck_err:
        bch2_trans_iter_exit(trans, &lru_iter);
        printbuf_exit(&buf2);
        printbuf_exit(&buf);
        return ret;
}

int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|
                                      BTREE_INSERT_LAZY_RW,
                        bch2_check_alloc_to_lru_ref(&trans, &iter));
                if (ret)
                        break;
        }
        bch2_trans_iter_exit(&trans, &iter);

        bch2_trans_exit(&trans);
        return ret < 0 ? ret : 0;
}

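/*
 * Process one bucket from the need_discard btree: issue the discard (if
 * enabled and safe) and clear the bucket's NEED_DISCARD flag, or just
 * bump its gen if NEED_INC_GEN was set:
 */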
static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos,
                                   struct bch_dev *ca, bool *discard_done)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_alloc_v4 *a;
        struct printbuf buf = PRINTBUF;
        int ret;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos,
                             BTREE_ITER_CACHED);
        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto out;

        a = bch2_alloc_to_v4_mut(trans, k);
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;

        if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
                a->v.gen++;
                SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
                goto write;
        }

        BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk);

        if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c,
                        "%s\n  incorrectly set in need_discard btree",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                ret = -EIO;
                goto out;
        }

        if (!*discard_done && ca->mi.discard && !c->opts.nochanges) {
                /*
                 * This works without any other locks because this is the only
                 * thread that removes items from the need_discard tree
                 */
                bch2_trans_unlock(trans);
                blkdev_issue_discard(ca->disk_sb.bdev,
                                     k.k->p.offset * ca->mi.bucket_size,
                                     ca->mi.bucket_size,
                                     GFP_KERNEL, 0);
                *discard_done = true;

                ret = bch2_trans_relock(trans) ? 0 : -EINTR;
                if (ret)
                        goto out;
        }

        SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
write:
        ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
out:
        bch2_trans_iter_exit(trans, &iter);
        printbuf_exit(&buf);
        return ret;
}

static void bch2_do_discards_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
        struct bch_dev *ca = NULL;
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_need_discard,
                           POS_MIN, 0, k, ret) {
                bool discard_done = false;

                if (ca && k.k->p.inode != ca->dev_idx) {
                        percpu_ref_put(&ca->io_ref);
                        ca = NULL;
                }

                if (!ca) {
                        ca = bch_dev_bkey_exists(c, k.k->p.inode);
                        if (!percpu_ref_tryget(&ca->io_ref)) {
                                ca = NULL;
                                bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
                                continue;
                        }
                }

                seen++;

                if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) {
                        open++;
                        continue;
                }

                if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
                                c->journal.flushed_seq_ondisk,
                                k.k->p.inode, k.k->p.offset)) {
                        need_journal_commit++;
                        continue;
                }

                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_USE_RESERVE|
                                      BTREE_INSERT_NOFAIL,
                                bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done));
                if (ret)
                        break;

                discarded++;
        }
        bch2_trans_iter_exit(&trans, &iter);

        if (ca)
                percpu_ref_put(&ca->io_ref);

        bch2_trans_exit(&trans);

        if (need_journal_commit * 2 > seen)
                bch2_journal_flush_async(&c->journal, NULL);

        percpu_ref_put(&c->writes);

        trace_do_discards(c, seen, open, need_journal_commit, discarded, ret);
}

void bch2_do_discards(struct bch_fs *c)
{
        if (percpu_ref_tryget(&c->writes) &&
            !queue_work(system_long_wq, &c->discard_work))
                percpu_ref_put(&c->writes);
}

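/*
 * Invalidate the coldest cached bucket on @ca: take the bucket at the
 * head of the device's lru, increment its gen and zero its counters so
 * it can be reused; the alloc trigger updates the lru and freespace
 * btrees accordingly:
 */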
static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
{
        struct bch_fs *c = trans->c;
        struct btree_iter lru_iter, alloc_iter = { NULL };
        struct bkey_s_c k;
        struct bkey_i_alloc_v4 *a;
        u64 bucket, idx;
        int ret;

        bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
                             POS(ca->dev_idx, 0), 0);
        k = bch2_btree_iter_peek(&lru_iter);
        ret = bkey_err(k);
        if (ret)
                goto out;

        if (!k.k || k.k->p.inode != ca->dev_idx)
                goto out;

        if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c,
                                    "non lru key in lru btree"))
                goto out;

        idx     = k.k->p.offset;
        bucket  = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);

        a = bch2_trans_start_alloc_update(trans, &alloc_iter,
                                          POS(ca->dev_idx, bucket));
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;

        if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c,
                        "invalidating bucket with wrong lru idx (got %llu should be %llu)",
                        idx, alloc_lru_idx(a->v)))
                goto out;

        SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
        a->v.gen++;
        a->v.data_type          = 0;
        a->v.dirty_sectors      = 0;
        a->v.cached_sectors     = 0;
        a->v.io_time[READ]      = atomic64_read(&c->io_clock[READ].now);
        a->v.io_time[WRITE]     = atomic64_read(&c->io_clock[WRITE].now);

        ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
                                BTREE_TRIGGER_BUCKET_INVALIDATE);
out:
        bch2_trans_iter_exit(trans, &alloc_iter);
        bch2_trans_iter_exit(trans, &lru_iter);
        return ret;
}

static void bch2_do_invalidates_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
        struct bch_dev *ca;
        struct btree_trans trans;
        unsigned i;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_member_device(ca, c, i)
                while (!ret && should_invalidate_buckets(ca))
                        ret = __bch2_trans_do(&trans, NULL, NULL,
                                              BTREE_INSERT_USE_RESERVE|
                                              BTREE_INSERT_NOFAIL,
                                        invalidate_one_bucket(&trans, ca));

        bch2_trans_exit(&trans);
        percpu_ref_put(&c->writes);
}

void bch2_do_invalidates(struct bch_fs *c)
{
        if (percpu_ref_tryget(&c->writes))
                queue_work(system_long_wq, &c->invalidate_work);
}

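/*
 * Populate the freespace and need_discard btrees from the alloc btree
 * for one device, then mark it freespace_initialized in the superblock:
 */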
static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_alloc_v4 a;
        struct bch_member *m;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc,
                           POS(ca->dev_idx, ca->mi.first_bucket),
                           BTREE_ITER_SLOTS|
                           BTREE_ITER_PREFETCH, k, ret) {
                if (iter.pos.offset >= ca->mi.nbuckets)
                        break;

                bch2_alloc_to_v4(k, &a);
                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_LAZY_RW,
                                 bch2_bucket_do_index(&trans, k, a, true));
                if (ret)
                        break;
        }
        bch2_trans_iter_exit(&trans, &iter);

        bch2_trans_exit(&trans);

        if (ret) {
                bch_err(ca, "error initializing free space: %i", ret);
                return ret;
        }

        mutex_lock(&c->sb_lock);
        m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
        SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
        mutex_unlock(&c->sb_lock);

        return ret;
}

int bch2_fs_freespace_init(struct bch_fs *c)
{
        struct bch_dev *ca;
        unsigned i;
        int ret = 0;
        bool doing_init = false;

        /*
         * We can crash during the device add path, so we need to check this on
         * every mount:
         */

        for_each_member_device(ca, c, i) {
                if (ca->mi.freespace_initialized)
                        continue;

                if (!doing_init) {
                        bch_info(c, "initializing freespace");
                        doing_init = true;
                }

                ret = bch2_dev_freespace_init(c, ca);
                if (ret) {
                        percpu_ref_put(&ca->ref);
                        return ret;
                }
        }

        if (doing_init) {
                mutex_lock(&c->sb_lock);
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);

                bch_verbose(c, "done initializing freespace");
        }

        return ret;
}

/* Bucket IO clocks: */

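/* Update a bucket's io_time[rw] to the current io clock, if it's changed, and commit: */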
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
                              size_t bucket_nr, int rw)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_i_alloc_v4 *a;
        u64 now;
        int ret = 0;

        a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                return ret;

        now = atomic64_read(&c->io_clock[rw].now);
        if (a->v.io_time[rw] == now)
                goto out;

        a->v.io_time[rw] = now;

        ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
                bch2_trans_commit(trans, NULL, NULL, 0);
out:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

/* Startup/shutdown (ro/rw): */

void bch2_recalc_capacity(struct bch_fs *c)
{
        struct bch_dev *ca;
        u64 capacity = 0, reserved_sectors = 0, gc_reserve;
        unsigned bucket_size_max = 0;
        unsigned long ra_pages = 0;
        unsigned i;

        lockdep_assert_held(&c->state_lock);

        for_each_online_member(ca, c, i) {
                struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

                ra_pages += bdi->ra_pages;
        }

        bch2_set_ra_pages(c, ra_pages);

        for_each_rw_member(ca, c, i) {
                u64 dev_reserve = 0;

                /*
                 * We need to reserve buckets (from the number
                 * of currently available buckets) against
                 * foreground writes so that mainly copygc can
                 * make forward progress.
                 *
                 * We need enough to refill the various reserves
                 * from scratch - copygc will use its entire
                 * reserve all at once, then run again when
                 * its reserve is refilled (from the formerly
                 * available buckets).
                 *
                 * This reserve is just used when considering if
                 * allocations for foreground writes must wait -
                 * not -ENOSPC calculations.
                 */

                dev_reserve += ca->nr_btree_reserve * 2;
                dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */

                dev_reserve += 1;       /* btree write point */
                dev_reserve += 1;       /* copygc write point */
                dev_reserve += 1;       /* rebalance write point */

                dev_reserve *= ca->mi.bucket_size;

                capacity += bucket_to_sector(ca, ca->mi.nbuckets -
                                             ca->mi.first_bucket);

                reserved_sectors += dev_reserve * 2;

                bucket_size_max = max_t(unsigned, bucket_size_max,
                                        ca->mi.bucket_size);
        }

        gc_reserve = c->opts.gc_reserve_bytes
                ? c->opts.gc_reserve_bytes >> 9
                : div64_u64(capacity * c->opts.gc_reserve_percent, 100);

        reserved_sectors = max(gc_reserve, reserved_sectors);

        reserved_sectors = min(reserved_sectors, capacity);

        c->capacity = capacity - reserved_sectors;

        c->bucket_size_max = bucket_size_max;

        /* Wake up in case someone was waiting for buckets */
        closure_wake_up(&c->freelist_wait);
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
        struct open_bucket *ob;
        bool ret = false;

        for (ob = c->open_buckets;
             ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
             ob++) {
                spin_lock(&ob->lock);
                if (ob->valid && !ob->on_partial_list &&
                    ob->dev == ca->dev_idx)
                        ret = true;
                spin_unlock(&ob->lock);
        }

        return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
        unsigned i;

        /* First, remove device from allocation groups: */

        for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
                clear_bit(ca->dev_idx, c->rw_devs[i].d);

        /*
         * Capacity is calculated based off of devices in allocation groups:
         */
        bch2_recalc_capacity(c);

        /* Next, close write points that point to this device... */
        for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
                bch2_writepoint_stop(c, ca, &c->write_points[i]);

        bch2_writepoint_stop(c, ca, &c->copygc_write_point);
        bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
        bch2_writepoint_stop(c, ca, &c->btree_write_point);

        mutex_lock(&c->btree_reserve_cache_lock);
        while (c->btree_reserve_cache_nr) {
                struct btree_alloc *a =
                        &c->btree_reserve_cache[--c->btree_reserve_cache_nr];

                bch2_open_buckets_put(c, &a->ob);
        }
        mutex_unlock(&c->btree_reserve_cache_lock);

        while (1) {
                struct open_bucket *ob;

                spin_lock(&c->freelist_lock);
                if (!ca->open_buckets_partial_nr) {
                        spin_unlock(&c->freelist_lock);
                        break;
                }
                ob = c->open_buckets +
                        ca->open_buckets_partial[--ca->open_buckets_partial_nr];
                ob->on_partial_list = false;
                spin_unlock(&c->freelist_lock);

                bch2_open_bucket_put(c, ob);
        }

        bch2_ec_stop_dev(c, ca);

        /*
         * Wake up threads that were blocked on allocation, so they can notice
         * the device can no longer be removed and the capacity has changed:
         */
        closure_wake_up(&c->freelist_wait);

        /*
         * journal_res_get() can block waiting for free space in the journal -
         * it needs to notice there may not be devices to allocate from anymore:
         */
        wake_up(&c->journal.wait);

        /* Now wait for any in flight writes: */

        closure_wait_event(&c->open_buckets_wait,
                           !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
        unsigned i;

        for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
                if (ca->mi.data_allowed & (1 << i))
                        set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
        spin_lock_init(&c->freelist_lock);
        INIT_WORK(&c->discard_work, bch2_do_discards_work);
        INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}