// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <trace/events/bcachefs.h>

/* Persistent alloc info: */

static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
        BCH_ALLOC_FIELDS_V1()
#undef x
};

const char * const bch2_bucket_states[] = {
        "free",
        "need gc gens",
        "need discard",
        "cached",
        "dirty",
        NULL
};

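/*
 * In-memory scratch representation of an alloc key: collects the fields from
 * all of the on-disk alloc key versions (v1/v2/v3) so callers can work with
 * one struct regardless of which version they unpacked.
 */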
struct bkey_alloc_unpacked {
        u64             journal_seq;
        u64             bucket;
        u8              dev;
        u8              gen;
        u8              oldest_gen;
        u8              data_type;
        bool            need_discard:1;
        bool            need_inc_gen:1;
#define x(_name, _bits) u##_bits _name;
        BCH_ALLOC_FIELDS_V2()
#undef  x
};

static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
                                     const void **p, unsigned field)
{
        unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
        u64 v;

        if (!(a->fields & (1 << field)))
                return 0;

        switch (bytes) {
        case 1:
                v = *((const u8 *) *p);
                break;
        case 2:
                v = le16_to_cpup(*p);
                break;
        case 4:
                v = le32_to_cpup(*p);
                break;
        case 8:
                v = le64_to_cpup(*p);
                break;
        default:
                BUG();
        }

        *p += bytes;
        return v;
}

static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
                                      unsigned field, u64 v)
{
        unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];

        if (!v)
                return;

        a->v.fields |= 1 << field;

        switch (bytes) {
        case 1:
                *((u8 *) *p) = v;
                break;
        case 2:
                *((__le16 *) *p) = cpu_to_le16(v);
                break;
        case 4:
                *((__le32 *) *p) = cpu_to_le32(v);
                break;
        case 8:
                *((__le64 *) *p) = cpu_to_le64(v);
                break;
        default:
                BUG();
        }

        *p += bytes;
}

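/*
 * v1 alloc keys store a bitmap of present fields followed by the fields
 * themselves at fixed widths; v2 and v3 instead store a field count followed
 * by varint-encoded values, with missing trailing fields defaulting to 0.
 */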
static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
                                 struct bkey_s_c k)
{
        const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
        const void *d = in->data;
        unsigned idx = 0;

        out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
        BCH_ALLOC_FIELDS_V1()
#undef  x
}

static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
                                struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
        const u8 *in = a.v->data;
        const u8 *end = bkey_val_end(a);
        unsigned fieldnr = 0;
        int ret;
        u64 v;

        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;

#define x(_name, _bits)                                                 \
        if (fieldnr < a.v->nr_fields) {                                 \
                ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
        } else {                                                        \
                v = 0;                                                  \
        }                                                               \
        out->_name = v;                                                 \
        if (v != out->_name)                                            \
                return -1;                                              \
        fieldnr++;

        BCH_ALLOC_FIELDS_V2()
#undef  x
        return 0;
}

static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
                                struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
        const u8 *in = a.v->data;
        const u8 *end = bkey_val_end(a);
        unsigned fieldnr = 0;
        int ret;
        u64 v;

        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;
        out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
        out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
        out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)                                                 \
        if (fieldnr < a.v->nr_fields) {                                 \
                ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
        } else {                                                        \
                v = 0;                                                  \
        }                                                               \
        out->_name = v;                                                 \
        if (v != out->_name)                                            \
                return -1;                                              \
        fieldnr++;

        BCH_ALLOC_FIELDS_V2()
#undef  x
        return 0;
}

static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
        struct bkey_alloc_unpacked ret = {
                .dev    = k.k->p.inode,
                .bucket = k.k->p.offset,
                .gen    = 0,
        };

        switch (k.k->type) {
        case KEY_TYPE_alloc:
                bch2_alloc_unpack_v1(&ret, k);
                break;
        case KEY_TYPE_alloc_v2:
                bch2_alloc_unpack_v2(&ret, k);
                break;
        case KEY_TYPE_alloc_v3:
                bch2_alloc_unpack_v3(&ret, k);
                break;
        }

        return ret;
}

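/*
 * Convert an alloc key of any version to the current v4 representation;
 * older versions are unpacked field by field.
 */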
void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
        if (k.k->type == KEY_TYPE_alloc_v4) {
                *out = *bkey_s_c_to_alloc_v4(k).v;
        } else {
                struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

                *out = (struct bch_alloc_v4) {
                        .journal_seq            = u.journal_seq,
                        .flags                  = u.need_discard,
                        .gen                    = u.gen,
                        .oldest_gen             = u.oldest_gen,
                        .data_type              = u.data_type,
                        .stripe_redundancy      = u.stripe_redundancy,
                        .dirty_sectors          = u.dirty_sectors,
                        .cached_sectors         = u.cached_sectors,
                        .io_time[READ]          = u.read_time,
                        .io_time[WRITE]         = u.write_time,
                        .stripe                 = u.stripe,
                };
        }
}

struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
        struct bkey_i_alloc_v4 *ret;

        if (k.k->type == KEY_TYPE_alloc_v4) {
                ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
                if (!IS_ERR(ret))
                        bkey_reassemble(&ret->k_i, k);
        } else {
                ret = bch2_trans_kmalloc(trans, sizeof(*ret));
                if (!IS_ERR(ret)) {
                        bkey_alloc_v4_init(&ret->k_i);
                        ret->k.p = k.k->p;
                        bch2_alloc_to_v4(k, &ret->v);
                }
        }
        return ret;
}

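/*
 * Look up the alloc key at @pos and return a mutable v4 copy, leaving @iter
 * pointing at it so the caller can bch2_trans_update() it; on error the
 * iterator is exited here and the caller must not touch it.
 */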
struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
                              struct bpos pos)
{
        struct bkey_s_c k;
        struct bkey_i_alloc_v4 *a;
        int ret;

        bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
                             BTREE_ITER_WITH_UPDATES|
                             BTREE_ITER_CACHED|
                             BTREE_ITER_INTENT);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret) {
                bch2_trans_iter_exit(trans, iter);
                return ERR_PTR(ret);
        }

        a = bch2_alloc_to_v4_mut(trans, k);
        if (IS_ERR(a))
                bch2_trans_iter_exit(trans, iter);
        return a;
}

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
        unsigned i, bytes = offsetof(struct bch_alloc, data);

        for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
                if (a->fields & (1 << i))
                        bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

        return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          int rw, struct printbuf *err)
{
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

        /* allow for unknown fields */
        if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
                pr_buf(err, "incorrect value size (%zu < %u)",
                       bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
                return -EINVAL;
        }

        return 0;
}

int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          int rw, struct printbuf *err)
{
        struct bkey_alloc_unpacked u;

        if (bch2_alloc_unpack_v2(&u, k)) {
                pr_buf(err, "unpack error");
                return -EINVAL;
        }

        return 0;
}

int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          int rw, struct printbuf *err)
{
        struct bkey_alloc_unpacked u;

        if (bch2_alloc_unpack_v3(&u, k)) {
                pr_buf(err, "unpack error");
                return -EINVAL;
        }

        return 0;
}

int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          int rw, struct printbuf *err)
{
        struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);

        if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) {
                pr_buf(err, "bad val size (%zu != %zu)",
                       bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4));
                return -EINVAL;
        }

        if (rw == WRITE) {
                if (a.v->cached_sectors &&
                    !a.v->dirty_sectors &&
                    !a.v->io_time[READ]) {
                        pr_buf(err, "cached bucket with read_time == 0");
                        return -EINVAL;
                }

                if (!a.v->dirty_sectors &&
                    !a.v->cached_sectors &&
                    !a.v->stripe &&
                    a.v->data_type) {
                        pr_buf(err, "empty, but data_type nonzero");
                        return -EINVAL;
                }
        }

        return 0;
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
        struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;

        a->journal_seq          = swab64(a->journal_seq);
        a->flags                = swab32(a->flags);
        a->dirty_sectors        = swab32(a->dirty_sectors);
        a->cached_sectors       = swab32(a->cached_sectors);
        a->io_time[0]           = swab64(a->io_time[0]);
        a->io_time[1]           = swab64(a->io_time[1]);
        a->stripe               = swab32(a->stripe);
        a->nr_external_backpointers = swab32(a->nr_external_backpointers);
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
        struct bch_alloc_v4 a;

        bch2_alloc_to_v4(k, &a);

        pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu",
               a.gen, a.oldest_gen, bch2_data_types[a.data_type],
               a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a));
        pr_buf(out, " dirty_sectors %u",        a.dirty_sectors);
        pr_buf(out, " cached_sectors %u",       a.cached_sectors);
        pr_buf(out, " stripe %u",               a.stripe);
        pr_buf(out, " stripe_redundancy %u",    a.stripe_redundancy);
        pr_buf(out, " read_time %llu",          a.io_time[READ]);
        pr_buf(out, " write_time %llu",         a.io_time[WRITE]);
}

int bch2_alloc_read(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_alloc_v4 a;
        struct bch_dev *ca;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ca = bch_dev_bkey_exists(c, k.k->p.inode);
                bch2_alloc_to_v4(k, &a);

                *bucket_gen(ca, k.k->p.offset) = a.gen;
        }
        bch2_trans_iter_exit(&trans, &iter);

        bch2_trans_exit(&trans);

        if (ret)
                bch_err(c, "error reading alloc info: %i", ret);

        return ret;
}

/* Free space/discard btree: */

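/*
 * Set or clear the key corresponding to @alloc_k in whichever index btree
 * tracks its state: buckets in BUCKET_free appear in the freespace btree
 * (keyed by the bucket, with generation bits packed into the high bits of the
 * offset), buckets in BUCKET_need_discard in the need_discard btree; buckets
 * in any other state are indexed by neither.
 */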
static int bch2_bucket_do_index(struct btree_trans *trans,
                                struct bkey_s_c alloc_k,
                                struct bch_alloc_v4 a,
                                bool set)
{
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
        struct btree_iter iter;
        struct bkey_s_c old;
        struct bkey_i *k;
        enum bucket_state state = bucket_state(a);
        enum btree_id btree;
        enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
        enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
        struct printbuf buf = PRINTBUF;
        int ret;

        if (state != BUCKET_free &&
            state != BUCKET_need_discard)
                return 0;

        k = bch2_trans_kmalloc(trans, sizeof(*k));
        if (IS_ERR(k))
                return PTR_ERR(k);

        bkey_init(&k->k);
        k->k.type = new_type;

        switch (state) {
        case BUCKET_free:
                btree = BTREE_ID_freespace;
                k->k.p = alloc_freespace_pos(alloc_k.k->p, a);
                bch2_key_resize(&k->k, 1);
                break;
        case BUCKET_need_discard:
                btree = BTREE_ID_need_discard;
                k->k.p = alloc_k.k->p;
                break;
        default:
                return 0;
        }

        bch2_trans_iter_init(trans, &iter, btree,
                             bkey_start_pos(&k->k),
                             BTREE_ITER_INTENT);
        old = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(old);
        if (ret)
                goto err;

        if (ca->mi.freespace_initialized &&
            bch2_fs_inconsistent_on(old.k->type != old_type, c,
                        "incorrect key when %s %s btree (got %s should be %s)\n"
                        "  for %s",
                        set ? "setting" : "clearing",
                        bch2_btree_ids[btree],
                        bch2_bkey_types[old.k->type],
                        bch2_bkey_types[old_type],
                        (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                ret = -EIO;
                goto err;
        }

        ret = bch2_trans_update(trans, &iter, k, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        printbuf_exit(&buf);
        return ret;
}

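/*
 * Transactional trigger for alloc keys: keeps the freespace/need_discard
 * btrees and the LRU in sync with changes to a bucket's alloc state.
 */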
int bch2_trans_mark_alloc(struct btree_trans *trans,
                          struct bkey_s_c old, struct bkey_i *new,
                          unsigned flags)
{
        struct bch_fs *c = trans->c;
        struct bch_alloc_v4 old_a, *new_a;
        u64 old_lru, new_lru;
        int ret = 0;

        /*
         * Deletion only happens in the device removal path, with
         * BTREE_TRIGGER_NORUN:
         */
        BUG_ON(new->k.type != KEY_TYPE_alloc_v4);

        bch2_alloc_to_v4(old, &old_a);
        new_a = &bkey_i_to_alloc_v4(new)->v;

        if (new_a->dirty_sectors > old_a.dirty_sectors ||
            new_a->cached_sectors > old_a.cached_sectors) {
                new_a->io_time[READ]  = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
                new_a->io_time[WRITE] = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
                SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
                SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
        }

        if (old_a.data_type && !new_a->data_type &&
            old_a.gen == new_a->gen &&
            !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
                new_a->gen++;
                SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
        }

        if (bucket_state(old_a) != bucket_state(*new_a) ||
            (bucket_state(*new_a) == BUCKET_free &&
             alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) {
                ret =   bch2_bucket_do_index(trans, old, old_a, false) ?:
                        bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true);
                if (ret)
                        return ret;
        }

        old_lru = alloc_lru_idx(old_a);
        new_lru = alloc_lru_idx(*new_a);

        if (old_lru != new_lru) {
                ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
                                      old_lru, &new_lru);
                if (ret)
                        return ret;

                if (new_lru && new_a->io_time[READ] != new_lru)
                        new_a->io_time[READ] = new_lru;
        }

        return 0;
}

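/*
 * fsck: verify that the need_discard and freespace btrees agree with this
 * alloc key, rewriting the index entries if they don't.
 */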
static int bch2_check_alloc_key(struct btree_trans *trans,
                                struct btree_iter *alloc_iter)
{
        struct bch_fs *c = trans->c;
        struct bch_dev *ca;
        struct btree_iter discard_iter, freespace_iter;
        struct bch_alloc_v4 a;
        unsigned discard_key_type, freespace_key_type;
        struct bkey_s_c alloc_k, k;
        struct printbuf buf = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
        int ret;

        alloc_k = bch2_btree_iter_peek(alloc_iter);
        if (!alloc_k.k)
                return 0;

        ret = bkey_err(alloc_k);
        if (ret)
                return ret;

        if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
                        "alloc key for invalid device or bucket"))
                return bch2_btree_delete_at(trans, alloc_iter, 0);

        ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
        if (!ca->mi.freespace_initialized)
                return 0;

        bch2_alloc_to_v4(alloc_k, &a);

        discard_key_type = bucket_state(a) == BUCKET_need_discard
                ? KEY_TYPE_set : 0;
        freespace_key_type = bucket_state(a) == BUCKET_free
                ? KEY_TYPE_set : 0;

        bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard,
                             alloc_k.k->p, 0);
        bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace,
                             alloc_freespace_pos(alloc_k.k->p, a), 0);

        k = bch2_btree_iter_peek_slot(&discard_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (fsck_err_on(k.k->type != discard_key_type, c,
                        "incorrect key in need_discard btree (got %s should be %s)\n"
                        "  %s",
                        bch2_bkey_types[k.k->type],
                        bch2_bkey_types[discard_key_type],
                        (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                struct bkey_i *update =
                        bch2_trans_kmalloc(trans, sizeof(*update));

                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.type  = discard_key_type;
                update->k.p     = discard_iter.pos;

                ret = bch2_trans_update(trans, &discard_iter, update, 0);
                if (ret)
                        goto err;
        }

        k = bch2_btree_iter_peek_slot(&freespace_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (fsck_err_on(k.k->type != freespace_key_type, c,
                        "incorrect key in freespace btree (got %s should be %s)\n"
                        "  %s",
                        bch2_bkey_types[k.k->type],
                        bch2_bkey_types[freespace_key_type],
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                struct bkey_i *update =
                        bch2_trans_kmalloc(trans, sizeof(*update));

                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.type  = freespace_key_type;
                update->k.p     = freespace_iter.pos;
                bch2_key_resize(&update->k, 1);

                ret = bch2_trans_update(trans, &freespace_iter, update, 0);
                if (ret)
                        goto err;
        }
err:
fsck_err:
        bch2_trans_iter_exit(trans, &freespace_iter);
        bch2_trans_iter_exit(trans, &discard_iter);
        printbuf_exit(&buf2);
        printbuf_exit(&buf);
        return ret;
}

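/*
 * fsck, the other direction: verify that an entry in the need_discard or
 * freespace btree points at a bucket whose alloc key is actually in that
 * state, deleting stray index entries.
 */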
static int bch2_check_discard_freespace_key(struct btree_trans *trans,
                                            struct btree_iter *iter)
{
        struct bch_fs *c = trans->c;
        struct btree_iter alloc_iter;
        struct bkey_s_c k, freespace_k;
        struct bch_alloc_v4 a;
        u64 genbits;
        struct bpos pos;
        struct bkey_i *update;
        enum bucket_state state = iter->btree_id == BTREE_ID_need_discard
                ? BUCKET_need_discard
                : BUCKET_free;
        struct printbuf buf = PRINTBUF;
        int ret;

        freespace_k = bch2_btree_iter_peek(iter);
        if (!freespace_k.k)
                return 1;

        ret = bkey_err(freespace_k);
        if (ret)
                return ret;

        pos = iter->pos;
        pos.offset &= ~(~0ULL << 56);
        genbits = iter->pos.offset & (~0ULL << 56);

        bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);

        if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
                        "%llu:%llu set in %s btree but device or bucket does not exist",
                        pos.inode, pos.offset,
                        bch2_btree_ids[iter->btree_id]))
                goto delete;

        k = bch2_btree_iter_peek_slot(&alloc_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        bch2_alloc_to_v4(k, &a);

        if (fsck_err_on(bucket_state(a) != state ||
                        (state == BUCKET_free &&
                         genbits != alloc_freespace_genbits(a)), c,
                        "%s\n  incorrectly set in %s index (free %u, genbits %llu should be %llu)",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
                        bch2_btree_ids[iter->btree_id],
                        bucket_state(a) == state,
                        genbits >> 56, alloc_freespace_genbits(a) >> 56))
                goto delete;
out:
err:
fsck_err:
        bch2_trans_iter_exit(trans, &alloc_iter);
        printbuf_exit(&buf);
        return ret;
delete:
        if (iter->btree_id == BTREE_ID_freespace) {
                /* should probably add a helper for deleting extents */
                update = bch2_trans_kmalloc(trans, sizeof(*update));
                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.p = iter->pos;
                bch2_key_resize(&update->k, 1);

                ret = bch2_trans_update(trans, iter, update, 0);
        } else {
                ret = bch2_btree_delete_at(trans, iter, 0);
        }
        goto out;
}

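/*
 * Walk the alloc, need_discard and freespace btrees in turn, running the
 * consistency checks above; each key is checked in its own transaction so
 * that repairs commit incrementally.
 */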
int bch2_check_alloc_info(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_check_alloc_key(&trans, &iter));
                if (ret)
                        break;
        }
        bch2_trans_iter_exit(&trans, &iter);

        if (ret)
                goto err;

        bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN,
                             BTREE_ITER_PREFETCH);
        while (1) {
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_check_discard_freespace_key(&trans, &iter));
                if (ret)
                        break;

                bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
        }
        bch2_trans_iter_exit(&trans, &iter);

        if (ret)
                goto err;

        bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN,
                             BTREE_ITER_PREFETCH);
        while (1) {
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_check_discard_freespace_key(&trans, &iter));
                if (ret)
                        break;

                bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
        }
        bch2_trans_iter_exit(&trans, &iter);
err:
        bch2_trans_exit(&trans);
        return ret < 0 ? ret : 0;
}

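/*
 * fsck: every cached bucket must have an LRU btree entry keyed by its
 * read_time; create or correct the entry (and the read_time itself, if it
 * was zero) when they disagree.
 */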
static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
                                       struct btree_iter *alloc_iter)
{
        struct bch_fs *c = trans->c;
        struct btree_iter lru_iter;
        struct bch_alloc_v4 a;
        struct bkey_s_c alloc_k, k;
        struct printbuf buf = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
        int ret;

        alloc_k = bch2_btree_iter_peek(alloc_iter);
        if (!alloc_k.k)
                return 0;

        ret = bkey_err(alloc_k);
        if (ret)
                return ret;

        bch2_alloc_to_v4(alloc_k, &a);

        if (bucket_state(a) != BUCKET_cached)
                return 0;

        bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
                             POS(alloc_k.k->p.inode, a.io_time[READ]), 0);

        k = bch2_btree_iter_peek_slot(&lru_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (fsck_err_on(!a.io_time[READ], c,
                        "cached bucket with read_time 0\n"
                        "  %s",
                (printbuf_reset(&buf),
                 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
            fsck_err_on(k.k->type != KEY_TYPE_lru ||
                        le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c,
                        "incorrect/missing lru entry\n"
                        "  %s\n"
                        "  %s",
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
                        (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
                u64 read_time = a.io_time[READ];

                if (!a.io_time[READ])
                        a.io_time[READ] = atomic64_read(&c->io_clock[READ].now);

                ret = bch2_lru_change(trans,
                                      alloc_k.k->p.inode,
                                      alloc_k.k->p.offset,
                                      0, &a.io_time[READ]);
                if (ret)
                        goto err;

                if (a.io_time[READ] != read_time) {
                        struct bkey_i_alloc_v4 *a_mut =
                                bch2_alloc_to_v4_mut(trans, alloc_k);
                        ret = PTR_ERR_OR_ZERO(a_mut);
                        if (ret)
                                goto err;

                        a_mut->v.io_time[READ] = a.io_time[READ];
                        ret = bch2_trans_update(trans, alloc_iter,
                                                &a_mut->k_i, BTREE_TRIGGER_NORUN);
                        if (ret)
                                goto err;
                }
        }
err:
fsck_err:
        bch2_trans_iter_exit(trans, &lru_iter);
        printbuf_exit(&buf2);
        printbuf_exit(&buf);
        return ret;
}

int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|
                                      BTREE_INSERT_LAZY_RW,
                        bch2_check_alloc_to_lru_ref(&trans, &iter));
                if (ret)
                        break;
        }
        bch2_trans_iter_exit(&trans, &iter);

        bch2_trans_exit(&trans);
        return ret < 0 ? ret : 0;
}

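/*
 * Discard one bucket and clear its need_discard flag (the alloc trigger then
 * removes it from the need_discard btree); if the bucket was marked as
 * needing its gen incremented, just bump the gen and write the key instead.
 */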
static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos,
                                   struct bch_dev *ca, bool *discard_done)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_alloc_v4 *a;
        struct printbuf buf = PRINTBUF;
        int ret;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos,
                             BTREE_ITER_CACHED);
        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto out;

        a = bch2_alloc_to_v4_mut(trans, k);
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;

        if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
                a->v.gen++;
                SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
                goto write;
        }

        BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk);

        if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c,
                        "%s\n  incorrectly set in need_discard btree",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                ret = -EIO;
                goto out;
        }

        if (!*discard_done && ca->mi.discard && !c->opts.nochanges) {
                /*
                 * This works without any other locks because this is the only
                 * thread that removes items from the need_discard tree
                 */
                bch2_trans_unlock(trans);
                blkdev_issue_discard(ca->disk_sb.bdev,
                                     k.k->p.offset * ca->mi.bucket_size,
                                     ca->mi.bucket_size,
                                     GFP_KERNEL, 0);
                *discard_done = true;

                ret = bch2_trans_relock(trans) ? 0 : -EINTR;
                if (ret)
                        goto out;
        }

        SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
write:
        ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
out:
        bch2_trans_iter_exit(trans, &iter);
        printbuf_exit(&buf);
        return ret;
}

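/*
 * Background worker: walk the need_discard btree and discard each bucket
 * that isn't still open or waiting on a journal commit; if more than half
 * the buckets seen are blocked on the journal, kick off a flush so they can
 * be discarded soon.
 */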
static void bch2_do_discards_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
        struct bch_dev *ca = NULL;
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_need_discard,
                           POS_MIN, 0, k, ret) {
                bool discard_done = false;

                if (ca && k.k->p.inode != ca->dev_idx) {
                        percpu_ref_put(&ca->io_ref);
                        ca = NULL;
                }

                if (!ca) {
                        ca = bch_dev_bkey_exists(c, k.k->p.inode);
                        if (!percpu_ref_tryget(&ca->io_ref)) {
                                ca = NULL;
                                bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
                                continue;
                        }
                }

                seen++;

                if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) {
                        open++;
                        continue;
                }

                if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
                                c->journal.flushed_seq_ondisk,
                                k.k->p.inode, k.k->p.offset)) {
                        need_journal_commit++;
                        continue;
                }

                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_USE_RESERVE|
                                      BTREE_INSERT_NOFAIL,
                                bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done));
                if (ret)
                        break;

                discarded++;
        }
        bch2_trans_iter_exit(&trans, &iter);

        if (ca)
                percpu_ref_put(&ca->io_ref);

        bch2_trans_exit(&trans);

        if (need_journal_commit * 2 > seen)
                bch2_journal_flush_async(&c->journal, NULL);

        percpu_ref_put(&c->writes);

        trace_do_discards(c, seen, open, need_journal_commit, discarded, ret);
}

void bch2_do_discards(struct bch_fs *c)
{
        if (percpu_ref_tryget(&c->writes) &&
            !queue_work(system_long_wq, &c->discard_work))
                percpu_ref_put(&c->writes);
}

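/*
 * Take the least recently used cached bucket on @ca off the LRU and
 * invalidate it: bump the gen (so existing cached pointers no longer match)
 * and zero its counters so it becomes available for allocation again.
 */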
static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
{
        struct bch_fs *c = trans->c;
        struct btree_iter lru_iter, alloc_iter = { NULL };
        struct bkey_s_c k;
        struct bkey_i_alloc_v4 *a;
        u64 bucket, idx;
        int ret;

        bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
                             POS(ca->dev_idx, 0), 0);
        k = bch2_btree_iter_peek(&lru_iter);
        ret = bkey_err(k);
        if (ret)
                goto out;

        if (!k.k || k.k->p.inode != ca->dev_idx)
                goto out;

        if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c,
                                    "non lru key in lru btree"))
                goto out;

        idx     = k.k->p.offset;
        bucket  = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);

        a = bch2_trans_start_alloc_update(trans, &alloc_iter,
                                          POS(ca->dev_idx, bucket));
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;

        if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c,
                        "invalidating bucket with wrong lru idx (got %llu should be %llu)",
                        idx, alloc_lru_idx(a->v)))
                goto out;

        SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
        a->v.gen++;
        a->v.data_type          = 0;
        a->v.dirty_sectors      = 0;
        a->v.cached_sectors     = 0;
        a->v.io_time[READ]      = atomic64_read(&c->io_clock[READ].now);
        a->v.io_time[WRITE]     = atomic64_read(&c->io_clock[WRITE].now);

        ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
                                BTREE_TRIGGER_BUCKET_INVALIDATE);
out:
        bch2_trans_iter_exit(trans, &alloc_iter);
        bch2_trans_iter_exit(trans, &lru_iter);
        return ret;
}

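/*
 * Background worker: invalidate cached buckets, one transaction each, until
 * should_invalidate_buckets() is satisfied for every device.
 */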
static void bch2_do_invalidates_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
        struct bch_dev *ca;
        struct btree_trans trans;
        unsigned i;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_member_device(ca, c, i)
                while (!ret && should_invalidate_buckets(ca))
                        ret = __bch2_trans_do(&trans, NULL, NULL,
                                              BTREE_INSERT_USE_RESERVE|
                                              BTREE_INSERT_NOFAIL,
                                        invalidate_one_bucket(&trans, ca));

        bch2_trans_exit(&trans);
        percpu_ref_put(&c->writes);
}

void bch2_do_invalidates(struct bch_fs *c)
{
        if (percpu_ref_tryget(&c->writes))
                queue_work(system_long_wq, &c->invalidate_work);
}

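/*
 * Build the freespace/need_discard index entries for every bucket on @ca,
 * then mark the member as initialized in the superblock so this only has to
 * run once per device.
 */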
static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_alloc_v4 a;
        struct bch_member *m;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc,
                           POS(ca->dev_idx, ca->mi.first_bucket),
                           BTREE_ITER_SLOTS|
                           BTREE_ITER_PREFETCH, k, ret) {
                if (iter.pos.offset >= ca->mi.nbuckets)
                        break;

                bch2_alloc_to_v4(k, &a);
                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_LAZY_RW,
                                 bch2_bucket_do_index(&trans, k, a, true));
                if (ret)
                        break;
        }
        bch2_trans_iter_exit(&trans, &iter);

        bch2_trans_exit(&trans);

        if (ret) {
                bch_err(ca, "error initializing free space: %i", ret);
                return ret;
        }

        mutex_lock(&c->sb_lock);
        m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
        SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
        mutex_unlock(&c->sb_lock);

        return ret;
}

int bch2_fs_freespace_init(struct bch_fs *c)
{
        struct bch_dev *ca;
        unsigned i;
        int ret = 0;
        bool doing_init = false;

        /*
         * We can crash during the device add path, so we need to check this on
         * every mount:
         */

        for_each_member_device(ca, c, i) {
                if (ca->mi.freespace_initialized)
                        continue;

                if (!doing_init) {
                        bch_info(c, "initializing freespace");
                        doing_init = true;
                }

                ret = bch2_dev_freespace_init(c, ca);
                if (ret) {
                        percpu_ref_put(&ca->ref);
                        return ret;
                }
        }

        if (doing_init) {
                mutex_lock(&c->sb_lock);
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);

                bch_verbose(c, "done initializing freespace");
        }

        return ret;
}

/* Bucket IO clocks: */

int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
                              size_t bucket_nr, int rw)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_i_alloc_v4 *a;
        u64 now;
        int ret = 0;

        a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                return ret;

        now = atomic64_read(&c->io_clock[rw].now);
        if (a->v.io_time[rw] == now)
                goto out;

        a->v.io_time[rw] = now;

        ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
                bch2_trans_commit(trans, NULL, NULL, 0);
out:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

/* Startup/shutdown (ro/rw): */

void bch2_recalc_capacity(struct bch_fs *c)
{
        struct bch_dev *ca;
        u64 capacity = 0, reserved_sectors = 0, gc_reserve;
        unsigned bucket_size_max = 0;
        unsigned long ra_pages = 0;
        unsigned i;

        lockdep_assert_held(&c->state_lock);

        for_each_online_member(ca, c, i) {
                struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

                ra_pages += bdi->ra_pages;
        }

        bch2_set_ra_pages(c, ra_pages);

        for_each_rw_member(ca, c, i) {
                u64 dev_reserve = 0;

                /*
                 * We need to reserve buckets (from the number
                 * of currently available buckets) against
                 * foreground writes so that mainly copygc can
                 * make forward progress.
                 *
                 * We need enough to refill the various reserves
                 * from scratch - copygc will use its entire
                 * reserve all at once, then run again when
                 * its reserve is refilled (from the formerly
                 * available buckets).
                 *
                 * This reserve is just used when considering if
                 * allocations for foreground writes must wait -
                 * not -ENOSPC calculations.
                 */

                dev_reserve += ca->nr_btree_reserve * 2;
                dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */

                dev_reserve += 1;       /* btree write point */
                dev_reserve += 1;       /* copygc write point */
                dev_reserve += 1;       /* rebalance write point */

                dev_reserve *= ca->mi.bucket_size;

                capacity += bucket_to_sector(ca, ca->mi.nbuckets -
                                             ca->mi.first_bucket);

                reserved_sectors += dev_reserve * 2;

                bucket_size_max = max_t(unsigned, bucket_size_max,
                                        ca->mi.bucket_size);
        }

        gc_reserve = c->opts.gc_reserve_bytes
                ? c->opts.gc_reserve_bytes >> 9
                : div64_u64(capacity * c->opts.gc_reserve_percent, 100);

        reserved_sectors = max(gc_reserve, reserved_sectors);

        reserved_sectors = min(reserved_sectors, capacity);

        c->capacity = capacity - reserved_sectors;

        c->bucket_size_max = bucket_size_max;

        /* Wake up in case someone was waiting for buckets */
        closure_wake_up(&c->freelist_wait);
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
        struct open_bucket *ob;
        bool ret = false;

        for (ob = c->open_buckets;
             ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
             ob++) {
                spin_lock(&ob->lock);
                if (ob->valid && !ob->on_partial_list &&
                    ob->dev == ca->dev_idx)
                        ret = true;
                spin_unlock(&ob->lock);
        }

        return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
        unsigned i;

        /* First, remove device from allocation groups: */

        for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
                clear_bit(ca->dev_idx, c->rw_devs[i].d);

        /*
         * Capacity is calculated based off of devices in allocation groups:
         */
        bch2_recalc_capacity(c);

        /* Next, close write points that point to this device... */
        for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
                bch2_writepoint_stop(c, ca, &c->write_points[i]);

        bch2_writepoint_stop(c, ca, &c->copygc_write_point);
        bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
        bch2_writepoint_stop(c, ca, &c->btree_write_point);

        mutex_lock(&c->btree_reserve_cache_lock);
        while (c->btree_reserve_cache_nr) {
                struct btree_alloc *a =
                        &c->btree_reserve_cache[--c->btree_reserve_cache_nr];

                bch2_open_buckets_put(c, &a->ob);
        }
        mutex_unlock(&c->btree_reserve_cache_lock);

        while (1) {
                struct open_bucket *ob;

                spin_lock(&c->freelist_lock);
                if (!ca->open_buckets_partial_nr) {
                        spin_unlock(&c->freelist_lock);
                        break;
                }
                ob = c->open_buckets +
                        ca->open_buckets_partial[--ca->open_buckets_partial_nr];
                ob->on_partial_list = false;
                spin_unlock(&c->freelist_lock);

                bch2_open_bucket_put(c, ob);
        }

        bch2_ec_stop_dev(c, ca);

        /*
         * Wake up threads that were blocked on allocation, so they can notice
         * the device can no longer be removed and the capacity has changed:
         */
        closure_wake_up(&c->freelist_wait);

        /*
         * journal_res_get() can block waiting for free space in the journal -
         * it needs to notice there may not be devices to allocate from anymore:
         */
        wake_up(&c->journal.wait);

        /* Now wait for any in flight writes: */

        closure_wait_event(&c->open_buckets_wait,
                           !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
        unsigned i;

        for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
                if (ca->mi.data_allowed & (1 << i))
                        set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
        spin_lock_init(&c->freelist_lock);
        INIT_WORK(&c->discard_work, bch2_do_discards_work);
        INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}