// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "trace.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>

/* Persistent alloc info: */

static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
        BCH_ALLOC_FIELDS_V1()
#undef x
};

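/*
 * Common in-memory representation for the versioned pre-v4 alloc key
 * formats; v1/v2/v3 keys are all decoded into this (v4 keys are converted
 * separately, in __bch2_alloc_to_v4()):
 */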
struct bkey_alloc_unpacked {
        u64             journal_seq;
        u8              gen;
        u8              oldest_gen;
        u8              data_type;
        bool            need_discard:1;
        bool            need_inc_gen:1;
#define x(_name, _bits) u##_bits _name;
        BCH_ALLOC_FIELDS_V2()
#undef  x
};

static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
                                     const void **p, unsigned field)
{
        unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
        u64 v;

        if (!(a->fields & (1 << field)))
                return 0;

        switch (bytes) {
        case 1:
                v = *((const u8 *) *p);
                break;
        case 2:
                v = le16_to_cpup(*p);
                break;
        case 4:
                v = le32_to_cpup(*p);
                break;
        case 8:
                v = le64_to_cpup(*p);
                break;
        default:
                BUG();
        }

        *p += bytes;
        return v;
}

static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
                                 struct bkey_s_c k)
{
        const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
        const void *d = in->data;
        unsigned idx = 0;

        out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
        BCH_ALLOC_FIELDS_V1()
#undef  x
}

static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
                                struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
        const u8 *in = a.v->data;
        const u8 *end = bkey_val_end(a);
        unsigned fieldnr = 0;
        int ret;
        u64 v;

        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;

#define x(_name, _bits)                                                 \
        if (fieldnr < a.v->nr_fields) {                                 \
                ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
        } else {                                                        \
                v = 0;                                                  \
        }                                                               \
        out->_name = v;                                                 \
        if (v != out->_name)                                            \
                return -1;                                              \
        fieldnr++;

        BCH_ALLOC_FIELDS_V2()
#undef  x
        return 0;
}

static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
                                struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
        const u8 *in = a.v->data;
        const u8 *end = bkey_val_end(a);
        unsigned fieldnr = 0;
        int ret;
        u64 v;

        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;
        out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
        out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
        out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)                                                 \
        if (fieldnr < a.v->nr_fields) {                                 \
                ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
        } else {                                                        \
                v = 0;                                                  \
        }                                                               \
        out->_name = v;                                                 \
        if (v != out->_name)                                            \
                return -1;                                              \
        fieldnr++;

        BCH_ALLOC_FIELDS_V2()
#undef  x
        return 0;
}

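/* Decode any pre-v4 alloc key; unknown key types decode as gen 0: */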
static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
        struct bkey_alloc_unpacked ret = { .gen = 0 };

        switch (k.k->type) {
        case KEY_TYPE_alloc:
                bch2_alloc_unpack_v1(&ret, k);
                break;
        case KEY_TYPE_alloc_v2:
                bch2_alloc_unpack_v2(&ret, k);
                break;
        case KEY_TYPE_alloc_v3:
                bch2_alloc_unpack_v3(&ret, k);
                break;
        }

        return ret;
}

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
        unsigned i, bytes = offsetof(struct bch_alloc, data);

        for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
                if (a->fields & (1 << i))
                        bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

        return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
{
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

        /* allow for unknown fields */
        if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
                prt_printf(err, "incorrect value size (%zu < %u)",
                       bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
                return -BCH_ERR_invalid_bkey;
        }

        return 0;
}

int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
{
        struct bkey_alloc_unpacked u;

        if (bch2_alloc_unpack_v2(&u, k)) {
                prt_printf(err, "unpack error");
                return -BCH_ERR_invalid_bkey;
        }

        return 0;
}

int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
{
        struct bkey_alloc_unpacked u;

        if (bch2_alloc_unpack_v3(&u, k)) {
                prt_printf(err, "unpack error");
                return -BCH_ERR_invalid_bkey;
        }

        return 0;
}

int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
{
        struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
        int rw = flags & WRITE;

        if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
                prt_printf(err, "bad val size (%u > %lu)",
                       alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
                return -BCH_ERR_invalid_bkey;
        }

        if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
            BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
                prt_printf(err, "invalid backpointers_start");
                return -BCH_ERR_invalid_bkey;
        }

        if (rw == WRITE &&
            !(flags & BKEY_INVALID_JOURNAL) &&
            c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) {
                unsigned i, bp_len = 0;

                for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
                        bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len;

                if (bp_len > a.v->dirty_sectors) {
                        prt_printf(err, "too many backpointers");
                        return -BCH_ERR_invalid_bkey;
                }
        }

        if (rw == WRITE) {
                if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
                        prt_printf(err, "invalid data type (got %u should be %u)",
                               a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
                        return -BCH_ERR_invalid_bkey;
                }

                switch (a.v->data_type) {
                case BCH_DATA_free:
                case BCH_DATA_need_gc_gens:
                case BCH_DATA_need_discard:
                        if (a.v->dirty_sectors ||
                            a.v->cached_sectors ||
                            a.v->stripe) {
                                prt_printf(err, "empty data type free but have data");
                                return -BCH_ERR_invalid_bkey;
                        }
                        break;
                case BCH_DATA_sb:
                case BCH_DATA_journal:
                case BCH_DATA_btree:
                case BCH_DATA_user:
                case BCH_DATA_parity:
                        if (!a.v->dirty_sectors) {
                                prt_printf(err, "data_type %s but dirty_sectors==0",
                                       bch2_data_types[a.v->data_type]);
                                return -BCH_ERR_invalid_bkey;
                        }
                        break;
                case BCH_DATA_cached:
                        if (!a.v->cached_sectors ||
                            a.v->dirty_sectors ||
                            a.v->stripe) {
                                prt_printf(err, "data type inconsistency");
                                return -BCH_ERR_invalid_bkey;
                        }

                        if (!a.v->io_time[READ] &&
                            c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) {
                                prt_printf(err, "cached bucket with read_time == 0");
                                return -BCH_ERR_invalid_bkey;
                        }
                        break;
                case BCH_DATA_stripe:
                        if (!a.v->stripe) {
                                prt_printf(err, "data_type %s but stripe==0",
                                       bch2_data_types[a.v->data_type]);
                                return -BCH_ERR_invalid_bkey;
                        }
                        break;
                }
        }

        return 0;
}

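/* Byteswap a 40 bit integer, for struct bch_backpointer's bucket_offset: */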
static inline u64 swab40(u64 x)
{
        return (((x & 0x00000000ffULL) << 32)|
                ((x & 0x000000ff00ULL) << 16)|
                ((x & 0x0000ff0000ULL) >>  0)|
                ((x & 0x00ff000000ULL) >> 16)|
                ((x & 0xff00000000ULL) >> 32));
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
        struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
        struct bch_backpointer *bp, *bps;

        a->journal_seq          = swab64(a->journal_seq);
        a->flags                = swab32(a->flags);
        a->dirty_sectors        = swab32(a->dirty_sectors);
        a->cached_sectors       = swab32(a->cached_sectors);
        a->io_time[0]           = swab64(a->io_time[0]);
        a->io_time[1]           = swab64(a->io_time[1]);
        a->stripe               = swab32(a->stripe);
        a->nr_external_backpointers = swab32(a->nr_external_backpointers);

        bps = alloc_v4_backpointers(a);
        for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
                bp->bucket_offset       = swab40(bp->bucket_offset);
                bp->bucket_len          = swab32(bp->bucket_len);
                bch2_bpos_swab(&bp->pos);
        }
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
        struct bch_alloc_v4 _a;
        const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
        unsigned i;

        prt_newline(out);
        printbuf_indent_add(out, 2);

        prt_printf(out, "gen %u oldest_gen %u data_type %s",
               a->gen, a->oldest_gen,
               a->data_type < BCH_DATA_NR
               ? bch2_data_types[a->data_type]
               : "(invalid data type)");
        prt_newline(out);
        prt_printf(out, "journal_seq       %llu",       a->journal_seq);
        prt_newline(out);
        prt_printf(out, "need_discard      %llu",       BCH_ALLOC_V4_NEED_DISCARD(a));
        prt_newline(out);
        prt_printf(out, "need_inc_gen      %llu",       BCH_ALLOC_V4_NEED_INC_GEN(a));
        prt_newline(out);
        prt_printf(out, "dirty_sectors     %u", a->dirty_sectors);
        prt_newline(out);
        prt_printf(out, "cached_sectors    %u", a->cached_sectors);
        prt_newline(out);
        prt_printf(out, "stripe            %u", a->stripe);
        prt_newline(out);
        prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
        prt_newline(out);
        prt_printf(out, "io_time[READ]     %llu",       a->io_time[READ]);
        prt_newline(out);
        prt_printf(out, "io_time[WRITE]    %llu",       a->io_time[WRITE]);
        prt_newline(out);
        prt_printf(out, "fragmentation     %llu",       a->fragmentation_lru);
        prt_newline(out);
        prt_printf(out, "bp_start          %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
        prt_newline(out);

        if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
                struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
                const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);

                prt_printf(out, "backpointers:     %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
                printbuf_indent_add(out, 2);

                for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
                        prt_newline(out);
                        bch2_backpointer_to_text(out, &bps[i]);
                }

                printbuf_indent_sub(out, 2);
        }

        printbuf_indent_sub(out, 2);
}

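/*
 * Convert an alloc key of any version to the current in-memory (v4) form;
 * for v4 keys this is a copy that also clears out the backpointers region:
 */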
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
        if (k.k->type == KEY_TYPE_alloc_v4) {
                void *src, *dst;

                *out = *bkey_s_c_to_alloc_v4(k).v;

                src = alloc_v4_backpointers(out);
                SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
                dst = alloc_v4_backpointers(out);

                if (src < dst)
                        memset(src, 0, dst - src);

                SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
        } else {
                struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

                *out = (struct bch_alloc_v4) {
                        .journal_seq            = u.journal_seq,
                        .flags                  = u.need_discard,
                        .gen                    = u.gen,
                        .oldest_gen             = u.oldest_gen,
                        .data_type              = u.data_type,
                        .stripe_redundancy      = u.stripe_redundancy,
                        .dirty_sectors          = u.dirty_sectors,
                        .cached_sectors         = u.cached_sectors,
                        .io_time[READ]          = u.read_time,
                        .io_time[WRITE]         = u.write_time,
                        .stripe                 = u.stripe,
                };

                SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
        }
}

static noinline struct bkey_i_alloc_v4 *
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
        struct bkey_i_alloc_v4 *ret;

        ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
        if (IS_ERR(ret))
                return ret;

        if (k.k->type == KEY_TYPE_alloc_v4) {
                void *src, *dst;

                bkey_reassemble(&ret->k_i, k);

                src = alloc_v4_backpointers(&ret->v);
                SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
                dst = alloc_v4_backpointers(&ret->v);

                if (src < dst)
                        memset(src, 0, dst - src);

                SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
                set_alloc_v4_u64s(ret);
        } else {
                bkey_alloc_v4_init(&ret->k_i);
                ret->k.p = k.k->p;
                bch2_alloc_to_v4(k, &ret->v);
        }
        return ret;
}

static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
{
        struct bkey_s_c_alloc_v4 a;

        if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
            ((a = bkey_s_c_to_alloc_v4(k), true) &&
             BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
                return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);

        return __bch2_alloc_to_v4_mut(trans, k);
}

struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
        return bch2_alloc_to_v4_mut_inlined(trans, k);
}

struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
                              struct bpos pos)
{
        struct bkey_s_c k;
        struct bkey_i_alloc_v4 *a;
        int ret;

        k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
                             BTREE_ITER_WITH_UPDATES|
                             BTREE_ITER_CACHED|
                             BTREE_ITER_INTENT);
        ret = bkey_err(k);
        if (unlikely(ret))
                return ERR_PTR(ret);

        a = bch2_alloc_to_v4_mut_inlined(trans, k);
        ret = PTR_ERR_OR_ZERO(a);
        if (unlikely(ret))
                goto err;
        return a;
err:
        bch2_trans_iter_exit(trans, iter);
        return ERR_PTR(ret);
}

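/*
 * A single bucket_gens key packs the gens of many consecutive buckets;
 * alloc_gens_pos() maps a bucket to the covering bucket_gens key plus the
 * slot within it.  For example, with an 8 bit shift (256 gens per key,
 * illustrative only), bucket 0x1234 would map to the key at offset 0x12,
 * slot 0x34:
 */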
static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
        *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;

        pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
        return pos;
}

static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
{
        pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
        pos.offset += offset;
        return pos;
}

static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
{
        return k.k->type == KEY_TYPE_bucket_gens
                ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
                : 0;
}

int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
                             enum bkey_invalid_flags flags,
                             struct printbuf *err)
{
        if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
                prt_printf(err, "bad val size (%lu != %zu)",
                       bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
                return -BCH_ERR_invalid_bkey;
        }

        return 0;
}

void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
        struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
        unsigned i;

        for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
                if (i)
                        prt_char(out, ' ');
                prt_printf(out, "%u", g.v->gens[i]);
        }
}

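/*
 * Build the bucket_gens btree from scratch: walk every alloc key, packing
 * gens into bucket_gens keys and committing each one as we cross into the
 * next key's range:
 */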
int bch2_bucket_gens_init(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_alloc_v4 a;
        struct bkey_i_bucket_gens g;
        bool have_bucket_gens_key = false;
        unsigned offset;
        struct bpos pos;
        u8 gen;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                /*
                 * Not a fsck error because this is checked/repaired by
                 * bch2_check_alloc_key() which runs later:
                 */
                if (!bch2_dev_bucket_exists(c, k.k->p))
                        continue;

                gen = bch2_alloc_to_v4(k, &a)->gen;
                pos = alloc_gens_pos(iter.pos, &offset);

                if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
                        ret = commit_do(&trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_LAZY_RW,
                                __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
                        if (ret)
                                break;
                        have_bucket_gens_key = false;
                }

                if (!have_bucket_gens_key) {
                        bkey_bucket_gens_init(&g.k_i);
                        g.k.p = pos;
                        have_bucket_gens_key = true;
                }

                g.v.gens[offset] = gen;
        }
        bch2_trans_iter_exit(&trans, &iter);

        if (have_bucket_gens_key && !ret)
                ret = commit_do(&trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL|
                                BTREE_INSERT_LAZY_RW,
                        __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));

        bch2_trans_exit(&trans);

        if (ret)
                bch_err_fn(c, ret);
        return ret;
}

int bch2_alloc_read(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
        int ret;

        down_read(&c->gc_lock);
        bch2_trans_init(&trans, c, 0, 0);

        if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
                const struct bch_bucket_gens *g;
                u64 b;

                for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN,
                                   BTREE_ITER_PREFETCH, k, ret) {
                        u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
                        u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;

                        if (k.k->type != KEY_TYPE_bucket_gens)
                                continue;

                        g = bkey_s_c_to_bucket_gens(k).v;

                        /*
                         * Not a fsck error because this is checked/repaired by
                         * bch2_check_alloc_key() which runs later:
                         */
                        if (!bch2_dev_exists2(c, k.k->p.inode))
                                continue;

                        ca = bch_dev_bkey_exists(c, k.k->p.inode);

                        for (b = max_t(u64, ca->mi.first_bucket, start);
                             b < min_t(u64, ca->mi.nbuckets, end);
                             b++)
                                *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
                }
                bch2_trans_iter_exit(&trans, &iter);
        } else {
                struct bch_alloc_v4 a;

                for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                                   BTREE_ITER_PREFETCH, k, ret) {
                        /*
                         * Not a fsck error because this is checked/repaired by
                         * bch2_check_alloc_key() which runs later:
                         */
                        if (!bch2_dev_bucket_exists(c, k.k->p))
                                continue;

                        ca = bch_dev_bkey_exists(c, k.k->p.inode);

                        *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
                }
                bch2_trans_iter_exit(&trans, &iter);
        }

        bch2_trans_exit(&trans);
        up_read(&c->gc_lock);

        if (ret)
                bch_err_fn(c, ret);

        return ret;
}

/* Free space/discard btree: */

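/*
 * Set or clear a bucket's entry in the freespace/need_discard index btree
 * corresponding to its data type, verifying the old entry was in the
 * expected state:
 */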
static int bch2_bucket_do_index(struct btree_trans *trans,
                                struct bkey_s_c alloc_k,
                                const struct bch_alloc_v4 *a,
                                bool set)
{
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
        struct btree_iter iter;
        struct bkey_s_c old;
        struct bkey_i *k;
        enum btree_id btree;
        enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
        enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
        struct printbuf buf = PRINTBUF;
        int ret;

        if (a->data_type != BCH_DATA_free &&
            a->data_type != BCH_DATA_need_discard)
                return 0;

        k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
        if (IS_ERR(k))
                return PTR_ERR(k);

        bkey_init(&k->k);
        k->k.type = new_type;

        switch (a->data_type) {
        case BCH_DATA_free:
                btree = BTREE_ID_freespace;
                k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
                bch2_key_resize(&k->k, 1);
                break;
        case BCH_DATA_need_discard:
                btree = BTREE_ID_need_discard;
                k->k.p = alloc_k.k->p;
                break;
        default:
                return 0;
        }

        old = bch2_bkey_get_iter(trans, &iter, btree,
                             bkey_start_pos(&k->k),
                             BTREE_ITER_INTENT);
        ret = bkey_err(old);
        if (ret)
                return ret;

        if (ca->mi.freespace_initialized &&
            c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
            bch2_trans_inconsistent_on(old.k->type != old_type, trans,
                        "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
                        "  for %s",
                        set ? "setting" : "clearing",
                        bch2_btree_ids[btree],
                        iter.pos.inode,
                        iter.pos.offset,
                        bch2_bkey_types[old.k->type],
                        bch2_bkey_types[old_type],
                        (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
                ret = -EIO;
                goto err;
        }

        ret = bch2_trans_update(trans, &iter, k, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        printbuf_exit(&buf);
        return ret;
}

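/*
 * Update a single bucket's gen in the bucket_gens btree, creating the
 * bucket_gens key if it doesn't exist yet:
 */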
static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
                                           struct bpos bucket, u8 gen)
{
        struct btree_iter iter;
        unsigned offset;
        struct bpos pos = alloc_gens_pos(bucket, &offset);
        struct bkey_i_bucket_gens *g;
        struct bkey_s_c k;
        int ret;

        g = bch2_trans_kmalloc(trans, sizeof(*g));
        ret = PTR_ERR_OR_ZERO(g);
        if (ret)
                return ret;

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
                               BTREE_ITER_INTENT|
                               BTREE_ITER_WITH_UPDATES);
        ret = bkey_err(k);
        if (ret)
                return ret;

        if (k.k->type != KEY_TYPE_bucket_gens) {
                bkey_bucket_gens_init(&g->k_i);
                g->k.p = iter.pos;
        } else {
                bkey_reassemble(&g->k_i, k);
        }

        g->v.gens[offset] = gen;

        ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

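/*
 * Transactional trigger for alloc keys: keep the state derived from a
 * bucket's alloc key in sync with it, i.e. the freespace/need_discard
 * indices, the read and fragmentation LRUs, and the bucket_gens btree:
 */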
int bch2_trans_mark_alloc(struct btree_trans *trans,
                          enum btree_id btree_id, unsigned level,
                          struct bkey_s_c old, struct bkey_i *new,
                          unsigned flags)
{
        struct bch_fs *c = trans->c;
        struct bch_alloc_v4 old_a_convert, *new_a;
        const struct bch_alloc_v4 *old_a;
        u64 old_lru, new_lru;
        int ret = 0;

        /*
         * Deletion only happens in the device removal path, with
         * BTREE_TRIGGER_NORUN:
         */
        BUG_ON(new->k.type != KEY_TYPE_alloc_v4);

        old_a = bch2_alloc_to_v4(old, &old_a_convert);
        new_a = &bkey_i_to_alloc_v4(new)->v;

        new_a->data_type = alloc_data_type(*new_a, new_a->data_type);

        if (new_a->dirty_sectors > old_a->dirty_sectors ||
            new_a->cached_sectors > old_a->cached_sectors) {
                new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
                new_a->io_time[WRITE] = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
                SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
                SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
        }

        if (data_type_is_empty(new_a->data_type) &&
            BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
            !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
                new_a->gen++;
                SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
        }

        if (old_a->data_type != new_a->data_type ||
            (new_a->data_type == BCH_DATA_free &&
             alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
                ret =   bch2_bucket_do_index(trans, old, old_a, false) ?:
                        bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
                if (ret)
                        return ret;
        }

        if (new_a->data_type == BCH_DATA_cached &&
            !new_a->io_time[READ])
                new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

        old_lru = alloc_lru_idx_read(*old_a);
        new_lru = alloc_lru_idx_read(*new_a);

        if (old_lru != new_lru) {
                ret = bch2_lru_change(trans, new->k.p.inode,
                                      bucket_to_u64(new->k.p),
                                      old_lru, new_lru);
                if (ret)
                        return ret;
        }

        new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
                                        bch_dev_bkey_exists(c, new->k.p.inode));

        if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
                ret = bch2_lru_change(trans,
                                BCH_LRU_FRAGMENTATION_START,
                                bucket_to_u64(new->k.p),
                                old_a->fragmentation_lru, new_a->fragmentation_lru);
                if (ret)
                        return ret;
        }

        if (old_a->gen != new_a->gen) {
                ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
 * extents style btrees, but works on non-extents btrees:
 */
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
{
        struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);

        if (bkey_err(k))
                return k;

        if (k.k->type) {
                return k;
        } else {
                struct btree_iter iter2;
                struct bpos next;

                bch2_trans_copy_iter(&iter2, iter);

                if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
                        end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));

                end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));

                /*
                 * btree node min/max is a closed interval, upto takes a half
                 * open interval:
                 */
                k = bch2_btree_iter_peek_upto(&iter2, end);
                next = iter2.pos;
                bch2_trans_iter_exit(iter->trans, &iter2);

                BUG_ON(next.offset >= iter->pos.offset + U32_MAX);

                if (bkey_err(k))
                        return k;

                bkey_init(hole);
                hole->p = iter->pos;

                bch2_key_resize(hole, next.offset - iter->pos.offset);
                return (struct bkey_s_c) { hole, NULL };
        }
}

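/*
 * Advance *bucket to the next bucket that exists on some device, returning
 * false if we've run out of devices:
 */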
static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
{
        struct bch_dev *ca;
        unsigned iter;

        if (bch2_dev_bucket_exists(c, *bucket))
                return true;

        if (bch2_dev_exists2(c, bucket->inode)) {
                ca = bch_dev_bkey_exists(c, bucket->inode);

                if (bucket->offset < ca->mi.first_bucket) {
                        bucket->offset = ca->mi.first_bucket;
                        return true;
                }

                bucket->inode++;
                bucket->offset = 0;
        }

        rcu_read_lock();
        iter = bucket->inode;
        ca = __bch2_next_dev(c, &iter, NULL);
        if (ca)
                *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
        rcu_read_unlock();

        return ca != NULL;
}

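/*
 * Like bch2_get_key_or_hole(), but holes are constrained to buckets that
 * actually exist; gaps between devices are skipped over:
 */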
static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
{
        struct bch_fs *c = iter->trans->c;
        struct bkey_s_c k;
again:
        k = bch2_get_key_or_hole(iter, POS_MAX, hole);
        if (bkey_err(k))
                return k;

        if (!k.k->type) {
                struct bpos bucket = bkey_start_pos(k.k);

                if (!bch2_dev_bucket_exists(c, bucket)) {
                        if (!next_bucket(c, &bucket))
                                return bkey_s_c_null;

                        bch2_btree_iter_set_pos(iter, bucket);
                        goto again;
                }

                if (!bch2_dev_bucket_exists(c, k.k->p)) {
                        struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);

                        bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
                }
        }

        return k;
}

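/*
 * fsck: check one alloc key against the btrees derived from it, repairing
 * the corresponding need_discard, freespace and bucket_gens entries to
 * match:
 */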
static noinline_for_stack
int bch2_check_alloc_key(struct btree_trans *trans,
                         struct bkey_s_c alloc_k,
                         struct btree_iter *alloc_iter,
                         struct btree_iter *discard_iter,
                         struct btree_iter *freespace_iter,
                         struct btree_iter *bucket_gens_iter)
{
        struct bch_fs *c = trans->c;
        struct bch_dev *ca;
        struct bch_alloc_v4 a_convert;
        const struct bch_alloc_v4 *a;
        unsigned discard_key_type, freespace_key_type;
        unsigned gens_offset;
        struct bkey_s_c k;
        struct printbuf buf = PRINTBUF;
        int ret;

        if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
                        "alloc key for invalid device:bucket %llu:%llu",
                        alloc_k.k->p.inode, alloc_k.k->p.offset))
                return bch2_btree_delete_at(trans, alloc_iter, 0);

        ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
        if (!ca->mi.freespace_initialized)
                return 0;

        a = bch2_alloc_to_v4(alloc_k, &a_convert);

        discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
        bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
        k = bch2_btree_iter_peek_slot(discard_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (k.k->type != discard_key_type &&
            (c->opts.reconstruct_alloc ||
             fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
                      "  %s",
                      bch2_bkey_types[k.k->type],
                      bch2_bkey_types[discard_key_type],
                      (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
                struct bkey_i *update =
                        bch2_trans_kmalloc(trans, sizeof(*update));

                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.type  = discard_key_type;
                update->k.p     = discard_iter->pos;

                ret = bch2_trans_update(trans, discard_iter, update, 0);
                if (ret)
                        goto err;
        }

        freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
        bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
        k = bch2_btree_iter_peek_slot(freespace_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (k.k->type != freespace_key_type &&
            (c->opts.reconstruct_alloc ||
             fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
                      "  %s",
                      bch2_bkey_types[k.k->type],
                      bch2_bkey_types[freespace_key_type],
                      (printbuf_reset(&buf),
                       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
                struct bkey_i *update =
                        bch2_trans_kmalloc(trans, sizeof(*update));

                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.type  = freespace_key_type;
                update->k.p     = freespace_iter->pos;
                bch2_key_resize(&update->k, 1);

                ret = bch2_trans_update(trans, freespace_iter, update, 0);
                if (ret)
                        goto err;
        }

        bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
        k = bch2_btree_iter_peek_slot(bucket_gens_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (a->gen != alloc_gen(k, gens_offset) &&
            (c->opts.reconstruct_alloc ||
             fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n"
                      "  %s",
                      alloc_gen(k, gens_offset), a->gen,
                      (printbuf_reset(&buf),
                       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
                struct bkey_i_bucket_gens *g =
                        bch2_trans_kmalloc(trans, sizeof(*g));

                ret = PTR_ERR_OR_ZERO(g);
                if (ret)
                        goto err;

                if (k.k->type == KEY_TYPE_bucket_gens) {
                        bkey_reassemble(&g->k_i, k);
                } else {
                        bkey_bucket_gens_init(&g->k_i);
                        g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
                }

                g->v.gens[gens_offset] = a->gen;

                ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
                if (ret)
                        goto err;
        }
err:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

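/*
 * fsck: a hole in the alloc btree means those buckets are free, so the
 * freespace btree should have them set; extend it to cover the hole if not:
 */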
static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
                                    struct bpos start,
                                    struct bpos *end,
                                    struct btree_iter *freespace_iter)
{
        struct bch_fs *c = trans->c;
        struct bch_dev *ca;
        struct bkey_s_c k;
        struct printbuf buf = PRINTBUF;
        int ret;

        ca = bch_dev_bkey_exists(c, start.inode);
        if (!ca->mi.freespace_initialized)
                return 0;

        bch2_btree_iter_set_pos(freespace_iter, start);

        k = bch2_btree_iter_peek_slot(freespace_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        *end = bkey_min(k.k->p, *end);

        if (k.k->type != KEY_TYPE_set &&
            (c->opts.reconstruct_alloc ||
             fsck_err(c, "hole in alloc btree missing in freespace btree\n"
                      "  device %llu buckets %llu-%llu",
                      freespace_iter->pos.inode,
                      freespace_iter->pos.offset,
                      end->offset))) {
                struct bkey_i *update =
                        bch2_trans_kmalloc(trans, sizeof(*update));

                ret = PTR_ERR_OR_ZERO(update);
                if (ret)
                        goto err;

                bkey_init(&update->k);
                update->k.type  = KEY_TYPE_set;
                update->k.p     = freespace_iter->pos;
                bch2_key_resize(&update->k,
                                min_t(u64, U32_MAX, end->offset -
                                      freespace_iter->pos.offset));

                ret = bch2_trans_update(trans, freespace_iter, update, 0);
                if (ret)
                        goto err;
        }
err:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
                                      struct bpos start,
                                      struct bpos *end,
                                      struct btree_iter *bucket_gens_iter)
{
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
        struct printbuf buf = PRINTBUF;
        unsigned i, gens_offset, gens_end_offset;
        int ret;

        if (c->sb.version < bcachefs_metadata_version_bucket_gens)
                return 0;

        bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));

        k = bch2_btree_iter_peek_slot(bucket_gens_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
                     alloc_gens_pos(*end,  &gens_end_offset)))
                gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;

        if (k.k->type == KEY_TYPE_bucket_gens) {
                struct bkey_i_bucket_gens g;
                bool need_update = false;

                bkey_reassemble(&g.k_i, k);

                for (i = gens_offset; i < gens_end_offset; i++) {
                        if (fsck_err_on(g.v.gens[i], c,
                                        "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
                                        bucket_gens_pos_to_alloc(k.k->p, i).inode,
                                        bucket_gens_pos_to_alloc(k.k->p, i).offset,
                                        g.v.gens[i])) {
                                g.v.gens[i] = 0;
                                need_update = true;
                        }
                }

                if (need_update) {
                        struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g));

                        ret = PTR_ERR_OR_ZERO(k);
                        if (ret)
                                goto err;

                        memcpy(k, &g, sizeof(g));

                        ret = bch2_trans_update(trans, bucket_gens_iter, k, 0);
                        if (ret)
                                goto err;
                }
        }

        *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
err:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

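/*
 * fsck, the reverse direction: every need_discard/freespace key must point
 * back at an alloc key in the matching state (and with matching genbits,
 * stored in the high bits of a freespace key's offset); if not, delete it:
 */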
static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
                                              struct btree_iter *iter)
{
        struct bch_fs *c = trans->c;
        struct btree_iter alloc_iter;
        struct bkey_s_c alloc_k;
        struct bch_alloc_v4 a_convert;
        const struct bch_alloc_v4 *a;
        u64 genbits;
        struct bpos pos;
        enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
                ? BCH_DATA_need_discard
                : BCH_DATA_free;
        struct printbuf buf = PRINTBUF;
        int ret;

        pos = iter->pos;
        pos.offset &= ~(~0ULL << 56);
        genbits = iter->pos.offset & (~0ULL << 56);

        alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
        ret = bkey_err(alloc_k);
        if (ret)
                return ret;

        if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
                        "entry in %s btree for nonexistent dev:bucket %llu:%llu",
                        bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
                goto delete;

        a = bch2_alloc_to_v4(alloc_k, &a_convert);

        if (fsck_err_on(a->data_type != state ||
                        (state == BCH_DATA_free &&
                         genbits != alloc_freespace_genbits(*a)), c,
                        "%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
                        (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
                        bch2_btree_ids[iter->btree_id],
                        iter->pos.inode,
                        iter->pos.offset,
                        a->data_type == state,
                        genbits >> 56, alloc_freespace_genbits(*a) >> 56))
                goto delete;
out:
fsck_err:
        set_btree_iter_dontneed(&alloc_iter);
        bch2_trans_iter_exit(trans, &alloc_iter);
        printbuf_exit(&buf);
        return ret;
delete:
        ret =   bch2_btree_delete_extent_at(trans, iter,
                        iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
                        BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
        goto out;
}

static int bch2_check_discard_freespace_key(struct btree_trans *trans,
                                            struct btree_iter *iter,
                                            struct bpos end)
{
        if (!btree_id_is_extents(iter->btree_id)) {
                return __bch2_check_discard_freespace_key(trans, iter);
        } else {
                int ret;

                while (!bkey_eq(iter->pos, end) &&
                       !(ret = btree_trans_too_many_iters(trans) ?:
                               __bch2_check_discard_freespace_key(trans, iter)))
                        bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));

                return ret;
        }
}

/*
 * We've already checked that generation numbers in the bucket_gens btree are
 * valid for buckets that exist; this just checks for keys for nonexistent
 * buckets.
 */
static noinline_for_stack
int bch2_check_bucket_gens_key(struct btree_trans *trans,
                               struct btree_iter *iter,
                               struct bkey_s_c k)
{
        struct bch_fs *c = trans->c;
        struct bkey_i_bucket_gens g;
        struct bch_dev *ca;
        u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
        u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
        u64 b;
        bool need_update = false, dev_exists;
        struct printbuf buf = PRINTBUF;
        int ret = 0;

        BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
        bkey_reassemble(&g.k_i, k);

        /* if no bch_dev, skip out whether we repair or not */
        dev_exists = bch2_dev_exists2(c, k.k->p.inode);
        if (!dev_exists) {
                if (fsck_err_on(!dev_exists, c,
                                "bucket_gens key for invalid device:\n  %s",
                                (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                        ret = bch2_btree_delete_at(trans, iter, 0);
                }
                goto out;
        }

        ca = bch_dev_bkey_exists(c, k.k->p.inode);
        if (fsck_err_on(end <= ca->mi.first_bucket ||
                        start >= ca->mi.nbuckets, c,
                        "bucket_gens key for invalid buckets:\n  %s",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                ret = bch2_btree_delete_at(trans, iter, 0);
                goto out;
        }

        for (b = start; b < ca->mi.first_bucket; b++)
                if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
                                "bucket_gens key has nonzero gen for invalid bucket")) {
                        g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
                        need_update = true;
                }

        for (b = ca->mi.nbuckets; b < end; b++)
                if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
                                "bucket_gens key has nonzero gen for invalid bucket")) {
                        g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
                        need_update = true;
                }

        if (need_update) {
                struct bkey_i *k;

                k = bch2_trans_kmalloc(trans, sizeof(g));
                ret = PTR_ERR_OR_ZERO(k);
                if (ret)
                        goto out;

                memcpy(k, &g, sizeof(g));
                ret = bch2_trans_update(trans, iter, k, 0);
        }
out:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

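/*
 * Top level fsck pass for the alloc info btrees: walk the alloc btree
 * (including holes) checking the derived btrees against it, then walk the
 * derived btrees checking each key points back at a valid alloc key:
 */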
1396 int bch2_check_alloc_info(struct bch_fs *c)
1397 {
1398         struct btree_trans trans;
1399         struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
1400         struct bkey hole;
1401         struct bkey_s_c k;
1402         int ret = 0;
1403
1404         bch2_trans_init(&trans, c, 0, 0);
1405
1406         bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
1407                              BTREE_ITER_PREFETCH);
1408         bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
1409                              BTREE_ITER_PREFETCH);
1410         bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
1411                              BTREE_ITER_PREFETCH);
1412         bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
1413                              BTREE_ITER_PREFETCH);
1414
1415         while (1) {
1416                 struct bpos next;
1417
1418                 bch2_trans_begin(&trans);
1419
1420                 k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
1421                 ret = bkey_err(k);
1422                 if (ret)
1423                         goto bkey_err;
1424
1425                 if (!k.k)
1426                         break;
1427
1428                 if (k.k->type) {
1429                         next = bpos_nosnap_successor(k.k->p);
1430
1431                         ret = bch2_check_alloc_key(&trans,
1432                                                    k, &iter,
1433                                                    &discard_iter,
1434                                                    &freespace_iter,
1435                                                    &bucket_gens_iter);
1436                         if (ret)
1437                                 goto bkey_err;
1438                 } else {
1439                         next = k.k->p;
1440
1441                         ret = bch2_check_alloc_hole_freespace(&trans,
1442                                                     bkey_start_pos(k.k),
1443                                                     &next,
1444                                                     &freespace_iter) ?:
1445                                 bch2_check_alloc_hole_bucket_gens(&trans,
1446                                                     bkey_start_pos(k.k),
1447                                                     &next,
1448                                                     &bucket_gens_iter);
1449                         if (ret)
1450                                 goto bkey_err;
1451                 }
1452
1453                 ret = bch2_trans_commit(&trans, NULL, NULL,
1454                                         BTREE_INSERT_NOFAIL|
1455                                         BTREE_INSERT_LAZY_RW);
1456                 if (ret)
1457                         goto bkey_err;
1458
1459                 bch2_btree_iter_set_pos(&iter, next);
1460 bkey_err:
1461                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1462                         continue;
1463                 if (ret)
1464                         break;
1465         }
1466         bch2_trans_iter_exit(&trans, &bucket_gens_iter);
1467         bch2_trans_iter_exit(&trans, &freespace_iter);
1468         bch2_trans_iter_exit(&trans, &discard_iter);
1469         bch2_trans_iter_exit(&trans, &iter);
1470
1471         if (ret < 0)
1472                 goto err;
1473
1474         ret = for_each_btree_key2(&trans, iter,
1475                         BTREE_ID_need_discard, POS_MIN,
1476                         BTREE_ITER_PREFETCH, k,
1477                 bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
1478               for_each_btree_key2(&trans, iter,
1479                         BTREE_ID_freespace, POS_MIN,
1480                         BTREE_ITER_PREFETCH, k,
1481                 bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
1482               for_each_btree_key_commit(&trans, iter,
1483                         BTREE_ID_bucket_gens, POS_MIN,
1484                         BTREE_ITER_PREFETCH, k,
1485                         NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
1486                 bch2_check_bucket_gens_key(&trans, &iter, k));
1487 err:
1488         bch2_trans_exit(&trans);
1489         if (ret)
1490                 bch_err_fn(c, ret);
1491         return ret;
1492 }
1493
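/*
 * Buckets in the cached state are expected to have an lru btree entry,
 * keyed by device and read time, so invalidation can find the coldest
 * buckets first. Repair missing entries, and cached buckets with a
 * read_time of 0, by picking a read time and (re)creating the lru entry.
 */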
1494 static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
1495                                        struct btree_iter *alloc_iter)
1496 {
1497         struct bch_fs *c = trans->c;
1498         struct btree_iter lru_iter;
1499         struct bch_alloc_v4 a_convert;
1500         const struct bch_alloc_v4 *a;
1501         struct bkey_s_c alloc_k, lru_k;
1502         struct printbuf buf = PRINTBUF;
1503         int ret;
1504
1505         alloc_k = bch2_btree_iter_peek(alloc_iter);
1506         if (!alloc_k.k)
1507                 return 0;
1508
1509         ret = bkey_err(alloc_k);
1510         if (ret)
1511                 return ret;
1512
1513         a = bch2_alloc_to_v4(alloc_k, &a_convert);
1514
1515         if (a->data_type != BCH_DATA_cached)
1516                 return 0;
1517
1518         lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
1519                              lru_pos(alloc_k.k->p.inode,
1520                                      bucket_to_u64(alloc_k.k->p),
1521                                      a->io_time[READ]), 0);
1522         ret = bkey_err(lru_k);
1523         if (ret)
1524                 return ret;
1525
1526         if (fsck_err_on(!a->io_time[READ], c,
1527                         "cached bucket with read_time 0\n"
1528                         "  %s",
1529                 (printbuf_reset(&buf),
1530                  bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
1531             fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
1532                         "missing lru entry\n"
1533                         "  %s",
1534                         (printbuf_reset(&buf),
1535                          bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1536                 u64 read_time = a->io_time[READ] ?:
1537                         atomic64_read(&c->io_clock[READ].now);
1538
1539                 ret = bch2_lru_set(trans,
1540                                    alloc_k.k->p.inode,
1541                                    bucket_to_u64(alloc_k.k->p),
1542                                    read_time);
1543                 if (ret)
1544                         goto err;
1545
1546                 if (a->io_time[READ] != read_time) {
1547                         struct bkey_i_alloc_v4 *a_mut =
1548                                 bch2_alloc_to_v4_mut(trans, alloc_k);
1549                         ret = PTR_ERR_OR_ZERO(a_mut);
1550                         if (ret)
1551                                 goto err;
1552
1553                         a_mut->v.io_time[READ] = read_time;
1554                         ret = bch2_trans_update(trans, alloc_iter,
1555                                                 &a_mut->k_i, BTREE_TRIGGER_NORUN);
1556                         if (ret)
1557                                 goto err;
1558                 }
1559         }
1560 err:
1561 fsck_err:
1562         bch2_trans_iter_exit(trans, &lru_iter);
1563         printbuf_exit(&buf);
1564         return ret;
1565 }
1566
1567 int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
1568 {
1569         struct btree_iter iter;
1570         struct bkey_s_c k;
1571         int ret = 0;
1572
1573         ret = bch2_trans_run(c,
1574                 for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
1575                                 POS_MIN, BTREE_ITER_PREFETCH, k,
1576                                 NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
1577                         bch2_check_alloc_to_lru_ref(&trans, &iter)));
1578         if (ret)
1579                 bch_err_fn(c, ret);
1580         return ret;
1581 }
1582
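/*
 * Process a single need_discard btree entry: skip buckets that are still
 * open or that are waiting on a journal commit, issue the block layer
 * discard if the device wants one, then clear the bucket's need_discard
 * flag and write back the updated alloc key. Buckets with
 * BCH_ALLOC_V4_NEED_INC_GEN set just get their generation bumped instead.
 */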
1583 static int bch2_discard_one_bucket(struct btree_trans *trans,
1584                                    struct btree_iter *need_discard_iter,
1585                                    struct bpos *discard_pos_done,
1586                                    u64 *seen,
1587                                    u64 *open,
1588                                    u64 *need_journal_commit,
1589                                    u64 *discarded)
1590 {
1591         struct bch_fs *c = trans->c;
1592         struct bpos pos = need_discard_iter->pos;
1593         struct btree_iter iter = { NULL };
1594         struct bkey_s_c k;
1595         struct bch_dev *ca;
1596         struct bkey_i_alloc_v4 *a;
1597         struct printbuf buf = PRINTBUF;
1598         int ret = 0;
1599
1600         ca = bch_dev_bkey_exists(c, pos.inode);
1601         if (!percpu_ref_tryget(&ca->io_ref)) {
1602                 bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
1603                 return 0;
1604         }
1605
1606         if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
1607                 (*open)++;
1608                 goto out;
1609         }
1610
1611         if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
1612                         c->journal.flushed_seq_ondisk,
1613                         pos.inode, pos.offset)) {
1614                 (*need_journal_commit)++;
1615                 goto out;
1616         }
1617
1618         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
1619                                need_discard_iter->pos,
1620                                BTREE_ITER_CACHED);
1621         ret = bkey_err(k);
1622         if (ret)
1623                 goto out;
1624
1625         a = bch2_alloc_to_v4_mut(trans, k);
1626         ret = PTR_ERR_OR_ZERO(a);
1627         if (ret)
1628                 goto out;
1629
1630         if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
1631                 a->v.gen++;
1632                 SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1633                 goto write;
1634         }
1635
1636         if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
1637                 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
1638                         bch2_trans_inconsistent(trans,
1639                                 "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
1640                                 "%s",
1641                                 a->v.journal_seq,
1642                                 c->journal.flushed_seq_ondisk,
1643                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
1644                         ret = -EIO;
1645                 }
1646                 goto out;
1647         }
1648
1649         if (a->v.data_type != BCH_DATA_need_discard) {
1650                 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
1651                         bch2_trans_inconsistent(trans,
1652                                 "bucket incorrectly set in need_discard btree\n"
1653                                 "%s",
1654                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
1655                         ret = -EIO;
1656                 }
1657
1658                 goto out;
1659         }
1660
1661         if (!bkey_eq(*discard_pos_done, iter.pos) &&
1662             ca->mi.discard && !c->opts.nochanges) {
1663                 /*
1664                  * This works without any other locks because this is the only
1665                  * thread that removes items from the need_discard btree
1666                  */
1667                 bch2_trans_unlock(trans);
1668                 blkdev_issue_discard(ca->disk_sb.bdev,
1669                                      k.k->p.offset * ca->mi.bucket_size,
1670                                      ca->mi.bucket_size,
1671                                      GFP_KERNEL);
1672                 *discard_pos_done = iter.pos;
1673
1674                 ret = bch2_trans_relock_notrace(trans);
1675                 if (ret)
1676                         goto out;
1677         }
1678
1679         SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1680         a->v.data_type = alloc_data_type(a->v, a->v.data_type);
1681 write:
1682         ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
1683                 bch2_trans_commit(trans, NULL, NULL,
1684                                   BCH_WATERMARK_btree|
1685                                   BTREE_INSERT_NOFAIL);
1686         if (ret)
1687                 goto out;
1688
1689         this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
1690         (*discarded)++;
1691 out:
1692         (*seen)++;
1693         bch2_trans_iter_exit(trans, &iter);
1694         percpu_ref_put(&ca->io_ref);
1695         printbuf_exit(&buf);
1696         return ret;
1697 }
1698
1699 static void bch2_do_discards_work(struct work_struct *work)
1700 {
1701         struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
1702         struct btree_trans trans;
1703         struct btree_iter iter;
1704         struct bkey_s_c k;
1705         u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
1706         struct bpos discard_pos_done = POS_MAX;
1707         int ret;
1708
1709         bch2_trans_init(&trans, c, 0, 0);
1710
1711         /*
1712          * We're doing the commit in bch2_discard_one_bucket instead of using
1713          * for_each_btree_key_commit() so that we can increment counters after
1714          * successful commit:
1715          */
1716         ret = for_each_btree_key2(&trans, iter,
1717                         BTREE_ID_need_discard, POS_MIN, 0, k,
1718                 bch2_discard_one_bucket(&trans, &iter, &discard_pos_done,
1719                                         &seen,
1720                                         &open,
1721                                         &need_journal_commit,
1722                                         &discarded));
1723
1724         bch2_trans_exit(&trans);
1725
1726         if (need_journal_commit * 2 > seen)
1727                 bch2_journal_flush_async(&c->journal, NULL);
1728
1729         bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1730
1731         trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
1732                               bch2_err_str(ret));
1733 }
1734
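/*
 * Kick off discards in the background: the write ref keeps the
 * filesystem writeable while work is pending, and is dropped here if the
 * work item was already queued (queue_work() returned false).
 */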
1735 void bch2_do_discards(struct bch_fs *c)
1736 {
1737         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
1738             !queue_work(c->write_ref_wq, &c->discard_work))
1739                 bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1740 }
1741
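/*
 * Invalidate one cached bucket, taken from the lru btree: bumping the
 * gen invalidates any remaining cached pointers into the bucket, so its
 * counters can be cleared and its IO clocks reset, making it available
 * to the allocator again. Returns 1 to stop iteration once
 * *nr_to_invalidate hits zero.
 */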
1742 static int invalidate_one_bucket(struct btree_trans *trans,
1743                                  struct btree_iter *lru_iter,
1744                                  struct bkey_s_c lru_k,
1745                                  s64 *nr_to_invalidate)
1746 {
1747         struct bch_fs *c = trans->c;
1748         struct btree_iter alloc_iter = { NULL };
1749         struct bkey_i_alloc_v4 *a = NULL;
1750         struct printbuf buf = PRINTBUF;
1751         struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
1752         unsigned cached_sectors;
1753         int ret = 0;
1754
1755         if (*nr_to_invalidate <= 0)
1756                 return 1;
1757
1758         if (!bch2_dev_bucket_exists(c, bucket)) {
1759                 prt_str(&buf, "lru entry points to invalid bucket");
1760                 goto err;
1761         }
1762
1763         if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
1764                 return 0;
1765
1766         a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
1767         ret = PTR_ERR_OR_ZERO(a);
1768         if (ret)
1769                 goto out;
1770
1771         /* We expect harmless races here due to the btree write buffer: */
1772         if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
1773                 goto out;
1774
1775         BUG_ON(a->v.data_type != BCH_DATA_cached);
1776
1777         if (!a->v.cached_sectors)
1778                 bch_err(c, "invalidating empty bucket, confused");
1779
1780         cached_sectors = a->v.cached_sectors;
1781
1782         SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1783         a->v.gen++;
1784         a->v.data_type          = 0;
1785         a->v.dirty_sectors      = 0;
1786         a->v.cached_sectors     = 0;
1787         a->v.io_time[READ]      = atomic64_read(&c->io_clock[READ].now);
1788         a->v.io_time[WRITE]     = atomic64_read(&c->io_clock[WRITE].now);
1789
1790         ret =   bch2_trans_update(trans, &alloc_iter, &a->k_i,
1791                                 BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
1792                 bch2_trans_commit(trans, NULL, NULL,
1793                                   BCH_WATERMARK_btree|
1794                                   BTREE_INSERT_NOFAIL);
1795         if (ret)
1796                 goto out;
1797
1798         trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
1799         --*nr_to_invalidate;
1800 out:
1801         bch2_trans_iter_exit(trans, &alloc_iter);
1802         printbuf_exit(&buf);
1803         return ret;
1804 err:
1805         prt_str(&buf, "\n  lru key: ");
1806         bch2_bkey_val_to_text(&buf, c, lru_k);
1807
1808         prt_str(&buf, "\n  lru entry: ");
1809         bch2_lru_pos_to_text(&buf, lru_iter->pos);
1810
1811         prt_str(&buf, "\n  alloc key: ");
1812         if (!a)
1813                 bch2_bpos_to_text(&buf, bucket);
1814         else
1815                 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
1816
1817         bch_err(c, "%s", buf.buf);
1818         if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
1819                 bch2_inconsistent_error(c);
1820                 ret = -EINVAL;
1821         }
1822
1823         goto out;
1824 }
1825
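/*
 * lru btree updates arrive via the btree write buffer, so it's flushed
 * first to avoid operating on stale entries; then each member device's
 * slice of the lru btree is walked, invalidating buckets until the
 * target from should_invalidate_buckets() is met.
 */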
1826 static void bch2_do_invalidates_work(struct work_struct *work)
1827 {
1828         struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
1829         struct bch_dev *ca;
1830         struct btree_trans trans;
1831         struct btree_iter iter;
1832         struct bkey_s_c k;
1833         unsigned i;
1834         int ret = 0;
1835
1836         bch2_trans_init(&trans, c, 0, 0);
1837
1838         ret = bch2_btree_write_buffer_flush(&trans);
1839         if (ret)
1840                 goto err;
1841
1842         for_each_member_device(ca, c, i) {
1843                 s64 nr_to_invalidate =
1844                         should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
1845
1846                 ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru,
1847                                 lru_pos(ca->dev_idx, 0, 0),
1848                                 lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
1849                                 BTREE_ITER_INTENT, k,
1850                         invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate));
1851
1852                 if (ret < 0) {
1853                         percpu_ref_put(&ca->ref);
1854                         break;
1855                 }
1856         }
1857 err:
1858         bch2_trans_exit(&trans);
1859         bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
1860 }
1861
1862 void bch2_do_invalidates(struct bch_fs *c)
1863 {
1864         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
1865             !queue_work(c->write_ref_wq, &c->invalidate_work))
1866                 bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
1867 }
1868
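/*
 * Build the freespace btree for one device: live alloc keys are indexed
 * one at a time via bch2_bucket_do_index(), while each run of buckets
 * with no alloc key at all is covered by a single KEY_TYPE_set extent
 * inserted directly into the freespace btree.
 */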
1869 static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
1870                                    unsigned long *last_updated)
1871 {
1872         struct btree_trans trans;
1873         struct btree_iter iter;
1874         struct bkey_s_c k;
1875         struct bkey hole;
1876         struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets);
1877         struct bch_member *m;
1878         int ret;
1879
1880         bch2_trans_init(&trans, c, 0, 0);
1881
1882         bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
1883                              POS(ca->dev_idx, ca->mi.first_bucket),
1884                              BTREE_ITER_PREFETCH);
1885         /*
1886          * Scan the alloc btree for every bucket on @ca, and add buckets to the
1887          * freespace/need_discard/need_gc_gens btrees as needed:
1888          */
1889         while (1) {
1890                 if (time_after(jiffies, *last_updated + HZ * 10)) {
1891                         bch_info(ca, "%s: currently at %llu/%llu",
1892                                  __func__, iter.pos.offset, ca->mi.nbuckets);
1893                         *last_updated = jiffies;
1894                 }
1895
1896                 bch2_trans_begin(&trans);
1897
1898                 if (bkey_ge(iter.pos, end)) {
1899                         ret = 0;
1900                         break;
1901                 }
1902
1903                 k = bch2_get_key_or_hole(&iter, end, &hole);
1904                 ret = bkey_err(k);
1905                 if (ret)
1906                         goto bkey_err;
1907
1908                 if (k.k->type) {
1909                         /*
1910                          * We process live keys in the alloc btree one at a
1911                          * time:
1912                          */
1913                         struct bch_alloc_v4 a_convert;
1914                         const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
1915
1916                         ret =   bch2_bucket_do_index(&trans, k, a, true) ?:
1917                                 bch2_trans_commit(&trans, NULL, NULL,
1918                                                   BTREE_INSERT_LAZY_RW|
1919                                                   BTREE_INSERT_NOFAIL);
1920                         if (ret)
1921                                 goto bkey_err;
1922
1923                         bch2_btree_iter_advance(&iter);
1924                 } else {
1925                         struct bkey_i *freespace;
1926
1927                         freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace));
1928                         ret = PTR_ERR_OR_ZERO(freespace);
1929                         if (ret)
1930                                 goto bkey_err;
1931
1932                         bkey_init(&freespace->k);
1933                         freespace->k.type       = KEY_TYPE_set;
1934                         freespace->k.p          = k.k->p;
1935                         freespace->k.size       = k.k->size;
1936
1937                         ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?:
1938                                 bch2_trans_commit(&trans, NULL, NULL,
1939                                                   BTREE_INSERT_LAZY_RW|
1940                                                   BTREE_INSERT_NOFAIL);
1941                         if (ret)
1942                                 goto bkey_err;
1943
1944                         bch2_btree_iter_set_pos(&iter, k.k->p);
1945                 }
1946 bkey_err:
1947                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1948                         continue;
1949                 if (ret)
1950                         break;
1951         }
1952
1953         bch2_trans_iter_exit(&trans, &iter);
1954         bch2_trans_exit(&trans);
1955
1956         if (ret < 0) {
1957                 bch_err(ca, "error initializing free space: %s", bch2_err_str(ret));
1958                 return ret;
1959         }
1960
1961         mutex_lock(&c->sb_lock);
1962         m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
1963         SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
1964         mutex_unlock(&c->sb_lock);
1965
1966         return 0;
1967 }
1968
1969 int bch2_fs_freespace_init(struct bch_fs *c)
1970 {
1971         struct bch_dev *ca;
1972         unsigned i;
1973         int ret = 0;
1974         bool doing_init = false;
1975         unsigned long last_updated = jiffies;
1976
1977         /*
1978          * We can crash during the device add path, so we need to check this on
1979          * every mount:
1980          */
1981
1982         for_each_member_device(ca, c, i) {
1983                 if (ca->mi.freespace_initialized)
1984                         continue;
1985
1986                 if (!doing_init) {
1987                         bch_info(c, "initializing freespace");
1988                         doing_init = true;
1989                 }
1990
1991                 ret = bch2_dev_freespace_init(c, ca, &last_updated);
1992                 if (ret) {
1993                         percpu_ref_put(&ca->ref);
1994                         bch_err_fn(c, ret);
1995                         return ret;
1996                 }
1997         }
1998
1999         if (doing_init) {
2000                 mutex_lock(&c->sb_lock);
2001                 bch2_write_super(c);
2002                 mutex_unlock(&c->sb_lock);
2003                 bch_verbose(c, "done initializing freespace");
2004         }
2005
2006         return 0;
2007 }
2008
2009 /* Bucket IO clocks: */
2010
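/*
 * Reset a bucket's last-IO time to the current IO clock, skipping the
 * btree update entirely if the clock hasn't ticked since the last reset.
 * The read time is what orders cached buckets in the lru btree (see
 * bch2_check_alloc_to_lru_ref()).
 */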
2011 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
2012                               size_t bucket_nr, int rw)
2013 {
2014         struct bch_fs *c = trans->c;
2015         struct btree_iter iter;
2016         struct bkey_i_alloc_v4 *a;
2017         u64 now;
2018         int ret = 0;
2019
2020         a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
2021         ret = PTR_ERR_OR_ZERO(a);
2022         if (ret)
2023                 return ret;
2024
2025         now = atomic64_read(&c->io_clock[rw].now);
2026         if (a->v.io_time[rw] == now)
2027                 goto out;
2028
2029         a->v.io_time[rw] = now;
2030
2031         ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
2032                 bch2_trans_commit(trans, NULL, NULL, 0);
2033 out:
2034         bch2_trans_iter_exit(trans, &iter);
2035         return ret;
2036 }
2037
2038 /* Startup/shutdown (ro/rw): */
2039
2040 void bch2_recalc_capacity(struct bch_fs *c)
2041 {
2042         struct bch_dev *ca;
2043         u64 capacity = 0, reserved_sectors = 0, gc_reserve;
2044         unsigned bucket_size_max = 0;
2045         unsigned long ra_pages = 0;
2046         unsigned i;
2047
2048         lockdep_assert_held(&c->state_lock);
2049
2050         for_each_online_member(ca, c, i) {
2051                 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
2052
2053                 ra_pages += bdi->ra_pages;
2054         }
2055
2056         bch2_set_ra_pages(c, ra_pages);
2057
2058         for_each_rw_member(ca, c, i) {
2059                 u64 dev_reserve = 0;
2060
2061                 /*
2062                  * We need to reserve buckets (from the number
2063                  * of currently available buckets) against
2064                  * foreground writes so that mainly copygc can
2065                  * make forward progress.
2066                  *
2067                  * We need enough to refill the various reserves
2068                  * from scratch - copygc will use its entire
2069                  * reserve all at once, then run again when
2070                  * its reserve is refilled (from the formerly
2071                  * available buckets).
2072                  *
2073                  * This reserve is only used when considering whether
2074                  * allocations for foreground writes must wait -
2075                  * it does not factor into -ENOSPC calculations.
2076                  */
2077
2078                 dev_reserve += ca->nr_btree_reserve * 2;
2079                 dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
2080
2081                 dev_reserve += 1;       /* btree write point */
2082                 dev_reserve += 1;       /* copygc write point */
2083                 dev_reserve += 1;       /* rebalance write point */
2084
2085                 dev_reserve *= ca->mi.bucket_size;
2086
2087                 capacity += bucket_to_sector(ca, ca->mi.nbuckets -
2088                                              ca->mi.first_bucket);
2089
2090                 reserved_sectors += dev_reserve * 2;
2091
2092                 bucket_size_max = max_t(unsigned, bucket_size_max,
2093                                         ca->mi.bucket_size);
2094         }
2095
2096         gc_reserve = c->opts.gc_reserve_bytes
2097                 ? c->opts.gc_reserve_bytes >> 9
2098                 : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
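        /*
         * Worked example, with made-up numbers: gc_reserve_bytes is in
         * bytes, so >> 9 converts it to 512 byte sectors - a 1 GiB
         * reserve becomes 2097152 sectors. Otherwise the reserve is a
         * percentage of capacity: gc_reserve_percent = 8 on a filesystem
         * with capacity = 1000000 sectors reserves 80000 sectors.
         */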
2099
2100         reserved_sectors = max(gc_reserve, reserved_sectors);
2101
2102         reserved_sectors = min(reserved_sectors, capacity);
2103
2104         c->capacity = capacity - reserved_sectors;
2105
2106         c->bucket_size_max = bucket_size_max;
2107
2108         /* Wake up in case someone was waiting for buckets */
2109         closure_wake_up(&c->freelist_wait);
2110 }
2111
2112 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
2113 {
2114         struct open_bucket *ob;
2115         bool ret = false;
2116
2117         for (ob = c->open_buckets;
2118              ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
2119              ob++) {
2120                 spin_lock(&ob->lock);
2121                 if (ob->valid && !ob->on_partial_list &&
2122                     ob->dev == ca->dev_idx)
2123                         ret = true;
2124                 spin_unlock(&ob->lock);
2125         }
2126
2127         return ret;
2128 }
2129
2130 /* device goes ro: */
2131 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
2132 {
2133         unsigned i;
2134
2135         /* First, remove device from allocation groups: */
2136
2137         for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2138                 clear_bit(ca->dev_idx, c->rw_devs[i].d);
2139
2140         /*
2141          * Capacity is calculated based on devices in allocation groups:
2142          */
2143         bch2_recalc_capacity(c);
2144
2145         bch2_open_buckets_stop(c, ca, false);
2146
2147         /*
2148          * Wake up threads that were blocked on allocation, so they can notice
2149          * the device can no longer be removed and the capacity has changed:
2150          */
2151         closure_wake_up(&c->freelist_wait);
2152
2153         /*
2154          * journal_res_get() can block waiting for free space in the journal -
2155          * it needs to notice there may not be devices to allocate from anymore:
2156          */
2157         wake_up(&c->journal.wait);
2158
2159         /* Now wait for any in flight writes: */
2160
2161         closure_wait_event(&c->open_buckets_wait,
2162                            !bch2_dev_has_open_write_point(c, ca));
2163 }
2164
2165 /* device goes rw: */
2166 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
2167 {
2168         unsigned i;
2169
2170         for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2171                 if (ca->mi.data_allowed & (1 << i))
2172                         set_bit(ca->dev_idx, c->rw_devs[i].d);
2173 }
2174
2175 void bch2_fs_allocator_background_init(struct bch_fs *c)
2176 {
2177         spin_lock_init(&c->freelist_lock);
2178         INIT_WORK(&c->discard_work, bch2_do_discards_work);
2179         INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
2180 }