// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <trace/events/bcachefs.h>

/* Persistent alloc info: */

static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
	BCH_ALLOC_FIELDS_V1()
#undef x
};

const char * const bch2_bucket_states[] = {
	"free",
	"need gc gens",
	"need discard",
	"cached",
	"dirty",
	NULL
};

struct bkey_alloc_unpacked {
	u64		journal_seq;
	u64		bucket;
	u8		dev;
	u8		gen;
	u8		oldest_gen;
	u8		data_type;
	bool		need_discard:1;
	bool		need_inc_gen:1;
#define x(_name, _bits)	u##_bits _name;
	BCH_ALLOC_FIELDS_V2()
#undef  x
};
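
/*
 * v1 alloc keys pack their fields as variable-width integers, with a bitmask
 * in bch_alloc.fields indicating which fields are present; the helpers below
 * read and write one field at a time, advancing *p past it:
 */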

static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
				     const void **p, unsigned field)
{
	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
	u64 v;

	if (!(a->fields & (1 << field)))
		return 0;

	switch (bytes) {
	case 1: v = *((const u8 *) *p);	break;
	case 2: v = le16_to_cpup(*p);	break;
	case 4: v = le32_to_cpup(*p);	break;
	case 8: v = le64_to_cpup(*p);	break;
	default: BUG();
	}

	*p += bytes;
	return v;
}

static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
				      unsigned field, u64 v)
{
	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];

	if (!v)
		return;

	a->v.fields |= 1 << field;

	switch (bytes) {
	case 1: *((u8 *) *p) = v;			break;
	case 2: *((__le16 *) *p) = cpu_to_le16(v);	break;
	case 4: *((__le32 *) *p) = cpu_to_le32(v);	break;
	case 8: *((__le64 *) *p) = cpu_to_le64(v);	break;
	default: BUG();
	}

	*p += bytes;
}

static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
				 struct bkey_s_c k)
{
	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
	const void *d = in->data;
	unsigned idx = 0;

	out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
	BCH_ALLOC_FIELDS_V1()
#undef  x
}

static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen	= a.v->gen;
	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;

#define x(_name, _bits)							\
	if (fieldnr < a.v->nr_fields) {					\
		ret = bch2_varint_decode_fast(in, end, &v);		\
		if (ret < 0)						\
			return ret;					\
		in += ret;						\
	} else {							\
		v = 0;							\
	}								\
									\
	out->_name = v;							\
	if (v != out->_name)						\
		return -1;						\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef  x
	return 0;
}

static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen	= a.v->gen;
	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;
	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
	out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)							\
	if (fieldnr < a.v->nr_fields) {					\
		ret = bch2_varint_decode_fast(in, end, &v);		\
		if (ret < 0)						\
			return ret;					\
		in += ret;						\
	} else {							\
		v = 0;							\
	}								\
									\
	out->_name = v;							\
	if (v != out->_name)						\
		return -1;						\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef  x
	return 0;
}
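
/*
 * Unpack any on-disk alloc key version into the common in-memory
 * representation; fields absent from older versions are left zeroed:
 */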

static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
	struct bkey_alloc_unpacked ret = {
		.dev	= k.k->p.inode,
		.bucket	= k.k->p.offset,
		.gen	= 0,
	};

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		bch2_alloc_unpack_v1(&ret, k);
		break;
	case KEY_TYPE_alloc_v2:
		bch2_alloc_unpack_v2(&ret, k);
		break;
	case KEY_TYPE_alloc_v3:
		bch2_alloc_unpack_v3(&ret, k);
		break;
	}

	return ret;
}
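
/*
 * bch_alloc_v4 is the canonical in-memory form: older key versions are
 * converted on the fly, so the rest of the code only deals with v4:
 */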

void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
	if (k.k->type == KEY_TYPE_alloc_v4) {
		*out = *bkey_s_c_to_alloc_v4(k).v;
	} else {
		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

		*out = (struct bch_alloc_v4) {
			.journal_seq		= u.journal_seq,
			.flags			= u.need_discard,
			.gen			= u.gen,
			.oldest_gen		= u.oldest_gen,
			.data_type		= u.data_type,
			.stripe_redundancy	= u.stripe_redundancy,
			.dirty_sectors		= u.dirty_sectors,
			.cached_sectors		= u.cached_sectors,
			.io_time[READ]		= u.read_time,
			.io_time[WRITE]		= u.write_time,
			.stripe			= u.stripe,
		};
	}
}
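
/*
 * As bch2_alloc_to_v4(), but returns a mutable copy allocated from the
 * btree transaction's memory pool, suitable for bch2_trans_update():
 */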

struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_i_alloc_v4 *ret;

	if (k.k->type == KEY_TYPE_alloc_v4) {
		ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
		if (!IS_ERR(ret))
			bkey_reassemble(&ret->k_i, k);
	} else {
		ret = bch2_trans_kmalloc(trans, sizeof(*ret));
		if (!IS_ERR(ret)) {
			bkey_alloc_v4_init(&ret->k_i);
			ret->k.p = k.k->p;
			bch2_alloc_to_v4(k, &ret->v);
		}
	}
	return ret;
}
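
/*
 * Look up a bucket's alloc key with an intent lock and return a mutable v4
 * copy; on success the caller owns @iter and must call
 * bch2_trans_iter_exit() when done with it:
 */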

struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
			      struct bpos pos)
{
	struct bkey_s_c k;
	struct bkey_i_alloc_v4 *a;
	int ret;

	bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
			     BTREE_ITER_WITH_UPDATES|
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret) {
		bch2_trans_iter_exit(trans, iter);
		return ERR_PTR(ret);
	}

	a = bch2_alloc_to_v4_mut(trans, k);
	if (IS_ERR(a))
		bch2_trans_iter_exit(trans, iter);
	return a;
}
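
/*
 * A minimal usage sketch (mirroring bch2_bucket_io_time_reset() below;
 * @pos is whatever bucket the caller is updating):
 *
 *	struct btree_iter iter;
 *	struct bkey_i_alloc_v4 *a =
 *		bch2_trans_start_alloc_update(trans, &iter, pos);
 *
 *	if (!IS_ERR(a)) {
 *		a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
 *		ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
 *		      bch2_trans_commit(trans, NULL, NULL, 0);
 *		bch2_trans_iter_exit(trans, &iter);
 *	}
 */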

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
	unsigned i, bytes = offsetof(struct bch_alloc, data);

	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
		if (a->fields & (1 << i))
			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

	return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  int rw, struct printbuf *err)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

	/* allow for unknown fields */
	if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
		pr_buf(err, "incorrect value size (%zu < %u)",
		       bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
		return -EINVAL;
	}

	return 0;
}

int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  int rw, struct printbuf *err)
{
	struct bkey_alloc_unpacked u;

	if (bch2_alloc_unpack_v2(&u, k)) {
		pr_buf(err, "unpack error");
		return -EINVAL;
	}

	return 0;
}

int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  int rw, struct printbuf *err)
{
	struct bkey_alloc_unpacked u;

	if (bch2_alloc_unpack_v3(&u, k)) {
		pr_buf(err, "unpack error");
		return -EINVAL;
	}

	return 0;
}

int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  int rw, struct printbuf *err)
{
	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);

	if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) {
		pr_buf(err, "bad val size (%zu != %zu)",
		       bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4));
		return -EINVAL;
	}

	if (a.v->cached_sectors &&
	    !a.v->dirty_sectors &&
	    !a.v->io_time[READ]) {
		pr_buf(err, "cached bucket with read_time == 0");
		return -EINVAL;
	}

	if (!a.v->dirty_sectors &&
	    !a.v->cached_sectors &&
	    !a.v->stripe &&
	    a.v->data_type) {
		pr_buf(err, "empty, but data_type nonzero");
		return -EINVAL;
	}

	return 0;
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;

	a->journal_seq		= swab64(a->journal_seq);
	a->flags		= swab32(a->flags);
	a->dirty_sectors	= swab32(a->dirty_sectors);
	a->cached_sectors	= swab32(a->cached_sectors);
	a->io_time[0]		= swab64(a->io_time[0]);
	a->io_time[1]		= swab64(a->io_time[1]);
	a->stripe		= swab32(a->stripe);
	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_alloc_v4 a;

	bch2_alloc_to_v4(k, &a);

	pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu",
	       a.gen, a.oldest_gen, bch2_data_types[a.data_type],
	       a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a));
	pr_buf(out, " dirty_sectors %u",	a.dirty_sectors);
	pr_buf(out, " cached_sectors %u",	a.cached_sectors);
	pr_buf(out, " stripe %u",		a.stripe);
	pr_buf(out, " stripe_redundancy %u",	a.stripe_redundancy);
	pr_buf(out, " read_time %llu",		a.io_time[READ]);
	pr_buf(out, " write_time %llu",		a.io_time[WRITE]);
}

int bch2_alloc_read(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_alloc_v4 a;
	struct bch_dev *ca;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		ca = bch_dev_bkey_exists(c, k.k->p.inode);
		bch2_alloc_to_v4(k, &a);

		*bucket_gen(ca, k.k->p.offset) = a.gen;
	}
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);

	if (ret)
		bch_err(c, "error reading alloc info: %i", ret);

	return ret;
}

/* Free space/discard btree: */
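
/*
 * Every bucket that is free or awaiting a discard has a corresponding
 * KEY_TYPE_set key in the freespace or need_discard btree;
 * bch2_bucket_do_index() adds and removes those keys as buckets change
 * state. Freespace keys also encode the bucket's generation bits in the
 * key position (alloc_freespace_pos()), so stale entries can be detected:
 */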

static int bch2_bucket_do_index(struct btree_trans *trans,
				struct bkey_s_c alloc_k,
				struct bch_alloc_v4 a,
				bool set)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
	struct btree_iter iter;
	struct bkey_s_c old;
	struct bkey_i *k;
	enum bucket_state state = bucket_state(a);
	enum btree_id btree;
	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
	enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (state != BUCKET_free &&
	    state != BUCKET_need_discard)
		return 0;

	k = bch2_trans_kmalloc(trans, sizeof(*k));
	if (IS_ERR(k))
		return PTR_ERR(k);

	bkey_init(&k->k);
	k->k.type = new_type;

	switch (state) {
	case BUCKET_free:
		btree = BTREE_ID_freespace;
		k->k.p = alloc_freespace_pos(alloc_k.k->p, a);
		bch2_key_resize(&k->k, 1);
		break;
	case BUCKET_need_discard:
		btree = BTREE_ID_need_discard;
		k->k.p = alloc_k.k->p;
		break;
	default:
		return 0;
	}

	bch2_trans_iter_init(trans, &iter, btree,
			     bkey_start_pos(&k->k),
			     BTREE_ITER_INTENT);
	old = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(old);
	if (ret)
		goto err;

	if (ca->mi.freespace_initialized &&
	    bch2_fs_inconsistent_on(old.k->type != old_type, c,
			"incorrect key when %s %s btree (got %s should be %s)\n"
			"  for %s",
			set ? "setting" : "clearing",
			bch2_btree_ids[btree],
			bch2_bkey_types[old.k->type],
			bch2_bkey_types[old_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		ret = -EIO;
		goto err;
	}

	ret = bch2_trans_update(trans, &iter, k, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}
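
/*
 * Transactional trigger for alloc keys: keeps the freespace/need_discard
 * btrees and the LRU btree in sync with the bucket's new state:
 */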

int bch2_trans_mark_alloc(struct btree_trans *trans,
			  struct bkey_s_c old, struct bkey_i *new,
			  unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 old_a, *new_a;
	u64 old_lru, new_lru;
	int ret = 0;

	/*
	 * Deletion only happens in the device removal path, with
	 * BTREE_TRIGGER_NORUN:
	 */
	BUG_ON(new->k.type != KEY_TYPE_alloc_v4);

	bch2_alloc_to_v4(old, &old_a);
	new_a = &bkey_i_to_alloc_v4(new)->v;

	if (new_a->dirty_sectors > old_a.dirty_sectors ||
	    new_a->cached_sectors > old_a.cached_sectors) {
		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
		new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
		SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
	}

	if (old_a.data_type && !new_a->data_type &&
	    old_a.gen == new_a->gen &&
	    !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
		new_a->gen++;
		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
	}

	if (bucket_state(old_a) != bucket_state(*new_a) ||
	    (bucket_state(*new_a) == BUCKET_free &&
	     alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) {
		ret =   bch2_bucket_do_index(trans, old, old_a, false) ?:
			bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true);
		if (ret)
			return ret;
	}

	old_lru = alloc_lru_idx(old_a);
	new_lru = alloc_lru_idx(*new_a);

	if (old_lru != new_lru) {
		ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
				      old_lru, &new_lru);
		if (ret)
			return ret;

		if (new_lru && new_a->io_time[READ] != new_lru)
			new_a->io_time[READ] = new_lru;
	}

	return 0;
}
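
/* fsck: verify the freespace/need_discard btrees against each alloc key: */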

static int bch2_check_alloc_key(struct btree_trans *trans,
				struct btree_iter *alloc_iter)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca;
	struct btree_iter discard_iter, freespace_iter;
	struct bch_alloc_v4 a;
	unsigned discard_key_type, freespace_key_type;
	struct bkey_s_c alloc_k, k;
	struct printbuf buf = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	int ret;

	alloc_k = bch2_btree_iter_peek(alloc_iter);
	if (!alloc_k.k)
		return 0;

	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
			"alloc key for invalid device or bucket"))
		return bch2_btree_delete_at(trans, alloc_iter, 0);

	ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
	if (!ca->mi.freespace_initialized)
		return 0;

	bch2_alloc_to_v4(alloc_k, &a);

	discard_key_type = bucket_state(a) == BUCKET_need_discard
		? KEY_TYPE_set : 0;
	freespace_key_type = bucket_state(a) == BUCKET_free
		? KEY_TYPE_set : 0;

	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard,
			     alloc_k.k->p, 0);
	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace,
			     alloc_freespace_pos(alloc_k.k->p, a), 0);

	k = bch2_btree_iter_peek_slot(&discard_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != discard_key_type, c,
			"incorrect key in need_discard btree (got %s should be %s)\n"
			"  %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[discard_key_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type	= discard_key_type;
		update->k.p	= discard_iter.pos;

		ret = bch2_trans_update(trans, &discard_iter, update, 0);
		if (ret)
			goto err;
	}

	k = bch2_btree_iter_peek_slot(&freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != freespace_key_type, c,
			"incorrect key in freespace btree (got %s should be %s)\n"
			"  %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[freespace_key_type],
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type	= freespace_key_type;
		update->k.p	= freespace_iter.pos;
		bch2_key_resize(&update->k, 1);

		ret = bch2_trans_update(trans, &freespace_iter, update, 0);
		if (ret)
			goto err;
	}
err:
fsck_err:
	bch2_trans_iter_exit(trans, &freespace_iter);
	bch2_trans_iter_exit(trans, &discard_iter);
	printbuf_exit(&buf2);
	printbuf_exit(&buf);
	return ret;
}
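
/*
 * fsck, in the other direction: verify that every key in the need_discard
 * and freespace btrees refers back to a bucket in the matching state:
 */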

static int bch2_check_discard_freespace_key(struct btree_trans *trans,
					    struct btree_iter *iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter alloc_iter;
	struct bkey_s_c k, freespace_k;
	struct bch_alloc_v4 a;
	u64 genbits;
	struct bpos pos;
	struct bkey_i *update;
	enum bucket_state state = iter->btree_id == BTREE_ID_need_discard
		? BUCKET_need_discard
		: BUCKET_free;
	struct printbuf buf = PRINTBUF;
	int ret;

	freespace_k = bch2_btree_iter_peek(iter);
	if (!freespace_k.k)
		return 1;

	ret = bkey_err(freespace_k);
	if (ret)
		return ret;

	pos = iter->pos;
	pos.offset &= ~(~0ULL << 56);
	genbits = iter->pos.offset & (~0ULL << 56);

	bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);

	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
			"%llu:%llu set in %s btree but device or bucket does not exist",
			pos.inode, pos.offset,
			bch2_btree_ids[iter->btree_id]))
		goto delete;

	k = bch2_btree_iter_peek_slot(&alloc_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	bch2_alloc_to_v4(k, &a);

	if (fsck_err_on(bucket_state(a) != state ||
			(state == BUCKET_free &&
			 genbits != alloc_freespace_genbits(a)), c,
			"%s\n  incorrectly set in %s index (free %u, genbits %llu should be %llu)",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
			bch2_btree_ids[iter->btree_id],
			bucket_state(a) == state,
			genbits >> 56, alloc_freespace_genbits(a) >> 56))
		goto delete;
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &alloc_iter);
	printbuf_exit(&buf);
	return ret;
delete:
	if (iter->btree_id == BTREE_ID_freespace) {
		/* should probably add a helper for deleting extents */
		update = bch2_trans_kmalloc(trans, sizeof(*update));
		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.p = iter->pos;
		bch2_key_resize(&update->k, 1);

		ret = bch2_trans_update(trans, iter, update, 0);
	} else {
		ret = bch2_btree_delete_at(trans, iter, 0);
	}
	goto out;
}

int bch2_check_alloc_info(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
			bch2_check_alloc_key(&trans, &iter));
		if (ret)
			break;
	}
	bch2_trans_iter_exit(&trans, &iter);

	if (ret < 0)
		goto err;

	bch2_trans_iter_init(&trans, &iter, BTREE_ID_need_discard, POS_MIN,
			     BTREE_ITER_PREFETCH);
	while (1) {
		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
			bch2_check_discard_freespace_key(&trans, &iter));
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
	}
	bch2_trans_iter_exit(&trans, &iter);

	if (ret < 0)
		goto err;

	bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN,
			     BTREE_ITER_PREFETCH);
	while (1) {
		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
			bch2_check_discard_freespace_key(&trans, &iter));
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
	}
	bch2_trans_iter_exit(&trans, &iter);
err:
	bch2_trans_exit(&trans);
	return ret < 0 ? ret : 0;
}
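
/* fsck: every bucket with cached data needs a matching entry in the LRU btree: */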

static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
				       struct btree_iter *alloc_iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter lru_iter;
	struct bch_alloc_v4 a;
	struct bkey_s_c alloc_k, k;
	struct printbuf buf = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	int ret;

	alloc_k = bch2_btree_iter_peek(alloc_iter);
	if (!alloc_k.k)
		return 0;

	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	bch2_alloc_to_v4(alloc_k, &a);

	if (bucket_state(a) != BUCKET_cached)
		return 0;

	bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
			     POS(alloc_k.k->p.inode, a.io_time[READ]), 0);

	k = bch2_btree_iter_peek_slot(&lru_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(!a.io_time[READ], c,
			"cached bucket with read_time 0\n"
			"  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
	    fsck_err_on(k.k->type != KEY_TYPE_lru ||
			le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c,
			"incorrect/missing lru entry\n"
			"  %s\n"
			"  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			(bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
		u64 read_time = a.io_time[READ];

		if (!a.io_time[READ])
			a.io_time[READ] = atomic64_read(&c->io_clock[READ].now);

		ret = bch2_lru_change(trans,
				      alloc_k.k->p.inode,
				      alloc_k.k->p.offset,
				      0, &a.io_time[READ]);
		if (ret)
			goto err;

		if (a.io_time[READ] != read_time) {
			struct bkey_i_alloc_v4 *a_mut =
				bch2_alloc_to_v4_mut(trans, alloc_k);
			ret = PTR_ERR_OR_ZERO(a_mut);
			if (ret)
				goto err;

			a_mut->v.io_time[READ] = a.io_time[READ];
			ret = bch2_trans_update(trans, alloc_iter,
						&a_mut->k_i, BTREE_TRIGGER_NORUN);
			if (ret)
				goto err;
		}
	}
err:
fsck_err:
	bch2_trans_iter_exit(trans, &lru_iter);
	printbuf_exit(&buf2);
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		ret = __bch2_trans_do(&trans, NULL, NULL,
				      BTREE_INSERT_NOFAIL|
				      BTREE_INSERT_LAZY_RW,
			bch2_check_alloc_to_lru_ref(&trans, &iter));
		if (ret)
			break;
	}
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);
	return ret < 0 ? ret : 0;
}
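
/*
 * Issue the actual discard for one bucket (if the device has discards
 * enabled) and clear its need_discard flag; called from the discard worker
 * below:
 */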

static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos,
				   struct bch_dev *ca, bool *discard_done)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_alloc_v4 *a;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos,
			     BTREE_ITER_CACHED);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto out;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
		a->v.gen++;
		SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
		goto write;
	}

	BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk);

	if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c,
			"%s\n  incorrectly set in need_discard btree",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = -EIO;
		goto out;
	}

	if (!*discard_done && ca->mi.discard && !c->opts.nochanges) {
		/*
		 * This works without any other locks because this is the only
		 * thread that removes items from the need_discard tree
		 */
		bch2_trans_unlock(trans);
		blkdev_issue_discard(ca->disk_sb.bdev,
				     k.k->p.offset * ca->mi.bucket_size,
				     ca->mi.bucket_size,
				     GFP_KERNEL, 0);
		*discard_done = true;

		ret = bch2_trans_relock(trans) ? 0 : -EINTR;
		if (ret)
			goto out;
	}

	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
write:
	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}

static void bch2_do_discards_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
	struct bch_dev *ca = NULL;
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_need_discard,
			   POS_MIN, 0, k, ret) {
		bool discard_done = false;

		if (ca && k.k->p.inode != ca->dev_idx) {
			percpu_ref_put(&ca->io_ref);
			ca = NULL;
		}

		if (!ca) {
			ca = bch_dev_bkey_exists(c, k.k->p.inode);
			if (!percpu_ref_tryget(&ca->io_ref)) {
				ca = NULL;
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}
		}

		seen++;

		if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) {
			open++;
			continue;
		}

		if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
				c->journal.flushed_seq_ondisk,
				k.k->p.inode, k.k->p.offset)) {
			need_journal_commit++;
			continue;
		}

		ret = __bch2_trans_do(&trans, NULL, NULL,
				      BTREE_INSERT_USE_RESERVE|
				      BTREE_INSERT_NOFAIL,
				bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done));
		if (ret)
			break;

		discarded++;
	}
	bch2_trans_iter_exit(&trans, &iter);

	if (ca)
		percpu_ref_put(&ca->io_ref);

	bch2_trans_exit(&trans);

	if (need_journal_commit * 2 > seen)
		bch2_journal_flush_async(&c->journal, NULL);

	percpu_ref_put(&c->writes);

	trace_do_discards(c, seen, open, need_journal_commit, discarded, ret);
}

void bch2_do_discards(struct bch_fs *c)
{
	if (percpu_ref_tryget(&c->writes) &&
	    !queue_work(system_long_wq, &c->discard_work))
		percpu_ref_put(&c->writes);
}
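
/*
 * Invalidate the least recently used cached bucket on @ca: bumping the
 * generation number makes any existing cached pointers into it stale:
 */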

static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
{
	struct bch_fs *c = trans->c;
	struct btree_iter lru_iter, alloc_iter = { NULL };
	struct bkey_s_c k;
	struct bkey_i_alloc_v4 *a;
	u64 bucket, idx;
	int ret;

	bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
			     POS(ca->dev_idx, 0), 0);
	k = bch2_btree_iter_peek(&lru_iter);
	ret = bkey_err(k);
	if (ret)
		goto out;

	if (!k.k || k.k->p.inode != ca->dev_idx)
		goto out;

	if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c,
				    "non lru key in lru btree"))
		goto out;

	idx	= k.k->p.offset;
	bucket	= le64_to_cpu(bkey_s_c_to_lru(k).v->idx);

	a = bch2_trans_start_alloc_update(trans, &alloc_iter,
					  POS(ca->dev_idx, bucket));
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c,
			"invalidating bucket with wrong lru idx (got %llu should be %llu",
			idx, alloc_lru_idx(a->v)))
		goto out;

	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
	a->v.gen++;
	a->v.data_type		= 0;
	a->v.dirty_sectors	= 0;
	a->v.cached_sectors	= 0;
	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);

	ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
				BTREE_TRIGGER_BUCKET_INVALIDATE);
out:
	bch2_trans_iter_exit(trans, &alloc_iter);
	bch2_trans_iter_exit(trans, &lru_iter);
	return ret;
}

static void bch2_do_invalidates_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
	struct bch_dev *ca;
	struct btree_trans trans;
	unsigned i;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_member_device(ca, c, i)
		while (!ret && should_invalidate_buckets(ca))
			ret = __bch2_trans_do(&trans, NULL, NULL,
					      BTREE_INSERT_USE_RESERVE|
					      BTREE_INSERT_NOFAIL,
					invalidate_one_bucket(&trans, ca));

	bch2_trans_exit(&trans);
	percpu_ref_put(&c->writes);
}

void bch2_do_invalidates(struct bch_fs *c)
{
	if (percpu_ref_tryget(&c->writes))
		queue_work(system_long_wq, &c->invalidate_work);
}
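
/*
 * Walk a device's alloc keys and populate the freespace/need_discard
 * btrees; runs for any device whose superblock member entry doesn't yet
 * have the freespace_initialized bit set (e.g. freshly added devices):
 */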

static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_alloc_v4 a;
	struct bch_member *m;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_alloc,
			   POS(ca->dev_idx, ca->mi.first_bucket),
			   BTREE_ITER_SLOTS|
			   BTREE_ITER_PREFETCH, k, ret) {
		if (iter.pos.offset >= ca->mi.nbuckets)
			break;

		bch2_alloc_to_v4(k, &a);
		ret = __bch2_trans_do(&trans, NULL, NULL,
				      BTREE_INSERT_LAZY_RW,
				 bch2_bucket_do_index(&trans, k, a, true));
		if (ret)
			break;
	}
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);

	if (ret) {
		bch_err(ca, "error initializing free space: %i", ret);
		return ret;
	}

	mutex_lock(&c->sb_lock);
	m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_fs_freespace_init(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;
	int ret = 0;
	bool doing_init = false;

	/*
	 * We can crash during the device add path, so we need to check this on
	 * every mount:
	 */

	for_each_member_device(ca, c, i) {
		if (ca->mi.freespace_initialized)
			continue;

		if (!doing_init) {
			bch_info(c, "initializing freespace");
			doing_init = true;
		}

		ret = bch2_dev_freespace_init(c, ca);
		if (ret) {
			percpu_ref_put(&ca->ref);
			return ret;
		}
	}

	if (doing_init) {
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);

		bch_verbose(c, "done initializing freespace");
	}

	return ret;
}

/* Bucket IO clocks: */

int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	u64 now;
	int ret = 0;

	a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	now = atomic64_read(&c->io_clock[rw].now);
	if (a->v.io_time[rw] == now)
		goto out;

	a->v.io_time[rw] = now;

	ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Startup/shutdown (ro/rw): */

void bch2_recalc_capacity(struct bch_fs *c)
{
	struct bch_dev *ca;
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;
	unsigned i;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(ca, c, i) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(ca, c, i) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run again when its
		 * reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is just used when considering if
		 * allocations for foreground writes must wait -
		 * not -ENOSPC calculations.
		 */

		dev_reserve += ca->nr_btree_reserve * 2;
		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}

	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	/* Wake up in case someone was waiting for buckets: */
	closure_wake_up(&c->freelist_wait);
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	/*
	 * Capacity is calculated based off of devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	/* Next, close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, &c->copygc_write_point);
	bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	while (1) {
		struct open_bucket *ob;

		spin_lock(&c->freelist_lock);
		if (!ca->open_buckets_partial_nr) {
			spin_unlock(&c->freelist_lock);
			break;
		}
		ob = c->open_buckets +
			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
		ob->on_partial_list = false;
		spin_unlock(&c->freelist_lock);

		bch2_open_bucket_put(c, ob);
	}

	bch2_ec_stop_dev(c, ca);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be removed and the capacity has changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
	INIT_WORK(&c->discard_work, bch2_do_discards_work);
	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}