// SPDX-License-Identifier: GPL-2.0

#include "alloc_foreground.h"
#include "bkey_on_stack.h"
#include "btree_update.h"
#include "disk_groups.h"

#include <linux/sort.h>
#include <linux/raid/pq.h>
#include <linux/raid/xor.h>
static void raid5_recov(unsigned disks, unsigned failed_idx,
			size_t size, void **data)
{
	unsigned i = 2, nr;

	BUG_ON(failed_idx >= disks);
	swap(data[0], data[failed_idx]);
	memcpy(data[0], data[1], size);

	while (i < disks) {
		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
		xor_blocks(nr, size, data[0], data + i);
		i += nr;
	}

	swap(data[0], data[failed_idx]);
}
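/*
 * Editor's note, not in the original: raid5_recov() reconstructs a single
 * missing block from XOR parity. Since P = D0 ^ D1 ^ ... ^ Dn-1, any one
 * missing block is the XOR of all surviving blocks (parity included). The
 * first swap() puts the destination buffer in slot 0 so the rest of the
 * stripe can be folded in with xor_blocks() in MAX_XOR_BLOCKS-sized batches;
 * the second swap() restores the original ordering.
 *
 * A minimal byte-wise sketch of the same idea, kept out of the build; the
 * buffer layout here is hypothetical and only meant to illustrate the math:
 */
#if 0
static void xor_recover_sketch(u8 *dst, u8 *const src[], unsigned nr_src,
			       size_t size)
{
	size_t i;
	unsigned b;

	memcpy(dst, src[0], size);		/* seed with the first surviving block */
	for (b = 1; b < nr_src; b++)		/* fold in the remaining blocks */
		for (i = 0; i < size; i++)
			dst[i] ^= src[b][i];
}
#endif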
static void raid_gen(int nd, int np, size_t size, void **v)
		raid5_recov(nd + np, nd, size, v);
		raid6_call.gen_syndrome(nd + np, size, v);

static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
			raid5_recov(nd + 1, ir[0], size, v);
			raid6_call.gen_syndrome(nd + np, size, v);

			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			raid_gen(nd, np, size, v);
		}
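/*
 * Editor's note, not in the original: raid_rec() dispatches on which blocks
 * failed. A single failed data block (or the P block) is rebuilt with plain
 * XOR via raid5_recov(); a failed Q block is regenerated with the raid6
 * syndrome code. For double failures: data+data uses raid6_2data_recov(),
 * data+P uses raid6_datap_recov(), data+Q rebuilds the data block from P and
 * then regenerates Q, and P+Q is simply recomputed from the intact data.
 */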
#include <raid/raid.h>

	struct ec_stripe_buf	*buf;

/* Stripes btree keys: */

const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

		return "invalid stripe key";

	if (bkey_val_bytes(k.k) < sizeof(*s))
		return "incorrect value size";

	if (bkey_val_bytes(k.k) < sizeof(*s) ||
	    bkey_val_u64s(k.k) < stripe_val_u64s(s))
		return "incorrect value size";

	return bch2_bkey_ptrs_invalid(c, k);

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

	pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
	       le16_to_cpu(s->sectors),
	       s->nr_blocks - s->nr_redundant,
	       1U << s->csum_granularity_bits);

	for (i = 0; i < s->nr_blocks; i++)
		pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
		       (u64) s->ptrs[i].offset,
		       stripe_blockcount_get(s, i));

static int ptr_matches_stripe(struct bch_fs *c,
			      struct bch_stripe *v,
			      const struct bch_extent_ptr *ptr)
	for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
		const struct bch_extent_ptr *ptr2 = v->ptrs + i;

		if (ptr->dev == ptr2->dev &&
		    ptr->gen == ptr2->gen &&
		    ptr->offset >= ptr2->offset &&
		    ptr->offset < ptr2->offset + le16_to_cpu(v->sectors))
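/*
 * Editor's note, not in the original: a pointer "matches" the stripe when it
 * falls on the same device, generation and sector range as one of the
 * stripe's data blocks; the matching block index is what callers use to
 * locate an extent's data within the stripe buffer.
 */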
static int extent_matches_stripe(struct bch_fs *c,
				 struct bch_stripe *v,
	case KEY_TYPE_extent: {
		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
		const struct bch_extent_ptr *ptr;

		extent_for_each_ptr(e, ptr) {
			idx = ptr_matches_stripe(c, v, ptr);

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
	case KEY_TYPE_extent: {
		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
		const union bch_extent_entry *entry;

		extent_for_each_entry(e, entry)
			if (extent_entry_type(entry) ==
			    BCH_EXTENT_ENTRY_stripe_ptr &&
			    entry->stripe_ptr.idx == idx)

static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i_stripe *s,
			       struct open_buckets *blocks,
			       struct open_buckets *parity,
			       unsigned stripe_size)
	struct open_bucket *ob;

	bkey_stripe_init(&s->k_i);
	s->v.sectors			= cpu_to_le16(stripe_size);
	s->v.nr_blocks			= parity->nr + blocks->nr;
	s->v.nr_redundant		= parity->nr;
	s->v.csum_granularity_bits	= ilog2(c->sb.encoded_extent_max);
	s->v.csum_type			= BCH_CSUM_CRC32C;

	open_bucket_for_each(c, blocks, ob, i)
		s->v.ptrs[i] = ob->ptr;

	open_bucket_for_each(c, parity, ob, i)
		s->v.ptrs[blocks->nr + i] = ob->ptr;

	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
		BUG_ON(1 << s->v.csum_granularity_bits >=
		       le16_to_cpu(s->v.sectors) ||
		       s->v.csum_granularity_bits == U8_MAX);
		s->v.csum_granularity_bits++;

	set_bkey_val_u64s(&s->k, u64s);
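/*
 * Editor's note, not in the original: the stripe key embeds one checksum per
 * csum_granularity chunk per block, so its encoded size grows as granularity
 * shrinks. The loop above doubles the granularity (by bumping
 * csum_granularity_bits) until the value fits in BKEY_VAL_U64s_MAX. Rough
 * example with made-up numbers: a 128-sector block with 4-byte crc32c
 * checksums at granularity 1 needs 128 * 4 bytes of checksums per block; at
 * granularity 8 sectors (csum_granularity_bits == 3) that drops to 16 * 4
 * bytes.
 */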
static void ec_generate_checksums(struct ec_stripe_buf *buf)
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csums_per_device = stripe_csums_per_device(v);
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];

	BUG_ON(buf->size != le16_to_cpu(v->sectors));

	for (i = 0; i < v->nr_blocks; i++) {
		for (j = 0; j < csums_per_device; j++) {
			unsigned offset = j << v->csum_granularity_bits;
			unsigned len = min(csum_granularity, buf->size - offset);

			struct bch_csum csum =
				bch2_checksum(NULL, v->csum_type,
					      buf->data[i] + (offset << 9),

			memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
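/*
 * Editor's note, not in the original: checksums are computed per block, in
 * csum_granularity-sized chunks of sectors; offset and len are in sectors,
 * hence the << 9 when indexing into the byte buffer. stripe_csum(v, i, j)
 * addresses the j'th checksum of block i inside the stripe key itself.
 *
 * A hypothetical helper, for illustration only (kept out of the build),
 * showing how a sector offset within a block maps to its checksum slot:
 */
#if 0
static unsigned csum_slot_for_sector(const struct bch_stripe *v, unsigned sector)
{
	/* inverse of "offset = j << v->csum_granularity_bits" above */
	return sector >> v->csum_granularity_bits;
}
#endif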
static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		if (!test_bit(i, buf->valid))

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum csum;

			BUG_ON(offset & (csum_granularity - 1));
			BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
			       ((offset + len) & (csum_granularity - 1)));

			csum = bch2_checksum(NULL, v->csum_type,
					     buf->data[i] + ((offset - buf->offset) << 9),

			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
					"checksum error while doing reconstruct read (%u:%u)",
				clear_bit(i, buf->valid);
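/*
 * Editor's note, not in the original: on the reconstruct-read path a block
 * that was read successfully but fails checksum verification is treated the
 * same as a block that could not be read at all: its bit in buf->valid is
 * cleared, so ec_do_recov() will rebuild it from the remaining blocks.
 */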
/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
	struct bch_stripe *v = &buf->key.v;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = le16_to_cpu(v->sectors) << 9;

	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);

static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
	return nr - bitmap_weight(buf->valid, nr);

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
	return __ec_nr_failed(buf, buf->key.v.nr_blocks);

static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
	struct bch_stripe *v = &buf->key.v;
	unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = buf->size << 9;

	if (ec_nr_failed(buf) > v->nr_redundant) {
			"error doing reconstruct read: unable to read enough blocks");

	for (i = 0; i < nr_data; i++)
		if (!test_bit(i, buf->valid))
			failed[nr_failed++] = i;

	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
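/*
 * Editor's note, not in the original: recovery can only succeed while the
 * number of missing blocks is at most nr_redundant. The failed[] array lists
 * the indices of missing *data* blocks; raid_rec() rebuilds them in place in
 * buf->data using the surviving data and parity blocks.
 */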
static void ec_block_endio(struct bio *bio)
	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref);

static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
			unsigned rw, unsigned idx, struct closure *cl)
	struct bch_stripe *v = &buf->key.v;
	unsigned offset = 0, bytes = buf->size << 9;
	struct bch_extent_ptr *ptr = &v->ptrs[idx];
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

	if (!bch2_dev_get_ioref(ca, rw)) {
		clear_bit(idx, buf->valid);

	while (offset < bytes) {
		unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
					   DIV_ROUND_UP(bytes, PAGE_SIZE));
		unsigned b = min_t(size_t, bytes - offset,
				   nr_iovecs << PAGE_SHIFT);
		struct ec_bio *ec_bio;

		ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,

		bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
		bio_set_op_attrs(&ec_bio->bio, rw, 0);

		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
		ec_bio->bio.bi_end_io		= ec_block_endio;
		ec_bio->bio.bi_private		= cl;

		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

		percpu_ref_get(&ca->io_ref);

		submit_bio(&ec_bio->bio);

	percpu_ref_put(&ca->io_ref);
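/*
 * Editor's note, not in the original: a stripe block can be larger than a
 * single bio can carry, so the buffer is submitted in chunks of at most
 * BIO_MAX_PAGES pages each. The closure passed in by the caller tracks the
 * in-flight bios; ec_block_endio() marks the block invalid on I/O error so
 * the read path falls back to reconstruction.
 */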
/* recovery read path: */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
	struct btree_trans trans;
	struct btree_iter *iter;
	struct ec_stripe_buf *buf;
	struct bch_stripe *v;
	unsigned offset, end;
	unsigned i, nr_data, csum_granularity;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.has_ec);

	stripe_idx = rbio->pick.ec.idx;

	buf = kzalloc(sizeof(*buf), GFP_NOIO);

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,

	k = bch2_btree_iter_peek_slot(iter);
	if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
			"error doing reconstruct read: stripe not found");
		return bch2_trans_exit(&trans) ?: -EIO;

	bkey_reassemble(&buf->key.k_i, k);
	bch2_trans_exit(&trans);

	nr_data = v->nr_blocks - v->nr_redundant;

	idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);

	csum_granularity = 1U << v->csum_granularity_bits;

	offset	= rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
	end	= offset + bio_sectors(&rbio->bio);

	BUG_ON(end > le16_to_cpu(v->sectors));

	buf->offset	= round_down(offset, csum_granularity);
	buf->size	= min_t(unsigned, le16_to_cpu(v->sectors),
				round_up(end, csum_granularity)) - buf->offset;

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);

	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

		if (ptr_stale(ca, ptr)) {
				"error doing reconstruct read: stale pointer");
			clear_bit(i, buf->valid);

		ec_block_io(c, buf, REQ_OP_READ, i, &cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
			"error doing reconstruct read: unable to read enough blocks");

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);

	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[idx] + ((offset - buf->offset) << 9));

	for (i = 0; i < v->nr_blocks; i++)
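/*
 * Editor's note, not in the original: the reconstruct read path, in order:
 * look up the stripe key, size a bounce buffer rounded out to checksum
 * granularity, read every block of the stripe (marking stale or unreadable
 * blocks invalid), verify checksums, rebuild the missing blocks with
 * ec_do_recov(), and finally copy the requested range out of the
 * reconstructed data block into the original read bio.
 */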
/* stripe bucket accounting: */

static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))

		spin_lock(&c->ec_stripes_heap_lock);
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
		spin_unlock(&c->ec_stripes_heap_lock);

	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))

	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
	    !genradix_ptr_alloc(&c->stripes[1], idx, gfp))

static int ec_stripe_mem_alloc(struct bch_fs *c,
			       struct btree_iter *iter)
	size_t idx = iter->pos.offset;

	if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))

	bch2_trans_unlock(iter->trans);

	if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))

static ssize_t stripe_idx_to_delete(struct bch_fs *c)
	ec_stripes_heap *h = &c->ec_stripes_heap;

	return h->used && h->data[0].blocks_nonempty == 0
		? h->data[0].idx : -1;
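/*
 * Editor's note, not in the original: ec_stripes_heap keeps the stripe with
 * the fewest non-empty blocks at the root, so when h->data[0].blocks_nonempty
 * reaches zero that stripe holds no live data and becomes a deletion
 * candidate. ec_stripe_mem_alloc() above first tries GFP_NOWAIT with btree
 * locks held and only drops the locks for a blocking GFP_KERNEL allocation if
 * that fails. The comparator below is equivalent to cmp_int() on
 * blocks_nonempty.
 */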
static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
				      struct ec_stripe_heap_entry l,
				      struct ec_stripe_heap_entry r)
	return ((l.blocks_nonempty > r.blocks_nonempty) -
		(l.blocks_nonempty < r.blocks_nonempty));

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes[0], idx);

	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);

void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
	ec_stripes_heap *h = &c->ec_stripes_heap;

	heap_verify_backpointer(c, idx);

	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

	heap_sift_up(h, i, ec_stripes_heap_cmp,
		     ec_stripes_heap_set_backpointer);
	heap_sift_down(h, i, ec_stripes_heap_cmp,
		       ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);

	bch2_stripes_heap_insert(c, m, idx);

	if (stripe_idx_to_delete(c) >= 0 &&
	    !percpu_ref_is_dying(&c->writes))
		schedule_work(&c->ec_stripe_delete_work);

void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
	heap_verify_backpointer(c, idx);

	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_set_backpointer);

void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.blocks_nonempty = m->blocks_nonempty,
		 ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);

/* stripe deletion */

static int ec_stripe_delete(struct bch_fs *c, size_t idx)
	return bch2_btree_delete_range(c, BTREE_ID_EC,

static void ec_stripe_delete_work(struct work_struct *work)
		container_of(work, struct bch_fs, ec_stripe_delete_work);

	down_read(&c->gc_lock);
	mutex_lock(&c->ec_stripe_create_lock);

		spin_lock(&c->ec_stripes_heap_lock);
		idx = stripe_idx_to_delete(c);
		spin_unlock(&c->ec_stripes_heap_lock);

		if (ec_stripe_delete(c, idx))

	mutex_unlock(&c->ec_stripe_create_lock);
	up_read(&c->gc_lock);
/* stripe creation: */

static int ec_stripe_bkey_insert(struct bch_fs *c,
				 struct bkey_i_stripe *stripe)
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bpos start_pos = POS(0, c->ec_stripe_hint);

	bch2_trans_init(&trans, c, 0, 0);
	bch2_trans_begin(&trans);

	for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos,
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
			if (start_pos.offset) {
				bch2_btree_iter_set_pos(iter, start_pos);

		if (bkey_deleted(k.k))

	start_pos = iter->pos;

	ret = ec_stripe_mem_alloc(c, iter);

	stripe->k.p = iter->pos;

	bch2_trans_update(&trans, iter, &stripe->k_i);

	ret = bch2_trans_commit(&trans, NULL, NULL,
				BTREE_INSERT_NOFAIL);

	c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1;
	bch2_trans_exit(&trans);

static void extent_stripe_ptr_add(struct bkey_s_extent e,
				  struct ec_stripe_buf *s,
				  struct bch_extent_ptr *ptr,
	struct bch_extent_stripe_ptr *dst = (void *) ptr;
	union bch_extent_entry *end = extent_entry_last(e);

	memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
	e.k->u64s += sizeof(*dst) / sizeof(u64);

	*dst = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.idx = s->key.k.p.offset,
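/*
 * Editor's note, not in the original: once a stripe exists, every extent
 * whose pointer lands in one of the stripe's data blocks gets a
 * bch_extent_stripe_ptr entry inserted immediately after that pointer. The
 * memmove_u64s_up() opens a gap inside the bkey value and the key's u64
 * count grows by sizeof(struct bch_extent_stripe_ptr) / sizeof(u64); .idx
 * records which stripe (by its offset in the EC btree) the pointer now
 * belongs to.
 */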
static int ec_stripe_update_ptrs(struct bch_fs *c,
				 struct ec_stripe_buf *s,
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_extent e;
	struct bkey_on_stack sk;
	int ret = 0, dev, idx;

	bkey_on_stack_init(&sk);
	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,

	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
		struct bch_extent_ptr *ptr, *ec_ptr = NULL;

		if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
			bch2_btree_iter_next(iter);

		idx = extent_matches_stripe(c, &s->key.v, k);
			bch2_btree_iter_next(iter);

		bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));

		dev = s->key.v.ptrs[idx].dev;

		bkey_on_stack_reassemble(&sk, c, k);
		e = bkey_i_to_s_extent(sk.k);

		extent_for_each_ptr(e, ptr) {

		extent_stripe_ptr_add(e, s, ec_ptr, idx);

		bch2_trans_update(&trans, iter, sk.k);

		ret = bch2_trans_commit(&trans, NULL, NULL,
					BTREE_INSERT_USE_RESERVE);

	bch2_trans_exit(&trans);
	bkey_on_stack_exit(&sk, c);
/* data buckets of new stripe all written: create the stripe */
static void ec_stripe_create(struct ec_stripe_new *s)
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bch_stripe *v = &s->stripe.key.v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;

	BUG_ON(s->h->s == s);

	closure_init_stack(&cl);

		bch_err(c, "error creating stripe: error writing data buckets");

	if (!percpu_ref_tryget(&c->writes))

	BUG_ON(bitmap_weight(s->blocks_allocated,
			     s->blocks.nr) != s->blocks.nr);

	ec_generate_ec(&s->stripe);

	ec_generate_checksums(&s->stripe);

	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);

	for (i = nr_data; i < v->nr_blocks; i++)
		if (!test_bit(i, s->stripe.valid)) {
			bch_err(c, "error creating stripe: error writing redundancy buckets");

	mutex_lock(&c->ec_stripe_create_lock);

	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
		bch_err(c, "error creating stripe: error creating stripe key");

	for_each_keylist_key(&s->keys, k) {
		ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);

	mutex_unlock(&c->ec_stripe_create_lock);

	percpu_ref_put(&c->writes);

	open_bucket_for_each(c, &s->blocks, ob, i) {
		__bch2_open_bucket_put(c, ob);

	bch2_open_buckets_put(c, &s->parity);

	bch2_keylist_free(&s->keys, s->inline_keys);

	mutex_lock(&s->h->lock);
	mutex_unlock(&s->h->lock);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
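/*
 * Editor's note, not in the original: stripe creation runs after all of the
 * stripe's data buckets have been filled by the normal write path. It
 * computes parity over the buffered data, checksums every block, writes only
 * the parity blocks out (the data blocks are already on disk), inserts the
 * stripe key into the EC btree, then rewrites the extents recorded on the
 * keylist so they carry stripe pointers and can later be served from the
 * stripe.
 */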
static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h)
	struct ec_stripe_new *s = h->s;

	list_add(&s->list, &h->stripes);

static void ec_stripe_new_put(struct ec_stripe_new *s)
	BUG_ON(atomic_read(&s->pin) <= 0);
	if (atomic_dec_and_test(&s->pin))

/* have a full bucket - hand it off to be erasure coded: */
void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
	struct ec_stripe_new *s = ob->ec;

	if (ob->sectors_free)

	ec_stripe_new_put(s);

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
	struct ec_stripe_new *s = ob->ec;

void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);

	ca	= bch_dev_bkey_exists(c, ob->ptr.dev);
	offset	= ca->mi.bucket_size - ob->sectors_free;

	return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);

void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
			     struct bpos pos, unsigned sectors)
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct ec_stripe_new *ec;

	mutex_lock(&ec->lock);

	if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
				 ARRAY_SIZE(ec->inline_keys),

	bkey_init(&ec->keys.top->k);
	ec->keys.top->k.p = pos;
	bch2_key_resize(&ec->keys.top->k, sectors);
	bch2_keylist_push(&ec->keys);

	mutex_unlock(&ec->lock);

static int unsigned_cmp(const void *_l, const void *_r)
	unsigned l = *((const unsigned *) _l);
	unsigned r = *((const unsigned *) _r);

	return cmp_int(l, r);

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	} cur = { 0, 0 }, best = { 0, 0 };

	for_each_member_device_rcu(ca, c, i, devs)
		sizes[nr++] = ca->mi.bucket_size;

	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

	for (i = 0; i < nr; i++) {
		if (sizes[i] != cur.size) {
			if (cur.nr > best.nr)

			cur.size = sizes[i];

	if (cur.nr > best.nr)
int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
	struct ec_stripe_new *s;

	BUG_ON(h->parity.nr != h->redundancy);
	BUG_ON(!h->blocks.nr);
	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);

	mutex_init(&s->lock);
	atomic_set(&s->pin, 1);

	s->blocks	= h->blocks;
	s->parity	= h->parity;

	memset(&h->blocks, 0, sizeof(h->blocks));
	memset(&h->parity, 0, sizeof(h->parity));

	bch2_keylist_init(&s->keys, s->inline_keys);

	s->stripe.offset	= 0;
	s->stripe.size		= h->blocksize;
	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));

	ec_stripe_key_init(c, &s->stripe.key,
			   &s->blocks, &s->parity,

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
		if (!s->stripe.data[i])

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);

static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy)
	struct ec_stripe_head *h;

	h = kzalloc(sizeof(*h), GFP_KERNEL);

	mutex_init(&h->lock);
	mutex_lock(&h->lock);
	INIT_LIST_HEAD(&h->stripes);

	h->redundancy	= redundancy;

	h->devs = target_rw_devs(c, BCH_DATA_USER, target);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(i, h->devs.d);

	h->blocksize = pick_blocksize(c, &h->devs);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	list_add(&h->list, &c->ec_new_stripe_list);

void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
	struct ec_stripe_new *s = NULL;

	    bitmap_weight(h->s->blocks_allocated,
			  h->s->blocks.nr) == h->s->blocks.nr)
		s = ec_stripe_set_pending(h);

	mutex_unlock(&h->lock);

		ec_stripe_new_put(s);

struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
					       unsigned redundancy)
	struct ec_stripe_head *h;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list)
		if (h->target == target &&
		    h->redundancy == redundancy) {
			mutex_lock(&h->lock);

	h = ec_new_stripe_head_alloc(c, target, algo, redundancy);

	mutex_unlock(&c->ec_new_stripe_lock);

void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
	struct ec_stripe_head *h;
	struct open_bucket *ob;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
		struct ec_stripe_new *s = NULL;

		mutex_lock(&h->lock);
		bch2_open_buckets_stop_dev(c, ca, &h->blocks);
		bch2_open_buckets_stop_dev(c, ca, &h->parity);

		open_bucket_for_each(c, &h->s->blocks, ob, i)
			if (ob->ptr.dev == ca->dev_idx)

		open_bucket_for_each(c, &h->s->parity, ob, i)
			if (ob->ptr.dev == ca->dev_idx)

		s = ec_stripe_set_pending(h);

		mutex_unlock(&h->lock);

			ec_stripe_new_put(s);

	mutex_unlock(&c->ec_new_stripe_lock);
static int __bch2_stripe_write_key(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_i_stripe *new_key,
	struct bch_fs *c = trans->c;

	bch2_btree_iter_set_pos(iter, POS(0, idx));

	k = bch2_btree_iter_peek_slot(iter);

	if (k.k->type != KEY_TYPE_stripe)

	bkey_reassemble(&new_key->k_i, k);

	spin_lock(&c->ec_stripes_heap_lock);

	for (i = 0; i < new_key->v.nr_blocks; i++)
		stripe_blockcount_set(&new_key->v, i,
				      m->block_sectors[i]);

	spin_unlock(&c->ec_stripes_heap_lock);

	bch2_trans_update(trans, iter, &new_key->k_i);

	return bch2_trans_commit(trans, NULL, NULL,
				 BTREE_INSERT_NOFAIL|flags);

int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
	struct btree_trans trans;
	struct btree_iter *iter;
	struct genradix_iter giter;
	struct bkey_i_stripe *new_key;

	new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	genradix_for_each(&c->stripes[0], giter, m) {
			bch2_trans_reset(&trans, TRANS_RESET_MEM);

			ret = __bch2_stripe_write_key(&trans, iter, m,
						      giter.pos, new_key, flags);
		} while (ret == -EINTR);

	bch2_trans_exit(&trans);

int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
	struct btree_trans trans;
	struct btree_iter *btree_iter;
	struct journal_iter journal_iter;
	struct bkey_s_c btree_k, journal_k;

	ret = bch2_fs_ec_start(c);

	bch2_trans_init(&trans, c, 0, 0);

	btree_iter	= bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0);
	journal_iter	= bch2_journal_iter_init(journal_keys, BTREE_ID_EC);

	btree_k		= bch2_btree_iter_peek(btree_iter);
	journal_k	= bch2_journal_iter_peek(&journal_iter);

		if (btree_k.k && journal_k.k) {
			int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);

				btree_k = bch2_btree_iter_next(btree_iter);
		} else if (btree_k.k) {
		} else if (journal_k.k) {

		bch2_mark_key(c, btree ? btree_k : journal_k,
			      BCH_BUCKET_MARK_ALLOC_READ|
			      BCH_BUCKET_MARK_NOATOMIC);

			btree_k = bch2_btree_iter_next(btree_iter);
			journal_k = bch2_journal_iter_next(&journal_iter);

	ret = bch2_trans_exit(&trans) ?: ret;

		bch_err(c, "error reading stripes: %i", ret);
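/*
 * Editor's note, not in the original: at mount time bch2_stripes_read() walks
 * the EC btree and the journal keys for that btree in parallel, always
 * consuming whichever key sorts first (and preferring the journal copy when
 * positions collide), so each stripe is marked exactly once with its newest
 * version.
 */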
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
	struct btree_trans trans;
	struct btree_iter *iter;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);

	k = bch2_btree_iter_prev(iter);
	if (!IS_ERR_OR_NULL(k.k))
		idx = k.k->p.offset + 1;
	ret = bch2_trans_exit(&trans);

	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),

	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);

	for (i = 0; i < idx; i++)
		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))

int bch2_fs_ec_start(struct bch_fs *c)
	return bch2_ec_mem_alloc(c, false);

void bch2_fs_ec_exit(struct bch_fs *c)
	struct ec_stripe_head *h;

	mutex_lock(&c->ec_new_stripe_lock);
	h = list_first_entry_or_null(&c->ec_new_stripe_list,
				     struct ec_stripe_head, list);
	mutex_unlock(&c->ec_new_stripe_lock);

	BUG_ON(!list_empty(&h->stripes));

	free_heap(&c->ec_stripes_heap);
	genradix_free(&c->stripes[0]);
	bioset_exit(&c->ec_bioset);

int bch2_fs_ec_init(struct bch_fs *c)
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);

	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),