// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "keylist.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>
static void raid5_recov(unsigned disks, unsigned failed_idx,
			size_t size, void **data)
{
	unsigned i = 2, nr;

	BUG_ON(failed_idx >= disks);

	swap(data[0], data[failed_idx]);
	memcpy(data[0], data[1], size);

	while (i < disks) {
		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
		xor_blocks(nr, size, data[0], data + i);
		i += nr;
	}

	swap(data[0], data[failed_idx]);
}
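/*
 * Worked example: with disks = 4 and blocks D0, D1, D2, P where
 * P = D0 ^ D1 ^ D2, recovering failed_idx = 1 xors every surviving
 * block into the (swapped-in) failed slot: D1 = P ^ D0 ^ D2. The same
 * routine doubles as the generator - pointing failed_idx at the
 * parity slot computes P itself, which is how raid_gen() below uses
 * it.
 */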
static void raid_gen(int nd, int np, size_t size, void **v)
{
	if (np >= 1)
		raid5_recov(nd + np, nd, size, v);
	if (np >= 2)
		raid6_call.gen_syndrome(nd + np, size, v);
	BUG_ON(np >= 3);
}
static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	switch (nr) {
	case 0:
		break;
	case 1:
		if (ir[0] < nd + 1)
			raid5_recov(nd + 1, ir[0], size, v);
		else
			raid6_call.gen_syndrome(nd + np, size, v);
		break;
	case 2:
		BUG_ON(np < 2);

		if (ir[1] < nd) {
			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			/* p+q failure: just regenerate. */
			raid_gen(nd, np, size, v);
		}
		break;
	default:
		BUG();
	}
}
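/*
 * Failure-case summary: a single failed block is either data (rebuilt
 * by xor via raid5_recov()) or redundancy (recomputed with
 * gen_syndrome()). For double failures: data+data uses
 * raid6_2data_recov(), data+p uses raid6_datap_recov(), data+q
 * rebuilds the data block from p and then recomputes q, and p+q is
 * plain regeneration from the intact data blocks.
 */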
#else

#include <raid/raid.h>

#endif

struct ec_bio {
	struct bch_dev		*ca;
	struct ec_stripe_buf	*buf;
	size_t			idx;
	struct bio		bio;
};

/* Stripes btree keys: */
const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

	if (k.k->p.inode)
		return "invalid stripe key";

	if (bkey_val_bytes(k.k) < sizeof(*s) ||
	    bkey_val_u64s(k.k) < stripe_val_u64s(s))
		return "incorrect value size";

	return bch2_bkey_ptrs_invalid(c, k);
}
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
			 struct bkey_s_c k)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	unsigned i;

	pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
	       s->algorithm,
	       le16_to_cpu(s->sectors),
	       s->nr_blocks - s->nr_redundant,
	       s->nr_redundant,
	       s->csum_type,
	       1U << s->csum_granularity_bits);

	for (i = 0; i < s->nr_blocks; i++)
		pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
		       (u64) s->ptrs[i].offset,
		       stripe_blockcount_get(s, i));

	bch2_bkey_ptrs_to_text(out, c, k);
}
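/*
 * Illustrative output for a 2+1 stripe (values made up; the trailing
 * entries are dev:offset:blockcount, one per block):
 *
 *   algo 0 sectors 256 blocks 2:1 csum 1 gran 8 0:512:128 1:768:96 2:1024:0
 */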
static int ptr_matches_stripe(struct bch_fs *c,
			      struct bch_stripe *v,
			      const struct bch_extent_ptr *ptr)
{
	unsigned i;

	for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
		const struct bch_extent_ptr *ptr2 = v->ptrs + i;

		if (ptr->dev == ptr2->dev &&
		    ptr->gen == ptr2->gen &&
		    ptr->offset >= ptr2->offset &&
		    ptr->offset <  ptr2->offset + le16_to_cpu(v->sectors))
			return i;
	}

	return -1;
}
static int extent_matches_stripe(struct bch_fs *c,
				 struct bch_stripe *v,
				 struct bkey_s_c k)
{
	struct bkey_s_c_extent e;
	const struct bch_extent_ptr *ptr;
	int idx;

	if (!bkey_extent_is_data(k.k))
		return -1;

	e = bkey_s_c_to_extent(k);

	extent_for_each_ptr(e, ptr) {
		idx = ptr_matches_stripe(c, v, ptr);
		if (idx >= 0)
			return idx;
	}

	return -1;
}
static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
	struct bkey_s_c_extent e;
	const union bch_extent_entry *entry;

	if (!bkey_extent_is_data(k.k))
		return false;

	e = bkey_s_c_to_extent(k);

	extent_for_each_entry(e, entry)
		if (extent_entry_type(entry) ==
		    BCH_EXTENT_ENTRY_stripe_ptr &&
		    entry->stripe_ptr.idx == idx)
			return true;

	return false;
}
static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i_stripe *s,
			       struct open_buckets *blocks,
			       struct open_buckets *parity,
			       unsigned stripe_size)
{
	struct open_bucket *ob;
	unsigned i, u64s;

	bkey_stripe_init(&s->k_i);
	s->v.sectors			= cpu_to_le16(stripe_size);
	s->v.algorithm			= 0;
	s->v.nr_blocks			= parity->nr + blocks->nr;
	s->v.nr_redundant		= parity->nr;
	s->v.csum_granularity_bits	= ilog2(c->sb.encoded_extent_max);
	s->v.csum_type			= BCH_CSUM_CRC32C;
	s->v.pad			= 0;

	open_bucket_for_each(c, blocks, ob, i)
		s->v.ptrs[i]			= ob->ptr;

	open_bucket_for_each(c, parity, ob, i)
		s->v.ptrs[blocks->nr + i]	= ob->ptr;

	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
		BUG_ON(1 << s->v.csum_granularity_bits >=
		       le16_to_cpu(s->v.sectors) ||
		       s->v.csum_granularity_bits == U8_MAX);

		s->v.csum_granularity_bits++;
	}

	set_bkey_val_u64s(&s->k, u64s);
}
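/*
 * Example of the widening loop above: the stripe key must fit in
 * BKEY_VAL_U64s_MAX u64s, and every block carries one checksum per
 * csum_granularity chunk. With 256-sector blocks and an initial
 * granularity of 8 sectors that's 32 crc32c checksums per block; each
 * doubling of the granularity halves the checksum count (at the cost
 * of coarser error localization) until the key fits. (Numbers
 * illustrative, not defaults.)
 */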
static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csums_per_device = stripe_csums_per_device(v);
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
	unsigned i, j;

	if (!csum_bytes)
		return;

	BUG_ON(buf->offset);
	BUG_ON(buf->size != le16_to_cpu(v->sectors));

	for (i = 0; i < v->nr_blocks; i++) {
		for (j = 0; j < csums_per_device; j++) {
			unsigned offset = j << v->csum_granularity_bits;
			unsigned len = min(csum_granularity, buf->size - offset);

			struct bch_csum csum =
				bch2_checksum(NULL, v->csum_type,
					      null_nonce(),
					      buf->data[i] + (offset << 9),
					      len << 9);

			memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
		}
	}
}
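/*
 * Checksum layout: checksums are stored per (block, chunk) pair, so a
 * stripe carries nr_blocks * stripe_csums_per_device() of them, where
 * stripe_csums_per_device() is the block size in sectors divided by
 * the granularity, rounded up. The final chunk of a block may be
 * short, hence the min() on len above.
 */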
static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
	unsigned i;

	if (!csum_bytes)
		return;

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		if (!test_bit(i, buf->valid))
			continue;

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum csum;

			BUG_ON(offset & (csum_granularity - 1));
			BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
			       ((offset + len) & (csum_granularity - 1)));

			csum = bch2_checksum(NULL, v->csum_type,
					     null_nonce(),
					     buf->data[i] + ((offset - buf->offset) << 9),
					     len << 9);

			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
				__bcache_io_error(c,
					"checksum error while doing reconstruct read (%u:%u)",
					i, j);
				clear_bit(i, buf->valid);
				break;
			}

			offset += len;
		}
	}
}
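/*
 * A checksum mismatch doesn't fail the read here: it clears the
 * block's bit in buf->valid, which turns the corrupt block into just
 * another failed block for ec_do_recov() to rebuild from redundancy,
 * exactly as if the device read itself had errored.
 */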
/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = le16_to_cpu(v->sectors) << 9;

	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}
static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
{
	return nr - bitmap_weight(buf->valid, nr);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
	return __ec_nr_failed(buf, buf->key.v.nr_blocks);
}
static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = buf->size << 9;

	if (ec_nr_failed(buf) > v->nr_redundant) {
		__bcache_io_error(c,
			"error doing reconstruct read: unable to read enough blocks");
		return -1;
	}

	for (i = 0; i < nr_data; i++)
		if (!test_bit(i, buf->valid))
			failed[nr_failed++] = i;

	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);

	return 0;
}
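/*
 * Note that failed[] above collects failed *data* blocks only: this
 * is the reconstruct-read path, so a failed p/q block doesn't need to
 * be rebuilt, and raid_rec() is told about data-block failures alone.
 */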
/* IO: */

static void ec_block_endio(struct bio *bio)
{
	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref);
	closure_put(cl);
}
static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
			unsigned rw, unsigned idx, struct closure *cl)
{
	struct bch_stripe *v = &buf->key.v;
	unsigned offset = 0, bytes = buf->size << 9;
	struct bch_extent_ptr *ptr = &v->ptrs[idx];
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

	if (!bch2_dev_get_ioref(ca, rw)) {
		clear_bit(idx, buf->valid);
		return;
	}

	while (offset < bytes) {
		unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
					   DIV_ROUND_UP(bytes, PAGE_SIZE));
		unsigned b = min_t(size_t, bytes - offset,
				   nr_iovecs << PAGE_SHIFT);
		struct ec_bio *ec_bio;

		ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
						       &c->ec_bioset),
				      struct ec_bio, bio);

		ec_bio->ca	= ca;
		ec_bio->buf	= buf;
		ec_bio->idx	= idx;

		bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
		bio_set_op_attrs(&ec_bio->bio, rw, 0);

		ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
		ec_bio->bio.bi_end_io	= ec_block_endio;
		ec_bio->bio.bi_private	= cl;

		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

		closure_get(cl);
		percpu_ref_get(&ca->io_ref);

		submit_bio(&ec_bio->bio);

		offset += b;
	}

	percpu_ref_put(&ca->io_ref);
}
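/*
 * The loop above splits one stripe-block transfer into bios of at
 * most BIO_MAX_PAGES pages each: e.g., assuming 4k pages and
 * BIO_MAX_PAGES == 256, a 2MiB block goes out as two 1MiB bios. Each
 * bio holds its own io_ref and closure ref, dropped from
 * ec_block_endio(); the get/put pair around the loop covers the
 * submission path itself.
 */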
/* recovery read path: */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct ec_stripe_buf *buf;
	struct closure cl;
	struct bkey_s_c k;
	struct bch_stripe *v;
	u64 stripe_idx;
	unsigned offset, end;
	unsigned i, nr_data, csum_granularity;
	int ret = 0, idx;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.idx ||
	       rbio->pick.idx - 1 >= rbio->pick.ec_nr);

	stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx;

	buf = kzalloc(sizeof(*buf), GFP_NOIO);
	if (!buf)
		return -ENOMEM;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
				   POS(0, stripe_idx),
				   BTREE_ITER_SLOTS);
	k = bch2_btree_iter_peek_slot(iter);
	if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
		__bcache_io_error(c,
			"error doing reconstruct read: stripe not found");
		kfree(buf);
		return bch2_trans_exit(&trans) ?: -EIO;
	}

	bkey_reassemble(&buf->key.k_i, k);
	bch2_trans_exit(&trans);

	v = &buf->key.v;

	nr_data = v->nr_blocks - v->nr_redundant;

	idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
	BUG_ON(idx < 0);

	csum_granularity = 1U << v->csum_granularity_bits;

	offset	= rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
	end	= offset + bio_sectors(&rbio->bio);

	BUG_ON(end > le16_to_cpu(v->sectors));

	buf->offset	= round_down(offset, csum_granularity);
	buf->size	= min_t(unsigned, le16_to_cpu(v->sectors),
				round_up(end, csum_granularity)) - buf->offset;

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
		if (!buf->data[i]) {
			ret = -ENOMEM;
			goto err;
		}
	}

	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

		if (ptr_stale(ca, ptr)) {
			__bcache_io_error(c,
				"error doing reconstruct read: stale pointer");
			clear_bit(i, buf->valid);
			continue;
		}

		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
	}

	closure_sync(&cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
		__bcache_io_error(c,
			"error doing reconstruct read: unable to read enough blocks");
		ret = -EIO;
		goto err;
	}

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);
	if (ret)
		goto err;

	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[idx] + ((offset - buf->offset) << 9));
err:
	for (i = 0; i < v->nr_blocks; i++)
		kfree(buf->data[i]);
	kfree(buf);
	return ret;
}
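/*
 * Reconstruct-read sequence, in short: look up the stripe key the
 * extent pointer references, read every block of the stripe for the
 * (checksum-granularity-aligned) range covering the request, drop
 * blocks that are stale, errored, or fail checksum verification,
 * rebuild the missing data from redundancy, then copy the requested
 * range out of the rebuilt block into the original read bio.
 */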
/* stripe bucket accounting: */

static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
			return -ENOMEM;

		spin_lock(&c->ec_stripes_heap_lock);
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
			n.used = h->used;
			swap(*h, n);
		}
		spin_unlock(&c->ec_stripes_heap_lock);

		free_heap(&n);
	}

	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
		return -ENOMEM;

	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
	    !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
		return -ENOMEM;

	return 0;
}
static int ec_stripe_mem_alloc(struct bch_fs *c,
			       struct btree_iter *iter)
{
	size_t idx = iter->pos.offset;
	int ret = 0;

	if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT))
		return ret;

	bch2_trans_unlock(iter->trans);
	ret = -EINTR;

	if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
		return ret;

	return -ENOMEM;
}
static ssize_t stripe_idx_to_delete(struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;

	return h->used && h->data[0].blocks_nonempty == 0
		? h->data[0].idx : -1;
}
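/*
 * The stripes heap is ordered by blocks_nonempty, smallest first, so
 * h->data[0] is always a stripe with the fewest live blocks: if that
 * count is zero, no extents point into the stripe any more and it's
 * safe to delete.
 */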
static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
				      struct ec_stripe_heap_entry l,
				      struct ec_stripe_heap_entry r)
{
	return ((l.blocks_nonempty > r.blocks_nonempty) -
		(l.blocks_nonempty < r.blocks_nonempty));
}

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
						   size_t i)
{
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
}
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes[0], idx);

	BUG_ON(!m->alive);
	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}
void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	size_t i;

	if (m->alive) {
		heap_verify_backpointer(c, idx);

		h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

		i = m->heap_idx;
		heap_sift_up(h,   i, ec_stripes_heap_cmp,
			     ec_stripes_heap_set_backpointer);
		heap_sift_down(h, i, ec_stripes_heap_cmp,
			       ec_stripes_heap_set_backpointer);

		heap_verify_backpointer(c, idx);
	} else {
		bch2_stripes_heap_insert(c, m, idx);
	}

	if (stripe_idx_to_delete(c) >= 0 &&
	    !percpu_ref_is_dying(&c->writes))
		schedule_work(&c->ec_stripe_delete_work);
}
void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	heap_verify_backpointer(c, idx);

	m->alive = false;
	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
}
void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
	m->alive = true;

	heap_verify_backpointer(c, idx);
}
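/*
 * Invariant maintained by the three operations above: for every live
 * stripe m, h->data[m->heap_idx].idx points back at m, which is what
 * lets bch2_stripes_heap_update() sift an entry from its recorded
 * position in O(log n) instead of searching the heap for it.
 */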
/* stripe deletion */

static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
	return bch2_btree_delete_range(c, BTREE_ID_EC,
				       POS(0, idx),
				       POS(0, idx + 1),
				       NULL);
}
static void ec_stripe_delete_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, ec_stripe_delete_work);
	ssize_t idx;

	down_read(&c->gc_lock);
	mutex_lock(&c->ec_stripe_create_lock);

	while (1) {
		spin_lock(&c->ec_stripes_heap_lock);
		idx = stripe_idx_to_delete(c);
		spin_unlock(&c->ec_stripes_heap_lock);

		if (idx < 0)
			break;

		if (ec_stripe_delete(c, idx))
			break;
	}

	mutex_unlock(&c->ec_stripe_create_lock);
	up_read(&c->gc_lock);
}
/* stripe creation: */

static int ec_stripe_bkey_insert(struct bch_fs *c,
				 struct bkey_i_stripe *stripe)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	/* XXX: start pos hint */
	for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN,
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
			break;

		if (bkey_deleted(k.k))
			goto found_slot;
	}

	if (!ret)
		ret = -ENOSPC;
	goto err;
found_slot:
	ret = ec_stripe_mem_alloc(c, iter);
	if (ret)
		goto err;

	stripe->k.p = iter->pos;

	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i));

	ret = bch2_trans_commit(&trans, NULL, NULL,
				BTREE_INSERT_ATOMIC|
				BTREE_INSERT_NOFAIL);
err:
	if (ret == -EINTR)
		goto retry;
	bch2_trans_exit(&trans);

	return ret;
}
static void extent_stripe_ptr_add(struct bkey_s_extent e,
				  struct ec_stripe_buf *s,
				  struct bch_extent_ptr *ptr,
				  unsigned block)
{
	struct bch_extent_stripe_ptr *dst = (void *) ptr;
	union bch_extent_entry *end = extent_entry_last(e);

	memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
	e.k->u64s += sizeof(*dst) / sizeof(u64);

	*dst = (struct bch_extent_stripe_ptr) {
		.type	= 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block	= block,
		.idx	= s->key.k.p.offset,
	};
}
static int ec_stripe_update_ptrs(struct bch_fs *c,
				 struct ec_stripe_buf *s,
				 struct bkey *pos)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	struct bkey_s_extent e;
	struct bch_extent_ptr *ptr;
	BKEY_PADDED(k) tmp;
	int ret = 0, dev, idx;

	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
				   bkey_start_pos(pos),
				   BTREE_ITER_INTENT);

	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
		if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
			bch2_btree_iter_next(iter);
			continue;
		}

		idx = extent_matches_stripe(c, &s->key.v, k);
		if (idx < 0) {
			bch2_btree_iter_next(iter);
			continue;
		}

		bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));

		dev = s->key.v.ptrs[idx].dev;

		bkey_reassemble(&tmp.k, k);
		e = bkey_i_to_s_extent(&tmp.k);

		extent_for_each_ptr(e, ptr)
			if (ptr->dev != dev)
				ptr->cached = true;

		ptr = (void *) bch2_extent_has_device(e.c, dev);
		BUG_ON(!ptr);

		extent_stripe_ptr_add(e, s, ptr, idx);

		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k));

		ret = bch2_trans_commit(&trans, NULL, NULL,
					BTREE_INSERT_ATOMIC|
					BTREE_INSERT_NOFAIL|
					BTREE_INSERT_USE_RESERVE);
		if (ret == -EINTR)
			ret = 0;
		if (ret)
			break;
	}

	bch2_trans_exit(&trans);

	return ret;
}
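/*
 * For each extent written into the new stripe, the loop above
 * rewrites the key in place: pointers to other devices are marked
 * cached (the stripe now provides the redundancy that extra replicas
 * used to), and a stripe_ptr entry referencing this stripe's index is
 * spliced in at the pointer into the stripe's bucket.
 */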
/*
 * data buckets of new stripe all written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bkey_i *k;
	struct bch_stripe *v = &s->stripe.key.v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	struct closure cl;
	int ret;

	BUG_ON(s->h->s == s);

	closure_init_stack(&cl);

	if (s->err) {
		bch_err(c, "error creating stripe: error writing data buckets");
		goto err;
	}

	if (!percpu_ref_tryget(&c->writes))
		goto err;

	BUG_ON(bitmap_weight(s->blocks_allocated,
			     s->blocks.nr) != s->blocks.nr);

	ec_generate_ec(&s->stripe);

	ec_generate_checksums(&s->stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);

	closure_sync(&cl);

	for (i = nr_data; i < v->nr_blocks; i++)
		if (!test_bit(i, s->stripe.valid)) {
			bch_err(c, "error creating stripe: error writing redundancy buckets");
			goto err_put_writes;
		}

	mutex_lock(&c->ec_stripe_create_lock);

	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
	if (ret) {
		bch_err(c, "error creating stripe: error creating stripe key");
		goto err_unlock;
	}

	for_each_keylist_key(&s->keys, k) {
		ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
		if (ret)
			break;
	}

err_unlock:
	mutex_unlock(&c->ec_stripe_create_lock);
err_put_writes:
	percpu_ref_put(&c->writes);
err:
	open_bucket_for_each(c, &s->blocks, ob, i) {
		ob->ec = NULL;
		__bch2_open_bucket_put(c, ob);
	}

	bch2_open_buckets_put(c, &s->parity);

	bch2_keylist_free(&s->keys, s->inline_keys);

	mutex_lock(&s->h->lock);
	list_del(&s->list);
	mutex_unlock(&s->h->lock);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
}
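/*
 * ec_stripe_create() runs via ec_stripe_new_put() below, once the
 * last reference is dropped, i.e. once every data bucket in the
 * stripe has been written: generate p/q, checksum all the blocks,
 * write out the redundancy buckets, insert the stripe key, then point
 * the extents recorded in s->keys at it. Errors short-circuit to the
 * common cleanup path, which releases the open buckets either way.
 */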
static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = h->s;

	list_add(&s->list, &h->stripes);
	h->s = NULL;

	return s;
}

static void ec_stripe_new_put(struct ec_stripe_new *s)
{
	BUG_ON(atomic_read(&s->pin) <= 0);
	if (atomic_dec_and_test(&s->pin))
		ec_stripe_create(s);
}
/* have a full bucket - hand it off to be erasure coded: */
void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	if (ob->sectors_free)
		s->err = -1;

	ec_stripe_new_put(s);
}

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	s->err = -EIO;
}
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct bch_dev *ca;
	unsigned offset;

	if (!ob)
		return NULL;

	ca	= bch_dev_bkey_exists(c, ob->ptr.dev);
	offset	= ca->mi.bucket_size - ob->sectors_free;

	return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
}
void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
			     struct bpos pos, unsigned sectors)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	struct ec_stripe_new *ec;

	if (!ob)
		return;

	ec = ob->ec;
	mutex_lock(&ec->lock);

	if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
				 ARRAY_SIZE(ec->inline_keys),
				 BKEY_U64s)) {
		BUG();
	}

	bkey_init(&ec->keys.top->k);
	ec->keys.top->k.p = pos;
	bch2_key_resize(&ec->keys.top->k, sectors);
	bch2_keylist_push(&ec->keys);

	mutex_unlock(&ec->lock);
}
static int unsigned_cmp(const void *_l, const void *_r)
{
	unsigned l = *((const unsigned *) _l);
	unsigned r = *((const unsigned *) _r);

	return cmp_int(l, r);
}
/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
{
	struct bch_dev *ca;
	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	struct {
		unsigned nr, size;
	} cur = { 0, 0 }, best = { 0, 0 };

	for_each_member_device_rcu(ca, c, i, devs)
		sizes[nr++] = ca->mi.bucket_size;

	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

	for (i = 0; i < nr; i++) {
		if (sizes[i] != cur.size) {
			if (cur.nr > best.nr)
				best = cur;

			cur.nr = 0;
			cur.size = sizes[i];
		}

		cur.nr++;
	}

	if (cur.nr > best.nr)
		best = cur;

	return best.size;
}
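/*
 * Example: member bucket sizes { 128, 256, 256, 512 } give runs of
 * length 1, 2, 1 after the sort, so 256 wins. Erasure coding needs
 * equal-sized blocks, so devices whose bucket size differs from the
 * winner simply sit out (see nr_active_devs below); picking the most
 * common size idles the fewest devices.
 */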
int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;
	unsigned i;

	BUG_ON(h->parity.nr != h->redundancy);
	BUG_ON(!h->blocks.nr);
	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	mutex_init(&s->lock);
	atomic_set(&s->pin, 1);
	s->c		= c;
	s->h		= h;
	s->blocks	= h->blocks;
	s->parity	= h->parity;

	memset(&h->blocks, 0, sizeof(h->blocks));
	memset(&h->parity, 0, sizeof(h->parity));

	bch2_keylist_init(&s->keys, s->inline_keys);

	s->stripe.offset	= 0;
	s->stripe.size		= h->blocksize;
	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));

	ec_stripe_key_init(c, &s->stripe.key,
			   &s->blocks, &s->parity,
			   h->blocksize);

	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
		if (!s->stripe.data[i])
			goto err;
	}

	h->s = s;

	return 0;
err:
	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
		kvpfree(s->stripe.data[i], s->stripe.size << 9);
	kfree(s);
	return -ENOMEM;
}
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy)
{
	struct ec_stripe_head *h;
	struct bch_dev *ca;
	unsigned i;

	h = kzalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	mutex_lock(&h->lock);
	INIT_LIST_HEAD(&h->stripes);

	h->target	= target;
	h->algo		= algo;
	h->redundancy	= redundancy;

	rcu_read_lock();
	h->devs = target_rw_devs(c, BCH_DATA_USER, target);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(i, h->devs.d);

	h->blocksize = pick_blocksize(c, &h->devs);

	for_each_member_device_rcu(ca, c, i, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();
	list_add(&h->list, &c->ec_new_stripe_list);
	return h;
}
void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = NULL;

	if (h->s &&
	    bitmap_weight(h->s->blocks_allocated,
			  h->s->blocks.nr) == h->s->blocks.nr)
		s = ec_stripe_set_pending(h);

	mutex_unlock(&h->lock);

	if (s)
		ec_stripe_new_put(s);
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
					       unsigned target,
					       unsigned algo,
					       unsigned redundancy)
{
	struct ec_stripe_head *h;

	if (!redundancy)
		return NULL;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list)
		if (h->target == target &&
		    h->algo == algo &&
		    h->redundancy == redundancy) {
			mutex_lock(&h->lock);
			goto found;
		}

	h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
found:
	mutex_unlock(&c->ec_new_stripe_lock);
	return h;
}
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_new_stripe_lock);
	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
		struct ec_stripe_new *s = NULL;

		mutex_lock(&h->lock);
		bch2_open_buckets_stop_dev(c, ca,
					   &h->blocks, BCH_DATA_USER);
		bch2_open_buckets_stop_dev(c, ca,
					   &h->parity, BCH_DATA_USER);

		if (!h->s)
			goto unlock;

		open_bucket_for_each(c, &h->s->blocks, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		open_bucket_for_each(c, &h->s->parity, ob, i)
			if (ob->ptr.dev == ca->dev_idx)
				goto found;
		goto unlock;
found:
		h->s->err = -1;
		s = ec_stripe_set_pending(h);
unlock:
		mutex_unlock(&h->lock);

		if (s)
			ec_stripe_new_put(s);
	}
	mutex_unlock(&c->ec_new_stripe_lock);
}
static int __bch2_stripe_write_key(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct stripe *m,
				   size_t idx,
				   struct bkey_i_stripe *new_key,
				   unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	unsigned i;
	int ret;

	bch2_btree_iter_set_pos(iter, POS(0, idx));

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

	if (k.k->type != KEY_TYPE_stripe)
		return -EIO;

	bkey_reassemble(&new_key->k_i, k);

	spin_lock(&c->ec_stripes_heap_lock);

	for (i = 0; i < new_key->v.nr_blocks; i++)
		stripe_blockcount_set(&new_key->v, i,
				      m->block_sectors[i]);
	m->dirty = false;

	spin_unlock(&c->ec_stripes_heap_lock);

	bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i));

	return bch2_trans_commit(trans, NULL, NULL,
				 BTREE_INSERT_NOFAIL|flags);
}
int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct genradix_iter giter;
	struct bkey_i_stripe *new_key;
	struct stripe *m;
	int ret = 0;

	new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
	BUG_ON(!new_key);

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	genradix_for_each(&c->stripes[0], giter, m) {
		if (!m->dirty)
			continue;

		ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos,
					      new_key, flags);
		if (ret)
			break;

		*wrote = true;
	}

	bch2_trans_exit(&trans);

	kfree(new_key);

	return ret;
}
int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
	struct btree_trans trans;
	struct btree_iter *btree_iter;
	struct journal_iter journal_iter;
	struct bkey_s_c btree_k, journal_k, k;
	int ret = 0;

	ret = bch2_fs_ec_start(c);
	if (ret)
		return ret;

	bch2_trans_init(&trans, c, 0, 0);

	btree_iter	= bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0);
	journal_iter	= bch2_journal_iter_init(journal_keys, BTREE_ID_EC);

	btree_k		= bch2_btree_iter_peek(btree_iter);
	journal_k	= bch2_journal_iter_peek(&journal_iter);

	/* merge btree and journal keys; the journal wins on ties: */
	while (1) {
		if (btree_k.k && journal_k.k) {
			int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);

			if (cmp < 0) {
				k = btree_k;
				btree_k = bch2_btree_iter_next(btree_iter);
			} else if (cmp == 0) {
				btree_k = bch2_btree_iter_next(btree_iter);
				k = journal_k;
				journal_k = bch2_journal_iter_next(&journal_iter);
			} else {
				k = journal_k;
				journal_k = bch2_journal_iter_next(&journal_iter);
			}
		} else if (btree_k.k) {
			k = btree_k;
			btree_k = bch2_btree_iter_next(btree_iter);
		} else if (journal_k.k) {
			k = journal_k;
			journal_k = bch2_journal_iter_next(&journal_iter);
		} else {
			break;
		}

		bch2_mark_key(c, k, 0, NULL, 0,
			      BCH_BUCKET_MARK_ALLOC_READ|
			      BCH_BUCKET_MARK_NOATOMIC);
	}

	ret = bch2_trans_exit(&trans) ?: ret;
	if (ret)
		bch_err(c, "error reading stripes: %i", ret);

	return ret;
}
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	size_t i, idx = 0;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);

	k = bch2_btree_iter_prev(iter);
	if (!IS_ERR_OR_NULL(k.k))
		idx = k.k->p.offset + 1;
	ret = bch2_trans_exit(&trans);
	if (ret)
		return ret;

	if (!gc &&
	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
		       GFP_KERNEL))
		return -ENOMEM;
#if 0
	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
#else
	for (i = 0; i < idx; i++)
		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
			return -ENOMEM;
#endif
	return 0;
}
int bch2_fs_ec_start(struct bch_fs *c)
{
	return bch2_ec_mem_alloc(c, false);
}
void bch2_fs_ec_exit(struct bch_fs *c)
{
	struct ec_stripe_head *h;

	while (1) {
		mutex_lock(&c->ec_new_stripe_lock);
		h = list_first_entry_or_null(&c->ec_new_stripe_list,
					     struct ec_stripe_head, list);
		if (h)
			list_del(&h->list);
		mutex_unlock(&c->ec_new_stripe_lock);
		if (!h)
			break;

		BUG_ON(h->s);
		BUG_ON(!list_empty(&h->stripes));

		kfree(h);
	}

	free_heap(&c->ec_stripes_heap);
	genradix_free(&c->stripes[0]);
	bioset_exit(&c->ec_bioset);
}
int bch2_fs_ec_init(struct bch_fs *c)
{
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);

	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}