// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_on_stack.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io.h"
#include "keylist.h"
#include "recovery.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

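/*
 * Recover a single failed block using RAID5 (xor) parity: treating
 * parity as just another data block, the missing block is the xor of
 * all the surviving ones.
 */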
static void raid5_recov(unsigned disks, unsigned failed_idx,
                        size_t size, void **data)
{
        unsigned i = 2, nr;

        BUG_ON(failed_idx >= disks);

        swap(data[0], data[failed_idx]);
        memcpy(data[0], data[1], size);

        while (i < disks) {
                nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
                xor_blocks(nr, size, data[0], data + i);
                i += nr;
        }

        swap(data[0], data[failed_idx]);
}

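/*
 * Generate redundancy: with one parity block, p is the xor of the data
 * blocks; with two, the raid6 code generates both the p and q
 * syndromes. More than two redundant blocks are unsupported.
 */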
static void raid_gen(int nd, int np, size_t size, void **v)
{
        if (np >= 1)
                raid5_recov(nd + np, nd, size, v);
        if (np >= 2)
                raid6_call.gen_syndrome(nd + np, size, v);
        BUG_ON(np > 2);
}

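/*
 * Reconstruct nr failed blocks, whose indices are given in ascending
 * order in ir[]; blocks with index >= nd are the p and q parity blocks:
 */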
static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
        switch (nr) {
        case 0:
                break;
        case 1:
                if (ir[0] < nd + 1)
                        raid5_recov(nd + 1, ir[0], size, v);
                else
                        raid6_call.gen_syndrome(nd + np, size, v);
                break;
        case 2:
                if (ir[1] < nd) {
                        /* data+data failure. */
                        raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
                } else if (ir[0] < nd) {
                        /* data + p/q failure */

                        if (ir[1] == nd) /* data + p failure */
                                raid6_datap_recov(nd + np, size, ir[0], v);
                        else { /* data + q failure */
                                raid5_recov(nd + 1, ir[0], size, v);
                                raid6_call.gen_syndrome(nd + np, size, v);
                        }
                } else {
                        raid_gen(nd, np, size, v);
                }
                break;
        default:
                BUG();
        }
}

#else

#include <raid/raid.h>

#endif

struct ec_bio {
        struct bch_dev          *ca;
        struct ec_stripe_buf    *buf;
        size_t                  idx;
        struct bio              bio;
};

/* Stripes btree keys: */

const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

        if (k.k->p.inode)
                return "invalid stripe key";

        if (bkey_val_bytes(k.k) < sizeof(*s) ||
            bkey_val_u64s(k.k) < stripe_val_u64s(s))
                return "incorrect value size";

        return bch2_bkey_ptrs_invalid(c, k);
}

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                         struct bkey_s_c k)
{
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned i;

        pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
               s->algorithm,
               le16_to_cpu(s->sectors),
               s->nr_blocks - s->nr_redundant,
               s->nr_redundant,
               s->csum_type,
               1U << s->csum_granularity_bits);

        for (i = 0; i < s->nr_blocks; i++)
                pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
                       (u64) s->ptrs[i].offset,
                       stripe_blockcount_get(s, i));
}

static int ptr_matches_stripe(struct bch_fs *c,
                              struct bch_stripe *v,
                              const struct bch_extent_ptr *ptr)
{
        unsigned i;

        for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
                const struct bch_extent_ptr *ptr2 = v->ptrs + i;

                if (ptr->dev == ptr2->dev &&
                    ptr->gen == ptr2->gen &&
                    ptr->offset >= ptr2->offset &&
                    ptr->offset <  ptr2->offset + le16_to_cpu(v->sectors))
                        return i;
        }

        return -1;
}

static int extent_matches_stripe(struct bch_fs *c,
                                 struct bch_stripe *v,
                                 struct bkey_s_c k)
{
        switch (k.k->type) {
        case KEY_TYPE_extent: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const struct bch_extent_ptr *ptr;
                int idx;

                extent_for_each_ptr(e, ptr) {
                        idx = ptr_matches_stripe(c, v, ptr);
                        if (idx >= 0)
                                return idx;
                }
                break;
        }
        }

        return -1;
}

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
        switch (k.k->type) {
        case KEY_TYPE_extent: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const union bch_extent_entry *entry;

                extent_for_each_entry(e, entry)
                        if (extent_entry_type(entry) ==
                            BCH_EXTENT_ENTRY_stripe_ptr &&
                            entry->stripe_ptr.idx == idx)
                                return true;

                break;
        }
        }

        return false;
}

/* Checksumming: */

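/*
 * Checksums cover each block of a stripe in csum_granularity sized
 * chunks, so a reconstruct read only has to checksum the chunks it
 * actually touches:
 */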
static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
        unsigned csums_per_device = stripe_csums_per_device(v);
        unsigned csum_bytes = bch_crc_bytes[v->csum_type];
        unsigned i, j;

        if (!csum_bytes)
                return;

        BUG_ON(buf->offset);
        BUG_ON(buf->size != le16_to_cpu(v->sectors));

        for (i = 0; i < v->nr_blocks; i++) {
                for (j = 0; j < csums_per_device; j++) {
                        unsigned offset = j << v->csum_granularity_bits;
                        unsigned len = min(csum_granularity, buf->size - offset);

                        struct bch_csum csum =
                                bch2_checksum(NULL, v->csum_type,
                                              null_nonce(),
                                              buf->data[i] + (offset << 9),
                                              len << 9);

                        memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
                }
        }
}

static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
        unsigned csum_bytes = bch_crc_bytes[v->csum_type];
        unsigned i;

        if (!csum_bytes)
                return;

        for (i = 0; i < v->nr_blocks; i++) {
                unsigned offset = buf->offset;
                unsigned end = buf->offset + buf->size;

                if (!test_bit(i, buf->valid))
                        continue;

                while (offset < end) {
                        unsigned j = offset >> v->csum_granularity_bits;
                        unsigned len = min(csum_granularity, end - offset);
                        struct bch_csum csum;

                        BUG_ON(offset & (csum_granularity - 1));
                        BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
                               ((offset + len) & (csum_granularity - 1)));

                        csum = bch2_checksum(NULL, v->csum_type,
                                             null_nonce(),
                                             buf->data[i] + ((offset - buf->offset) << 9),
                                             len << 9);

                        if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
                                __bcache_io_error(c,
                                        "checksum error while doing reconstruct read (%u:%u)",
                                        i, j);
                                clear_bit(i, buf->valid);
                                break;
                        }

                        offset += len;
                }
        }
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = le16_to_cpu(v->sectors) << 9;

        raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
{
        return nr - bitmap_weight(buf->valid, nr);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
        return __ec_nr_failed(buf, buf->key.v.nr_blocks);
}

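/*
 * Reconstruct failed data blocks: gather the indices of the blocks that
 * couldn't be read or failed checksum verification and hand them to the
 * raid recovery code. Fails if more blocks are missing than we have
 * redundancy to recover from.
 */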
static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = buf->size << 9;

        if (ec_nr_failed(buf) > v->nr_redundant) {
                __bcache_io_error(c,
                        "error doing reconstruct read: unable to read enough blocks");
                return -1;
        }

        for (i = 0; i < nr_data; i++)
                if (!test_bit(i, buf->valid))
                        failed[nr_failed++] = i;

        raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
        return 0;
}

/* IO: */

static void ec_block_endio(struct bio *bio)
{
        struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
        struct bch_dev *ca = ec_bio->ca;
        struct closure *cl = bio->bi_private;

        if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s",
                               bio_data_dir(bio) ? "write" : "read",
                               bch2_blk_status_to_str(bio->bi_status)))
                clear_bit(ec_bio->idx, ec_bio->buf->valid);

        bio_put(&ec_bio->bio);
        percpu_ref_put(&ca->io_ref);
        closure_put(cl);
}

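/*
 * Read or write one block of a stripe buffer, splitting the IO into
 * multiple bios as needed since a single bio holds at most
 * BIO_MAX_PAGES pages; on error, the endio handler clears the block's
 * bit in buf->valid.
 */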
static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
                        unsigned rw, unsigned idx, struct closure *cl)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned offset = 0, bytes = buf->size << 9;
        struct bch_extent_ptr *ptr = &v->ptrs[idx];
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant
                ? BCH_DATA_user
                : BCH_DATA_parity;

        if (!bch2_dev_get_ioref(ca, rw)) {
                clear_bit(idx, buf->valid);
                return;
        }

        this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);

        while (offset < bytes) {
                unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
                                           DIV_ROUND_UP(bytes, PAGE_SIZE));
                unsigned b = min_t(size_t, bytes - offset,
                                   nr_iovecs << PAGE_SHIFT);
                struct ec_bio *ec_bio;

                ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
                                                       &c->ec_bioset),
                                      struct ec_bio, bio);

                ec_bio->ca                      = ca;
                ec_bio->buf                     = buf;
                ec_bio->idx                     = idx;

                bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
                bio_set_op_attrs(&ec_bio->bio, rw, 0);

                ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
                ec_bio->bio.bi_end_io           = ec_block_endio;
                ec_bio->bio.bi_private          = cl;

                bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

                closure_get(cl);
                percpu_ref_get(&ca->io_ref);

                submit_bio(&ec_bio->bio);

                offset += b;
        }

        percpu_ref_put(&ca->io_ref);
}

/* recovery read path: */
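/*
 * Called when a normal read of replicated data fails: read the
 * remaining blocks of the stripe the extent belongs to, verify
 * checksums, reconstruct the missing data and copy the requested range
 * into the original read bio.
 */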
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct ec_stripe_buf *buf;
        struct closure cl;
        struct bkey_s_c k;
        struct bch_stripe *v;
        unsigned stripe_idx;
        unsigned offset, end;
        unsigned i, nr_data, csum_granularity;
        int ret = 0, idx;

        closure_init_stack(&cl);

        BUG_ON(!rbio->pick.has_ec);

        stripe_idx = rbio->pick.ec.idx;

        buf = kzalloc(sizeof(*buf), GFP_NOIO);
        if (!buf)
                return -ENOMEM;

        bch2_trans_init(&trans, c, 0, 0);

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
                                   POS(0, stripe_idx),
                                   BTREE_ITER_SLOTS);
        k = bch2_btree_iter_peek_slot(iter);
        if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
                __bcache_io_error(c,
                        "error doing reconstruct read: stripe not found");
                kfree(buf);
                return bch2_trans_exit(&trans) ?: -EIO;
        }

        bkey_reassemble(&buf->key.k_i, k);
        bch2_trans_exit(&trans);

        v = &buf->key.v;

        nr_data = v->nr_blocks - v->nr_redundant;

        idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
        BUG_ON(idx < 0);

        csum_granularity = 1U << v->csum_granularity_bits;

        offset  = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
        end     = offset + bio_sectors(&rbio->bio);

        BUG_ON(end > le16_to_cpu(v->sectors));

        buf->offset     = round_down(offset, csum_granularity);
        buf->size       = min_t(unsigned, le16_to_cpu(v->sectors),
                                round_up(end, csum_granularity)) - buf->offset;

        for (i = 0; i < v->nr_blocks; i++) {
                buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
                if (!buf->data[i]) {
                        ret = -ENOMEM;
                        goto err;
                }
        }

        memset(buf->valid, 0xFF, sizeof(buf->valid));

        for (i = 0; i < v->nr_blocks; i++) {
                struct bch_extent_ptr *ptr = v->ptrs + i;
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

                if (ptr_stale(ca, ptr)) {
                        __bcache_io_error(c,
                                          "error doing reconstruct read: stale pointer");
                        clear_bit(i, buf->valid);
                        continue;
                }

                ec_block_io(c, buf, REQ_OP_READ, i, &cl);
        }

        closure_sync(&cl);

        if (ec_nr_failed(buf) > v->nr_redundant) {
                __bcache_io_error(c,
                        "error doing reconstruct read: unable to read enough blocks");
                ret = -EIO;
                goto err;
        }

        ec_validate_checksums(c, buf);

        ret = ec_do_recov(c, buf);
        if (ret)
                goto err;

        memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
                      buf->data[idx] + ((offset - buf->offset) << 9));
err:
        for (i = 0; i < v->nr_blocks; i++)
                kfree(buf->data[i]);
        kfree(buf);
        return ret;
}

/* stripe bucket accounting: */

static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
        ec_stripes_heap n, *h = &c->ec_stripes_heap;

        if (idx >= h->size) {
                if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
                        return -ENOMEM;

                spin_lock(&c->ec_stripes_heap_lock);
                if (n.size > h->size) {
                        memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
                        n.used = h->used;
                        swap(*h, n);
                }
                spin_unlock(&c->ec_stripes_heap_lock);

                free_heap(&n);
        }

        if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
                return -ENOMEM;

        if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
            !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
                return -ENOMEM;

        return 0;
}

static int ec_stripe_mem_alloc(struct bch_fs *c,
                               struct btree_iter *iter)
{
        size_t idx = iter->pos.offset;
        int ret = 0;

        if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
                return ret;

        bch2_trans_unlock(iter->trans);
        ret = -EINTR;

        if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
                return ret;

        return -ENOMEM;
}

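/*
 * The stripes heap is ordered by the number of blocks that still
 * contain live data, so stripes whose blocks have all been emptied sort
 * to the front, where the deletion path can find them cheaply:
 */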
static ssize_t stripe_idx_to_delete(struct bch_fs *c)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;

        return h->used && h->data[0].blocks_nonempty == 0
                ? h->data[0].idx : -1;
}

static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
                                      struct ec_stripe_heap_entry l,
                                      struct ec_stripe_heap_entry r)
{
        return ((l.blocks_nonempty > r.blocks_nonempty) -
                (l.blocks_nonempty < r.blocks_nonempty));
}

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
                                                   size_t i)
{
        struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

        genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m = genradix_ptr(&c->stripes[0], idx);

        BUG_ON(!m->alive);
        BUG_ON(m->heap_idx >= h->used);
        BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_del(struct bch_fs *c,
                           struct stripe *m, size_t idx)
{
        if (!m->on_heap)
                return;

        m->on_heap = false;

        heap_verify_backpointer(c, idx);

        heap_del(&c->ec_stripes_heap, m->heap_idx,
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
                              struct stripe *m, size_t idx)
{
        if (m->on_heap)
                return;

        BUG_ON(heap_full(&c->ec_stripes_heap));

        m->on_heap = true;

        heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
                        .idx = idx,
                        .blocks_nonempty = m->blocks_nonempty,
                }),
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);

        heap_verify_backpointer(c, idx);
}

void bch2_stripes_heap_update(struct bch_fs *c,
                              struct stripe *m, size_t idx)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        size_t i;

        if (!m->on_heap)
                return;

        heap_verify_backpointer(c, idx);

        h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

        i = m->heap_idx;
        heap_sift_up(h,   i, ec_stripes_heap_cmp,
                     ec_stripes_heap_set_backpointer);
        heap_sift_down(h, i, ec_stripes_heap_cmp,
                       ec_stripes_heap_set_backpointer);

        heap_verify_backpointer(c, idx);

        if (stripe_idx_to_delete(c) >= 0 &&
            !percpu_ref_is_dying(&c->writes))
                schedule_work(&c->ec_stripe_delete_work);
}

/* stripe deletion */

static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
        //pr_info("deleting stripe %zu", idx);
        return bch2_btree_delete_range(c, BTREE_ID_EC,
                                       POS(0, idx),
                                       POS(0, idx + 1),
                                       NULL);
}

static void ec_stripe_delete_work(struct work_struct *work)
{
        struct bch_fs *c =
                container_of(work, struct bch_fs, ec_stripe_delete_work);
        ssize_t idx;

        while (1) {
                spin_lock(&c->ec_stripes_heap_lock);
                idx = stripe_idx_to_delete(c);
                if (idx < 0) {
                        spin_unlock(&c->ec_stripes_heap_lock);
                        break;
                }

                bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx);
                spin_unlock(&c->ec_stripes_heap_lock);

                if (ec_stripe_delete(c, idx))
                        break;
        }
}

/* stripe creation: */

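/*
 * Pick a slot in the stripes btree for a new stripe: scan forward from
 * c->ec_stripe_hint for an empty slot, wrapping around to the start of
 * the btree once, and insert the new stripe key there.
 */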
static int ec_stripe_bkey_insert(struct bch_fs *c,
                                 struct ec_stripe_new *s,
                                 struct bkey_i_stripe *stripe)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bpos start_pos = POS(0, c->ec_stripe_hint);
        int ret;

        bch2_trans_init(&trans, c, 0, 0);
retry:
        bch2_trans_begin(&trans);

        for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos,
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
                if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
                        if (start_pos.offset) {
                                start_pos = POS_MIN;
                                bch2_btree_iter_set_pos(iter, start_pos);
                                continue;
                        }

                        ret = -ENOSPC;
                        break;
                }

                if (bkey_deleted(k.k))
                        goto found_slot;
        }

        goto err;
found_slot:
        start_pos = iter->pos;

        ret = ec_stripe_mem_alloc(c, iter);
        if (ret)
                goto err;

        stripe->k.p = iter->pos;

        bch2_trans_update(&trans, iter, &stripe->k_i, 0);

        ret = bch2_trans_commit(&trans, &s->res, NULL,
                                BTREE_INSERT_NOFAIL);
err:
        bch2_trans_iter_put(&trans, iter);

        if (ret == -EINTR)
                goto retry;

        c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1;
        bch2_trans_exit(&trans);

        return ret;
}

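/*
 * Splice a stripe pointer entry into an extent, immediately before the
 * device pointer it annotates, shifting the following entries up:
 */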
static void extent_stripe_ptr_add(struct bkey_s_extent e,
                                  struct ec_stripe_buf *s,
                                  struct bch_extent_ptr *ptr,
                                  unsigned block)
{
        struct bch_extent_stripe_ptr *dst = (void *) ptr;
        union bch_extent_entry *end = extent_entry_last(e);

        memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
        e.k->u64s += sizeof(*dst) / sizeof(u64);

        *dst = (struct bch_extent_stripe_ptr) {
                .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
                .block          = block,
                .idx            = s->key.k.p.offset,
        };
}

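/*
 * Once the stripe has been created, rewrite the extents that were
 * written to its data buckets: drop the extra replicas and tag the
 * remaining pointer with a stripe pointer entry referencing the new
 * stripe.
 */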
static int ec_stripe_update_ptrs(struct bch_fs *c,
                                 struct ec_stripe_buf *s,
                                 struct bkey *pos)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bkey_s_extent e;
        struct bkey_on_stack sk;
        int ret = 0, dev, idx;

        bkey_on_stack_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);

        /* XXX this doesn't support the reflink btree */

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
                                   bkey_start_pos(pos),
                                   BTREE_ITER_INTENT);

        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
                struct bch_extent_ptr *ptr, *ec_ptr = NULL;

                if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
                        bch2_btree_iter_next(iter);
                        continue;
                }

                idx = extent_matches_stripe(c, &s->key.v, k);
                if (idx < 0) {
                        bch2_btree_iter_next(iter);
                        continue;
                }

                dev = s->key.v.ptrs[idx].dev;

                bkey_on_stack_reassemble(&sk, c, k);
                e = bkey_i_to_s_extent(sk.k);

                bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
                ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
                BUG_ON(!ec_ptr);

                extent_stripe_ptr_add(e, s, ec_ptr, idx);

                bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
                bch2_trans_update(&trans, iter, sk.k, 0);

                ret = bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_USE_RESERVE);
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
                        break;
        }

        bch2_trans_exit(&trans);
        bkey_on_stack_exit(&sk, c);

        return ret;
}

/*
 * data buckets of new stripe all written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
        struct bch_fs *c = s->c;
        struct open_bucket *ob;
        struct bkey_i *k;
        struct stripe *m;
        struct bch_stripe *v = &s->stripe.key.v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        struct closure cl;
        int ret;

        BUG_ON(s->h->s == s);

        closure_init_stack(&cl);

        if (s->err) {
                if (s->err != -EROFS)
                        bch_err(c, "error creating stripe: error writing data buckets");
                goto err;
        }

        BUG_ON(!s->allocated);

        if (!percpu_ref_tryget(&c->writes))
                goto err;

        BUG_ON(bitmap_weight(s->blocks_allocated,
                             s->blocks.nr) != s->blocks.nr);

        ec_generate_ec(&s->stripe);

        ec_generate_checksums(&s->stripe);

        /* write p/q: */
        for (i = nr_data; i < v->nr_blocks; i++)
                ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);

        closure_sync(&cl);

        for (i = nr_data; i < v->nr_blocks; i++)
                if (!test_bit(i, s->stripe.valid)) {
                        bch_err(c, "error creating stripe: error writing redundancy buckets");
                        goto err_put_writes;
                }

        ret = s->existing_stripe
                ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
                                    &s->res, NULL, BTREE_INSERT_NOFAIL)
                : ec_stripe_bkey_insert(c, s, &s->stripe.key);
        if (ret) {
                bch_err(c, "error creating stripe: error creating stripe key");
                goto err_put_writes;
        }

        for_each_keylist_key(&s->keys, k) {
                ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
                if (ret) {
                        bch_err(c, "error creating stripe: error updating pointers");
                        break;
                }
        }

        spin_lock(&c->ec_stripes_heap_lock);
        m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset);
#if 0
        pr_info("created a %s stripe %llu",
                s->existing_stripe ? "existing" : "new",
                s->stripe.key.k.p.offset);
#endif
        BUG_ON(m->on_heap);
        bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset);
        spin_unlock(&c->ec_stripes_heap_lock);
err_put_writes:
        percpu_ref_put(&c->writes);
err:
        bch2_disk_reservation_put(c, &s->res);

        open_bucket_for_each(c, &s->blocks, ob, i) {
                ob->ec = NULL;
                __bch2_open_bucket_put(c, ob);
        }

        bch2_open_buckets_put(c, &s->parity);

        bch2_keylist_free(&s->keys, s->inline_keys);

        for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
                kvpfree(s->stripe.data[i], s->stripe.size << 9);
        kfree(s);
}

static void ec_stripe_create_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(work,
                struct bch_fs, ec_stripe_create_work);
        struct ec_stripe_new *s, *n;
restart:
        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list)
                if (!atomic_read(&s->pin)) {
                        list_del(&s->list);
                        mutex_unlock(&c->ec_stripe_new_lock);
                        ec_stripe_create(s);
                        goto restart;
                }
        mutex_unlock(&c->ec_stripe_new_lock);
}

static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s)
{
        BUG_ON(atomic_read(&s->pin) <= 0);

        if (atomic_dec_and_test(&s->pin)) {
                BUG_ON(!s->pending);
                queue_work(system_long_wq, &c->ec_stripe_create_work);
        }
}

static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct ec_stripe_new *s = h->s;

        BUG_ON(!s->allocated && !s->err);

        h->s            = NULL;
        s->pending      = true;

        mutex_lock(&c->ec_stripe_new_lock);
        list_add(&s->list, &c->ec_stripe_new_list);
        mutex_unlock(&c->ec_stripe_new_lock);

        ec_stripe_new_put(c, s);
}

/* have a full bucket - hand it off to be erasure coded: */
void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
{
        struct ec_stripe_new *s = ob->ec;

        if (ob->sectors_free)
                s->err = -1;

        ec_stripe_new_put(c, s);
}

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
        struct ec_stripe_new *s = ob->ec;

        s->err = -EIO;
}

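/*
 * Writes to buckets belonging to a stripe under construction are also
 * buffered in memory for parity generation; return the position in the
 * stripe buffer corresponding to the write point's current offset:
 */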
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
        struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
        struct bch_dev *ca;
        unsigned offset;

        if (!ob)
                return NULL;

        ca      = bch_dev_bkey_exists(c, ob->ptr.dev);
        offset  = ca->mi.bucket_size - ob->sectors_free;

        return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
}

void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
                             struct bpos pos, unsigned sectors)
{
        struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
        struct ec_stripe_new *ec;

        if (!ob)
                return;

        //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);

        ec = ob->ec;
        mutex_lock(&ec->lock);

        if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
                                 ARRAY_SIZE(ec->inline_keys),
                                 BKEY_U64s)) {
                BUG();
        }

        bkey_init(&ec->keys.top->k);
        ec->keys.top->k.p       = pos;
        bch2_key_resize(&ec->keys.top->k, sectors);
        bch2_keylist_push(&ec->keys);

        mutex_unlock(&ec->lock);
}

static int unsigned_cmp(const void *_l, const void *_r)
{
        unsigned l = *((const unsigned *) _l);
        unsigned r = *((const unsigned *) _r);

        return cmp_int(l, r);
}

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
                               struct bch_devs_mask *devs)
{
        struct bch_dev *ca;
        unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
        struct {
                unsigned nr, size;
        } cur = { 0, 0 }, best = { 0, 0 };

        for_each_member_device_rcu(ca, c, i, devs)
                sizes[nr++] = ca->mi.bucket_size;

        sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

        for (i = 0; i < nr; i++) {
                if (sizes[i] != cur.size) {
                        if (cur.nr > best.nr)
                                best = cur;

                        cur.nr = 0;
                        cur.size = sizes[i];
                }

                cur.nr++;
        }

        if (cur.nr > best.nr)
                best = cur;

        return best.size;
}

static bool may_create_new_stripe(struct bch_fs *c)
{
        return false;
}

static void ec_stripe_key_init(struct bch_fs *c,
                               struct bkey_i_stripe *s,
                               unsigned nr_data,
                               unsigned nr_parity,
                               unsigned stripe_size)
{
        unsigned u64s;

        bkey_stripe_init(&s->k_i);
        s->v.sectors                    = cpu_to_le16(stripe_size);
        s->v.algorithm                  = 0;
        s->v.nr_blocks                  = nr_data + nr_parity;
        s->v.nr_redundant               = nr_parity;
        s->v.csum_granularity_bits      = ilog2(c->sb.encoded_extent_max);
        s->v.csum_type                  = BCH_CSUM_CRC32C;
        s->v.pad                        = 0;

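        /*
         * Larger checksum granularity means fewer checksums per block,
         * shrinking the key until it fits in a bkey:
         */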
        while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
                BUG_ON(1 << s->v.csum_granularity_bits >=
                       le16_to_cpu(s->v.sectors) ||
                       s->v.csum_granularity_bits == U8_MAX);
                s->v.csum_granularity_bits++;
        }

        set_bkey_val_u64s(&s->k, u64s);
}

static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct ec_stripe_new *s;
        unsigned i;

        lockdep_assert_held(&h->lock);

        s = kzalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -ENOMEM;

        mutex_init(&s->lock);
        atomic_set(&s->pin, 1);
        s->c            = c;
        s->h            = h;
        s->nr_data      = min_t(unsigned, h->nr_active_devs,
                                EC_STRIPE_MAX) - h->redundancy;
        s->nr_parity    = h->redundancy;

        bch2_keylist_init(&s->keys, s->inline_keys);

        s->stripe.offset        = 0;
        s->stripe.size          = h->blocksize;
        memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));

        ec_stripe_key_init(c, &s->stripe.key, s->nr_data,
                           s->nr_parity, h->blocksize);

        for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
                s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
                if (!s->stripe.data[i])
                        goto err;
        }

        h->s = s;

        return 0;
err:
        for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
                kvpfree(s->stripe.data[i], s->stripe.size << 9);
        kfree(s);
        return -ENOMEM;
}

static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
                         unsigned algo, unsigned redundancy)
{
        struct ec_stripe_head *h;
        struct bch_dev *ca;
        unsigned i;

        h = kzalloc(sizeof(*h), GFP_KERNEL);
        if (!h)
                return NULL;

        mutex_init(&h->lock);
        mutex_lock(&h->lock);

        h->target       = target;
        h->algo         = algo;
        h->redundancy   = redundancy;

        rcu_read_lock();
        h->devs = target_rw_devs(c, BCH_DATA_user, target);

        for_each_member_device_rcu(ca, c, i, &h->devs)
                if (!ca->mi.durability)
                        __clear_bit(i, h->devs.d);

        h->blocksize = pick_blocksize(c, &h->devs);

        for_each_member_device_rcu(ca, c, i, &h->devs)
                if (ca->mi.bucket_size == h->blocksize)
                        h->nr_active_devs++;

        rcu_read_unlock();
        list_add(&h->list, &c->ec_stripe_head_list);
        return h;
}

void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
        if (h->s &&
            h->s->allocated &&
            bitmap_weight(h->s->blocks_allocated,
                          h->s->blocks.nr) == h->s->blocks.nr)
                ec_stripe_set_pending(c, h);

        mutex_unlock(&h->lock);
}

struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
                                                 unsigned target,
                                                 unsigned algo,
                                                 unsigned redundancy)
{
        struct ec_stripe_head *h;

        if (!redundancy)
                return NULL;

        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list)
                if (h->target           == target &&
                    h->algo             == algo &&
                    h->redundancy       == redundancy) {
                        mutex_lock(&h->lock);
                        goto found;
                }

        h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
found:
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
}

/*
 * XXX: use a higher watermark for allocating open buckets here:
 */
static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct bch_devs_mask devs;
        struct open_bucket *ob;
        unsigned i, nr_have, nr_data =
                min_t(unsigned, h->nr_active_devs,
                      EC_STRIPE_MAX) - h->redundancy;
        bool have_cache = true;
        int ret = 0;

        devs = h->devs;

        for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) {
                __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d);
                --nr_data;
        }

        BUG_ON(h->s->blocks.nr > nr_data);
        BUG_ON(h->s->parity.nr > h->redundancy);

        open_bucket_for_each(c, &h->s->parity, ob, i)
                __clear_bit(ob->ptr.dev, devs.d);
        open_bucket_for_each(c, &h->s->blocks, ob, i)
                __clear_bit(ob->ptr.dev, devs.d);

        percpu_down_read(&c->mark_lock);
        rcu_read_lock();

        if (h->s->parity.nr < h->redundancy) {
                nr_have = h->s->parity.nr;

                ret = bch2_bucket_alloc_set(c, &h->s->parity,
                                            &h->parity_stripe,
                                            &devs,
                                            h->redundancy,
                                            &nr_have,
                                            &have_cache,
                                            RESERVE_NONE,
                                            0,
                                            NULL);
                if (ret)
                        goto err;
        }

        if (h->s->blocks.nr < nr_data) {
                nr_have = h->s->blocks.nr;

                ret = bch2_bucket_alloc_set(c, &h->s->blocks,
                                            &h->block_stripe,
                                            &devs,
                                            nr_data,
                                            &nr_have,
                                            &have_cache,
                                            RESERVE_NONE,
                                            0,
                                            NULL);
                if (ret)
                        goto err;
        }
err:
        rcu_read_unlock();
        percpu_up_read(&c->mark_lock);
        return ret;
}

/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
                               unsigned target,
                               unsigned algo,
                               unsigned redundancy)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m;
        size_t heap_idx;
        u64 stripe_idx;

        if (may_create_new_stripe(c))
                return -1;

        spin_lock(&c->ec_stripes_heap_lock);
        for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
                if (!h->data[heap_idx].blocks_nonempty)
                        continue;

                stripe_idx = h->data[heap_idx].idx;
                m = genradix_ptr(&c->stripes[0], stripe_idx);

                if (m->algorithm        == algo &&
                    m->nr_redundant     == redundancy &&
                    m->blocks_nonempty  < m->nr_blocks - m->nr_redundant) {
                        bch2_stripes_heap_del(c, m, stripe_idx);
                        spin_unlock(&c->ec_stripes_heap_lock);
                        return stripe_idx;
                }
        }

        spin_unlock(&c->ec_stripes_heap_lock);
        return -1;
}

static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (!ret)
                bkey_reassemble(&stripe->key.k_i, k);
        bch2_trans_exit(&trans);

        return ret;
}

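/*
 * Get a stripe head with a fully allocated stripe to write to: reuse a
 * partially empty existing stripe when one is available, otherwise
 * reserve space and allocate fresh data and parity buckets.
 */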
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                                               unsigned target,
                                               unsigned algo,
                                               unsigned redundancy)
{
        struct closure cl;
        struct ec_stripe_head *h;
        struct open_bucket *ob;
        unsigned i, data_idx = 0;
        s64 idx;
        int ret;

        closure_init_stack(&cl);

        h = __bch2_ec_stripe_head_get(c, target, algo, redundancy);
        if (!h)
                return NULL;

        if (!h->s && ec_new_stripe_alloc(c, h)) {
                bch2_ec_stripe_head_put(c, h);
                return NULL;
        }

        if (!h->s->allocated) {
                if (!h->s->existing_stripe &&
                    (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) {
                        //pr_info("got existing stripe %llu", idx);

                        h->s->existing_stripe = true;
                        h->s->existing_stripe_idx = idx;
                        if (get_stripe_key(c, idx, &h->s->stripe)) {
                                /* btree error */
                                BUG();
                        }

                        for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++)
                                if (stripe_blockcount_get(&h->s->stripe.key.v, i)) {
                                        __set_bit(i, h->s->blocks_allocated);
                                        ec_block_io(c, &h->s->stripe, READ, i, &cl);
                                }
                }

                if (!h->s->existing_stripe &&
                    !h->s->res.sectors) {
                        ret = bch2_disk_reservation_get(c, &h->s->res,
                                                        h->blocksize,
                                                        h->s->nr_parity, 0);
                        if (ret) {
                                /* What should we do here? */
                                bch_err(c, "unable to create new stripe: %i", ret);
                                bch2_ec_stripe_head_put(c, h);
                                h = NULL;
                                goto out;
                        }
                }

                if (new_stripe_alloc_buckets(c, h)) {
                        bch2_ec_stripe_head_put(c, h);
                        h = NULL;
                        goto out;
                }

                open_bucket_for_each(c, &h->s->blocks, ob, i) {
                        data_idx = find_next_zero_bit(h->s->blocks_allocated,
                                                      h->s->nr_data, data_idx);
                        BUG_ON(data_idx >= h->s->nr_data);

                        h->s->stripe.key.v.ptrs[data_idx] = ob->ptr;
                        h->s->data_block_idx[i] = data_idx;
                        data_idx++;
                }

                open_bucket_for_each(c, &h->s->parity, ob, i)
                        h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;

                //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
                h->s->allocated = true;
        }
out:
        closure_sync(&cl);
        return h;
}

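/*
 * A device is going read-only or being removed: abort any stripe being
 * built that has a block allocated on it:
 */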
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
        struct ec_stripe_head *h;
        struct open_bucket *ob;
        unsigned i;

        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
                mutex_lock(&h->lock);
                if (!h->s)
                        goto unlock;

                open_bucket_for_each(c, &h->s->blocks, ob, i)
                        if (ob->ptr.dev == ca->dev_idx)
                                goto found;
                open_bucket_for_each(c, &h->s->parity, ob, i)
                        if (ob->ptr.dev == ca->dev_idx)
                                goto found;
                goto unlock;
found:
                h->s->err = -EROFS;
                ec_stripe_set_pending(c, h);
unlock:
                mutex_unlock(&h->lock);
        }
        mutex_unlock(&c->ec_stripe_head_lock);
}

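/*
 * Flush a stripe's dirty in-memory per-block sector counts back to its
 * key in the stripes btree:
 */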
static int __bch2_stripe_write_key(struct btree_trans *trans,
                                   struct btree_iter *iter,
                                   struct stripe *m,
                                   size_t idx,
                                   struct bkey_i_stripe *new_key)
{
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
        unsigned i;
        int ret;

        bch2_btree_iter_set_pos(iter, POS(0, idx));

        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
                return ret;

        if (k.k->type != KEY_TYPE_stripe)
                return -EIO;

        bkey_reassemble(&new_key->k_i, k);

        spin_lock(&c->ec_stripes_heap_lock);

        for (i = 0; i < new_key->v.nr_blocks; i++)
                stripe_blockcount_set(&new_key->v, i,
                                      m->block_sectors[i]);
        m->dirty = false;

        spin_unlock(&c->ec_stripes_heap_lock);

        bch2_trans_update(trans, iter, &new_key->k_i, 0);
        return 0;
}

int bch2_stripes_write(struct bch_fs *c, unsigned flags)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct genradix_iter giter;
        struct bkey_i_stripe *new_key;
        struct stripe *m;
        int ret = 0;

        new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
        BUG_ON(!new_key);

        bch2_trans_init(&trans, c, 0, 0);

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

        genradix_for_each(&c->stripes[0], giter, m) {
                if (!m->dirty)
                        continue;

                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|flags,
                        __bch2_stripe_write_key(&trans, iter, m,
                                        giter.pos, new_key));

                if (ret)
                        break;
        }

        bch2_trans_exit(&trans);

        kfree(new_key);

        return ret;
}

static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id,
                                unsigned level, struct bkey_s_c k)
{
        int ret = 0;

        if (k.k->type == KEY_TYPE_stripe) {
                struct stripe *m;

                ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
                        bch2_mark_key(c, k, 0, 0, NULL, 0,
                                      BTREE_TRIGGER_NOATOMIC);
                if (ret)
                        return ret;

                spin_lock(&c->ec_stripes_heap_lock);
                m = genradix_ptr(&c->stripes[0], k.k->p.offset);
                bch2_stripes_heap_insert(c, m, k.k->p.offset);
                spin_unlock(&c->ec_stripes_heap_lock);
        }

        return ret;
}

int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
        int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC,
                                              NULL, bch2_stripes_read_fn);
        if (ret)
                bch_err(c, "error reading stripes: %i", ret);

        return ret;
}

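/*
 * Preallocate the in-memory stripes radix tree (and, when not called
 * for gc, the stripes heap) to cover every stripe index currently in
 * the btree:
 */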
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        size_t i, idx = 0;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);

        k = bch2_btree_iter_prev(iter);
        if (!IS_ERR_OR_NULL(k.k))
                idx = k.k->p.offset + 1;
        ret = bch2_trans_exit(&trans);
        if (ret)
                return ret;

        if (!idx)
                return 0;

        if (!gc &&
            !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
                       GFP_KERNEL))
                return -ENOMEM;
#if 0
        ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
#else
        for (i = 0; i < idx; i++)
                if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
                        return -ENOMEM;
#endif
        return 0;
}

void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m;
        size_t i;

        spin_lock(&c->ec_stripes_heap_lock);
        for (i = 0; i < min(h->used, 20UL); i++) {
                m = genradix_ptr(&c->stripes[0], h->data[i].idx);

                pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
                       h->data[i].blocks_nonempty,
                       m->nr_blocks - m->nr_redundant,
                       m->nr_redundant);
        }
        spin_unlock(&c->ec_stripes_heap_lock);
}

void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct ec_stripe_head *h;
        struct ec_stripe_new *s;

        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
                pr_buf(out, "target %u algo %u redundancy %u:\n",
                       h->target, h->algo, h->redundancy);

                if (h->s)
                        pr_buf(out, "\tpending: blocks %u allocated %u\n",
                               h->s->blocks.nr,
                               bitmap_weight(h->s->blocks_allocated,
                                             h->s->blocks.nr));
        }
        mutex_unlock(&c->ec_stripe_head_lock);

        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry(s, &c->ec_stripe_new_list, list) {
                pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
                       s->blocks.nr,
                       bitmap_weight(s->blocks_allocated,
                                     s->blocks.nr),
                       atomic_read(&s->pin));
        }
        mutex_unlock(&c->ec_stripe_new_lock);
}

void bch2_fs_ec_exit(struct bch_fs *c)
{
        struct ec_stripe_head *h;

        while (1) {
                mutex_lock(&c->ec_stripe_head_lock);
                h = list_first_entry_or_null(&c->ec_stripe_head_list,
                                             struct ec_stripe_head, list);
                if (h)
                        list_del(&h->list);
                mutex_unlock(&c->ec_stripe_head_lock);
                if (!h)
                        break;

                BUG_ON(h->s);
                kfree(h);
        }

        BUG_ON(!list_empty(&c->ec_stripe_new_list));

        free_heap(&c->ec_stripes_heap);
        genradix_free(&c->stripes[0]);
        bioset_exit(&c->ec_bioset);
}

int bch2_fs_ec_init(struct bch_fs *c)
{
        INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
        INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);

        return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
                           BIOSET_NEED_BVECS);
}