// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io.h"
#include "keylist.h"
#include "recovery.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

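/*
 * Note added for clarity (not in the original source): raid5_recov()
 * below rebuilds one missing block by XORing together all of the
 * surviving blocks. Since RAID5 parity is P = D0 ^ D1 ^ ... ^ Dn-1,
 * any single missing block equals the XOR of all the others; e.g. with
 * two data blocks, a lost D1 is recovered as D1 = D0 ^ P.
 */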
static void raid5_recov(unsigned disks, unsigned failed_idx,
                        size_t size, void **data)
{
        unsigned i = 2, nr;

        BUG_ON(failed_idx >= disks);

        swap(data[0], data[failed_idx]);
        memcpy(data[0], data[1], size);

        while (i < disks) {
                nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
                xor_blocks(nr, size, data[0], data + i);
                i += nr;
        }

        swap(data[0], data[failed_idx]);
}

static void raid_gen(int nd, int np, size_t size, void **v)
{
        if (np >= 1)
                raid5_recov(nd + np, nd, size, v);
        if (np >= 2)
                raid6_call.gen_syndrome(nd + np, size, v);
        BUG_ON(np > 2);
}

static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
        switch (nr) {
        case 0:
                break;
        case 1:
                if (ir[0] < nd + 1)
                        raid5_recov(nd + 1, ir[0], size, v);
                else
                        raid6_call.gen_syndrome(nd + np, size, v);
                break;
        case 2:
                if (ir[1] < nd) {
                        /* data+data failure. */
                        raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
                } else if (ir[0] < nd) {
                        /* data + p/q failure */

                        if (ir[1] == nd) /* data + p failure */
                                raid6_datap_recov(nd + np, size, ir[0], v);
                        else { /* data + q failure */
                                raid5_recov(nd + 1, ir[0], size, v);
                                raid6_call.gen_syndrome(nd + np, size, v);
                        }
                } else {
                        raid_gen(nd, np, size, v);
                }
                break;
        default:
                BUG();
        }
}
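
/*
 * A minimal usage sketch (added for illustration; hypothetical buffers,
 * not part of the original source): generate parity for a 2+2 stripe,
 * simulate losing one data block, then rebuild it:
 *
 *	void *v[4] = { d0, d1, p, q };	// each points at 'size' bytes
 *
 *	raid_gen(2, 2, size, v);	// fill p and q from d0, d1
 *	memset(d1, 0xff, size);		// simulate losing block 1
 *	int ir[] = { 1 };
 *	raid_rec(1, ir, 2, 2, size, v);	// rebuild d1 from d0 + parity
 */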

#else

#include <raid/raid.h>

#endif

struct ec_bio {
        struct bch_dev          *ca;
        struct ec_stripe_buf    *buf;
        size_t                  idx;
        struct bio              bio;
};

/* Stripes btree keys: */

const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

        if (k.k->p.inode)
                return "invalid stripe key";

        if (bkey_val_bytes(k.k) < sizeof(*s) ||
            bkey_val_u64s(k.k) < stripe_val_u64s(s))
                return "incorrect value size";

        return bch2_bkey_ptrs_invalid(c, k);
}

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                         struct bkey_s_c k)
{
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned i;

        pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
               s->algorithm,
               le16_to_cpu(s->sectors),
               s->nr_blocks - s->nr_redundant,
               s->nr_redundant,
               s->csum_type,
               1U << s->csum_granularity_bits);

        for (i = 0; i < s->nr_blocks; i++)
                pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
                       (u64) s->ptrs[i].offset,
                       stripe_blockcount_get(s, i));
}

static int ptr_matches_stripe(struct bch_fs *c,
                              struct bch_stripe *v,
                              const struct bch_extent_ptr *ptr)
{
        unsigned i;

        for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
                const struct bch_extent_ptr *ptr2 = v->ptrs + i;

                if (ptr->dev == ptr2->dev &&
                    ptr->gen == ptr2->gen &&
                    ptr->offset >= ptr2->offset &&
                    ptr->offset <  ptr2->offset + le16_to_cpu(v->sectors))
                        return i;
        }

        return -1;
}

static int extent_matches_stripe(struct bch_fs *c,
                                 struct bch_stripe *v,
                                 struct bkey_s_c k)
{
        switch (k.k->type) {
        case KEY_TYPE_extent: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const struct bch_extent_ptr *ptr;
                int idx;

                extent_for_each_ptr(e, ptr) {
                        idx = ptr_matches_stripe(c, v, ptr);
                        if (idx >= 0)
                                return idx;
                }
                break;
        }
        }

        return -1;
}

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
        switch (k.k->type) {
        case KEY_TYPE_extent: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const union bch_extent_entry *entry;

                extent_for_each_entry(e, entry)
                        if (extent_entry_type(entry) ==
                            BCH_EXTENT_ENTRY_stripe_ptr &&
                            entry->stripe_ptr.idx == idx)
                                return true;

                break;
        }
        }

        return false;
}

/* Stripe bufs: */

static void ec_stripe_buf_free(struct ec_stripe_buf *stripe)
{
        unsigned i;

        for (i = 0; i < stripe->key.v.nr_blocks; i++) {
                kvpfree(stripe->data[i], stripe->size << 9);
                stripe->data[i] = NULL;
        }
}

static int ec_stripe_buf_alloc(struct ec_stripe_buf *stripe)
{
        unsigned i;

        memset(stripe->valid, 0xFF, sizeof(stripe->valid));

        for (i = 0; i < stripe->key.v.nr_blocks; i++) {
                stripe->data[i] = kvpmalloc(stripe->size << 9, GFP_KERNEL);
                if (!stripe->data[i])
                        goto err;
        }

        return 0;
err:
        ec_stripe_buf_free(stripe);
        return -ENOMEM;
}

/* Checksumming: */

static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
        unsigned csums_per_device = stripe_csums_per_device(v);
        unsigned csum_bytes = bch_crc_bytes[v->csum_type];
        unsigned i, j;

        if (!csum_bytes)
                return;

        BUG_ON(buf->offset);
        BUG_ON(buf->size != le16_to_cpu(v->sectors));

        for (i = 0; i < v->nr_blocks; i++) {
                for (j = 0; j < csums_per_device; j++) {
                        unsigned offset = j << v->csum_granularity_bits;
                        unsigned len = min(csum_granularity, buf->size - offset);

                        struct bch_csum csum =
                                bch2_checksum(NULL, v->csum_type,
                                              null_nonce(),
                                              buf->data[i] + (offset << 9),
                                              len << 9);

                        memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
                }
        }
}
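
/*
 * Layout note added for clarity (not in the original source): checksums
 * are stored per (block, granularity chunk). With, say, a 128 sector
 * stripe and csum_granularity_bits = 5 (32 sector chunks), each block
 * carries 128 / 32 = 4 checksums, and stripe_csum(v, i, j) addresses
 * checksum j of block i.
 */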

static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
        unsigned csum_bytes = bch_crc_bytes[v->csum_type];
        unsigned i;

        if (!csum_bytes)
                return;

        for (i = 0; i < v->nr_blocks; i++) {
                unsigned offset = buf->offset;
                unsigned end = buf->offset + buf->size;

                if (!test_bit(i, buf->valid))
                        continue;

                while (offset < end) {
                        unsigned j = offset >> v->csum_granularity_bits;
                        unsigned len = min(csum_granularity, end - offset);
                        struct bch_csum csum;

                        BUG_ON(offset & (csum_granularity - 1));
                        BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
                               ((offset + len) & (csum_granularity - 1)));

                        csum = bch2_checksum(NULL, v->csum_type,
                                             null_nonce(),
                                             buf->data[i] + ((offset - buf->offset) << 9),
                                             len << 9);

                        if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
                                bch_err_ratelimited(c,
                                        "checksum error while doing reconstruct read (%u:%u)",
                                        i, j);
                                clear_bit(i, buf->valid);
                                break;
                        }

                        offset += len;
                }
        }
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = le16_to_cpu(v->sectors) << 9;

        raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
        return buf->key.v.nr_blocks -
                bitmap_weight(buf->valid, buf->key.v.nr_blocks);
}

static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = buf->size << 9;

        if (ec_nr_failed(buf) > v->nr_redundant) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                return -1;
        }

        for (i = 0; i < nr_data; i++)
                if (!test_bit(i, buf->valid))
                        failed[nr_failed++] = i;

        raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
        return 0;
}
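
/*
 * Note added for clarity (not in the original source): only failed
 * *data* blocks are collected into failed[] above; a failed parity
 * block isn't rebuilt here, since the reconstruct read path only needs
 * the data, and stripe creation regenerates parity with
 * ec_generate_ec() anyway.
 */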

/* IO: */

static void ec_block_endio(struct bio *bio)
{
        struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
        struct bch_dev *ca = ec_bio->ca;
        struct closure *cl = bio->bi_private;

        if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
                               bio_data_dir(bio) ? "write" : "read",
                               bch2_blk_status_to_str(bio->bi_status)))
                clear_bit(ec_bio->idx, ec_bio->buf->valid);

        bio_put(&ec_bio->bio);
        percpu_ref_put(&ca->io_ref);
        closure_put(cl);
}

static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
                        unsigned rw, unsigned idx, struct closure *cl)
{
        struct bch_stripe *v = &buf->key.v;
        unsigned offset = 0, bytes = buf->size << 9;
        struct bch_extent_ptr *ptr = &v->ptrs[idx];
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant
                ? BCH_DATA_user
                : BCH_DATA_parity;

        if (!bch2_dev_get_ioref(ca, rw)) {
                clear_bit(idx, buf->valid);
                return;
        }

        this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);

        while (offset < bytes) {
                /* size each bio for the bytes still remaining: */
                unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
                                           DIV_ROUND_UP(bytes - offset, PAGE_SIZE));
                unsigned b = min_t(size_t, bytes - offset,
                                   nr_iovecs << PAGE_SHIFT);
                struct ec_bio *ec_bio;

                ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
                                                       &c->ec_bioset),
                                      struct ec_bio, bio);

                ec_bio->ca                      = ca;
                ec_bio->buf                     = buf;
                ec_bio->idx                     = idx;

                bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
                bio_set_op_attrs(&ec_bio->bio, rw, 0);

                ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
                ec_bio->bio.bi_end_io           = ec_block_endio;
                ec_bio->bio.bi_private          = cl;

                bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

                closure_get(cl);
                percpu_ref_get(&ca->io_ref);

                submit_bio(&ec_bio->bio);

                offset += b;
        }

        percpu_ref_put(&ca->io_ref);
}
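
/*
 * Note added for clarity (not in the original source): a stripe block
 * can be larger than a single bio can cover, so ec_block_io() walks the
 * buffer and submits a chain of bios, each holding up to BIO_MAX_PAGES
 * pages; the shared closure tracks them all in flight.
 */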

/* recovery read path: */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct ec_stripe_buf *buf;
        struct closure cl;
        struct bkey_s_c k;
        struct bch_stripe *v;
        unsigned stripe_idx;
        unsigned offset, end;
        unsigned i, nr_data, csum_granularity;
        int ret = 0, idx;

        closure_init_stack(&cl);

        BUG_ON(!rbio->pick.has_ec);

        stripe_idx = rbio->pick.ec.idx;

        buf = kzalloc(sizeof(*buf), GFP_NOIO);
        if (!buf)
                return -ENOMEM;

        bch2_trans_init(&trans, c, 0, 0);

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
                                   POS(0, stripe_idx),
                                   BTREE_ITER_SLOTS);
        k = bch2_btree_iter_peek_slot(iter);
        if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: stripe not found");
                kfree(buf);
                return bch2_trans_exit(&trans) ?: -EIO;
        }

        bkey_reassemble(&buf->key.k_i, k);
        bch2_trans_exit(&trans);

        v = &buf->key.v;

        nr_data = v->nr_blocks - v->nr_redundant;

        idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
        BUG_ON(idx < 0);

        csum_granularity = 1U << v->csum_granularity_bits;

        offset  = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
        end     = offset + bio_sectors(&rbio->bio);

        BUG_ON(end > le16_to_cpu(v->sectors));

        buf->offset     = round_down(offset, csum_granularity);
        buf->size       = min_t(unsigned, le16_to_cpu(v->sectors),
                                round_up(end, csum_granularity)) - buf->offset;

        for (i = 0; i < v->nr_blocks; i++) {
                buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
                if (!buf->data[i]) {
                        ret = -ENOMEM;
                        goto err;
                }
        }

        memset(buf->valid, 0xFF, sizeof(buf->valid));

        for (i = 0; i < v->nr_blocks; i++) {
                struct bch_extent_ptr *ptr = v->ptrs + i;
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

                if (ptr_stale(ca, ptr)) {
                        bch_err_ratelimited(c,
                                          "error doing reconstruct read: stale pointer");
                        clear_bit(i, buf->valid);
                        continue;
                }

                ec_block_io(c, buf, REQ_OP_READ, i, &cl);
        }

        closure_sync(&cl);

        if (ec_nr_failed(buf) > v->nr_redundant) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                ret = -EIO;
                goto err;
        }

        ec_validate_checksums(c, buf);

        ret = ec_do_recov(c, buf);
        if (ret)
                goto err;

        memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
                      buf->data[idx] + ((offset - buf->offset) << 9));
err:
        for (i = 0; i < v->nr_blocks; i++)
                kfree(buf->data[i]);
        kfree(buf);
        return ret;
}
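
/*
 * Summary added for clarity (not in the original source): the
 * reconstruct read above (1) looks up the stripe key, (2) reads the
 * smallest checksum-granularity-aligned window of every block,
 * (3) drops blocks that failed to read or failed checksum verification,
 * (4) rebuilds the missing data with ec_do_recov(), and (5) copies the
 * requested range into the original read bio.
 */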

/* stripe bucket accounting: */

static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
        ec_stripes_heap n, *h = &c->ec_stripes_heap;

        if (idx >= h->size) {
                if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
                        return -ENOMEM;

                spin_lock(&c->ec_stripes_heap_lock);
                if (n.size > h->size) {
                        memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
                        n.used = h->used;
                        swap(*h, n);
                }
                spin_unlock(&c->ec_stripes_heap_lock);

                free_heap(&n);
        }

        if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
                return -ENOMEM;

        if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
            !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
                return -ENOMEM;

        return 0;
}

static int ec_stripe_mem_alloc(struct bch_fs *c,
                               struct btree_iter *iter)
{
        size_t idx = iter->pos.offset;
        int ret = 0;

        /* Try without blocking first, so we can keep btree locks held: */
        if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
                return ret;

        /*
         * Otherwise drop btree locks before a blocking allocation, and
         * return -EINTR so the transaction gets restarted:
         */
        bch2_trans_unlock(iter->trans);
        ret = -EINTR;

        if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
                return ret;

        return -ENOMEM;
}

static ssize_t stripe_idx_to_delete(struct bch_fs *c)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;

        return h->used && h->data[0].blocks_nonempty == 0
                ? h->data[0].idx : -1;
}

static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
                                      struct ec_stripe_heap_entry l,
                                      struct ec_stripe_heap_entry r)
{
        return ((l.blocks_nonempty > r.blocks_nonempty) -
                (l.blocks_nonempty < r.blocks_nonempty));
}

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
                                                   size_t i)
{
        struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

        genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m = genradix_ptr(&c->stripes[0], idx);

        BUG_ON(!m->alive);
        BUG_ON(m->heap_idx >= h->used);
        BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_del(struct bch_fs *c,
                           struct stripe *m, size_t idx)
{
        if (!m->on_heap)
                return;

        m->on_heap = false;

        heap_verify_backpointer(c, idx);

        heap_del(&c->ec_stripes_heap, m->heap_idx,
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
                              struct stripe *m, size_t idx)
{
        if (m->on_heap)
                return;

        BUG_ON(heap_full(&c->ec_stripes_heap));

        m->on_heap = true;

        heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
                        .idx = idx,
                        .blocks_nonempty = m->blocks_nonempty,
                }),
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);

        heap_verify_backpointer(c, idx);
}

void bch2_stripes_heap_update(struct bch_fs *c,
                              struct stripe *m, size_t idx)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        size_t i;

        if (!m->on_heap)
                return;

        heap_verify_backpointer(c, idx);

        h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

        i = m->heap_idx;
        heap_sift_up(h,   i, ec_stripes_heap_cmp,
                     ec_stripes_heap_set_backpointer);
        heap_sift_down(h, i, ec_stripes_heap_cmp,
                       ec_stripes_heap_set_backpointer);

        heap_verify_backpointer(c, idx);

        if (stripe_idx_to_delete(c) >= 0 &&
            !percpu_ref_is_dying(&c->writes))
                schedule_work(&c->ec_stripe_delete_work);
}
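
/*
 * Note added for clarity (not in the original source): the stripes heap
 * orders stripes by blocks_nonempty, so the root entry is always the
 * emptiest stripe; once it reaches zero nonempty blocks it becomes a
 * deletion candidate, which is why bch2_stripes_heap_update() kicks
 * ec_stripe_delete_work. Heap entries carry backpointers (m->heap_idx)
 * so an entry can be resifted in place after an update.
 */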

/* stripe deletion */

static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
        //pr_info("deleting stripe %zu", idx);
        return bch2_btree_delete_range(c, BTREE_ID_EC,
                                       POS(0, idx),
                                       POS(0, idx + 1),
                                       NULL);
}

static void ec_stripe_delete_work(struct work_struct *work)
{
        struct bch_fs *c =
                container_of(work, struct bch_fs, ec_stripe_delete_work);
        ssize_t idx;

        while (1) {
                spin_lock(&c->ec_stripes_heap_lock);
                idx = stripe_idx_to_delete(c);
                if (idx < 0) {
                        spin_unlock(&c->ec_stripes_heap_lock);
                        break;
                }

                bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx);
                spin_unlock(&c->ec_stripes_heap_lock);

                if (ec_stripe_delete(c, idx))
                        break;
        }
}

/* stripe creation: */

static int ec_stripe_bkey_insert(struct bch_fs *c,
                                 struct ec_stripe_new *s,
                                 struct bkey_i_stripe *stripe)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bpos start_pos = POS(0, c->ec_stripe_hint);
        int ret;

        bch2_trans_init(&trans, c, 0, 0);
retry:
        bch2_trans_begin(&trans);

        for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos,
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
                if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
                        if (start_pos.offset) {
                                start_pos = POS_MIN;
                                bch2_btree_iter_set_pos(iter, start_pos);
                                continue;
                        }

                        ret = -ENOSPC;
                        break;
                }

                if (bkey_deleted(k.k))
                        goto found_slot;
        }

        goto err;
found_slot:
        start_pos = iter->pos;

        ret = ec_stripe_mem_alloc(c, iter);
        if (ret)
                goto err;

        stripe->k.p = iter->pos;

        bch2_trans_update(&trans, iter, &stripe->k_i, 0);

        ret = bch2_trans_commit(&trans, &s->res, NULL,
                                BTREE_INSERT_NOFAIL);
err:
        bch2_trans_iter_put(&trans, iter);

        if (ret == -EINTR)
                goto retry;

        c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1;
        bch2_trans_exit(&trans);

        return ret;
}
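
/*
 * Note added for clarity (not in the original source): stripe keys live
 * at inode 0, so the search above scans forward from the cached
 * allocation hint for the first deleted slot, wraps around to POS_MIN
 * once before giving up with -ENOSPC, and on success stores the next
 * offset back in c->ec_stripe_hint.
 */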

static void extent_stripe_ptr_add(struct bkey_s_extent e,
                                  struct ec_stripe_buf *s,
                                  struct bch_extent_ptr *ptr,
                                  unsigned block)
{
        struct bch_extent_stripe_ptr *dst = (void *) ptr;
        union bch_extent_entry *end = extent_entry_last(e);

        memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
        e.k->u64s += sizeof(*dst) / sizeof(u64);

        *dst = (struct bch_extent_stripe_ptr) {
                .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
                .block          = block,
                .idx            = s->key.k.p.offset,
        };
}
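
/*
 * Note added for clarity (not in the original source): an extent value
 * is a packed list of entries, so the stripe pointer is spliced in at
 * the position of the device pointer it annotates: memmove_u64s_up()
 * shifts that pointer and the rest of the value up one u64, and
 * k->u64s grows to match.
 */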

static int ec_stripe_update_ptrs(struct bch_fs *c,
                                 struct ec_stripe_buf *s,
                                 struct bkey *pos)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bkey_s_extent e;
        struct bkey_buf sk;
        int ret = 0, dev, idx;

        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);

        /* XXX this doesn't support the reflink btree */

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
                                   bkey_start_pos(pos),
                                   BTREE_ITER_INTENT);

        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
                struct bch_extent_ptr *ptr, *ec_ptr = NULL;

                if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
                        bch2_btree_iter_next(iter);
                        continue;
                }

                idx = extent_matches_stripe(c, &s->key.v, k);
                if (idx < 0) {
                        bch2_btree_iter_next(iter);
                        continue;
                }

                dev = s->key.v.ptrs[idx].dev;

                bch2_bkey_buf_reassemble(&sk, c, k);
                e = bkey_i_to_s_extent(sk.k);

                bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
                ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
                BUG_ON(!ec_ptr);

                extent_stripe_ptr_add(e, s, ec_ptr, idx);

                bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
                bch2_trans_update(&trans, iter, sk.k, 0);

                ret = bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
                        break;
        }

        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);

        return ret;
}

/*
 * data buckets of new stripe all written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
        struct bch_fs *c = s->c;
        struct open_bucket *ob;
        struct bkey_i *k;
        struct stripe *m;
        struct bch_stripe *v = &s->new_stripe.key.v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        int ret;

        BUG_ON(s->h->s == s);

        closure_sync(&s->iodone);

        if (s->err) {
                if (s->err != -EROFS)
                        bch_err(c, "error creating stripe: error writing data buckets");
                goto err;
        }

        if (s->have_existing_stripe) {
                ec_validate_checksums(c, &s->existing_stripe);

                if (ec_do_recov(c, &s->existing_stripe)) {
                        bch_err(c, "error creating stripe: error reading existing stripe");
                        goto err;
                }

                for (i = 0; i < nr_data; i++)
                        if (stripe_blockcount_get(&s->existing_stripe.key.v, i))
                                swap(s->new_stripe.data[i],
                                     s->existing_stripe.data[i]);

                ec_stripe_buf_free(&s->existing_stripe);
        }

        BUG_ON(!s->allocated);

        if (!percpu_ref_tryget(&c->writes))
                goto err;

        BUG_ON(bitmap_weight(s->blocks_allocated,
                             s->blocks.nr) != s->blocks.nr);

        ec_generate_ec(&s->new_stripe);

        ec_generate_checksums(&s->new_stripe);

        /* write p/q: */
        for (i = nr_data; i < v->nr_blocks; i++)
                ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
        closure_sync(&s->iodone);

        if (ec_nr_failed(&s->new_stripe)) {
                bch_err(c, "error creating stripe: error writing redundancy buckets");
                goto err_put_writes;
        }

        ret = s->have_existing_stripe
                ? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i,
                                    &s->res, NULL, BTREE_INSERT_NOFAIL)
                : ec_stripe_bkey_insert(c, s, &s->new_stripe.key);
        if (ret) {
                bch_err(c, "error creating stripe: error creating stripe key");
                goto err_put_writes;
        }

        for_each_keylist_key(&s->keys, k) {
                ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
                if (ret) {
                        bch_err(c, "error creating stripe: error %i updating pointers", ret);
                        break;
                }
        }

        spin_lock(&c->ec_stripes_heap_lock);
        m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
#if 0
        pr_info("created a %s stripe %llu",
                s->have_existing_stripe ? "existing" : "new",
                s->new_stripe.key.k.p.offset);
#endif
        BUG_ON(m->on_heap);
        bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
        spin_unlock(&c->ec_stripes_heap_lock);
err_put_writes:
        percpu_ref_put(&c->writes);
err:
        bch2_disk_reservation_put(c, &s->res);

        open_bucket_for_each(c, &s->blocks, ob, i) {
                ob->ec = NULL;
                __bch2_open_bucket_put(c, ob);
        }

        bch2_open_buckets_put(c, &s->parity);

        bch2_keylist_free(&s->keys, s->inline_keys);

        ec_stripe_buf_free(&s->existing_stripe);
        ec_stripe_buf_free(&s->new_stripe);
        closure_debug_destroy(&s->iodone);
        kfree(s);
}
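
/*
 * Summary added for clarity (not in the original source): once every
 * data bucket has been written (the pin count hits zero),
 * ec_stripe_create() reads back any blocks reused from an existing
 * stripe, computes p/q with ec_generate_ec(), writes the parity
 * buckets, inserts the stripe key, and then rewrites the extents on
 * the keylist to point at the stripe.
 */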

static void ec_stripe_create_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(work,
                struct bch_fs, ec_stripe_create_work);
        struct ec_stripe_new *s, *n;
restart:
        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list)
                if (!atomic_read(&s->pin)) {
                        list_del(&s->list);
                        mutex_unlock(&c->ec_stripe_new_lock);
                        ec_stripe_create(s);
                        goto restart;
                }
        mutex_unlock(&c->ec_stripe_new_lock);
}

static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s)
{
        BUG_ON(atomic_read(&s->pin) <= 0);

        if (atomic_dec_and_test(&s->pin)) {
                BUG_ON(!s->pending);
                queue_work(system_long_wq, &c->ec_stripe_create_work);
        }
}

static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct ec_stripe_new *s = h->s;

        BUG_ON(!s->allocated && !s->err);

        h->s            = NULL;
        s->pending      = true;

        mutex_lock(&c->ec_stripe_new_lock);
        list_add(&s->list, &c->ec_stripe_new_list);
        mutex_unlock(&c->ec_stripe_new_lock);

        ec_stripe_new_put(c, s);
}

/* have a full bucket - hand it off to be erasure coded: */
void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
{
        struct ec_stripe_new *s = ob->ec;

        if (ob->sectors_free)
                s->err = -1;

        ec_stripe_new_put(c, s);
}

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
        struct ec_stripe_new *s = ob->ec;

        s->err = -EIO;
}

void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
        struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
        struct bch_dev *ca;
        unsigned offset;

        if (!ob)
                return NULL;

        ca      = bch_dev_bkey_exists(c, ob->ptr.dev);
        offset  = ca->mi.bucket_size - ob->sectors_free;

        return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}

void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
                             struct bpos pos, unsigned sectors)
{
        struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
        struct ec_stripe_new *ec;

        if (!ob)
                return;

        //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);

        ec = ob->ec;
        mutex_lock(&ec->lock);

        if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
                                 ARRAY_SIZE(ec->inline_keys),
                                 BKEY_U64s)) {
                BUG();
        }

        bkey_init(&ec->keys.top->k);
        ec->keys.top->k.p       = pos;
        bch2_key_resize(&ec->keys.top->k, sectors);
        bch2_keylist_push(&ec->keys);

        mutex_unlock(&ec->lock);
}

static int unsigned_cmp(const void *_l, const void *_r)
{
        unsigned l = *((const unsigned *) _l);
        unsigned r = *((const unsigned *) _r);

        return cmp_int(l, r);
}

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
                               struct bch_devs_mask *devs)
{
        struct bch_dev *ca;
        unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
        struct {
                unsigned nr, size;
        } cur = { 0, 0 }, best = { 0, 0 };

        for_each_member_device_rcu(ca, c, i, devs)
                sizes[nr++] = ca->mi.bucket_size;

        sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

        for (i = 0; i < nr; i++) {
                if (sizes[i] != cur.size) {
                        if (cur.nr > best.nr)
                                best = cur;

                        cur.nr = 0;
                        cur.size = sizes[i];
                }

                cur.nr++;
        }

        if (cur.nr > best.nr)
                best = cur;

        return best.size;
}

static bool may_create_new_stripe(struct bch_fs *c)
{
        return false;
}

static void ec_stripe_key_init(struct bch_fs *c,
                               struct bkey_i_stripe *s,
                               unsigned nr_data,
                               unsigned nr_parity,
                               unsigned stripe_size)
{
        unsigned u64s;

        bkey_stripe_init(&s->k_i);
        s->v.sectors                    = cpu_to_le16(stripe_size);
        s->v.algorithm                  = 0;
        s->v.nr_blocks                  = nr_data + nr_parity;
        s->v.nr_redundant               = nr_parity;
        s->v.csum_granularity_bits      = ilog2(c->sb.encoded_extent_max);
        s->v.csum_type                  = BCH_CSUM_CRC32C;
        s->v.pad                        = 0;

        while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
                BUG_ON(1 << s->v.csum_granularity_bits >=
                       le16_to_cpu(s->v.sectors) ||
                       s->v.csum_granularity_bits == U8_MAX);
                s->v.csum_granularity_bits++;
        }

        set_bkey_val_u64s(&s->k, u64s);
}
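
/*
 * Note added for clarity (not in the original source): the loop above
 * keeps the stripe value within BKEY_VAL_U64s_MAX by coarsening the
 * checksum granularity; each increment of csum_granularity_bits roughly
 * halves the number of checksums stored per block, shrinking the value
 * until it fits.
 */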

static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct ec_stripe_new *s;

        lockdep_assert_held(&h->lock);

        s = kzalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -ENOMEM;

        mutex_init(&s->lock);
        closure_init(&s->iodone, NULL);
        atomic_set(&s->pin, 1);
        s->c            = c;
        s->h            = h;
        s->nr_data      = min_t(unsigned, h->nr_active_devs,
                                BCH_BKEY_PTRS_MAX) - h->redundancy;
        s->nr_parity    = h->redundancy;

        bch2_keylist_init(&s->keys, s->inline_keys);

        s->new_stripe.offset    = 0;
        s->new_stripe.size      = h->blocksize;

        ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
                           s->nr_parity, h->blocksize);

        h->s = s;
        return 0;
}

static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
                         unsigned algo, unsigned redundancy,
                         bool copygc)
{
        struct ec_stripe_head *h;
        struct bch_dev *ca;
        unsigned i;

        h = kzalloc(sizeof(*h), GFP_KERNEL);
        if (!h)
                return NULL;

        mutex_init(&h->lock);
        mutex_lock(&h->lock);

        h->target       = target;
        h->algo         = algo;
        h->redundancy   = redundancy;
        h->copygc       = copygc;

        rcu_read_lock();
        h->devs = target_rw_devs(c, BCH_DATA_user, target);

        for_each_member_device_rcu(ca, c, i, &h->devs)
                if (!ca->mi.durability)
                        __clear_bit(i, h->devs.d);

        h->blocksize = pick_blocksize(c, &h->devs);

        for_each_member_device_rcu(ca, c, i, &h->devs)
                if (ca->mi.bucket_size == h->blocksize)
                        h->nr_active_devs++;

        rcu_read_unlock();
        list_add(&h->list, &c->ec_stripe_head_list);
        return h;
}

void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
        if (h->s &&
            h->s->allocated &&
            bitmap_weight(h->s->blocks_allocated,
                          h->s->blocks.nr) == h->s->blocks.nr)
                ec_stripe_set_pending(c, h);

        mutex_unlock(&h->lock);
}

struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
                                                 unsigned target,
                                                 unsigned algo,
                                                 unsigned redundancy,
                                                 bool copygc)
{
        struct ec_stripe_head *h;

        if (!redundancy)
                return NULL;

        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list)
                if (h->target           == target &&
                    h->algo             == algo &&
                    h->redundancy       == redundancy &&
                    h->copygc           == copygc) {
                        mutex_lock(&h->lock);
                        goto found;
                }

        h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc);
found:
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
}

static enum bucket_alloc_ret
new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
                         struct closure *cl)
{
        struct bch_devs_mask devs;
        struct open_bucket *ob;
        unsigned i, nr_have, nr_data =
                min_t(unsigned, h->nr_active_devs,
                      BCH_BKEY_PTRS_MAX) - h->redundancy;
        bool have_cache = true;
        enum bucket_alloc_ret ret = ALLOC_SUCCESS;

        devs = h->devs;

        for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) {
                __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
                --nr_data;
        }

        BUG_ON(h->s->blocks.nr > nr_data);
        BUG_ON(h->s->parity.nr > h->redundancy);

        open_bucket_for_each(c, &h->s->parity, ob, i)
                __clear_bit(ob->ptr.dev, devs.d);
        open_bucket_for_each(c, &h->s->blocks, ob, i)
                __clear_bit(ob->ptr.dev, devs.d);

        percpu_down_read(&c->mark_lock);
        rcu_read_lock();

        if (h->s->parity.nr < h->redundancy) {
                nr_have = h->s->parity.nr;

                ret = bch2_bucket_alloc_set(c, &h->s->parity,
                                            &h->parity_stripe,
                                            &devs,
                                            h->redundancy,
                                            &nr_have,
                                            &have_cache,
                                            h->copygc
                                            ? RESERVE_MOVINGGC
                                            : RESERVE_NONE,
                                            0,
                                            cl);
                if (ret)
                        goto err;
        }

        if (h->s->blocks.nr < nr_data) {
                nr_have = h->s->blocks.nr;

                ret = bch2_bucket_alloc_set(c, &h->s->blocks,
                                            &h->block_stripe,
                                            &devs,
                                            nr_data,
                                            &nr_have,
                                            &have_cache,
                                            h->copygc
                                            ? RESERVE_MOVINGGC
                                            : RESERVE_NONE,
                                            0,
                                            cl);
                if (ret)
                        goto err;
        }
err:
        rcu_read_unlock();
        percpu_up_read(&c->mark_lock);
        return ret;
}
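
/*
 * Note added for clarity (not in the original source): every device that
 * already backs a block of this stripe is masked out of the candidate
 * set before allocating, so each stripe block lands on a distinct
 * device; parity buckets are allocated before data buckets.
 */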

/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
                               unsigned target,
                               unsigned algo,
                               unsigned redundancy)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m;
        size_t heap_idx;
        u64 stripe_idx;

        if (may_create_new_stripe(c))
                return -1;

        spin_lock(&c->ec_stripes_heap_lock);
        for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
                if (!h->data[heap_idx].blocks_nonempty)
                        continue;

                stripe_idx = h->data[heap_idx].idx;
                m = genradix_ptr(&c->stripes[0], stripe_idx);

                if (m->algorithm        == algo &&
                    m->nr_redundant     == redundancy &&
                    m->blocks_nonempty  < m->nr_blocks - m->nr_redundant) {
                        bch2_stripes_heap_del(c, m, stripe_idx);
                        spin_unlock(&c->ec_stripes_heap_lock);
                        return stripe_idx;
                }
        }

        spin_unlock(&c->ec_stripes_heap_lock);
        return -1;
}

static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (!ret)
                bkey_reassemble(&stripe->key.k_i, k);
        bch2_trans_exit(&trans);

        return ret;
}

struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                                               unsigned target,
                                               unsigned algo,
                                               unsigned redundancy,
                                               bool copygc,
                                               struct closure *cl)
{
        struct ec_stripe_head *h;
        struct open_bucket *ob;
        unsigned i, data_idx = 0;
        s64 idx;
        int ret;

        h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc);
        if (!h) {
                bch_err(c, "no stripe head");
                return NULL;
        }

        if (!h->s) {
                if (ec_new_stripe_alloc(c, h)) {
                        bch2_ec_stripe_head_put(c, h);
                        bch_err(c, "failed to allocate new stripe");
                        return NULL;
                }

                idx = get_existing_stripe(c, target, algo, redundancy);
                if (idx >= 0) {
                        h->s->have_existing_stripe = true;
                        ret = get_stripe_key(c, idx, &h->s->existing_stripe);
                        if (ret) {
                                bch2_fs_fatal_error(c, "error reading stripe key: %i", ret);
                                bch2_ec_stripe_head_put(c, h);
                                return NULL;
                        }

                        if (ec_stripe_buf_alloc(&h->s->existing_stripe)) {
                                /*
                                 * this is a problem: we have deleted from the
                                 * stripes heap already
                                 */
                                BUG();
                        }

                        for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) {
                                if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i))
                                        __set_bit(i, h->s->blocks_allocated);

                                ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
                        }

                        bkey_copy(&h->s->new_stripe.key.k_i,
                                  &h->s->existing_stripe.key.k_i);
                }

                if (ec_stripe_buf_alloc(&h->s->new_stripe)) {
                        BUG();
                }
        }

        if (!h->s->allocated) {
                if (!h->s->have_existing_stripe &&
                    !h->s->res.sectors) {
                        ret = bch2_disk_reservation_get(c, &h->s->res,
                                        h->blocksize,
                                        h->s->nr_parity, 0);
                        if (ret) {
                                /*
                                 * This means we need to wait for copygc to
                                 * empty out buckets from existing stripes:
                                 */
                                bch2_ec_stripe_head_put(c, h);
                                h = NULL;
                                goto out;
                        }
                }

                ret = new_stripe_alloc_buckets(c, h, cl);
                if (ret) {
                        bch2_ec_stripe_head_put(c, h);
                        h = ERR_PTR(-ret);
                        goto out;
                }

                open_bucket_for_each(c, &h->s->blocks, ob, i) {
                        data_idx = find_next_zero_bit(h->s->blocks_allocated,
                                                      h->s->nr_data, data_idx);
                        BUG_ON(data_idx >= h->s->nr_data);

                        h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr;
                        h->s->data_block_idx[i] = data_idx;
                        data_idx++;
                }

                open_bucket_for_each(c, &h->s->parity, ob, i)
                        h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;

                //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
                h->s->allocated = true;
        }
out:
        return h;
}

void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
        struct ec_stripe_head *h;
        struct open_bucket *ob;
        unsigned i;

        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
                mutex_lock(&h->lock);
                if (!h->s)
                        goto unlock;

                open_bucket_for_each(c, &h->s->blocks, ob, i)
                        if (ob->ptr.dev == ca->dev_idx)
                                goto found;
                open_bucket_for_each(c, &h->s->parity, ob, i)
                        if (ob->ptr.dev == ca->dev_idx)
                                goto found;
                goto unlock;
found:
                h->s->err = -EROFS;
                ec_stripe_set_pending(c, h);
unlock:
                mutex_unlock(&h->lock);
        }
        mutex_unlock(&c->ec_stripe_head_lock);
}

static int __bch2_stripe_write_key(struct btree_trans *trans,
                                   struct btree_iter *iter,
                                   struct stripe *m,
                                   size_t idx,
                                   struct bkey_i_stripe *new_key)
{
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
        unsigned i;
        int ret;

        bch2_btree_iter_set_pos(iter, POS(0, idx));

        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
                return ret;

        if (k.k->type != KEY_TYPE_stripe)
                return -EIO;

        bkey_reassemble(&new_key->k_i, k);

        spin_lock(&c->ec_stripes_heap_lock);

        for (i = 0; i < new_key->v.nr_blocks; i++)
                stripe_blockcount_set(&new_key->v, i,
                                      m->block_sectors[i]);
        m->dirty = false;

        spin_unlock(&c->ec_stripes_heap_lock);

        bch2_trans_update(trans, iter, &new_key->k_i, 0);
        return 0;
}

int bch2_stripes_write(struct bch_fs *c, unsigned flags)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct genradix_iter giter;
        struct bkey_i_stripe *new_key;
        struct stripe *m;
        int ret = 0;

        new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
        BUG_ON(!new_key);

        bch2_trans_init(&trans, c, 0, 0);

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

        genradix_for_each(&c->stripes[0], giter, m) {
                if (!m->dirty)
                        continue;

                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|flags,
                        __bch2_stripe_write_key(&trans, iter, m,
                                        giter.pos, new_key));

                if (ret)
                        break;
        }

        bch2_trans_exit(&trans);

        kfree(new_key);

        return ret;
}

static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id,
                                unsigned level, struct bkey_s_c k)
{
        int ret = 0;

        if (k.k->type == KEY_TYPE_stripe) {
                struct stripe *m;

                ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
                        bch2_mark_key(c, k, 0, 0, NULL, 0,
                                      BTREE_TRIGGER_NOATOMIC);
                if (ret)
                        return ret;

                spin_lock(&c->ec_stripes_heap_lock);
                m = genradix_ptr(&c->stripes[0], k.k->p.offset);
                bch2_stripes_heap_insert(c, m, k.k->p.offset);
                spin_unlock(&c->ec_stripes_heap_lock);
        }

        return ret;
}

int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
        int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC,
                                              NULL, bch2_stripes_read_fn);
        if (ret)
                bch_err(c, "error reading stripes: %i", ret);

        return ret;
}

int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        size_t i, idx = 0;
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);

        k = bch2_btree_iter_prev(iter);
        if (!IS_ERR_OR_NULL(k.k))
                idx = k.k->p.offset + 1;
        ret = bch2_trans_exit(&trans);
        if (ret)
                return ret;

        if (!idx)
                return 0;

        if (!gc &&
            !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
                       GFP_KERNEL))
                return -ENOMEM;
#if 0
        ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
#else
        for (i = 0; i < idx; i++)
                if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
                        return -ENOMEM;
#endif
        return 0;
}

void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m;
        size_t i;

        spin_lock(&c->ec_stripes_heap_lock);
        for (i = 0; i < min_t(size_t, h->used, 20); i++) {
                m = genradix_ptr(&c->stripes[0], h->data[i].idx);

                pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
                       h->data[i].blocks_nonempty,
                       m->nr_blocks - m->nr_redundant,
                       m->nr_redundant);
        }
        spin_unlock(&c->ec_stripes_heap_lock);
}

void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct ec_stripe_head *h;
        struct ec_stripe_new *s;

        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
                pr_buf(out, "target %u algo %u redundancy %u:\n",
                       h->target, h->algo, h->redundancy);

                if (h->s)
                        pr_buf(out, "\tpending: blocks %u allocated %u\n",
                               h->s->blocks.nr,
                               bitmap_weight(h->s->blocks_allocated,
                                             h->s->blocks.nr));
        }
        mutex_unlock(&c->ec_stripe_head_lock);

        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry(s, &c->ec_stripe_new_list, list) {
                pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
                       s->blocks.nr,
                       bitmap_weight(s->blocks_allocated,
                                     s->blocks.nr),
                       atomic_read(&s->pin));
        }
        mutex_unlock(&c->ec_stripe_new_lock);
}

void bch2_fs_ec_exit(struct bch_fs *c)
{
        struct ec_stripe_head *h;

        while (1) {
                mutex_lock(&c->ec_stripe_head_lock);
                h = list_first_entry_or_null(&c->ec_stripe_head_list,
                                             struct ec_stripe_head, list);
                if (h)
                        list_del(&h->list);
                mutex_unlock(&c->ec_stripe_head_lock);
                if (!h)
                        break;

                BUG_ON(h->s);
                kfree(h);
        }

        BUG_ON(!list_empty(&c->ec_stripe_new_list));

        free_heap(&c->ec_stripes_heap);
        genradix_free(&c->stripes[0]);
        bioset_exit(&c->ec_bioset);
}

int bch2_fs_ec_init(struct bch_fs *c)
{
        INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
        INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);

        return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
                           BIOSET_NEED_BVECS);
}