// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

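/*
 * Recover a single failed block (or generate P) with plain XOR: swap the
 * failed block into slot 0, seed it with the first surviving block, XOR
 * in the remaining blocks in batches of at most MAX_XOR_BLOCKS, then
 * swap it back into place.
 */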
static void raid5_recov(unsigned disks, unsigned failed_idx,
                        size_t size, void **data)
{
        unsigned i = 2, nr;

        BUG_ON(failed_idx >= disks);

        swap(data[0], data[failed_idx]);
        memcpy(data[0], data[1], size);

        while (i < disks) {
                nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
                xor_blocks(nr, size, data[0], data + i);
                i += nr;
        }

        swap(data[0], data[failed_idx]);
}

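/*
 * Generate redundancy: P is computed by "recovering" block nd with
 * raid5_recov() (regenerating a block and recovering it are the same
 * XOR), and Q - when present - via the raid6 syndrome, which computes
 * both P and Q from the data blocks. At most two redundant blocks are
 * supported.
 */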
static void raid_gen(int nd, int np, size_t size, void **v)
{
        if (np >= 1)
                raid5_recov(nd + np, nd, size, v);
        if (np >= 2)
                raid6_call.gen_syndrome(nd + np, size, v);
        BUG_ON(np > 2);
}

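/*
 * Reconstruct up to two failed blocks, given the sorted indices of the
 * failures in ir[]: a single failed data or P block is plain XOR, a
 * single failed Q is regenerated from the syndrome, and the two-failure
 * cases map onto the raid6 data+data/data+P/data+Q recovery routines
 * (P+Q failed together is just parity regeneration).
 */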
static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
        switch (nr) {
        case 0:
                break;
        case 1:
                if (ir[0] < nd + 1)
                        raid5_recov(nd + 1, ir[0], size, v);
                else
                        raid6_call.gen_syndrome(nd + np, size, v);
                break;
        case 2:
                if (ir[1] < nd) {
                        /* data+data failure. */
                        raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
                } else if (ir[0] < nd) {
                        /* data + p/q failure */

                        if (ir[1] == nd) /* data + p failure */
                                raid6_datap_recov(nd + np, size, ir[0], v);
                        else { /* data + q failure */
                                raid5_recov(nd + 1, ir[0], size, v);
                                raid6_call.gen_syndrome(nd + np, size, v);
                        }
                } else {
                        raid_gen(nd, np, size, v);
                }
                break;
        default:
                BUG();
        }
}

#else

#include <raid/raid.h>

#endif

struct ec_bio {
        struct bch_dev          *ca;
        struct ec_stripe_buf    *buf;
        size_t                  idx;
        struct bio              bio;
};

/* Stripes btree keys: */

int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
                        enum bkey_invalid_flags flags,
                        struct printbuf *err)
{
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        int ret = 0;

        bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
                         bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
                         stripe_pos_bad,
                         "stripe at bad pos");

        bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
                         stripe_val_size_bad,
                         "incorrect value size (%zu < %u)",
                         bkey_val_u64s(k.k), stripe_val_u64s(s));

        ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
        return ret;
}

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                         struct bkey_s_c k)
{
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned i, nr_data = s->nr_blocks - s->nr_redundant;

        prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
               s->algorithm,
               le16_to_cpu(s->sectors),
               nr_data,
               s->nr_redundant,
               s->csum_type,
               1U << s->csum_granularity_bits);

        for (i = 0; i < s->nr_blocks; i++) {
                const struct bch_extent_ptr *ptr = s->ptrs + i;
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
                u32 offset;
                u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);

                prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
                if (i < nr_data)
                        prt_printf(out, "#%u", stripe_blockcount_get(s, i));
                prt_printf(out, " gen %u", ptr->gen);
                if (ptr_stale(ca, ptr))
                        prt_printf(out, " stale");
        }
}

/* returns blocknr in stripe that we matched: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
                                                struct bkey_s_c k, unsigned *block)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        unsigned i, nr_data = s->nr_blocks - s->nr_redundant;

        bkey_for_each_ptr(ptrs, ptr)
                for (i = 0; i < nr_data; i++)
                        if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
                                                      le16_to_cpu(s->sectors))) {
                                *block = i;
                                return ptr;
                        }

        return NULL;
}

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
        switch (k.k->type) {
        case KEY_TYPE_extent: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const union bch_extent_entry *entry;

                extent_for_each_entry(e, entry)
                        if (extent_entry_type(entry) ==
                            BCH_EXTENT_ENTRY_stripe_ptr &&
                            entry->stripe_ptr.idx == idx)
                                return true;

                break;
        }
        }

        return false;
}

/* Stripe bufs: */

static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
{
        if (buf->key.k.type == KEY_TYPE_stripe) {
                struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
                unsigned i;

                for (i = 0; i < s->v.nr_blocks; i++) {
                        kvpfree(buf->data[i], buf->size << 9);
                        buf->data[i] = NULL;
                }
        }
}

/* XXX: this is a non-mempoolified memory allocation: */
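/*
 * Allocate bounce buffers for the given range of the stripe, widening
 * the range to checksum granularity boundaries so the data read can
 * still be verified:
 */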
static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
                              unsigned offset, unsigned size)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned csum_granularity = 1U << v->csum_granularity_bits;
        unsigned end = offset + size;
        unsigned i;

        BUG_ON(end > le16_to_cpu(v->sectors));

        offset  = round_down(offset, csum_granularity);
        end     = min_t(unsigned, le16_to_cpu(v->sectors),
                        round_up(end, csum_granularity));

        buf->offset     = offset;
        buf->size       = end - offset;

        memset(buf->valid, 0xFF, sizeof(buf->valid));

        for (i = 0; i < v->nr_blocks; i++) {
                buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
                if (!buf->data[i])
                        goto err;
        }

        return 0;
err:
        ec_stripe_buf_exit(buf);
        return -BCH_ERR_ENOMEM_stripe_buf;
}

/* Checksumming: */

static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
                                         unsigned block, unsigned offset)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
        unsigned end = buf->offset + buf->size;
        unsigned len = min(csum_granularity, end - offset);

        BUG_ON(offset >= end);
        BUG_ON(offset <  buf->offset);
        BUG_ON(offset & (csum_granularity - 1));
        BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
               (len & (csum_granularity - 1)));

        return bch2_checksum(NULL, v->csum_type,
                             null_nonce(),
                             buf->data[block] + ((offset - buf->offset) << 9),
                             len << 9);
}

static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned i, j, csums_per_device = stripe_csums_per_device(v);

        if (!v->csum_type)
                return;

        BUG_ON(buf->offset);
        BUG_ON(buf->size != le16_to_cpu(v->sectors));

        for (i = 0; i < v->nr_blocks; i++)
                for (j = 0; j < csums_per_device; j++)
                        stripe_csum_set(v, i, j,
                                ec_block_checksum(buf, i, j << v->csum_granularity_bits));
}

static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
        unsigned i;

        if (!v->csum_type)
                return;

        for (i = 0; i < v->nr_blocks; i++) {
                unsigned offset = buf->offset;
                unsigned end = buf->offset + buf->size;

                if (!test_bit(i, buf->valid))
                        continue;

                while (offset < end) {
                        unsigned j = offset >> v->csum_granularity_bits;
                        unsigned len = min(csum_granularity, end - offset);
                        struct bch_csum want = stripe_csum_get(v, i, j);
                        struct bch_csum got = ec_block_checksum(buf, i, offset);

                        if (bch2_crc_cmp(want, got)) {
                                struct printbuf err = PRINTBUF;
                                struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);

                                prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
                                           want.hi, want.lo,
                                           got.hi, got.lo,
                                           bch2_csum_types[v->csum_type]);
                                prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
                                bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
                                bch_err_ratelimited(ca, "%s", err.buf);
                                printbuf_exit(&err);

                                clear_bit(i, buf->valid);

                                bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
                                break;
                        }

                        offset += len;
                }
        }
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = le16_to_cpu(v->sectors) << 9;

        raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;

        return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
}

static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = buf->size << 9;

        if (ec_nr_failed(buf) > v->nr_redundant) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                return -1;
        }

        for (i = 0; i < nr_data; i++)
                if (!test_bit(i, buf->valid))
                        failed[nr_failed++] = i;

        raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
        return 0;
}

/* IO: */

static void ec_block_endio(struct bio *bio)
{
        struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
        struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
        struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
        struct bch_dev *ca = ec_bio->ca;
        struct closure *cl = bio->bi_private;

        if (bch2_dev_io_err_on(bio->bi_status, ca,
                               bio_data_dir(bio)
                               ? BCH_MEMBER_ERROR_write
                               : BCH_MEMBER_ERROR_read,
                               "erasure coding %s error: %s",
                               bio_data_dir(bio) ? "write" : "read",
                               bch2_blk_status_to_str(bio->bi_status)))
                clear_bit(ec_bio->idx, ec_bio->buf->valid);

        if (ptr_stale(ca, ptr)) {
                bch_err_ratelimited(ca->fs,
                                    "error %s stripe: stale pointer after io",
                                    bio_data_dir(bio) == READ ? "reading from" : "writing to");
                clear_bit(ec_bio->idx, ec_bio->buf->valid);
        }

        bio_put(&ec_bio->bio);
        percpu_ref_put(&ca->io_ref);
        closure_put(cl);
}

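/*
 * Read or write one block of a stripe; the buffer may be larger than a
 * single bio can map (BIO_MAX_VECS pages), so the IO is submitted as a
 * chain of bios, each holding a ref on the closure and the device:
 */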
static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
                        blk_opf_t opf, unsigned idx, struct closure *cl)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
        unsigned offset = 0, bytes = buf->size << 9;
        struct bch_extent_ptr *ptr = &v->ptrs[idx];
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
                ? BCH_DATA_user
                : BCH_DATA_parity;
        int rw = op_is_write(opf);

        if (ptr_stale(ca, ptr)) {
                bch_err_ratelimited(c,
                                    "error %s stripe: stale pointer",
                                    rw == READ ? "reading from" : "writing to");
                clear_bit(idx, buf->valid);
                return;
        }

        if (!bch2_dev_get_ioref(ca, rw)) {
                clear_bit(idx, buf->valid);
                return;
        }

        this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);

        while (offset < bytes) {
                unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
                                           DIV_ROUND_UP(bytes, PAGE_SIZE));
                unsigned b = min_t(size_t, bytes - offset,
                                   nr_iovecs << PAGE_SHIFT);
                struct ec_bio *ec_bio;

                ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
                                                       nr_iovecs,
                                                       opf,
                                                       GFP_KERNEL,
                                                       &c->ec_bioset),
                                      struct ec_bio, bio);

                ec_bio->ca                      = ca;
                ec_bio->buf                     = buf;
                ec_bio->idx                     = idx;

                ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
                ec_bio->bio.bi_end_io           = ec_block_endio;
                ec_bio->bio.bi_private          = cl;

                bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

                closure_get(cl);
                percpu_ref_get(&ca->io_ref);

                submit_bio(&ec_bio->bio);

                offset += b;
        }

        percpu_ref_put(&ca->io_ref);
}

static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
                                struct ec_stripe_buf *stripe)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
                               POS(0, idx), BTREE_ITER_SLOTS);
        ret = bkey_err(k);
        if (ret)
                goto err;
        if (k.k->type != KEY_TYPE_stripe) {
                ret = -ENOENT;
                goto err;
        }
        bkey_reassemble(&stripe->key, k);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

/* recovery read path: */
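/*
 * Reconstruct the data @rbio wanted: look up the stripe, read every
 * block covering the requested range, verify checksums, recover the
 * missing block with raid_rec() and copy the result back into the bio.
 */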
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
        struct bch_fs *c = trans->c;
        struct ec_stripe_buf *buf;
        struct closure cl;
        struct bch_stripe *v;
        unsigned i, offset;
        int ret = 0;

        closure_init_stack(&cl);

        BUG_ON(!rbio->pick.has_ec);

        buf = kzalloc(sizeof(*buf), GFP_NOFS);
        if (!buf)
                return -BCH_ERR_ENOMEM_ec_read_extent;

        ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
        if (ret) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: error %i looking up stripe", ret);
                kfree(buf);
                return -EIO;
        }

        v = &bkey_i_to_stripe(&buf->key)->v;

        if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: pointer doesn't match stripe");
                ret = -EIO;
                goto err;
        }

        offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
        if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: read is bigger than stripe");
                ret = -EIO;
                goto err;
        }

        ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
        if (ret)
                goto err;

        for (i = 0; i < v->nr_blocks; i++)
                ec_block_io(c, buf, REQ_OP_READ, i, &cl);

        closure_sync(&cl);

        if (ec_nr_failed(buf) > v->nr_redundant) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                ret = -EIO;
                goto err;
        }

        ec_validate_checksums(c, buf);

        ret = ec_do_recov(c, buf);
        if (ret)
                goto err;

        memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
                      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
err:
        ec_stripe_buf_exit(buf);
        kfree(buf);
        return ret;
}

/* stripe bucket accounting: */

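/*
 * Ensure the stripes heap and radix trees can accommodate stripe @idx.
 * A bigger heap is allocated without the lock held, then swapped in
 * under ec_stripes_heap_lock if it's still larger than the current one:
 */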
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
        ec_stripes_heap n, *h = &c->ec_stripes_heap;

        if (idx >= h->size) {
                if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
                        return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

                mutex_lock(&c->ec_stripes_heap_lock);
                if (n.size > h->size) {
                        memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
                        n.used = h->used;
                        swap(*h, n);
                }
                mutex_unlock(&c->ec_stripes_heap_lock);

                free_heap(&n);
        }

        if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
                return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

        if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
            !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
                return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

        return 0;
}

static int ec_stripe_mem_alloc(struct btree_trans *trans,
                               struct btree_iter *iter)
{
        return allocate_dropping_locks_errcode(trans,
                        __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
}

/*
 * Hash table of open stripes:
 * Stripes that are being created or modified are kept in a hash table, so that
 * stripe deletion can skip them.
 */

static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
        unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
        struct ec_stripe_new *s;

        hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
                if (s->idx == idx)
                        return true;
        return false;
}

static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
        bool ret = false;

        spin_lock(&c->ec_stripes_new_lock);
        ret = __bch2_stripe_is_open(c, idx);
        spin_unlock(&c->ec_stripes_new_lock);

        return ret;
}

static bool bch2_try_open_stripe(struct bch_fs *c,
                                 struct ec_stripe_new *s,
                                 u64 idx)
{
        bool ret;

        spin_lock(&c->ec_stripes_new_lock);
        ret = !__bch2_stripe_is_open(c, idx);
        if (ret) {
                unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));

                s->idx = idx;
                hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
        }
        spin_unlock(&c->ec_stripes_new_lock);

        return ret;
}

static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
{
        BUG_ON(!s->idx);

        spin_lock(&c->ec_stripes_new_lock);
        hlist_del_init(&s->hash);
        spin_unlock(&c->ec_stripes_new_lock);

        s->idx = 0;
}

/* Heap of all existing stripes, ordered by blocks_nonempty */

static u64 stripe_idx_to_delete(struct bch_fs *c)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;

        lockdep_assert_held(&c->ec_stripes_heap_lock);

        if (h->used &&
            h->data[0].blocks_nonempty == 0 &&
            !bch2_stripe_is_open(c, h->data[0].idx))
                return h->data[0].idx;

        return 0;
}

static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
                                      struct ec_stripe_heap_entry l,
                                      struct ec_stripe_heap_entry r)
{
        return ((l.blocks_nonempty > r.blocks_nonempty) -
                (l.blocks_nonempty < r.blocks_nonempty));
}

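/*
 * Each stripe caches its current position in the heap (stripe->heap_idx);
 * the heap operations call this callback to keep the backpointer in sync
 * as entries move, and heap_verify_backpointer() asserts the invariant:
 */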
static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
                                                   size_t i)
{
        struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

        genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m = genradix_ptr(&c->stripes, idx);

        BUG_ON(m->heap_idx >= h->used);
        BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_del(struct bch_fs *c,
                           struct stripe *m, size_t idx)
{
        mutex_lock(&c->ec_stripes_heap_lock);
        heap_verify_backpointer(c, idx);

        heap_del(&c->ec_stripes_heap, m->heap_idx,
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);
        mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
                              struct stripe *m, size_t idx)
{
        mutex_lock(&c->ec_stripes_heap_lock);
        BUG_ON(heap_full(&c->ec_stripes_heap));

        heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
                        .idx = idx,
                        .blocks_nonempty = m->blocks_nonempty,
                }),
                 ec_stripes_heap_cmp,
                 ec_stripes_heap_set_backpointer);

        heap_verify_backpointer(c, idx);
        mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_update(struct bch_fs *c,
                              struct stripe *m, size_t idx)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        bool do_deletes;
        size_t i;

        mutex_lock(&c->ec_stripes_heap_lock);
        heap_verify_backpointer(c, idx);

        h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

        i = m->heap_idx;
        heap_sift_up(h,   i, ec_stripes_heap_cmp,
                     ec_stripes_heap_set_backpointer);
        heap_sift_down(h, i, ec_stripes_heap_cmp,
                       ec_stripes_heap_set_backpointer);

        heap_verify_backpointer(c, idx);

        do_deletes = stripe_idx_to_delete(c) != 0;
        mutex_unlock(&c->ec_stripes_heap_lock);

        if (do_deletes)
                bch2_do_stripe_deletes(c);
}

/* stripe deletion */

static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_stripe s;
        int ret;

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
                               BTREE_ITER_INTENT);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (k.k->type != KEY_TYPE_stripe) {
                bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
                ret = -EINVAL;
                goto err;
        }

        s = bkey_s_c_to_stripe(k);
        for (unsigned i = 0; i < s.v->nr_blocks; i++)
                if (stripe_blockcount_get(s.v, i)) {
                        struct printbuf buf = PRINTBUF;

                        bch2_bkey_val_to_text(&buf, c, k);
                        bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
                        printbuf_exit(&buf);
                        ret = -EINVAL;
                        goto err;
                }

        ret = bch2_btree_delete_at(trans, &iter, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

static void ec_stripe_delete_work(struct work_struct *work)
{
        struct bch_fs *c =
                container_of(work, struct bch_fs, ec_stripe_delete_work);

        while (1) {
                mutex_lock(&c->ec_stripes_heap_lock);
                u64 idx = stripe_idx_to_delete(c);
                mutex_unlock(&c->ec_stripes_heap_lock);

                if (!idx)
                        break;

                int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                        ec_stripe_delete(trans, idx));
                bch_err_fn(c, ret);
                if (ret)
                        break;
        }

        bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

void bch2_do_stripe_deletes(struct bch_fs *c)
{
        if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
            !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

/* stripe creation: */

static int ec_stripe_key_update(struct btree_trans *trans,
                                struct bkey_i_stripe *new,
                                bool create)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
                               new->k.p, BTREE_ITER_INTENT);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
                bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
                                     create ? "creating" : "updating",
                                     bch2_bkey_types[k.k->type]);
                ret = -EINVAL;
                goto err;
        }

        if (k.k->type == KEY_TYPE_stripe) {
                const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
                unsigned i;

                if (old->nr_blocks != new->v.nr_blocks) {
                        bch_err(c, "error updating stripe: nr_blocks does not match");
                        ret = -EINVAL;
                        goto err;
                }

                for (i = 0; i < new->v.nr_blocks; i++) {
                        unsigned v = stripe_blockcount_get(old, i);

                        BUG_ON(v &&
                               (old->ptrs[i].dev != new->v.ptrs[i].dev ||
                                old->ptrs[i].gen != new->v.ptrs[i].gen ||
                                old->ptrs[i].offset != new->v.ptrs[i].offset));

                        stripe_blockcount_set(&new->v, i, v);
                }
        }

        ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

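/*
 * Walk one backpointer of a stripe bucket and update the extent it
 * points at: pointers to other devices are dropped (presumably because
 * redundancy now comes from the stripe) and a stripe_ptr entry is
 * spliced in next to the remaining pointer:
 */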
static int ec_stripe_update_extent(struct btree_trans *trans,
                                   struct bpos bucket, u8 gen,
                                   struct ec_stripe_buf *s,
                                   struct bpos *bp_pos)
{
        struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
        struct bch_fs *c = trans->c;
        struct bch_backpointer bp;
        struct btree_iter iter;
        struct bkey_s_c k;
        const struct bch_extent_ptr *ptr_c;
        struct bch_extent_ptr *ptr, *ec_ptr = NULL;
        struct bch_extent_stripe_ptr stripe_ptr;
        struct bkey_i *n;
        int ret, dev, block;

        ret = bch2_get_next_backpointer(trans, bucket, gen,
                                bp_pos, &bp, BTREE_ITER_CACHED);
        if (ret)
                return ret;
        if (bpos_eq(*bp_pos, SPOS_MAX))
                return 0;

        if (bp.level) {
                struct printbuf buf = PRINTBUF;
                struct btree_iter node_iter;
                struct btree *b;

                b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
                bch2_trans_iter_exit(trans, &node_iter);

                if (!b)
                        return 0;

                prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
                bch2_backpointer_to_text(&buf, &bp);

                bch2_fs_inconsistent(c, "%s", buf.buf);
                printbuf_exit(&buf);
                return -EIO;
        }

        k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
        ret = bkey_err(k);
        if (ret)
                return ret;
        if (!k.k) {
                /*
                 * extent no longer exists - we could flush the btree
                 * write buffer and retry to verify, but no need:
                 */
                return 0;
        }

        if (extent_has_stripe_ptr(k, s->key.k.p.offset))
                goto out;

        ptr_c = bkey_matches_stripe(v, k, &block);
        /*
         * It doesn't generally make sense to erasure code cached ptrs:
         * XXX: should we be incrementing a counter?
         */
        if (!ptr_c || ptr_c->cached)
                goto out;

        dev = v->ptrs[block].dev;

        n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
        ret = PTR_ERR_OR_ZERO(n);
        if (ret)
                goto out;

        bkey_reassemble(n, k);

        bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
        ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
        BUG_ON(!ec_ptr);

        stripe_ptr = (struct bch_extent_stripe_ptr) {
                .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
                .block          = block,
                .redundancy     = v->nr_redundant,
                .idx            = s->key.k.p.offset,
        };

        __extent_entry_insert(n,
                        (union bch_extent_entry *) ec_ptr,
                        (union bch_extent_entry *) &stripe_ptr);

        ret = bch2_trans_update(trans, &iter, n, 0);
out:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}

static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
                                   unsigned block)
{
        struct bch_fs *c = trans->c;
        struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
        struct bch_extent_ptr bucket = v->ptrs[block];
        struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
        struct bpos bp_pos = POS_MIN;
        int ret = 0;

        while (1) {
                ret = commit_do(trans, NULL, NULL,
                                BCH_TRANS_COMMIT_no_check_rw|
                                BCH_TRANS_COMMIT_no_enospc,
                        ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
                                                s, &bp_pos));
                if (ret)
                        break;
                if (bkey_eq(bp_pos, POS_MAX))
                        break;

                bp_pos = bpos_nosnap_successor(bp_pos);
        }

        return ret;
}

static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
        struct btree_trans *trans = bch2_trans_get(c);
        struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        int ret = 0;

        ret = bch2_btree_write_buffer_flush_sync(trans);
        if (ret)
                goto err;

        for (i = 0; i < nr_data; i++) {
                ret = ec_stripe_update_bucket(trans, s, i);
                if (ret)
                        break;
        }
err:
        bch2_trans_put(trans);

        return ret;
}

static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
                                       struct ec_stripe_new *s,
                                       unsigned block,
                                       struct open_bucket *ob)
{
        struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
        unsigned offset = ca->mi.bucket_size - ob->sectors_free;
        int ret;

        if (!bch2_dev_get_ioref(ca, WRITE)) {
                s->err = -BCH_ERR_erofs_no_writes;
                return;
        }

        memset(s->new_stripe.data[block] + (offset << 9),
               0,
               ob->sectors_free << 9);

        ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
                        ob->bucket * ca->mi.bucket_size + offset,
                        ob->sectors_free,
                        GFP_KERNEL, 0);

        percpu_ref_put(&ca->io_ref);

        if (ret)
                s->err = ret;
}

void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
{
        if (s->idx)
                bch2_stripe_close(c, s);
        kfree(s);
}

/*
 * data buckets of new stripe all written: create the stripe
 */
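/*
 * Runs from the stripe create workqueue once all data buckets have been
 * written: zero out any unwritten space in the data buckets, recover and
 * splice in blocks kept from an existing stripe, generate parity and
 * checksums, write out P/Q, persist the stripe key, then rewrite the
 * extents it covers to point at the stripe.
 */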
static void ec_stripe_create(struct ec_stripe_new *s)
{
        struct bch_fs *c = s->c;
        struct open_bucket *ob;
        struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        int ret;

        BUG_ON(s->h->s == s);

        closure_sync(&s->iodone);

        if (!s->err) {
                for (i = 0; i < nr_data; i++)
                        if (s->blocks[i]) {
                                ob = c->open_buckets + s->blocks[i];

                                if (ob->sectors_free)
                                        zero_out_rest_of_ec_bucket(c, s, i, ob);
                        }
        }

        if (s->err) {
                if (!bch2_err_matches(s->err, EROFS))
                        bch_err(c, "error creating stripe: error writing data buckets");
                goto err;
        }

        if (s->have_existing_stripe) {
                ec_validate_checksums(c, &s->existing_stripe);

                if (ec_do_recov(c, &s->existing_stripe)) {
                        bch_err(c, "error creating stripe: error reading existing stripe");
                        goto err;
                }

                for (i = 0; i < nr_data; i++)
                        if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
                                swap(s->new_stripe.data[i],
                                     s->existing_stripe.data[i]);

                ec_stripe_buf_exit(&s->existing_stripe);
        }

        BUG_ON(!s->allocated);
        BUG_ON(!s->idx);

        ec_generate_ec(&s->new_stripe);

        ec_generate_checksums(&s->new_stripe);

        /* write p/q: */
        for (i = nr_data; i < v->nr_blocks; i++)
                ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
        closure_sync(&s->iodone);

        if (ec_nr_failed(&s->new_stripe)) {
                bch_err(c, "error creating stripe: error writing redundancy buckets");
                goto err;
        }

        ret = bch2_trans_do(c, &s->res, NULL,
                            BCH_TRANS_COMMIT_no_check_rw|
                            BCH_TRANS_COMMIT_no_enospc,
                            ec_stripe_key_update(trans,
                                        bkey_i_to_stripe(&s->new_stripe.key),
                                        !s->have_existing_stripe));
        bch_err_msg(c, ret, "creating stripe key");
        if (ret)
                goto err;

        ret = ec_stripe_update_extents(c, &s->new_stripe);
        bch_err_msg(c, ret, "error updating extents");
        if (ret)
                goto err;
err:
        bch2_disk_reservation_put(c, &s->res);

        for (i = 0; i < v->nr_blocks; i++)
                if (s->blocks[i]) {
                        ob = c->open_buckets + s->blocks[i];

                        if (i < nr_data) {
                                ob->ec = NULL;
                                __bch2_open_bucket_put(c, ob);
                        } else {
                                bch2_open_bucket_put(c, ob);
                        }
                }

        mutex_lock(&c->ec_stripe_new_lock);
        list_del(&s->list);
        mutex_unlock(&c->ec_stripe_new_lock);
        wake_up(&c->ec_stripe_new_wait);

        ec_stripe_buf_exit(&s->existing_stripe);
        ec_stripe_buf_exit(&s->new_stripe);
        closure_debug_destroy(&s->iodone);

        ec_stripe_new_put(c, s, STRIPE_REF_stripe);
}

static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
{
        struct ec_stripe_new *s;

        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry(s, &c->ec_stripe_new_list, list)
                if (!atomic_read(&s->ref[STRIPE_REF_io]))
                        goto out;
        s = NULL;
out:
        mutex_unlock(&c->ec_stripe_new_lock);

        return s;
}

static void ec_stripe_create_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(work,
                struct bch_fs, ec_stripe_create_work);
        struct ec_stripe_new *s;

        while ((s = get_pending_stripe(c)))
                ec_stripe_create(s);

        bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
        bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);

        if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct ec_stripe_new *s = h->s;

        BUG_ON(!s->allocated && !s->err);

        h->s            = NULL;
        s->pending      = true;

        mutex_lock(&c->ec_stripe_new_lock);
        list_add(&s->list, &c->ec_stripe_new_list);
        mutex_unlock(&c->ec_stripe_new_lock);

        ec_stripe_new_put(c, s, STRIPE_REF_io);
}

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
        struct ec_stripe_new *s = ob->ec;

        s->err = -EIO;
}

void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
        struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
        struct bch_dev *ca;
        unsigned offset;

        if (!ob)
                return NULL;

        BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);

        ca      = bch_dev_bkey_exists(c, ob->dev);
        offset  = ca->mi.bucket_size - ob->sectors_free;

        return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}

static int unsigned_cmp(const void *_l, const void *_r)
{
        unsigned l = *((const unsigned *) _l);
        unsigned r = *((const unsigned *) _r);

        return cmp_int(l, r);
}

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
                               struct bch_devs_mask *devs)
{
        unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
        struct {
                unsigned nr, size;
        } cur = { 0, 0 }, best = { 0, 0 };

        for_each_member_device_rcu(c, ca, devs)
                sizes[nr++] = ca->mi.bucket_size;

        sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

        for (unsigned i = 0; i < nr; i++) {
                if (sizes[i] != cur.size) {
                        if (cur.nr > best.nr)
                                best = cur;

                        cur.nr = 0;
                        cur.size = sizes[i];
                }

                cur.nr++;
        }

        if (cur.nr > best.nr)
                best = cur;

        return best.size;
}

static bool may_create_new_stripe(struct bch_fs *c)
{
        return false;
}

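/*
 * Initialize a new stripe key; if the value (mostly the per-block
 * checksum arrays) would be too big to fit in a bkey, the checksum
 * granularity is walked up until it fits:
 */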
static void ec_stripe_key_init(struct bch_fs *c,
                               struct bkey_i *k,
                               unsigned nr_data,
                               unsigned nr_parity,
                               unsigned stripe_size)
{
        struct bkey_i_stripe *s = bkey_stripe_init(k);
        unsigned u64s;

        s->v.sectors                    = cpu_to_le16(stripe_size);
        s->v.algorithm                  = 0;
        s->v.nr_blocks                  = nr_data + nr_parity;
        s->v.nr_redundant               = nr_parity;
        s->v.csum_granularity_bits      = ilog2(c->opts.encoded_extent_max >> 9);
        s->v.csum_type                  = BCH_CSUM_crc32c;
        s->v.pad                        = 0;

        while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
                BUG_ON(1 << s->v.csum_granularity_bits >=
                       le16_to_cpu(s->v.sectors) ||
                       s->v.csum_granularity_bits == U8_MAX);
                s->v.csum_granularity_bits++;
        }

        set_bkey_val_u64s(&s->k, u64s);
}

static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct ec_stripe_new *s;

        lockdep_assert_held(&h->lock);

        s = kzalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;

        mutex_init(&s->lock);
        closure_init(&s->iodone, NULL);
        atomic_set(&s->ref[STRIPE_REF_stripe], 1);
        atomic_set(&s->ref[STRIPE_REF_io], 1);
        s->c            = c;
        s->h            = h;
        s->nr_data      = min_t(unsigned, h->nr_active_devs,
                                BCH_BKEY_PTRS_MAX) - h->redundancy;
        s->nr_parity    = h->redundancy;

        ec_stripe_key_init(c, &s->new_stripe.key,
                           s->nr_data, s->nr_parity, h->blocksize);

        h->s = s;
        return 0;
}

static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
                         unsigned algo, unsigned redundancy,
                         enum bch_watermark watermark)
{
        struct ec_stripe_head *h;

        h = kzalloc(sizeof(*h), GFP_KERNEL);
        if (!h)
                return NULL;

        mutex_init(&h->lock);
        BUG_ON(!mutex_trylock(&h->lock));

        h->target       = target;
        h->algo         = algo;
        h->redundancy   = redundancy;
        h->watermark    = watermark;

        rcu_read_lock();
        h->devs = target_rw_devs(c, BCH_DATA_user, target);

        for_each_member_device_rcu(c, ca, &h->devs)
                if (!ca->mi.durability)
                        __clear_bit(ca->dev_idx, h->devs.d);

        h->blocksize = pick_blocksize(c, &h->devs);

        for_each_member_device_rcu(c, ca, &h->devs)
                if (ca->mi.bucket_size == h->blocksize)
                        h->nr_active_devs++;

        rcu_read_unlock();

        /*
         * If we only have redundancy + 1 devices, we're better off with just
         * replication:
         */
        if (h->nr_active_devs < h->redundancy + 2)
                bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
                        h->nr_active_devs, h->redundancy + 2);

        list_add(&h->list, &c->ec_stripe_head_list);
        return h;
}

void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
        if (h->s &&
            h->s->allocated &&
            bitmap_weight(h->s->blocks_allocated,
                          h->s->nr_data) == h->s->nr_data)
                ec_stripe_set_pending(c, h);

        mutex_unlock(&h->lock);
}

static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
                          unsigned target,
                          unsigned algo,
                          unsigned redundancy,
                          enum bch_watermark watermark)
{
        struct bch_fs *c = trans->c;
        struct ec_stripe_head *h;
        int ret;

        if (!redundancy)
                return NULL;

        ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
        if (ret)
                return ERR_PTR(ret);

        if (test_bit(BCH_FS_going_ro, &c->flags)) {
                h = ERR_PTR(-BCH_ERR_erofs_no_writes);
                goto found;
        }

        list_for_each_entry(h, &c->ec_stripe_head_list, list)
                if (h->target           == target &&
                    h->algo             == algo &&
                    h->redundancy       == redundancy &&
                    h->watermark        == watermark) {
                        ret = bch2_trans_mutex_lock(trans, &h->lock);
                        if (ret)
                                h = ERR_PTR(ret);
                        goto found;
                }

        h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
        if (!IS_ERR_OR_NULL(h) &&
            h->nr_active_devs < h->redundancy + 2) {
                mutex_unlock(&h->lock);
                h = NULL;
        }
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
}

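/*
 * Allocate the remaining buckets for a new stripe, skipping devices that
 * already back one of its blocks: parity buckets are allocated first,
 * then data buckets, both at the given watermark:
 */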
static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
                                    enum bch_watermark watermark, struct closure *cl)
{
        struct bch_fs *c = trans->c;
        struct bch_devs_mask devs = h->devs;
        struct open_bucket *ob;
        struct open_buckets buckets;
        struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
        unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
        bool have_cache = true;
        int ret = 0;

        BUG_ON(v->nr_blocks     != h->s->nr_data + h->s->nr_parity);
        BUG_ON(v->nr_redundant  != h->s->nr_parity);

        for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
                __clear_bit(v->ptrs[i].dev, devs.d);
                if (i < h->s->nr_data)
                        nr_have_data++;
                else
                        nr_have_parity++;
        }

        BUG_ON(nr_have_data     > h->s->nr_data);
        BUG_ON(nr_have_parity   > h->s->nr_parity);

        buckets.nr = 0;
        if (nr_have_parity < h->s->nr_parity) {
                ret = bch2_bucket_alloc_set_trans(trans, &buckets,
                                            &h->parity_stripe,
                                            &devs,
                                            h->s->nr_parity,
                                            &nr_have_parity,
                                            &have_cache, 0,
                                            BCH_DATA_parity,
                                            watermark,
                                            cl);

                open_bucket_for_each(c, &buckets, ob, i) {
                        j = find_next_zero_bit(h->s->blocks_gotten,
                                               h->s->nr_data + h->s->nr_parity,
                                               h->s->nr_data);
                        BUG_ON(j >= h->s->nr_data + h->s->nr_parity);

                        h->s->blocks[j] = buckets.v[i];
                        v->ptrs[j] = bch2_ob_ptr(c, ob);
                        __set_bit(j, h->s->blocks_gotten);
                }

                if (ret)
                        return ret;
        }

        buckets.nr = 0;
        if (nr_have_data < h->s->nr_data) {
                ret = bch2_bucket_alloc_set_trans(trans, &buckets,
                                            &h->block_stripe,
                                            &devs,
                                            h->s->nr_data,
                                            &nr_have_data,
                                            &have_cache, 0,
                                            BCH_DATA_user,
                                            watermark,
                                            cl);

                open_bucket_for_each(c, &buckets, ob, i) {
                        j = find_next_zero_bit(h->s->blocks_gotten,
                                               h->s->nr_data, 0);
                        BUG_ON(j >= h->s->nr_data);

                        h->s->blocks[j] = buckets.v[i];
                        v->ptrs[j] = bch2_ob_ptr(c, ob);
                        __set_bit(j, h->s->blocks_gotten);
                }

                if (ret)
                        return ret;
        }

        return 0;
}

/* XXX: doesn't obey target: */
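/*
 * Scan the stripes heap for a partially empty stripe whose geometry
 * matches @head and try to open it for reuse; returns the stripe index,
 * or -1 if there's nothing suitable:
 */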
1517 static s64 get_existing_stripe(struct bch_fs *c,
1518                                struct ec_stripe_head *head)
1519 {
1520         ec_stripes_heap *h = &c->ec_stripes_heap;
1521         struct stripe *m;
1522         size_t heap_idx;
1523         u64 stripe_idx;
1524         s64 ret = -1;
1525
1526         if (may_create_new_stripe(c))
1527                 return -1;
1528
1529         mutex_lock(&c->ec_stripes_heap_lock);
1530         for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
1531                 /* No blocks worth reusing, stripe will just be deleted: */
1532                 if (!h->data[heap_idx].blocks_nonempty)
1533                         continue;
1534
1535                 stripe_idx = h->data[heap_idx].idx;
1536
1537                 m = genradix_ptr(&c->stripes, stripe_idx);
1538
1539                 if (m->algorithm        == head->algo &&
1540                     m->nr_redundant     == head->redundancy &&
1541                     m->sectors          == head->blocksize &&
1542                     m->blocks_nonempty  < m->nr_blocks - m->nr_redundant &&
1543                     bch2_try_open_stripe(c, head->s, stripe_idx)) {
1544                         ret = stripe_idx;
1545                         break;
1546                 }
1547         }
1548         mutex_unlock(&c->ec_stripes_heap_lock);
1549         return ret;
1550 }
1551
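/*
 * Switch h->s to filling an existing stripe: read the existing stripe in,
 * mark the blocks that already have data, and drop the buckets we'd
 * initially allocated, since they might conflict with blocks we're
 * inheriting:
 */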
1552 static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
1553 {
1554         struct bch_fs *c = trans->c;
1555         struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
1556         struct bch_stripe *existing_v;
1557         unsigned i;
1558         s64 idx;
1559         int ret;
1560
1561         /*
1562          * If we can't allocate a new stripe, and there are no stripes with
1563          * empty blocks for us to reuse, that means we have to wait on copygc:
1564          */
1565         idx = get_existing_stripe(c, h);
1566         if (idx < 0)
1567                 return -BCH_ERR_stripe_alloc_blocked;
1568
1569         ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
1570         if (ret) {
1571                 bch2_stripe_close(c, h->s);
1572                 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
1573                         bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
1574                 return ret;
1575         }
1576
1577         existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
1578
1579         BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
1580         h->s->nr_data = existing_v->nr_blocks -
1581                 existing_v->nr_redundant;
1582
1583         ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
1584         if (ret) {
1585                 bch2_stripe_close(c, h->s);
1586                 return ret;
1587         }
1588
1589         BUG_ON(h->s->existing_stripe.size != h->blocksize);
1590         BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
1591
1592         /*
1593          * Free buckets we initially allocated - they might conflict with
1594          * blocks from the stripe we're reusing:
1595          */
1596         for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
1597                 bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
1598                 h->s->blocks[i] = 0;
1599         }
1600         memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
1601         memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
1602
1603         for (i = 0; i < existing_v->nr_blocks; i++) {
1604                 if (stripe_blockcount_get(existing_v, i)) {
1605                         __set_bit(i, h->s->blocks_gotten);
1606                         __set_bit(i, h->s->blocks_allocated);
1607                 }
1608
1609                 ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
1610         }
1611
1612         bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
1613         h->s->have_existing_stripe = true;
1614
1615         return 0;
1616 }
1617
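/*
 * Reserve space for the new stripe's parity blocks, then scan the stripes
 * btree - starting from c->ec_stripe_hint and wrapping around once - for an
 * empty slot for the new stripe key:
 */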
1618 static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
1619 {
1620         struct bch_fs *c = trans->c;
1621         struct btree_iter iter;
1622         struct bkey_s_c k;
1623         struct bpos min_pos = POS(0, 1);
1624         struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
1625         int ret;
1626
1627         if (!h->s->res.sectors) {
1628                 ret = bch2_disk_reservation_get(c, &h->s->res,
1629                                         h->blocksize,
1630                                         h->s->nr_parity,
1631                                         BCH_DISK_RESERVATION_NOFAIL);
1632                 if (ret)
1633                         return ret;
1634         }
1635
1636         for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
1637                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
1638                 if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
1639                         if (start_pos.offset) {
1640                                 start_pos = min_pos;
1641                                 bch2_btree_iter_set_pos(&iter, start_pos);
1642                                 continue;
1643                         }
1644
1645                         ret = -BCH_ERR_ENOSPC_stripe_create;
1646                         break;
1647                 }
1648
1649                 if (bkey_deleted(k.k) &&
1650                     bch2_try_open_stripe(c, h->s, k.k->p.offset))
1651                         break;
1652         }
1653
1654         c->ec_stripe_hint = iter.pos.offset;
1655
1656         if (ret)
1657                 goto err;
1658
1659         ret = ec_stripe_mem_alloc(trans, &iter);
1660         if (ret) {
1661                 bch2_stripe_close(c, h->s);
1662                 goto err;
1663         }
1664
1665         h->s->new_stripe.key.k.p = iter.pos;
1666 out:
1667         bch2_trans_iter_exit(trans, &iter);
1668         return ret;
1669 err:
1670         bch2_disk_reservation_put(c, &h->s->res);
1671         goto out;
1672 }
1673
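/*
 * Return a stripe head that writes can be allocated from: if the head
 * doesn't already have a stripe being filled, allocate a new stripe or, if
 * that fails, reuse an existing stripe with empty blocks. Returns NULL or an
 * ERR_PTR() on failure:
 */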
1674 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
1675                                                unsigned target,
1676                                                unsigned algo,
1677                                                unsigned redundancy,
1678                                                enum bch_watermark watermark,
1679                                                struct closure *cl)
1680 {
1681         struct bch_fs *c = trans->c;
1682         struct ec_stripe_head *h;
1683         bool waiting = false;
1684         int ret;
1685
1686         h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
1687         if (IS_ERR_OR_NULL(h))
1688                 return h;
1689
1690         if (!h->s) {
1691                 ret = ec_new_stripe_alloc(c, h);
1692                 if (ret) {
1693                         bch_err(c, "failed to allocate new stripe");
1694                         goto err;
1695                 }
1696         }
1697
1698         if (h->s->allocated)
1699                 goto allocated;
1700
1701         if (h->s->have_existing_stripe)
1702                 goto alloc_existing;
1703
1704         /* First, try to allocate a full stripe: */
1705         ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
1706                 __bch2_ec_stripe_head_reserve(trans, h);
1707         if (!ret)
1708                 goto allocate_buf;
1709         if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
1710             bch2_err_matches(ret, ENOMEM))
1711                 goto err;
1712
1713         /*
1714          * Not enough buckets available for a full stripe: we must reuse an
1715          * existing stripe:
1716          */
1717         while (1) {
1718                 ret = __bch2_ec_stripe_head_reuse(trans, h);
1719                 if (!ret)
1720                         break;
1721                 if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
1722                         goto err;
1723
1724                 if (watermark == BCH_WATERMARK_copygc) {
1725                         ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
1726                                 __bch2_ec_stripe_head_reserve(trans, h);
1727                         if (ret)
1728                                 goto err;
1729                         goto allocate_buf;
1730                 }
1731
1732                 /* XXX freelist_wait? */
1733                 closure_wait(&c->freelist_wait, cl);
1734                 waiting = true;
1735         }
1736
1737         if (waiting)
1738                 closure_wake_up(&c->freelist_wait);
1739 alloc_existing:
1740         /*
1741          * Retry allocating buckets, with the watermark for this
1742          * particular write:
1743          */
1744         ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
1745         if (ret)
1746                 goto err;
1747
1748 allocate_buf:
1749         ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
1750         if (ret)
1751                 goto err;
1752
1753         h->s->allocated = true;
1754 allocated:
1755         BUG_ON(!h->s->idx);
1756         BUG_ON(!h->s->new_stripe.data[0]);
1757         BUG_ON(trans->restarted);
1758         return h;
1759 err:
1760         bch2_ec_stripe_head_put(c, h);
1761         return ERR_PTR(ret);
1762 }
1763
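/*
 * Stop in-progress writes to stripes with blocks on @ca (or to all stripes,
 * if @ca is NULL): flag the stripe with -BCH_ERR_erofs_no_writes and hand it
 * off via ec_stripe_set_pending():
 */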
1764 static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
1765 {
1766         struct ec_stripe_head *h;
1767         struct open_bucket *ob;
1768         unsigned i;
1769
1770         mutex_lock(&c->ec_stripe_head_lock);
1771         list_for_each_entry(h, &c->ec_stripe_head_list, list) {
1772                 mutex_lock(&h->lock);
1773                 if (!h->s)
1774                         goto unlock;
1775
1776                 if (!ca)
1777                         goto found;
1778
1779                 for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
1780                         if (!h->s->blocks[i])
1781                                 continue;
1782
1783                         ob = c->open_buckets + h->s->blocks[i];
1784                         if (ob->dev == ca->dev_idx)
1785                                 goto found;
1786                 }
1787                 goto unlock;
1788 found:
1789                 h->s->err = -BCH_ERR_erofs_no_writes;
1790                 ec_stripe_set_pending(c, h);
1791 unlock:
1792                 mutex_unlock(&h->lock);
1793         }
1794         mutex_unlock(&c->ec_stripe_head_lock);
1795 }
1796
1797 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
1798 {
1799         __bch2_ec_stop(c, ca);
1800 }
1801
1802 void bch2_fs_ec_stop(struct bch_fs *c)
1803 {
1804         __bch2_ec_stop(c, NULL);
1805 }
1806
1807 static bool bch2_fs_ec_flush_done(struct bch_fs *c)
1808 {
1809         bool ret;
1810
1811         mutex_lock(&c->ec_stripe_new_lock);
1812         ret = list_empty(&c->ec_stripe_new_list);
1813         mutex_unlock(&c->ec_stripe_new_lock);
1814
1815         return ret;
1816 }
1817
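/* Wait for in-flight stripe creates to complete: */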
1818 void bch2_fs_ec_flush(struct bch_fs *c)
1819 {
1820         wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
1821 }
1822
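/*
 * Walk the stripes btree and rebuild the in-memory stripe state - per-stripe
 * block counts and the stripes heap:
 */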
1823 int bch2_stripes_read(struct bch_fs *c)
1824 {
1825         int ret = bch2_trans_run(c,
1826                 for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
1827                                    BTREE_ITER_PREFETCH, k, ({
1828                         if (k.k->type != KEY_TYPE_stripe)
1829                                 continue;
1830
1831                         ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
1832                         if (ret)
1833                                 break;
1834
1835                         const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
1836
1837                         struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
1838                         m->sectors      = le16_to_cpu(s->sectors);
1839                         m->algorithm    = s->algorithm;
1840                         m->nr_blocks    = s->nr_blocks;
1841                         m->nr_redundant = s->nr_redundant;
1842                         m->blocks_nonempty = 0;
1843
1844                         for (unsigned i = 0; i < s->nr_blocks; i++)
1845                                 m->blocks_nonempty += !!stripe_blockcount_get(s, i);
1846
1847                         bch2_stripes_heap_insert(c, m, k.k->p.offset);
1848                         0; /* the ({ }) block must evaluate to 0 to keep iterating */
1849                 })));
1850         bch_err_fn(c, ret);
1851         return ret;
1852 }
1853
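/* Debug output: the first 50 entries of the stripes heap: */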
1854 void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
1855 {
1856         ec_stripes_heap *h = &c->ec_stripes_heap;
1857         struct stripe *m;
1858         size_t i;
1859
1860         mutex_lock(&c->ec_stripes_heap_lock);
1861         for (i = 0; i < min_t(size_t, h->used, 50); i++) {
1862                 m = genradix_ptr(&c->stripes, h->data[i].idx);
1863
1864                 prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
1865                        h->data[i].blocks_nonempty,
1866                        m->nr_blocks - m->nr_redundant,
1867                        m->nr_redundant);
1868                 if (bch2_stripe_is_open(c, h->data[i].idx))
1869                         prt_str(out, " open");
1870                 prt_newline(out);
1871         }
1872         mutex_unlock(&c->ec_stripes_heap_lock);
1873 }
1874
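/* Debug output: stripe heads, and new stripes currently being created: */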
1875 void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
1876 {
1877         struct ec_stripe_head *h;
1878         struct ec_stripe_new *s;
1879
1880         mutex_lock(&c->ec_stripe_head_lock);
1881         list_for_each_entry(h, &c->ec_stripe_head_list, list) {
1882                 prt_printf(out, "target %u algo %u redundancy %u %s:\n",
1883                        h->target, h->algo, h->redundancy,
1884                        bch2_watermarks[h->watermark]);
1885
1886                 if (h->s)
1887                         prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
1888                                h->s->idx, h->s->nr_data, h->s->nr_parity,
1889                                bitmap_weight(h->s->blocks_allocated,
1890                                              h->s->nr_data));
1891         }
1892         mutex_unlock(&c->ec_stripe_head_lock);
1893
1894         prt_printf(out, "in flight:\n");
1895
1896         mutex_lock(&c->ec_stripe_new_lock);
1897         list_for_each_entry(s, &c->ec_stripe_new_list, list) {
1898                 prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
1899                            s->idx, s->nr_data, s->nr_parity,
1900                            atomic_read(&s->ref[STRIPE_REF_io]),
1901                            atomic_read(&s->ref[STRIPE_REF_stripe]),
1902                            bch2_watermarks[s->h->watermark]);
1903         }
1904         mutex_unlock(&c->ec_stripe_new_lock);
1905 }
1906
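/*
 * Tear down stripe heads and free in-memory erasure coding state at
 * filesystem shutdown:
 */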
1907 void bch2_fs_ec_exit(struct bch_fs *c)
1908 {
1909         struct ec_stripe_head *h;
1910         unsigned i;
1911
1912         while (1) {
1913                 mutex_lock(&c->ec_stripe_head_lock);
1914                 h = list_first_entry_or_null(&c->ec_stripe_head_list,
1915                                              struct ec_stripe_head, list);
1916                 if (h)
1917                         list_del(&h->list);
1918                 mutex_unlock(&c->ec_stripe_head_lock);
1919                 if (!h)
1920                         break;
1921
1922                 if (h->s) {
1923                         for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
1924                                 BUG_ON(h->s->blocks[i]);
1925
1926                         kfree(h->s);
1927                 }
1928                 kfree(h);
1929         }
1930
1931         BUG_ON(!list_empty(&c->ec_stripe_new_list));
1932
1933         free_heap(&c->ec_stripes_heap);
1934         genradix_free(&c->stripes);
1935         bioset_exit(&c->ec_bioset);
1936 }
1937
1938 void bch2_fs_ec_init_early(struct bch_fs *c)
1939 {
1940         spin_lock_init(&c->ec_stripes_new_lock);
1941         mutex_init(&c->ec_stripes_heap_lock);
1942
1943         INIT_LIST_HEAD(&c->ec_stripe_head_list);
1944         mutex_init(&c->ec_stripe_head_lock);
1945
1946         INIT_LIST_HEAD(&c->ec_stripe_new_list);
1947         mutex_init(&c->ec_stripe_new_lock);
1948         init_waitqueue_head(&c->ec_stripe_new_wait);
1949
1950         INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
1951         INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
1952 }
1953
1954 int bch2_fs_ec_init(struct bch_fs *c)
1955 {
1956         return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
1957                            BIOSET_NEED_BVECS);
1958 }