1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "alloc_foreground.h"
11 #include "bset.h"
12 #include "btree_update.h"
13 #include "buckets.h"
14 #include "checksum.h"
15 #include "compress.h"
16 #include "clock.h"
17 #include "debug.h"
18 #include "disk_groups.h"
19 #include "ec.h"
20 #include "error.h"
21 #include "extents.h"
22 #include "io.h"
23 #include "journal.h"
24 #include "keylist.h"
25 #include "move.h"
26 #include "rebalance.h"
27 #include "super.h"
28 #include "super-io.h"
29
30 #include <linux/blkdev.h>
31 #include <linux/random.h>
32
33 #include <trace/events/bcachefs.h>
34
35 static bool bch2_target_congested(struct bch_fs *c, u16 target)
36 {
37         const struct bch_devs_mask *devs;
38         unsigned d, nr = 0, total = 0;
39         u64 now = local_clock(), last;
40         s64 congested;
41         struct bch_dev *ca;
42
43         if (!target)
44                 return false;
45
46         rcu_read_lock();
47         devs = bch2_target_to_mask(c, target);
48         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
49                 ca = rcu_dereference(c->devs[d]);
50                 if (!ca)
51                         continue;
52
53                 congested = atomic_read(&ca->congested);
54                 last = READ_ONCE(ca->congested_last);
55                 if (time_after64(now, last))
56                         congested -= (now - last) >> 12;
57
58                 total += max(congested, 0LL);
59                 nr++;
60         }
61         rcu_read_unlock();
62
63         return bch2_rand_range(nr * CONGESTED_MAX) < total;
64 }
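/*
 * Reading the check above: each device's congestion counter decays by one
 * for every ~4us (2^12 ns) since it was last bumped, and the target is
 * reported congested probabilistically - with probability
 * total / (nr * CONGESTED_MAX). E.g. a single-device target sitting at
 * CONGESTED_MAX / 2 will report congested for roughly half the calls.
 */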
65
66 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
67                                        u64 now, int rw)
68 {
69         u64 latency_capable =
70                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
71         /* ideally we'd be taking into account the device's variance here: */
72         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
73         s64 latency_over = io_latency - latency_threshold;
74
75         if (latency_threshold && latency_over > 0) {
76                 /*
77                  * bump up congested by approximately latency_over * 4 /
78                  * latency_threshold - we don't need much accuracy here so don't
79                  * bother with the divide:
80                  */
81                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
82                         atomic_add(latency_over >>
83                                    max_t(int, ilog2(latency_threshold) - 2, 0),
84                                    &ca->congested);
85
86                 ca->congested_last = now;
87         } else if (atomic_read(&ca->congested) > 0) {
88                 atomic_dec(&ca->congested);
89         }
90 }
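/*
 * The shift above stands in for "latency_over * 4 / latency_threshold":
 * dividing by 2^(ilog2(latency_threshold) - 2) rounds the threshold down to
 * a power of two, so the increment is within a factor of two of the exact
 * value (it can overestimate by up to 2x) - close enough for a congestion
 * heuristic. All times here are in nanoseconds, as returned by
 * local_clock().
 */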
91
92 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
93 {
94         atomic64_t *latency = &ca->cur_latency[rw];
95         u64 now = local_clock();
96         u64 io_latency = time_after64(now, submit_time)
97                 ? now - submit_time
98                 : 0;
99         u64 old, new, v = atomic64_read(latency);
100
101         do {
102                 old = v;
103
104                 /*
105                  * If the io latency was reasonably close to the current
106                  * latency, skip the update (and the atomic operation) most
107                  * of the time:
108                  */
109                 if (abs((int) (old - io_latency)) < (old >> 1) &&
110                     now & ~(~0 << 5))
111                         break;
112
113                 new = ewma_add(old, io_latency, 5);
114         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
115
116         bch2_congested_acct(ca, io_latency, now, rw);
117
118         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
119 }
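/*
 * The latency estimate is an EWMA with weight 2^-5, i.e. each new sample
 * contributes roughly 1/32 of its value. Samples within 50% of the current
 * estimate only trigger an update when the low 5 bits of the clock happen
 * to be zero (about 1 time in 32), which keeps the cmpxchg off the fast
 * path for steady-state IO.
 */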
120
121 /* Allocate, free from mempool: */
122
123 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
124 {
125         struct bvec_iter_all iter;
126         struct bio_vec *bv;
127         unsigned i;
128
129         bio_for_each_segment_all(bv, bio, i, iter)
130                 if (bv->bv_page != ZERO_PAGE(0))
131                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
132         bio->bi_vcnt = 0;
133 }
134
135 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
136 {
137         struct page *page;
138
139         if (likely(!*using_mempool)) {
140                 page = alloc_page(GFP_NOIO);
141                 if (unlikely(!page)) {
142                         mutex_lock(&c->bio_bounce_pages_lock);
143                         *using_mempool = true;
144                         goto pool_alloc;
145
146                 }
147         } else {
148 pool_alloc:
149                 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
150         }
151
152         return page;
153 }
154
155 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
156                                size_t size)
157 {
158         bool using_mempool = false;
159
160         while (size) {
161                 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
162                 unsigned len = min(PAGE_SIZE, size);
163
164                 BUG_ON(!bio_add_page(bio, page, len, 0));
165                 size -= len;
166         }
167
168         if (using_mempool)
169                 mutex_unlock(&c->bio_bounce_pages_lock);
170 }
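/*
 * Allocation strategy for bounce pages: try plain alloc_page(GFP_NOIO)
 * first, and only fall back to the bio_bounce_pages mempool if that fails.
 * Once we've fallen back, bio_bounce_pages_lock is held until the whole bio
 * is populated, so only one thread draws from the fixed-size mempool at a
 * time and forward progress is guaranteed.
 */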
171
172 /* Writes */
173
174 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
175                                enum bch_data_type type,
176                                const struct bkey_i *k)
177 {
178         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
179         const struct bch_extent_ptr *ptr;
180         struct bch_write_bio *n;
181         struct bch_dev *ca;
182
183         BUG_ON(c->opts.nochanges);
184
185         bkey_for_each_ptr(ptrs, ptr) {
186                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
187                        !c->devs[ptr->dev]);
188
189                 ca = bch_dev_bkey_exists(c, ptr->dev);
190
191                 if (to_entry(ptr + 1) < ptrs.end) {
192                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
193                                                    &ca->replica_set));
194
195                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
196                         n->bio.bi_private       = wbio->bio.bi_private;
197                         n->parent               = wbio;
198                         n->split                = true;
199                         n->bounce               = false;
200                         n->put_bio              = true;
201                         n->bio.bi_opf           = wbio->bio.bi_opf;
202                         bio_inc_remaining(&wbio->bio);
203                 } else {
204                         n = wbio;
205                         n->split                = false;
206                 }
207
208                 n->c                    = c;
209                 n->dev                  = ptr->dev;
210                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
211                 n->submit_time          = local_clock();
212                 n->bio.bi_iter.bi_sector = ptr->offset;
213
214                 if (!journal_flushes_device(ca))
215                         n->bio.bi_opf |= REQ_FUA;
216
217                 if (likely(n->have_ioref)) {
218                         this_cpu_add(ca->io_done->sectors[WRITE][type],
219                                      bio_sectors(&n->bio));
220
221                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
222                         submit_bio(&n->bio);
223                 } else {
224                         n->bio.bi_status        = BLK_STS_REMOVED;
225                         bio_endio(&n->bio);
226                 }
227         }
228 }
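/*
 * Replication fan-out: one bio per extent pointer. Every pointer but the
 * last gets a clone (with bio_inc_remaining() keeping the parent alive);
 * the last pointer reuses the original bio. Writes to devices we couldn't
 * get an io ref for complete immediately with BLK_STS_REMOVED, so the endio
 * path can mark those devices failed and the index update can drop the
 * corresponding pointers.
 */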
229
230 static void __bch2_write(struct closure *);
231
232 static void bch2_write_done(struct closure *cl)
233 {
234         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
235         struct bch_fs *c = op->c;
236
237         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
238                 op->error = bch2_journal_error(&c->journal);
239
240         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
241                 bch2_disk_reservation_put(c, &op->res);
242         percpu_ref_put(&c->writes);
243         bch2_keylist_free(&op->insert_keys, op->inline_keys);
244
245         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
246
247         closure_return(cl);
248 }
249
250 int bch2_write_index_default(struct bch_write_op *op)
251 {
252         struct bch_fs *c = op->c;
253         struct btree_trans trans;
254         struct btree_iter *iter;
255         struct keylist *keys = &op->insert_keys;
256         int ret;
257
258         BUG_ON(bch2_keylist_empty(keys));
259         bch2_verify_keylist_sorted(keys);
260
261         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
262 retry:
263         bch2_trans_begin(&trans);
264
265         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
266                                    bkey_start_pos(&bch2_keylist_front(keys)->k),
267                                    BTREE_ITER_INTENT);
268
269         do {
270                 BKEY_PADDED(k) split;
271
272                 bkey_copy(&split.k, bch2_keylist_front(keys));
273
274                 ret = bch2_extent_trim_atomic(&split.k, iter);
275                 if (ret)
276                         break;
277
278                 bch2_trans_update(&trans,
279                                   BTREE_INSERT_ENTRY(iter, &split.k));
280
281                 ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op),
282                                         BTREE_INSERT_NOFAIL|
283                                         BTREE_INSERT_USE_RESERVE);
284                 if (ret)
285                         break;
286
287                 if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0)
288                         bch2_cut_front(iter->pos, bch2_keylist_front(keys));
289                 else
290                         bch2_keylist_pop_front(keys);
291         } while (!bch2_keylist_empty(keys));
292
293         if (ret == -EINTR) {
294                 ret = 0;
295                 goto retry;
296         }
297
298         bch2_trans_exit(&trans);
299
300         return ret;
301 }
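/*
 * Each pass through the loop above trims the front key to an extent that
 * can be inserted atomically (bch2_extent_trim_atomic()) and commits it in
 * its own transaction; bch2_cut_front() then advances past whatever part of
 * the key actually made it in. -EINTR from a transaction restart just sends
 * us back to retry with a fresh iterator.
 */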
302
303 /**
304  * __bch2_write_index - after a write, update index to point to new data
305  */
306 static void __bch2_write_index(struct bch_write_op *op)
307 {
308         struct bch_fs *c = op->c;
309         struct keylist *keys = &op->insert_keys;
310         struct bch_extent_ptr *ptr;
311         struct bkey_i *src, *dst = keys->keys, *n, *k;
312         unsigned dev;
313         int ret;
314
315         for (src = keys->keys; src != keys->top; src = n) {
316                 n = bkey_next(src);
317                 bkey_copy(dst, src);
318
319                 bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
320                         test_bit(ptr->dev, op->failed.d));
321
322                 if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
323                         ret = -EIO;
324                         goto err;
325                 }
326
327                 dst = bkey_next(dst);
328         }
329
330         keys->top = dst;
331
332         /*
333          * probably not the ideal place to hook this in, but I don't
334          * particularly want to plumb io_opts all the way through the btree
335          * update stack right now
336          */
337         for_each_keylist_key(keys, k)
338                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
339
340         if (!bch2_keylist_empty(keys)) {
341                 u64 sectors_start = keylist_sectors(keys);
342                 int ret = op->index_update_fn(op);
343
344                 BUG_ON(keylist_sectors(keys) && !ret);
345
346                 op->written += sectors_start - keylist_sectors(keys);
347
348                 if (ret) {
349                         __bcache_io_error(c, "btree IO error %i", ret);
350                         op->error = ret;
351                 }
352         }
353 out:
354         /* If a bucket wasn't written, we can't erasure code it: */
355         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
356                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
357
358         bch2_open_buckets_put(c, &op->open_buckets);
359         return;
360 err:
361         keys->top = keys->keys;
362         op->error = ret;
363         goto out;
364 }
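/*
 * Pointers on devices that reported write errors are dropped above; a key
 * only fails (-EIO) if every one of its pointers landed on a failed device.
 * Buckets on failed devices are also reported via
 * bch2_open_bucket_write_error() so they won't be used for erasure coding.
 */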
365
366 static void bch2_write_index(struct closure *cl)
367 {
368         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
369         struct bch_fs *c = op->c;
370
371         __bch2_write_index(op);
372
373         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
374                 bch2_journal_flush_seq_async(&c->journal,
375                                              *op_journal_seq(op),
376                                              cl);
377                 continue_at(cl, bch2_write_done, index_update_wq(op));
378         } else {
379                 continue_at_nobarrier(cl, bch2_write_done, NULL);
380         }
381 }
382
383 static void bch2_write_endio(struct bio *bio)
384 {
385         struct closure *cl              = bio->bi_private;
386         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
387         struct bch_write_bio *wbio      = to_wbio(bio);
388         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
389         struct bch_fs *c                = wbio->c;
390         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
391
392         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
393                 set_bit(wbio->dev, op->failed.d);
394
395         if (wbio->have_ioref) {
396                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
397                 percpu_ref_put(&ca->io_ref);
398         }
399
400         if (wbio->bounce)
401                 bch2_bio_free_pages_pool(c, bio);
402
403         if (wbio->put_bio)
404                 bio_put(bio);
405
406         if (parent)
407                 bio_endio(&parent->bio);
408         else
409                 closure_put(cl);
410 }
411
412 static void init_append_extent(struct bch_write_op *op,
413                                struct write_point *wp,
414                                struct bversion version,
415                                struct bch_extent_crc_unpacked crc)
416 {
417         struct bch_fs *c = op->c;
418         struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
419         struct extent_ptr_decoded p = { .crc = crc };
420         struct open_bucket *ob;
421         unsigned i;
422
423         op->pos.offset += crc.uncompressed_size;
424         e->k.p          = op->pos;
425         e->k.size       = crc.uncompressed_size;
426         e->k.version    = version;
427
428         BUG_ON(crc.compressed_size > wp->sectors_free);
429         wp->sectors_free -= crc.compressed_size;
430
431         open_bucket_for_each(c, &wp->ptrs, ob, i) {
432                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
433
434                 p.ptr = ob->ptr;
435                 p.ptr.cached = !ca->mi.durability ||
436                         (op->flags & BCH_WRITE_CACHED) != 0;
437                 p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
438                 bch2_extent_ptr_decoded_append(&e->k_i, &p);
439
440                 BUG_ON(crc.compressed_size > ob->sectors_free);
441                 ob->sectors_free -= crc.compressed_size;
442         }
443
444         bch2_keylist_push(&op->insert_keys);
445 }
446
447 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
448                                         struct write_point *wp,
449                                         struct bio *src,
450                                         bool *page_alloc_failed,
451                                         void *buf)
452 {
453         struct bch_write_bio *wbio;
454         struct bio *bio;
455         unsigned output_available =
456                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
457         unsigned pages = DIV_ROUND_UP(output_available +
458                                       (buf
459                                        ? ((unsigned long) buf & (PAGE_SIZE - 1))
460                                        : 0), PAGE_SIZE);
461
462         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
463         wbio                    = wbio_init(bio);
464         wbio->put_bio           = true;
465         /* copy WRITE_SYNC flag */
466         wbio->bio.bi_opf        = src->bi_opf;
467
468         if (buf) {
469                 bch2_bio_map(bio, buf, output_available);
470                 return bio;
471         }
472
473         wbio->bounce            = true;
474
475         /*
476          * We can't use mempool for more than c->sb.encoded_extent_max
477          * worth of pages, but we'd like to allocate more if we can:
478          */
479         bch2_bio_alloc_pages_pool(c, bio,
480                                   min_t(unsigned, output_available,
481                                         c->sb.encoded_extent_max << 9));
482
483         if (bio->bi_iter.bi_size < output_available)
484                 *page_alloc_failed =
485                         bch2_bio_alloc_pages(bio,
486                                              output_available -
487                                              bio->bi_iter.bi_size,
488                                              GFP_NOFS) != 0;
489
490         return bio;
491 }
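/*
 * Bounce buffer sizing: pages for up to encoded_extent_max worth of data
 * come from the mempool (see bch2_bio_alloc_pages_pool()); anything beyond
 * that is a best-effort plain allocation. If that best-effort part fails,
 * *page_alloc_failed tells bch2_write_extent() to cut the write short at
 * the bounce buffer's actual size.
 */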
492
493 static int bch2_write_rechecksum(struct bch_fs *c,
494                                  struct bch_write_op *op,
495                                  unsigned new_csum_type)
496 {
497         struct bio *bio = &op->wbio.bio;
498         struct bch_extent_crc_unpacked new_crc;
499         int ret;
500
501         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
502
503         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
504             bch2_csum_type_is_encryption(new_csum_type))
505                 new_csum_type = op->crc.csum_type;
506
507         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
508                                   NULL, &new_crc,
509                                   op->crc.offset, op->crc.live_size,
510                                   new_csum_type);
511         if (ret)
512                 return ret;
513
514         bio_advance(bio, op->crc.offset << 9);
515         bio->bi_iter.bi_size = op->crc.live_size << 9;
516         op->crc = new_crc;
517         return 0;
518 }
519
520 static int bch2_write_decrypt(struct bch_write_op *op)
521 {
522         struct bch_fs *c = op->c;
523         struct nonce nonce = extent_nonce(op->version, op->crc);
524         struct bch_csum csum;
525
526         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
527                 return 0;
528
529         /*
530          * If we need to decrypt data in the write path, we'll no longer be able
531          * to verify the existing checksum (poly1305 mac, in this case) after
532          * it's decrypted - this is the last point we'll be able to reverify the
533          * checksum:
534          */
535         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
536         if (bch2_crc_cmp(op->crc.csum, csum))
537                 return -EIO;
538
539         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
540         op->crc.csum_type = 0;
541         op->crc.csum = (struct bch_csum) { 0, 0 };
542         return 0;
543 }
544
545 static enum prep_encoded_ret {
546         PREP_ENCODED_OK,
547         PREP_ENCODED_ERR,
548         PREP_ENCODED_CHECKSUM_ERR,
549         PREP_ENCODED_DO_WRITE,
550 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
551 {
552         struct bch_fs *c = op->c;
553         struct bio *bio = &op->wbio.bio;
554
555         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
556                 return PREP_ENCODED_OK;
557
558         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
559
560         /* Can we just write the entire extent as is? */
561         if (op->crc.uncompressed_size == op->crc.live_size &&
562             op->crc.compressed_size <= wp->sectors_free &&
563             op->crc.compression_type == op->compression_type) {
564                 if (!op->crc.compression_type &&
565                     op->csum_type != op->crc.csum_type &&
566                     bch2_write_rechecksum(c, op, op->csum_type))
567                         return PREP_ENCODED_CHECKSUM_ERR;
568
569                 return PREP_ENCODED_DO_WRITE;
570         }
571
572         /*
573          * If the data is compressed and we couldn't write the entire extent as
574          * is, we have to decompress it:
575          */
576         if (op->crc.compression_type) {
577                 struct bch_csum csum;
578
579                 if (bch2_write_decrypt(op))
580                         return PREP_ENCODED_CHECKSUM_ERR;
581
582                 /* Last point we can still verify checksum: */
583                 csum = bch2_checksum_bio(c, op->crc.csum_type,
584                                          extent_nonce(op->version, op->crc),
585                                          bio);
586                 if (bch2_crc_cmp(op->crc.csum, csum))
587                         return PREP_ENCODED_CHECKSUM_ERR;
588
589                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
590                         return PREP_ENCODED_ERR;
591         }
592
593         /*
594          * No longer have compressed data after this point - data might be
595          * encrypted:
596          */
597
598         /*
599          * If the data is checksummed and we're only writing a subset,
600          * rechecksum and adjust bio to point to currently live data:
601          */
602         if ((op->crc.live_size != op->crc.uncompressed_size ||
603              op->crc.csum_type != op->csum_type) &&
604             bch2_write_rechecksum(c, op, op->csum_type))
605                 return PREP_ENCODED_CHECKSUM_ERR;
606
607         /*
608          * If we want to compress the data, it has to be decrypted:
609          */
610         if ((op->compression_type ||
611              bch2_csum_type_is_encryption(op->crc.csum_type) !=
612              bch2_csum_type_is_encryption(op->csum_type)) &&
613             bch2_write_decrypt(op))
614                 return PREP_ENCODED_CHECKSUM_ERR;
615
616         return PREP_ENCODED_OK;
617 }
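/*
 * Summary of the return values above:
 *   PREP_ENCODED_DO_WRITE      - the already-encoded extent can be written
 *                                out as is (possibly after rechecksumming).
 *   PREP_ENCODED_OK            - data has been decompressed/decrypted as
 *                                needed; fall through to the normal
 *                                compress/checksum path.
 *   PREP_ENCODED_CHECKSUM_ERR  - the existing checksum didn't verify.
 *   PREP_ENCODED_ERR           - decompression failed.
 */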
618
619 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
620 {
621         struct bch_fs *c = op->c;
622         struct bio *src = &op->wbio.bio, *dst = src;
623         struct bvec_iter saved_iter;
624         struct bkey_i *key_to_write;
625         void *ec_buf;
626         unsigned key_to_write_offset = op->insert_keys.top_p -
627                 op->insert_keys.keys_p;
628         unsigned total_output = 0, total_input = 0;
629         bool bounce = false;
630         bool page_alloc_failed = false;
631         int ret, more = 0;
632
633         BUG_ON(!bio_sectors(src));
634
635         ec_buf = bch2_writepoint_ec_buf(c, wp);
636
637         switch (bch2_write_prep_encoded_data(op, wp)) {
638         case PREP_ENCODED_OK:
639                 break;
640         case PREP_ENCODED_ERR:
641                 ret = -EIO;
642                 goto err;
643         case PREP_ENCODED_CHECKSUM_ERR:
644                 goto csum_err;
645         case PREP_ENCODED_DO_WRITE:
646                 if (ec_buf) {
647                         dst = bch2_write_bio_alloc(c, wp, src,
648                                                    &page_alloc_failed,
649                                                    ec_buf);
650                         bio_copy_data(dst, src);
651                         bounce = true;
652                 }
653                 init_append_extent(op, wp, op->version, op->crc);
654                 goto do_write;
655         }
656
657         if (ec_buf ||
658             op->compression_type ||
659             (op->csum_type &&
660              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
661             (bch2_csum_type_is_encryption(op->csum_type) &&
662              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
663                 dst = bch2_write_bio_alloc(c, wp, src,
664                                            &page_alloc_failed,
665                                            ec_buf);
666                 bounce = true;
667         }
668
669         saved_iter = dst->bi_iter;
670
671         do {
672                 struct bch_extent_crc_unpacked crc =
673                         (struct bch_extent_crc_unpacked) { 0 };
674                 struct bversion version = op->version;
675                 size_t dst_len, src_len;
676
677                 if (page_alloc_failed &&
678                     bio_sectors(dst) < wp->sectors_free &&
679                     bio_sectors(dst) < c->sb.encoded_extent_max)
680                         break;
681
682                 BUG_ON(op->compression_type &&
683                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
684                        bch2_csum_type_is_encryption(op->crc.csum_type));
685                 BUG_ON(op->compression_type && !bounce);
686
687                 crc.compression_type = op->compression_type
688                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
689                                              op->compression_type)
690                         : 0;
691                 if (!crc.compression_type) {
692                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
693                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
694
695                         if (op->csum_type)
696                                 dst_len = min_t(unsigned, dst_len,
697                                                 c->sb.encoded_extent_max << 9);
698
699                         if (bounce) {
700                                 swap(dst->bi_iter.bi_size, dst_len);
701                                 bio_copy_data(dst, src);
702                                 swap(dst->bi_iter.bi_size, dst_len);
703                         }
704
705                         src_len = dst_len;
706                 }
707
708                 BUG_ON(!src_len || !dst_len);
709
710                 if (bch2_csum_type_is_encryption(op->csum_type)) {
711                         if (bversion_zero(version)) {
712                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
713                         } else {
714                                 crc.nonce = op->nonce;
715                                 op->nonce += src_len >> 9;
716                         }
717                 }
718
719                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
720                     !crc.compression_type &&
721                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
722                     bch2_csum_type_is_encryption(op->csum_type)) {
723                         /*
724                          * Note: when we're using rechecksum(), we need to be
725                          * checksumming @src because it has all the data our
726                          * existing checksum covers - if we bounced (because we
727                          * were trying to compress), @dst will only have the
728                          * part of the data the new checksum will cover.
729                          *
730                          * But normally we want to be checksumming post bounce,
731                          * because part of the reason for bouncing is so the
732                          * data can't be modified (by userspace) while it's in
733                          * flight.
734                          */
735                         if (bch2_rechecksum_bio(c, src, version, op->crc,
736                                         &crc, &op->crc,
737                                         src_len >> 9,
738                                         bio_sectors(src) - (src_len >> 9),
739                                         op->csum_type))
740                                 goto csum_err;
741                 } else {
742                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
743                             bch2_rechecksum_bio(c, src, version, op->crc,
744                                         NULL, &op->crc,
745                                         src_len >> 9,
746                                         bio_sectors(src) - (src_len >> 9),
747                                         op->crc.csum_type))
748                                 goto csum_err;
749
750                         crc.compressed_size     = dst_len >> 9;
751                         crc.uncompressed_size   = src_len >> 9;
752                         crc.live_size           = src_len >> 9;
753
754                         swap(dst->bi_iter.bi_size, dst_len);
755                         bch2_encrypt_bio(c, op->csum_type,
756                                          extent_nonce(version, crc), dst);
757                         crc.csum = bch2_checksum_bio(c, op->csum_type,
758                                          extent_nonce(version, crc), dst);
759                         crc.csum_type = op->csum_type;
760                         swap(dst->bi_iter.bi_size, dst_len);
761                 }
762
763                 init_append_extent(op, wp, version, crc);
764
765                 if (dst != src)
766                         bio_advance(dst, dst_len);
767                 bio_advance(src, src_len);
768                 total_output    += dst_len;
769                 total_input     += src_len;
770         } while (dst->bi_iter.bi_size &&
771                  src->bi_iter.bi_size &&
772                  wp->sectors_free &&
773                  !bch2_keylist_realloc(&op->insert_keys,
774                                       op->inline_keys,
775                                       ARRAY_SIZE(op->inline_keys),
776                                       BKEY_EXTENT_U64s_MAX));
777
778         more = src->bi_iter.bi_size != 0;
779
780         dst->bi_iter = saved_iter;
781
782         if (dst == src && more) {
783                 BUG_ON(total_output != total_input);
784
785                 dst = bio_split(src, total_input >> 9,
786                                 GFP_NOIO, &c->bio_write);
787                 wbio_init(dst)->put_bio = true;
788                 /* copy WRITE_SYNC flag */
789                 dst->bi_opf             = src->bi_opf;
790         }
791
792         dst->bi_iter.bi_size = total_output;
793 do_write:
794         /* might have done a realloc... */
795
796         key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
797
798         bch2_ec_add_backpointer(c, wp,
799                                 bkey_start_pos(&key_to_write->k),
800                                 total_input >> 9);
801
802         dst->bi_end_io  = bch2_write_endio;
803         dst->bi_private = &op->cl;
804         bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
805
806         closure_get(dst->bi_private);
807
808         bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
809                                   key_to_write);
810         return more;
811 csum_err:
812         bch_err(c, "error verifying existing checksum while "
813                 "rewriting existing data (memory corruption?)");
814         ret = -EIO;
815 err:
816         if (to_wbio(dst)->bounce)
817                 bch2_bio_free_pages_pool(c, dst);
818         if (to_wbio(dst)->put_bio)
819                 bio_put(dst);
820
821         return ret;
822 }
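/*
 * The loop above carves the source bio into extents of at most the write
 * point's remaining space (and, for checksummed data, encoded_extent_max)
 * sectors, compressing, encrypting and checksumming each one into @dst as
 * configured. A nonzero return ("more") means the source wasn't fully
 * consumed and __bch2_write() should come back with a fresh write point for
 * the remainder.
 */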
823
824 static void __bch2_write(struct closure *cl)
825 {
826         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
827         struct bch_fs *c = op->c;
828         struct write_point *wp;
829         int ret;
830 again:
831         memset(&op->failed, 0, sizeof(op->failed));
832
833         do {
834                 /* +1 for possible cache device: */
835                 if (op->open_buckets.nr + op->nr_replicas + 1 >
836                     ARRAY_SIZE(op->open_buckets.v))
837                         goto flush_io;
838
839                 if (bch2_keylist_realloc(&op->insert_keys,
840                                         op->inline_keys,
841                                         ARRAY_SIZE(op->inline_keys),
842                                         BKEY_EXTENT_U64s_MAX))
843                         goto flush_io;
844
845                 wp = bch2_alloc_sectors_start(c,
846                         op->target,
847                         op->opts.erasure_code,
848                         op->write_point,
849                         &op->devs_have,
850                         op->nr_replicas,
851                         op->nr_replicas_required,
852                         op->alloc_reserve,
853                         op->flags,
854                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
855                 EBUG_ON(!wp);
856
857                 if (unlikely(IS_ERR(wp))) {
858                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
859                                 ret = PTR_ERR(wp);
860                                 goto err;
861                         }
862
863                         goto flush_io;
864                 }
865
866                 ret = bch2_write_extent(op, wp);
867
868                 bch2_open_bucket_get(c, wp, &op->open_buckets);
869                 bch2_alloc_sectors_done(c, wp);
870
871                 if (ret < 0)
872                         goto err;
873         } while (ret);
874
875         continue_at(cl, bch2_write_index, index_update_wq(op));
876         return;
877 err:
878         op->error = ret;
879
880         continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
881                     ? bch2_write_index
882                     : bch2_write_done, index_update_wq(op));
883         return;
884 flush_io:
885         closure_sync(cl);
886
887         if (!bch2_keylist_empty(&op->insert_keys)) {
888                 __bch2_write_index(op);
889
890                 if (op->error) {
891                         continue_at_nobarrier(cl, bch2_write_done, NULL);
892                         return;
893                 }
894         }
895
896         goto again;
897 }
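/*
 * flush_io above: we're about to run out of open-bucket slots, the keylist
 * couldn't be grown, or the allocator wants us to wait (-EAGAIN). Wait for
 * the writes issued so far, flush any completed keys into the btree to free
 * up the keylist, then retry the allocation loop from the top.
 */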
898
899 /**
900  * bch2_write - handle a write to a cache device or flash only volume
901  *
902  * This is the starting point for any data to end up in a cache device; it could
903  * be from a normal write, or a writeback write, or a write to a flash only
904  * volume - it's also used by the moving garbage collector to compact data in
905  * mostly empty buckets.
906  *
907  * It first writes the data to the cache, creating a list of keys to be inserted
908  * (if the data won't fit in a single open bucket, there will be multiple keys);
909  * after the data is written it calls bch_journal, and after the keys have been
910  * added to the next journal write they're inserted into the btree.
911  *
912  * If op->discard is true, instead of inserting the data it invalidates the
913  * region of the cache represented by op->bio and op->inode.
914  */
915 void bch2_write(struct closure *cl)
916 {
917         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
918         struct bio *bio = &op->wbio.bio;
919         struct bch_fs *c = op->c;
920
921         BUG_ON(!op->nr_replicas);
922         BUG_ON(!op->write_point.v);
923         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
924
925         if (bio_sectors(bio) & (c->opts.block_size - 1)) {
926                 __bcache_io_error(c, "misaligned write");
927                 op->error = -EIO;
928                 goto err;
929         }
930
931         op->start_time = local_clock();
932
933         bch2_keylist_init(&op->insert_keys, op->inline_keys);
934         wbio_init(bio)->put_bio = false;
935
936         if (c->opts.nochanges ||
937             !percpu_ref_tryget(&c->writes)) {
938                 __bcache_io_error(c, "read only");
939                 op->error = -EROFS;
940                 goto err;
941         }
942
943         bch2_increment_clock(c, bio_sectors(bio), WRITE);
944
945         continue_at_nobarrier(cl, __bch2_write, NULL);
946         return;
947 err:
948         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
949                 bch2_disk_reservation_put(c, &op->res);
950         closure_return(cl);
951 }
952
953 /* Cache promotion on read */
954
955 struct promote_op {
956         struct closure          cl;
957         struct rcu_head         rcu;
958         u64                     start_time;
959
960         struct rhash_head       hash;
961         struct bpos             pos;
962
963         struct migrate_write    write;
964         struct bio_vec          bi_inline_vecs[0]; /* must be last */
965 };
966
967 static const struct rhashtable_params bch_promote_params = {
968         .head_offset    = offsetof(struct promote_op, hash),
969         .key_offset     = offsetof(struct promote_op, pos),
970         .key_len        = sizeof(struct bpos),
971 };
972
973 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
974                                   struct bpos pos,
975                                   struct bch_io_opts opts,
976                                   unsigned flags)
977 {
978         if (!(flags & BCH_READ_MAY_PROMOTE))
979                 return false;
980
981         if (!opts.promote_target)
982                 return false;
983
984         if (bch2_bkey_has_target(c, k, opts.promote_target))
985                 return false;
986
987         if (bch2_target_congested(c, opts.promote_target)) {
988                 /* XXX trace this */
989                 return false;
990         }
991
992         if (rhashtable_lookup_fast(&c->promote_table, &pos,
993                                    bch_promote_params))
994                 return false;
995
996         return true;
997 }
998
999 static void promote_free(struct bch_fs *c, struct promote_op *op)
1000 {
1001         int ret;
1002
1003         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1004                                      bch_promote_params);
1005         BUG_ON(ret);
1006         percpu_ref_put(&c->writes);
1007         kfree_rcu(op, rcu);
1008 }
1009
1010 static void promote_done(struct closure *cl)
1011 {
1012         struct promote_op *op =
1013                 container_of(cl, struct promote_op, cl);
1014         struct bch_fs *c = op->write.op.c;
1015
1016         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1017                                op->start_time);
1018
1019         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1020         promote_free(c, op);
1021 }
1022
1023 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1024 {
1025         struct bch_fs *c = rbio->c;
1026         struct closure *cl = &op->cl;
1027         struct bio *bio = &op->write.op.wbio.bio;
1028
1029         trace_promote(&rbio->bio);
1030
1031         /* we now own pages: */
1032         BUG_ON(!rbio->bounce);
1033         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1034
1035         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1036                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1037         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1038
1039         bch2_migrate_read_done(&op->write, rbio);
1040
1041         closure_init(cl, NULL);
1042         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1043         closure_return_with_destructor(cl, promote_done);
1044 }
1045
1046 noinline
1047 static struct promote_op *__promote_alloc(struct bch_fs *c,
1048                                           enum btree_id btree_id,
1049                                           struct bpos pos,
1050                                           struct extent_ptr_decoded *pick,
1051                                           struct bch_io_opts opts,
1052                                           unsigned sectors,
1053                                           struct bch_read_bio **rbio)
1054 {
1055         struct promote_op *op = NULL;
1056         struct bio *bio;
1057         unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
1058         int ret;
1059
1060         if (!percpu_ref_tryget(&c->writes))
1061                 return NULL;
1062
1063         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
1064         if (!op)
1065                 goto err;
1066
1067         op->start_time = local_clock();
1068         op->pos = pos;
1069
1070         /*
1071          * We don't use the mempool here because extents that aren't
1072          * checksummed or compressed can be too big for the mempool:
1073          */
1074         *rbio = kzalloc(sizeof(struct bch_read_bio) +
1075                         sizeof(struct bio_vec) * pages,
1076                         GFP_NOIO);
1077         if (!*rbio)
1078                 goto err;
1079
1080         rbio_init(&(*rbio)->bio, opts);
1081         bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
1082
1083         if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
1084                                  GFP_NOIO))
1085                 goto err;
1086
1087         (*rbio)->bounce         = true;
1088         (*rbio)->split          = true;
1089         (*rbio)->kmalloc        = true;
1090
1091         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1092                                           bch_promote_params))
1093                 goto err;
1094
1095         bio = &op->write.op.wbio.bio;
1096         bio_init(bio, bio->bi_inline_vecs, pages);
1097
1098         ret = bch2_migrate_write_init(c, &op->write,
1099                         writepoint_hashed((unsigned long) current),
1100                         opts,
1101                         DATA_PROMOTE,
1102                         (struct data_opts) {
1103                                 .target = opts.promote_target
1104                         },
1105                         btree_id,
1106                         bkey_s_c_null);
1107         BUG_ON(ret);
1108
1109         return op;
1110 err:
1111         if (*rbio)
1112                 bio_free_pages(&(*rbio)->bio);
1113         kfree(*rbio);
1114         *rbio = NULL;
1115         kfree(op);
1116         percpu_ref_put(&c->writes);
1117         return NULL;
1118 }
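/*
 * The read bio for a promote is kzalloc'd rather than taken from the bioset
 * because, as noted above, an unchecksummed/uncompressed extent can be
 * larger than the bounce mempool allows; (*rbio)->kmalloc tells
 * bch2_rbio_free() to kfree() it instead of bio_put().
 */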
1119
1120 static inline struct promote_op *promote_alloc(struct bch_fs *c,
1121                                                struct bvec_iter iter,
1122                                                struct bkey_s_c k,
1123                                                struct extent_ptr_decoded *pick,
1124                                                struct bch_io_opts opts,
1125                                                unsigned flags,
1126                                                struct bch_read_bio **rbio,
1127                                                bool *bounce,
1128                                                bool *read_full)
1129 {
1130         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1131         /* data might have to be decompressed in the write path: */
1132         unsigned sectors = promote_full
1133                 ? max(pick->crc.compressed_size, pick->crc.live_size)
1134                 : bvec_iter_sectors(iter);
1135         struct bpos pos = promote_full
1136                 ? bkey_start_pos(k.k)
1137                 : POS(k.k->p.inode, iter.bi_sector);
1138         struct promote_op *promote;
1139
1140         if (!should_promote(c, k, pos, opts, flags))
1141                 return NULL;
1142
1143         promote = __promote_alloc(c,
1144                                   k.k->type == KEY_TYPE_reflink_v
1145                                   ? BTREE_ID_REFLINK
1146                                   : BTREE_ID_EXTENTS,
1147                                   pos, pick, opts, sectors, rbio);
1148         if (!promote)
1149                 return NULL;
1150
1151         *bounce         = true;
1152         *read_full      = promote_full;
1153         return promote;
1154 }
1155
1156 /* Read */
1157
1158 #define READ_RETRY_AVOID        1
1159 #define READ_RETRY              2
1160 #define READ_ERR                3
1161
1162 enum rbio_context {
1163         RBIO_CONTEXT_NULL,
1164         RBIO_CONTEXT_HIGHPRI,
1165         RBIO_CONTEXT_UNBOUND,
1166 };
1167
1168 static inline struct bch_read_bio *
1169 bch2_rbio_parent(struct bch_read_bio *rbio)
1170 {
1171         return rbio->split ? rbio->parent : rbio;
1172 }
1173
1174 __always_inline
1175 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1176                            enum rbio_context context,
1177                            struct workqueue_struct *wq)
1178 {
1179         if (context <= rbio->context) {
1180                 fn(&rbio->work);
1181         } else {
1182                 rbio->work.func         = fn;
1183                 rbio->context           = context;
1184                 queue_work(wq, &rbio->work);
1185         }
1186 }
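/*
 * rbio_context is ordered NULL < HIGHPRI < UNBOUND: if the current
 * completion context is already at least as capable as the one @fn needs,
 * run it inline; otherwise punt it to @wq and record the new context so
 * subsequent punts from the work item see the elevated context.
 */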
1187
1188 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1189 {
1190         BUG_ON(rbio->bounce && !rbio->split);
1191
1192         if (rbio->promote)
1193                 promote_free(rbio->c, rbio->promote);
1194         rbio->promote = NULL;
1195
1196         if (rbio->bounce)
1197                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1198
1199         if (rbio->split) {
1200                 struct bch_read_bio *parent = rbio->parent;
1201
1202                 if (rbio->kmalloc)
1203                         kfree(rbio);
1204                 else
1205                         bio_put(&rbio->bio);
1206
1207                 rbio = parent;
1208         }
1209
1210         return rbio;
1211 }
1212
1213 static void bch2_rbio_done(struct bch_read_bio *rbio)
1214 {
1215         bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1216                                rbio->start_time);
1217         bio_endio(&rbio->bio);
1218 }
1219
1220 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1221                                      struct bvec_iter bvec_iter, u64 inode,
1222                                      struct bch_io_failures *failed,
1223                                      unsigned flags)
1224 {
1225         struct btree_trans trans;
1226         struct btree_iter *iter;
1227         BKEY_PADDED(k) tmp;
1228         struct bkey_s_c k;
1229         int ret;
1230
1231         flags &= ~BCH_READ_LAST_FRAGMENT;
1232
1233         bch2_trans_init(&trans, c, 0, 0);
1234
1235         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1236                                    rbio->pos, BTREE_ITER_SLOTS);
1237 retry:
1238         rbio->bio.bi_status = 0;
1239
1240         k = bch2_btree_iter_peek_slot(iter);
1241         if (bkey_err(k))
1242                 goto err;
1243
1244         bkey_reassemble(&tmp.k, k);
1245         k = bkey_i_to_s_c(&tmp.k);
1246         bch2_trans_unlock(&trans);
1247
1248         if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k),
1249                                    rbio->pick.ptr,
1250                                    rbio->pos.offset -
1251                                    rbio->pick.crc.offset)) {
1252                 /* extent we wanted to read no longer exists: */
1253                 rbio->hole = true;
1254                 goto out;
1255         }
1256
1257         ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
1258         if (ret == READ_RETRY)
1259                 goto retry;
1260         if (ret)
1261                 goto err;
1262 out:
1263         bch2_rbio_done(rbio);
1264         bch2_trans_exit(&trans);
1265         return;
1266 err:
1267         rbio->bio.bi_status = BLK_STS_IOERR;
1268         goto out;
1269 }
1270
1271 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1272                             struct bvec_iter bvec_iter, u64 inode,
1273                             struct bch_io_failures *failed, unsigned flags)
1274 {
1275         struct btree_trans trans;
1276         struct btree_iter *iter;
1277         struct bkey_s_c k;
1278         int ret;
1279
1280         flags &= ~BCH_READ_LAST_FRAGMENT;
1281         flags |= BCH_READ_MUST_CLONE;
1282
1283         bch2_trans_init(&trans, c, 0, 0);
1284 retry:
1285         bch2_trans_begin(&trans);
1286
1287         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1288                            POS(inode, bvec_iter.bi_sector),
1289                            BTREE_ITER_SLOTS, k, ret) {
1290                 BKEY_PADDED(k) tmp;
1291                 unsigned bytes, sectors, offset_into_extent;
1292
1293                 bkey_reassemble(&tmp.k, k);
1294                 k = bkey_i_to_s_c(&tmp.k);
1295
1296                 offset_into_extent = iter->pos.offset -
1297                         bkey_start_offset(k.k);
1298                 sectors = k.k->size - offset_into_extent;
1299
1300                 ret = bch2_read_indirect_extent(&trans,
1301                                         &offset_into_extent, &tmp.k);
1302                 if (ret)
1303                         break;
1304
1305                 sectors = min(sectors, k.k->size - offset_into_extent);
1306
1307                 bch2_trans_unlock(&trans);
1308
1309                 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
1310                 swap(bvec_iter.bi_size, bytes);
1311
1312                 ret = __bch2_read_extent(c, rbio, bvec_iter, k,
1313                                 offset_into_extent, failed, flags);
1314                 switch (ret) {
1315                 case READ_RETRY:
1316                         goto retry;
1317                 case READ_ERR:
1318                         goto err;
1319                 }
1320
1321                 if (bytes == bvec_iter.bi_size)
1322                         goto out;
1323
1324                 swap(bvec_iter.bi_size, bytes);
1325                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1326         }
1327
1328         /*
1329          * If we get here, it better have been because there was an error
1330          * reading a btree node
1331          */
1332         BUG_ON(!ret);
1333         __bcache_io_error(c, "btree IO error: %i", ret);
1334 err:
1335         rbio->bio.bi_status = BLK_STS_IOERR;
1336 out:
1337         bch2_trans_exit(&trans);
1338         bch2_rbio_done(rbio);
1339 }
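/*
 * Both retry paths above re-walk the extent btree because the data may have
 * been moved, overwritten or merged since the original lookup. The nodecode
 * variant insists on finding the exact pointer it first read from and
 * reports a hole if it's gone; the general variant simply re-reads whatever
 * currently backs the requested range, with BCH_READ_MUST_CLONE set so the
 * retry works on a clone of the bio.
 */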
1340
1341 static void bch2_rbio_retry(struct work_struct *work)
1342 {
1343         struct bch_read_bio *rbio =
1344                 container_of(work, struct bch_read_bio, work);
1345         struct bch_fs *c        = rbio->c;
1346         struct bvec_iter iter   = rbio->bvec_iter;
1347         unsigned flags          = rbio->flags;
1348         u64 inode               = rbio->pos.inode;
1349         struct bch_io_failures failed = { .nr = 0 };
1350
1351         trace_read_retry(&rbio->bio);
1352
1353         if (rbio->retry == READ_RETRY_AVOID)
1354                 bch2_mark_io_failure(&failed, &rbio->pick);
1355
1356         rbio->bio.bi_status = 0;
1357
1358         rbio = bch2_rbio_free(rbio);
1359
1360         flags |= BCH_READ_IN_RETRY;
1361         flags &= ~BCH_READ_MAY_PROMOTE;
1362
1363         if (flags & BCH_READ_NODECODE)
1364                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1365         else
1366                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1367 }
1368
1369 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1370                             blk_status_t error)
1371 {
1372         rbio->retry = retry;
1373
1374         if (rbio->flags & BCH_READ_IN_RETRY)
1375                 return;
1376
1377         if (retry == READ_ERR) {
1378                 rbio = bch2_rbio_free(rbio);
1379
1380                 rbio->bio.bi_status = error;
1381                 bch2_rbio_done(rbio);
1382         } else {
1383                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1384                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1385         }
1386 }
1387
1388 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1389 {
1390         struct bch_fs *c = rbio->c;
1391         struct btree_trans trans;
1392         struct btree_iter *iter;
1393         struct bkey_s_c k;
1394         BKEY_PADDED(k) new;
1395         struct bch_extent_crc_unpacked new_crc;
1396         u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
1397         int ret;
1398
1399         if (rbio->pick.crc.compression_type)
1400                 return;
1401
1402         bch2_trans_init(&trans, c, 0, 0);
1403 retry:
1404         bch2_trans_begin(&trans);
1405
1406         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
1407                                    BTREE_ITER_INTENT);
1408         k = bch2_btree_iter_peek(iter);
1409         if (IS_ERR_OR_NULL(k.k))
1410                 goto out;
1411
1412         bkey_reassemble(&new.k, k);
1413         k = bkey_i_to_s_c(&new.k);
1414
1415         if (bversion_cmp(k.k->version, rbio->version) ||
1416             !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
1417                 goto out;
1418
1419         /* Extent was merged? */
1420         if (bkey_start_offset(k.k) < data_offset ||
1421             k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1422                 goto out;
1423
1424         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1425                         rbio->pick.crc, NULL, &new_crc,
1426                         bkey_start_offset(k.k) - data_offset, k.k->size,
1427                         rbio->pick.crc.csum_type)) {
1428                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1429                 goto out;
1430         }
1431
1432         if (!bch2_bkey_narrow_crcs(&new.k, new_crc))
1433                 goto out;
1434
1435         bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k));
1436         ret = bch2_trans_commit(&trans, NULL, NULL,
1437                                 BTREE_INSERT_ATOMIC|
1438                                 BTREE_INSERT_NOFAIL|
1439                                 BTREE_INSERT_NOWAIT);
1440         if (ret == -EINTR)
1441                 goto retry;
1442 out:
1443         bch2_trans_exit(&trans);
1444 }
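/*
 * "Narrowing" the crc: after successfully reading and verifying part of a
 * checksummed extent, we can opportunistically rewrite the key with a
 * checksum that covers only the still-live range, so later reads of this
 * extent don't have to read and checksum the full original region. Any
 * mismatch - version change, pointer gone, rechecksum failure - just means
 * we skip the update; it's purely an optimization.
 */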
1445
1446 /* Inner part that may run in process context */
1447 static void __bch2_read_endio(struct work_struct *work)
1448 {
1449         struct bch_read_bio *rbio =
1450                 container_of(work, struct bch_read_bio, work);
1451         struct bch_fs *c        = rbio->c;
1452         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1453         struct bio *src         = &rbio->bio;
1454         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1455         struct bvec_iter dst_iter = rbio->bvec_iter;
1456         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1457         struct nonce nonce = extent_nonce(rbio->version, crc);
1458         struct bch_csum csum;
1459
1460         /* Reset iterator for checksumming and copying bounced data: */
1461         if (rbio->bounce) {
1462                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1463                 src->bi_iter.bi_idx             = 0;
1464                 src->bi_iter.bi_bvec_done       = 0;
1465         } else {
1466                 src->bi_iter                    = rbio->bvec_iter;
1467         }
1468
1469         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1470         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1471                 goto csum_err;
1472
1473         if (unlikely(rbio->narrow_crcs))
1474                 bch2_rbio_narrow_crcs(rbio);
1475
1476         if (rbio->flags & BCH_READ_NODECODE)
1477                 goto nodecode;
1478
1479         /* Adjust crc to point to subset of data we want: */
1480         crc.offset     += rbio->offset_into_extent;
1481         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1482
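             /*
              * Compressed data has to be decrypted and decompressed in full
              * into the destination; uncompressed data can be decrypted in
              * place and, if it was bounced, only the requested range copied:
              */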
1483         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1484                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1485                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1486                         goto decompression_err;
1487         } else {
1488                 /* don't need to decrypt the entire bio: */
1489                 nonce = nonce_add(nonce, crc.offset << 9);
1490                 bio_advance(src, crc.offset << 9);
1491
1492                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1493                 src->bi_iter.bi_size = dst_iter.bi_size;
1494
1495                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1496
1497                 if (rbio->bounce) {
1498                         struct bvec_iter src_iter = src->bi_iter;
1499                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1500                 }
1501         }
1502
1503         if (rbio->promote) {
1504                 /*
1505                  * Re-encrypt the data we decrypted, so it's consistent with
1506                  * rbio->crc:
1507                  */
1508                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1509                 promote_start(rbio->promote, rbio);
1510                 rbio->promote = NULL;
1511         }
1512 nodecode:
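             /*
              * In the synchronous retry path (BCH_READ_IN_RETRY) the caller
              * inspects rbio->retry and frees the rbio itself; otherwise
              * we're finished with it here:
              */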
1513         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1514                 rbio = bch2_rbio_free(rbio);
1515                 bch2_rbio_done(rbio);
1516         }
1517         return;
1518 csum_err:
1519         /*
1520          * Checksum error: if the bio wasn't bounced, we may have been
1521          * reading into buffers owned by userspace (that userspace can
1522          * scribble over) - retry the read, bouncing it this time:
1523          */
1524         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1525                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1526                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1527                 return;
1528         }
1529
1530         bch2_dev_io_error(ca,
1531                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1532                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1533                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1534                 csum.hi, csum.lo, crc.csum_type);
1535         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1536         return;
1537 decompression_err:
1538         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1539                           rbio->pos.inode,
1540                           (u64) rbio->bvec_iter.bi_sector);
1541         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1542         return;
1543 }
1544
1545 static void bch2_read_endio(struct bio *bio)
1546 {
1547         struct bch_read_bio *rbio =
1548                 container_of(bio, struct bch_read_bio, bio);
1549         struct bch_fs *c        = rbio->c;
1550         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1551         struct workqueue_struct *wq = NULL;
1552         enum rbio_context context = RBIO_CONTEXT_NULL;
1553
1554         if (rbio->have_ioref) {
1555                 bch2_latency_acct(ca, rbio->submit_time, READ);
1556                 percpu_ref_put(&ca->io_ref);
1557         }
1558
1559         if (!rbio->split)
1560                 rbio->bio.bi_end_io = rbio->end_io;
1561
1562         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1563                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1564                 return;
1565         }
1566
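             /*
              * If we read via a cached pointer, the bucket may have been
              * reused while the read was in flight - retry rather than
              * returning stale data (or error out if retrying isn't allowed):
              */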
1567         if (rbio->pick.ptr.cached &&
1568             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1569              ptr_stale(ca, &rbio->pick.ptr))) {
1570                 atomic_long_inc(&c->read_realloc_races);
1571
1572                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1573                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1574                 else
1575                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1576                 return;
1577         }
1578
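             /*
              * Pick where the rest of the completion runs: crc narrowing,
              * decompression and decryption get punted to process context via
              * the unbound workqueue, plain checksum verification goes to the
              * highpri workqueue, and anything else can finish right here:
              */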
1579         if (rbio->narrow_crcs ||
1580             rbio->pick.crc.compression_type ||
1581             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1582                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1583         else if (rbio->pick.crc.csum_type)
1584                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1585
1586         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1587 }
1588
1589 int __bch2_read_indirect_extent(struct btree_trans *trans,
1590                                 unsigned *offset_into_extent,
1591                                 struct bkey_i *orig_k)
1592 {
1593         struct btree_iter *iter;
1594         struct bkey_s_c k;
1595         u64 reflink_offset;
1596         int ret;
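             /*
              * orig_k is a reflink pointer: translate our offset within it
              * into an offset in the reflink btree, look up the indirect
              * extent it points to, and substitute that key for the original:
              */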
1597
1598         reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
1599                 *offset_into_extent;
1600
1601         iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
1602                                      POS(0, reflink_offset),
1603                                      BTREE_ITER_SLOTS, 1);
1604         ret = PTR_ERR_OR_ZERO(iter);
1605         if (ret)
1606                 return ret;
1607
1608         k = bch2_btree_iter_peek_slot(iter);
1609         ret = bkey_err(k);
1610         if (ret)
1611                 goto err;
1612
1613         if (k.k->type != KEY_TYPE_reflink_v) {
1614                 __bcache_io_error(trans->c,
1615                                 "pointer to nonexistent indirect extent");
1616                 ret = -EIO;
1617                 goto err;
1618         }
1619
1620         *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
1621         bkey_reassemble(orig_k, k);
1622 err:
1623         bch2_trans_iter_put(trans, iter);
1624         return ret;
1625 }
1626
1627 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1628                        struct bvec_iter iter, struct bkey_s_c k,
1629                        unsigned offset_into_extent,
1630                        struct bch_io_failures *failed, unsigned flags)
1631 {
1632         struct extent_ptr_decoded pick;
1633         struct bch_read_bio *rbio = NULL;
1634         struct bch_dev *ca;
1635         struct promote_op *promote = NULL;
1636         bool bounce = false, read_full = false, narrow_crcs = false;
1637         struct bpos pos = bkey_start_pos(k.k);
1638         int pick_ret;
1639
1640         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1641
1642         /* hole or reservation - just zero fill: */
1643         if (!pick_ret)
1644                 goto hole;
1645
1646         if (pick_ret < 0) {
1647                 __bcache_io_error(c, "no device to read from");
1648                 goto err;
1649         }
1650
1651         if (pick_ret > 0)
1652                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1653
1654         if (flags & BCH_READ_NODECODE) {
1655                 /*
1656                  * This can happen if we retry and the extent we were going
1657                  * to read has been merged in the meantime:
1658                  */
1659                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1660                         goto hole;
1661
1662                 iter.bi_size    = pick.crc.compressed_size << 9;
1663                 goto noclone;
1664         }
1665
1666         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1667             bio_flagged(&orig->bio, BIO_CHAIN))
1668                 flags |= BCH_READ_MUST_CLONE;
1669
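             /*
              * If the extent's checksum covers more than this key's live
              * range, we can narrow it once the read verifies - but not on a
              * retry, and if the destination is user mapped we have to
              * bounce, since userspace could modify the buffer while we
              * re-checksum it:
              */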
1670         narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
1671                 bch2_can_narrow_extent_crcs(k, pick.crc);
1672
1673         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1674                 flags |= BCH_READ_MUST_BOUNCE;
1675
1676         BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
1677
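             /*
              * Read (and bounce) the full extent if it's compressed, or if
              * it's checksummed and we're either not reading exactly the
              * checksummed range, reading encrypted data into user mapped
              * pages, or have been told we must bounce:
              */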
1678         if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
1679             (pick.crc.csum_type != BCH_CSUM_NONE &&
1680              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1681               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
1682                (flags & BCH_READ_USER_MAPPED)) ||
1683               (flags & BCH_READ_MUST_BOUNCE)))) {
1684                 read_full = true;
1685                 bounce = true;
1686         }
1687
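             /*
              * Check whether this read should also promote the extent (e.g.
              * to the promote target); if so, promote_alloc() hands us a
              * bounce rbio and may force a full, bounced read:
              */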
1688         promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
1689                                 &rbio, &bounce, &read_full);
1690
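             /*
              * Reading only part of the extent: adjust the pointer and crc so
              * the IO covers exactly the range that was asked for:
              */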
1691         if (!read_full) {
1692                 EBUG_ON(pick.crc.compression_type);
1693                 EBUG_ON(pick.crc.csum_type &&
1694                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1695                          bvec_iter_sectors(iter) != pick.crc.live_size ||
1696                          pick.crc.offset ||
1697                          offset_into_extent));
1698
1699                 pos.offset += offset_into_extent;
1700                 pick.ptr.offset += pick.crc.offset +
1701                         offset_into_extent;
1702                 offset_into_extent              = 0;
1703                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
1704                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
1705                 pick.crc.offset                 = 0;
1706                 pick.crc.live_size              = bvec_iter_sectors(iter);
1708         }
1709
1710         if (rbio) {
1711                 /*
1712                  * The promote path already allocated a bounce rbio for us:
1713                  * it needs a bio big enough for uncompressing the data in
1714                  * the write path, but we're not going to use all of it
1715                  * here:
1716                  */
1717                 BUG_ON(rbio->bio.bi_iter.bi_size <
1718                        pick.crc.compressed_size << 9);
1719                 rbio->bio.bi_iter.bi_size =
1720                         pick.crc.compressed_size << 9;
1721         } else if (bounce) {
1722                 unsigned sectors = pick.crc.compressed_size;
1723
1724                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
1725                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
1726                                                   &c->bio_read_split),
1727                                  orig->opts);
1728
1729                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1730                 rbio->bounce    = true;
1731                 rbio->split     = true;
1732         } else if (flags & BCH_READ_MUST_CLONE) {
1733                 /*
1734                  * Have to clone if there were any splits, due to error
1735                  * reporting issues: if a split errored and retrying didn't
1736                  * work, then when the error is reported to its parent (us)
1737                  * we can't tell whether it came from our bio (in which case
1738                  * we should retry) or from the whole bio (in which case
1739                  * retrying would lose the error).
1740                  */
1741                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
1742                                                 &c->bio_read_split),
1743                                  orig->opts);
1744                 rbio->bio.bi_iter = iter;
1745                 rbio->split     = true;
1746         } else {
1747 noclone:
1748                 rbio = orig;
1749                 rbio->bio.bi_iter = iter;
1750                 BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1751         }
1752
1753         BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
1754
1755         rbio->c                 = c;
1756         rbio->submit_time       = local_clock();
1757         if (rbio->split)
1758                 rbio->parent    = orig;
1759         else
1760                 rbio->end_io    = orig->bio.bi_end_io;
1761         rbio->bvec_iter         = iter;
1762         rbio->offset_into_extent = offset_into_extent;
1763         rbio->flags             = flags;
1764         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
1765         rbio->narrow_crcs       = narrow_crcs;
1766         rbio->hole              = 0;
1767         rbio->retry             = 0;
1768         rbio->context           = 0;
1769         rbio->devs_have         = bch2_bkey_devs(k);
1770         rbio->pick              = pick;
1771         rbio->pos               = pos;
1772         rbio->version           = k.k->version;
1773         rbio->promote           = promote;
1774         INIT_WORK(&rbio->work, NULL);
1775
1776         rbio->bio.bi_opf        = orig->bio.bi_opf;
1777         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
1778         rbio->bio.bi_end_io     = bch2_read_endio;
1779
1780         if (rbio->bounce)
1781                 trace_read_bounce(&rbio->bio);
1782
1783         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1784
1785         percpu_down_read(&c->mark_lock);
1786         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
1787         percpu_up_read(&c->mark_lock);
1788
1789         if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
1790                 bio_inc_remaining(&orig->bio);
1791                 trace_read_split(&orig->bio);
1792         }
1793
1794         if (!rbio->pick.idx) {
1795                 if (!rbio->have_ioref) {
1796                         __bcache_io_error(c, "no device to read from");
1797                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1798                         goto out;
1799                 }
1800
1801                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
1802                              bio_sectors(&rbio->bio));
1803                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
1804
1805                 if (likely(!(flags & BCH_READ_IN_RETRY)))
1806                         submit_bio(&rbio->bio);
1807                 else
1808                         submit_bio_wait(&rbio->bio);
1809         } else {
1810                 /* Attempting reconstruct read: */
1811                 if (bch2_ec_read_extent(c, rbio)) {
1812                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1813                         goto out;
1814                 }
1815
1816                 if (likely(!(flags & BCH_READ_IN_RETRY)))
1817                         bio_endio(&rbio->bio);
1818         }
1819 out:
1820         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1821                 return 0;
1822         } else {
1823                 int ret;
1824
1825                 rbio->context = RBIO_CONTEXT_UNBOUND;
1826                 bch2_read_endio(&rbio->bio);
1827
1828                 ret = rbio->retry;
1829                 rbio = bch2_rbio_free(rbio);
1830
1831                 if (ret == READ_RETRY_AVOID) {
1832                         bch2_mark_io_failure(failed, &pick);
1833                         ret = READ_RETRY;
1834                 }
1835
1836                 return ret;
1837         }
1838
1839 err:
1840         if (flags & BCH_READ_IN_RETRY)
1841                 return READ_ERR;
1842
1843         orig->bio.bi_status = BLK_STS_IOERR;
1844         goto out_read_done;
1845
1846 hole:
1847         /*
1848          * This won't normally happen in the BCH_READ_NODECODE
1849          * (bch2_move_extent()) path, but if we retry and the extent we
1850          * wanted to read no longer exists, we have to signal that:
1851          */
1852         if (flags & BCH_READ_NODECODE)
1853                 orig->hole = true;
1854
1855         zero_fill_bio_iter(&orig->bio, iter);
1856 out_read_done:
1857         if (flags & BCH_READ_LAST_FRAGMENT)
1858                 bch2_rbio_done(orig);
1859         return 0;
1860 }
1861
1862 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
1863 {
1864         struct btree_trans trans;
1865         struct btree_iter *iter;
1866         struct bkey_s_c k;
1867         unsigned flags = BCH_READ_RETRY_IF_STALE|
1868                 BCH_READ_MAY_PROMOTE|
1869                 BCH_READ_USER_MAPPED;
1870         int ret;
1871
1872         bch2_trans_init(&trans, c, 0, 0);
1873
1874         BUG_ON(rbio->_state);
1875         BUG_ON(flags & BCH_READ_NODECODE);
1876         BUG_ON(flags & BCH_READ_IN_RETRY);
1877
1878         rbio->c = c;
1879         rbio->start_time = local_clock();
1880
1881         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1882                                    POS(inode, rbio->bio.bi_iter.bi_sector),
1883                                    BTREE_ITER_SLOTS);
1884
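             /*
              * Walk the extents covering the requested range, issuing a read
              * for each fragment; the last fragment is flagged so completion
              * is only signalled once the whole request is done:
              */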
1885         while (1) {
1886                 BKEY_PADDED(k) tmp;
1887                 unsigned bytes, sectors, offset_into_extent;
1888
1889                 bch2_btree_iter_set_pos(iter,
1890                                 POS(inode, rbio->bio.bi_iter.bi_sector));
1891
1892                 k = bch2_btree_iter_peek_slot(iter);
1893                 ret = bkey_err(k);
1894                 if (ret)
1895                         goto err;
1896
1897                 bkey_reassemble(&tmp.k, k);
1898                 k = bkey_i_to_s_c(&tmp.k);
1899
1900                 offset_into_extent = iter->pos.offset -
1901                         bkey_start_offset(k.k);
1902                 sectors = k.k->size - offset_into_extent;
1903
1904                 ret = bch2_read_indirect_extent(&trans,
1905                                         &offset_into_extent, &tmp.k);
1906                 if (ret)
1907                         goto err;
1908
1909                 /*
1910                  * With indirect extents, the amount of data to read is the min
1911                  * of the original extent and the indirect extent:
1912                  */
1913                 sectors = min(sectors, k.k->size - offset_into_extent);
1914
1915                 /*
1916                  * Unlock the transaction (dropping our btree node locks)
1917                  * while the node is still in cache, before doing the IO:
1918                  */
1919                 bch2_trans_unlock(&trans);
1920
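                     /*
                      * Temporarily clamp the bio to just this fragment, issue
                      * the read, then restore the remaining size and advance
                      * past what was just read:
                      */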
1921                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
1922                 swap(rbio->bio.bi_iter.bi_size, bytes);
1923
1924                 if (rbio->bio.bi_iter.bi_size == bytes)
1925                         flags |= BCH_READ_LAST_FRAGMENT;
1926
1927                 bch2_read_extent(c, rbio, k, offset_into_extent, flags);
1928
1929                 if (flags & BCH_READ_LAST_FRAGMENT)
1930                         break;
1931
1932                 swap(rbio->bio.bi_iter.bi_size, bytes);
1933                 bio_advance(&rbio->bio, bytes);
1934         }
1935 out:
1936         bch2_trans_exit(&trans);
1937         return;
1938 err:
1939         bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
1940         bch2_rbio_done(rbio);
1941         goto out;
1942 }
1943
1944 void bch2_fs_io_exit(struct bch_fs *c)
1945 {
1946         if (c->promote_table.tbl)
1947                 rhashtable_destroy(&c->promote_table);
1948         mempool_exit(&c->bio_bounce_pages);
1949         bioset_exit(&c->bio_write);
1950         bioset_exit(&c->bio_read_split);
1951         bioset_exit(&c->bio_read);
1952 }
1953
1954 int bch2_fs_io_init(struct bch_fs *c)
1955 {
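             /*
              * Biosets for the read path (bio_read, plus bio_read_split for
              * bounced/cloned reads) and the write path, a page pool for
              * bounce buffers sized for the largest btree node or encoded
              * extent, and the hash table used to track in flight promotes:
              */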
1956         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
1957                         BIOSET_NEED_BVECS) ||
1958             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
1959                         BIOSET_NEED_BVECS) ||
1960             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
1961                         BIOSET_NEED_BVECS) ||
1962             mempool_init_page_pool(&c->bio_bounce_pages,
1963                                    max_t(unsigned,
1964                                          c->opts.btree_node_size,
1965                                          c->sb.encoded_extent_max) /
1966                                    PAGE_SECTORS, 0) ||
1967             rhashtable_init(&c->promote_table, &bch_promote_params))
1968                 return -ENOMEM;
1969
1970         return 0;
1971 }