libbcachefs/io.c
1 /*
2  * Some low level IO code, and hacks for various block layer limitations
3  *
4  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5  * Copyright 2012 Google, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "alloc_foreground.h"
10 #include "bset.h"
11 #include "btree_update.h"
12 #include "buckets.h"
13 #include "checksum.h"
14 #include "compress.h"
15 #include "clock.h"
16 #include "debug.h"
17 #include "disk_groups.h"
18 #include "ec.h"
19 #include "error.h"
20 #include "extents.h"
21 #include "io.h"
22 #include "journal.h"
23 #include "keylist.h"
24 #include "move.h"
25 #include "rebalance.h"
26 #include "super.h"
27 #include "super-io.h"
28
29 #include <linux/blkdev.h>
30 #include <linux/random.h>
31
32 #include <trace/events/bcachefs.h>
33
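/*
 * Decide whether reads should be steered away from @target: sum the
 * (time-decayed) congestion counters of the devices in the target and return
 * true with probability proportional to the average congestion.
 */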
34 static bool bch2_target_congested(struct bch_fs *c, u16 target)
35 {
36         const struct bch_devs_mask *devs;
37         unsigned d, nr = 0, total = 0;
38         u64 now = local_clock(), last;
39         s64 congested;
40         struct bch_dev *ca;
41
42         if (!target)
43                 return false;
44
45         rcu_read_lock();
46         devs = bch2_target_to_mask(c, target);
47         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
48                 ca = rcu_dereference(c->devs[d]);
49                 if (!ca)
50                         continue;
51
52                 congested = atomic_read(&ca->congested);
53                 last = READ_ONCE(ca->congested_last);
54                 if (time_after64(now, last))
55                         congested -= (now - last) >> 12;
56
57                 total += max(congested, 0LL);
58                 nr++;
59         }
60         rcu_read_unlock();
61
62         return bch2_rand_range(nr * CONGESTED_MAX) < total;
63 }
64
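/*
 * Track per-device congestion: if an IO took much longer than the device's
 * typical latency (taken from its latency quantiles), bump the congestion
 * counter (capped at CONGESTED_MAX); otherwise let it decay.
 */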
65 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
66                                        u64 now, int rw)
67 {
68         u64 latency_capable =
69                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
70         /* ideally we'd be taking into account the device's variance here: */
71         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
72         s64 latency_over = io_latency - latency_threshold;
73
74         if (latency_threshold && latency_over > 0) {
75                 /*
76                  * bump up congested by approximately latency_over * 4 /
77                  * latency_threshold - we don't need much accuracy here so don't
78                  * bother with the divide:
79                  */
80                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
81                         atomic_add(latency_over >>
82                                    max_t(int, ilog2(latency_threshold) - 2, 0),
83                                    &ca->congested);
84
85                 ca->congested_last = now;
86         } else if (atomic_read(&ca->congested) > 0) {
87                 atomic_dec(&ca->congested);
88         }
89 }
90
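/*
 * Update the device's exponentially weighted moving average of IO latency,
 * skipping the atomic update most of the time when the new sample is close
 * to the current average, then feed congestion accounting and time stats.
 */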
91 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
92 {
93         atomic64_t *latency = &ca->cur_latency[rw];
94         u64 now = local_clock();
95         u64 io_latency = time_after64(now, submit_time)
96                 ? now - submit_time
97                 : 0;
98         u64 old, new, v = atomic64_read(latency);
99
100         do {
101                 old = v;
102
103                 /*
104                  * If the io latency was reasonably close to the current
105                  * latency, skip doing the update and atomic operation - most of
106                  * the time:
107                  */
108                 if (abs((int) (old - io_latency)) < (old >> 1) &&
109                     now & ~(~0 << 5))
110                         break;
111
112                 new = ewma_add(old, io_latency, 5);
113         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
114
115         bch2_congested_acct(ca, io_latency, now, rw);
116
117         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
118 }
119
120 /* Allocate, free from mempool: */
121
122 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
123 {
124         struct bio_vec *bv;
125         unsigned i;
126
127         bio_for_each_segment_all(bv, bio, i)
128                 if (bv->bv_page != ZERO_PAGE(0))
129                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
130         bio->bi_vcnt = 0;
131 }
132
133 static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
134                                     bool *using_mempool)
135 {
136         struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
137
138         if (likely(!*using_mempool)) {
139                 bv->bv_page = alloc_page(GFP_NOIO);
140                 if (unlikely(!bv->bv_page)) {
141                         mutex_lock(&c->bio_bounce_pages_lock);
142                         *using_mempool = true;
143                         goto pool_alloc;
144
145                 }
146         } else {
147 pool_alloc:
148                 bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
149         }
150
151         bv->bv_len = PAGE_SIZE;
152         bv->bv_offset = 0;
153 }
154
155 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
156                                size_t bytes)
157 {
158         bool using_mempool = false;
159
160         BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
161
162         bio->bi_iter.bi_size = bytes;
163
164         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
165                 bch2_bio_alloc_page_pool(c, bio, &using_mempool);
166
167         if (using_mempool)
168                 mutex_unlock(&c->bio_bounce_pages_lock);
169 }
170
171 void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
172                                     size_t bytes)
173 {
174         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
175                 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
176
177                 BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
178
179                 bv->bv_page = alloc_page(GFP_NOIO);
180                 if (!bv->bv_page) {
181                         /*
182                          * We already allocated from the mempool; we can't allocate from it
183                          * again without freeing the pages we already allocated, or else we
184                          * could deadlock:
185                          */
186                         bch2_bio_free_pages_pool(c, bio);
187                         bch2_bio_alloc_pages_pool(c, bio, bytes);
188                         return;
189                 }
190
191                 bv->bv_len = PAGE_SIZE;
192                 bv->bv_offset = 0;
193                 bio->bi_vcnt++;
194         }
195
196         bio->bi_iter.bi_size = bytes;
197 }
198
199 /* Writes */
200
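/*
 * Submit one write per pointer in @k: every pointer but the last gets a
 * clone of @wbio pointed at the corresponding device and offset; pointers
 * whose device can't be grabbed are completed with BLK_STS_REMOVED.
 */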
201 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
202                                enum bch_data_type type,
203                                const struct bkey_i *k)
204 {
205         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
206         const struct bch_extent_ptr *ptr;
207         struct bch_write_bio *n;
208         struct bch_dev *ca;
209
210         BUG_ON(c->opts.nochanges);
211
212         bkey_for_each_ptr(ptrs, ptr) {
213                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
214                        !c->devs[ptr->dev]);
215
216                 ca = bch_dev_bkey_exists(c, ptr->dev);
217
218                 if (to_entry(ptr + 1) < ptrs.end) {
219                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
220                                                    &ca->replica_set));
221
222                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
223                         n->bio.bi_private       = wbio->bio.bi_private;
224                         n->parent               = wbio;
225                         n->split                = true;
226                         n->bounce               = false;
227                         n->put_bio              = true;
228                         n->bio.bi_opf           = wbio->bio.bi_opf;
229                         bio_inc_remaining(&wbio->bio);
230                 } else {
231                         n = wbio;
232                         n->split                = false;
233                 }
234
235                 n->c                    = c;
236                 n->dev                  = ptr->dev;
237                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
238                 n->submit_time          = local_clock();
239                 n->bio.bi_iter.bi_sector = ptr->offset;
240
241                 if (!journal_flushes_device(ca))
242                         n->bio.bi_opf |= REQ_FUA;
243
244                 if (likely(n->have_ioref)) {
245                         this_cpu_add(ca->io_done->sectors[WRITE][type],
246                                      bio_sectors(&n->bio));
247
248                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
249                         submit_bio(&n->bio);
250                 } else {
251                         n->bio.bi_status        = BLK_STS_REMOVED;
252                         bio_endio(&n->bio);
253                 }
254         }
255 }
256
257 static void __bch2_write(struct closure *);
258
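/*
 * Final write completion: pick up any journal error for FLUSH writes, drop
 * the disk reservation and the c->writes ref, free the keylist and record
 * timing before returning the closure.
 */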
259 static void bch2_write_done(struct closure *cl)
260 {
261         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
262         struct bch_fs *c = op->c;
263
264         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
265                 op->error = bch2_journal_error(&c->journal);
266
267         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
268                 bch2_disk_reservation_put(c, &op->res);
269         percpu_ref_put(&c->writes);
270         bch2_keylist_free(&op->insert_keys, op->inline_keys);
271
272         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
273
274         closure_return(cl);
275 }
276
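/*
 * Default index update function: walk the keylist, trimming each key to an
 * atomically insertable extent and committing it in a btree transaction,
 * until the keylist is empty or a commit fails.
 */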
277 int bch2_write_index_default(struct bch_write_op *op)
278 {
279         struct bch_fs *c = op->c;
280         struct btree_trans trans;
281         struct btree_iter *iter;
282         struct keylist *keys = &op->insert_keys;
283         int ret;
284
285         BUG_ON(bch2_keylist_empty(keys));
286         bch2_verify_keylist_sorted(keys);
287
288         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
289
290         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
291                                    bkey_start_pos(&bch2_keylist_front(keys)->k),
292                                    BTREE_ITER_INTENT);
293
294         do {
295                 BKEY_PADDED(k) split;
296
297                 bkey_copy(&split.k, bch2_keylist_front(keys));
298
299                 bch2_extent_trim_atomic(&split.k, iter);
300
301                 bch2_trans_update(&trans,
302                                   BTREE_INSERT_ENTRY(iter, &split.k));
303
304                 ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op),
305                                         BTREE_INSERT_NOFAIL|
306                                         BTREE_INSERT_USE_RESERVE);
307                 if (ret)
308                         break;
309
310                 if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0)
311                         bch2_cut_front(iter->pos, bch2_keylist_front(keys));
312                 else
313                         bch2_keylist_pop_front(keys);
314         } while (!bch2_keylist_empty(keys));
315
316         bch2_trans_exit(&trans);
317
318         return ret;
319 }
320
321 /**
322  * __bch2_write_index - after a write, update index to point to new data
323  */
324 static void __bch2_write_index(struct bch_write_op *op)
325 {
326         struct bch_fs *c = op->c;
327         struct keylist *keys = &op->insert_keys;
328         struct bch_extent_ptr *ptr;
329         struct bkey_i *src, *dst = keys->keys, *n, *k;
330         unsigned dev;
331         int ret;
332
333         for (src = keys->keys; src != keys->top; src = n) {
334                 n = bkey_next(src);
335                 bkey_copy(dst, src);
336
337                 bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
338                         test_bit(ptr->dev, op->failed.d));
339
340                 if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
341                         ret = -EIO;
342                         goto err;
343                 }
344
345                 dst = bkey_next(dst);
346         }
347
348         keys->top = dst;
349
350         /*
351          * probably not the ideal place to hook this in, but I don't
352          * particularly want to plumb io_opts all the way through the btree
353          * update stack right now
354          */
355         for_each_keylist_key(keys, k)
356                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
357
358         if (!bch2_keylist_empty(keys)) {
359                 u64 sectors_start = keylist_sectors(keys);
360                 int ret = op->index_update_fn(op);
361
362                 BUG_ON(keylist_sectors(keys) && !ret);
363
364                 op->written += sectors_start - keylist_sectors(keys);
365
366                 if (ret) {
367                         __bcache_io_error(c, "btree IO error %i", ret);
368                         op->error = ret;
369                 }
370         }
371 out:
372         /* If a bucket wasn't written, we can't erasure code it: */
373         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
374                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
375
376         bch2_open_buckets_put(c, &op->open_buckets);
377         return;
378 err:
379         keys->top = keys->keys;
380         op->error = ret;
381         goto out;
382 }
383
384 static void bch2_write_index(struct closure *cl)
385 {
386         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
387         struct bch_fs *c = op->c;
388
389         __bch2_write_index(op);
390
391         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
392                 bch2_journal_flush_seq_async(&c->journal,
393                                              *op_journal_seq(op),
394                                              cl);
395                 continue_at(cl, bch2_write_done, index_update_wq(op));
396         } else {
397                 continue_at_nobarrier(cl, bch2_write_done, NULL);
398         }
399 }
400
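/*
 * Per-replica write completion: record an IO error against the device if
 * there was one, do latency accounting, free bounce pages and split bios,
 * then complete the parent bio or drop the op's closure ref.
 */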
401 static void bch2_write_endio(struct bio *bio)
402 {
403         struct closure *cl              = bio->bi_private;
404         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
405         struct bch_write_bio *wbio      = to_wbio(bio);
406         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
407         struct bch_fs *c                = wbio->c;
408         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
409
410         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
411                 set_bit(wbio->dev, op->failed.d);
412
413         if (wbio->have_ioref) {
414                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
415                 percpu_ref_put(&ca->io_ref);
416         }
417
418         if (wbio->bounce)
419                 bch2_bio_free_pages_pool(c, bio);
420
421         if (wbio->put_bio)
422                 bio_put(bio);
423
424         if (parent)
425                 bio_endio(&parent->bio);
426         else
427                 closure_put(cl);
428 }
429
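/*
 * Append an extent key for the data just written: advance op->pos and add
 * one pointer per open bucket at the write point, marking pointers cached if
 * the device has no durability or this is a BCH_WRITE_CACHED write.
 */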
430 static void init_append_extent(struct bch_write_op *op,
431                                struct write_point *wp,
432                                struct bversion version,
433                                struct bch_extent_crc_unpacked crc)
434 {
435         struct bch_fs *c = op->c;
436         struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
437         struct extent_ptr_decoded p = { .crc = crc };
438         struct open_bucket *ob;
439         unsigned i;
440
441         op->pos.offset += crc.uncompressed_size;
442         e->k.p          = op->pos;
443         e->k.size       = crc.uncompressed_size;
444         e->k.version    = version;
445
446         BUG_ON(crc.compressed_size > wp->sectors_free);
447         wp->sectors_free -= crc.compressed_size;
448
449         open_bucket_for_each(c, &wp->ptrs, ob, i) {
450                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
451
452                 p.ptr = ob->ptr;
453                 p.ptr.cached = !ca->mi.durability ||
454                         (op->flags & BCH_WRITE_CACHED) != 0;
455                 p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
456                 bch2_extent_ptr_decoded_append(e, &p);
457
458                 BUG_ON(crc.compressed_size > ob->sectors_free);
459                 ob->sectors_free -= crc.compressed_size;
460         }
461
462         bch2_keylist_push(&op->insert_keys);
463 }
464
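/*
 * Allocate the destination bio for a write, sized to the smaller of the
 * write point's free space and the source bio: if @buf (an erasure coding
 * buffer) is supplied it is mapped directly, otherwise pages are allocated,
 * falling back to the bounce mempool on allocation failure.
 */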
465 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
466                                         struct write_point *wp,
467                                         struct bio *src,
468                                         bool *page_alloc_failed,
469                                         void *buf)
470 {
471         struct bch_write_bio *wbio;
472         struct bio *bio;
473         unsigned output_available =
474                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
475         unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
476
477         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
478         wbio                    = wbio_init(bio);
479         wbio->put_bio           = true;
480         /* copy WRITE_SYNC flag */
481         wbio->bio.bi_opf        = src->bi_opf;
482
483         if (buf) {
484                 bio->bi_iter.bi_size = output_available;
485                 bch2_bio_map(bio, buf);
486                 return bio;
487         }
488
489         wbio->bounce            = true;
490
491         /*
492          * We can't use mempool for more than c->sb.encoded_extent_max
493          * worth of pages, but we'd like to allocate more if we can:
494          */
495         while (bio->bi_iter.bi_size < output_available) {
496                 unsigned len = min_t(unsigned, PAGE_SIZE,
497                                      output_available - bio->bi_iter.bi_size);
498                 struct page *p;
499
500                 p = alloc_page(GFP_NOIO);
501                 if (!p) {
502                         unsigned pool_max =
503                                 min_t(unsigned, output_available,
504                                       c->sb.encoded_extent_max << 9);
505
506                         if (bio_sectors(bio) < pool_max)
507                                 bch2_bio_alloc_pages_pool(c, bio, pool_max);
508                         break;
509                 }
510
511                 bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
512                         .bv_page        = p,
513                         .bv_len         = len,
514                         .bv_offset      = 0,
515                 };
516                 bio->bi_iter.bi_size += len;
517         }
518
519         *page_alloc_failed = bio->bi_vcnt < pages;
520         return bio;
521 }
522
523 static int bch2_write_rechecksum(struct bch_fs *c,
524                                  struct bch_write_op *op,
525                                  unsigned new_csum_type)
526 {
527         struct bio *bio = &op->wbio.bio;
528         struct bch_extent_crc_unpacked new_crc;
529         int ret;
530
531         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
532
533         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
534             bch2_csum_type_is_encryption(new_csum_type))
535                 new_csum_type = op->crc.csum_type;
536
537         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
538                                   NULL, &new_crc,
539                                   op->crc.offset, op->crc.live_size,
540                                   new_csum_type);
541         if (ret)
542                 return ret;
543
544         bio_advance(bio, op->crc.offset << 9);
545         bio->bi_iter.bi_size = op->crc.live_size << 9;
546         op->crc = new_crc;
547         return 0;
548 }
549
550 static int bch2_write_decrypt(struct bch_write_op *op)
551 {
552         struct bch_fs *c = op->c;
553         struct nonce nonce = extent_nonce(op->version, op->crc);
554         struct bch_csum csum;
555
556         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
557                 return 0;
558
559         /*
560          * If we need to decrypt data in the write path, we'll no longer be able
561          * to verify the existing checksum (poly1305 mac, in this case) after
562          * it's decrypted - this is the last point we'll be able to reverify the
563          * checksum:
564          */
565         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
566         if (bch2_crc_cmp(op->crc.csum, csum))
567                 return -EIO;
568
569         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
570         op->crc.csum_type = 0;
571         op->crc.csum = (struct bch_csum) { 0, 0 };
572         return 0;
573 }
574
575 static enum prep_encoded_ret {
576         PREP_ENCODED_OK,
577         PREP_ENCODED_ERR,
578         PREP_ENCODED_CHECKSUM_ERR,
579         PREP_ENCODED_DO_WRITE,
580 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
581 {
582         struct bch_fs *c = op->c;
583         struct bio *bio = &op->wbio.bio;
584
585         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
586                 return PREP_ENCODED_OK;
587
588         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
589
590         /* Can we just write the entire extent as is? */
591         if (op->crc.uncompressed_size == op->crc.live_size &&
592             op->crc.compressed_size <= wp->sectors_free &&
593             op->crc.compression_type == op->compression_type) {
594                 if (!op->crc.compression_type &&
595                     op->csum_type != op->crc.csum_type &&
596                     bch2_write_rechecksum(c, op, op->csum_type))
597                         return PREP_ENCODED_CHECKSUM_ERR;
598
599                 return PREP_ENCODED_DO_WRITE;
600         }
601
602         /*
603          * If the data is compressed and we couldn't write the entire extent as
604          * is, we have to decompress it:
605          */
606         if (op->crc.compression_type) {
607                 struct bch_csum csum;
608
609                 if (bch2_write_decrypt(op))
610                         return PREP_ENCODED_CHECKSUM_ERR;
611
612                 /* Last point we can still verify checksum: */
613                 csum = bch2_checksum_bio(c, op->crc.csum_type,
614                                          extent_nonce(op->version, op->crc),
615                                          bio);
616                 if (bch2_crc_cmp(op->crc.csum, csum))
617                         return PREP_ENCODED_CHECKSUM_ERR;
618
619                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
620                         return PREP_ENCODED_ERR;
621         }
622
623         /*
624          * No longer have compressed data after this point - data might be
625          * encrypted:
626          */
627
628         /*
629          * If the data is checksummed and we're only writing a subset,
630          * rechecksum and adjust bio to point to currently live data:
631          */
632         if ((op->crc.live_size != op->crc.uncompressed_size ||
633              op->crc.csum_type != op->csum_type) &&
634             bch2_write_rechecksum(c, op, op->csum_type))
635                 return PREP_ENCODED_CHECKSUM_ERR;
636
637         /*
638          * If we want to compress the data, it has to be decrypted:
639          */
640         if ((op->compression_type ||
641              bch2_csum_type_is_encryption(op->crc.csum_type) !=
642              bch2_csum_type_is_encryption(op->csum_type)) &&
643             bch2_write_decrypt(op))
644                 return PREP_ENCODED_CHECKSUM_ERR;
645
646         return PREP_ENCODED_OK;
647 }
648
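/*
 * Core of the write path: take as much of the source bio as fits in the
 * write point, optionally bouncing, compressing, checksumming and encrypting
 * it, build the corresponding extent keys and submit the replicas. Returns
 * > 0 if there is more data to write, 0 if done, or a negative error.
 */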
649 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
650 {
651         struct bch_fs *c = op->c;
652         struct bio *src = &op->wbio.bio, *dst = src;
653         struct bvec_iter saved_iter;
654         struct bkey_i *key_to_write;
655         void *ec_buf;
656         unsigned key_to_write_offset = op->insert_keys.top_p -
657                 op->insert_keys.keys_p;
658         unsigned total_output = 0, total_input = 0;
659         bool bounce = false;
660         bool page_alloc_failed = false;
661         int ret, more = 0;
662
663         BUG_ON(!bio_sectors(src));
664
665         ec_buf = bch2_writepoint_ec_buf(c, wp);
666
667         switch (bch2_write_prep_encoded_data(op, wp)) {
668         case PREP_ENCODED_OK:
669                 break;
670         case PREP_ENCODED_ERR:
671                 ret = -EIO;
672                 goto err;
673         case PREP_ENCODED_CHECKSUM_ERR:
674                 goto csum_err;
675         case PREP_ENCODED_DO_WRITE:
676                 if (ec_buf) {
677                         dst = bch2_write_bio_alloc(c, wp, src,
678                                                    &page_alloc_failed,
679                                                    ec_buf);
680                         bio_copy_data(dst, src);
681                         bounce = true;
682                 }
683                 init_append_extent(op, wp, op->version, op->crc);
684                 goto do_write;
685         }
686
687         if (ec_buf ||
688             op->compression_type ||
689             (op->csum_type &&
690              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
691             (bch2_csum_type_is_encryption(op->csum_type) &&
692              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
693                 dst = bch2_write_bio_alloc(c, wp, src,
694                                            &page_alloc_failed,
695                                            ec_buf);
696                 bounce = true;
697         }
698
699         saved_iter = dst->bi_iter;
700
701         do {
702                 struct bch_extent_crc_unpacked crc =
703                         (struct bch_extent_crc_unpacked) { 0 };
704                 struct bversion version = op->version;
705                 size_t dst_len, src_len;
706
707                 if (page_alloc_failed &&
708                     bio_sectors(dst) < wp->sectors_free &&
709                     bio_sectors(dst) < c->sb.encoded_extent_max)
710                         break;
711
712                 BUG_ON(op->compression_type &&
713                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
714                        bch2_csum_type_is_encryption(op->crc.csum_type));
715                 BUG_ON(op->compression_type && !bounce);
716
717                 crc.compression_type = op->compression_type
718                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
719                                              op->compression_type)
720                         : 0;
721                 if (!crc.compression_type) {
722                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
723                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
724
725                         if (op->csum_type)
726                                 dst_len = min_t(unsigned, dst_len,
727                                                 c->sb.encoded_extent_max << 9);
728
729                         if (bounce) {
730                                 swap(dst->bi_iter.bi_size, dst_len);
731                                 bio_copy_data(dst, src);
732                                 swap(dst->bi_iter.bi_size, dst_len);
733                         }
734
735                         src_len = dst_len;
736                 }
737
738                 BUG_ON(!src_len || !dst_len);
739
740                 if (bch2_csum_type_is_encryption(op->csum_type)) {
741                         if (bversion_zero(version)) {
742                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
743                         } else {
744                                 crc.nonce = op->nonce;
745                                 op->nonce += src_len >> 9;
746                         }
747                 }
748
749                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
750                     !crc.compression_type &&
751                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
752                     bch2_csum_type_is_encryption(op->csum_type)) {
753                         /*
754                          * Note: when we're using rechecksum(), we need to be
755                          * checksumming @src because it has all the data our
756                          * existing checksum covers - if we bounced (because we
757                          * were trying to compress), @dst will only have the
758                          * part of the data the new checksum will cover.
759                          *
760                          * But normally we want to be checksumming post bounce,
761                          * because part of the reason for bouncing is so the
762                          * data can't be modified (by userspace) while it's in
763                          * flight.
764                          */
765                         if (bch2_rechecksum_bio(c, src, version, op->crc,
766                                         &crc, &op->crc,
767                                         src_len >> 9,
768                                         bio_sectors(src) - (src_len >> 9),
769                                         op->csum_type))
770                                 goto csum_err;
771                 } else {
772                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
773                             bch2_rechecksum_bio(c, src, version, op->crc,
774                                         NULL, &op->crc,
775                                         src_len >> 9,
776                                         bio_sectors(src) - (src_len >> 9),
777                                         op->crc.csum_type))
778                                 goto csum_err;
779
780                         crc.compressed_size     = dst_len >> 9;
781                         crc.uncompressed_size   = src_len >> 9;
782                         crc.live_size           = src_len >> 9;
783
784                         swap(dst->bi_iter.bi_size, dst_len);
785                         bch2_encrypt_bio(c, op->csum_type,
786                                          extent_nonce(version, crc), dst);
787                         crc.csum = bch2_checksum_bio(c, op->csum_type,
788                                          extent_nonce(version, crc), dst);
789                         crc.csum_type = op->csum_type;
790                         swap(dst->bi_iter.bi_size, dst_len);
791                 }
792
793                 init_append_extent(op, wp, version, crc);
794
795                 if (dst != src)
796                         bio_advance(dst, dst_len);
797                 bio_advance(src, src_len);
798                 total_output    += dst_len;
799                 total_input     += src_len;
800         } while (dst->bi_iter.bi_size &&
801                  src->bi_iter.bi_size &&
802                  wp->sectors_free &&
803                  !bch2_keylist_realloc(&op->insert_keys,
804                                       op->inline_keys,
805                                       ARRAY_SIZE(op->inline_keys),
806                                       BKEY_EXTENT_U64s_MAX));
807
808         more = src->bi_iter.bi_size != 0;
809
810         dst->bi_iter = saved_iter;
811
812         if (dst == src && more) {
813                 BUG_ON(total_output != total_input);
814
815                 dst = bio_split(src, total_input >> 9,
816                                 GFP_NOIO, &c->bio_write);
817                 wbio_init(dst)->put_bio = true;
818                 /* copy WRITE_SYNC flag */
819                 dst->bi_opf             = src->bi_opf;
820         }
821
822         dst->bi_iter.bi_size = total_output;
823
824         /* Free unneeded pages after compressing: */
825         if (to_wbio(dst)->bounce)
826                 while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
827                         mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
828                                      &c->bio_bounce_pages);
829 do_write:
830         /* might have done a realloc... */
831
832         key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
833
834         bch2_ec_add_backpointer(c, wp,
835                                 bkey_start_pos(&key_to_write->k),
836                                 total_input >> 9);
837
838         dst->bi_end_io  = bch2_write_endio;
839         dst->bi_private = &op->cl;
840         bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
841
842         closure_get(dst->bi_private);
843
844         bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
845                                   key_to_write);
846         return more;
847 csum_err:
848         bch_err(c, "error verifying existing checksum while "
849                 "rewriting existing data (memory corruption?)");
850         ret = -EIO;
851 err:
852         if (to_wbio(dst)->bounce)
853                 bch2_bio_free_pages_pool(c, dst);
854         if (to_wbio(dst)->put_bio)
855                 bio_put(dst);
856
857         return ret;
858 }
859
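/*
 * Main write loop: repeatedly allocate sectors from a write point and write
 * extents until the source bio is consumed, punting to the index update when
 * done, on error, or when we would have to block.
 */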
860 static void __bch2_write(struct closure *cl)
861 {
862         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
863         struct bch_fs *c = op->c;
864         struct write_point *wp;
865         int ret;
866 again:
867         memset(&op->failed, 0, sizeof(op->failed));
868
869         do {
870                 /* +1 for possible cache device: */
871                 if (op->open_buckets.nr + op->nr_replicas + 1 >
872                     ARRAY_SIZE(op->open_buckets.v))
873                         goto flush_io;
874
875                 if (bch2_keylist_realloc(&op->insert_keys,
876                                         op->inline_keys,
877                                         ARRAY_SIZE(op->inline_keys),
878                                         BKEY_EXTENT_U64s_MAX))
879                         goto flush_io;
880
881                 wp = bch2_alloc_sectors_start(c,
882                         op->target,
883                         op->opts.erasure_code,
884                         op->write_point,
885                         &op->devs_have,
886                         op->nr_replicas,
887                         op->nr_replicas_required,
888                         op->alloc_reserve,
889                         op->flags,
890                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
891                 EBUG_ON(!wp);
892
893                 if (unlikely(IS_ERR(wp))) {
894                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
895                                 ret = PTR_ERR(wp);
896                                 goto err;
897                         }
898
899                         goto flush_io;
900                 }
901
902                 ret = bch2_write_extent(op, wp);
903
904                 bch2_open_bucket_get(c, wp, &op->open_buckets);
905                 bch2_alloc_sectors_done(c, wp);
906
907                 if (ret < 0)
908                         goto err;
909         } while (ret);
910
911         continue_at(cl, bch2_write_index, index_update_wq(op));
912         return;
913 err:
914         op->error = ret;
915
916         continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
917                     ? bch2_write_index
918                     : bch2_write_done, index_update_wq(op));
919         return;
920 flush_io:
921         closure_sync(cl);
922
923         if (!bch2_keylist_empty(&op->insert_keys)) {
924                 __bch2_write_index(op);
925
926                 if (op->error) {
927                         continue_at_nobarrier(cl, bch2_write_done, NULL);
928                         return;
929                 }
930         }
931
932         goto again;
933 }
934
935 /**
936  * bch2_write - handle a write to a cache device or flash only volume
937  *
938  * This is the starting point for any data to end up in a cache device; it could
939  * be from a normal write, or a writeback write, or a write to a flash only
940  * volume - it's also used by the moving garbage collector to compact data in
941  * mostly empty buckets.
942  *
943  * It first writes the data to the cache, creating a list of keys to be inserted
944  * (if the data won't fit in a single open bucket, there will be multiple keys);
945  * after the data is written the keys are journalled; once they have been
946  * added to the next journal write they're inserted into the btree.
947  *
948  * If op->discard is true, instead of inserting the data it invalidates the
949  * region of the cache represented by op->bio and op->inode.
950  */
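/*
 * Rough usage sketch (see e.g. the promote path below, which drives this via
 * a migrate write): fill in a struct bch_write_op - op->c, op->pos,
 * op->nr_replicas and the data in op->wbio.bio - then kick it off with
 * closure_call(&op->cl, bch2_write, wq, &parent_cl).
 */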
951 void bch2_write(struct closure *cl)
952 {
953         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
954         struct bch_fs *c = op->c;
955
956         BUG_ON(!op->nr_replicas);
957         BUG_ON(!op->write_point.v);
958         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
959         BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
960
961         op->start_time = local_clock();
962
963         bch2_keylist_init(&op->insert_keys, op->inline_keys);
964         wbio_init(&op->wbio.bio)->put_bio = false;
965
966         if (c->opts.nochanges ||
967             !percpu_ref_tryget(&c->writes)) {
968                 __bcache_io_error(c, "read only");
969                 op->error = -EROFS;
970                 if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
971                         bch2_disk_reservation_put(c, &op->res);
972                 closure_return(cl);
973                 return;
974         }
975
976         bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
977
978         continue_at_nobarrier(cl, __bch2_write, NULL);
979 }
980
981 /* Cache promotion on read */
982
983 struct promote_op {
984         struct closure          cl;
985         struct rcu_head         rcu;
986         u64                     start_time;
987
988         struct rhash_head       hash;
989         struct bpos             pos;
990
991         struct migrate_write    write;
992         struct bio_vec          bi_inline_vecs[0]; /* must be last */
993 };
994
995 static const struct rhashtable_params bch_promote_params = {
996         .head_offset    = offsetof(struct promote_op, hash),
997         .key_offset     = offsetof(struct promote_op, pos),
998         .key_len        = sizeof(struct bpos),
999 };
1000
1001 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1002                                   struct bpos pos,
1003                                   struct bch_io_opts opts,
1004                                   unsigned flags)
1005 {
1006         if (!opts.promote_target)
1007                 return false;
1008
1009         if (!(flags & BCH_READ_MAY_PROMOTE))
1010                 return false;
1011
1012         if (percpu_ref_is_dying(&c->writes))
1013                 return false;
1014
1015         if (!bkey_extent_is_data(k.k))
1016                 return false;
1017
1018         if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
1019                 return false;
1020
1021         if (bch2_target_congested(c, opts.promote_target))
1022                 return false;
1023
1024         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1025                                    bch_promote_params))
1026                 return false;
1027
1028         return true;
1029 }
1030
1031 static void promote_free(struct bch_fs *c, struct promote_op *op)
1032 {
1033         int ret;
1034
1035         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1036                                      bch_promote_params);
1037         BUG_ON(ret);
1038         percpu_ref_put(&c->writes);
1039         kfree_rcu(op, rcu);
1040 }
1041
1042 static void promote_done(struct closure *cl)
1043 {
1044         struct promote_op *op =
1045                 container_of(cl, struct promote_op, cl);
1046         struct bch_fs *c = op->write.op.c;
1047
1048         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1049                                op->start_time);
1050
1051         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1052         promote_free(c, op);
1053 }
1054
1055 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1056 {
1057         struct bch_fs *c = rbio->c;
1058         struct closure *cl = &op->cl;
1059         struct bio *bio = &op->write.op.wbio.bio;
1060
1061         trace_promote(&rbio->bio);
1062
1063         /* we now own pages: */
1064         BUG_ON(!rbio->bounce);
1065         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1066
1067         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1068                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1069         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1070
1071         bch2_migrate_read_done(&op->write, rbio);
1072
1073         closure_init(cl, NULL);
1074         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1075         closure_return_with_destructor(cl, promote_done);
1076 }
1077
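/*
 * Allocate a promote operation (plus a separate kmalloc'd read bio when the
 * extent is too big for the bounce mempool), register it in the promote hash
 * table and set up the migrate write that will perform the promotion.
 */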
1078 noinline
1079 static struct promote_op *__promote_alloc(struct bch_fs *c,
1080                                           struct bpos pos,
1081                                           struct extent_ptr_decoded *pick,
1082                                           struct bch_io_opts opts,
1083                                           unsigned rbio_sectors,
1084                                           struct bch_read_bio **rbio)
1085 {
1086         struct promote_op *op = NULL;
1087         struct bio *bio;
1088         unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
1089         /* data might have to be decompressed in the write path: */
1090         unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
1091                                            PAGE_SECTORS);
1092         int ret;
1093
1094         if (!percpu_ref_tryget(&c->writes))
1095                 return NULL;
1096
1097         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
1098                      GFP_NOIO);
1099         if (!op)
1100                 goto err;
1101
1102         op->start_time = local_clock();
1103         op->pos = pos;
1104
1105         /*
1106          * promotes require bouncing, but if the extent isn't
1107          * checksummed/compressed it might be too big for the mempool:
1108          */
1109         if (rbio_sectors > c->sb.encoded_extent_max) {
1110                 *rbio = kzalloc(sizeof(struct bch_read_bio) +
1111                                 sizeof(struct bio_vec) * rbio_pages,
1112                                 GFP_NOIO);
1113                 if (!*rbio)
1114                         goto err;
1115
1116                 rbio_init(&(*rbio)->bio, opts);
1117                 bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs,
1118                          rbio_pages);
1119
1120                 (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
1121                 bch2_bio_map(&(*rbio)->bio, NULL);
1122
1123                 if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
1124                         goto err;
1125
1126                 (*rbio)->bounce         = true;
1127                 (*rbio)->split          = true;
1128                 (*rbio)->kmalloc        = true;
1129         }
1130
1131         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1132                                           bch_promote_params))
1133                 goto err;
1134
1135         bio = &op->write.op.wbio.bio;
1136         bio_init(bio, bio->bi_inline_vecs, wbio_pages);
1137
1138         ret = bch2_migrate_write_init(c, &op->write,
1139                         writepoint_hashed((unsigned long) current),
1140                         opts,
1141                         DATA_PROMOTE,
1142                         (struct data_opts) {
1143                                 .target = opts.promote_target
1144                         },
1145                         bkey_s_c_null);
1146         BUG_ON(ret);
1147
1148         return op;
1149 err:
1150         if (*rbio)
1151                 bio_free_pages(&(*rbio)->bio);
1152         kfree(*rbio);
1153         *rbio = NULL;
1154         kfree(op);
1155         percpu_ref_put(&c->writes);
1156         return NULL;
1157 }
1158
1159 static inline struct promote_op *promote_alloc(struct bch_fs *c,
1160                                                struct bvec_iter iter,
1161                                                struct bkey_s_c k,
1162                                                struct extent_ptr_decoded *pick,
1163                                                struct bch_io_opts opts,
1164                                                unsigned flags,
1165                                                struct bch_read_bio **rbio,
1166                                                bool *bounce,
1167                                                bool *read_full)
1168 {
1169         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1170         unsigned sectors = promote_full
1171                 ? pick->crc.compressed_size
1172                 : bvec_iter_sectors(iter);
1173         struct bpos pos = promote_full
1174                 ? bkey_start_pos(k.k)
1175                 : POS(k.k->p.inode, iter.bi_sector);
1176         struct promote_op *promote;
1177
1178         if (!should_promote(c, k, pos, opts, flags))
1179                 return NULL;
1180
1181         promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
1182         if (!promote)
1183                 return NULL;
1184
1185         *bounce         = true;
1186         *read_full      = promote_full;
1187         return promote;
1188 }
1189
1190 /* Read */
1191
1192 #define READ_RETRY_AVOID        1
1193 #define READ_RETRY              2
1194 #define READ_ERR                3
1195
1196 enum rbio_context {
1197         RBIO_CONTEXT_NULL,
1198         RBIO_CONTEXT_HIGHPRI,
1199         RBIO_CONTEXT_UNBOUND,
1200 };
1201
1202 static inline struct bch_read_bio *
1203 bch2_rbio_parent(struct bch_read_bio *rbio)
1204 {
1205         return rbio->split ? rbio->parent : rbio;
1206 }
1207
1208 __always_inline
1209 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1210                            enum rbio_context context,
1211                            struct workqueue_struct *wq)
1212 {
1213         if (context <= rbio->context) {
1214                 fn(&rbio->work);
1215         } else {
1216                 rbio->work.func         = fn;
1217                 rbio->context           = context;
1218                 queue_work(wq, &rbio->work);
1219         }
1220 }
1221
1222 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1223 {
1224         BUG_ON(rbio->bounce && !rbio->split);
1225
1226         if (rbio->promote)
1227                 promote_free(rbio->c, rbio->promote);
1228         rbio->promote = NULL;
1229
1230         if (rbio->bounce)
1231                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1232
1233         if (rbio->split) {
1234                 struct bch_read_bio *parent = rbio->parent;
1235
1236                 if (rbio->kmalloc)
1237                         kfree(rbio);
1238                 else
1239                         bio_put(&rbio->bio);
1240
1241                 rbio = parent;
1242         }
1243
1244         return rbio;
1245 }
1246
1247 static void bch2_rbio_done(struct bch_read_bio *rbio)
1248 {
1249         bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1250                                rbio->start_time);
1251         bio_endio(&rbio->bio);
1252 }
1253
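/*
 * Retry path for BCH_READ_NODECODE reads: re-look up the extent at rbio->pos
 * and re-read it; if the extent we wanted no longer exists, mark the read as
 * a hole instead of an error.
 */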
1254 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1255                                      struct bvec_iter bvec_iter, u64 inode,
1256                                      struct bch_io_failures *failed,
1257                                      unsigned flags)
1258 {
1259         struct btree_trans trans;
1260         struct btree_iter *iter;
1261         BKEY_PADDED(k) tmp;
1262         struct bkey_s_c k;
1263         int ret;
1264
1265         flags &= ~BCH_READ_LAST_FRAGMENT;
1266
1267         bch2_trans_init(&trans, c, 0, 0);
1268
1269         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1270                                    rbio->pos, BTREE_ITER_SLOTS);
1271 retry:
1272         rbio->bio.bi_status = 0;
1273
1274         k = bch2_btree_iter_peek_slot(iter);
1275         if (bkey_err(k))
1276                 goto err;
1277
1278         bkey_reassemble(&tmp.k, k);
1279         k = bkey_i_to_s_c(&tmp.k);
1280         bch2_trans_unlock(&trans);
1281
1282         if (!bkey_extent_is_data(k.k) ||
1283             !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
1284                                      rbio->pick.ptr,
1285                                      rbio->pos.offset -
1286                                      rbio->pick.crc.offset)) {
1287                 /* extent we wanted to read no longer exists: */
1288                 rbio->hole = true;
1289                 goto out;
1290         }
1291
1292         ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1293         if (ret == READ_RETRY)
1294                 goto retry;
1295         if (ret)
1296                 goto err;
1297 out:
1298         bch2_rbio_done(rbio);
1299         bch2_trans_exit(&trans);
1300         return;
1301 err:
1302         rbio->bio.bi_status = BLK_STS_IOERR;
1303         goto out;
1304 }
1305
1306 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1307                             struct bvec_iter bvec_iter, u64 inode,
1308                             struct bch_io_failures *failed, unsigned flags)
1309 {
1310         struct btree_trans trans;
1311         struct btree_iter *iter;
1312         struct bkey_s_c k;
1313         int ret;
1314
1315         bch2_trans_init(&trans, c, 0, 0);
1316
1317         flags &= ~BCH_READ_LAST_FRAGMENT;
1318         flags |= BCH_READ_MUST_CLONE;
1319 retry:
1320         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1321                            POS(inode, bvec_iter.bi_sector),
1322                            BTREE_ITER_SLOTS, k, ret) {
1323                 BKEY_PADDED(k) tmp;
1324                 unsigned bytes;
1325
1326                 bkey_reassemble(&tmp.k, k);
1327                 k = bkey_i_to_s_c(&tmp.k);
1328                 bch2_trans_unlock(&trans);
1329
1330                 bytes = min_t(unsigned, bvec_iter.bi_size,
1331                               (k.k->p.offset - bvec_iter.bi_sector) << 9);
1332                 swap(bvec_iter.bi_size, bytes);
1333
1334                 ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1335                 switch (ret) {
1336                 case READ_RETRY:
1337                         goto retry;
1338                 case READ_ERR:
1339                         goto err;
1340                 }
1341
1342                 if (bytes == bvec_iter.bi_size)
1343                         goto out;
1344
1345                 swap(bvec_iter.bi_size, bytes);
1346                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1347         }
1348
1349         /*
1350          * If we get here, it better have been because there was an error
1351          * reading a btree node
1352          */
1353         BUG_ON(!ret);
1354         __bcache_io_error(c, "btree IO error: %i", ret);
1355 err:
1356         rbio->bio.bi_status = BLK_STS_IOERR;
1357 out:
1358         bch2_trans_exit(&trans);
1359         bch2_rbio_done(rbio);
1360 }
1361
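/*
 * Read retry worker: re-issue the read with BCH_READ_IN_RETRY set and
 * promotion disabled, avoiding the device that just failed when the retry
 * reason was READ_RETRY_AVOID.
 */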
1362 static void bch2_rbio_retry(struct work_struct *work)
1363 {
1364         struct bch_read_bio *rbio =
1365                 container_of(work, struct bch_read_bio, work);
1366         struct bch_fs *c        = rbio->c;
1367         struct bvec_iter iter   = rbio->bvec_iter;
1368         unsigned flags          = rbio->flags;
1369         u64 inode               = rbio->pos.inode;
1370         struct bch_io_failures failed = { .nr = 0 };
1371
1372         trace_read_retry(&rbio->bio);
1373
1374         if (rbio->retry == READ_RETRY_AVOID)
1375                 bch2_mark_io_failure(&failed, &rbio->pick);
1376
1377         rbio->bio.bi_status = 0;
1378
1379         rbio = bch2_rbio_free(rbio);
1380
1381         flags |= BCH_READ_IN_RETRY;
1382         flags &= ~BCH_READ_MAY_PROMOTE;
1383
1384         if (flags & BCH_READ_NODECODE)
1385                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1386         else
1387                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1388 }
1389
1390 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1391                             blk_status_t error)
1392 {
1393         rbio->retry = retry;
1394
1395         if (rbio->flags & BCH_READ_IN_RETRY)
1396                 return;
1397
1398         if (retry == READ_ERR) {
1399                 rbio = bch2_rbio_free(rbio);
1400
1401                 rbio->bio.bi_status = error;
1402                 bch2_rbio_done(rbio);
1403         } else {
1404                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1405                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1406         }
1407 }
1408
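/*
 * Checksum narrowing: after a successful read, if the extent's checksum
 * covers more data than is currently live, recompute a checksum over just
 * the live range and rewrite the key, so later reads don't have to read and
 * checksum the dropped portions.
 */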
1409 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1410 {
1411         struct bch_fs *c = rbio->c;
1412         struct btree_trans trans;
1413         struct btree_iter *iter;
1414         struct bkey_s_c k;
1415         struct bkey_i_extent *e;
1416         BKEY_PADDED(k) new;
1417         struct bch_extent_crc_unpacked new_crc;
1418         u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
1419         int ret;
1420
1421         if (rbio->pick.crc.compression_type)
1422                 return;
1423
1424         bch2_trans_init(&trans, c, 0, 0);
1425 retry:
1426         bch2_trans_begin(&trans);
1427
1428         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
1429                                    BTREE_ITER_INTENT);
1430         k = bch2_btree_iter_peek(iter);
1431         if (IS_ERR_OR_NULL(k.k))
1432                 goto out;
1433
1434         if (!bkey_extent_is_data(k.k))
1435                 goto out;
1436
1437         bkey_reassemble(&new.k, k);
1438         e = bkey_i_to_extent(&new.k);
1439
1440         if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
1441                                      rbio->pick.ptr, data_offset) ||
1442             bversion_cmp(e->k.version, rbio->version))
1443                 goto out;
1444
1445         /* Extent was merged? */
1446         if (bkey_start_offset(&e->k) < data_offset ||
1447             e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1448                 goto out;
1449
1450         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1451                         rbio->pick.crc, NULL, &new_crc,
1452                         bkey_start_offset(&e->k) - data_offset, e->k.size,
1453                         rbio->pick.crc.csum_type)) {
1454                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1455                 goto out;
1456         }
1457
1458         if (!bch2_extent_narrow_crcs(e, new_crc))
1459                 goto out;
1460
1461         bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i));
1462         ret = bch2_trans_commit(&trans, NULL, NULL,
1463                                 BTREE_INSERT_ATOMIC|
1464                                 BTREE_INSERT_NOFAIL|
1465                                 BTREE_INSERT_NOWAIT);
1466         if (ret == -EINTR)
1467                 goto retry;
1468 out:
1469         bch2_trans_exit(&trans);
1470 }
1471
1472 static bool should_narrow_crcs(struct bkey_s_c k,
1473                                struct extent_ptr_decoded *pick,
1474                                unsigned flags)
1475 {
1476         return !(flags & BCH_READ_IN_RETRY) &&
1477                 bkey_extent_is_data(k.k) &&
1478                 bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
1479 }
1480
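/*
 * Heavier half of read completion: verify the checksum over the data as it
 * was read (possibly into a bounce buffer), optionally narrow crcs, then
 * either stop there (BCH_READ_NODECODE) or decrypt/decompress/copy the data
 * into the parent bio.  If the read is also being promoted, the data is
 * re-encrypted to match the existing crc and handed to promote_start().
 */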
1481 /* Inner part that may run in process context */
1482 static void __bch2_read_endio(struct work_struct *work)
1483 {
1484         struct bch_read_bio *rbio =
1485                 container_of(work, struct bch_read_bio, work);
1486         struct bch_fs *c        = rbio->c;
1487         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1488         struct bio *src         = &rbio->bio;
1489         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1490         struct bvec_iter dst_iter = rbio->bvec_iter;
1491         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1492         struct nonce nonce = extent_nonce(rbio->version, crc);
1493         struct bch_csum csum;
1494
1495         /* Reset iterator for checksumming and copying bounced data: */
1496         if (rbio->bounce) {
1497                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1498                 src->bi_iter.bi_idx             = 0;
1499                 src->bi_iter.bi_bvec_done       = 0;
1500         } else {
1501                 src->bi_iter                    = rbio->bvec_iter;
1502         }
1503
1504         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1505         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1506                 goto csum_err;
1507
1508         if (unlikely(rbio->narrow_crcs))
1509                 bch2_rbio_narrow_crcs(rbio);
1510
1511         if (rbio->flags & BCH_READ_NODECODE)
1512                 goto nodecode;
1513
1514         /* Adjust crc to point to subset of data we want: */
1515         crc.offset     += rbio->bvec_iter.bi_sector - rbio->pos.offset;
1516         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1517
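        /*
         * Compressed extents: decrypt the whole bounce buffer, then
         * decompress into the destination.  Uncompressed extents: only the
         * live range needs decrypting (and copying, if we bounced):
         */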
1518         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1519                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1520                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1521                         goto decompression_err;
1522         } else {
1523                 /* don't need to decrypt the entire bio: */
1524                 nonce = nonce_add(nonce, crc.offset << 9);
1525                 bio_advance(src, crc.offset << 9);
1526
1527                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1528                 src->bi_iter.bi_size = dst_iter.bi_size;
1529
1530                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1531
1532                 if (rbio->bounce) {
1533                         struct bvec_iter src_iter = src->bi_iter;
1534                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1535                 }
1536         }
1537
1538         if (rbio->promote) {
1539                 /*
1540                  * Re-encrypt the data we decrypted, so it's consistent with
1541                  * rbio->crc:
1542                  */
1543                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1544                 promote_start(rbio->promote, rbio);
1545                 rbio->promote = NULL;
1546         }
1547 nodecode:
1548         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1549                 rbio = bch2_rbio_free(rbio);
1550                 bch2_rbio_done(rbio);
1551         }
1552         return;
1553 csum_err:
1554         /*
1555          * Checksum error: if the bio wasn't bounced, we may have been
1556          * reading into buffers owned by userspace (that userspace can
1557          * scribble over) - retry the read, bouncing it this time:
1558          */
1559         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1560                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1561                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1562                 return;
1563         }
1564
1565         bch2_dev_io_error(ca,
1566                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1567                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1568                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1569                 csum.hi, csum.lo, crc.csum_type);
1570         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1571         return;
1572 decompression_err:
1573         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1574                           rbio->pos.inode,
1575                           (u64) rbio->bvec_iter.bi_sector);
1576         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1577         return;
1578 }
1579
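/*
 * bi_end_io for the bios submitted by __bch2_read_extent(): account device
 * latency, drop the io ref, and handle IO errors and stale cached pointers
 * (both go through bch2_rbio_error()); otherwise hand off to
 * __bch2_read_endio() in an appropriate context.
 */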
1580 static void bch2_read_endio(struct bio *bio)
1581 {
1582         struct bch_read_bio *rbio =
1583                 container_of(bio, struct bch_read_bio, bio);
1584         struct bch_fs *c        = rbio->c;
1585         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1586         struct workqueue_struct *wq = NULL;
1587         enum rbio_context context = RBIO_CONTEXT_NULL;
1588
1589         if (rbio->have_ioref) {
1590                 bch2_latency_acct(ca, rbio->submit_time, READ);
1591                 percpu_ref_put(&ca->io_ref);
1592         }
1593
1594         if (!rbio->split)
1595                 rbio->bio.bi_end_io = rbio->end_io;
1596
1597         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1598                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1599                 return;
1600         }
1601
1602         if (rbio->pick.ptr.cached &&
1603             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1604              ptr_stale(ca, &rbio->pick.ptr))) {
1605                 atomic_long_inc(&c->read_realloc_races);
1606
1607                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1608                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1609                 else
1610                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1611                 return;
1612         }
1613
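        /*
         * Narrowing crcs, decompression and decryption all want process
         * context, hence the unbound workqueue; plain checksum verification
         * is cheap enough for the highpri workqueue; with no checksum at all
         * there's no need to punt to a workqueue:
         */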
1614         if (rbio->narrow_crcs ||
1615             rbio->pick.crc.compression_type ||
1616             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1617                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1618         else if (rbio->pick.crc.csum_type)
1619                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1620
1621         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1622 }
1623
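/*
 * __bch2_read_extent(): read the part of extent @k covered by @iter into
 * @orig.
 *
 * Picks a pointer/device to read from (consulting @failed for previous
 * attempts), decides whether we can read just the requested range or must
 * read and bounce the whole checksummed/compressed region, optionally sets
 * up a promote, then initializes the rbio and submits the IO; if the picked
 * pointer requires it, the data is instead reconstructed from its erasure
 * coded stripe.
 *
 * Without BCH_READ_IN_RETRY this is asynchronous and always returns 0.  With
 * it, the IO is performed synchronously and the return value is READ_RETRY,
 * READ_RETRY_AVOID (converted to READ_RETRY here after marking the failed
 * device in @failed) or READ_ERR, for the retry paths to act on.
 */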
1624 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1625                        struct bvec_iter iter, struct bkey_s_c k,
1626                        struct bch_io_failures *failed, unsigned flags)
1627 {
1628         struct extent_ptr_decoded pick;
1629         struct bch_read_bio *rbio = NULL;
1630         struct bch_dev *ca;
1631         struct promote_op *promote = NULL;
1632         bool bounce = false, read_full = false, narrow_crcs = false;
1633         struct bpos pos = bkey_start_pos(k.k);
1634         int pick_ret;
1635
1636         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1637
1638         /* hole or reservation - just zero fill: */
1639         if (!pick_ret)
1640                 goto hole;
1641
1642         if (pick_ret < 0) {
1643                 __bcache_io_error(c, "no device to read from");
1644                 goto err;
1645         }
1646
1647         if (pick_ret > 0)
1648                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1649
1650         if (flags & BCH_READ_NODECODE) {
1651                 /*
1652                  * can happen if we retry, and the extent we were going to read
1653                  * has been merged in the meantime:
1654                  */
1655                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1656                         goto hole;
1657
1658                 iter.bi_sector  = pos.offset;
1659                 iter.bi_size    = pick.crc.compressed_size << 9;
1660                 goto noclone;
1661         }
1662
1663         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1664             bio_flagged(&orig->bio, BIO_CHAIN))
1665                 flags |= BCH_READ_MUST_CLONE;
1666
1667         narrow_crcs = should_narrow_crcs(k, &pick, flags);
1668
1669         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1670                 flags |= BCH_READ_MUST_BOUNCE;
1671
1672         EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
1673                 k.k->p.offset < bvec_iter_end_sector(iter));
1674
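        /*
         * We have to read (and bounce) the whole checksummed/compressed
         * region if the extent is compressed, or if it's checksummed and
         * either the checksum doesn't cover exactly the range we want, the
         * data is encrypted and headed for userspace-mapped pages, or a
         * bounce was explicitly requested:
         */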
1675         if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
1676             (pick.crc.csum_type != BCH_CSUM_NONE &&
1677              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1678               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
1679                (flags & BCH_READ_USER_MAPPED)) ||
1680               (flags & BCH_READ_MUST_BOUNCE)))) {
1681                 read_full = true;
1682                 bounce = true;
1683         }
1684
1685         promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
1686                                 &rbio, &bounce, &read_full);
1687
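        /*
         * Only reading part of the checksummed region: adjust the pointer,
         * crc and pos so they describe exactly the range being read:
         */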
1688         if (!read_full) {
1689                 EBUG_ON(pick.crc.compression_type);
1690                 EBUG_ON(pick.crc.csum_type &&
1691                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1692                          bvec_iter_sectors(iter) != pick.crc.live_size ||
1693                          pick.crc.offset ||
1694                          iter.bi_sector != pos.offset));
1695
1696                 pick.ptr.offset += pick.crc.offset +
1697                         (iter.bi_sector - pos.offset);
1698                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
1699                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
1700                 pick.crc.offset                 = 0;
1701                 pick.crc.live_size              = bvec_iter_sectors(iter);
1702                 pos.offset                      = iter.bi_sector;
1703         }
1704
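        /*
         * Pick the bio that will actually do the IO: the bounce bio
         * promote_alloc() may have already set up, a freshly allocated bounce
         * bio, a clone of the original, or the original bio itself:
         */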
1705         if (rbio) {
1706                 /* promote already allocated bounce rbio */
1707         } else if (bounce) {
1708                 unsigned sectors = pick.crc.compressed_size;
1709
1710                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
1711                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
1712                                                   &c->bio_read_split),
1713                                  orig->opts);
1714
1715                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1716                 rbio->bounce    = true;
1717                 rbio->split     = true;
1718         } else if (flags & BCH_READ_MUST_CLONE) {
1719                 /*
1720                  * Have to clone if there were any splits, due to error
1721                  * reporting issues: if a split errored and retrying didn't
1722                  * work, then when it reports the error to its parent (us)
1723                  * we don't know whether the error was from our bio (and we
1724                  * should retry) or from the whole bio (in which case we
1725                  * don't want to retry and lose the error).
1726                  */
1727                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
1728                                                 &c->bio_read_split),
1729                                  orig->opts);
1730                 rbio->bio.bi_iter = iter;
1731                 rbio->split     = true;
1732         } else {
1733 noclone:
1734                 rbio = orig;
1735                 rbio->bio.bi_iter = iter;
1736                 BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1737         }
1738
1739         BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
1740
1741         rbio->c                 = c;
1742         rbio->submit_time       = local_clock();
1743         if (rbio->split)
1744                 rbio->parent    = orig;
1745         else
1746                 rbio->end_io    = orig->bio.bi_end_io;
1747         rbio->bvec_iter         = iter;
1748         rbio->flags             = flags;
1749         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
1750         rbio->narrow_crcs       = narrow_crcs;
1751         rbio->hole              = 0;
1752         rbio->retry             = 0;
1753         rbio->context           = 0;
1754         rbio->devs_have         = bch2_bkey_devs(k);
1755         rbio->pick              = pick;
1756         rbio->pos               = pos;
1757         rbio->version           = k.k->version;
1758         rbio->promote           = promote;
1759         INIT_WORK(&rbio->work, NULL);
1760
1761         rbio->bio.bi_opf        = orig->bio.bi_opf;
1762         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
1763         rbio->bio.bi_end_io     = bch2_read_endio;
1764
1765         if (rbio->bounce)
1766                 trace_read_bounce(&rbio->bio);
1767
1768         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1769
1770         percpu_down_read_preempt_disable(&c->mark_lock);
1771         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
1772         percpu_up_read_preempt_enable(&c->mark_lock);
1773
1774         if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
1775                 bio_inc_remaining(&orig->bio);
1776                 trace_read_split(&orig->bio);
1777         }
1778
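        /*
         * A nonzero pick.idx means we aren't reading the data directly from
         * the picked pointer, but reconstructing it from its erasure coded
         * stripe instead:
         */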
1779         if (!rbio->pick.idx) {
1780                 if (!rbio->have_ioref) {
1781                         __bcache_io_error(c, "no device to read from");
1782                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1783                         goto out;
1784                 }
1785
1786                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
1787                              bio_sectors(&rbio->bio));
1788                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
1789
1790                 if (likely(!(flags & BCH_READ_IN_RETRY)))
1791                         submit_bio(&rbio->bio);
1792                 else
1793                         submit_bio_wait(&rbio->bio);
1794         } else {
1795                 /* Attempting reconstruct read: */
1796                 if (bch2_ec_read_extent(c, rbio)) {
1797                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1798                         goto out;
1799                 }
1800
1801                 if (likely(!(flags & BCH_READ_IN_RETRY)))
1802                         bio_endio(&rbio->bio);
1803         }
1804 out:
1805         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1806                 return 0;
1807         } else {
1808                 int ret;
1809
1810                 rbio->context = RBIO_CONTEXT_UNBOUND;
1811                 bch2_read_endio(&rbio->bio);
1812
1813                 ret = rbio->retry;
1814                 rbio = bch2_rbio_free(rbio);
1815
1816                 if (ret == READ_RETRY_AVOID) {
1817                         bch2_mark_io_failure(failed, &pick);
1818                         ret = READ_RETRY;
1819                 }
1820
1821                 return ret;
1822         }
1823
1824 err:
1825         if (flags & BCH_READ_IN_RETRY)
1826                 return READ_ERR;
1827
1828         orig->bio.bi_status = BLK_STS_IOERR;
1829         goto out_read_done;
1830
1831 hole:
1832         /*
1833          * won't normally happen in the BCH_READ_NODECODE
1834          * (bch2_move_extent()) path, but if we retry and the extent we wanted
1835          * to read no longer exists we have to signal that:
1836          */
1837         if (flags & BCH_READ_NODECODE)
1838                 orig->hole = true;
1839
1840         zero_fill_bio_iter(&orig->bio, iter);
1841 out_read_done:
1842         if (flags & BCH_READ_LAST_FRAGMENT)
1843                 bch2_rbio_done(orig);
1844         return 0;
1845 }
1846
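/*
 * bch2_read(): entry point for a normal (decoded, user mapped) read from
 * @inode.  Walks the extents btree over the range covered by rbio->bio and
 * issues bch2_read_extent() for one fragment at a time, dropping btree locks
 * before each fragment's IO.
 *
 * Minimal usage sketch (illustrative only - real callers allocate the bio
 * from c->bio_read and set rbio->opts via rbio_init(); my_read_done is a
 * hypothetical completion callback):
 *
 *	struct bch_read_bio *rbio = rbio_init(bio, opts);
 *
 *	rbio->bio.bi_end_io = my_read_done;
 *	rbio->bio.bi_iter.bi_sector = sector;	/* start of the range to read */
 *	bch2_read(c, rbio, inode);
 */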
1847 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
1848 {
1849         struct btree_trans trans;
1850         struct btree_iter *iter;
1851         struct bkey_s_c k;
1852         unsigned flags = BCH_READ_RETRY_IF_STALE|
1853                 BCH_READ_MAY_PROMOTE|
1854                 BCH_READ_USER_MAPPED;
1855         int ret;
1856
1857         bch2_trans_init(&trans, c, 0, 0);
1858
1859         BUG_ON(rbio->_state);
1860         BUG_ON(flags & BCH_READ_NODECODE);
1861         BUG_ON(flags & BCH_READ_IN_RETRY);
1862
1863         rbio->c = c;
1864         rbio->start_time = local_clock();
1865
1866         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1867                            POS(inode, rbio->bio.bi_iter.bi_sector),
1868                            BTREE_ITER_SLOTS, k, ret) {
1869                 BKEY_PADDED(k) tmp;
1870                 unsigned bytes;
1871
1872                 /*
1873                  * Unlock the iterator while the btree node is still in
1874                  * cache, before doing the IO:
1875                  */
1876                 bkey_reassemble(&tmp.k, k);
1877                 k = bkey_i_to_s_c(&tmp.k);
1878                 bch2_trans_unlock(&trans);
1879
1880                 bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
1881                               (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
1882                 swap(rbio->bio.bi_iter.bi_size, bytes);
1883
1884                 if (rbio->bio.bi_iter.bi_size == bytes)
1885                         flags |= BCH_READ_LAST_FRAGMENT;
1886
1887                 bch2_read_extent(c, rbio, k, flags);
1888
1889                 if (flags & BCH_READ_LAST_FRAGMENT)
1890                         return;
1891
1892                 swap(rbio->bio.bi_iter.bi_size, bytes);
1893                 bio_advance(&rbio->bio, bytes);
1894         }
1895
1896         /*
1897          * If we get here, it better have been because there was an error
1898          * reading a btree node
1899          */
1900         BUG_ON(!ret);
1901         bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
1902
1903         bch2_trans_exit(&trans);
1904         bch2_rbio_done(rbio);
1905 }
1906
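/* Tear down everything bch2_fs_io_init() set up, in reverse order: */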
1907 void bch2_fs_io_exit(struct bch_fs *c)
1908 {
1909         if (c->promote_table.tbl)
1910                 rhashtable_destroy(&c->promote_table);
1911         mempool_exit(&c->bio_bounce_pages);
1912         bioset_exit(&c->bio_write);
1913         bioset_exit(&c->bio_read_split);
1914         bioset_exit(&c->bio_read);
1915 }
1916
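/*
 * Set up the biosets and pools used by the read and write paths.  The bounce
 * page pool is sized to cover the larger of a btree node and the maximum
 * encoded extent, in pages.
 */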
1917 int bch2_fs_io_init(struct bch_fs *c)
1918 {
1919         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
1920                         BIOSET_NEED_BVECS) ||
1921             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
1922                         BIOSET_NEED_BVECS) ||
1923             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
1924                         BIOSET_NEED_BVECS) ||
1925             mempool_init_page_pool(&c->bio_bounce_pages,
1926                                    max_t(unsigned,
1927                                          c->opts.btree_node_size,
1928                                          c->sb.encoded_extent_max) /
1929                                    PAGE_SECTORS, 0) ||
1930             rhashtable_init(&c->promote_table, &bch_promote_params))
1931                 return -ENOMEM;
1932
1933         return 0;
1934 }