libbcachefs/io.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "alloc_foreground.h"
11 #include "bkey_on_stack.h"
12 #include "bset.h"
13 #include "btree_update.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "compress.h"
17 #include "clock.h"
18 #include "debug.h"
19 #include "disk_groups.h"
20 #include "ec.h"
21 #include "error.h"
22 #include "extent_update.h"
23 #include "inode.h"
24 #include "io.h"
25 #include "journal.h"
26 #include "keylist.h"
27 #include "move.h"
28 #include "rebalance.h"
29 #include "super.h"
30 #include "super-io.h"
31
32 #include <linux/blkdev.h>
33 #include <linux/random.h>
34
35 #include <trace/events/bcachefs.h>
36
37 static bool bch2_target_congested(struct bch_fs *c, u16 target)
38 {
39         const struct bch_devs_mask *devs;
40         unsigned d, nr = 0, total = 0;
41         u64 now = local_clock(), last;
42         s64 congested;
43         struct bch_dev *ca;
44
45         if (!target)
46                 return false;
47
48         rcu_read_lock();
49         devs = bch2_target_to_mask(c, target);
50         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
51                 ca = rcu_dereference(c->devs[d]);
52                 if (!ca)
53                         continue;
54
55                 congested = atomic_read(&ca->congested);
56                 last = READ_ONCE(ca->congested_last);
57                 if (time_after64(now, last))
58                         congested -= (now - last) >> 12;
59
60                 total += max(congested, 0LL);
61                 nr++;
62         }
63         rcu_read_unlock();
64
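        /*
         * Return true with probability total / (nr * CONGESTED_MAX) - i.e.
         * proportional to how congested the target's devices are on average:
         */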
65         return bch2_rand_range(nr * CONGESTED_MAX) < total;
66 }
67
68 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
69                                        u64 now, int rw)
70 {
71         u64 latency_capable =
72                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
73         /* ideally we'd be taking into account the device's variance here: */
74         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
75         s64 latency_over = io_latency - latency_threshold;
76
77         if (latency_threshold && latency_over > 0) {
78                 /*
79                  * bump up congested by approximately latency_over * 4 /
80                  * latency_threshold - we don't need much accuracy here so don't
81                  * bother with the divide:
82                  */
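                /*
                 * e.g. with latency_threshold == 64 and latency_over == 32:
                 * ilog2(64) - 2 == 4, so we add 32 >> 4 == 2, which matches
                 * latency_over * 4 / latency_threshold == 32 * 4 / 64 == 2.
                 */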
83                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
84                         atomic_add(latency_over >>
85                                    max_t(int, ilog2(latency_threshold) - 2, 0),
86                                    &ca->congested);
87
88                 ca->congested_last = now;
89         } else if (atomic_read(&ca->congested) > 0) {
90                 atomic_dec(&ca->congested);
91         }
92 }
93
94 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
95 {
96         atomic64_t *latency = &ca->cur_latency[rw];
97         u64 now = local_clock();
98         u64 io_latency = time_after64(now, submit_time)
99                 ? now - submit_time
100                 : 0;
101         u64 old, new, v = atomic64_read(latency);
102
103         do {
104                 old = v;
105
106                 /*
107                  * If the io latency was reasonably close to the current
108                  * latency, skip doing the update and atomic operation - most of
109                  * the time:
110                  */
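                /*
                 * ~(~0 << 5) == 31: even when the latency hasn't moved much
                 * we still fall through and update the EWMA roughly one call
                 * in 32, whenever the low five bits of the clock are zero.
                 */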
111                 if (abs((int) (old - io_latency)) < (old >> 1) &&
112                     now & ~(~0 << 5))
113                         break;
114
115                 new = ewma_add(old, io_latency, 5);
116         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
117
118         bch2_congested_acct(ca, io_latency, now, rw);
119
120         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
121 }
122
123 /* Allocate, free from mempool: */
124
125 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
126 {
127         struct bvec_iter_all iter;
128         struct bio_vec *bv;
129
130         bio_for_each_segment_all(bv, bio, iter)
131                 if (bv->bv_page != ZERO_PAGE(0))
132                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
133         bio->bi_vcnt = 0;
134 }
135
136 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
137 {
138         struct page *page;
139
140         if (likely(!*using_mempool)) {
141                 page = alloc_page(GFP_NOIO);
142                 if (unlikely(!page)) {
143                         mutex_lock(&c->bio_bounce_pages_lock);
144                         *using_mempool = true;
145                         goto pool_alloc;
146
147                 }
148         } else {
149 pool_alloc:
150                 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
151         }
152
153         return page;
154 }
155
156 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
157                                size_t size)
158 {
159         bool using_mempool = false;
160
161         while (size) {
162                 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
163                 unsigned len = min(PAGE_SIZE, size);
164
165                 BUG_ON(!bio_add_page(bio, page, len, 0));
166                 size -= len;
167         }
168
169         if (using_mempool)
170                 mutex_unlock(&c->bio_bounce_pages_lock);
171 }
172
173 /* Extent update path: */
174
175 static int sum_sector_overwrites(struct btree_trans *trans,
176                                  struct btree_iter *extent_iter,
177                                  struct bkey_i *new,
178                                  bool may_allocate,
179                                  bool *maybe_extending,
180                                  s64 *delta)
181 {
182         struct btree_iter *iter;
183         struct bkey_s_c old;
184         int ret = 0;
185
186         *maybe_extending = true;
187         *delta = 0;
188
189         iter = bch2_trans_copy_iter(trans, extent_iter);
190         if (IS_ERR(iter))
191                 return PTR_ERR(iter);
192
193         for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
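                /*
                 * If we don't have a disk reservation, the overwrite can't
                 * require any new space: the existing extent must already
                 * have at least as many fully allocated pointers as the new
                 * key has allocated pointers.
                 */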
194                 if (!may_allocate &&
195                     bch2_bkey_nr_ptrs_fully_allocated(old) <
196                     bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
197                         ret = -ENOSPC;
198                         break;
199                 }
200
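                /*
                 * delta accumulates the signed overlap, in sectors, between
                 * the new key and each existing key: positive when we're
                 * turning unallocated space into allocated space, negative
                 * for the reverse, zero when both or neither are allocated.
                 */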
201                 *delta += (min(new->k.p.offset,
202                               old.k->p.offset) -
203                           max(bkey_start_offset(&new->k),
204                               bkey_start_offset(old.k))) *
205                         (bkey_extent_is_allocation(&new->k) -
206                          bkey_extent_is_allocation(old.k));
207
208                 if (bkey_cmp(old.k->p, new->k.p) >= 0) {
209                         /*
210                          * Check if there's already data above where we're
211                          * going to be writing to - this means we're definitely
212                          * not extending the file:
213                          *
214                          * Note that it's not sufficient to check if there's
215                          * data up to the sector offset we're going to be
216                          * writing to, because i_size could be up to one block
217                          * less:
218                          */
219                         if (!bkey_cmp(old.k->p, new->k.p))
220                                 old = bch2_btree_iter_next(iter);
221
222                         if (old.k && !bkey_err(old) &&
223                             old.k->p.inode == extent_iter->pos.inode &&
224                             bkey_extent_is_data(old.k))
225                                 *maybe_extending = false;
226
227                         break;
228                 }
229         }
230
231         bch2_trans_iter_put(trans, iter);
232         return ret;
233 }
234
235 int bch2_extent_update(struct btree_trans *trans,
236                        struct btree_iter *iter,
237                        struct bkey_i *k,
238                        struct disk_reservation *disk_res,
239                        u64 *journal_seq,
240                        u64 new_i_size,
241                        s64 *i_sectors_delta)
242 {
243         /* this must live until after bch2_trans_commit(): */
244         struct bkey_inode_buf inode_p;
245         bool extending = false;
246         s64 delta = 0;
247         int ret;
248
249         ret = bch2_extent_trim_atomic(k, iter);
250         if (ret)
251                 return ret;
252
253         ret = sum_sector_overwrites(trans, iter, k,
254                         disk_res && disk_res->sectors != 0,
255                         &extending, &delta);
256         if (ret)
257                 return ret;
258
259         new_i_size = extending
260                 ? min(k->k.p.offset << 9, new_i_size)
261                 : 0;
262
263         if (delta || new_i_size) {
264                 struct btree_iter *inode_iter;
265                 struct bch_inode_unpacked inode_u;
266
267                 inode_iter = bch2_inode_peek(trans, &inode_u,
268                                 k->k.p.inode, BTREE_ITER_INTENT);
269                 if (IS_ERR(inode_iter))
270                         return PTR_ERR(inode_iter);
271
272                 /*
273                  * XXX:
274                  * writeback can race a bit with truncate, because truncate
275                  * first updates the inode then truncates the pagecache. This is
276                  * ugly, but lets us preserve the invariant that the in memory
277                  * i_size is always >= the on disk i_size.
278                  *
279                 BUG_ON(new_i_size > inode_u.bi_size &&
280                        (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
281                  */
282                 BUG_ON(new_i_size > inode_u.bi_size && !extending);
283
284                 if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
285                     new_i_size > inode_u.bi_size)
286                         inode_u.bi_size = new_i_size;
287                 else
288                         new_i_size = 0;
289
290                 inode_u.bi_sectors += delta;
291
292                 if (delta || new_i_size) {
293                         bch2_inode_pack(&inode_p, &inode_u);
294                         bch2_trans_update(trans, inode_iter,
295                                           &inode_p.inode.k_i);
296                 }
297
298                 bch2_trans_iter_put(trans, inode_iter);
299         }
300
301         bch2_trans_update(trans, iter, k);
302
303         ret = bch2_trans_commit(trans, disk_res, journal_seq,
304                                 BTREE_INSERT_NOCHECK_RW|
305                                 BTREE_INSERT_NOFAIL|
306                                 BTREE_INSERT_USE_RESERVE);
307         if (!ret && i_sectors_delta)
308                 *i_sectors_delta += delta;
309
310         return ret;
311 }
312
313 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
314                    struct bpos end, u64 *journal_seq,
315                    s64 *i_sectors_delta)
316 {
317         struct bch_fs *c        = trans->c;
318         unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
319         struct bkey_s_c k;
320         int ret = 0, ret2 = 0;
321
322         while ((k = bch2_btree_iter_peek(iter)).k &&
323                bkey_cmp(iter->pos, end) < 0) {
324                 struct disk_reservation disk_res =
325                         bch2_disk_reservation_init(c, 0);
326                 struct bkey_i delete;
327
328                 bch2_trans_reset(trans, TRANS_RESET_MEM);
329
330                 ret = bkey_err(k);
331                 if (ret)
332                         goto btree_err;
333
334                 bkey_init(&delete.k);
335                 delete.k.p = iter->pos;
336
337                 /* create the biggest key we can */
338                 bch2_key_resize(&delete.k, max_sectors);
339                 bch2_cut_back(end, &delete);
340
341                 ret = bch2_extent_update(trans, iter, &delete,
342                                 &disk_res, journal_seq,
343                                 0, i_sectors_delta);
344                 bch2_disk_reservation_put(c, &disk_res);
345 btree_err:
346                 if (ret == -EINTR) {
347                         ret2 = ret;
348                         ret = 0;
349                 }
350                 if (ret)
351                         break;
352         }
353
354         if (bkey_cmp(iter->pos, end) > 0) {
355                 bch2_btree_iter_set_pos(iter, end);
356                 ret = bch2_btree_iter_traverse(iter);
357         }
358
359         return ret ?: ret2;
360 }
361
362 int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
363                 u64 *journal_seq, s64 *i_sectors_delta)
364 {
365         struct btree_trans trans;
366         struct btree_iter *iter;
367         int ret = 0;
368
369         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
370         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
371                                    POS(inum, start),
372                                    BTREE_ITER_INTENT);
373
374         ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
375                              journal_seq, i_sectors_delta);
376         bch2_trans_exit(&trans);
377
378         if (ret == -EINTR)
379                 ret = 0;
380
381         return ret;
382 }
383
384 int bch2_write_index_default(struct bch_write_op *op)
385 {
386         struct bch_fs *c = op->c;
387         struct bkey_on_stack sk;
388         struct keylist *keys = &op->insert_keys;
389         struct bkey_i *k = bch2_keylist_front(keys);
390         struct btree_trans trans;
391         struct btree_iter *iter;
392         int ret;
393
394         bkey_on_stack_init(&sk);
395         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
396
397         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
398                                    bkey_start_pos(&k->k),
399                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
400
401         do {
402                 bch2_trans_reset(&trans, TRANS_RESET_MEM);
403
404                 k = bch2_keylist_front(keys);
405
406                 bkey_on_stack_realloc(&sk, c, k->k.u64s);
407                 bkey_copy(sk.k, k);
408                 bch2_cut_front(iter->pos, sk.k);
409
410                 ret = bch2_extent_update(&trans, iter, sk.k,
411                                          &op->res, op_journal_seq(op),
412                                          op->new_i_size, &op->i_sectors_delta);
413                 if (ret == -EINTR)
414                         continue;
415                 if (ret)
416                         break;
417
418                 if (bkey_cmp(iter->pos, k->k.p) >= 0)
419                         bch2_keylist_pop_front(keys);
420         } while (!bch2_keylist_empty(keys));
421
422         bch2_trans_exit(&trans);
423         bkey_on_stack_exit(&sk, c);
424
425         return ret;
426 }
427
428 /* Writes */
429
430 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
431                                enum bch_data_type type,
432                                const struct bkey_i *k)
433 {
434         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
435         const struct bch_extent_ptr *ptr;
436         struct bch_write_bio *n;
437         struct bch_dev *ca;
438
439         BUG_ON(c->opts.nochanges);
440
441         bkey_for_each_ptr(ptrs, ptr) {
442                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
443                        !c->devs[ptr->dev]);
444
445                 ca = bch_dev_bkey_exists(c, ptr->dev);
446
447                 if (to_entry(ptr + 1) < ptrs.end) {
448                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
449                                                    &ca->replica_set));
450
451                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
452                         n->bio.bi_private       = wbio->bio.bi_private;
453                         n->parent               = wbio;
454                         n->split                = true;
455                         n->bounce               = false;
456                         n->put_bio              = true;
457                         n->bio.bi_opf           = wbio->bio.bi_opf;
458                         bio_inc_remaining(&wbio->bio);
459                 } else {
460                         n = wbio;
461                         n->split                = false;
462                 }
463
464                 n->c                    = c;
465                 n->dev                  = ptr->dev;
466                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
467                 n->submit_time          = local_clock();
468                 n->bio.bi_iter.bi_sector = ptr->offset;
469
470                 if (!journal_flushes_device(ca))
471                         n->bio.bi_opf |= REQ_FUA;
472
473                 if (likely(n->have_ioref)) {
474                         this_cpu_add(ca->io_done->sectors[WRITE][type],
475                                      bio_sectors(&n->bio));
476
477                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
478                         submit_bio(&n->bio);
479                 } else {
480                         n->bio.bi_status        = BLK_STS_REMOVED;
481                         bio_endio(&n->bio);
482                 }
483         }
484 }
485
486 static void __bch2_write(struct closure *);
487
488 static void bch2_write_done(struct closure *cl)
489 {
490         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
491         struct bch_fs *c = op->c;
492
493         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
494                 op->error = bch2_journal_error(&c->journal);
495
496         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
497                 bch2_disk_reservation_put(c, &op->res);
498         percpu_ref_put(&c->writes);
499         bch2_keylist_free(&op->insert_keys, op->inline_keys);
500
501         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
502
503         if (op->end_io) {
504                 EBUG_ON(cl->parent);
505                 closure_debug_destroy(cl);
506                 op->end_io(op);
507         } else {
508                 closure_return(cl);
509         }
510 }
511
512 /**
513  * __bch2_write_index - after a write, update the index to point to the new data
514  */
515 static void __bch2_write_index(struct bch_write_op *op)
516 {
517         struct bch_fs *c = op->c;
518         struct keylist *keys = &op->insert_keys;
519         struct bch_extent_ptr *ptr;
520         struct bkey_i *src, *dst = keys->keys, *n, *k;
521         unsigned dev;
522         int ret;
523
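        /*
         * Drop pointers to devices the write failed on; if that leaves a key
         * with no pointers at all the whole write fails, otherwise the
         * surviving keys are compacted towards the front of the keylist:
         */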
524         for (src = keys->keys; src != keys->top; src = n) {
525                 n = bkey_next(src);
526
527                 if (bkey_extent_is_direct_data(&src->k)) {
528                         bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
529                                             test_bit(ptr->dev, op->failed.d));
530
531                         if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
532                                 ret = -EIO;
533                                 goto err;
534                         }
535                 }
536
537                 if (dst != src)
538                         memmove_u64s_down(dst, src, src->u64s);
539                 dst = bkey_next(dst);
540         }
541
542         keys->top = dst;
543
544         /*
545          * probably not the ideal place to hook this in, but I don't
546          * particularly want to plumb io_opts all the way through the btree
547          * update stack right now
548          */
549         for_each_keylist_key(keys, k)
550                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
551
552         if (!bch2_keylist_empty(keys)) {
553                 u64 sectors_start = keylist_sectors(keys);
554                 int ret = op->index_update_fn(op);
555
556                 BUG_ON(ret == -EINTR);
557                 BUG_ON(keylist_sectors(keys) && !ret);
558
559                 op->written += sectors_start - keylist_sectors(keys);
560
561                 if (ret) {
562                         __bcache_io_error(c, "btree IO error %i", ret);
563                         op->error = ret;
564                 }
565         }
566 out:
567         /* If a bucket wasn't written, we can't erasure code it: */
568         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
569                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
570
571         bch2_open_buckets_put(c, &op->open_buckets);
572         return;
573 err:
574         keys->top = keys->keys;
575         op->error = ret;
576         goto out;
577 }
578
579 static void bch2_write_index(struct closure *cl)
580 {
581         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
582         struct bch_fs *c = op->c;
583
584         __bch2_write_index(op);
585
586         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
587                 bch2_journal_flush_seq_async(&c->journal,
588                                              *op_journal_seq(op),
589                                              cl);
590                 continue_at(cl, bch2_write_done, index_update_wq(op));
591         } else {
592                 continue_at_nobarrier(cl, bch2_write_done, NULL);
593         }
594 }
595
596 static void bch2_write_endio(struct bio *bio)
597 {
598         struct closure *cl              = bio->bi_private;
599         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
600         struct bch_write_bio *wbio      = to_wbio(bio);
601         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
602         struct bch_fs *c                = wbio->c;
603         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
604
605         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
606                 set_bit(wbio->dev, op->failed.d);
607
608         if (wbio->have_ioref) {
609                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
610                 percpu_ref_put(&ca->io_ref);
611         }
612
613         if (wbio->bounce)
614                 bch2_bio_free_pages_pool(c, bio);
615
616         if (wbio->put_bio)
617                 bio_put(bio);
618
619         if (parent)
620                 bio_endio(&parent->bio);
621         else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
622                 closure_put(cl);
623         else
624                 continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
625 }
626
627 static void init_append_extent(struct bch_write_op *op,
628                                struct write_point *wp,
629                                struct bversion version,
630                                struct bch_extent_crc_unpacked crc)
631 {
632         struct bch_fs *c = op->c;
633         struct bkey_i_extent *e;
634         struct open_bucket *ob;
635         unsigned i;
636
637         BUG_ON(crc.compressed_size > wp->sectors_free);
638         wp->sectors_free -= crc.compressed_size;
639         op->pos.offset += crc.uncompressed_size;
640
641         e = bkey_extent_init(op->insert_keys.top);
642         e->k.p          = op->pos;
643         e->k.size       = crc.uncompressed_size;
644         e->k.version    = version;
645
646         if (crc.csum_type ||
647             crc.compression_type ||
648             crc.nonce)
649                 bch2_extent_crc_append(&e->k_i, crc);
650
651         open_bucket_for_each(c, &wp->ptrs, ob, i) {
652                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
653                 union bch_extent_entry *end =
654                         bkey_val_end(bkey_i_to_s(&e->k_i));
655
656                 end->ptr = ob->ptr;
657                 end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
658                 end->ptr.cached = !ca->mi.durability ||
659                         (op->flags & BCH_WRITE_CACHED) != 0;
660                 end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
661
662                 e->k.u64s++;
663
664                 BUG_ON(crc.compressed_size > ob->sectors_free);
665                 ob->sectors_free -= crc.compressed_size;
666         }
667
668         bch2_keylist_push(&op->insert_keys);
669 }
670
671 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
672                                         struct write_point *wp,
673                                         struct bio *src,
674                                         bool *page_alloc_failed,
675                                         void *buf)
676 {
677         struct bch_write_bio *wbio;
678         struct bio *bio;
679         unsigned output_available =
680                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
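        /*
         * If we were handed a buffer it may not start page aligned - account
         * for the offset into the first page when sizing the bio:
         */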
681         unsigned pages = DIV_ROUND_UP(output_available +
682                                       (buf
683                                        ? ((unsigned long) buf & (PAGE_SIZE - 1))
684                                        : 0), PAGE_SIZE);
685
686         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
687         wbio                    = wbio_init(bio);
688         wbio->put_bio           = true;
689         /* copy WRITE_SYNC flag */
690         wbio->bio.bi_opf        = src->bi_opf;
691
692         if (buf) {
693                 bch2_bio_map(bio, buf, output_available);
694                 return bio;
695         }
696
697         wbio->bounce            = true;
698
699         /*
700          * We can't use mempool for more than c->sb.encoded_extent_max
701          * worth of pages, but we'd like to allocate more if we can:
702          */
703         bch2_bio_alloc_pages_pool(c, bio,
704                                   min_t(unsigned, output_available,
705                                         c->sb.encoded_extent_max << 9));
706
707         if (bio->bi_iter.bi_size < output_available)
708                 *page_alloc_failed =
709                         bch2_bio_alloc_pages(bio,
710                                              output_available -
711                                              bio->bi_iter.bi_size,
712                                              GFP_NOFS) != 0;
713
714         return bio;
715 }
716
717 static int bch2_write_rechecksum(struct bch_fs *c,
718                                  struct bch_write_op *op,
719                                  unsigned new_csum_type)
720 {
721         struct bio *bio = &op->wbio.bio;
722         struct bch_extent_crc_unpacked new_crc;
723         int ret;
724
725         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
726
727         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
728             bch2_csum_type_is_encryption(new_csum_type))
729                 new_csum_type = op->crc.csum_type;
730
731         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
732                                   NULL, &new_crc,
733                                   op->crc.offset, op->crc.live_size,
734                                   new_csum_type);
735         if (ret)
736                 return ret;
737
738         bio_advance(bio, op->crc.offset << 9);
739         bio->bi_iter.bi_size = op->crc.live_size << 9;
740         op->crc = new_crc;
741         return 0;
742 }
743
744 static int bch2_write_decrypt(struct bch_write_op *op)
745 {
746         struct bch_fs *c = op->c;
747         struct nonce nonce = extent_nonce(op->version, op->crc);
748         struct bch_csum csum;
749
750         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
751                 return 0;
752
753         /*
754          * If we need to decrypt data in the write path, we'll no longer be able
755          * to verify the existing checksum (poly1305 mac, in this case) after
756          * it's decrypted - this is the last point we'll be able to reverify the
757          * checksum:
758          */
759         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
760         if (bch2_crc_cmp(op->crc.csum, csum))
761                 return -EIO;
762
763         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
764         op->crc.csum_type = 0;
765         op->crc.csum = (struct bch_csum) { 0, 0 };
766         return 0;
767 }
768
769 static enum prep_encoded_ret {
770         PREP_ENCODED_OK,
771         PREP_ENCODED_ERR,
772         PREP_ENCODED_CHECKSUM_ERR,
773         PREP_ENCODED_DO_WRITE,
774 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
775 {
776         struct bch_fs *c = op->c;
777         struct bio *bio = &op->wbio.bio;
778
779         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
780                 return PREP_ENCODED_OK;
781
782         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
783
784         /* Can we just write the entire extent as is? */
785         if (op->crc.uncompressed_size == op->crc.live_size &&
786             op->crc.compressed_size <= wp->sectors_free &&
787             op->crc.compression_type == op->compression_type) {
788                 if (!op->crc.compression_type &&
789                     op->csum_type != op->crc.csum_type &&
790                     bch2_write_rechecksum(c, op, op->csum_type))
791                         return PREP_ENCODED_CHECKSUM_ERR;
792
793                 return PREP_ENCODED_DO_WRITE;
794         }
795
796         /*
797          * If the data is compressed and we couldn't write the entire extent as
798          * is, we have to decompress it:
799          */
800         if (op->crc.compression_type) {
801                 struct bch_csum csum;
802
803                 if (bch2_write_decrypt(op))
804                         return PREP_ENCODED_CHECKSUM_ERR;
805
806                 /* Last point we can still verify checksum: */
807                 csum = bch2_checksum_bio(c, op->crc.csum_type,
808                                          extent_nonce(op->version, op->crc),
809                                          bio);
810                 if (bch2_crc_cmp(op->crc.csum, csum))
811                         return PREP_ENCODED_CHECKSUM_ERR;
812
813                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
814                         return PREP_ENCODED_ERR;
815         }
816
817         /*
818          * No longer have compressed data after this point - data might be
819          * encrypted:
820          */
821
822         /*
823          * If the data is checksummed and we're only writing a subset,
824          * rechecksum and adjust bio to point to currently live data:
825          */
826         if ((op->crc.live_size != op->crc.uncompressed_size ||
827              op->crc.csum_type != op->csum_type) &&
828             bch2_write_rechecksum(c, op, op->csum_type))
829                 return PREP_ENCODED_CHECKSUM_ERR;
830
831         /*
832          * If we want to compress the data, it has to be decrypted:
833          */
834         if ((op->compression_type ||
835              bch2_csum_type_is_encryption(op->crc.csum_type) !=
836              bch2_csum_type_is_encryption(op->csum_type)) &&
837             bch2_write_decrypt(op))
838                 return PREP_ENCODED_CHECKSUM_ERR;
839
840         return PREP_ENCODED_OK;
841 }
842
843 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
844                              struct bio **_dst)
845 {
846         struct bch_fs *c = op->c;
847         struct bio *src = &op->wbio.bio, *dst = src;
848         struct bvec_iter saved_iter;
849         void *ec_buf;
850         struct bpos ec_pos = op->pos;
851         unsigned total_output = 0, total_input = 0;
852         bool bounce = false;
853         bool page_alloc_failed = false;
854         int ret, more = 0;
855
856         BUG_ON(!bio_sectors(src));
857
858         ec_buf = bch2_writepoint_ec_buf(c, wp);
859
860         switch (bch2_write_prep_encoded_data(op, wp)) {
861         case PREP_ENCODED_OK:
862                 break;
863         case PREP_ENCODED_ERR:
864                 ret = -EIO;
865                 goto err;
866         case PREP_ENCODED_CHECKSUM_ERR:
867                 goto csum_err;
868         case PREP_ENCODED_DO_WRITE:
869                 /* XXX look for bug here */
870                 if (ec_buf) {
871                         dst = bch2_write_bio_alloc(c, wp, src,
872                                                    &page_alloc_failed,
873                                                    ec_buf);
874                         bio_copy_data(dst, src);
875                         bounce = true;
876                 }
877                 init_append_extent(op, wp, op->version, op->crc);
878                 goto do_write;
879         }
880
881         if (ec_buf ||
882             op->compression_type ||
883             (op->csum_type &&
884              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
885             (bch2_csum_type_is_encryption(op->csum_type) &&
886              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
887                 dst = bch2_write_bio_alloc(c, wp, src,
888                                            &page_alloc_failed,
889                                            ec_buf);
890                 bounce = true;
891         }
892
893         saved_iter = dst->bi_iter;
894
895         do {
896                 struct bch_extent_crc_unpacked crc =
897                         (struct bch_extent_crc_unpacked) { 0 };
898                 struct bversion version = op->version;
899                 size_t dst_len, src_len;
900
901                 if (page_alloc_failed &&
902                     bio_sectors(dst) < wp->sectors_free &&
903                     bio_sectors(dst) < c->sb.encoded_extent_max)
904                         break;
905
906                 BUG_ON(op->compression_type &&
907                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
908                        bch2_csum_type_is_encryption(op->crc.csum_type));
909                 BUG_ON(op->compression_type && !bounce);
910
911                 crc.compression_type = op->compression_type
912                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
913                                              op->compression_type)
914                         : 0;
915                 if (!crc.compression_type) {
916                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
917                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
918
919                         if (op->csum_type)
920                                 dst_len = min_t(unsigned, dst_len,
921                                                 c->sb.encoded_extent_max << 9);
922
923                         if (bounce) {
924                                 swap(dst->bi_iter.bi_size, dst_len);
925                                 bio_copy_data(dst, src);
926                                 swap(dst->bi_iter.bi_size, dst_len);
927                         }
928
929                         src_len = dst_len;
930                 }
931
932                 BUG_ON(!src_len || !dst_len);
933
934                 if (bch2_csum_type_is_encryption(op->csum_type)) {
935                         if (bversion_zero(version)) {
936                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
937                         } else {
938                                 crc.nonce = op->nonce;
939                                 op->nonce += src_len >> 9;
940                         }
941                 }
942
943                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
944                     !crc.compression_type &&
945                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
946                     bch2_csum_type_is_encryption(op->csum_type)) {
947                         /*
948                          * Note: when we're using rechecksum(), we need to be
949                          * checksumming @src because it has all the data our
950                          * existing checksum covers - if we bounced (because we
951                          * were trying to compress), @dst will only have the
952                          * part of the data the new checksum will cover.
953                          *
954                          * But normally we want to be checksumming post bounce,
955                          * because part of the reason for bouncing is so the
956                          * data can't be modified (by userspace) while it's in
957                          * flight.
958                          */
959                         if (bch2_rechecksum_bio(c, src, version, op->crc,
960                                         &crc, &op->crc,
961                                         src_len >> 9,
962                                         bio_sectors(src) - (src_len >> 9),
963                                         op->csum_type))
964                                 goto csum_err;
965                 } else {
966                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
967                             bch2_rechecksum_bio(c, src, version, op->crc,
968                                         NULL, &op->crc,
969                                         src_len >> 9,
970                                         bio_sectors(src) - (src_len >> 9),
971                                         op->crc.csum_type))
972                                 goto csum_err;
973
974                         crc.compressed_size     = dst_len >> 9;
975                         crc.uncompressed_size   = src_len >> 9;
976                         crc.live_size           = src_len >> 9;
977
978                         swap(dst->bi_iter.bi_size, dst_len);
979                         bch2_encrypt_bio(c, op->csum_type,
980                                          extent_nonce(version, crc), dst);
981                         crc.csum = bch2_checksum_bio(c, op->csum_type,
982                                          extent_nonce(version, crc), dst);
983                         crc.csum_type = op->csum_type;
984                         swap(dst->bi_iter.bi_size, dst_len);
985                 }
986
987                 init_append_extent(op, wp, version, crc);
988
989                 if (dst != src)
990                         bio_advance(dst, dst_len);
991                 bio_advance(src, src_len);
992                 total_output    += dst_len;
993                 total_input     += src_len;
994         } while (dst->bi_iter.bi_size &&
995                  src->bi_iter.bi_size &&
996                  wp->sectors_free &&
997                  !bch2_keylist_realloc(&op->insert_keys,
998                                       op->inline_keys,
999                                       ARRAY_SIZE(op->inline_keys),
1000                                       BKEY_EXTENT_U64s_MAX));
1001
1002         more = src->bi_iter.bi_size != 0;
1003
1004         dst->bi_iter = saved_iter;
1005
1006         if (dst == src && more) {
1007                 BUG_ON(total_output != total_input);
1008
1009                 dst = bio_split(src, total_input >> 9,
1010                                 GFP_NOIO, &c->bio_write);
1011                 wbio_init(dst)->put_bio = true;
1012                 /* copy WRITE_SYNC flag */
1013                 dst->bi_opf             = src->bi_opf;
1014         }
1015
1016         dst->bi_iter.bi_size = total_output;
1017 do_write:
1018         /* might have done a realloc... */
1019         bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
1020
1021         *_dst = dst;
1022         return more;
1023 csum_err:
1024         bch_err(c, "error verifying existing checksum while "
1025                 "rewriting existing data (memory corruption?)");
1026         ret = -EIO;
1027 err:
1028         if (to_wbio(dst)->bounce)
1029                 bch2_bio_free_pages_pool(c, dst);
1030         if (to_wbio(dst)->put_bio)
1031                 bio_put(dst);
1032
1033         return ret;
1034 }
1035
1036 static void __bch2_write(struct closure *cl)
1037 {
1038         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1039         struct bch_fs *c = op->c;
1040         struct write_point *wp;
1041         struct bio *bio;
1042         bool skip_put = true;
1043         int ret;
1044 again:
1045         memset(&op->failed, 0, sizeof(op->failed));
1046
1047         do {
1048                 struct bkey_i *key_to_write;
1049                 unsigned key_to_write_offset = op->insert_keys.top_p -
1050                         op->insert_keys.keys_p;
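                /*
                 * Note that we remember an offset into the keylist, not a
                 * pointer - bch2_keylist_realloc() below may move the keys:
                 */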
1051
1052                 /* +1 for possible cache device: */
1053                 if (op->open_buckets.nr + op->nr_replicas + 1 >
1054                     ARRAY_SIZE(op->open_buckets.v))
1055                         goto flush_io;
1056
1057                 if (bch2_keylist_realloc(&op->insert_keys,
1058                                         op->inline_keys,
1059                                         ARRAY_SIZE(op->inline_keys),
1060                                         BKEY_EXTENT_U64s_MAX))
1061                         goto flush_io;
1062
1063                 wp = bch2_alloc_sectors_start(c,
1064                         op->target,
1065                         op->opts.erasure_code,
1066                         op->write_point,
1067                         &op->devs_have,
1068                         op->nr_replicas,
1069                         op->nr_replicas_required,
1070                         op->alloc_reserve,
1071                         op->flags,
1072                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
1073                 EBUG_ON(!wp);
1074
1075                 if (unlikely(IS_ERR(wp))) {
1076                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
1077                                 ret = PTR_ERR(wp);
1078                                 goto err;
1079                         }
1080
1081                         goto flush_io;
1082                 }
1083
1084                 bch2_open_bucket_get(c, wp, &op->open_buckets);
1085                 ret = bch2_write_extent(op, wp, &bio);
1086                 bch2_alloc_sectors_done(c, wp);
1087
1088                 if (ret < 0)
1089                         goto err;
1090
1091                 if (ret)
1092                         skip_put = false;
1093
1094                 bio->bi_end_io  = bch2_write_endio;
1095                 bio->bi_private = &op->cl;
1096                 bio->bi_opf |= REQ_OP_WRITE;
1097
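                /*
                 * If the entire write was consumed by a single
                 * bch2_write_extent() call, skip_put stays true and we never
                 * take an extra closure ref per bio - bch2_write_endio() then
                 * kicks off the index update itself via
                 * BCH_WRITE_SKIP_CLOSURE_PUT.
                 */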
1098                 if (!skip_put)
1099                         closure_get(bio->bi_private);
1100                 else
1101                         op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
1102
1103                 key_to_write = (void *) (op->insert_keys.keys_p +
1104                                          key_to_write_offset);
1105
1106                 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
1107                                           key_to_write);
1108         } while (ret);
1109
1110         if (!skip_put)
1111                 continue_at(cl, bch2_write_index, index_update_wq(op));
1112         return;
1113 err:
1114         op->error = ret;
1115
1116         continue_at(cl, bch2_write_index, index_update_wq(op));
1117         return;
1118 flush_io:
1119         closure_sync(cl);
1120
1121         if (!bch2_keylist_empty(&op->insert_keys)) {
1122                 __bch2_write_index(op);
1123
1124                 if (op->error) {
1125                         continue_at_nobarrier(cl, bch2_write_done, NULL);
1126                         return;
1127                 }
1128         }
1129
1130         goto again;
1131 }
1132
1133 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
1134 {
1135         struct closure *cl = &op->cl;
1136         struct bio *bio = &op->wbio.bio;
1137         struct bvec_iter iter;
1138         struct bkey_i_inline_data *id;
1139         unsigned sectors;
1140         int ret;
1141
1142         bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
1143
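        /*
         * The data is stored inline in the bkey value: we need room for the
         * key itself plus data_len bytes rounded up to a whole number of
         * u64s (the tail is zero padded below):
         */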
1144         ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
1145                                    ARRAY_SIZE(op->inline_keys),
1146                                    BKEY_U64s + DIV_ROUND_UP(data_len, 8));
1147         if (ret) {
1148                 op->error = ret;
1149                 goto err;
1150         }
1151
1152         sectors = bio_sectors(bio);
1153         op->pos.offset += sectors;
1154
1155         id = bkey_inline_data_init(op->insert_keys.top);
1156         id->k.p         = op->pos;
1157         id->k.version   = op->version;
1158         id->k.size      = sectors;
1159
1160         iter = bio->bi_iter;
1161         iter.bi_size = data_len;
1162         memcpy_from_bio(id->v.data, bio, iter);
1163
1164         while (data_len & 7)
1165                 id->v.data[data_len++] = '\0';
1166         set_bkey_val_bytes(&id->k, data_len);
1167         bch2_keylist_push(&op->insert_keys);
1168
1169         op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
1170         continue_at_nobarrier(cl, bch2_write_index, NULL);
1171         return;
1172 err:
1173         bch2_write_done(&op->cl);
1174 }
1175
1176 /**
1177  * bch2_write - handle a write to a cache device or flash only volume
1178  *
1179  * This is the starting point for any data to end up in a cache device; it could
1180  * be from a normal write, or a writeback write, or a write to a flash only
1181  * volume - it's also used by the moving garbage collector to compact data in
1182  * mostly empty buckets.
1183  *
1184  * It first writes the data to the cache, creating a list of keys to be inserted
1185  * (if the data won't fit in a single open bucket, there will be multiple keys);
1186  * after the data is written it calls bch_journal, and after the keys have been
1187  * after the data is written the keys are journalled, and after the keys have been
1188  *
1189  * If op->discard is true, instead of inserting the data it invalidates the
1190  * region of the cache represented by op->bio and op->inode.
1191  */
1192 void bch2_write(struct closure *cl)
1193 {
1194         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1195         struct bio *bio = &op->wbio.bio;
1196         struct bch_fs *c = op->c;
1197         unsigned data_len;
1198
1199         BUG_ON(!op->nr_replicas);
1200         BUG_ON(!op->write_point.v);
1201         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
1202
1203         op->start_time = local_clock();
1204         bch2_keylist_init(&op->insert_keys, op->inline_keys);
1205         wbio_init(bio)->put_bio = false;
1206
1207         if (bio_sectors(bio) & (c->opts.block_size - 1)) {
1208                 __bcache_io_error(c, "misaligned write");
1209                 op->error = -EIO;
1210                 goto err;
1211         }
1212
1213         if (c->opts.nochanges ||
1214             !percpu_ref_tryget(&c->writes)) {
1215                 __bcache_io_error(c, "read only");
1216                 op->error = -EROFS;
1217                 goto err;
1218         }
1219
1220         bch2_increment_clock(c, bio_sectors(bio), WRITE);
1221
1222         data_len = min_t(u64, bio->bi_iter.bi_size,
1223                          op->new_i_size - (op->pos.offset << 9));
1224
1225         if (c->opts.inline_data &&
1226             data_len <= min(block_bytes(c) / 2, 1024U)) {
1227                 bch2_write_data_inline(op, data_len);
1228                 return;
1229         }
1230
1231         continue_at_nobarrier(cl, __bch2_write, NULL);
1232         return;
1233 err:
1234         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
1235                 bch2_disk_reservation_put(c, &op->res);
1236
1237         if (op->end_io) {
1238                 EBUG_ON(cl->parent);
1239                 closure_debug_destroy(cl);
1240                 op->end_io(op);
1241         } else {
1242                 closure_return(cl);
1243         }
1244 }
1245
1246 /* Cache promotion on read */
1247
1248 struct promote_op {
1249         struct closure          cl;
1250         struct rcu_head         rcu;
1251         u64                     start_time;
1252
1253         struct rhash_head       hash;
1254         struct bpos             pos;
1255
1256         struct migrate_write    write;
1257         struct bio_vec          bi_inline_vecs[0]; /* must be last */
1258 };
1259
1260 static const struct rhashtable_params bch_promote_params = {
1261         .head_offset    = offsetof(struct promote_op, hash),
1262         .key_offset     = offsetof(struct promote_op, pos),
1263         .key_len        = sizeof(struct bpos),
1264 };
1265
1266 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1267                                   struct bpos pos,
1268                                   struct bch_io_opts opts,
1269                                   unsigned flags)
1270 {
1271         if (!(flags & BCH_READ_MAY_PROMOTE))
1272                 return false;
1273
1274         if (!opts.promote_target)
1275                 return false;
1276
1277         if (bch2_bkey_has_target(c, k, opts.promote_target))
1278                 return false;
1279
1280         if (bch2_target_congested(c, opts.promote_target)) {
1281                 /* XXX trace this */
1282                 return false;
1283         }
1284
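        /*
         * Don't kick off a second promote for an extent we're already
         * promoting - promotes in flight are tracked in c->promote_table,
         * keyed by position:
         */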
1285         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1286                                    bch_promote_params))
1287                 return false;
1288
1289         return true;
1290 }
1291
1292 static void promote_free(struct bch_fs *c, struct promote_op *op)
1293 {
1294         int ret;
1295
1296         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1297                                      bch_promote_params);
1298         BUG_ON(ret);
1299         percpu_ref_put(&c->writes);
1300         kfree_rcu(op, rcu);
1301 }
1302
1303 static void promote_done(struct closure *cl)
1304 {
1305         struct promote_op *op =
1306                 container_of(cl, struct promote_op, cl);
1307         struct bch_fs *c = op->write.op.c;
1308
1309         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1310                                op->start_time);
1311
1312         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1313         promote_free(c, op);
1314 }
1315
1316 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1317 {
1318         struct bch_fs *c = rbio->c;
1319         struct closure *cl = &op->cl;
1320         struct bio *bio = &op->write.op.wbio.bio;
1321
1322         trace_promote(&rbio->bio);
1323
1324         /* we now own pages: */
1325         BUG_ON(!rbio->bounce);
1326         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1327
1328         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1329                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1330         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1331
1332         bch2_migrate_read_done(&op->write, rbio);
1333
1334         closure_init(cl, NULL);
1335         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1336         closure_return_with_destructor(cl, promote_done);
1337 }
1338
1339 static struct promote_op *__promote_alloc(struct bch_fs *c,
1340                                           enum btree_id btree_id,
1341                                           struct bpos pos,
1342                                           struct extent_ptr_decoded *pick,
1343                                           struct bch_io_opts opts,
1344                                           unsigned sectors,
1345                                           struct bch_read_bio **rbio)
1346 {
1347         struct promote_op *op = NULL;
1348         struct bio *bio;
1349         unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
1350         int ret;
1351
1352         if (!percpu_ref_tryget(&c->writes))
1353                 return NULL;
1354
1355         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
1356         if (!op)
1357                 goto err;
1358
1359         op->start_time = local_clock();
1360         op->pos = pos;
1361
1362         /*
1363          * We don't use the mempool here because extents that aren't
1364          * checksummed or compressed can be too big for the mempool:
1365          */
1366         *rbio = kzalloc(sizeof(struct bch_read_bio) +
1367                         sizeof(struct bio_vec) * pages,
1368                         GFP_NOIO);
1369         if (!*rbio)
1370                 goto err;
1371
1372         rbio_init(&(*rbio)->bio, opts);
1373         bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
1374
1375         if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
1376                                  GFP_NOIO))
1377                 goto err;
1378
1379         (*rbio)->bounce         = true;
1380         (*rbio)->split          = true;
1381         (*rbio)->kmalloc        = true;
1382
1383         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1384                                           bch_promote_params))
1385                 goto err;
1386
1387         bio = &op->write.op.wbio.bio;
1388         bio_init(bio, bio->bi_inline_vecs, pages);
1389
1390         ret = bch2_migrate_write_init(c, &op->write,
1391                         writepoint_hashed((unsigned long) current),
1392                         opts,
1393                         DATA_PROMOTE,
1394                         (struct data_opts) {
1395                                 .target = opts.promote_target
1396                         },
1397                         btree_id,
1398                         bkey_s_c_null);
1399         BUG_ON(ret);
1400
1401         return op;
1402 err:
1403         if (*rbio)
1404                 bio_free_pages(&(*rbio)->bio);
1405         kfree(*rbio);
1406         *rbio = NULL;
1407         kfree(op);
1408         percpu_ref_put(&c->writes);
1409         return NULL;
1410 }
1411
1412 noinline
1413 static struct promote_op *promote_alloc(struct bch_fs *c,
1414                                                struct bvec_iter iter,
1415                                                struct bkey_s_c k,
1416                                                struct extent_ptr_decoded *pick,
1417                                                struct bch_io_opts opts,
1418                                                unsigned flags,
1419                                                struct bch_read_bio **rbio,
1420                                                bool *bounce,
1421                                                bool *read_full)
1422 {
1423         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1424         /* data might have to be decompressed in the write path: */
1425         unsigned sectors = promote_full
1426                 ? max(pick->crc.compressed_size, pick->crc.live_size)
1427                 : bvec_iter_sectors(iter);
1428         struct bpos pos = promote_full
1429                 ? bkey_start_pos(k.k)
1430                 : POS(k.k->p.inode, iter.bi_sector);
1431         struct promote_op *promote;
1432
1433         if (!should_promote(c, k, pos, opts, flags))
1434                 return NULL;
1435
1436         promote = __promote_alloc(c,
1437                                   k.k->type == KEY_TYPE_reflink_v
1438                                   ? BTREE_ID_REFLINK
1439                                   : BTREE_ID_EXTENTS,
1440                                   pos, pick, opts, sectors, rbio);
1441         if (!promote)
1442                 return NULL;
1443
1444         *bounce         = true;
1445         *read_full      = promote_full;
1446         return promote;
1447 }
1448
1449 /* Read */
1450
1451 #define READ_RETRY_AVOID        1
1452 #define READ_RETRY              2
1453 #define READ_ERR                3
1454
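/*
 * Context a rbio's completion work is currently running in, ordered from most
 * to least restrictive: bch2_rbio_punt() runs the work inline when the
 * requested context is already satisfied (context <= rbio->context), and
 * otherwise punts it to the given workqueue.
 */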
1455 enum rbio_context {
1456         RBIO_CONTEXT_NULL,
1457         RBIO_CONTEXT_HIGHPRI,
1458         RBIO_CONTEXT_UNBOUND,
1459 };
1460
1461 static inline struct bch_read_bio *
1462 bch2_rbio_parent(struct bch_read_bio *rbio)
1463 {
1464         return rbio->split ? rbio->parent : rbio;
1465 }
1466
1467 __always_inline
1468 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1469                            enum rbio_context context,
1470                            struct workqueue_struct *wq)
1471 {
1472         if (context <= rbio->context) {
1473                 fn(&rbio->work);
1474         } else {
1475                 rbio->work.func         = fn;
1476                 rbio->context           = context;
1477                 queue_work(wq, &rbio->work);
1478         }
1479 }
1480
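/*
 * Release the resources owned by a rbio - the promote op, bounce pages and,
 * for split rbios, the clone itself - and return the rbio that completion
 * should continue on: the parent for splits, otherwise the rbio we were given.
 */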
1481 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1482 {
1483         BUG_ON(rbio->bounce && !rbio->split);
1484
1485         if (rbio->promote)
1486                 promote_free(rbio->c, rbio->promote);
1487         rbio->promote = NULL;
1488
1489         if (rbio->bounce)
1490                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1491
1492         if (rbio->split) {
1493                 struct bch_read_bio *parent = rbio->parent;
1494
1495                 if (rbio->kmalloc)
1496                         kfree(rbio);
1497                 else
1498                         bio_put(&rbio->bio);
1499
1500                 rbio = parent;
1501         }
1502
1503         return rbio;
1504 }
1505
1506 /*
1507  * Only called on a top level bch_read_bio to complete an entire read request,
1508  * not a split:
1509  */
1510 static void bch2_rbio_done(struct bch_read_bio *rbio)
1511 {
1512         if (rbio->start_time)
1513                 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1514                                        rbio->start_time);
1515         bio_endio(&rbio->bio);
1516 }
1517
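/*
 * Retry a BCH_READ_NODECODE read (one that reads an extent verbatim, without
 * decrypting/decompressing): re-look up the extent at rbio->pos and only
 * reissue the read if the pointer we originally picked is still present;
 * otherwise the extent no longer exists and we signal that via rbio->hole.
 * bch2_read_retry() below is the equivalent for normal reads and re-walks the
 * extents btree for the whole remaining request.
 */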
1518 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1519                                      struct bvec_iter bvec_iter, u64 inode,
1520                                      struct bch_io_failures *failed,
1521                                      unsigned flags)
1522 {
1523         struct btree_trans trans;
1524         struct btree_iter *iter;
1525         struct bkey_on_stack sk;
1526         struct bkey_s_c k;
1527         int ret;
1528
1529         flags &= ~BCH_READ_LAST_FRAGMENT;
1530         flags |= BCH_READ_MUST_CLONE;
1531
1532         bkey_on_stack_init(&sk);
1533         bch2_trans_init(&trans, c, 0, 0);
1534
1535         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1536                                    rbio->pos, BTREE_ITER_SLOTS);
1537 retry:
1538         rbio->bio.bi_status = 0;
1539
1540         k = bch2_btree_iter_peek_slot(iter);
1541         if (bkey_err(k))
1542                 goto err;
1543
1544         bkey_on_stack_reassemble(&sk, c, k);
1545         k = bkey_i_to_s_c(sk.k);
1546         bch2_trans_unlock(&trans);
1547
1548         if (!bch2_bkey_matches_ptr(c, k,
1549                                    rbio->pick.ptr,
1550                                    rbio->pos.offset -
1551                                    rbio->pick.crc.offset)) {
1552                 /* extent we wanted to read no longer exists: */
1553                 rbio->hole = true;
1554                 goto out;
1555         }
1556
1557         ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
1558         if (ret == READ_RETRY)
1559                 goto retry;
1560         if (ret)
1561                 goto err;
1562 out:
1563         bch2_rbio_done(rbio);
1564         bch2_trans_exit(&trans);
1565         bkey_on_stack_exit(&sk, c);
1566         return;
1567 err:
1568         rbio->bio.bi_status = BLK_STS_IOERR;
1569         goto out;
1570 }
1571
1572 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1573                             struct bvec_iter bvec_iter, u64 inode,
1574                             struct bch_io_failures *failed, unsigned flags)
1575 {
1576         struct btree_trans trans;
1577         struct btree_iter *iter;
1578         struct bkey_on_stack sk;
1579         struct bkey_s_c k;
1580         int ret;
1581
1582         flags &= ~BCH_READ_LAST_FRAGMENT;
1583         flags |= BCH_READ_MUST_CLONE;
1584
1585         bkey_on_stack_init(&sk);
1586         bch2_trans_init(&trans, c, 0, 0);
1587 retry:
1588         bch2_trans_begin(&trans);
1589
1590         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1591                            POS(inode, bvec_iter.bi_sector),
1592                            BTREE_ITER_SLOTS, k, ret) {
1593                 unsigned bytes, sectors, offset_into_extent;
1594
1595                 bkey_on_stack_reassemble(&sk, c, k);
1596                 k = bkey_i_to_s_c(sk.k);
1597
1598                 offset_into_extent = iter->pos.offset -
1599                         bkey_start_offset(k.k);
1600                 sectors = k.k->size - offset_into_extent;
1601
1602                 ret = bch2_read_indirect_extent(&trans,
1603                                         &offset_into_extent, sk.k);
1604                 if (ret)
1605                         break;
1606
1607                 sectors = min(sectors, k.k->size - offset_into_extent);
1608
1609                 bch2_trans_unlock(&trans);
1610
1611                 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
1612                 swap(bvec_iter.bi_size, bytes);
1613
1614                 ret = __bch2_read_extent(c, rbio, bvec_iter, k,
1615                                 offset_into_extent, failed, flags);
1616                 switch (ret) {
1617                 case READ_RETRY:
1618                         goto retry;
1619                 case READ_ERR:
1620                         goto err;
1621                 }
1622
1623                 if (bytes == bvec_iter.bi_size)
1624                         goto out;
1625
1626                 swap(bvec_iter.bi_size, bytes);
1627                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1628         }
1629
1630         if (ret == -EINTR)
1631                 goto retry;
1632         /*
1633          * If we get here, it better have been because there was an error
1634          * reading a btree node
1635          */
1636         BUG_ON(!ret);
1637         __bcache_io_error(c, "btree IO error: %i", ret);
1638 err:
1639         rbio->bio.bi_status = BLK_STS_IOERR;
1640 out:
1641         bch2_trans_exit(&trans);
1642         bkey_on_stack_exit(&sk, c);
1643         bch2_rbio_done(rbio);
1644 }
1645
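/*
 * Top level retry path, run out of a workqueue: drop the old read's state,
 * mark the device that failed (for READ_RETRY_AVOID) so a different replica is
 * picked, and reissue the read with BCH_READ_IN_RETRY set and promotion
 * disabled.
 */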
1646 static void bch2_rbio_retry(struct work_struct *work)
1647 {
1648         struct bch_read_bio *rbio =
1649                 container_of(work, struct bch_read_bio, work);
1650         struct bch_fs *c        = rbio->c;
1651         struct bvec_iter iter   = rbio->bvec_iter;
1652         unsigned flags          = rbio->flags;
1653         u64 inode               = rbio->pos.inode;
1654         struct bch_io_failures failed = { .nr = 0 };
1655
1656         trace_read_retry(&rbio->bio);
1657
1658         if (rbio->retry == READ_RETRY_AVOID)
1659                 bch2_mark_io_failure(&failed, &rbio->pick);
1660
1661         rbio->bio.bi_status = 0;
1662
1663         rbio = bch2_rbio_free(rbio);
1664
1665         flags |= BCH_READ_IN_RETRY;
1666         flags &= ~BCH_READ_MAY_PROMOTE;
1667
1668         if (flags & BCH_READ_NODECODE)
1669                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1670         else
1671                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1672 }
1673
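/*
 * Handle a failed read: in the synchronous retry path we only record the
 * status in rbio->retry for the caller; otherwise READ_ERR completes the
 * request with @error, and the retry cases are punted to bch2_rbio_retry() on
 * the unbound workqueue.
 */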
1674 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1675                             blk_status_t error)
1676 {
1677         rbio->retry = retry;
1678
1679         if (rbio->flags & BCH_READ_IN_RETRY)
1680                 return;
1681
1682         if (retry == READ_ERR) {
1683                 rbio = bch2_rbio_free(rbio);
1684
1685                 rbio->bio.bi_status = error;
1686                 bch2_rbio_done(rbio);
1687         } else {
1688                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1689                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1690         }
1691 }
1692
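/*
 * If we read an entire extent but only part of it is live, we can "narrow" the
 * crc in the extent key so it covers just the live range - presumably so
 * future reads don't have to read and checksum data that's no longer there.
 * This re-looks up the key, verifies it still points at the data we just read,
 * recomputes the checksum for the narrowed range (verifying the existing one
 * in the process), and commits the update - bailing out if the extent changed
 * in the meantime.
 */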
1693 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1694 {
1695         struct bch_fs *c = rbio->c;
1696         struct btree_trans trans;
1697         struct btree_iter *iter;
1698         struct bkey_s_c k;
1699         struct bkey_on_stack new;
1700         struct bch_extent_crc_unpacked new_crc;
1701         u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
1702         int ret;
1703
1704         if (rbio->pick.crc.compression_type)
1705                 return;
1706
1707         bkey_on_stack_init(&new);
1708         bch2_trans_init(&trans, c, 0, 0);
1709 retry:
1710         bch2_trans_begin(&trans);
1711
1712         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
1713                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
1714         k = bch2_btree_iter_peek_slot(iter);
1715         if (IS_ERR_OR_NULL(k.k))
1716                 goto out;
1717
1718         bkey_on_stack_reassemble(&new, c, k);
1719         k = bkey_i_to_s_c(new.k);
1720
1721         if (bversion_cmp(k.k->version, rbio->version) ||
1722             !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
1723                 goto out;
1724
1725         /* Extent was merged? */
1726         if (bkey_start_offset(k.k) < data_offset ||
1727             k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1728                 goto out;
1729
1730         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1731                         rbio->pick.crc, NULL, &new_crc,
1732                         bkey_start_offset(k.k) - data_offset, k.k->size,
1733                         rbio->pick.crc.csum_type)) {
1734                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1735                 goto out;
1736         }
1737
1738         if (!bch2_bkey_narrow_crcs(new.k, new_crc))
1739                 goto out;
1740
1741         bch2_trans_update(&trans, iter, new.k);
1742         ret = bch2_trans_commit(&trans, NULL, NULL,
1743                                 BTREE_INSERT_NOFAIL|
1744                                 BTREE_INSERT_NOWAIT);
1745         if (ret == -EINTR)
1746                 goto retry;
1747 out:
1748         bch2_trans_exit(&trans);
1749         bkey_on_stack_exit(&new, c);
1750 }
1751
1752 /* Inner part that may run in process context */
1753 static void __bch2_read_endio(struct work_struct *work)
1754 {
1755         struct bch_read_bio *rbio =
1756                 container_of(work, struct bch_read_bio, work);
1757         struct bch_fs *c        = rbio->c;
1758         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1759         struct bio *src         = &rbio->bio;
1760         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1761         struct bvec_iter dst_iter = rbio->bvec_iter;
1762         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1763         struct nonce nonce = extent_nonce(rbio->version, crc);
1764         struct bch_csum csum;
1765
1766         /* Reset iterator for checksumming and copying bounced data: */
1767         if (rbio->bounce) {
1768                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1769                 src->bi_iter.bi_idx             = 0;
1770                 src->bi_iter.bi_bvec_done       = 0;
1771         } else {
1772                 src->bi_iter                    = rbio->bvec_iter;
1773         }
1774
1775         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1776         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1777                 goto csum_err;
1778
1779         if (unlikely(rbio->narrow_crcs))
1780                 bch2_rbio_narrow_crcs(rbio);
1781
1782         if (rbio->flags & BCH_READ_NODECODE)
1783                 goto nodecode;
1784
1785         /* Adjust crc to point to subset of data we want: */
1786         crc.offset     += rbio->offset_into_extent;
1787         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1788
1789         if (crc.compression_type != BCH_COMPRESSION_TYPE_none) {
1790                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1791                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1792                         goto decompression_err;
1793         } else {
1794                 /* don't need to decrypt the entire bio: */
1795                 nonce = nonce_add(nonce, crc.offset << 9);
1796                 bio_advance(src, crc.offset << 9);
1797
1798                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1799                 src->bi_iter.bi_size = dst_iter.bi_size;
1800
1801                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1802
1803                 if (rbio->bounce) {
1804                         struct bvec_iter src_iter = src->bi_iter;
1805                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1806                 }
1807         }
1808
1809         if (rbio->promote) {
1810                 /*
1811                  * Re-encrypt data we decrypted, so it's consistent with
1812                  * rbio->crc:
1813                  */
1814                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1815                 promote_start(rbio->promote, rbio);
1816                 rbio->promote = NULL;
1817         }
1818 nodecode:
1819         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1820                 rbio = bch2_rbio_free(rbio);
1821                 bch2_rbio_done(rbio);
1822         }
1823         return;
1824 csum_err:
1825         /*
1826          * Checksum error: if the bio wasn't bounced, we may have been
1827          * reading into buffers owned by userspace (that userspace can
1828          * scribble over) - retry the read, bouncing it this time:
1829          */
1830         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1831                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1832                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1833                 return;
1834         }
1835
1836         bch2_dev_io_error(ca,
1837                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1838                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1839                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1840                 csum.hi, csum.lo, crc.csum_type);
1841         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1842         return;
1843 decompression_err:
1844         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1845                           rbio->pos.inode,
1846                           (u64) rbio->bvec_iter.bi_sector);
1847         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1848         return;
1849 }
1850
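/*
 * Completion for the bio we submitted to the device: account latency, check
 * for device errors and stale cached pointers (both of which trigger a retry),
 * then hand the remaining work - checksum verification, decryption,
 * decompression, copying out of the bounce buffer - to __bch2_read_endio() in
 * a context where it's allowed to block, if necessary.
 */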
1851 static void bch2_read_endio(struct bio *bio)
1852 {
1853         struct bch_read_bio *rbio =
1854                 container_of(bio, struct bch_read_bio, bio);
1855         struct bch_fs *c        = rbio->c;
1856         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1857         struct workqueue_struct *wq = NULL;
1858         enum rbio_context context = RBIO_CONTEXT_NULL;
1859
1860         if (rbio->have_ioref) {
1861                 bch2_latency_acct(ca, rbio->submit_time, READ);
1862                 percpu_ref_put(&ca->io_ref);
1863         }
1864
1865         if (!rbio->split)
1866                 rbio->bio.bi_end_io = rbio->end_io;
1867
1868         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1869                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1870                 return;
1871         }
1872
1873         if (rbio->pick.ptr.cached &&
1874             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1875              ptr_stale(ca, &rbio->pick.ptr))) {
1876                 atomic_long_inc(&c->read_realloc_races);
1877
1878                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1879                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1880                 else
1881                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1882                 return;
1883         }
1884
1885         if (rbio->narrow_crcs ||
1886             rbio->pick.crc.compression_type ||
1887             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1888                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1889         else if (rbio->pick.crc.csum_type)
1890                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1891
1892         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1893 }
1894
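/*
 * Resolve a reflink pointer: look up the indirect extent it points to in the
 * reflink btree, return that key in @orig_k, and adjust *offset_into_extent to
 * be relative to the start of the indirect extent. A missing indirect extent
 * is reported as an IO error.
 */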
1895 int __bch2_read_indirect_extent(struct btree_trans *trans,
1896                                 unsigned *offset_into_extent,
1897                                 struct bkey_i *orig_k)
1898 {
1899         struct btree_iter *iter;
1900         struct bkey_s_c k;
1901         u64 reflink_offset;
1902         int ret;
1903
1904         reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
1905                 *offset_into_extent;
1906
1907         iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
1908                                    POS(0, reflink_offset),
1909                                    BTREE_ITER_SLOTS);
1910         ret = PTR_ERR_OR_ZERO(iter);
1911         if (ret)
1912                 return ret;
1913
1914         k = bch2_btree_iter_peek_slot(iter);
1915         ret = bkey_err(k);
1916         if (ret)
1917                 goto err;
1918
1919         if (k.k->type != KEY_TYPE_reflink_v) {
1920                 __bcache_io_error(trans->c,
1921                                 "pointer to nonexistent indirect extent");
1922                 ret = -EIO;
1923                 goto err;
1924         }
1925
1926         *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
1927         bkey_reassemble(orig_k, k);
1928 err:
1929         bch2_trans_iter_put(trans, iter);
1930         return ret;
1931 }
1932
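/*
 * Read a single extent (or the part of it covered by @iter): pick a replica to
 * read from, decide whether we need to bounce (for checksums, compression or
 * encryption) and whether to promote, then submit the read - or, when
 * rbio->pick.idx is set, attempt an erasure coded reconstruct read instead.
 * Inline extents are simply copied out of the key. With BCH_READ_IN_RETRY the
 * read is performed synchronously and a READ_* status is returned; otherwise
 * this returns 0 and completion runs via bch2_read_endio().
 */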
1933 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1934                        struct bvec_iter iter, struct bkey_s_c k,
1935                        unsigned offset_into_extent,
1936                        struct bch_io_failures *failed, unsigned flags)
1937 {
1938         struct extent_ptr_decoded pick;
1939         struct bch_read_bio *rbio = NULL;
1940         struct bch_dev *ca;
1941         struct promote_op *promote = NULL;
1942         bool bounce = false, read_full = false, narrow_crcs = false;
1943         struct bpos pos = bkey_start_pos(k.k);
1944         int pick_ret;
1945
1946         if (k.k->type == KEY_TYPE_inline_data) {
1947                 struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
1948                 unsigned bytes = min_t(unsigned, iter.bi_size,
1949                                        bkey_val_bytes(d.k));
1950
1951                 swap(iter.bi_size, bytes);
1952                 memcpy_to_bio(&orig->bio, iter, d.v->data);
1953                 swap(iter.bi_size, bytes);
1954                 bio_advance_iter(&orig->bio, &iter, bytes);
1955                 zero_fill_bio_iter(&orig->bio, iter);
1956                 goto out_read_done;
1957         }
1958
1959         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1960
1961         /* hole or reservation - just zero fill: */
1962         if (!pick_ret)
1963                 goto hole;
1964
1965         if (pick_ret < 0) {
1966                 __bcache_io_error(c, "no device to read from");
1967                 goto err;
1968         }
1969
1970         if (pick_ret > 0)
1971                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1972
1973         if (flags & BCH_READ_NODECODE) {
1974                 /*
1975                  * can happen if we retry, and the extent we were going to read
1976                  * has been merged in the meantime:
1977                  */
1978                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1979                         goto hole;
1980
1981                 iter.bi_size    = pick.crc.compressed_size << 9;
1982                 goto noclone;
1983         }
1984
1985         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1986             bio_flagged(&orig->bio, BIO_CHAIN))
1987                 flags |= BCH_READ_MUST_CLONE;
1988
1989         narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
1990                 bch2_can_narrow_extent_crcs(k, pick.crc);
1991
1992         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1993                 flags |= BCH_READ_MUST_BOUNCE;
1994
1995         EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
1996
1997         if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none ||
1998             (pick.crc.csum_type != BCH_CSUM_NONE &&
1999              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2000               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
2001                (flags & BCH_READ_USER_MAPPED)) ||
2002               (flags & BCH_READ_MUST_BOUNCE)))) {
2003                 read_full = true;
2004                 bounce = true;
2005         }
2006
2007         if (orig->opts.promote_target)
2008                 promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
2009                                         &rbio, &bounce, &read_full);
2010
2011         if (!read_full) {
2012                 EBUG_ON(pick.crc.compression_type);
2013                 EBUG_ON(pick.crc.csum_type &&
2014                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2015                          bvec_iter_sectors(iter) != pick.crc.live_size ||
2016                          pick.crc.offset ||
2017                          offset_into_extent));
2018
2019                 pos.offset += offset_into_extent;
2020                 pick.ptr.offset += pick.crc.offset +
2021                         offset_into_extent;
2022                 offset_into_extent              = 0;
2023                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
2024                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
2025                 pick.crc.offset                 = 0;
2026                 pick.crc.live_size              = bvec_iter_sectors(iter);
2028         }
2029
2030         if (rbio) {
2031                 /*
2032                  * promote already allocated bounce rbio:
2033                  * promote needs to allocate a bio big enough for uncompressing
2034                  * data in the write path, but we're not going to use it all
2035                  * here:
2036                  */
2037                 EBUG_ON(rbio->bio.bi_iter.bi_size <
2038                        pick.crc.compressed_size << 9);
2039                 rbio->bio.bi_iter.bi_size =
2040                         pick.crc.compressed_size << 9;
2041         } else if (bounce) {
2042                 unsigned sectors = pick.crc.compressed_size;
2043
2044                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
2045                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
2046                                                   &c->bio_read_split),
2047                                  orig->opts);
2048
2049                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
2050                 rbio->bounce    = true;
2051                 rbio->split     = true;
2052         } else if (flags & BCH_READ_MUST_CLONE) {
2053                 /*
2054                  * Have to clone if there were any splits, due to error
2055                  * reporting issues: if a split errored and retrying didn't
2056                  * work, then when it reports the error to its parent (us) we
2057                  * can't tell whether the error was from just that split (in
2058                  * which case we should retry) or from the whole bio (in which
2059                  * case retrying would only lose the error).
2060                  */
2061                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
2062                                                 &c->bio_read_split),
2063                                  orig->opts);
2064                 rbio->bio.bi_iter = iter;
2065                 rbio->split     = true;
2066         } else {
2067 noclone:
2068                 rbio = orig;
2069                 rbio->bio.bi_iter = iter;
2070                 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
2071         }
2072
2073         EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
2074
2075         rbio->c                 = c;
2076         rbio->submit_time       = local_clock();
2077         if (rbio->split)
2078                 rbio->parent    = orig;
2079         else
2080                 rbio->end_io    = orig->bio.bi_end_io;
2081         rbio->bvec_iter         = iter;
2082         rbio->offset_into_extent = offset_into_extent;
2083         rbio->flags             = flags;
2084         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
2085         rbio->narrow_crcs       = narrow_crcs;
2086         rbio->hole              = 0;
2087         rbio->retry             = 0;
2088         rbio->context           = 0;
2089         /* XXX: only initialize this if needed */
2090         rbio->devs_have         = bch2_bkey_devs(k);
2091         rbio->pick              = pick;
2092         rbio->pos               = pos;
2093         rbio->version           = k.k->version;
2094         rbio->promote           = promote;
2095         INIT_WORK(&rbio->work, NULL);
2096
2097         rbio->bio.bi_opf        = orig->bio.bi_opf;
2098         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
2099         rbio->bio.bi_end_io     = bch2_read_endio;
2100
2101         if (rbio->bounce)
2102                 trace_read_bounce(&rbio->bio);
2103
2104         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
2105
2106         rcu_read_lock();
2107         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
2108         rcu_read_unlock();
2109
2110         if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
2111                 bio_inc_remaining(&orig->bio);
2112                 trace_read_split(&orig->bio);
2113         }
2114
2115         if (!rbio->pick.idx) {
2116                 if (!rbio->have_ioref) {
2117                         __bcache_io_error(c, "no device to read from");
2118                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2119                         goto out;
2120                 }
2121
2122                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
2123                              bio_sectors(&rbio->bio));
2124                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
2125
2126                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2127                         submit_bio(&rbio->bio);
2128                 else
2129                         submit_bio_wait(&rbio->bio);
2130         } else {
2131                 /* Attempting reconstruct read: */
2132                 if (bch2_ec_read_extent(c, rbio)) {
2133                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2134                         goto out;
2135                 }
2136
2137                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2138                         bio_endio(&rbio->bio);
2139         }
2140 out:
2141         if (likely(!(flags & BCH_READ_IN_RETRY))) {
2142                 return 0;
2143         } else {
2144                 int ret;
2145
2146                 rbio->context = RBIO_CONTEXT_UNBOUND;
2147                 bch2_read_endio(&rbio->bio);
2148
2149                 ret = rbio->retry;
2150                 rbio = bch2_rbio_free(rbio);
2151
2152                 if (ret == READ_RETRY_AVOID) {
2153                         bch2_mark_io_failure(failed, &pick);
2154                         ret = READ_RETRY;
2155                 }
2156
2157                 return ret;
2158         }
2159
2160 err:
2161         if (flags & BCH_READ_IN_RETRY)
2162                 return READ_ERR;
2163
2164         orig->bio.bi_status = BLK_STS_IOERR;
2165         goto out_read_done;
2166
2167 hole:
2168         /*
2169          * won't normally happen in the BCH_READ_NODECODE
2170          * (bch2_move_extent()) path, but if we retry and the extent we wanted
2171          * to read no longer exists we have to signal that:
2172          */
2173         if (flags & BCH_READ_NODECODE)
2174                 orig->hole = true;
2175
2176         zero_fill_bio_iter(&orig->bio, iter);
2177 out_read_done:
2178         if (flags & BCH_READ_LAST_FRAGMENT)
2179                 bch2_rbio_done(orig);
2180         return 0;
2181 }
2182
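/*
 * Entry point for a normal (decoding, promotable, user mapped) read: walk the
 * extents overlapping the request, resolving indirect extents as we go, and
 * issue bch2_read_extent() for each fragment, marking the last one with
 * BCH_READ_LAST_FRAGMENT so the request is completed exactly once.
 */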
2183 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
2184 {
2185         struct btree_trans trans;
2186         struct btree_iter *iter;
2187         struct bkey_on_stack sk;
2188         struct bkey_s_c k;
2189         unsigned flags = BCH_READ_RETRY_IF_STALE|
2190                 BCH_READ_MAY_PROMOTE|
2191                 BCH_READ_USER_MAPPED;
2192         int ret;
2193
2194         BUG_ON(rbio->_state);
2195         BUG_ON(flags & BCH_READ_NODECODE);
2196         BUG_ON(flags & BCH_READ_IN_RETRY);
2197
2198         rbio->c = c;
2199         rbio->start_time = local_clock();
2200
2201         bkey_on_stack_init(&sk);
2202         bch2_trans_init(&trans, c, 0, 0);
2203 retry:
2204         bch2_trans_begin(&trans);
2205
2206         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
2207                                    POS(inode, rbio->bio.bi_iter.bi_sector),
2208                                    BTREE_ITER_SLOTS);
2209         while (1) {
2210                 unsigned bytes, sectors, offset_into_extent;
2211
2212                 bch2_btree_iter_set_pos(iter,
2213                                 POS(inode, rbio->bio.bi_iter.bi_sector));
2214
2215                 k = bch2_btree_iter_peek_slot(iter);
2216                 ret = bkey_err(k);
2217                 if (ret)
2218                         goto err;
2219
2220                 offset_into_extent = iter->pos.offset -
2221                         bkey_start_offset(k.k);
2222                 sectors = k.k->size - offset_into_extent;
2223
2224                 bkey_on_stack_reassemble(&sk, c, k);
2225                 k = bkey_i_to_s_c(sk.k);
2226
2227                 ret = bch2_read_indirect_extent(&trans,
2228                                         &offset_into_extent, sk.k);
2229                 if (ret)
2230                         goto err;
2231
2232                 /*
2233                  * With indirect extents, the amount of data to read is the min
2234                  * of the original extent and the indirect extent:
2235                  */
2236                 sectors = min(sectors, k.k->size - offset_into_extent);
2237
2238                 /*
2239                  * Unlock the iterator while the btree node's lock is still in
2240                  * cache, before doing the IO:
2241                  */
2242                 bch2_trans_unlock(&trans);
2243
2244                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
2245                 swap(rbio->bio.bi_iter.bi_size, bytes);
2246
2247                 if (rbio->bio.bi_iter.bi_size == bytes)
2248                         flags |= BCH_READ_LAST_FRAGMENT;
2249
2250                 bch2_read_extent(c, rbio, k, offset_into_extent, flags);
2251
2252                 if (flags & BCH_READ_LAST_FRAGMENT)
2253                         break;
2254
2255                 swap(rbio->bio.bi_iter.bi_size, bytes);
2256                 bio_advance(&rbio->bio, bytes);
2257         }
2258 out:
2259         bch2_trans_exit(&trans);
2260         bkey_on_stack_exit(&sk, c);
2261         return;
2262 err:
2263         if (ret == -EINTR)
2264                 goto retry;
2265
2266         bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
2267         bch2_rbio_done(rbio);
2268         goto out;
2269 }
2270
2271 void bch2_fs_io_exit(struct bch_fs *c)
2272 {
2273         if (c->promote_table.tbl)
2274                 rhashtable_destroy(&c->promote_table);
2275         mempool_exit(&c->bio_bounce_pages);
2276         bioset_exit(&c->bio_write);
2277         bioset_exit(&c->bio_read_split);
2278         bioset_exit(&c->bio_read);
2279 }
2280
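/*
 * The biosets are initialized with a reserve of one bio each; the bounce page
 * mempool reserves max(btree_node_size, encoded_extent_max) worth of pages,
 * presumably so a single encoded extent or btree node can always be bounced
 * even under memory pressure.
 */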
2281 int bch2_fs_io_init(struct bch_fs *c)
2282 {
2283         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
2284                         BIOSET_NEED_BVECS) ||
2285             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
2286                         BIOSET_NEED_BVECS) ||
2287             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
2288                         BIOSET_NEED_BVECS) ||
2289             mempool_init_page_pool(&c->bio_bounce_pages,
2290                                    max_t(unsigned,
2291                                          c->opts.btree_node_size,
2292                                          c->sb.encoded_extent_max) /
2293                                    PAGE_SECTORS, 0) ||
2294             rhashtable_init(&c->promote_table, &bch_promote_params))
2295                 return -ENOMEM;
2296
2297         return 0;
2298 }