]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/io.c
Update bcachefs sources to 3cd63315a6 bcachefs: Track incompressible data
[bcachefs-tools-debian] / libbcachefs / io.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "alloc_foreground.h"
11 #include "bkey_on_stack.h"
12 #include "bset.h"
13 #include "btree_update.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "compress.h"
17 #include "clock.h"
18 #include "debug.h"
19 #include "disk_groups.h"
20 #include "ec.h"
21 #include "error.h"
22 #include "extent_update.h"
23 #include "inode.h"
24 #include "io.h"
25 #include "journal.h"
26 #include "keylist.h"
27 #include "move.h"
28 #include "rebalance.h"
29 #include "super.h"
30 #include "super-io.h"
31
32 #include <linux/blkdev.h>
33 #include <linux/random.h>
34
35 #include <trace/events/bcachefs.h>
36
37 static bool bch2_target_congested(struct bch_fs *c, u16 target)
38 {
39         const struct bch_devs_mask *devs;
40         unsigned d, nr = 0, total = 0;
41         u64 now = local_clock(), last;
42         s64 congested;
43         struct bch_dev *ca;
44
45         if (!target)
46                 return false;
47
48         rcu_read_lock();
49         devs = bch2_target_to_mask(c, target);
50         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
51                 ca = rcu_dereference(c->devs[d]);
52                 if (!ca)
53                         continue;
54
55                 congested = atomic_read(&ca->congested);
56                 last = READ_ONCE(ca->congested_last);
57                 if (time_after64(now, last))
58                         congested -= (now - last) >> 12;
59
60                 total += max(congested, 0LL);
61                 nr++;
62         }
63         rcu_read_unlock();
64
65         return bch2_rand_range(nr * CONGESTED_MAX) < total;
66 }
67
68 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
69                                        u64 now, int rw)
70 {
71         u64 latency_capable =
72                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
73         /* ideally we'd be taking into account the device's variance here: */
74         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
75         s64 latency_over = io_latency - latency_threshold;
76
77         if (latency_threshold && latency_over > 0) {
78                 /*
79                  * bump up congested by approximately latency_over * 4 /
80                  * latency_threshold - we don't need much accuracy here so don't
81                  * bother with the divide:
82                  */
83                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
84                         atomic_add(latency_over >>
85                                    max_t(int, ilog2(latency_threshold) - 2, 0),
86                                    &ca->congested);
87
88                 ca->congested_last = now;
89         } else if (atomic_read(&ca->congested) > 0) {
90                 atomic_dec(&ca->congested);
91         }
92 }
93
94 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
95 {
96         atomic64_t *latency = &ca->cur_latency[rw];
97         u64 now = local_clock();
98         u64 io_latency = time_after64(now, submit_time)
99                 ? now - submit_time
100                 : 0;
101         u64 old, new, v = atomic64_read(latency);
102
103         do {
104                 old = v;
105
106                 /*
107                  * If the io latency was reasonably close to the current
108                  * latency, skip doing the update and atomic operation - most of
109                  * the time:
110                  */
111                 if (abs((int) (old - io_latency)) < (old >> 1) &&
112                     now & ~(~0 << 5))
113                         break;
114
115                 new = ewma_add(old, io_latency, 5);
116         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
117
118         bch2_congested_acct(ca, io_latency, now, rw);
119
120         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
121 }
122
123 /* Allocate, free from mempool: */
124
125 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
126 {
127         struct bvec_iter_all iter;
128         struct bio_vec *bv;
129
130         bio_for_each_segment_all(bv, bio, iter)
131                 if (bv->bv_page != ZERO_PAGE(0))
132                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
133         bio->bi_vcnt = 0;
134 }
135
136 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
137 {
138         struct page *page;
139
140         if (likely(!*using_mempool)) {
141                 page = alloc_page(GFP_NOIO);
142                 if (unlikely(!page)) {
143                         mutex_lock(&c->bio_bounce_pages_lock);
144                         *using_mempool = true;
145                         goto pool_alloc;
146
147                 }
148         } else {
149 pool_alloc:
150                 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
151         }
152
153         return page;
154 }
155
156 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
157                                size_t size)
158 {
159         bool using_mempool = false;
160
161         while (size) {
162                 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
163                 unsigned len = min(PAGE_SIZE, size);
164
165                 BUG_ON(!bio_add_page(bio, page, len, 0));
166                 size -= len;
167         }
168
169         if (using_mempool)
170                 mutex_unlock(&c->bio_bounce_pages_lock);
171 }
172
173 /* Extent update path: */
174
175 static int sum_sector_overwrites(struct btree_trans *trans,
176                                  struct btree_iter *extent_iter,
177                                  struct bkey_i *new,
178                                  bool may_allocate,
179                                  bool *maybe_extending,
180                                  s64 *delta)
181 {
182         struct btree_iter *iter;
183         struct bkey_s_c old;
184         int ret = 0;
185
186         *maybe_extending = true;
187         *delta = 0;
188
189         iter = bch2_trans_copy_iter(trans, extent_iter);
190         if (IS_ERR(iter))
191                 return PTR_ERR(iter);
192
193         for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
194                 if (!may_allocate &&
195                     bch2_bkey_nr_ptrs_fully_allocated(old) <
196                     bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
197                         ret = -ENOSPC;
198                         break;
199                 }
200
201                 *delta += (min(new->k.p.offset,
202                               old.k->p.offset) -
203                           max(bkey_start_offset(&new->k),
204                               bkey_start_offset(old.k))) *
205                         (bkey_extent_is_allocation(&new->k) -
206                          bkey_extent_is_allocation(old.k));
207
208                 if (bkey_cmp(old.k->p, new->k.p) >= 0) {
209                         /*
210                          * Check if there's already data above where we're
211                          * going to be writing to - this means we're definitely
212                          * not extending the file:
213                          *
214                          * Note that it's not sufficient to check if there's
215                          * data up to the sector offset we're going to be
216                          * writing to, because i_size could be up to one block
217                          * less:
218                          */
219                         if (!bkey_cmp(old.k->p, new->k.p))
220                                 old = bch2_btree_iter_next(iter);
221
222                         if (old.k && !bkey_err(old) &&
223                             old.k->p.inode == extent_iter->pos.inode &&
224                             bkey_extent_is_data(old.k))
225                                 *maybe_extending = false;
226
227                         break;
228                 }
229         }
230
231         bch2_trans_iter_put(trans, iter);
232         return ret;
233 }
234
235 int bch2_extent_update(struct btree_trans *trans,
236                        struct btree_iter *iter,
237                        struct bkey_i *k,
238                        struct disk_reservation *disk_res,
239                        u64 *journal_seq,
240                        u64 new_i_size,
241                        s64 *i_sectors_delta)
242 {
243         /* this must live until after bch2_trans_commit(): */
244         struct bkey_inode_buf inode_p;
245         bool extending = false;
246         s64 delta = 0;
247         int ret;
248
249         ret = bch2_extent_trim_atomic(k, iter);
250         if (ret)
251                 return ret;
252
253         ret = sum_sector_overwrites(trans, iter, k,
254                         disk_res && disk_res->sectors != 0,
255                         &extending, &delta);
256         if (ret)
257                 return ret;
258
259         new_i_size = extending
260                 ? min(k->k.p.offset << 9, new_i_size)
261                 : 0;
262
263         if (delta || new_i_size) {
264                 struct btree_iter *inode_iter;
265                 struct bch_inode_unpacked inode_u;
266
267                 inode_iter = bch2_inode_peek(trans, &inode_u,
268                                 k->k.p.inode, BTREE_ITER_INTENT);
269                 if (IS_ERR(inode_iter))
270                         return PTR_ERR(inode_iter);
271
272                 /*
273                  * XXX:
274                  * writeback can race a bit with truncate, because truncate
275                  * first updates the inode then truncates the pagecache. This is
276                  * ugly, but lets us preserve the invariant that the in memory
277                  * i_size is always >= the on disk i_size.
278                  *
279                 BUG_ON(new_i_size > inode_u.bi_size &&
280                        (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
281                  */
282                 BUG_ON(new_i_size > inode_u.bi_size && !extending);
283
284                 if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
285                     new_i_size > inode_u.bi_size)
286                         inode_u.bi_size = new_i_size;
287                 else
288                         new_i_size = 0;
289
290                 inode_u.bi_sectors += delta;
291
292                 if (delta || new_i_size) {
293                         bch2_inode_pack(&inode_p, &inode_u);
294                         bch2_trans_update(trans, inode_iter,
295                                           &inode_p.inode.k_i, 0);
296                 }
297
298                 bch2_trans_iter_put(trans, inode_iter);
299         }
300
301         bch2_trans_update(trans, iter, k, 0);
302
303         ret = bch2_trans_commit(trans, disk_res, journal_seq,
304                                 BTREE_INSERT_NOCHECK_RW|
305                                 BTREE_INSERT_NOFAIL|
306                                 BTREE_INSERT_USE_RESERVE);
307         if (!ret && i_sectors_delta)
308                 *i_sectors_delta += delta;
309
310         return ret;
311 }
312
313 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
314                    struct bpos end, u64 *journal_seq,
315                    s64 *i_sectors_delta)
316 {
317         struct bch_fs *c        = trans->c;
318         unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
319         struct bkey_s_c k;
320         int ret = 0, ret2 = 0;
321
322         while ((k = bch2_btree_iter_peek(iter)).k &&
323                bkey_cmp(iter->pos, end) < 0) {
324                 struct disk_reservation disk_res =
325                         bch2_disk_reservation_init(c, 0);
326                 struct bkey_i delete;
327
328                 bch2_trans_reset(trans, TRANS_RESET_MEM);
329
330                 ret = bkey_err(k);
331                 if (ret)
332                         goto btree_err;
333
334                 bkey_init(&delete.k);
335                 delete.k.p = iter->pos;
336
337                 /* create the biggest key we can */
338                 bch2_key_resize(&delete.k, max_sectors);
339                 bch2_cut_back(end, &delete);
340
341                 ret = bch2_extent_update(trans, iter, &delete,
342                                 &disk_res, journal_seq,
343                                 0, i_sectors_delta);
344                 bch2_disk_reservation_put(c, &disk_res);
345 btree_err:
346                 if (ret == -EINTR) {
347                         ret2 = ret;
348                         ret = 0;
349                 }
350                 if (ret)
351                         break;
352         }
353
354         if (bkey_cmp(iter->pos, end) > 0) {
355                 bch2_btree_iter_set_pos(iter, end);
356                 ret = bch2_btree_iter_traverse(iter);
357         }
358
359         return ret ?: ret2;
360 }
361
362 int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
363                 u64 *journal_seq, s64 *i_sectors_delta)
364 {
365         struct btree_trans trans;
366         struct btree_iter *iter;
367         int ret = 0;
368
369         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
370         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
371                                    POS(inum, start),
372                                    BTREE_ITER_INTENT);
373
374         ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
375                              journal_seq, i_sectors_delta);
376         bch2_trans_exit(&trans);
377
378         if (ret == -EINTR)
379                 ret = 0;
380
381         return ret;
382 }
383
384 int bch2_write_index_default(struct bch_write_op *op)
385 {
386         struct bch_fs *c = op->c;
387         struct bkey_on_stack sk;
388         struct keylist *keys = &op->insert_keys;
389         struct bkey_i *k = bch2_keylist_front(keys);
390         struct btree_trans trans;
391         struct btree_iter *iter;
392         int ret;
393
394         bkey_on_stack_init(&sk);
395         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
396
397         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
398                                    bkey_start_pos(&k->k),
399                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
400
401         do {
402                 bch2_trans_reset(&trans, TRANS_RESET_MEM);
403
404                 k = bch2_keylist_front(keys);
405
406                 bkey_on_stack_realloc(&sk, c, k->k.u64s);
407                 bkey_copy(sk.k, k);
408                 bch2_cut_front(iter->pos, sk.k);
409
410                 ret = bch2_extent_update(&trans, iter, sk.k,
411                                          &op->res, op_journal_seq(op),
412                                          op->new_i_size, &op->i_sectors_delta);
413                 if (ret == -EINTR)
414                         continue;
415                 if (ret)
416                         break;
417
418                 if (bkey_cmp(iter->pos, k->k.p) >= 0)
419                         bch2_keylist_pop_front(keys);
420         } while (!bch2_keylist_empty(keys));
421
422         bch2_trans_exit(&trans);
423         bkey_on_stack_exit(&sk, c);
424
425         return ret;
426 }
427
428 /* Writes */
429
430 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
431                                enum bch_data_type type,
432                                const struct bkey_i *k)
433 {
434         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
435         const struct bch_extent_ptr *ptr;
436         struct bch_write_bio *n;
437         struct bch_dev *ca;
438
439         BUG_ON(c->opts.nochanges);
440
441         bkey_for_each_ptr(ptrs, ptr) {
442                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
443                        !c->devs[ptr->dev]);
444
445                 ca = bch_dev_bkey_exists(c, ptr->dev);
446
447                 if (to_entry(ptr + 1) < ptrs.end) {
448                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
449                                                    &ca->replica_set));
450
451                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
452                         n->bio.bi_private       = wbio->bio.bi_private;
453                         n->parent               = wbio;
454                         n->split                = true;
455                         n->bounce               = false;
456                         n->put_bio              = true;
457                         n->bio.bi_opf           = wbio->bio.bi_opf;
458                         bio_inc_remaining(&wbio->bio);
459                 } else {
460                         n = wbio;
461                         n->split                = false;
462                 }
463
464                 n->c                    = c;
465                 n->dev                  = ptr->dev;
466                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
467                 n->submit_time          = local_clock();
468                 n->bio.bi_iter.bi_sector = ptr->offset;
469
470                 if (!journal_flushes_device(ca))
471                         n->bio.bi_opf |= REQ_FUA;
472
473                 if (likely(n->have_ioref)) {
474                         this_cpu_add(ca->io_done->sectors[WRITE][type],
475                                      bio_sectors(&n->bio));
476
477                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
478                         submit_bio(&n->bio);
479                 } else {
480                         n->bio.bi_status        = BLK_STS_REMOVED;
481                         bio_endio(&n->bio);
482                 }
483         }
484 }
485
486 static void __bch2_write(struct closure *);
487
488 static void bch2_write_done(struct closure *cl)
489 {
490         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
491         struct bch_fs *c = op->c;
492
493         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
494                 op->error = bch2_journal_error(&c->journal);
495
496         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
497                 bch2_disk_reservation_put(c, &op->res);
498         percpu_ref_put(&c->writes);
499         bch2_keylist_free(&op->insert_keys, op->inline_keys);
500
501         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
502
503         if (op->end_io) {
504                 EBUG_ON(cl->parent);
505                 closure_debug_destroy(cl);
506                 op->end_io(op);
507         } else {
508                 closure_return(cl);
509         }
510 }
511
512 /**
513  * bch_write_index - after a write, update index to point to new data
514  */
515 static void __bch2_write_index(struct bch_write_op *op)
516 {
517         struct bch_fs *c = op->c;
518         struct keylist *keys = &op->insert_keys;
519         struct bch_extent_ptr *ptr;
520         struct bkey_i *src, *dst = keys->keys, *n, *k;
521         unsigned dev;
522         int ret;
523
524         for (src = keys->keys; src != keys->top; src = n) {
525                 n = bkey_next(src);
526
527                 if (bkey_extent_is_direct_data(&src->k)) {
528                         bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
529                                             test_bit(ptr->dev, op->failed.d));
530
531                         if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
532                                 ret = -EIO;
533                                 goto err;
534                         }
535                 }
536
537                 if (dst != src)
538                         memmove_u64s_down(dst, src, src->u64s);
539                 dst = bkey_next(dst);
540         }
541
542         keys->top = dst;
543
544         /*
545          * probably not the ideal place to hook this in, but I don't
546          * particularly want to plumb io_opts all the way through the btree
547          * update stack right now
548          */
549         for_each_keylist_key(keys, k) {
550                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
551
552                 if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
553                         bch2_check_set_feature(op->c, BCH_FEATURE_incompressible);
554
555         }
556
557         if (!bch2_keylist_empty(keys)) {
558                 u64 sectors_start = keylist_sectors(keys);
559                 int ret = op->index_update_fn(op);
560
561                 BUG_ON(ret == -EINTR);
562                 BUG_ON(keylist_sectors(keys) && !ret);
563
564                 op->written += sectors_start - keylist_sectors(keys);
565
566                 if (ret) {
567                         __bcache_io_error(c, "btree IO error %i", ret);
568                         op->error = ret;
569                 }
570         }
571 out:
572         /* If some a bucket wasn't written, we can't erasure code it: */
573         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
574                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
575
576         bch2_open_buckets_put(c, &op->open_buckets);
577         return;
578 err:
579         keys->top = keys->keys;
580         op->error = ret;
581         goto out;
582 }
583
584 static void bch2_write_index(struct closure *cl)
585 {
586         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
587         struct bch_fs *c = op->c;
588
589         __bch2_write_index(op);
590
591         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
592                 bch2_journal_flush_seq_async(&c->journal,
593                                              *op_journal_seq(op),
594                                              cl);
595                 continue_at(cl, bch2_write_done, index_update_wq(op));
596         } else {
597                 continue_at_nobarrier(cl, bch2_write_done, NULL);
598         }
599 }
600
601 static void bch2_write_endio(struct bio *bio)
602 {
603         struct closure *cl              = bio->bi_private;
604         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
605         struct bch_write_bio *wbio      = to_wbio(bio);
606         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
607         struct bch_fs *c                = wbio->c;
608         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
609
610         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
611                 set_bit(wbio->dev, op->failed.d);
612
613         if (wbio->have_ioref) {
614                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
615                 percpu_ref_put(&ca->io_ref);
616         }
617
618         if (wbio->bounce)
619                 bch2_bio_free_pages_pool(c, bio);
620
621         if (wbio->put_bio)
622                 bio_put(bio);
623
624         if (parent)
625                 bio_endio(&parent->bio);
626         else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
627                 closure_put(cl);
628         else
629                 continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
630 }
631
632 static void init_append_extent(struct bch_write_op *op,
633                                struct write_point *wp,
634                                struct bversion version,
635                                struct bch_extent_crc_unpacked crc)
636 {
637         struct bch_fs *c = op->c;
638         struct bkey_i_extent *e;
639         struct open_bucket *ob;
640         unsigned i;
641
642         BUG_ON(crc.compressed_size > wp->sectors_free);
643         wp->sectors_free -= crc.compressed_size;
644         op->pos.offset += crc.uncompressed_size;
645
646         e = bkey_extent_init(op->insert_keys.top);
647         e->k.p          = op->pos;
648         e->k.size       = crc.uncompressed_size;
649         e->k.version    = version;
650
651         if (crc.csum_type ||
652             crc.compression_type ||
653             crc.nonce)
654                 bch2_extent_crc_append(&e->k_i, crc);
655
656         open_bucket_for_each(c, &wp->ptrs, ob, i) {
657                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
658                 union bch_extent_entry *end =
659                         bkey_val_end(bkey_i_to_s(&e->k_i));
660
661                 end->ptr = ob->ptr;
662                 end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
663                 end->ptr.cached = !ca->mi.durability ||
664                         (op->flags & BCH_WRITE_CACHED) != 0;
665                 end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
666
667                 e->k.u64s++;
668
669                 BUG_ON(crc.compressed_size > ob->sectors_free);
670                 ob->sectors_free -= crc.compressed_size;
671         }
672
673         bch2_keylist_push(&op->insert_keys);
674 }
675
676 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
677                                         struct write_point *wp,
678                                         struct bio *src,
679                                         bool *page_alloc_failed,
680                                         void *buf)
681 {
682         struct bch_write_bio *wbio;
683         struct bio *bio;
684         unsigned output_available =
685                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
686         unsigned pages = DIV_ROUND_UP(output_available +
687                                       (buf
688                                        ? ((unsigned long) buf & (PAGE_SIZE - 1))
689                                        : 0), PAGE_SIZE);
690
691         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
692         wbio                    = wbio_init(bio);
693         wbio->put_bio           = true;
694         /* copy WRITE_SYNC flag */
695         wbio->bio.bi_opf        = src->bi_opf;
696
697         if (buf) {
698                 bch2_bio_map(bio, buf, output_available);
699                 return bio;
700         }
701
702         wbio->bounce            = true;
703
704         /*
705          * We can't use mempool for more than c->sb.encoded_extent_max
706          * worth of pages, but we'd like to allocate more if we can:
707          */
708         bch2_bio_alloc_pages_pool(c, bio,
709                                   min_t(unsigned, output_available,
710                                         c->sb.encoded_extent_max << 9));
711
712         if (bio->bi_iter.bi_size < output_available)
713                 *page_alloc_failed =
714                         bch2_bio_alloc_pages(bio,
715                                              output_available -
716                                              bio->bi_iter.bi_size,
717                                              GFP_NOFS) != 0;
718
719         return bio;
720 }
721
722 static int bch2_write_rechecksum(struct bch_fs *c,
723                                  struct bch_write_op *op,
724                                  unsigned new_csum_type)
725 {
726         struct bio *bio = &op->wbio.bio;
727         struct bch_extent_crc_unpacked new_crc;
728         int ret;
729
730         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
731
732         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
733             bch2_csum_type_is_encryption(new_csum_type))
734                 new_csum_type = op->crc.csum_type;
735
736         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
737                                   NULL, &new_crc,
738                                   op->crc.offset, op->crc.live_size,
739                                   new_csum_type);
740         if (ret)
741                 return ret;
742
743         bio_advance(bio, op->crc.offset << 9);
744         bio->bi_iter.bi_size = op->crc.live_size << 9;
745         op->crc = new_crc;
746         return 0;
747 }
748
749 static int bch2_write_decrypt(struct bch_write_op *op)
750 {
751         struct bch_fs *c = op->c;
752         struct nonce nonce = extent_nonce(op->version, op->crc);
753         struct bch_csum csum;
754
755         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
756                 return 0;
757
758         /*
759          * If we need to decrypt data in the write path, we'll no longer be able
760          * to verify the existing checksum (poly1305 mac, in this case) after
761          * it's decrypted - this is the last point we'll be able to reverify the
762          * checksum:
763          */
764         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
765         if (bch2_crc_cmp(op->crc.csum, csum))
766                 return -EIO;
767
768         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
769         op->crc.csum_type = 0;
770         op->crc.csum = (struct bch_csum) { 0, 0 };
771         return 0;
772 }
773
774 static enum prep_encoded_ret {
775         PREP_ENCODED_OK,
776         PREP_ENCODED_ERR,
777         PREP_ENCODED_CHECKSUM_ERR,
778         PREP_ENCODED_DO_WRITE,
779 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
780 {
781         struct bch_fs *c = op->c;
782         struct bio *bio = &op->wbio.bio;
783
784         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
785                 return PREP_ENCODED_OK;
786
787         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
788
789         /* Can we just write the entire extent as is? */
790         if (op->crc.uncompressed_size == op->crc.live_size &&
791             op->crc.compressed_size <= wp->sectors_free &&
792             (op->crc.compression_type == op->compression_type ||
793              op->incompressible)) {
794                 if (!crc_is_compressed(op->crc) &&
795                     op->csum_type != op->crc.csum_type &&
796                     bch2_write_rechecksum(c, op, op->csum_type))
797                         return PREP_ENCODED_CHECKSUM_ERR;
798
799                 return PREP_ENCODED_DO_WRITE;
800         }
801
802         /*
803          * If the data is compressed and we couldn't write the entire extent as
804          * is, we have to decompress it:
805          */
806         if (crc_is_compressed(op->crc)) {
807                 struct bch_csum csum;
808
809                 if (bch2_write_decrypt(op))
810                         return PREP_ENCODED_CHECKSUM_ERR;
811
812                 /* Last point we can still verify checksum: */
813                 csum = bch2_checksum_bio(c, op->crc.csum_type,
814                                          extent_nonce(op->version, op->crc),
815                                          bio);
816                 if (bch2_crc_cmp(op->crc.csum, csum))
817                         return PREP_ENCODED_CHECKSUM_ERR;
818
819                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
820                         return PREP_ENCODED_ERR;
821         }
822
823         /*
824          * No longer have compressed data after this point - data might be
825          * encrypted:
826          */
827
828         /*
829          * If the data is checksummed and we're only writing a subset,
830          * rechecksum and adjust bio to point to currently live data:
831          */
832         if ((op->crc.live_size != op->crc.uncompressed_size ||
833              op->crc.csum_type != op->csum_type) &&
834             bch2_write_rechecksum(c, op, op->csum_type))
835                 return PREP_ENCODED_CHECKSUM_ERR;
836
837         /*
838          * If we want to compress the data, it has to be decrypted:
839          */
840         if ((op->compression_type ||
841              bch2_csum_type_is_encryption(op->crc.csum_type) !=
842              bch2_csum_type_is_encryption(op->csum_type)) &&
843             bch2_write_decrypt(op))
844                 return PREP_ENCODED_CHECKSUM_ERR;
845
846         return PREP_ENCODED_OK;
847 }
848
849 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
850                              struct bio **_dst)
851 {
852         struct bch_fs *c = op->c;
853         struct bio *src = &op->wbio.bio, *dst = src;
854         struct bvec_iter saved_iter;
855         void *ec_buf;
856         struct bpos ec_pos = op->pos;
857         unsigned total_output = 0, total_input = 0;
858         bool bounce = false;
859         bool page_alloc_failed = false;
860         int ret, more = 0;
861
862         BUG_ON(!bio_sectors(src));
863
864         ec_buf = bch2_writepoint_ec_buf(c, wp);
865
866         switch (bch2_write_prep_encoded_data(op, wp)) {
867         case PREP_ENCODED_OK:
868                 break;
869         case PREP_ENCODED_ERR:
870                 ret = -EIO;
871                 goto err;
872         case PREP_ENCODED_CHECKSUM_ERR:
873                 BUG();
874                 goto csum_err;
875         case PREP_ENCODED_DO_WRITE:
876                 /* XXX look for bug here */
877                 if (ec_buf) {
878                         dst = bch2_write_bio_alloc(c, wp, src,
879                                                    &page_alloc_failed,
880                                                    ec_buf);
881                         bio_copy_data(dst, src);
882                         bounce = true;
883                 }
884                 init_append_extent(op, wp, op->version, op->crc);
885                 goto do_write;
886         }
887
888         if (ec_buf ||
889             op->compression_type ||
890             (op->csum_type &&
891              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
892             (bch2_csum_type_is_encryption(op->csum_type) &&
893              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
894                 dst = bch2_write_bio_alloc(c, wp, src,
895                                            &page_alloc_failed,
896                                            ec_buf);
897                 bounce = true;
898         }
899
900         saved_iter = dst->bi_iter;
901
902         do {
903                 struct bch_extent_crc_unpacked crc =
904                         (struct bch_extent_crc_unpacked) { 0 };
905                 struct bversion version = op->version;
906                 size_t dst_len, src_len;
907
908                 if (page_alloc_failed &&
909                     bio_sectors(dst) < wp->sectors_free &&
910                     bio_sectors(dst) < c->sb.encoded_extent_max)
911                         break;
912
913                 BUG_ON(op->compression_type &&
914                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
915                        bch2_csum_type_is_encryption(op->crc.csum_type));
916                 BUG_ON(op->compression_type && !bounce);
917
918                 crc.compression_type = op->incompressible
919                         ? BCH_COMPRESSION_TYPE_incompressible
920                         : op->compression_type
921                         ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
922                                             op->compression_type)
923                         : 0;
924                 if (!crc_is_compressed(crc)) {
925                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
926                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
927
928                         if (op->csum_type)
929                                 dst_len = min_t(unsigned, dst_len,
930                                                 c->sb.encoded_extent_max << 9);
931
932                         if (bounce) {
933                                 swap(dst->bi_iter.bi_size, dst_len);
934                                 bio_copy_data(dst, src);
935                                 swap(dst->bi_iter.bi_size, dst_len);
936                         }
937
938                         src_len = dst_len;
939                 }
940
941                 BUG_ON(!src_len || !dst_len);
942
943                 if (bch2_csum_type_is_encryption(op->csum_type)) {
944                         if (bversion_zero(version)) {
945                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
946                         } else {
947                                 crc.nonce = op->nonce;
948                                 op->nonce += src_len >> 9;
949                         }
950                 }
951
952                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
953                     !crc_is_compressed(crc) &&
954                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
955                     bch2_csum_type_is_encryption(op->csum_type)) {
956                         /*
957                          * Note: when we're using rechecksum(), we need to be
958                          * checksumming @src because it has all the data our
959                          * existing checksum covers - if we bounced (because we
960                          * were trying to compress), @dst will only have the
961                          * part of the data the new checksum will cover.
962                          *
963                          * But normally we want to be checksumming post bounce,
964                          * because part of the reason for bouncing is so the
965                          * data can't be modified (by userspace) while it's in
966                          * flight.
967                          */
968                         if (bch2_rechecksum_bio(c, src, version, op->crc,
969                                         &crc, &op->crc,
970                                         src_len >> 9,
971                                         bio_sectors(src) - (src_len >> 9),
972                                         op->csum_type))
973                                 goto csum_err;
974                 } else {
975                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
976                             bch2_rechecksum_bio(c, src, version, op->crc,
977                                         NULL, &op->crc,
978                                         src_len >> 9,
979                                         bio_sectors(src) - (src_len >> 9),
980                                         op->crc.csum_type))
981                                 goto csum_err;
982
983                         crc.compressed_size     = dst_len >> 9;
984                         crc.uncompressed_size   = src_len >> 9;
985                         crc.live_size           = src_len >> 9;
986
987                         swap(dst->bi_iter.bi_size, dst_len);
988                         bch2_encrypt_bio(c, op->csum_type,
989                                          extent_nonce(version, crc), dst);
990                         crc.csum = bch2_checksum_bio(c, op->csum_type,
991                                          extent_nonce(version, crc), dst);
992                         crc.csum_type = op->csum_type;
993                         swap(dst->bi_iter.bi_size, dst_len);
994                 }
995
996                 init_append_extent(op, wp, version, crc);
997
998                 if (dst != src)
999                         bio_advance(dst, dst_len);
1000                 bio_advance(src, src_len);
1001                 total_output    += dst_len;
1002                 total_input     += src_len;
1003         } while (dst->bi_iter.bi_size &&
1004                  src->bi_iter.bi_size &&
1005                  wp->sectors_free &&
1006                  !bch2_keylist_realloc(&op->insert_keys,
1007                                       op->inline_keys,
1008                                       ARRAY_SIZE(op->inline_keys),
1009                                       BKEY_EXTENT_U64s_MAX));
1010
1011         more = src->bi_iter.bi_size != 0;
1012
1013         dst->bi_iter = saved_iter;
1014
1015         if (dst == src && more) {
1016                 BUG_ON(total_output != total_input);
1017
1018                 dst = bio_split(src, total_input >> 9,
1019                                 GFP_NOIO, &c->bio_write);
1020                 wbio_init(dst)->put_bio = true;
1021                 /* copy WRITE_SYNC flag */
1022                 dst->bi_opf             = src->bi_opf;
1023         }
1024
1025         dst->bi_iter.bi_size = total_output;
1026 do_write:
1027         /* might have done a realloc... */
1028         bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
1029
1030         *_dst = dst;
1031         return more;
1032 csum_err:
1033         bch_err(c, "error verifying existing checksum while "
1034                 "rewriting existing data (memory corruption?)");
1035         ret = -EIO;
1036 err:
1037         if (to_wbio(dst)->bounce)
1038                 bch2_bio_free_pages_pool(c, dst);
1039         if (to_wbio(dst)->put_bio)
1040                 bio_put(dst);
1041
1042         return ret;
1043 }
1044
1045 static void __bch2_write(struct closure *cl)
1046 {
1047         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1048         struct bch_fs *c = op->c;
1049         struct write_point *wp;
1050         struct bio *bio;
1051         bool skip_put = true;
1052         int ret;
1053 again:
1054         memset(&op->failed, 0, sizeof(op->failed));
1055
1056         do {
1057                 struct bkey_i *key_to_write;
1058                 unsigned key_to_write_offset = op->insert_keys.top_p -
1059                         op->insert_keys.keys_p;
1060
1061                 /* +1 for possible cache device: */
1062                 if (op->open_buckets.nr + op->nr_replicas + 1 >
1063                     ARRAY_SIZE(op->open_buckets.v))
1064                         goto flush_io;
1065
1066                 if (bch2_keylist_realloc(&op->insert_keys,
1067                                         op->inline_keys,
1068                                         ARRAY_SIZE(op->inline_keys),
1069                                         BKEY_EXTENT_U64s_MAX))
1070                         goto flush_io;
1071
1072                 wp = bch2_alloc_sectors_start(c,
1073                         op->target,
1074                         op->opts.erasure_code,
1075                         op->write_point,
1076                         &op->devs_have,
1077                         op->nr_replicas,
1078                         op->nr_replicas_required,
1079                         op->alloc_reserve,
1080                         op->flags,
1081                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
1082                 EBUG_ON(!wp);
1083
1084                 if (unlikely(IS_ERR(wp))) {
1085                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
1086                                 ret = PTR_ERR(wp);
1087                                 goto err;
1088                         }
1089
1090                         goto flush_io;
1091                 }
1092
1093                 bch2_open_bucket_get(c, wp, &op->open_buckets);
1094                 ret = bch2_write_extent(op, wp, &bio);
1095                 bch2_alloc_sectors_done(c, wp);
1096
1097                 if (ret < 0)
1098                         goto err;
1099
1100                 if (ret)
1101                         skip_put = false;
1102
1103                 bio->bi_end_io  = bch2_write_endio;
1104                 bio->bi_private = &op->cl;
1105                 bio->bi_opf |= REQ_OP_WRITE;
1106
1107                 if (!skip_put)
1108                         closure_get(bio->bi_private);
1109                 else
1110                         op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
1111
1112                 key_to_write = (void *) (op->insert_keys.keys_p +
1113                                          key_to_write_offset);
1114
1115                 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
1116                                           key_to_write);
1117         } while (ret);
1118
1119         if (!skip_put)
1120                 continue_at(cl, bch2_write_index, index_update_wq(op));
1121         return;
1122 err:
1123         op->error = ret;
1124
1125         continue_at(cl, bch2_write_index, index_update_wq(op));
1126         return;
1127 flush_io:
1128         closure_sync(cl);
1129
1130         if (!bch2_keylist_empty(&op->insert_keys)) {
1131                 __bch2_write_index(op);
1132
1133                 if (op->error) {
1134                         continue_at_nobarrier(cl, bch2_write_done, NULL);
1135                         return;
1136                 }
1137         }
1138
1139         goto again;
1140 }
1141
1142 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
1143 {
1144         struct closure *cl = &op->cl;
1145         struct bio *bio = &op->wbio.bio;
1146         struct bvec_iter iter;
1147         struct bkey_i_inline_data *id;
1148         unsigned sectors;
1149         int ret;
1150
1151         bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
1152
1153         ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
1154                                    ARRAY_SIZE(op->inline_keys),
1155                                    BKEY_U64s + DIV_ROUND_UP(data_len, 8));
1156         if (ret) {
1157                 op->error = ret;
1158                 goto err;
1159         }
1160
1161         sectors = bio_sectors(bio);
1162         op->pos.offset += sectors;
1163
1164         id = bkey_inline_data_init(op->insert_keys.top);
1165         id->k.p         = op->pos;
1166         id->k.version   = op->version;
1167         id->k.size      = sectors;
1168
1169         iter = bio->bi_iter;
1170         iter.bi_size = data_len;
1171         memcpy_from_bio(id->v.data, bio, iter);
1172
1173         while (data_len & 7)
1174                 id->v.data[data_len++] = '\0';
1175         set_bkey_val_bytes(&id->k, data_len);
1176         bch2_keylist_push(&op->insert_keys);
1177
1178         op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
1179         continue_at_nobarrier(cl, bch2_write_index, NULL);
1180         return;
1181 err:
1182         bch2_write_done(&op->cl);
1183 }
1184
1185 /**
1186  * bch_write - handle a write to a cache device or flash only volume
1187  *
1188  * This is the starting point for any data to end up in a cache device; it could
1189  * be from a normal write, or a writeback write, or a write to a flash only
1190  * volume - it's also used by the moving garbage collector to compact data in
1191  * mostly empty buckets.
1192  *
1193  * It first writes the data to the cache, creating a list of keys to be inserted
1194  * (if the data won't fit in a single open bucket, there will be multiple keys);
1195  * after the data is written it calls bch_journal, and after the keys have been
1196  * added to the next journal write they're inserted into the btree.
1197  *
1198  * If op->discard is true, instead of inserting the data it invalidates the
1199  * region of the cache represented by op->bio and op->inode.
1200  */
1201 void bch2_write(struct closure *cl)
1202 {
1203         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1204         struct bio *bio = &op->wbio.bio;
1205         struct bch_fs *c = op->c;
1206         unsigned data_len;
1207
1208         BUG_ON(!op->nr_replicas);
1209         BUG_ON(!op->write_point.v);
1210         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
1211
1212         op->start_time = local_clock();
1213         bch2_keylist_init(&op->insert_keys, op->inline_keys);
1214         wbio_init(bio)->put_bio = false;
1215
1216         if (bio_sectors(bio) & (c->opts.block_size - 1)) {
1217                 __bcache_io_error(c, "misaligned write");
1218                 op->error = -EIO;
1219                 goto err;
1220         }
1221
1222         if (c->opts.nochanges ||
1223             !percpu_ref_tryget(&c->writes)) {
1224                 __bcache_io_error(c, "read only");
1225                 op->error = -EROFS;
1226                 goto err;
1227         }
1228
1229         bch2_increment_clock(c, bio_sectors(bio), WRITE);
1230
1231         data_len = min_t(u64, bio->bi_iter.bi_size,
1232                          op->new_i_size - (op->pos.offset << 9));
1233
1234         if (c->opts.inline_data &&
1235             data_len <= min(block_bytes(c) / 2, 1024U)) {
1236                 bch2_write_data_inline(op, data_len);
1237                 return;
1238         }
1239
1240         continue_at_nobarrier(cl, __bch2_write, NULL);
1241         return;
1242 err:
1243         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
1244                 bch2_disk_reservation_put(c, &op->res);
1245
1246         if (op->end_io) {
1247                 EBUG_ON(cl->parent);
1248                 closure_debug_destroy(cl);
1249                 op->end_io(op);
1250         } else {
1251                 closure_return(cl);
1252         }
1253 }
1254
1255 /* Cache promotion on read */
1256
1257 struct promote_op {
1258         struct closure          cl;
1259         struct rcu_head         rcu;
1260         u64                     start_time;
1261
1262         struct rhash_head       hash;
1263         struct bpos             pos;
1264
1265         struct migrate_write    write;
1266         struct bio_vec          bi_inline_vecs[0]; /* must be last */
1267 };
1268
1269 static const struct rhashtable_params bch_promote_params = {
1270         .head_offset    = offsetof(struct promote_op, hash),
1271         .key_offset     = offsetof(struct promote_op, pos),
1272         .key_len        = sizeof(struct bpos),
1273 };
1274
1275 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1276                                   struct bpos pos,
1277                                   struct bch_io_opts opts,
1278                                   unsigned flags)
1279 {
1280         if (!(flags & BCH_READ_MAY_PROMOTE))
1281                 return false;
1282
1283         if (!opts.promote_target)
1284                 return false;
1285
1286         if (bch2_bkey_has_target(c, k, opts.promote_target))
1287                 return false;
1288
1289         if (bch2_target_congested(c, opts.promote_target)) {
1290                 /* XXX trace this */
1291                 return false;
1292         }
1293
1294         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1295                                    bch_promote_params))
1296                 return false;
1297
1298         return true;
1299 }
1300
1301 static void promote_free(struct bch_fs *c, struct promote_op *op)
1302 {
1303         int ret;
1304
1305         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1306                                      bch_promote_params);
1307         BUG_ON(ret);
1308         percpu_ref_put(&c->writes);
1309         kfree_rcu(op, rcu);
1310 }
1311
1312 static void promote_done(struct closure *cl)
1313 {
1314         struct promote_op *op =
1315                 container_of(cl, struct promote_op, cl);
1316         struct bch_fs *c = op->write.op.c;
1317
1318         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1319                                op->start_time);
1320
1321         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1322         promote_free(c, op);
1323 }
1324
1325 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1326 {
1327         struct bch_fs *c = rbio->c;
1328         struct closure *cl = &op->cl;
1329         struct bio *bio = &op->write.op.wbio.bio;
1330
1331         trace_promote(&rbio->bio);
1332
1333         /* we now own pages: */
1334         BUG_ON(!rbio->bounce);
1335         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1336
1337         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1338                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1339         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1340
1341         bch2_migrate_read_done(&op->write, rbio);
1342
1343         closure_init(cl, NULL);
1344         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1345         closure_return_with_destructor(cl, promote_done);
1346 }
1347
1348 static struct promote_op *__promote_alloc(struct bch_fs *c,
1349                                           enum btree_id btree_id,
1350                                           struct bkey_s_c k,
1351                                           struct bpos pos,
1352                                           struct extent_ptr_decoded *pick,
1353                                           struct bch_io_opts opts,
1354                                           unsigned sectors,
1355                                           struct bch_read_bio **rbio)
1356 {
1357         struct promote_op *op = NULL;
1358         struct bio *bio;
1359         unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
1360         int ret;
1361
1362         if (!percpu_ref_tryget(&c->writes))
1363                 return NULL;
1364
1365         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
1366         if (!op)
1367                 goto err;
1368
1369         op->start_time = local_clock();
1370         op->pos = pos;
1371
1372         /*
1373          * We don't use the mempool here because extents that aren't
1374          * checksummed or compressed can be too big for the mempool:
1375          */
1376         *rbio = kzalloc(sizeof(struct bch_read_bio) +
1377                         sizeof(struct bio_vec) * pages,
1378                         GFP_NOIO);
1379         if (!*rbio)
1380                 goto err;
1381
1382         rbio_init(&(*rbio)->bio, opts);
1383         bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
1384
1385         if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
1386                                  GFP_NOIO))
1387                 goto err;
1388
1389         (*rbio)->bounce         = true;
1390         (*rbio)->split          = true;
1391         (*rbio)->kmalloc        = true;
1392
1393         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1394                                           bch_promote_params))
1395                 goto err;
1396
1397         bio = &op->write.op.wbio.bio;
1398         bio_init(bio, bio->bi_inline_vecs, pages);
1399
1400         ret = bch2_migrate_write_init(c, &op->write,
1401                         writepoint_hashed((unsigned long) current),
1402                         opts,
1403                         DATA_PROMOTE,
1404                         (struct data_opts) {
1405                                 .target = opts.promote_target
1406                         },
1407                         btree_id, k);
1408         BUG_ON(ret);
1409
1410         return op;
1411 err:
1412         if (*rbio)
1413                 bio_free_pages(&(*rbio)->bio);
1414         kfree(*rbio);
1415         *rbio = NULL;
1416         kfree(op);
1417         percpu_ref_put(&c->writes);
1418         return NULL;
1419 }
1420
1421 noinline
1422 static struct promote_op *promote_alloc(struct bch_fs *c,
1423                                                struct bvec_iter iter,
1424                                                struct bkey_s_c k,
1425                                                struct extent_ptr_decoded *pick,
1426                                                struct bch_io_opts opts,
1427                                                unsigned flags,
1428                                                struct bch_read_bio **rbio,
1429                                                bool *bounce,
1430                                                bool *read_full)
1431 {
1432         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1433         /* data might have to be decompressed in the write path: */
1434         unsigned sectors = promote_full
1435                 ? max(pick->crc.compressed_size, pick->crc.live_size)
1436                 : bvec_iter_sectors(iter);
1437         struct bpos pos = promote_full
1438                 ? bkey_start_pos(k.k)
1439                 : POS(k.k->p.inode, iter.bi_sector);
1440         struct promote_op *promote;
1441
1442         if (!should_promote(c, k, pos, opts, flags))
1443                 return NULL;
1444
1445         promote = __promote_alloc(c,
1446                                   k.k->type == KEY_TYPE_reflink_v
1447                                   ? BTREE_ID_REFLINK
1448                                   : BTREE_ID_EXTENTS,
1449                                   k, pos, pick, opts, sectors, rbio);
1450         if (!promote)
1451                 return NULL;
1452
1453         *bounce         = true;
1454         *read_full      = promote_full;
1455         return promote;
1456 }
1457
1458 /* Read */
1459
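     /*
      * Return codes for the read retry path; READ_RETRY_AVOID additionally
      * marks the device that failed so it's avoided when picking a pointer on
      * the retry.
      */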
1460 #define READ_RETRY_AVOID        1
1461 #define READ_RETRY              2
1462 #define READ_ERR                3
1463
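     /*
      * Execution contexts for read completion work, in increasing order of
      * permissiveness: bch2_rbio_punt() runs the work inline if the rbio is
      * already in a sufficient context, otherwise it queues the work on the
      * given workqueue.
      */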
1464 enum rbio_context {
1465         RBIO_CONTEXT_NULL,
1466         RBIO_CONTEXT_HIGHPRI,
1467         RBIO_CONTEXT_UNBOUND,
1468 };
1469
1470 static inline struct bch_read_bio *
1471 bch2_rbio_parent(struct bch_read_bio *rbio)
1472 {
1473         return rbio->split ? rbio->parent : rbio;
1474 }
1475
1476 __always_inline
1477 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1478                            enum rbio_context context,
1479                            struct workqueue_struct *wq)
1480 {
1481         if (context <= rbio->context) {
1482                 fn(&rbio->work);
1483         } else {
1484                 rbio->work.func         = fn;
1485                 rbio->context           = context;
1486                 queue_work(wq, &rbio->work);
1487         }
1488 }
1489
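     /*
      * Tear down a read bio: free the promote op and any bounce pages, and if
      * this rbio was a split, free it and return the parent rbio the caller
      * should continue with.
      */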
1490 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1491 {
1492         BUG_ON(rbio->bounce && !rbio->split);
1493
1494         if (rbio->promote)
1495                 promote_free(rbio->c, rbio->promote);
1496         rbio->promote = NULL;
1497
1498         if (rbio->bounce)
1499                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1500
1501         if (rbio->split) {
1502                 struct bch_read_bio *parent = rbio->parent;
1503
1504                 if (rbio->kmalloc)
1505                         kfree(rbio);
1506                 else
1507                         bio_put(&rbio->bio);
1508
1509                 rbio = parent;
1510         }
1511
1512         return rbio;
1513 }
1514
1515 /*
1516  * Only called on a top-level bch_read_bio to complete an entire read request,
1517  * not a split:
1518  */
1519 static void bch2_rbio_done(struct bch_read_bio *rbio)
1520 {
1521         if (rbio->start_time)
1522                 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1523                                        rbio->start_time);
1524         bio_endio(&rbio->bio);
1525 }
1526
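     /*
      * Retry path for BCH_READ_NODECODE reads: re-look up the extent at
      * rbio->pos and reissue the read against the same pointer; if the extent
      * we wanted no longer exists, flag the rbio as a hole instead.
      */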
1527 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1528                                      struct bvec_iter bvec_iter, u64 inode,
1529                                      struct bch_io_failures *failed,
1530                                      unsigned flags)
1531 {
1532         struct btree_trans trans;
1533         struct btree_iter *iter;
1534         struct bkey_on_stack sk;
1535         struct bkey_s_c k;
1536         int ret;
1537
1538         flags &= ~BCH_READ_LAST_FRAGMENT;
1539         flags |= BCH_READ_MUST_CLONE;
1540
1541         bkey_on_stack_init(&sk);
1542         bch2_trans_init(&trans, c, 0, 0);
1543
1544         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1545                                    rbio->pos, BTREE_ITER_SLOTS);
1546 retry:
1547         rbio->bio.bi_status = 0;
1548
1549         k = bch2_btree_iter_peek_slot(iter);
1550         if (bkey_err(k))
1551                 goto err;
1552
1553         bkey_on_stack_reassemble(&sk, c, k);
1554         k = bkey_i_to_s_c(sk.k);
1555         bch2_trans_unlock(&trans);
1556
1557         if (!bch2_bkey_matches_ptr(c, k,
1558                                    rbio->pick.ptr,
1559                                    rbio->pos.offset -
1560                                    rbio->pick.crc.offset)) {
1561                 /* extent we wanted to read no longer exists: */
1562                 rbio->hole = true;
1563                 goto out;
1564         }
1565
1566         ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
1567         if (ret == READ_RETRY)
1568                 goto retry;
1569         if (ret)
1570                 goto err;
1571 out:
1572         bch2_rbio_done(rbio);
1573         bch2_trans_exit(&trans);
1574         bkey_on_stack_exit(&sk, c);
1575         return;
1576 err:
1577         rbio->bio.bi_status = BLK_STS_IOERR;
1578         goto out;
1579 }
1580
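     /*
      * Retry path for normal reads: walk the extents btree again over the
      * remaining range and reissue the read fragment by fragment, avoiding the
      * devices recorded in @failed.
      */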
1581 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1582                             struct bvec_iter bvec_iter, u64 inode,
1583                             struct bch_io_failures *failed, unsigned flags)
1584 {
1585         struct btree_trans trans;
1586         struct btree_iter *iter;
1587         struct bkey_on_stack sk;
1588         struct bkey_s_c k;
1589         int ret;
1590
1591         flags &= ~BCH_READ_LAST_FRAGMENT;
1592         flags |= BCH_READ_MUST_CLONE;
1593
1594         bkey_on_stack_init(&sk);
1595         bch2_trans_init(&trans, c, 0, 0);
1596 retry:
1597         bch2_trans_begin(&trans);
1598
1599         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1600                            POS(inode, bvec_iter.bi_sector),
1601                            BTREE_ITER_SLOTS, k, ret) {
1602                 unsigned bytes, sectors, offset_into_extent;
1603
1604                 bkey_on_stack_reassemble(&sk, c, k);
1605                 k = bkey_i_to_s_c(sk.k);
1606
1607                 offset_into_extent = iter->pos.offset -
1608                         bkey_start_offset(k.k);
1609                 sectors = k.k->size - offset_into_extent;
1610
1611                 ret = bch2_read_indirect_extent(&trans,
1612                                         &offset_into_extent, sk.k);
1613                 if (ret)
1614                         break;
1615
1616                 sectors = min(sectors, k.k->size - offset_into_extent);
1617
1618                 bch2_trans_unlock(&trans);
1619
1620                 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
1621                 swap(bvec_iter.bi_size, bytes);
1622
1623                 ret = __bch2_read_extent(c, rbio, bvec_iter, k,
1624                                 offset_into_extent, failed, flags);
1625                 switch (ret) {
1626                 case READ_RETRY:
1627                         goto retry;
1628                 case READ_ERR:
1629                         goto err;
1630                 }
1631
1632                 if (bytes == bvec_iter.bi_size)
1633                         goto out;
1634
1635                 swap(bvec_iter.bi_size, bytes);
1636                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1637         }
1638
1639         if (ret == -EINTR)
1640                 goto retry;
1641         /*
1642          * If we get here, it had better have been because there was an error
1643          * reading a btree node
1644          */
1645         BUG_ON(!ret);
1646         __bcache_io_error(c, "btree IO error: %i", ret);
1647 err:
1648         rbio->bio.bi_status = BLK_STS_IOERR;
1649 out:
1650         bch2_trans_exit(&trans);
1651         bkey_on_stack_exit(&sk, c);
1652         bch2_rbio_done(rbio);
1653 }
1654
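     /*
      * Workqueue entry point for retrying a failed read: record the device to
      * avoid (for READ_RETRY_AVOID), clear the error, free the old split and
      * bounce state, then reissue the read synchronously with promotion
      * disabled.
      */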
1655 static void bch2_rbio_retry(struct work_struct *work)
1656 {
1657         struct bch_read_bio *rbio =
1658                 container_of(work, struct bch_read_bio, work);
1659         struct bch_fs *c        = rbio->c;
1660         struct bvec_iter iter   = rbio->bvec_iter;
1661         unsigned flags          = rbio->flags;
1662         u64 inode               = rbio->pos.inode;
1663         struct bch_io_failures failed = { .nr = 0 };
1664
1665         trace_read_retry(&rbio->bio);
1666
1667         if (rbio->retry == READ_RETRY_AVOID)
1668                 bch2_mark_io_failure(&failed, &rbio->pick);
1669
1670         rbio->bio.bi_status = 0;
1671
1672         rbio = bch2_rbio_free(rbio);
1673
1674         flags |= BCH_READ_IN_RETRY;
1675         flags &= ~BCH_READ_MAY_PROMOTE;
1676
1677         if (flags & BCH_READ_NODECODE)
1678                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1679         else
1680                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1681 }
1682
1683 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1684                             blk_status_t error)
1685 {
1686         rbio->retry = retry;
1687
1688         if (rbio->flags & BCH_READ_IN_RETRY)
1689                 return;
1690
1691         if (retry == READ_ERR) {
1692                 rbio = bch2_rbio_free(rbio);
1693
1694                 rbio->bio.bi_status = error;
1695                 bch2_rbio_done(rbio);
1696         } else {
1697                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1698                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1699         }
1700 }
1701
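     /*
      * If the extent's checksum covers more data than the extent now
      * references, use the data we just read and verified to compute a
      * checksum over just the live range and update the key, so that later
      * reads don't have to read and checksum data the extent no longer
      * references.
      */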
1702 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1703 {
1704         struct bch_fs *c = rbio->c;
1705         struct btree_trans trans;
1706         struct btree_iter *iter;
1707         struct bkey_s_c k;
1708         struct bkey_on_stack new;
1709         struct bch_extent_crc_unpacked new_crc;
1710         u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
1711         int ret;
1712
1713         if (crc_is_compressed(rbio->pick.crc))
1714                 return;
1715
1716         bkey_on_stack_init(&new);
1717         bch2_trans_init(&trans, c, 0, 0);
1718 retry:
1719         bch2_trans_begin(&trans);
1720
1721         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
1722                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
1723         k = bch2_btree_iter_peek_slot(iter);
1724         if (IS_ERR_OR_NULL(k.k))
1725                 goto out;
1726
1727         bkey_on_stack_reassemble(&new, c, k);
1728         k = bkey_i_to_s_c(new.k);
1729
1730         if (bversion_cmp(k.k->version, rbio->version) ||
1731             !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
1732                 goto out;
1733
1734         /* Extent was merged? */
1735         if (bkey_start_offset(k.k) < data_offset ||
1736             k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1737                 goto out;
1738
1739         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1740                         rbio->pick.crc, NULL, &new_crc,
1741                         bkey_start_offset(k.k) - data_offset, k.k->size,
1742                         rbio->pick.crc.csum_type)) {
1743                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1744                 goto out;
1745         }
1746
1747         if (!bch2_bkey_narrow_crcs(new.k, new_crc))
1748                 goto out;
1749
1750         bch2_trans_update(&trans, iter, new.k, 0);
1751         ret = bch2_trans_commit(&trans, NULL, NULL,
1752                                 BTREE_INSERT_NOFAIL|
1753                                 BTREE_INSERT_NOWAIT);
1754         if (ret == -EINTR)
1755                 goto retry;
1756 out:
1757         bch2_trans_exit(&trans);
1758         bkey_on_stack_exit(&new, c);
1759 }
1760
1761 /*
      * Inner part of read completion that may run in process context: verify
      * the checksum, narrow crcs if we can, decrypt and decompress, copy
      * bounced data back to the destination bio, and kick off the promote
      * write.
      */
1762 static void __bch2_read_endio(struct work_struct *work)
1763 {
1764         struct bch_read_bio *rbio =
1765                 container_of(work, struct bch_read_bio, work);
1766         struct bch_fs *c        = rbio->c;
1767         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1768         struct bio *src         = &rbio->bio;
1769         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1770         struct bvec_iter dst_iter = rbio->bvec_iter;
1771         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1772         struct nonce nonce = extent_nonce(rbio->version, crc);
1773         struct bch_csum csum;
1774
1775         /* Reset iterator for checksumming and copying bounced data: */
1776         if (rbio->bounce) {
1777                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1778                 src->bi_iter.bi_idx             = 0;
1779                 src->bi_iter.bi_bvec_done       = 0;
1780         } else {
1781                 src->bi_iter                    = rbio->bvec_iter;
1782         }
1783
1784         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1785         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1786                 goto csum_err;
1787
1788         if (unlikely(rbio->narrow_crcs))
1789                 bch2_rbio_narrow_crcs(rbio);
1790
1791         if (rbio->flags & BCH_READ_NODECODE)
1792                 goto nodecode;
1793
1794         /* Adjust crc to point to the subset of the data we want: */
1795         crc.offset     += rbio->offset_into_extent;
1796         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1797
1798         if (crc_is_compressed(crc)) {
1799                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1800                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1801                         goto decompression_err;
1802         } else {
1803                 /* don't need to decrypt the entire bio: */
1804                 nonce = nonce_add(nonce, crc.offset << 9);
1805                 bio_advance(src, crc.offset << 9);
1806
1807                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1808                 src->bi_iter.bi_size = dst_iter.bi_size;
1809
1810                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1811
1812                 if (rbio->bounce) {
1813                         struct bvec_iter src_iter = src->bi_iter;
1814                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1815                 }
1816         }
1817
1818         if (rbio->promote) {
1819                 /*
1820                  * Re-encrypt the data we decrypted, so it's consistent with
1821                  * rbio->crc:
1822                  */
1823                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1824                 promote_start(rbio->promote, rbio);
1825                 rbio->promote = NULL;
1826         }
1827 nodecode:
1828         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1829                 rbio = bch2_rbio_free(rbio);
1830                 bch2_rbio_done(rbio);
1831         }
1832         return;
1833 csum_err:
1834         /*
1835          * Checksum error: if the bio wasn't bounced, we may have been
1836          * reading into buffers owned by userspace (that userspace can
1837          * scribble over) - retry the read, bouncing it this time:
1838          */
1839         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1840                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1841                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1842                 return;
1843         }
1844
1845         bch2_dev_io_error(ca,
1846                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1847                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1848                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1849                 csum.hi, csum.lo, crc.csum_type);
1850         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1851         return;
1852 decompression_err:
1853         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1854                           rbio->pos.inode,
1855                           (u64) rbio->bvec_iter.bi_sector);
1856         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1857         return;
1858 }
1859
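     /*
      * bio completion callback for reads: account device latency, check for IO
      * errors and stale cached pointers, then punt the rest of completion to a
      * context that's allowed to decrypt, decompress or update the btree if
      * that will be needed.
      */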
1860 static void bch2_read_endio(struct bio *bio)
1861 {
1862         struct bch_read_bio *rbio =
1863                 container_of(bio, struct bch_read_bio, bio);
1864         struct bch_fs *c        = rbio->c;
1865         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1866         struct workqueue_struct *wq = NULL;
1867         enum rbio_context context = RBIO_CONTEXT_NULL;
1868
1869         if (rbio->have_ioref) {
1870                 bch2_latency_acct(ca, rbio->submit_time, READ);
1871                 percpu_ref_put(&ca->io_ref);
1872         }
1873
1874         if (!rbio->split)
1875                 rbio->bio.bi_end_io = rbio->end_io;
1876
1877         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1878                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1879                 return;
1880         }
1881
1882         if (rbio->pick.ptr.cached &&
1883             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1884              ptr_stale(ca, &rbio->pick.ptr))) {
1885                 atomic_long_inc(&c->read_realloc_races);
1886
1887                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1888                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1889                 else
1890                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1891                 return;
1892         }
1893
1894         if (rbio->narrow_crcs ||
1895             crc_is_compressed(rbio->pick.crc) ||
1896             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1897                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1898         else if (rbio->pick.crc.csum_type)
1899                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1900
1901         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1902 }
1903
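     /*
      * Resolve a reflink pointer: look up the reflink btree at the pointer's
      * index plus the offset into the extent, reassemble the indirect extent
      * into @orig_k, and adjust *offset_into_extent to be relative to the
      * indirect extent.
      */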
1904 int __bch2_read_indirect_extent(struct btree_trans *trans,
1905                                 unsigned *offset_into_extent,
1906                                 struct bkey_i *orig_k)
1907 {
1908         struct btree_iter *iter;
1909         struct bkey_s_c k;
1910         u64 reflink_offset;
1911         int ret;
1912
1913         reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
1914                 *offset_into_extent;
1915
1916         iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
1917                                    POS(0, reflink_offset),
1918                                    BTREE_ITER_SLOTS);
1919         ret = PTR_ERR_OR_ZERO(iter);
1920         if (ret)
1921                 return ret;
1922
1923         k = bch2_btree_iter_peek_slot(iter);
1924         ret = bkey_err(k);
1925         if (ret)
1926                 goto err;
1927
1928         if (k.k->type != KEY_TYPE_reflink_v) {
1929                 __bcache_io_error(trans->c,
1930                                 "pointer to nonexistent indirect extent");
1931                 ret = -EIO;
1932                 goto err;
1933         }
1934
1935         *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
1936         bkey_reassemble(orig_k, k);
1937 err:
1938         bch2_trans_iter_put(trans, iter);
1939         return ret;
1940 }
1941
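     /*
      * Read a single extent (or fragment of one): pick a pointer to read from,
      * decide whether we need to bounce, read the full extent, narrow crcs or
      * promote, then set up an rbio - reusing the promote's bounce buffer,
      * allocating a bounce, cloning, or using @orig directly - and submit it.
      * In the BCH_READ_IN_RETRY case the read is done synchronously and the
      * result (0, READ_RETRY or READ_ERR) is returned.
      */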
1942 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1943                        struct bvec_iter iter, struct bkey_s_c k,
1944                        unsigned offset_into_extent,
1945                        struct bch_io_failures *failed, unsigned flags)
1946 {
1947         struct extent_ptr_decoded pick;
1948         struct bch_read_bio *rbio = NULL;
1949         struct bch_dev *ca;
1950         struct promote_op *promote = NULL;
1951         bool bounce = false, read_full = false, narrow_crcs = false;
1952         struct bpos pos = bkey_start_pos(k.k);
1953         int pick_ret;
1954
1955         if (k.k->type == KEY_TYPE_inline_data) {
1956                 struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
1957                 unsigned bytes = min_t(unsigned, iter.bi_size,
1958                                        bkey_val_bytes(d.k));
1959
1960                 swap(iter.bi_size, bytes);
1961                 memcpy_to_bio(&orig->bio, iter, d.v->data);
1962                 swap(iter.bi_size, bytes);
1963                 bio_advance_iter(&orig->bio, &iter, bytes);
1964                 zero_fill_bio_iter(&orig->bio, iter);
1965                 goto out_read_done;
1966         }
1967
1968         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1969
1970         /* hole or reservation - just zero fill: */
1971         if (!pick_ret)
1972                 goto hole;
1973
1974         if (pick_ret < 0) {
1975                 __bcache_io_error(c, "no device to read from");
1976                 goto err;
1977         }
1978
1979         if (pick_ret > 0)
1980                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1981
1982         if (flags & BCH_READ_NODECODE) {
1983                 /*
1984                  * This can happen if we retry and the extent we were going to read
1985                  * has been merged in the meantime:
1986                  */
1987                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1988                         goto hole;
1989
1990                 iter.bi_size    = pick.crc.compressed_size << 9;
1991                 goto get_bio;
1992         }
1993
1994         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1995             bio_flagged(&orig->bio, BIO_CHAIN))
1996                 flags |= BCH_READ_MUST_CLONE;
1997
1998         narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
1999                 bch2_can_narrow_extent_crcs(k, pick.crc);
2000
2001         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
2002                 flags |= BCH_READ_MUST_BOUNCE;
2003
2004         EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
2005
2006         if (crc_is_compressed(pick.crc) ||
2007             (pick.crc.csum_type != BCH_CSUM_NONE &&
2008              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2009               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
2010                (flags & BCH_READ_USER_MAPPED)) ||
2011               (flags & BCH_READ_MUST_BOUNCE)))) {
2012                 read_full = true;
2013                 bounce = true;
2014         }
2015
2016         if (orig->opts.promote_target)
2017                 promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
2018                                         &rbio, &bounce, &read_full);
2019
2020         if (!read_full) {
2021                 EBUG_ON(crc_is_compressed(pick.crc));
2022                 EBUG_ON(pick.crc.csum_type &&
2023                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2024                          bvec_iter_sectors(iter) != pick.crc.live_size ||
2025                          pick.crc.offset ||
2026                          offset_into_extent));
2027
2028                 pos.offset += offset_into_extent;
2029                 pick.ptr.offset += pick.crc.offset +
2030                         offset_into_extent;
2031                 offset_into_extent              = 0;
2032                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
2033                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
2034                 pick.crc.offset                 = 0;
2035                 pick.crc.live_size              = bvec_iter_sectors(iter);
2037         }
2038 get_bio:
2039         if (rbio) {
2040                 /*
2041                  * The promote path already allocated a bounce rbio for us: it
2042                  * had to allocate a bio big enough to hold the uncompressed
2043                  * data for the write path, but we're not going to use all of
2044                  * it here:
2045                  */
2046                 EBUG_ON(rbio->bio.bi_iter.bi_size <
2047                        pick.crc.compressed_size << 9);
2048                 rbio->bio.bi_iter.bi_size =
2049                         pick.crc.compressed_size << 9;
2050         } else if (bounce) {
2051                 unsigned sectors = pick.crc.compressed_size;
2052
2053                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
2054                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
2055                                                   &c->bio_read_split),
2056                                  orig->opts);
2057
2058                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
2059                 rbio->bounce    = true;
2060                 rbio->split     = true;
2061         } else if (flags & BCH_READ_MUST_CLONE) {
2062                 /*
2063                  * We have to clone if there were any splits, because of how
2064                  * errors are reported: if a split errored and retrying didn't
2065                  * work, when it reports the error to its parent (us) we can't
2066                  * tell whether the error came from our part of the bio (in
2067                  * which case we should retry) or from the whole bio (in which
2068                  * case we don't want to retry and lose the error).
2069                  */
2070                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
2071                                                 &c->bio_read_split),
2072                                  orig->opts);
2073                 rbio->bio.bi_iter = iter;
2074                 rbio->split     = true;
2075         } else {
2076                 rbio = orig;
2077                 rbio->bio.bi_iter = iter;
2078                 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
2079         }
2080
2081         EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
2082
2083         rbio->c                 = c;
2084         rbio->submit_time       = local_clock();
2085         if (rbio->split)
2086                 rbio->parent    = orig;
2087         else
2088                 rbio->end_io    = orig->bio.bi_end_io;
2089         rbio->bvec_iter         = iter;
2090         rbio->offset_into_extent= offset_into_extent;
2091         rbio->flags             = flags;
2092         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
2093         rbio->narrow_crcs       = narrow_crcs;
2094         rbio->hole              = 0;
2095         rbio->retry             = 0;
2096         rbio->context           = 0;
2097         /* XXX: only initialize this if needed */
2098         rbio->devs_have         = bch2_bkey_devs(k);
2099         rbio->pick              = pick;
2100         rbio->pos               = pos;
2101         rbio->version           = k.k->version;
2102         rbio->promote           = promote;
2103         INIT_WORK(&rbio->work, NULL);
2104
2105         rbio->bio.bi_opf        = orig->bio.bi_opf;
2106         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
2107         rbio->bio.bi_end_io     = bch2_read_endio;
2108
2109         if (rbio->bounce)
2110                 trace_read_bounce(&rbio->bio);
2111
2112         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
2113
2114         rcu_read_lock();
2115         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
2116         rcu_read_unlock();
2117
2118         if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
2119                 bio_inc_remaining(&orig->bio);
2120                 trace_read_split(&orig->bio);
2121         }
2122
2123         if (!rbio->pick.idx) {
2124                 if (!rbio->have_ioref) {
2125                         __bcache_io_error(c, "no device to read from");
2126                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2127                         goto out;
2128                 }
2129
2130                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
2131                              bio_sectors(&rbio->bio));
2132                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
2133
2134                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2135                         submit_bio(&rbio->bio);
2136                 else
2137                         submit_bio_wait(&rbio->bio);
2138         } else {
2139                 /* Attempting reconstruct read: */
2140                 if (bch2_ec_read_extent(c, rbio)) {
2141                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2142                         goto out;
2143                 }
2144
2145                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2146                         bio_endio(&rbio->bio);
2147         }
2148 out:
2149         if (likely(!(flags & BCH_READ_IN_RETRY))) {
2150                 return 0;
2151         } else {
2152                 int ret;
2153
2154                 rbio->context = RBIO_CONTEXT_UNBOUND;
2155                 bch2_read_endio(&rbio->bio);
2156
2157                 ret = rbio->retry;
2158                 rbio = bch2_rbio_free(rbio);
2159
2160                 if (ret == READ_RETRY_AVOID) {
2161                         bch2_mark_io_failure(failed, &pick);
2162                         ret = READ_RETRY;
2163                 }
2164
2165                 return ret;
2166         }
2167
2168 err:
2169         if (flags & BCH_READ_IN_RETRY)
2170                 return READ_ERR;
2171
2172         orig->bio.bi_status = BLK_STS_IOERR;
2173         goto out_read_done;
2174
2175 hole:
2176         /*
2177          * This won't normally happen in the BCH_READ_NODECODE
2178          * (bch2_move_extent()) path, but if we retry and the extent we wanted
2179          * to read no longer exists, we have to signal that:
2180          */
2181         if (flags & BCH_READ_NODECODE)
2182                 orig->hole = true;
2183
2184         zero_fill_bio_iter(&orig->bio, iter);
2185 out_read_done:
2186         if (flags & BCH_READ_LAST_FRAGMENT)
2187                 bch2_rbio_done(orig);
2188         return 0;
2189 }
2190
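     /*
      * Entry point for a full read request: walk the extents btree over the
      * bio's range, resolving indirect extents, and issue a read for each
      * fragment; the last fragment completes the rbio.
      *
      * Rough usage sketch (inferred from the rbio_init() calls in this file,
      * not copied from an actual caller - names here are illustrative):
      *
      *        rbio = rbio_init(bio, io_opts);
      *        rbio->bio.bi_iter.bi_sector = sector;
      *        bch2_read(c, rbio, inum);
      */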
2191 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
2192 {
2193         struct btree_trans trans;
2194         struct btree_iter *iter;
2195         struct bkey_on_stack sk;
2196         struct bkey_s_c k;
2197         unsigned flags = BCH_READ_RETRY_IF_STALE|
2198                 BCH_READ_MAY_PROMOTE|
2199                 BCH_READ_USER_MAPPED;
2200         int ret;
2201
2202         BUG_ON(rbio->_state);
2203         BUG_ON(flags & BCH_READ_NODECODE);
2204         BUG_ON(flags & BCH_READ_IN_RETRY);
2205
2206         rbio->c = c;
2207         rbio->start_time = local_clock();
2208
2209         bkey_on_stack_init(&sk);
2210         bch2_trans_init(&trans, c, 0, 0);
2211 retry:
2212         bch2_trans_begin(&trans);
2213
2214         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
2215                                    POS(inode, rbio->bio.bi_iter.bi_sector),
2216                                    BTREE_ITER_SLOTS);
2217         while (1) {
2218                 unsigned bytes, sectors, offset_into_extent;
2219
2220                 bch2_btree_iter_set_pos(iter,
2221                                 POS(inode, rbio->bio.bi_iter.bi_sector));
2222
2223                 k = bch2_btree_iter_peek_slot(iter);
2224                 ret = bkey_err(k);
2225                 if (ret)
2226                         goto err;
2227
2228                 offset_into_extent = iter->pos.offset -
2229                         bkey_start_offset(k.k);
2230                 sectors = k.k->size - offset_into_extent;
2231
2232                 bkey_on_stack_reassemble(&sk, c, k);
2233                 k = bkey_i_to_s_c(sk.k);
2234
2235                 ret = bch2_read_indirect_extent(&trans,
2236                                         &offset_into_extent, sk.k);
2237                 if (ret)
2238                         goto err;
2239
2240                 /*
2241                  * With indirect extents, the amount of data to read is the min
2242                  * of the original extent and the indirect extent:
2243                  */
2244                 sectors = min(sectors, k.k->size - offset_into_extent);
2245
2246                 /*
2247                  * Unlock the transaction while the btree node is still in
2248                  * cache, before doing the IO:
2249                  */
2250                 bch2_trans_unlock(&trans);
2251
2252                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
2253                 swap(rbio->bio.bi_iter.bi_size, bytes);
2254
2255                 if (rbio->bio.bi_iter.bi_size == bytes)
2256                         flags |= BCH_READ_LAST_FRAGMENT;
2257
2258                 bch2_read_extent(c, rbio, k, offset_into_extent, flags);
2259
2260                 if (flags & BCH_READ_LAST_FRAGMENT)
2261                         break;
2262
2263                 swap(rbio->bio.bi_iter.bi_size, bytes);
2264                 bio_advance(&rbio->bio, bytes);
2265         }
2266 out:
2267         bch2_trans_exit(&trans);
2268         bkey_on_stack_exit(&sk, c);
2269         return;
2270 err:
2271         if (ret == -EINTR)
2272                 goto retry;
2273
2274         bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
2275         bch2_rbio_done(rbio);
2276         goto out;
2277 }
2278
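     /*
      * Filesystem-wide setup and teardown for the IO path: the read and write
      * biosets, the bounce page mempool and the promote table.
      */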
2279 void bch2_fs_io_exit(struct bch_fs *c)
2280 {
2281         if (c->promote_table.tbl)
2282                 rhashtable_destroy(&c->promote_table);
2283         mempool_exit(&c->bio_bounce_pages);
2284         bioset_exit(&c->bio_write);
2285         bioset_exit(&c->bio_read_split);
2286         bioset_exit(&c->bio_read);
2287 }
2288
2289 int bch2_fs_io_init(struct bch_fs *c)
2290 {
2291         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
2292                         BIOSET_NEED_BVECS) ||
2293             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
2294                         BIOSET_NEED_BVECS) ||
2295             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
2296                         BIOSET_NEED_BVECS) ||
2297             mempool_init_page_pool(&c->bio_bounce_pages,
2298                                    max_t(unsigned,
2299                                          c->opts.btree_node_size,
2300                                          c->sb.encoded_extent_max) /
2301                                    PAGE_SECTORS, 0) ||
2302             rhashtable_init(&c->promote_table, &bch_promote_params))
2303                 return -ENOMEM;
2304
2305         return 0;
2306 }