]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/io.c
Update bcachefs sources to d372ddcbfa bcachefs: Reorganize extents.c
[bcachefs-tools-debian] / libbcachefs / io.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "alloc_foreground.h"
11 #include "bkey_on_stack.h"
12 #include "bset.h"
13 #include "btree_update.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "compress.h"
17 #include "clock.h"
18 #include "debug.h"
19 #include "disk_groups.h"
20 #include "ec.h"
21 #include "error.h"
22 #include "extent_update.h"
23 #include "inode.h"
24 #include "io.h"
25 #include "journal.h"
26 #include "keylist.h"
27 #include "move.h"
28 #include "rebalance.h"
29 #include "super.h"
30 #include "super-io.h"
31
32 #include <linux/blkdev.h>
33 #include <linux/random.h>
34
35 #include <trace/events/bcachefs.h>
36
37 static bool bch2_target_congested(struct bch_fs *c, u16 target)
38 {
39         const struct bch_devs_mask *devs;
40         unsigned d, nr = 0, total = 0;
41         u64 now = local_clock(), last;
42         s64 congested;
43         struct bch_dev *ca;
44
45         if (!target)
46                 return false;
47
48         rcu_read_lock();
49         devs = bch2_target_to_mask(c, target);
50         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
51                 ca = rcu_dereference(c->devs[d]);
52                 if (!ca)
53                         continue;
54
55                 congested = atomic_read(&ca->congested);
56                 last = READ_ONCE(ca->congested_last);
57                 if (time_after64(now, last))
58                         congested -= (now - last) >> 12;
59
60                 total += max(congested, 0LL);
61                 nr++;
62         }
63         rcu_read_unlock();
64
65         return bch2_rand_range(nr * CONGESTED_MAX) < total;
66 }
67
68 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
69                                        u64 now, int rw)
70 {
71         u64 latency_capable =
72                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
73         /* ideally we'd be taking into account the device's variance here: */
74         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
75         s64 latency_over = io_latency - latency_threshold;
76
77         if (latency_threshold && latency_over > 0) {
78                 /*
79                  * bump up congested by approximately latency_over * 4 /
80                  * latency_threshold - we don't need much accuracy here so don't
81                  * bother with the divide:
82                  */
83                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
84                         atomic_add(latency_over >>
85                                    max_t(int, ilog2(latency_threshold) - 2, 0),
86                                    &ca->congested);
87
88                 ca->congested_last = now;
89         } else if (atomic_read(&ca->congested) > 0) {
90                 atomic_dec(&ca->congested);
91         }
92 }
93
94 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
95 {
96         atomic64_t *latency = &ca->cur_latency[rw];
97         u64 now = local_clock();
98         u64 io_latency = time_after64(now, submit_time)
99                 ? now - submit_time
100                 : 0;
101         u64 old, new, v = atomic64_read(latency);
102
103         do {
104                 old = v;
105
106                 /*
107                  * If the io latency was reasonably close to the current
108                  * latency, skip doing the update and atomic operation - most of
109                  * the time:
110                  */
111                 if (abs((int) (old - io_latency)) < (old >> 1) &&
112                     now & ~(~0 << 5))
113                         break;
114
115                 new = ewma_add(old, io_latency, 5);
116         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
117
118         bch2_congested_acct(ca, io_latency, now, rw);
119
120         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
121 }
122
123 /* Allocate, free from mempool: */
124
125 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
126 {
127         struct bvec_iter_all iter;
128         struct bio_vec *bv;
129
130         bio_for_each_segment_all(bv, bio, iter)
131                 if (bv->bv_page != ZERO_PAGE(0))
132                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
133         bio->bi_vcnt = 0;
134 }
135
136 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
137 {
138         struct page *page;
139
140         if (likely(!*using_mempool)) {
141                 page = alloc_page(GFP_NOIO);
142                 if (unlikely(!page)) {
143                         mutex_lock(&c->bio_bounce_pages_lock);
144                         *using_mempool = true;
145                         goto pool_alloc;
146
147                 }
148         } else {
149 pool_alloc:
150                 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
151         }
152
153         return page;
154 }
155
156 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
157                                size_t size)
158 {
159         bool using_mempool = false;
160
161         while (size) {
162                 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
163                 unsigned len = min(PAGE_SIZE, size);
164
165                 BUG_ON(!bio_add_page(bio, page, len, 0));
166                 size -= len;
167         }
168
169         if (using_mempool)
170                 mutex_unlock(&c->bio_bounce_pages_lock);
171 }
172
173 /* Extent update path: */
174
175 static int sum_sector_overwrites(struct btree_trans *trans,
176                                  struct btree_iter *extent_iter,
177                                  struct bkey_i *new,
178                                  bool may_allocate,
179                                  bool *maybe_extending,
180                                  s64 *delta)
181 {
182         struct btree_iter *iter;
183         struct bkey_s_c old;
184         int ret = 0;
185
186         *maybe_extending = true;
187         *delta = 0;
188
189         iter = bch2_trans_copy_iter(trans, extent_iter);
190         if (IS_ERR(iter))
191                 return PTR_ERR(iter);
192
193         for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
194                 if (!may_allocate &&
195                     bch2_bkey_nr_ptrs_fully_allocated(old) <
196                     bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
197                         ret = -ENOSPC;
198                         break;
199                 }
200
201                 *delta += (min(new->k.p.offset,
202                               old.k->p.offset) -
203                           max(bkey_start_offset(&new->k),
204                               bkey_start_offset(old.k))) *
205                         (bkey_extent_is_allocation(&new->k) -
206                          bkey_extent_is_allocation(old.k));
207
208                 if (bkey_cmp(old.k->p, new->k.p) >= 0) {
209                         /*
210                          * Check if there's already data above where we're
211                          * going to be writing to - this means we're definitely
212                          * not extending the file:
213                          *
214                          * Note that it's not sufficient to check if there's
215                          * data up to the sector offset we're going to be
216                          * writing to, because i_size could be up to one block
217                          * less:
218                          */
219                         if (!bkey_cmp(old.k->p, new->k.p))
220                                 old = bch2_btree_iter_next(iter);
221
222                         if (old.k && !bkey_err(old) &&
223                             old.k->p.inode == extent_iter->pos.inode &&
224                             bkey_extent_is_data(old.k))
225                                 *maybe_extending = false;
226
227                         break;
228                 }
229         }
230
231         bch2_trans_iter_put(trans, iter);
232         return ret;
233 }
234
235 int bch2_extent_update(struct btree_trans *trans,
236                        struct btree_iter *iter,
237                        struct bkey_i *k,
238                        struct disk_reservation *disk_res,
239                        u64 *journal_seq,
240                        u64 new_i_size,
241                        s64 *i_sectors_delta)
242 {
243         /* this must live until after bch2_trans_commit(): */
244         struct bkey_inode_buf inode_p;
245         bool extending = false;
246         s64 delta = 0;
247         int ret;
248
249         ret = bch2_extent_trim_atomic(k, iter);
250         if (ret)
251                 return ret;
252
253         ret = sum_sector_overwrites(trans, iter, k,
254                         disk_res && disk_res->sectors != 0,
255                         &extending, &delta);
256         if (ret)
257                 return ret;
258
259         new_i_size = extending
260                 ? min(k->k.p.offset << 9, new_i_size)
261                 : 0;
262
263         if (delta || new_i_size) {
264                 struct btree_iter *inode_iter;
265                 struct bch_inode_unpacked inode_u;
266
267                 inode_iter = bch2_inode_peek(trans, &inode_u,
268                                 k->k.p.inode, BTREE_ITER_INTENT);
269                 if (IS_ERR(inode_iter))
270                         return PTR_ERR(inode_iter);
271
272                 /*
273                  * XXX:
274                  * writeback can race a bit with truncate, because truncate
275                  * first updates the inode then truncates the pagecache. This is
276                  * ugly, but lets us preserve the invariant that the in memory
277                  * i_size is always >= the on disk i_size.
278                  *
279                 BUG_ON(new_i_size > inode_u.bi_size &&
280                        (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
281                  */
282                 BUG_ON(new_i_size > inode_u.bi_size && !extending);
283
284                 if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
285                     new_i_size > inode_u.bi_size)
286                         inode_u.bi_size = new_i_size;
287                 else
288                         new_i_size = 0;
289
290                 inode_u.bi_sectors += delta;
291
292                 if (delta || new_i_size) {
293                         bch2_inode_pack(&inode_p, &inode_u);
294                         bch2_trans_update(trans, inode_iter,
295                                           &inode_p.inode.k_i);
296                 }
297
298                 bch2_trans_iter_put(trans, inode_iter);
299         }
300
301         bch2_trans_update(trans, iter, k);
302
303         ret = bch2_trans_commit(trans, disk_res, journal_seq,
304                                 BTREE_INSERT_NOCHECK_RW|
305                                 BTREE_INSERT_NOFAIL|
306                                 BTREE_INSERT_ATOMIC|
307                                 BTREE_INSERT_USE_RESERVE);
308         if (!ret && i_sectors_delta)
309                 *i_sectors_delta += delta;
310
311         return ret;
312 }
313
314 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
315                    struct bpos end, u64 *journal_seq,
316                    s64 *i_sectors_delta)
317 {
318         struct bch_fs *c        = trans->c;
319         unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
320         struct bkey_s_c k;
321         int ret = 0, ret2 = 0;
322
323         while ((k = bch2_btree_iter_peek(iter)).k &&
324                bkey_cmp(iter->pos, end) < 0) {
325                 struct disk_reservation disk_res =
326                         bch2_disk_reservation_init(c, 0);
327                 struct bkey_i delete;
328
329                 ret = bkey_err(k);
330                 if (ret)
331                         goto btree_err;
332
333                 bkey_init(&delete.k);
334                 delete.k.p = iter->pos;
335
336                 /* create the biggest key we can */
337                 bch2_key_resize(&delete.k, max_sectors);
338                 bch2_cut_back(end, &delete);
339
340                 bch2_trans_begin_updates(trans);
341
342                 ret = bch2_extent_update(trans, iter, &delete,
343                                 &disk_res, journal_seq,
344                                 0, i_sectors_delta);
345                 bch2_disk_reservation_put(c, &disk_res);
346 btree_err:
347                 if (ret == -EINTR) {
348                         ret2 = ret;
349                         ret = 0;
350                 }
351                 if (ret)
352                         break;
353         }
354
355         if (bkey_cmp(iter->pos, end) > 0) {
356                 bch2_btree_iter_set_pos(iter, end);
357                 ret = bch2_btree_iter_traverse(iter);
358         }
359
360         return ret ?: ret2;
361 }
362
363 int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
364                 u64 *journal_seq, s64 *i_sectors_delta)
365 {
366         struct btree_trans trans;
367         struct btree_iter *iter;
368         int ret = 0;
369
370         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
371         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
372                                    POS(inum, start),
373                                    BTREE_ITER_INTENT);
374
375         ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
376                              journal_seq, i_sectors_delta);
377         bch2_trans_exit(&trans);
378
379         if (ret == -EINTR)
380                 ret = 0;
381
382         return ret;
383 }
384
385 int bch2_write_index_default(struct bch_write_op *op)
386 {
387         struct bch_fs *c = op->c;
388         struct bkey_on_stack sk;
389         struct keylist *keys = &op->insert_keys;
390         struct bkey_i *k = bch2_keylist_front(keys);
391         struct btree_trans trans;
392         struct btree_iter *iter;
393         int ret;
394
395         bkey_on_stack_init(&sk);
396         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
397
398         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
399                                    bkey_start_pos(&k->k),
400                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
401
402         do {
403                 k = bch2_keylist_front(keys);
404
405                 bkey_on_stack_realloc(&sk, c, k->k.u64s);
406                 bkey_copy(sk.k, k);
407                 bch2_cut_front(iter->pos, sk.k);
408
409                 bch2_trans_begin_updates(&trans);
410
411                 ret = bch2_extent_update(&trans, iter, sk.k,
412                                          &op->res, op_journal_seq(op),
413                                          op->new_i_size, &op->i_sectors_delta);
414                 if (ret == -EINTR)
415                         continue;
416                 if (ret)
417                         break;
418
419                 if (bkey_cmp(iter->pos, k->k.p) >= 0)
420                         bch2_keylist_pop_front(keys);
421         } while (!bch2_keylist_empty(keys));
422
423         bch2_trans_exit(&trans);
424         bkey_on_stack_exit(&sk, c);
425
426         return ret;
427 }
428
429 /* Writes */
430
431 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
432                                enum bch_data_type type,
433                                const struct bkey_i *k)
434 {
435         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
436         const struct bch_extent_ptr *ptr;
437         struct bch_write_bio *n;
438         struct bch_dev *ca;
439
440         BUG_ON(c->opts.nochanges);
441
442         bkey_for_each_ptr(ptrs, ptr) {
443                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
444                        !c->devs[ptr->dev]);
445
446                 ca = bch_dev_bkey_exists(c, ptr->dev);
447
448                 if (to_entry(ptr + 1) < ptrs.end) {
449                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
450                                                    &ca->replica_set));
451
452                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
453                         n->bio.bi_private       = wbio->bio.bi_private;
454                         n->parent               = wbio;
455                         n->split                = true;
456                         n->bounce               = false;
457                         n->put_bio              = true;
458                         n->bio.bi_opf           = wbio->bio.bi_opf;
459                         bio_inc_remaining(&wbio->bio);
460                 } else {
461                         n = wbio;
462                         n->split                = false;
463                 }
464
465                 n->c                    = c;
466                 n->dev                  = ptr->dev;
467                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
468                 n->submit_time          = local_clock();
469                 n->bio.bi_iter.bi_sector = ptr->offset;
470
471                 if (!journal_flushes_device(ca))
472                         n->bio.bi_opf |= REQ_FUA;
473
474                 if (likely(n->have_ioref)) {
475                         this_cpu_add(ca->io_done->sectors[WRITE][type],
476                                      bio_sectors(&n->bio));
477
478                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
479                         submit_bio(&n->bio);
480                 } else {
481                         n->bio.bi_status        = BLK_STS_REMOVED;
482                         bio_endio(&n->bio);
483                 }
484         }
485 }
486
487 static void __bch2_write(struct closure *);
488
489 static void bch2_write_done(struct closure *cl)
490 {
491         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
492         struct bch_fs *c = op->c;
493
494         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
495                 op->error = bch2_journal_error(&c->journal);
496
497         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
498                 bch2_disk_reservation_put(c, &op->res);
499         percpu_ref_put(&c->writes);
500         bch2_keylist_free(&op->insert_keys, op->inline_keys);
501
502         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
503
504         if (op->end_io)
505                 op->end_io(op);
506         if (cl->parent)
507                 closure_return(cl);
508         else
509                 closure_debug_destroy(cl);
510 }
511
512 /**
513  * bch_write_index - after a write, update index to point to new data
514  */
515 static void __bch2_write_index(struct bch_write_op *op)
516 {
517         struct bch_fs *c = op->c;
518         struct keylist *keys = &op->insert_keys;
519         struct bch_extent_ptr *ptr;
520         struct bkey_i *src, *dst = keys->keys, *n, *k;
521         unsigned dev;
522         int ret;
523
524         for (src = keys->keys; src != keys->top; src = n) {
525                 n = bkey_next(src);
526
527                 if (bkey_extent_is_direct_data(&src->k)) {
528                         bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
529                                             test_bit(ptr->dev, op->failed.d));
530
531                         if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
532                                 ret = -EIO;
533                                 goto err;
534                         }
535                 }
536
537                 if (dst != src)
538                         memmove_u64s_down(dst, src, src->u64s);
539                 dst = bkey_next(dst);
540         }
541
542         keys->top = dst;
543
544         /*
545          * probably not the ideal place to hook this in, but I don't
546          * particularly want to plumb io_opts all the way through the btree
547          * update stack right now
548          */
549         for_each_keylist_key(keys, k)
550                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
551
552         if (!bch2_keylist_empty(keys)) {
553                 u64 sectors_start = keylist_sectors(keys);
554                 int ret = op->index_update_fn(op);
555
556                 BUG_ON(ret == -EINTR);
557                 BUG_ON(keylist_sectors(keys) && !ret);
558
559                 op->written += sectors_start - keylist_sectors(keys);
560
561                 if (ret) {
562                         __bcache_io_error(c, "btree IO error %i", ret);
563                         op->error = ret;
564                 }
565         }
566 out:
567         /* If some a bucket wasn't written, we can't erasure code it: */
568         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
569                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
570
571         bch2_open_buckets_put(c, &op->open_buckets);
572         return;
573 err:
574         keys->top = keys->keys;
575         op->error = ret;
576         goto out;
577 }
578
579 static void bch2_write_index(struct closure *cl)
580 {
581         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
582         struct bch_fs *c = op->c;
583
584         __bch2_write_index(op);
585
586         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
587                 bch2_journal_flush_seq_async(&c->journal,
588                                              *op_journal_seq(op),
589                                              cl);
590                 continue_at(cl, bch2_write_done, index_update_wq(op));
591         } else {
592                 continue_at_nobarrier(cl, bch2_write_done, NULL);
593         }
594 }
595
596 static void bch2_write_endio(struct bio *bio)
597 {
598         struct closure *cl              = bio->bi_private;
599         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
600         struct bch_write_bio *wbio      = to_wbio(bio);
601         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
602         struct bch_fs *c                = wbio->c;
603         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
604
605         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
606                 set_bit(wbio->dev, op->failed.d);
607
608         if (wbio->have_ioref) {
609                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
610                 percpu_ref_put(&ca->io_ref);
611         }
612
613         if (wbio->bounce)
614                 bch2_bio_free_pages_pool(c, bio);
615
616         if (wbio->put_bio)
617                 bio_put(bio);
618
619         if (parent)
620                 bio_endio(&parent->bio);
621         else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
622                 closure_put(cl);
623         else
624                 continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
625 }
626
627 static void init_append_extent(struct bch_write_op *op,
628                                struct write_point *wp,
629                                struct bversion version,
630                                struct bch_extent_crc_unpacked crc)
631 {
632         struct bch_fs *c = op->c;
633         struct bkey_i_extent *e;
634         struct open_bucket *ob;
635         unsigned i;
636
637         BUG_ON(crc.compressed_size > wp->sectors_free);
638         wp->sectors_free -= crc.compressed_size;
639         op->pos.offset += crc.uncompressed_size;
640
641         e = bkey_extent_init(op->insert_keys.top);
642         e->k.p          = op->pos;
643         e->k.size       = crc.uncompressed_size;
644         e->k.version    = version;
645
646         if (crc.csum_type ||
647             crc.compression_type ||
648             crc.nonce)
649                 bch2_extent_crc_append(&e->k_i, crc);
650
651         open_bucket_for_each(c, &wp->ptrs, ob, i) {
652                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
653                 union bch_extent_entry *end =
654                         bkey_val_end(bkey_i_to_s(&e->k_i));
655
656                 end->ptr = ob->ptr;
657                 end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
658                 end->ptr.cached = !ca->mi.durability ||
659                         (op->flags & BCH_WRITE_CACHED) != 0;
660                 end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
661
662                 e->k.u64s++;
663
664                 BUG_ON(crc.compressed_size > ob->sectors_free);
665                 ob->sectors_free -= crc.compressed_size;
666         }
667
668         bch2_keylist_push(&op->insert_keys);
669 }
670
671 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
672                                         struct write_point *wp,
673                                         struct bio *src,
674                                         bool *page_alloc_failed,
675                                         void *buf)
676 {
677         struct bch_write_bio *wbio;
678         struct bio *bio;
679         unsigned output_available =
680                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
681         unsigned pages = DIV_ROUND_UP(output_available +
682                                       (buf
683                                        ? ((unsigned long) buf & (PAGE_SIZE - 1))
684                                        : 0), PAGE_SIZE);
685
686         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
687         wbio                    = wbio_init(bio);
688         wbio->put_bio           = true;
689         /* copy WRITE_SYNC flag */
690         wbio->bio.bi_opf        = src->bi_opf;
691
692         if (buf) {
693                 bch2_bio_map(bio, buf, output_available);
694                 return bio;
695         }
696
697         wbio->bounce            = true;
698
699         /*
700          * We can't use mempool for more than c->sb.encoded_extent_max
701          * worth of pages, but we'd like to allocate more if we can:
702          */
703         bch2_bio_alloc_pages_pool(c, bio,
704                                   min_t(unsigned, output_available,
705                                         c->sb.encoded_extent_max << 9));
706
707         if (bio->bi_iter.bi_size < output_available)
708                 *page_alloc_failed =
709                         bch2_bio_alloc_pages(bio,
710                                              output_available -
711                                              bio->bi_iter.bi_size,
712                                              GFP_NOFS) != 0;
713
714         return bio;
715 }
716
717 static int bch2_write_rechecksum(struct bch_fs *c,
718                                  struct bch_write_op *op,
719                                  unsigned new_csum_type)
720 {
721         struct bio *bio = &op->wbio.bio;
722         struct bch_extent_crc_unpacked new_crc;
723         int ret;
724
725         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
726
727         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
728             bch2_csum_type_is_encryption(new_csum_type))
729                 new_csum_type = op->crc.csum_type;
730
731         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
732                                   NULL, &new_crc,
733                                   op->crc.offset, op->crc.live_size,
734                                   new_csum_type);
735         if (ret)
736                 return ret;
737
738         bio_advance(bio, op->crc.offset << 9);
739         bio->bi_iter.bi_size = op->crc.live_size << 9;
740         op->crc = new_crc;
741         return 0;
742 }
743
744 static int bch2_write_decrypt(struct bch_write_op *op)
745 {
746         struct bch_fs *c = op->c;
747         struct nonce nonce = extent_nonce(op->version, op->crc);
748         struct bch_csum csum;
749
750         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
751                 return 0;
752
753         /*
754          * If we need to decrypt data in the write path, we'll no longer be able
755          * to verify the existing checksum (poly1305 mac, in this case) after
756          * it's decrypted - this is the last point we'll be able to reverify the
757          * checksum:
758          */
759         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
760         if (bch2_crc_cmp(op->crc.csum, csum))
761                 return -EIO;
762
763         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
764         op->crc.csum_type = 0;
765         op->crc.csum = (struct bch_csum) { 0, 0 };
766         return 0;
767 }
768
769 static enum prep_encoded_ret {
770         PREP_ENCODED_OK,
771         PREP_ENCODED_ERR,
772         PREP_ENCODED_CHECKSUM_ERR,
773         PREP_ENCODED_DO_WRITE,
774 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
775 {
776         struct bch_fs *c = op->c;
777         struct bio *bio = &op->wbio.bio;
778
779         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
780                 return PREP_ENCODED_OK;
781
782         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
783
784         /* Can we just write the entire extent as is? */
785         if (op->crc.uncompressed_size == op->crc.live_size &&
786             op->crc.compressed_size <= wp->sectors_free &&
787             op->crc.compression_type == op->compression_type) {
788                 if (!op->crc.compression_type &&
789                     op->csum_type != op->crc.csum_type &&
790                     bch2_write_rechecksum(c, op, op->csum_type))
791                         return PREP_ENCODED_CHECKSUM_ERR;
792
793                 return PREP_ENCODED_DO_WRITE;
794         }
795
796         /*
797          * If the data is compressed and we couldn't write the entire extent as
798          * is, we have to decompress it:
799          */
800         if (op->crc.compression_type) {
801                 struct bch_csum csum;
802
803                 if (bch2_write_decrypt(op))
804                         return PREP_ENCODED_CHECKSUM_ERR;
805
806                 /* Last point we can still verify checksum: */
807                 csum = bch2_checksum_bio(c, op->crc.csum_type,
808                                          extent_nonce(op->version, op->crc),
809                                          bio);
810                 if (bch2_crc_cmp(op->crc.csum, csum))
811                         return PREP_ENCODED_CHECKSUM_ERR;
812
813                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
814                         return PREP_ENCODED_ERR;
815         }
816
817         /*
818          * No longer have compressed data after this point - data might be
819          * encrypted:
820          */
821
822         /*
823          * If the data is checksummed and we're only writing a subset,
824          * rechecksum and adjust bio to point to currently live data:
825          */
826         if ((op->crc.live_size != op->crc.uncompressed_size ||
827              op->crc.csum_type != op->csum_type) &&
828             bch2_write_rechecksum(c, op, op->csum_type))
829                 return PREP_ENCODED_CHECKSUM_ERR;
830
831         /*
832          * If we want to compress the data, it has to be decrypted:
833          */
834         if ((op->compression_type ||
835              bch2_csum_type_is_encryption(op->crc.csum_type) !=
836              bch2_csum_type_is_encryption(op->csum_type)) &&
837             bch2_write_decrypt(op))
838                 return PREP_ENCODED_CHECKSUM_ERR;
839
840         return PREP_ENCODED_OK;
841 }
842
843 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
844                              struct bio **_dst)
845 {
846         struct bch_fs *c = op->c;
847         struct bio *src = &op->wbio.bio, *dst = src;
848         struct bvec_iter saved_iter;
849         void *ec_buf;
850         struct bpos ec_pos = op->pos;
851         unsigned total_output = 0, total_input = 0;
852         bool bounce = false;
853         bool page_alloc_failed = false;
854         int ret, more = 0;
855
856         BUG_ON(!bio_sectors(src));
857
858         ec_buf = bch2_writepoint_ec_buf(c, wp);
859
860         switch (bch2_write_prep_encoded_data(op, wp)) {
861         case PREP_ENCODED_OK:
862                 break;
863         case PREP_ENCODED_ERR:
864                 ret = -EIO;
865                 goto err;
866         case PREP_ENCODED_CHECKSUM_ERR:
867                 goto csum_err;
868         case PREP_ENCODED_DO_WRITE:
869                 /* XXX look for bug here */
870                 if (ec_buf) {
871                         dst = bch2_write_bio_alloc(c, wp, src,
872                                                    &page_alloc_failed,
873                                                    ec_buf);
874                         bio_copy_data(dst, src);
875                         bounce = true;
876                 }
877                 init_append_extent(op, wp, op->version, op->crc);
878                 goto do_write;
879         }
880
881         if (ec_buf ||
882             op->compression_type ||
883             (op->csum_type &&
884              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
885             (bch2_csum_type_is_encryption(op->csum_type) &&
886              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
887                 dst = bch2_write_bio_alloc(c, wp, src,
888                                            &page_alloc_failed,
889                                            ec_buf);
890                 bounce = true;
891         }
892
893         saved_iter = dst->bi_iter;
894
895         do {
896                 struct bch_extent_crc_unpacked crc =
897                         (struct bch_extent_crc_unpacked) { 0 };
898                 struct bversion version = op->version;
899                 size_t dst_len, src_len;
900
901                 if (page_alloc_failed &&
902                     bio_sectors(dst) < wp->sectors_free &&
903                     bio_sectors(dst) < c->sb.encoded_extent_max)
904                         break;
905
906                 BUG_ON(op->compression_type &&
907                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
908                        bch2_csum_type_is_encryption(op->crc.csum_type));
909                 BUG_ON(op->compression_type && !bounce);
910
911                 crc.compression_type = op->compression_type
912                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
913                                              op->compression_type)
914                         : 0;
915                 if (!crc.compression_type) {
916                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
917                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
918
919                         if (op->csum_type)
920                                 dst_len = min_t(unsigned, dst_len,
921                                                 c->sb.encoded_extent_max << 9);
922
923                         if (bounce) {
924                                 swap(dst->bi_iter.bi_size, dst_len);
925                                 bio_copy_data(dst, src);
926                                 swap(dst->bi_iter.bi_size, dst_len);
927                         }
928
929                         src_len = dst_len;
930                 }
931
932                 BUG_ON(!src_len || !dst_len);
933
934                 if (bch2_csum_type_is_encryption(op->csum_type)) {
935                         if (bversion_zero(version)) {
936                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
937                         } else {
938                                 crc.nonce = op->nonce;
939                                 op->nonce += src_len >> 9;
940                         }
941                 }
942
943                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
944                     !crc.compression_type &&
945                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
946                     bch2_csum_type_is_encryption(op->csum_type)) {
947                         /*
948                          * Note: when we're using rechecksum(), we need to be
949                          * checksumming @src because it has all the data our
950                          * existing checksum covers - if we bounced (because we
951                          * were trying to compress), @dst will only have the
952                          * part of the data the new checksum will cover.
953                          *
954                          * But normally we want to be checksumming post bounce,
955                          * because part of the reason for bouncing is so the
956                          * data can't be modified (by userspace) while it's in
957                          * flight.
958                          */
959                         if (bch2_rechecksum_bio(c, src, version, op->crc,
960                                         &crc, &op->crc,
961                                         src_len >> 9,
962                                         bio_sectors(src) - (src_len >> 9),
963                                         op->csum_type))
964                                 goto csum_err;
965                 } else {
966                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
967                             bch2_rechecksum_bio(c, src, version, op->crc,
968                                         NULL, &op->crc,
969                                         src_len >> 9,
970                                         bio_sectors(src) - (src_len >> 9),
971                                         op->crc.csum_type))
972                                 goto csum_err;
973
974                         crc.compressed_size     = dst_len >> 9;
975                         crc.uncompressed_size   = src_len >> 9;
976                         crc.live_size           = src_len >> 9;
977
978                         swap(dst->bi_iter.bi_size, dst_len);
979                         bch2_encrypt_bio(c, op->csum_type,
980                                          extent_nonce(version, crc), dst);
981                         crc.csum = bch2_checksum_bio(c, op->csum_type,
982                                          extent_nonce(version, crc), dst);
983                         crc.csum_type = op->csum_type;
984                         swap(dst->bi_iter.bi_size, dst_len);
985                 }
986
987                 init_append_extent(op, wp, version, crc);
988
989                 if (dst != src)
990                         bio_advance(dst, dst_len);
991                 bio_advance(src, src_len);
992                 total_output    += dst_len;
993                 total_input     += src_len;
994         } while (dst->bi_iter.bi_size &&
995                  src->bi_iter.bi_size &&
996                  wp->sectors_free &&
997                  !bch2_keylist_realloc(&op->insert_keys,
998                                       op->inline_keys,
999                                       ARRAY_SIZE(op->inline_keys),
1000                                       BKEY_EXTENT_U64s_MAX));
1001
1002         more = src->bi_iter.bi_size != 0;
1003
1004         dst->bi_iter = saved_iter;
1005
1006         if (dst == src && more) {
1007                 BUG_ON(total_output != total_input);
1008
1009                 dst = bio_split(src, total_input >> 9,
1010                                 GFP_NOIO, &c->bio_write);
1011                 wbio_init(dst)->put_bio = true;
1012                 /* copy WRITE_SYNC flag */
1013                 dst->bi_opf             = src->bi_opf;
1014         }
1015
1016         dst->bi_iter.bi_size = total_output;
1017 do_write:
1018         /* might have done a realloc... */
1019         bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
1020
1021         *_dst = dst;
1022         return more;
1023 csum_err:
1024         bch_err(c, "error verifying existing checksum while "
1025                 "rewriting existing data (memory corruption?)");
1026         ret = -EIO;
1027 err:
1028         if (to_wbio(dst)->bounce)
1029                 bch2_bio_free_pages_pool(c, dst);
1030         if (to_wbio(dst)->put_bio)
1031                 bio_put(dst);
1032
1033         return ret;
1034 }
1035
1036 static void __bch2_write(struct closure *cl)
1037 {
1038         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1039         struct bch_fs *c = op->c;
1040         struct write_point *wp;
1041         struct bio *bio;
1042         bool skip_put = true;
1043         int ret;
1044 again:
1045         memset(&op->failed, 0, sizeof(op->failed));
1046
1047         do {
1048                 struct bkey_i *key_to_write;
1049                 unsigned key_to_write_offset = op->insert_keys.top_p -
1050                         op->insert_keys.keys_p;
1051
1052                 /* +1 for possible cache device: */
1053                 if (op->open_buckets.nr + op->nr_replicas + 1 >
1054                     ARRAY_SIZE(op->open_buckets.v))
1055                         goto flush_io;
1056
1057                 if (bch2_keylist_realloc(&op->insert_keys,
1058                                         op->inline_keys,
1059                                         ARRAY_SIZE(op->inline_keys),
1060                                         BKEY_EXTENT_U64s_MAX))
1061                         goto flush_io;
1062
1063                 wp = bch2_alloc_sectors_start(c,
1064                         op->target,
1065                         op->opts.erasure_code,
1066                         op->write_point,
1067                         &op->devs_have,
1068                         op->nr_replicas,
1069                         op->nr_replicas_required,
1070                         op->alloc_reserve,
1071                         op->flags,
1072                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
1073                 EBUG_ON(!wp);
1074
1075                 if (unlikely(IS_ERR(wp))) {
1076                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
1077                                 ret = PTR_ERR(wp);
1078                                 goto err;
1079                         }
1080
1081                         goto flush_io;
1082                 }
1083
1084                 bch2_open_bucket_get(c, wp, &op->open_buckets);
1085                 ret = bch2_write_extent(op, wp, &bio);
1086                 bch2_alloc_sectors_done(c, wp);
1087
1088                 if (ret < 0)
1089                         goto err;
1090
1091                 if (ret)
1092                         skip_put = false;
1093
1094                 bio->bi_end_io  = bch2_write_endio;
1095                 bio->bi_private = &op->cl;
1096                 bio->bi_opf |= REQ_OP_WRITE;
1097
1098                 if (!skip_put)
1099                         closure_get(bio->bi_private);
1100                 else
1101                         op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
1102
1103                 key_to_write = (void *) (op->insert_keys.keys_p +
1104                                          key_to_write_offset);
1105
1106                 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
1107                                           key_to_write);
1108         } while (ret);
1109
1110         if (!skip_put)
1111                 continue_at(cl, bch2_write_index, index_update_wq(op));
1112         return;
1113 err:
1114         op->error = ret;
1115
1116         continue_at(cl, bch2_write_index, index_update_wq(op));
1117         return;
1118 flush_io:
1119         closure_sync(cl);
1120
1121         if (!bch2_keylist_empty(&op->insert_keys)) {
1122                 __bch2_write_index(op);
1123
1124                 if (op->error) {
1125                         continue_at_nobarrier(cl, bch2_write_done, NULL);
1126                         return;
1127                 }
1128         }
1129
1130         goto again;
1131 }
1132
1133 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
1134 {
1135         struct closure *cl = &op->cl;
1136         struct bio *bio = &op->wbio.bio;
1137         struct bvec_iter iter;
1138         struct bkey_i_inline_data *id;
1139         unsigned sectors;
1140         int ret;
1141
1142         ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
1143                                    ARRAY_SIZE(op->inline_keys),
1144                                    BKEY_U64s + DIV_ROUND_UP(data_len, 8));
1145         if (ret) {
1146                 op->error = ret;
1147                 goto err;
1148         }
1149
1150         sectors = bio_sectors(bio);
1151         op->pos.offset += sectors;
1152
1153         id = bkey_inline_data_init(op->insert_keys.top);
1154         id->k.p         = op->pos;
1155         id->k.version   = op->version;
1156         id->k.size      = sectors;
1157
1158         iter = bio->bi_iter;
1159         iter.bi_size = data_len;
1160         memcpy_from_bio(id->v.data, bio, iter);
1161
1162         while (data_len & 7)
1163                 id->v.data[data_len++] = '\0';
1164         set_bkey_val_bytes(&id->k, data_len);
1165         bch2_keylist_push(&op->insert_keys);
1166
1167         op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
1168         continue_at_nobarrier(cl, bch2_write_index, NULL);
1169         return;
1170 err:
1171         bch2_write_done(&op->cl);
1172 }
1173
1174 /**
1175  * bch_write - handle a write to a cache device or flash only volume
1176  *
1177  * This is the starting point for any data to end up in a cache device; it could
1178  * be from a normal write, or a writeback write, or a write to a flash only
1179  * volume - it's also used by the moving garbage collector to compact data in
1180  * mostly empty buckets.
1181  *
1182  * It first writes the data to the cache, creating a list of keys to be inserted
1183  * (if the data won't fit in a single open bucket, there will be multiple keys);
1184  * after the data is written it calls bch_journal, and after the keys have been
1185  * added to the next journal write they're inserted into the btree.
1186  *
1187  * If op->discard is true, instead of inserting the data it invalidates the
1188  * region of the cache represented by op->bio and op->inode.
1189  */
1190 void bch2_write(struct closure *cl)
1191 {
1192         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1193         struct bio *bio = &op->wbio.bio;
1194         struct bch_fs *c = op->c;
1195         unsigned data_len;
1196
1197         BUG_ON(!op->nr_replicas);
1198         BUG_ON(!op->write_point.v);
1199         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
1200
1201         op->start_time = local_clock();
1202         bch2_keylist_init(&op->insert_keys, op->inline_keys);
1203         wbio_init(bio)->put_bio = false;
1204
1205         if (bio_sectors(bio) & (c->opts.block_size - 1)) {
1206                 __bcache_io_error(c, "misaligned write");
1207                 op->error = -EIO;
1208                 goto err;
1209         }
1210
1211         if (c->opts.nochanges ||
1212             !percpu_ref_tryget(&c->writes)) {
1213                 __bcache_io_error(c, "read only");
1214                 op->error = -EROFS;
1215                 goto err;
1216         }
1217
1218         bch2_increment_clock(c, bio_sectors(bio), WRITE);
1219
1220         data_len = min_t(u64, bio->bi_iter.bi_size,
1221                          op->new_i_size - (op->pos.offset << 9));
1222
1223         if (data_len <= min(block_bytes(c) / 2, 1024U)) {
1224                 bch2_write_data_inline(op, data_len);
1225                 return;
1226         }
1227
1228         continue_at_nobarrier(cl, __bch2_write, NULL);
1229         return;
1230 err:
1231         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
1232                 bch2_disk_reservation_put(c, &op->res);
1233         if (op->end_io)
1234                 op->end_io(op);
1235         if (cl->parent)
1236                 closure_return(cl);
1237         else
1238                 closure_debug_destroy(cl);
1239 }
1240
1241 /* Cache promotion on read */
1242
1243 struct promote_op {
1244         struct closure          cl;
1245         struct rcu_head         rcu;
1246         u64                     start_time;
1247
1248         struct rhash_head       hash;
1249         struct bpos             pos;
1250
1251         struct migrate_write    write;
1252         struct bio_vec          bi_inline_vecs[0]; /* must be last */
1253 };
1254
1255 static const struct rhashtable_params bch_promote_params = {
1256         .head_offset    = offsetof(struct promote_op, hash),
1257         .key_offset     = offsetof(struct promote_op, pos),
1258         .key_len        = sizeof(struct bpos),
1259 };
1260
1261 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1262                                   struct bpos pos,
1263                                   struct bch_io_opts opts,
1264                                   unsigned flags)
1265 {
1266         if (!(flags & BCH_READ_MAY_PROMOTE))
1267                 return false;
1268
1269         if (!opts.promote_target)
1270                 return false;
1271
1272         if (bch2_bkey_has_target(c, k, opts.promote_target))
1273                 return false;
1274
1275         if (bch2_target_congested(c, opts.promote_target)) {
1276                 /* XXX trace this */
1277                 return false;
1278         }
1279
1280         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1281                                    bch_promote_params))
1282                 return false;
1283
1284         return true;
1285 }
1286
1287 static void promote_free(struct bch_fs *c, struct promote_op *op)
1288 {
1289         int ret;
1290
1291         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1292                                      bch_promote_params);
1293         BUG_ON(ret);
1294         percpu_ref_put(&c->writes);
1295         kfree_rcu(op, rcu);
1296 }
1297
1298 static void promote_done(struct closure *cl)
1299 {
1300         struct promote_op *op =
1301                 container_of(cl, struct promote_op, cl);
1302         struct bch_fs *c = op->write.op.c;
1303
1304         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1305                                op->start_time);
1306
1307         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1308         promote_free(c, op);
1309 }
1310
1311 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1312 {
1313         struct bch_fs *c = rbio->c;
1314         struct closure *cl = &op->cl;
1315         struct bio *bio = &op->write.op.wbio.bio;
1316
1317         trace_promote(&rbio->bio);
1318
1319         /* we now own pages: */
1320         BUG_ON(!rbio->bounce);
1321         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1322
1323         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1324                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1325         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1326
1327         bch2_migrate_read_done(&op->write, rbio);
1328
1329         closure_init(cl, NULL);
1330         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1331         closure_return_with_destructor(cl, promote_done);
1332 }
1333
1334 static struct promote_op *__promote_alloc(struct bch_fs *c,
1335                                           enum btree_id btree_id,
1336                                           struct bpos pos,
1337                                           struct extent_ptr_decoded *pick,
1338                                           struct bch_io_opts opts,
1339                                           unsigned sectors,
1340                                           struct bch_read_bio **rbio)
1341 {
1342         struct promote_op *op = NULL;
1343         struct bio *bio;
1344         unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
1345         int ret;
1346
1347         if (!percpu_ref_tryget(&c->writes))
1348                 return NULL;
1349
1350         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
1351         if (!op)
1352                 goto err;
1353
1354         op->start_time = local_clock();
1355         op->pos = pos;
1356
1357         /*
1358          * We don't use the mempool here because extents that aren't
1359          * checksummed or compressed can be too big for the mempool:
1360          */
1361         *rbio = kzalloc(sizeof(struct bch_read_bio) +
1362                         sizeof(struct bio_vec) * pages,
1363                         GFP_NOIO);
1364         if (!*rbio)
1365                 goto err;
1366
1367         rbio_init(&(*rbio)->bio, opts);
1368         bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
1369
1370         if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
1371                                  GFP_NOIO))
1372                 goto err;
1373
1374         (*rbio)->bounce         = true;
1375         (*rbio)->split          = true;
1376         (*rbio)->kmalloc        = true;
1377
1378         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1379                                           bch_promote_params))
1380                 goto err;
1381
1382         bio = &op->write.op.wbio.bio;
1383         bio_init(bio, bio->bi_inline_vecs, pages);
1384
1385         ret = bch2_migrate_write_init(c, &op->write,
1386                         writepoint_hashed((unsigned long) current),
1387                         opts,
1388                         DATA_PROMOTE,
1389                         (struct data_opts) {
1390                                 .target = opts.promote_target
1391                         },
1392                         btree_id,
1393                         bkey_s_c_null);
1394         BUG_ON(ret);
1395
1396         return op;
1397 err:
1398         if (*rbio)
1399                 bio_free_pages(&(*rbio)->bio);
1400         kfree(*rbio);
1401         *rbio = NULL;
1402         kfree(op);
1403         percpu_ref_put(&c->writes);
1404         return NULL;
1405 }
1406
1407 noinline
1408 static struct promote_op *promote_alloc(struct bch_fs *c,
1409                                                struct bvec_iter iter,
1410                                                struct bkey_s_c k,
1411                                                struct extent_ptr_decoded *pick,
1412                                                struct bch_io_opts opts,
1413                                                unsigned flags,
1414                                                struct bch_read_bio **rbio,
1415                                                bool *bounce,
1416                                                bool *read_full)
1417 {
1418         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1419         /* data might have to be decompressed in the write path: */
1420         unsigned sectors = promote_full
1421                 ? max(pick->crc.compressed_size, pick->crc.live_size)
1422                 : bvec_iter_sectors(iter);
1423         struct bpos pos = promote_full
1424                 ? bkey_start_pos(k.k)
1425                 : POS(k.k->p.inode, iter.bi_sector);
1426         struct promote_op *promote;
1427
1428         if (!should_promote(c, k, pos, opts, flags))
1429                 return NULL;
1430
1431         promote = __promote_alloc(c,
1432                                   k.k->type == KEY_TYPE_reflink_v
1433                                   ? BTREE_ID_REFLINK
1434                                   : BTREE_ID_EXTENTS,
1435                                   pos, pick, opts, sectors, rbio);
1436         if (!promote)
1437                 return NULL;
1438
1439         *bounce         = true;
1440         *read_full      = promote_full;
1441         return promote;
1442 }
1443
1444 /* Read */
1445
1446 #define READ_RETRY_AVOID        1
1447 #define READ_RETRY              2
1448 #define READ_ERR                3
1449
1450 enum rbio_context {
1451         RBIO_CONTEXT_NULL,
1452         RBIO_CONTEXT_HIGHPRI,
1453         RBIO_CONTEXT_UNBOUND,
1454 };
1455
1456 static inline struct bch_read_bio *
1457 bch2_rbio_parent(struct bch_read_bio *rbio)
1458 {
1459         return rbio->split ? rbio->parent : rbio;
1460 }
1461
1462 __always_inline
1463 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1464                            enum rbio_context context,
1465                            struct workqueue_struct *wq)
1466 {
1467         if (context <= rbio->context) {
1468                 fn(&rbio->work);
1469         } else {
1470                 rbio->work.func         = fn;
1471                 rbio->context           = context;
1472                 queue_work(wq, &rbio->work);
1473         }
1474 }
1475
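     /*
      * Tear down an rbio: drop any promote op, return bounce pages to the
      * pool, and if this was a split free the clone (kfree() or bio_put()
      * depending on how it was allocated) and return the parent for the
      * caller to finish.
      */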
1476 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1477 {
1478         BUG_ON(rbio->bounce && !rbio->split);
1479
1480         if (rbio->promote)
1481                 promote_free(rbio->c, rbio->promote);
1482         rbio->promote = NULL;
1483
1484         if (rbio->bounce)
1485                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1486
1487         if (rbio->split) {
1488                 struct bch_read_bio *parent = rbio->parent;
1489
1490                 if (rbio->kmalloc)
1491                         kfree(rbio);
1492                 else
1493                         bio_put(&rbio->bio);
1494
1495                 rbio = parent;
1496         }
1497
1498         return rbio;
1499 }
1500
1501 /*
1502  * Only called on a top level bch_read_bio to complete an entire read request,
1503  * not a split:
1504  */
1505 static void bch2_rbio_done(struct bch_read_bio *rbio)
1506 {
1507         if (rbio->start_time)
1508                 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1509                                        rbio->start_time);
1510         bio_endio(&rbio->bio);
1511 }
1512
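     /*
      * Retry path for BCH_READ_NODECODE reads: re-look up the extent at
      * rbio->pos and, if it still matches the pointer we originally read from,
      * reissue the read; if the extent no longer exists, flag the rbio as a
      * hole rather than returning an error.
      */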
1513 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1514                                      struct bvec_iter bvec_iter, u64 inode,
1515                                      struct bch_io_failures *failed,
1516                                      unsigned flags)
1517 {
1518         struct btree_trans trans;
1519         struct btree_iter *iter;
1520         struct bkey_on_stack sk;
1521         struct bkey_s_c k;
1522         int ret;
1523
1524         flags &= ~BCH_READ_LAST_FRAGMENT;
1525         flags |= BCH_READ_MUST_CLONE;
1526
1527         bkey_on_stack_init(&sk);
1528         bch2_trans_init(&trans, c, 0, 0);
1529
1530         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1531                                    rbio->pos, BTREE_ITER_SLOTS);
1532 retry:
1533         rbio->bio.bi_status = 0;
1534
1535         k = bch2_btree_iter_peek_slot(iter);
1536         if (bkey_err(k))
1537                 goto err;
1538
1539         bkey_on_stack_realloc(&sk, c, k.k->u64s);
1540         bkey_reassemble(sk.k, k);
1541         k = bkey_i_to_s_c(sk.k);
1542         bch2_trans_unlock(&trans);
1543
1544         if (!bch2_bkey_matches_ptr(c, k,
1545                                    rbio->pick.ptr,
1546                                    rbio->pos.offset -
1547                                    rbio->pick.crc.offset)) {
1548                 /* extent we wanted to read no longer exists: */
1549                 rbio->hole = true;
1550                 goto out;
1551         }
1552
1553         ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
1554         if (ret == READ_RETRY)
1555                 goto retry;
1556         if (ret)
1557                 goto err;
1558 out:
1559         bch2_rbio_done(rbio);
1560         bch2_trans_exit(&trans);
1561         bkey_on_stack_exit(&sk, c);
1562         return;
1563 err:
1564         rbio->bio.bi_status = BLK_STS_IOERR;
1565         goto out;
1566 }
1567
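     /*
      * Retry path for normal reads: re-walk the extents btree over the
      * remaining part of the request, resolving indirect extents, and reissue
      * the read one extent at a time with BCH_READ_MUST_CLONE forced.
      */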
1568 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1569                             struct bvec_iter bvec_iter, u64 inode,
1570                             struct bch_io_failures *failed, unsigned flags)
1571 {
1572         struct btree_trans trans;
1573         struct btree_iter *iter;
1574         struct bkey_on_stack sk;
1575         struct bkey_s_c k;
1576         int ret;
1577
1578         flags &= ~BCH_READ_LAST_FRAGMENT;
1579         flags |= BCH_READ_MUST_CLONE;
1580
1581         bkey_on_stack_init(&sk);
1582         bch2_trans_init(&trans, c, 0, 0);
1583 retry:
1584         bch2_trans_begin(&trans);
1585
1586         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1587                            POS(inode, bvec_iter.bi_sector),
1588                            BTREE_ITER_SLOTS, k, ret) {
1589                 unsigned bytes, sectors, offset_into_extent;
1590
1591                 bkey_on_stack_realloc(&sk, c, k.k->u64s);
1592                 bkey_reassemble(sk.k, k);
1593                 k = bkey_i_to_s_c(sk.k);
1594
1595                 offset_into_extent = iter->pos.offset -
1596                         bkey_start_offset(k.k);
1597                 sectors = k.k->size - offset_into_extent;
1598
1599                 ret = bch2_read_indirect_extent(&trans,
1600                                         &offset_into_extent, sk.k);
1601                 if (ret)
1602                         break;
1603
1604                 sectors = min(sectors, k.k->size - offset_into_extent);
1605
1606                 bch2_trans_unlock(&trans);
1607
1608                 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
1609                 swap(bvec_iter.bi_size, bytes);
1610
1611                 ret = __bch2_read_extent(c, rbio, bvec_iter, k,
1612                                 offset_into_extent, failed, flags);
1613                 switch (ret) {
1614                 case READ_RETRY:
1615                         goto retry;
1616                 case READ_ERR:
1617                         goto err;
1618                 }
1619
1620                 if (bytes == bvec_iter.bi_size)
1621                         goto out;
1622
1623                 swap(bvec_iter.bi_size, bytes);
1624                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1625         }
1626
1627         if (ret == -EINTR)
1628                 goto retry;
1629         /*
1630          * If we get here, it had better have been because there was an error
1631          * reading a btree node
1632          */
1633         BUG_ON(!ret);
1634         __bcache_io_error(c, "btree IO error: %i", ret);
1635 err:
1636         rbio->bio.bi_status = BLK_STS_IOERR;
1637 out:
1638         bch2_trans_exit(&trans);
1639         bkey_on_stack_exit(&sk, c);
1640         bch2_rbio_done(rbio);
1641 }
1642
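     /*
      * Work item that retries a failed read: for READ_RETRY_AVOID the device
      * that failed is recorded so that another replica gets picked, then the
      * request is resubmitted through the nodecode or normal retry path with
      * promotion disabled.
      */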
1643 static void bch2_rbio_retry(struct work_struct *work)
1644 {
1645         struct bch_read_bio *rbio =
1646                 container_of(work, struct bch_read_bio, work);
1647         struct bch_fs *c        = rbio->c;
1648         struct bvec_iter iter   = rbio->bvec_iter;
1649         unsigned flags          = rbio->flags;
1650         u64 inode               = rbio->pos.inode;
1651         struct bch_io_failures failed = { .nr = 0 };
1652
1653         trace_read_retry(&rbio->bio);
1654
1655         if (rbio->retry == READ_RETRY_AVOID)
1656                 bch2_mark_io_failure(&failed, &rbio->pick);
1657
1658         rbio->bio.bi_status = 0;
1659
1660         rbio = bch2_rbio_free(rbio);
1661
1662         flags |= BCH_READ_IN_RETRY;
1663         flags &= ~BCH_READ_MAY_PROMOTE;
1664
1665         if (flags & BCH_READ_NODECODE)
1666                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1667         else
1668                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1669 }
1670
1671 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1672                             blk_status_t error)
1673 {
1674         rbio->retry = retry;
1675
1676         if (rbio->flags & BCH_READ_IN_RETRY)
1677                 return;
1678
1679         if (retry == READ_ERR) {
1680                 rbio = bch2_rbio_free(rbio);
1681
1682                 rbio->bio.bi_status = error;
1683                 bch2_rbio_done(rbio);
1684         } else {
1685                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1686                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1687         }
1688 }
1689
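     /*
      * Checksum narrowing: if the checksum we just verified covers more data
      * than the extent currently references (e.g. because the extent was
      * partially overwritten), compute a new checksum over just the live
      * portion and update the extent, so that future reads needn't read and
      * checksum the extra data. Best effort only - the update is done with
      * BTREE_INSERT_NOWAIT and failures other than -EINTR are ignored.
      */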
1690 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1691 {
1692         struct bch_fs *c = rbio->c;
1693         struct btree_trans trans;
1694         struct btree_iter *iter;
1695         struct bkey_s_c k;
1696         struct bkey_on_stack new;
1697         struct bch_extent_crc_unpacked new_crc;
1698         u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
1699         int ret;
1700
1701         if (rbio->pick.crc.compression_type)
1702                 return;
1703
1704         bkey_on_stack_init(&new);
1705         bch2_trans_init(&trans, c, 0, 0);
1706 retry:
1707         bch2_trans_begin(&trans);
1708
1709         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
1710                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
1711         k = bch2_btree_iter_peek_slot(iter);
1712         if (IS_ERR_OR_NULL(k.k))
1713                 goto out;
1714
1715         bkey_on_stack_realloc(&new, c, k.k->u64s);
1716         bkey_reassemble(new.k, k);
1717         k = bkey_i_to_s_c(new.k);
1718
1719         if (bversion_cmp(k.k->version, rbio->version) ||
1720             !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
1721                 goto out;
1722
1723         /* Extent was merged? */
1724         if (bkey_start_offset(k.k) < data_offset ||
1725             k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1726                 goto out;
1727
1728         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1729                         rbio->pick.crc, NULL, &new_crc,
1730                         bkey_start_offset(k.k) - data_offset, k.k->size,
1731                         rbio->pick.crc.csum_type)) {
1732                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1733                 goto out;
1734         }
1735
1736         if (!bch2_bkey_narrow_crcs(new.k, new_crc))
1737                 goto out;
1738
1739         bch2_trans_update(&trans, iter, new.k);
1740         ret = bch2_trans_commit(&trans, NULL, NULL,
1741                                 BTREE_INSERT_ATOMIC|
1742                                 BTREE_INSERT_NOFAIL|
1743                                 BTREE_INSERT_NOWAIT);
1744         if (ret == -EINTR)
1745                 goto retry;
1746 out:
1747         bch2_trans_exit(&trans);
1748         bkey_on_stack_exit(&new, c);
1749 }
1750
1751 /* Inner part that may run in process context */
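     /*
      * This verifies the checksum against the data that was read, optionally
      * narrows the extent's checksum, then decrypts and/or uncompresses into
      * the destination bio (copying out of the bounce buffer where needed),
      * and finally kicks off the promote write if one was set up.
      */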
1752 static void __bch2_read_endio(struct work_struct *work)
1753 {
1754         struct bch_read_bio *rbio =
1755                 container_of(work, struct bch_read_bio, work);
1756         struct bch_fs *c        = rbio->c;
1757         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1758         struct bio *src         = &rbio->bio;
1759         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1760         struct bvec_iter dst_iter = rbio->bvec_iter;
1761         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1762         struct nonce nonce = extent_nonce(rbio->version, crc);
1763         struct bch_csum csum;
1764
1765         /* Reset iterator for checksumming and copying bounced data: */
1766         if (rbio->bounce) {
1767                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1768                 src->bi_iter.bi_idx             = 0;
1769                 src->bi_iter.bi_bvec_done       = 0;
1770         } else {
1771                 src->bi_iter                    = rbio->bvec_iter;
1772         }
1773
1774         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1775         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1776                 goto csum_err;
1777
1778         if (unlikely(rbio->narrow_crcs))
1779                 bch2_rbio_narrow_crcs(rbio);
1780
1781         if (rbio->flags & BCH_READ_NODECODE)
1782                 goto nodecode;
1783
1784         /* Adjust crc to point to subset of data we want: */
1785         crc.offset     += rbio->offset_into_extent;
1786         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1787
1788         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1789                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1790                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1791                         goto decompression_err;
1792         } else {
1793                 /* don't need to decrypt the entire bio: */
1794                 nonce = nonce_add(nonce, crc.offset << 9);
1795                 bio_advance(src, crc.offset << 9);
1796
1797                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1798                 src->bi_iter.bi_size = dst_iter.bi_size;
1799
1800                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1801
1802                 if (rbio->bounce) {
1803                         struct bvec_iter src_iter = src->bi_iter;
1804                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1805                 }
1806         }
1807
1808         if (rbio->promote) {
1809                 /*
1810                  * Re-encrypt the data we decrypted, so it's consistent with
1811                  * rbio->crc:
1812                  */
1813                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1814                 promote_start(rbio->promote, rbio);
1815                 rbio->promote = NULL;
1816         }
1817 nodecode:
1818         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1819                 rbio = bch2_rbio_free(rbio);
1820                 bch2_rbio_done(rbio);
1821         }
1822         return;
1823 csum_err:
1824         /*
1825          * Checksum error: if the bio wasn't bounced, we may have been
1826          * reading into buffers owned by userspace (that userspace can
1827          * scribble over) - retry the read, bouncing it this time:
1828          */
1829         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1830                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1831                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1832                 return;
1833         }
1834
1835         bch2_dev_io_error(ca,
1836                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1837                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1838                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1839                 csum.hi, csum.lo, crc.csum_type);
1840         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1841         return;
1842 decompression_err:
1843         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1844                           rbio->pos.inode,
1845                           (u64) rbio->bvec_iter.bi_sector);
1846         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1847         return;
1848 }
1849
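     /*
      * bio completion callback: account read latency and drop the device ref
      * (if we took one), check for IO errors and for the pointer having gone
      * stale underneath us (for cached data), then punt the rest of completion
      * to a context that can checksum, decrypt or decompress as needed.
      */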
1850 static void bch2_read_endio(struct bio *bio)
1851 {
1852         struct bch_read_bio *rbio =
1853                 container_of(bio, struct bch_read_bio, bio);
1854         struct bch_fs *c        = rbio->c;
1855         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1856         struct workqueue_struct *wq = NULL;
1857         enum rbio_context context = RBIO_CONTEXT_NULL;
1858
1859         if (rbio->have_ioref) {
1860                 bch2_latency_acct(ca, rbio->submit_time, READ);
1861                 percpu_ref_put(&ca->io_ref);
1862         }
1863
1864         if (!rbio->split)
1865                 rbio->bio.bi_end_io = rbio->end_io;
1866
1867         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1868                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1869                 return;
1870         }
1871
1872         if (rbio->pick.ptr.cached &&
1873             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1874              ptr_stale(ca, &rbio->pick.ptr))) {
1875                 atomic_long_inc(&c->read_realloc_races);
1876
1877                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1878                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1879                 else
1880                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1881                 return;
1882         }
1883
1884         if (rbio->narrow_crcs ||
1885             rbio->pick.crc.compression_type ||
1886             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1887                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1888         else if (rbio->pick.crc.csum_type)
1889                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1890
1891         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1892 }
1893
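     /*
      * Resolve a reflink pointer: look up the indirect extent it points to in
      * the reflink btree, adjust *offset_into_extent to be relative to that
      * extent, and copy it into orig_k for the caller to read from.
      */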
1894 int __bch2_read_indirect_extent(struct btree_trans *trans,
1895                                 unsigned *offset_into_extent,
1896                                 struct bkey_i *orig_k)
1897 {
1898         struct btree_iter *iter;
1899         struct bkey_s_c k;
1900         u64 reflink_offset;
1901         int ret;
1902
1903         reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
1904                 *offset_into_extent;
1905
1906         iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
1907                                    POS(0, reflink_offset),
1908                                    BTREE_ITER_SLOTS);
1909         ret = PTR_ERR_OR_ZERO(iter);
1910         if (ret)
1911                 return ret;
1912
1913         k = bch2_btree_iter_peek_slot(iter);
1914         ret = bkey_err(k);
1915         if (ret)
1916                 goto err;
1917
1918         if (k.k->type != KEY_TYPE_reflink_v) {
1919                 __bcache_io_error(trans->c,
1920                                 "pointer to nonexistent indirect extent");
1921                 ret = -EIO;
1922                 goto err;
1923         }
1924
1925         *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
1926         bkey_reassemble(orig_k, k);
1927 err:
1928         bch2_trans_iter_put(trans, iter);
1929         return ret;
1930 }
1931
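     /*
      * Read a single extent: inline data is copied directly; otherwise pick a
      * device to read from (avoiding any in @failed), decide whether to
      * bounce, read the full extent and/or promote it, set up the original
      * rbio, a clone or a bounce rbio, and submit the bio - or attempt a
      * reconstruct read for erasure coded extents. With BCH_READ_IN_RETRY the
      * read is done synchronously and the READ_* result is returned to the
      * caller.
      */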
1932 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1933                        struct bvec_iter iter, struct bkey_s_c k,
1934                        unsigned offset_into_extent,
1935                        struct bch_io_failures *failed, unsigned flags)
1936 {
1937         struct extent_ptr_decoded pick;
1938         struct bch_read_bio *rbio = NULL;
1939         struct bch_dev *ca;
1940         struct promote_op *promote = NULL;
1941         bool bounce = false, read_full = false, narrow_crcs = false;
1942         struct bpos pos = bkey_start_pos(k.k);
1943         int pick_ret;
1944
1945         if (k.k->type == KEY_TYPE_inline_data) {
1946                 struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
1947                 unsigned bytes = min_t(unsigned, iter.bi_size,
1948                                        bkey_val_bytes(d.k));
1949
1950                 swap(iter.bi_size, bytes);
1951                 memcpy_to_bio(&orig->bio, iter, d.v->data);
1952                 swap(iter.bi_size, bytes);
1953                 bio_advance_iter(&orig->bio, &iter, bytes);
1954                 zero_fill_bio_iter(&orig->bio, iter);
1955                 goto out_read_done;
1956         }
1957
1958         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1959
1960         /* hole or reservation - just zero fill: */
1961         if (!pick_ret)
1962                 goto hole;
1963
1964         if (pick_ret < 0) {
1965                 __bcache_io_error(c, "no device to read from");
1966                 goto err;
1967         }
1968
1969         if (pick_ret > 0)
1970                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1971
1972         if (flags & BCH_READ_NODECODE) {
1973                 /*
1974                  * This can happen if we retry, and the extent we were going to read
1975                  * has been merged in the meantime:
1976                  */
1977                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1978                         goto hole;
1979
1980                 iter.bi_size    = pick.crc.compressed_size << 9;
1981                 goto noclone;
1982         }
1983
1984         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1985             bio_flagged(&orig->bio, BIO_CHAIN))
1986                 flags |= BCH_READ_MUST_CLONE;
1987
1988         narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
1989                 bch2_can_narrow_extent_crcs(k, pick.crc);
1990
1991         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1992                 flags |= BCH_READ_MUST_BOUNCE;
1993
1994         EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
1995
1996         if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
1997             (pick.crc.csum_type != BCH_CSUM_NONE &&
1998              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1999               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
2000                (flags & BCH_READ_USER_MAPPED)) ||
2001               (flags & BCH_READ_MUST_BOUNCE)))) {
2002                 read_full = true;
2003                 bounce = true;
2004         }
2005
2006         if (orig->opts.promote_target)
2007                 promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
2008                                         &rbio, &bounce, &read_full);
2009
2010         if (!read_full) {
2011                 EBUG_ON(pick.crc.compression_type);
2012                 EBUG_ON(pick.crc.csum_type &&
2013                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2014                          bvec_iter_sectors(iter) != pick.crc.live_size ||
2015                          pick.crc.offset ||
2016                          offset_into_extent));
2017
2018                 pos.offset += offset_into_extent;
2019                 pick.ptr.offset += pick.crc.offset +
2020                         offset_into_extent;
2021                 offset_into_extent              = 0;
2022                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
2023                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
2024                 pick.crc.offset                 = 0;
2025                 pick.crc.live_size              = bvec_iter_sectors(iter);
2027         }
2028
2029         if (rbio) {
2030                 /*
2031                  * the promote path already allocated a bounce rbio: the promote
2032                  * op needs a bio big enough for uncompressing the data in the
2033                  * write path, but we're not going to use all of it
2034                  * here:
2035                  */
2036                 EBUG_ON(rbio->bio.bi_iter.bi_size <
2037                        pick.crc.compressed_size << 9);
2038                 rbio->bio.bi_iter.bi_size =
2039                         pick.crc.compressed_size << 9;
2040         } else if (bounce) {
2041                 unsigned sectors = pick.crc.compressed_size;
2042
2043                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
2044                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
2045                                                   &c->bio_read_split),
2046                                  orig->opts);
2047
2048                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
2049                 rbio->bounce    = true;
2050                 rbio->split     = true;
2051         } else if (flags & BCH_READ_MUST_CLONE) {
2052                 /*
2053                  * We have to clone if there were any splits, because of error
2054                  * reporting: if a split errored and retrying didn't work, then
2055                  * when it reports the error to its parent (us) we can't tell
2056                  * whether the error was from our part of the bio (and we should
2057                  * retry) or from the whole bio (in which case we don't want to
2058                  * retry and lose the error)
2059                  */
2060                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
2061                                                 &c->bio_read_split),
2062                                  orig->opts);
2063                 rbio->bio.bi_iter = iter;
2064                 rbio->split     = true;
2065         } else {
2066 noclone:
2067                 rbio = orig;
2068                 rbio->bio.bi_iter = iter;
2069                 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
2070         }
2071
2072         EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
2073
2074         rbio->c                 = c;
2075         rbio->submit_time       = local_clock();
2076         if (rbio->split)
2077                 rbio->parent    = orig;
2078         else
2079                 rbio->end_io    = orig->bio.bi_end_io;
2080         rbio->bvec_iter         = iter;
2081         rbio->offset_into_extent = offset_into_extent;
2082         rbio->flags             = flags;
2083         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
2084         rbio->narrow_crcs       = narrow_crcs;
2085         rbio->hole              = 0;
2086         rbio->retry             = 0;
2087         rbio->context           = 0;
2088         /* XXX: only initialize this if needed */
2089         rbio->devs_have         = bch2_bkey_devs(k);
2090         rbio->pick              = pick;
2091         rbio->pos               = pos;
2092         rbio->version           = k.k->version;
2093         rbio->promote           = promote;
2094         INIT_WORK(&rbio->work, NULL);
2095
2096         rbio->bio.bi_opf        = orig->bio.bi_opf;
2097         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
2098         rbio->bio.bi_end_io     = bch2_read_endio;
2099
2100         if (rbio->bounce)
2101                 trace_read_bounce(&rbio->bio);
2102
2103         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
2104
2105         rcu_read_lock();
2106         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
2107         rcu_read_unlock();
2108
2109         if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
2110                 bio_inc_remaining(&orig->bio);
2111                 trace_read_split(&orig->bio);
2112         }
2113
2114         if (!rbio->pick.idx) {
2115                 if (!rbio->have_ioref) {
2116                         __bcache_io_error(c, "no device to read from");
2117                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2118                         goto out;
2119                 }
2120
2121                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
2122                              bio_sectors(&rbio->bio));
2123                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
2124
2125                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2126                         submit_bio(&rbio->bio);
2127                 else
2128                         submit_bio_wait(&rbio->bio);
2129         } else {
2130                 /* Attempting reconstruct read: */
2131                 if (bch2_ec_read_extent(c, rbio)) {
2132                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2133                         goto out;
2134                 }
2135
2136                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2137                         bio_endio(&rbio->bio);
2138         }
2139 out:
2140         if (likely(!(flags & BCH_READ_IN_RETRY))) {
2141                 return 0;
2142         } else {
2143                 int ret;
2144
2145                 rbio->context = RBIO_CONTEXT_UNBOUND;
2146                 bch2_read_endio(&rbio->bio);
2147
2148                 ret = rbio->retry;
2149                 rbio = bch2_rbio_free(rbio);
2150
2151                 if (ret == READ_RETRY_AVOID) {
2152                         bch2_mark_io_failure(failed, &pick);
2153                         ret = READ_RETRY;
2154                 }
2155
2156                 return ret;
2157         }
2158
2159 err:
2160         if (flags & BCH_READ_IN_RETRY)
2161                 return READ_ERR;
2162
2163         orig->bio.bi_status = BLK_STS_IOERR;
2164         goto out_read_done;
2165
2166 hole:
2167         /*
2168          * This won't normally happen in the BCH_READ_NODECODE
2169          * (bch2_move_extent()) path, but if we retry and the extent we wanted
2170          * to read no longer exists, we have to signal that:
2171          */
2172         if (flags & BCH_READ_NODECODE)
2173                 orig->hole = true;
2174
2175         zero_fill_bio_iter(&orig->bio, iter);
2176 out_read_done:
2177         if (flags & BCH_READ_LAST_FRAGMENT)
2178                 bch2_rbio_done(orig);
2179         return 0;
2180 }
2181
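     /*
      * Top level read path: walk the extents btree starting at the request's
      * sector, resolving indirect extents as we go, and issue a read for each
      * extent covering part of the request, marking the final one
      * BCH_READ_LAST_FRAGMENT. -EINTR (a transaction restart) retries from
      * the top.
      */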
2182 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
2183 {
2184         struct btree_trans trans;
2185         struct btree_iter *iter;
2186         struct bkey_on_stack sk;
2187         struct bkey_s_c k;
2188         unsigned flags = BCH_READ_RETRY_IF_STALE|
2189                 BCH_READ_MAY_PROMOTE|
2190                 BCH_READ_USER_MAPPED;
2191         int ret;
2192
2193         BUG_ON(rbio->_state);
2194         BUG_ON(flags & BCH_READ_NODECODE);
2195         BUG_ON(flags & BCH_READ_IN_RETRY);
2196
2197         rbio->c = c;
2198         rbio->start_time = local_clock();
2199
2200         bkey_on_stack_init(&sk);
2201         bch2_trans_init(&trans, c, 0, 0);
2202 retry:
2203         bch2_trans_begin(&trans);
2204
2205         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
2206                                    POS(inode, rbio->bio.bi_iter.bi_sector),
2207                                    BTREE_ITER_SLOTS);
2208         while (1) {
2209                 unsigned bytes, sectors, offset_into_extent;
2210
2211                 bch2_btree_iter_set_pos(iter,
2212                                 POS(inode, rbio->bio.bi_iter.bi_sector));
2213
2214                 k = bch2_btree_iter_peek_slot(iter);
2215                 ret = bkey_err(k);
2216                 if (ret)
2217                         goto err;
2218
2219                 offset_into_extent = iter->pos.offset -
2220                         bkey_start_offset(k.k);
2221                 sectors = k.k->size - offset_into_extent;
2222
2223                 bkey_on_stack_realloc(&sk, c, k.k->u64s);
2224                 bkey_reassemble(sk.k, k);
2225                 k = bkey_i_to_s_c(sk.k);
2226
2227                 ret = bch2_read_indirect_extent(&trans,
2228                                         &offset_into_extent, sk.k);
2229                 if (ret)
2230                         goto err;
2231
2232                 /*
2233                  * With indirect extents, the amount of data to read is the min
2234                  * of the original extent and the indirect extent:
2235                  */
2236                 sectors = min(sectors, k.k->size - offset_into_extent);
2237
2238                 /*
2239                  * Unlock the iterator while the btree node's lock is still in
2240                  * cache, before doing the IO:
2241                  */
2242                 bch2_trans_unlock(&trans);
2243
2244                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
2245                 swap(rbio->bio.bi_iter.bi_size, bytes);
2246
2247                 if (rbio->bio.bi_iter.bi_size == bytes)
2248                         flags |= BCH_READ_LAST_FRAGMENT;
2249
2250                 bch2_read_extent(c, rbio, k, offset_into_extent, flags);
2251
2252                 if (flags & BCH_READ_LAST_FRAGMENT)
2253                         break;
2254
2255                 swap(rbio->bio.bi_iter.bi_size, bytes);
2256                 bio_advance(&rbio->bio, bytes);
2257         }
2258 out:
2259         bch2_trans_exit(&trans);
2260         bkey_on_stack_exit(&sk, c);
2261         return;
2262 err:
2263         if (ret == -EINTR)
2264                 goto retry;
2265
2266         bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
2267         bch2_rbio_done(rbio);
2268         goto out;
2269 }
2270
2271 void bch2_fs_io_exit(struct bch_fs *c)
2272 {
2273         if (c->promote_table.tbl)
2274                 rhashtable_destroy(&c->promote_table);
2275         mempool_exit(&c->bio_bounce_pages);
2276         bioset_exit(&c->bio_write);
2277         bioset_exit(&c->bio_read_split);
2278         bioset_exit(&c->bio_read);
2279 }
2280
2281 int bch2_fs_io_init(struct bch_fs *c)
2282 {
2283         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
2284                         BIOSET_NEED_BVECS) ||
2285             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
2286                         BIOSET_NEED_BVECS) ||
2287             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
2288                         BIOSET_NEED_BVECS) ||
2289             mempool_init_page_pool(&c->bio_bounce_pages,
2290                                    max_t(unsigned,
2291                                          c->opts.btree_node_size,
2292                                          c->sb.encoded_extent_max) /
2293                                    PAGE_SECTORS, 0) ||
2294             rhashtable_init(&c->promote_table, &bch_promote_params))
2295                 return -ENOMEM;
2296
2297         return 0;
2298 }