1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "alloc_background.h"
11 #include "alloc_foreground.h"
12 #include "bkey_buf.h"
13 #include "bset.h"
14 #include "btree_update.h"
15 #include "buckets.h"
16 #include "checksum.h"
17 #include "compress.h"
18 #include "clock.h"
19 #include "debug.h"
20 #include "disk_groups.h"
21 #include "ec.h"
22 #include "error.h"
23 #include "extent_update.h"
24 #include "inode.h"
25 #include "io.h"
26 #include "journal.h"
27 #include "keylist.h"
28 #include "move.h"
29 #include "rebalance.h"
30 #include "super.h"
31 #include "super-io.h"
32
33 #include <linux/blkdev.h>
34 #include <linux/random.h>
35 #include <linux/sched/mm.h>
36
37 #include <trace/events/bcachefs.h>
38
39 const char *bch2_blk_status_to_str(blk_status_t status)
40 {
41         if (status == BLK_STS_REMOVED)
42                 return "device removed";
43         return blk_status_to_str(status);
44 }
45
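/*
 * Decide whether reads targeting @target should back off: sum each member
 * device's "congested" counter (decayed by how long ago it was last bumped)
 * and compare against a random value in [0, nr_devs * CONGESTED_MAX), so the
 * chance of returning true scales with how congested the target looks.
 */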
46 static bool bch2_target_congested(struct bch_fs *c, u16 target)
47 {
48         const struct bch_devs_mask *devs;
49         unsigned d, nr = 0, total = 0;
50         u64 now = local_clock(), last;
51         s64 congested;
52         struct bch_dev *ca;
53
54         if (!target)
55                 return false;
56
57         rcu_read_lock();
58         devs = bch2_target_to_mask(c, target) ?:
59                 &c->rw_devs[BCH_DATA_user];
60
61         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
62                 ca = rcu_dereference(c->devs[d]);
63                 if (!ca)
64                         continue;
65
66                 congested = atomic_read(&ca->congested);
67                 last = READ_ONCE(ca->congested_last);
68                 if (time_after64(now, last))
69                         congested -= (now - last) >> 12;
70
71                 total += max(congested, 0LL);
72                 nr++;
73         }
74         rcu_read_unlock();
75
76         return bch2_rand_range(nr * CONGESTED_MAX) < total;
77 }
78
79 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
80                                        u64 now, int rw)
81 {
82         u64 latency_capable =
83                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
84         /* ideally we'd be taking into account the device's variance here: */
85         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
86         s64 latency_over = io_latency - latency_threshold;
87
88         if (latency_threshold && latency_over > 0) {
89                 /*
90                  * bump up congested by approximately latency_over * 4 /
91                  * latency_threshold - we don't need much accuracy here so don't
92                  * bother with the divide:
93                  */
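                /*
                 * Worked example (illustrative numbers, in local_clock()
                 * units): latency_threshold = 4096 and latency_over = 2048
                 * gives a shift of ilog2(4096) - 2 = 10, so we add
                 * 2048 >> 10 = 2 - i.e. roughly latency_over * 4 /
                 * latency_threshold.
                 */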
94                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
95                         atomic_add(latency_over >>
96                                    max_t(int, ilog2(latency_threshold) - 2, 0),
97                                    &ca->congested);
98
99                 ca->congested_last = now;
100         } else if (atomic_read(&ca->congested) > 0) {
101                 atomic_dec(&ca->congested);
102         }
103 }
104
105 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
106 {
107         atomic64_t *latency = &ca->cur_latency[rw];
108         u64 now = local_clock();
109         u64 io_latency = time_after64(now, submit_time)
110                 ? now - submit_time
111                 : 0;
112         u64 old, new, v = atomic64_read(latency);
113
114         do {
115                 old = v;
116
117                 /*
118                  * If the io latency was reasonably close to the current
119                  * latency, skip doing the update and atomic operation - most of
120                  * the time:
121                  */
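                /*
                 * now & ~(~0 << 5) is just the low 5 bits of the clock, so
                 * even when the sample is close we still fall through and
                 * fold it into the EWMA roughly one time in 32.
                 */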
122                 if (abs((int) (old - io_latency)) < (old >> 1) &&
123                     now & ~(~0 << 5))
124                         break;
125
126                 new = ewma_add(old, io_latency, 5);
127         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
128
129         bch2_congested_acct(ca, io_latency, now, rw);
130
131         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
132 }
133
134 /* Allocate, free from mempool: */
135
136 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
137 {
138         struct bvec_iter_all iter;
139         struct bio_vec *bv;
140
141         bio_for_each_segment_all(bv, bio, iter)
142                 if (bv->bv_page != ZERO_PAGE(0))
143                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
144         bio->bi_vcnt = 0;
145 }
146
147 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
148 {
149         struct page *page;
150
151         if (likely(!*using_mempool)) {
152                 page = alloc_page(GFP_NOIO);
153                 if (unlikely(!page)) {
154                         mutex_lock(&c->bio_bounce_pages_lock);
155                         *using_mempool = true;
156                         goto pool_alloc;
157
158                 }
159         } else {
160 pool_alloc:
161                 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
162         }
163
164         return page;
165 }
166
167 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
168                                size_t size)
169 {
170         bool using_mempool = false;
171
172         while (size) {
173                 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
174                 unsigned len = min_t(size_t, PAGE_SIZE, size);
175
176                 BUG_ON(!bio_add_page(bio, page, len, 0));
177                 size -= len;
178         }
179
180         if (using_mempool)
181                 mutex_unlock(&c->bio_bounce_pages_lock);
182 }
183
184 /* Extent update path: */
185
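/*
 * Walk the existing extents overlapped by @new and report back to the caller:
 *
 *  - i_sectors_delta: change in the inode's allocated sector count
 *  - disk_sectors_delta: change in on-disk sector usage, scaled by the number
 *    of pointers/replicas in the new and old keys
 *  - maybe_extending: cleared if data already exists at or past new's end pos,
 *    in which case the write cannot be extending i_size
 *  - should_check_enospc: set if the new key has more replicas than what it
 *    overwrites, or if the old data was compressed and the new data isn't
 */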
186 int bch2_sum_sector_overwrites(struct btree_trans *trans,
187                                struct btree_iter *extent_iter,
188                                struct bkey_i *new,
189                                bool *maybe_extending,
190                                bool *should_check_enospc,
191                                s64 *i_sectors_delta,
192                                s64 *disk_sectors_delta)
193 {
194         struct bch_fs *c = trans->c;
195         struct btree_iter *iter;
196         struct bkey_s_c old;
197         unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
198         bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
199         int ret = 0;
200
201         *maybe_extending        = true;
202         *should_check_enospc    = false;
203         *i_sectors_delta        = 0;
204         *disk_sectors_delta     = 0;
205
206         iter = bch2_trans_copy_iter(trans, extent_iter);
207
208         for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
209                 s64 sectors = min(new->k.p.offset, old.k->p.offset) -
210                         max(bkey_start_offset(&new->k),
211                             bkey_start_offset(old.k));
212
213                 *i_sectors_delta += sectors *
214                         (bkey_extent_is_allocation(&new->k) -
215                          bkey_extent_is_allocation(old.k));
216
217                 *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
218                 *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
219                         ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
220                         : 0;
221
222                 if (!*should_check_enospc &&
223                     (new_replicas > bch2_bkey_replicas(c, old) ||
224                      (!new_compressed && bch2_bkey_sectors_compressed(old))))
225                         *should_check_enospc = true;
226
227                 if (bkey_cmp(old.k->p, new->k.p) >= 0) {
228                         /*
229                          * Check if there's already data above where we're
230                          * going to be writing to - this means we're definitely
231                          * not extending the file:
232                          *
233                          * Note that it's not sufficient to check if there's
234                          * data up to the sector offset we're going to be
235                          * writing to, because i_size could be up to one block
236                          * less:
237                          */
238                         if (!bkey_cmp(old.k->p, new->k.p))
239                                 old = bch2_btree_iter_next(iter);
240
241                         if (old.k && !bkey_err(old) &&
242                             old.k->p.inode == extent_iter->pos.inode &&
243                             bkey_extent_is_data(old.k))
244                                 *maybe_extending = false;
245
246                         break;
247                 }
248         }
249
250         bch2_trans_iter_put(trans, iter);
251         return ret;
252 }
253
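/*
 * Insert one extent as a single transaction: trim @k to what can be committed
 * atomically, account the overwrite with bch2_sum_sector_overwrites(), top up
 * the disk reservation if needed, update the inode's bi_size/bi_sectors in the
 * same commit, then commit the new extent.
 */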
254 int bch2_extent_update(struct btree_trans *trans,
255                        struct btree_iter *iter,
256                        struct bkey_i *k,
257                        struct disk_reservation *disk_res,
258                        u64 *journal_seq,
259                        u64 new_i_size,
260                        s64 *i_sectors_delta_total)
261 {
262         /* this must live until after bch2_trans_commit(): */
263         struct bkey_inode_buf inode_p;
264         bool extending = false, should_check_enospc;
265         s64 i_sectors_delta = 0, disk_sectors_delta = 0;
266         int ret;
267
268         ret = bch2_extent_trim_atomic(k, iter);
269         if (ret)
270                 return ret;
271
272         ret = bch2_sum_sector_overwrites(trans, iter, k,
273                         &extending,
274                         &should_check_enospc,
275                         &i_sectors_delta,
276                         &disk_sectors_delta);
277         if (ret)
278                 return ret;
279
280         if (disk_res &&
281             disk_sectors_delta > (s64) disk_res->sectors) {
282                 ret = bch2_disk_reservation_add(trans->c, disk_res,
283                                         disk_sectors_delta - disk_res->sectors,
284                                         !should_check_enospc
285                                         ? BCH_DISK_RESERVATION_NOFAIL : 0);
286                 if (ret)
287                         return ret;
288         }
289
290         new_i_size = extending
291                 ? min(k->k.p.offset << 9, new_i_size)
292                 : 0;
293
294         if (i_sectors_delta || new_i_size) {
295                 struct btree_iter *inode_iter;
296                 struct bch_inode_unpacked inode_u;
297
298                 inode_iter = bch2_inode_peek(trans, &inode_u,
299                                 k->k.p.inode, BTREE_ITER_INTENT);
300                 if (IS_ERR(inode_iter))
301                         return PTR_ERR(inode_iter);
302
303                 /*
304                  * XXX:
305                  * writeback can race a bit with truncate, because truncate
306                  * first updates the inode then truncates the pagecache. This is
307                  * ugly, but lets us preserve the invariant that the in memory
308                  * i_size is always >= the on disk i_size.
309                  *
310                 BUG_ON(new_i_size > inode_u.bi_size &&
311                        (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
312                  */
313                 BUG_ON(new_i_size > inode_u.bi_size && !extending);
314
315                 if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
316                     new_i_size > inode_u.bi_size)
317                         inode_u.bi_size = new_i_size;
318                 else
319                         new_i_size = 0;
320
321                 inode_u.bi_sectors += i_sectors_delta;
322
323                 if (i_sectors_delta || new_i_size) {
324                         bch2_inode_pack(trans->c, &inode_p, &inode_u);
325
326                         inode_p.inode.k.p.snapshot = iter->snapshot;
327
328                         bch2_trans_update(trans, inode_iter,
329                                           &inode_p.inode.k_i, 0);
330                 }
331
332                 bch2_trans_iter_put(trans, inode_iter);
333         }
334
335         bch2_trans_update(trans, iter, k, 0);
336
337         ret = bch2_trans_commit(trans, disk_res, journal_seq,
338                                 BTREE_INSERT_NOCHECK_RW|
339                                 BTREE_INSERT_NOFAIL);
340         if (ret)
341                 return ret;
342
343         if (i_sectors_delta_total)
344                 *i_sectors_delta_total += i_sectors_delta;
345         return 0;
346 }
347
348 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
349                    struct bpos end, u64 *journal_seq,
350                    s64 *i_sectors_delta)
351 {
352         struct bch_fs *c        = trans->c;
353         unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
354         struct bkey_s_c k;
355         int ret = 0, ret2 = 0;
356
357         while ((k = bch2_btree_iter_peek(iter)).k &&
358                bkey_cmp(iter->pos, end) < 0) {
359                 struct disk_reservation disk_res =
360                         bch2_disk_reservation_init(c, 0);
361                 struct bkey_i delete;
362
363                 bch2_trans_begin(trans);
364
365                 ret = bkey_err(k);
366                 if (ret)
367                         goto btree_err;
368
369                 bkey_init(&delete.k);
370                 delete.k.p = iter->pos;
371
372                 /* create the biggest key we can */
373                 bch2_key_resize(&delete.k, max_sectors);
374                 bch2_cut_back(end, &delete);
375
376                 ret = bch2_extent_update(trans, iter, &delete,
377                                 &disk_res, journal_seq,
378                                 0, i_sectors_delta);
379                 bch2_disk_reservation_put(c, &disk_res);
380 btree_err:
381                 if (ret == -EINTR) {
382                         ret2 = ret;
383                         ret = 0;
384                 }
385                 if (ret)
386                         break;
387         }
388
389         if (bkey_cmp(iter->pos, end) > 0) {
390                 bch2_btree_iter_set_pos(iter, end);
391                 ret = bch2_btree_iter_traverse(iter);
392         }
393
394         return ret ?: ret2;
395 }
396
397 int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
398                 u64 *journal_seq, s64 *i_sectors_delta)
399 {
400         struct btree_trans trans;
401         struct btree_iter *iter;
402         int ret = 0;
403
404         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
405         iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
406                                    POS(inum, start),
407                                    BTREE_ITER_INTENT);
408
409         ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
410                              journal_seq, i_sectors_delta);
411
412         bch2_trans_iter_put(&trans, iter);
413         bch2_trans_exit(&trans);
414
415         if (ret == -EINTR)
416                 ret = 0;
417
418         return ret;
419 }
420
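/*
 * Default op->index_update_fn for data writes: repeatedly take the front key
 * off op->insert_keys, clip it to the iterator position and hand it to
 * bch2_extent_update(); -EINTR only means the transaction restarted, so the
 * same key is retried.
 */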
421 int bch2_write_index_default(struct bch_write_op *op)
422 {
423         struct bch_fs *c = op->c;
424         struct bkey_buf sk;
425         struct keylist *keys = &op->insert_keys;
426         struct bkey_i *k = bch2_keylist_front(keys);
427         struct btree_trans trans;
428         struct btree_iter *iter;
429         int ret;
430
431         bch2_bkey_buf_init(&sk);
432         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
433
434         iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
435                                    bkey_start_pos(&k->k),
436                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
437
438         do {
439                 bch2_trans_begin(&trans);
440
441                 k = bch2_keylist_front(keys);
442
443                 k->k.p.snapshot = iter->snapshot;
444
445                 bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
446                 bkey_copy(sk.k, k);
447                 bch2_cut_front(iter->pos, sk.k);
448
449                 ret = bch2_extent_update(&trans, iter, sk.k,
450                                          &op->res, op_journal_seq(op),
451                                          op->new_i_size, &op->i_sectors_delta);
452                 if (ret == -EINTR)
453                         continue;
454                 if (ret)
455                         break;
456
457                 if (bkey_cmp(iter->pos, k->k.p) >= 0)
458                         bch2_keylist_pop_front(keys);
459         } while (!bch2_keylist_empty(keys));
460
461         bch2_trans_iter_put(&trans, iter);
462         bch2_trans_exit(&trans);
463         bch2_bkey_buf_exit(&sk, c);
464
465         return ret;
466 }
467
468 /* Writes */
469
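/*
 * Submit one bio per pointer in @k: every pointer but the last gets a clone
 * of @wbio, the last replica reuses @wbio itself, and a missing ioref (device
 * going away) completes the bio immediately with BLK_STS_REMOVED.
 */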
470 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
471                                enum bch_data_type type,
472                                const struct bkey_i *k)
473 {
474         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
475         const struct bch_extent_ptr *ptr;
476         struct bch_write_bio *n;
477         struct bch_dev *ca;
478
479         BUG_ON(c->opts.nochanges);
480
481         bkey_for_each_ptr(ptrs, ptr) {
482                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
483                        !c->devs[ptr->dev]);
484
485                 ca = bch_dev_bkey_exists(c, ptr->dev);
486
487                 if (to_entry(ptr + 1) < ptrs.end) {
488                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
489                                                    &ca->replica_set));
490
491                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
492                         n->bio.bi_private       = wbio->bio.bi_private;
493                         n->parent               = wbio;
494                         n->split                = true;
495                         n->bounce               = false;
496                         n->put_bio              = true;
497                         n->bio.bi_opf           = wbio->bio.bi_opf;
498                         bio_inc_remaining(&wbio->bio);
499                 } else {
500                         n = wbio;
501                         n->split                = false;
502                 }
503
504                 n->c                    = c;
505                 n->dev                  = ptr->dev;
506                 n->have_ioref           = bch2_dev_get_ioref(ca,
507                                         type == BCH_DATA_btree ? READ : WRITE);
508                 n->submit_time          = local_clock();
509                 n->bio.bi_iter.bi_sector = ptr->offset;
510
511                 if (likely(n->have_ioref)) {
512                         this_cpu_add(ca->io_done->sectors[WRITE][type],
513                                      bio_sectors(&n->bio));
514
515                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
516                         submit_bio(&n->bio);
517                 } else {
518                         n->bio.bi_status        = BLK_STS_REMOVED;
519                         bio_endio(&n->bio);
520                 }
521         }
522 }
523
524 static void __bch2_write(struct closure *);
525
526 static void bch2_write_done(struct closure *cl)
527 {
528         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
529         struct bch_fs *c = op->c;
530
531         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
532                 op->error = bch2_journal_error(&c->journal);
533
534         bch2_disk_reservation_put(c, &op->res);
535         percpu_ref_put(&c->writes);
536         bch2_keylist_free(&op->insert_keys, op->inline_keys);
537
538         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
539
540         if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
541                 up(&c->io_in_flight);
542
543         if (op->end_io) {
544                 EBUG_ON(cl->parent);
545                 closure_debug_destroy(cl);
546                 op->end_io(op);
547         } else {
548                 closure_return(cl);
549         }
550 }
551
552 /**
553  * __bch2_write_index - after a write, update index to point to new data
554  */
555 static void __bch2_write_index(struct bch_write_op *op)
556 {
557         struct bch_fs *c = op->c;
558         struct keylist *keys = &op->insert_keys;
559         struct bch_extent_ptr *ptr;
560         struct bkey_i *src, *dst = keys->keys, *n, *k;
561         unsigned dev;
562         int ret;
563
564         for (src = keys->keys; src != keys->top; src = n) {
565                 n = bkey_next(src);
566
567                 if (bkey_extent_is_direct_data(&src->k)) {
568                         bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
569                                             test_bit(ptr->dev, op->failed.d));
570
571                         if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
572                                 ret = -EIO;
573                                 goto err;
574                         }
575                 }
576
577                 if (dst != src)
578                         memmove_u64s_down(dst, src, src->u64s);
579                 dst = bkey_next(dst);
580         }
581
582         keys->top = dst;
583
584         /*
585          * probably not the ideal place to hook this in, but I don't
586          * particularly want to plumb io_opts all the way through the btree
587          * update stack right now
588          */
589         for_each_keylist_key(keys, k) {
590                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
591
592                 if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
593                         bch2_check_set_feature(op->c, BCH_FEATURE_incompressible);
594
595         }
596
597         if (!bch2_keylist_empty(keys)) {
598                 u64 sectors_start = keylist_sectors(keys);
599                 int ret = op->index_update_fn(op);
600
601                 BUG_ON(ret == -EINTR);
602                 BUG_ON(keylist_sectors(keys) && !ret);
603
604                 op->written += sectors_start - keylist_sectors(keys);
605
606                 if (ret) {
607                         bch_err_inum_ratelimited(c, op->pos.inode,
608                                 "write error %i from btree update", ret);
609                         op->error = ret;
610                 }
611         }
612 out:
613         /* If a bucket wasn't written, we can't erasure code it: */
614         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
615                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
616
617         bch2_open_buckets_put(c, &op->open_buckets);
618         return;
619 err:
620         keys->top = keys->keys;
621         op->error = ret;
622         goto out;
623 }
624
625 static void bch2_write_index(struct closure *cl)
626 {
627         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
628         struct bch_fs *c = op->c;
629
630         __bch2_write_index(op);
631
632         if (!(op->flags & BCH_WRITE_DONE)) {
633                 continue_at(cl, __bch2_write, index_update_wq(op));
634         } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
635                 bch2_journal_flush_seq_async(&c->journal,
636                                              *op_journal_seq(op),
637                                              cl);
638                 continue_at(cl, bch2_write_done, index_update_wq(op));
639         } else {
640                 continue_at_nobarrier(cl, bch2_write_done, NULL);
641         }
642 }
643
644 static void bch2_write_endio(struct bio *bio)
645 {
646         struct closure *cl              = bio->bi_private;
647         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
648         struct bch_write_bio *wbio      = to_wbio(bio);
649         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
650         struct bch_fs *c                = wbio->c;
651         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
652
653         if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
654                                     op->pos.inode,
655                                     op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
656                                     "data write error: %s",
657                                bch2_blk_status_to_str(bio->bi_status)))
658                 set_bit(wbio->dev, op->failed.d);
659
660         if (wbio->have_ioref) {
661                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
662                 percpu_ref_put(&ca->io_ref);
663         }
664
665         if (wbio->bounce)
666                 bch2_bio_free_pages_pool(c, bio);
667
668         if (wbio->put_bio)
669                 bio_put(bio);
670
671         if (parent)
672                 bio_endio(&parent->bio);
673         else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
674                 closure_put(cl);
675         else
676                 continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
677 }
678
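/*
 * Append an extent key for the data just written to op->insert_keys: one
 * pointer per open bucket in the write point, with each pointer's offset
 * adjusted for how much of that bucket has already been consumed.
 */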
679 static void init_append_extent(struct bch_write_op *op,
680                                struct write_point *wp,
681                                struct bversion version,
682                                struct bch_extent_crc_unpacked crc)
683 {
684         struct bch_fs *c = op->c;
685         struct bkey_i_extent *e;
686         struct open_bucket *ob;
687         unsigned i;
688
689         BUG_ON(crc.compressed_size > wp->sectors_free);
690         wp->sectors_free -= crc.compressed_size;
691         op->pos.offset += crc.uncompressed_size;
692
693         e = bkey_extent_init(op->insert_keys.top);
694         e->k.p          = op->pos;
695         e->k.size       = crc.uncompressed_size;
696         e->k.version    = version;
697
698         if (crc.csum_type ||
699             crc.compression_type ||
700             crc.nonce)
701                 bch2_extent_crc_append(&e->k_i, crc);
702
703         open_bucket_for_each(c, &wp->ptrs, ob, i) {
704                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
705                 union bch_extent_entry *end =
706                         bkey_val_end(bkey_i_to_s(&e->k_i));
707
708                 end->ptr = ob->ptr;
709                 end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
710                 end->ptr.cached = !ca->mi.durability ||
711                         (op->flags & BCH_WRITE_CACHED) != 0;
712                 end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
713
714                 e->k.u64s++;
715
716                 BUG_ON(crc.compressed_size > ob->sectors_free);
717                 ob->sectors_free -= crc.compressed_size;
718         }
719
720         bch2_keylist_push(&op->insert_keys);
721 }
722
723 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
724                                         struct write_point *wp,
725                                         struct bio *src,
726                                         bool *page_alloc_failed,
727                                         void *buf)
728 {
729         struct bch_write_bio *wbio;
730         struct bio *bio;
731         unsigned output_available =
732                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
733         unsigned pages = DIV_ROUND_UP(output_available +
734                                       (buf
735                                        ? ((unsigned long) buf & (PAGE_SIZE - 1))
736                                        : 0), PAGE_SIZE);
737
738         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
739         wbio                    = wbio_init(bio);
740         wbio->put_bio           = true;
741         /* copy WRITE_SYNC flag */
742         wbio->bio.bi_opf        = src->bi_opf;
743
744         if (buf) {
745                 bch2_bio_map(bio, buf, output_available);
746                 return bio;
747         }
748
749         wbio->bounce            = true;
750
751         /*
752          * We can't use mempool for more than c->sb.encoded_extent_max
753          * worth of pages, but we'd like to allocate more if we can:
754          */
755         bch2_bio_alloc_pages_pool(c, bio,
756                                   min_t(unsigned, output_available,
757                                         c->sb.encoded_extent_max << 9));
758
759         if (bio->bi_iter.bi_size < output_available)
760                 *page_alloc_failed =
761                         bch2_bio_alloc_pages(bio,
762                                              output_available -
763                                              bio->bi_iter.bi_size,
764                                              GFP_NOFS) != 0;
765
766         return bio;
767 }
768
769 static int bch2_write_rechecksum(struct bch_fs *c,
770                                  struct bch_write_op *op,
771                                  unsigned new_csum_type)
772 {
773         struct bio *bio = &op->wbio.bio;
774         struct bch_extent_crc_unpacked new_crc;
775         int ret;
776
777         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
778
779         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
780             bch2_csum_type_is_encryption(new_csum_type))
781                 new_csum_type = op->crc.csum_type;
782
783         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
784                                   NULL, &new_crc,
785                                   op->crc.offset, op->crc.live_size,
786                                   new_csum_type);
787         if (ret)
788                 return ret;
789
790         bio_advance(bio, op->crc.offset << 9);
791         bio->bi_iter.bi_size = op->crc.live_size << 9;
792         op->crc = new_crc;
793         return 0;
794 }
795
796 static int bch2_write_decrypt(struct bch_write_op *op)
797 {
798         struct bch_fs *c = op->c;
799         struct nonce nonce = extent_nonce(op->version, op->crc);
800         struct bch_csum csum;
801
802         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
803                 return 0;
804
805         /*
806          * If we need to decrypt data in the write path, we'll no longer be able
807          * to verify the existing checksum (poly1305 mac, in this case) after
808          * it's decrypted - this is the last point we'll be able to reverify the
809          * checksum:
810          */
811         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
812         if (bch2_crc_cmp(op->crc.csum, csum))
813                 return -EIO;
814
815         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
816         op->crc.csum_type = 0;
817         op->crc.csum = (struct bch_csum) { 0, 0 };
818         return 0;
819 }
820
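/*
 * For BCH_WRITE_DATA_ENCODED writes (data already checksummed and possibly
 * compressed, e.g. from the move path), decide what to do with the existing
 * encoding: write it out unchanged (DO_WRITE), or decompress/decrypt/
 * rechecksum so the normal write path can re-encode it (OK); checksum
 * mismatches and decompression failures are reported via the other two codes.
 */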
821 static enum prep_encoded_ret {
822         PREP_ENCODED_OK,
823         PREP_ENCODED_ERR,
824         PREP_ENCODED_CHECKSUM_ERR,
825         PREP_ENCODED_DO_WRITE,
826 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
827 {
828         struct bch_fs *c = op->c;
829         struct bio *bio = &op->wbio.bio;
830
831         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
832                 return PREP_ENCODED_OK;
833
834         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
835
836         /* Can we just write the entire extent as is? */
837         if (op->crc.uncompressed_size == op->crc.live_size &&
838             op->crc.compressed_size <= wp->sectors_free &&
839             (op->crc.compression_type == op->compression_type ||
840              op->incompressible)) {
841                 if (!crc_is_compressed(op->crc) &&
842                     op->csum_type != op->crc.csum_type &&
843                     bch2_write_rechecksum(c, op, op->csum_type))
844                         return PREP_ENCODED_CHECKSUM_ERR;
845
846                 return PREP_ENCODED_DO_WRITE;
847         }
848
849         /*
850          * If the data is compressed and we couldn't write the entire extent as
851          * is, we have to decompress it:
852          */
853         if (crc_is_compressed(op->crc)) {
854                 struct bch_csum csum;
855
856                 if (bch2_write_decrypt(op))
857                         return PREP_ENCODED_CHECKSUM_ERR;
858
859                 /* Last point we can still verify checksum: */
860                 csum = bch2_checksum_bio(c, op->crc.csum_type,
861                                          extent_nonce(op->version, op->crc),
862                                          bio);
863                 if (bch2_crc_cmp(op->crc.csum, csum))
864                         return PREP_ENCODED_CHECKSUM_ERR;
865
866                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
867                         return PREP_ENCODED_ERR;
868         }
869
870         /*
871          * No longer have compressed data after this point - data might be
872          * encrypted:
873          */
874
875         /*
876          * If the data is checksummed and we're only writing a subset,
877          * rechecksum and adjust bio to point to currently live data:
878          */
879         if ((op->crc.live_size != op->crc.uncompressed_size ||
880              op->crc.csum_type != op->csum_type) &&
881             bch2_write_rechecksum(c, op, op->csum_type))
882                 return PREP_ENCODED_CHECKSUM_ERR;
883
884         /*
885          * If we want to compress the data, it has to be decrypted:
886          */
887         if ((op->compression_type ||
888              bch2_csum_type_is_encryption(op->crc.csum_type) !=
889              bch2_csum_type_is_encryption(op->csum_type)) &&
890             bch2_write_decrypt(op))
891                 return PREP_ENCODED_CHECKSUM_ERR;
892
893         return PREP_ENCODED_OK;
894 }
895
896 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
897                              struct bio **_dst)
898 {
899         struct bch_fs *c = op->c;
900         struct bio *src = &op->wbio.bio, *dst = src;
901         struct bvec_iter saved_iter;
902         void *ec_buf;
903         struct bpos ec_pos = op->pos;
904         unsigned total_output = 0, total_input = 0;
905         bool bounce = false;
906         bool page_alloc_failed = false;
907         int ret, more = 0;
908
909         BUG_ON(!bio_sectors(src));
910
911         ec_buf = bch2_writepoint_ec_buf(c, wp);
912
913         switch (bch2_write_prep_encoded_data(op, wp)) {
914         case PREP_ENCODED_OK:
915                 break;
916         case PREP_ENCODED_ERR:
917                 ret = -EIO;
918                 goto err;
919         case PREP_ENCODED_CHECKSUM_ERR:
920                 BUG();
921                 goto csum_err;
922         case PREP_ENCODED_DO_WRITE:
923                 /* XXX look for a bug here */
924                 if (ec_buf) {
925                         dst = bch2_write_bio_alloc(c, wp, src,
926                                                    &page_alloc_failed,
927                                                    ec_buf);
928                         bio_copy_data(dst, src);
929                         bounce = true;
930                 }
931                 init_append_extent(op, wp, op->version, op->crc);
932                 goto do_write;
933         }
934
935         if (ec_buf ||
936             op->compression_type ||
937             (op->csum_type &&
938              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
939             (bch2_csum_type_is_encryption(op->csum_type) &&
940              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
941                 dst = bch2_write_bio_alloc(c, wp, src,
942                                            &page_alloc_failed,
943                                            ec_buf);
944                 bounce = true;
945         }
946
947         saved_iter = dst->bi_iter;
948
949         do {
950                 struct bch_extent_crc_unpacked crc =
951                         (struct bch_extent_crc_unpacked) { 0 };
952                 struct bversion version = op->version;
953                 size_t dst_len, src_len;
954
955                 if (page_alloc_failed &&
956                     bio_sectors(dst) < wp->sectors_free &&
957                     bio_sectors(dst) < c->sb.encoded_extent_max)
958                         break;
959
960                 BUG_ON(op->compression_type &&
961                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
962                        bch2_csum_type_is_encryption(op->crc.csum_type));
963                 BUG_ON(op->compression_type && !bounce);
964
965                 crc.compression_type = op->incompressible
966                         ? BCH_COMPRESSION_TYPE_incompressible
967                         : op->compression_type
968                         ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
969                                             op->compression_type)
970                         : 0;
971                 if (!crc_is_compressed(crc)) {
972                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
973                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
974
975                         if (op->csum_type)
976                                 dst_len = min_t(unsigned, dst_len,
977                                                 c->sb.encoded_extent_max << 9);
978
979                         if (bounce) {
980                                 swap(dst->bi_iter.bi_size, dst_len);
981                                 bio_copy_data(dst, src);
982                                 swap(dst->bi_iter.bi_size, dst_len);
983                         }
984
985                         src_len = dst_len;
986                 }
987
988                 BUG_ON(!src_len || !dst_len);
989
990                 if (bch2_csum_type_is_encryption(op->csum_type)) {
991                         if (bversion_zero(version)) {
992                                 version.lo = atomic64_inc_return(&c->key_version);
993                         } else {
994                                 crc.nonce = op->nonce;
995                                 op->nonce += src_len >> 9;
996                         }
997                 }
998
999                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
1000                     !crc_is_compressed(crc) &&
1001                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
1002                     bch2_csum_type_is_encryption(op->csum_type)) {
1003                         /*
1004                          * Note: when we're using rechecksum(), we need to be
1005                          * checksumming @src because it has all the data our
1006                          * existing checksum covers - if we bounced (because we
1007                          * were trying to compress), @dst will only have the
1008                          * part of the data the new checksum will cover.
1009                          *
1010                          * But normally we want to be checksumming post bounce,
1011                          * because part of the reason for bouncing is so the
1012                          * data can't be modified (by userspace) while it's in
1013                          * flight.
1014                          */
1015                         if (bch2_rechecksum_bio(c, src, version, op->crc,
1016                                         &crc, &op->crc,
1017                                         src_len >> 9,
1018                                         bio_sectors(src) - (src_len >> 9),
1019                                         op->csum_type))
1020                                 goto csum_err;
1021                 } else {
1022                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
1023                             bch2_rechecksum_bio(c, src, version, op->crc,
1024                                         NULL, &op->crc,
1025                                         src_len >> 9,
1026                                         bio_sectors(src) - (src_len >> 9),
1027                                         op->crc.csum_type))
1028                                 goto csum_err;
1029
1030                         crc.compressed_size     = dst_len >> 9;
1031                         crc.uncompressed_size   = src_len >> 9;
1032                         crc.live_size           = src_len >> 9;
1033
1034                         swap(dst->bi_iter.bi_size, dst_len);
1035                         bch2_encrypt_bio(c, op->csum_type,
1036                                          extent_nonce(version, crc), dst);
1037                         crc.csum = bch2_checksum_bio(c, op->csum_type,
1038                                          extent_nonce(version, crc), dst);
1039                         crc.csum_type = op->csum_type;
1040                         swap(dst->bi_iter.bi_size, dst_len);
1041                 }
1042
1043                 init_append_extent(op, wp, version, crc);
1044
1045                 if (dst != src)
1046                         bio_advance(dst, dst_len);
1047                 bio_advance(src, src_len);
1048                 total_output    += dst_len;
1049                 total_input     += src_len;
1050         } while (dst->bi_iter.bi_size &&
1051                  src->bi_iter.bi_size &&
1052                  wp->sectors_free &&
1053                  !bch2_keylist_realloc(&op->insert_keys,
1054                                       op->inline_keys,
1055                                       ARRAY_SIZE(op->inline_keys),
1056                                       BKEY_EXTENT_U64s_MAX));
1057
1058         more = src->bi_iter.bi_size != 0;
1059
1060         dst->bi_iter = saved_iter;
1061
1062         if (dst == src && more) {
1063                 BUG_ON(total_output != total_input);
1064
1065                 dst = bio_split(src, total_input >> 9,
1066                                 GFP_NOIO, &c->bio_write);
1067                 wbio_init(dst)->put_bio = true;
1068                 /* copy WRITE_SYNC flag */
1069                 dst->bi_opf             = src->bi_opf;
1070         }
1071
1072         dst->bi_iter.bi_size = total_output;
1073 do_write:
1074         /* might have done a realloc... */
1075         bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
1076
1077         *_dst = dst;
1078         return more;
1079 csum_err:
1080         bch_err(c, "error verifying existing checksum while "
1081                 "rewriting existing data (memory corruption?)");
1082         ret = -EIO;
1083 err:
1084         if (to_wbio(dst)->bounce)
1085                 bch2_bio_free_pages_pool(c, dst);
1086         if (to_wbio(dst)->put_bio)
1087                 bio_put(dst);
1088
1089         return ret;
1090 }
1091
1092 static void __bch2_write(struct closure *cl)
1093 {
1094         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1095         struct bch_fs *c = op->c;
1096         struct write_point *wp;
1097         struct bio *bio;
1098         bool skip_put = true;
1099         unsigned nofs_flags;
1100         int ret;
1101
1102         nofs_flags = memalloc_nofs_save();
1103 again:
1104         memset(&op->failed, 0, sizeof(op->failed));
1105
1106         do {
1107                 struct bkey_i *key_to_write;
1108                 unsigned key_to_write_offset = op->insert_keys.top_p -
1109                         op->insert_keys.keys_p;
1110
1111                 /* +1 for possible cache device: */
1112                 if (op->open_buckets.nr + op->nr_replicas + 1 >
1113                     ARRAY_SIZE(op->open_buckets.v))
1114                         goto flush_io;
1115
1116                 if (bch2_keylist_realloc(&op->insert_keys,
1117                                         op->inline_keys,
1118                                         ARRAY_SIZE(op->inline_keys),
1119                                         BKEY_EXTENT_U64s_MAX))
1120                         goto flush_io;
1121
1122                 if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
1123                     percpu_ref_is_dying(&c->writes)) {
1124                         ret = -EROFS;
1125                         goto err;
1126                 }
1127
1128                 /*
1129                  * The copygc thread is now global, which means it's no longer
1130                  * freeing up space on specific disks, which means that
1131                  * allocations for specific disks may hang arbitrarily long:
1132                  */
1133                 wp = bch2_alloc_sectors_start(c,
1134                         op->target,
1135                         op->opts.erasure_code,
1136                         op->write_point,
1137                         &op->devs_have,
1138                         op->nr_replicas,
1139                         op->nr_replicas_required,
1140                         op->alloc_reserve,
1141                         op->flags,
1142                         (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
1143                                       BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
1144                 EBUG_ON(!wp);
1145
1146                 if (unlikely(IS_ERR(wp))) {
1147                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
1148                                 ret = PTR_ERR(wp);
1149                                 goto err;
1150                         }
1151
1152                         goto flush_io;
1153                 }
1154
1155                 /*
1156                  * It's possible for the allocator to fail, put us on the
1157                  * freelist waitlist, and then succeed in one of various retry
1158                  * paths: if that happens, we need to disable the skip_put
1159                  * optimization because otherwise there won't necessarily be a
1160                  * barrier before we free the bch_write_op:
1161                  */
1162                 if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
1163                         skip_put = false;
1164
1165                 bch2_open_bucket_get(c, wp, &op->open_buckets);
1166                 ret = bch2_write_extent(op, wp, &bio);
1167                 bch2_alloc_sectors_done(c, wp);
1168
1169                 if (ret < 0)
1170                         goto err;
1171
1172                 if (ret) {
1173                         skip_put = false;
1174                 } else {
1175                         /*
1176                          * for the skip_put optimization this has to be set
1177                          * before we submit the bio:
1178                          */
1179                         op->flags |= BCH_WRITE_DONE;
1180                 }
1181
1182                 bio->bi_end_io  = bch2_write_endio;
1183                 bio->bi_private = &op->cl;
1184                 bio->bi_opf |= REQ_OP_WRITE;
1185
1186                 if (!skip_put)
1187                         closure_get(bio->bi_private);
1188                 else
1189                         op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
1190
1191                 key_to_write = (void *) (op->insert_keys.keys_p +
1192                                          key_to_write_offset);
1193
1194                 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
1195                                           key_to_write);
1196         } while (ret);
1197
1198         if (!skip_put)
1199                 continue_at(cl, bch2_write_index, index_update_wq(op));
1200 out:
1201         memalloc_nofs_restore(nofs_flags);
1202         return;
1203 err:
1204         op->error = ret;
1205         op->flags |= BCH_WRITE_DONE;
1206
1207         continue_at(cl, bch2_write_index, index_update_wq(op));
1208         goto out;
1209 flush_io:
1210         /*
1211          * If the write can't all be submitted at once, we generally want to
1212          * block synchronously as that signals backpressure to the caller.
1213          *
1214          * However, if we're running out of a workqueue, we can't block here
1215          * because we'll be blocking other work items from completing:
1216          */
1217         if (current->flags & PF_WQ_WORKER) {
1218                 continue_at(cl, bch2_write_index, index_update_wq(op));
1219                 goto out;
1220         }
1221
1222         closure_sync(cl);
1223
1224         if (!bch2_keylist_empty(&op->insert_keys)) {
1225                 __bch2_write_index(op);
1226
1227                 if (op->error) {
1228                         op->flags |= BCH_WRITE_DONE;
1229                         continue_at_nobarrier(cl, bch2_write_done, NULL);
1230                         goto out;
1231                 }
1232         }
1233
1234         goto again;
1235 }
1236
1237 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
1238 {
1239         struct closure *cl = &op->cl;
1240         struct bio *bio = &op->wbio.bio;
1241         struct bvec_iter iter;
1242         struct bkey_i_inline_data *id;
1243         unsigned sectors;
1244         int ret;
1245
1246         bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
1247
1248         ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
1249                                    ARRAY_SIZE(op->inline_keys),
1250                                    BKEY_U64s + DIV_ROUND_UP(data_len, 8));
1251         if (ret) {
1252                 op->error = ret;
1253                 goto err;
1254         }
1255
1256         sectors = bio_sectors(bio);
1257         op->pos.offset += sectors;
1258
1259         id = bkey_inline_data_init(op->insert_keys.top);
1260         id->k.p         = op->pos;
1261         id->k.version   = op->version;
1262         id->k.size      = sectors;
1263
1264         iter = bio->bi_iter;
1265         iter.bi_size = data_len;
1266         memcpy_from_bio(id->v.data, bio, iter);
1267
1268         while (data_len & 7)
1269                 id->v.data[data_len++] = '\0';
1270         set_bkey_val_bytes(&id->k, data_len);
1271         bch2_keylist_push(&op->insert_keys);
1272
1273         op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
1274         op->flags |= BCH_WRITE_DONE;
1275
1276         continue_at_nobarrier(cl, bch2_write_index, NULL);
1277         return;
1278 err:
1279         bch2_write_done(&op->cl);
1280 }
1281
1282 /**
1283  * bch2_write - handle a write to a cache device or flash only volume
1284  *
1285  * This is the starting point for any data to end up in a cache device; it could
1286  * be from a normal write, or a writeback write, or a write to a flash only
1287  * volume - it's also used by the moving garbage collector to compact data in
1288  * mostly empty buckets.
1289  *
1290  * It first writes the data to the cache, creating a list of keys to be inserted
1291  * (if the data won't fit in a single open bucket, there will be multiple keys);
1292  * after the data is written the keys are handed to the journal, and after they have been
1293  * added to the next journal write they're inserted into the btree.
1294  *
1295  * If op->discard is true, instead of inserting the data it invalidates the
1296  * region of the cache represented by op->bio and op->inode.
1297  */
1298 void bch2_write(struct closure *cl)
1299 {
1300         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1301         struct bio *bio = &op->wbio.bio;
1302         struct bch_fs *c = op->c;
1303         unsigned data_len;
1304
1305         BUG_ON(!op->nr_replicas);
1306         BUG_ON(!op->write_point.v);
1307         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
1308
1309         op->start_time = local_clock();
1310         bch2_keylist_init(&op->insert_keys, op->inline_keys);
1311         wbio_init(bio)->put_bio = false;
1312
1313         if (bio_sectors(bio) & (c->opts.block_size - 1)) {
1314                 bch_err_inum_ratelimited(c, op->pos.inode,
1315                                          "misaligned write");
1316                 op->error = -EIO;
1317                 goto err;
1318         }
1319
1320         if (c->opts.nochanges ||
1321             !percpu_ref_tryget(&c->writes)) {
1322                 op->error = -EROFS;
1323                 goto err;
1324         }
1325
1326         /*
1327          * Can't ratelimit copygc - we'd deadlock:
1328          */
1329         if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
1330                 down(&c->io_in_flight);
1331
1332         bch2_increment_clock(c, bio_sectors(bio), WRITE);
1333
1334         data_len = min_t(u64, bio->bi_iter.bi_size,
1335                          op->new_i_size - (op->pos.offset << 9));
1336
1337         if (c->opts.inline_data &&
1338             data_len <= min(block_bytes(c) / 2, 1024U)) {
1339                 bch2_write_data_inline(op, data_len);
1340                 return;
1341         }
1342
1343         continue_at_nobarrier(cl, __bch2_write, NULL);
1344         return;
1345 err:
1346         bch2_disk_reservation_put(c, &op->res);
1347
1348         if (op->end_io) {
1349                 EBUG_ON(cl->parent);
1350                 closure_debug_destroy(cl);
1351                 op->end_io(op);
1352         } else {
1353                 closure_return(cl);
1354         }
1355 }
1356
1357 /* Cache promotion on read */
1358
1359 struct promote_op {
1360         struct closure          cl;
1361         struct rcu_head         rcu;
1362         u64                     start_time;
1363
1364         struct rhash_head       hash;
1365         struct bpos             pos;
1366
1367         struct migrate_write    write;
1368         struct bio_vec          bi_inline_vecs[0]; /* must be last */
1369 };
1370
1371 static const struct rhashtable_params bch_promote_params = {
1372         .head_offset    = offsetof(struct promote_op, hash),
1373         .key_offset     = offsetof(struct promote_op, pos),
1374         .key_len        = sizeof(struct bpos),
1375 };
1376
1377 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1378                                   struct bpos pos,
1379                                   struct bch_io_opts opts,
1380                                   unsigned flags)
1381 {
1382         if (!(flags & BCH_READ_MAY_PROMOTE))
1383                 return false;
1384
1385         if (!opts.promote_target)
1386                 return false;
1387
1388         if (bch2_bkey_has_target(c, k, opts.promote_target))
1389                 return false;
1390
1391         if (bch2_target_congested(c, opts.promote_target)) {
1392                 /* XXX trace this */
1393                 return false;
1394         }
1395
1396         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1397                                    bch_promote_params))
1398                 return false;
1399
1400         return true;
1401 }
1402
1403 static void promote_free(struct bch_fs *c, struct promote_op *op)
1404 {
1405         int ret;
1406
1407         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1408                                      bch_promote_params);
1409         BUG_ON(ret);
1410         percpu_ref_put(&c->writes);
1411         kfree_rcu(op, rcu);
1412 }
1413
1414 static void promote_done(struct closure *cl)
1415 {
1416         struct promote_op *op =
1417                 container_of(cl, struct promote_op, cl);
1418         struct bch_fs *c = op->write.op.c;
1419
1420         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1421                                op->start_time);
1422
1423         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1424         promote_free(c, op);
1425 }
1426
1427 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1428 {
1429         struct bch_fs *c = rbio->c;
1430         struct closure *cl = &op->cl;
1431         struct bio *bio = &op->write.op.wbio.bio;
1432
1433         trace_promote(&rbio->bio);
1434
1435         /* we now own pages: */
1436         BUG_ON(!rbio->bounce);
1437         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1438
1439         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1440                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1441         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1442
1443         bch2_migrate_read_done(&op->write, rbio);
1444
1445         closure_init(cl, NULL);
1446         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1447         closure_return_with_destructor(cl, promote_done);
1448 }
1449
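     /*
      * Allocate the promote_op plus a separate bch_read_bio that serves as the
      * bounce buffer for the read half, and insert the op into the promote
      * table; on any failure everything is unwound and NULL is returned.
      */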
1450 static struct promote_op *__promote_alloc(struct bch_fs *c,
1451                                           enum btree_id btree_id,
1452                                           struct bkey_s_c k,
1453                                           struct bpos pos,
1454                                           struct extent_ptr_decoded *pick,
1455                                           struct bch_io_opts opts,
1456                                           unsigned sectors,
1457                                           struct bch_read_bio **rbio)
1458 {
1459         struct promote_op *op = NULL;
1460         struct bio *bio;
1461         unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
1462         int ret;
1463
1464         if (!percpu_ref_tryget(&c->writes))
1465                 return NULL;
1466
1467         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
1468         if (!op)
1469                 goto err;
1470
1471         op->start_time = local_clock();
1472         op->pos = pos;
1473
1474         /*
1475          * We don't use the mempool here because extents that aren't
1476          * checksummed or compressed can be too big for the mempool:
1477          */
1478         *rbio = kzalloc(sizeof(struct bch_read_bio) +
1479                         sizeof(struct bio_vec) * pages,
1480                         GFP_NOIO);
1481         if (!*rbio)
1482                 goto err;
1483
1484         rbio_init(&(*rbio)->bio, opts);
1485         bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
1486
1487         if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
1488                                  GFP_NOIO))
1489                 goto err;
1490
1491         (*rbio)->bounce         = true;
1492         (*rbio)->split          = true;
1493         (*rbio)->kmalloc        = true;
1494
1495         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1496                                           bch_promote_params))
1497                 goto err;
1498
1499         bio = &op->write.op.wbio.bio;
1500         bio_init(bio, bio->bi_inline_vecs, pages);
1501
1502         ret = bch2_migrate_write_init(c, &op->write,
1503                         writepoint_hashed((unsigned long) current),
1504                         opts,
1505                         DATA_PROMOTE,
1506                         (struct data_opts) {
1507                                 .target         = opts.promote_target,
1508                                 .nr_replicas    = 1,
1509                         },
1510                         btree_id, k);
1511         BUG_ON(ret);
1512
1513         return op;
1514 err:
1515         if (*rbio)
1516                 bio_free_pages(&(*rbio)->bio);
1517         kfree(*rbio);
1518         *rbio = NULL;
1519         kfree(op);
1520         percpu_ref_put(&c->writes);
1521         return NULL;
1522 }
1523
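     /*
      * Decide whether to promote the whole extent or just the range being read
      * (c->promote_whole_extents), size the bounce buffer accordingly, and on
      * success force the read to bounce so the data is available for the
      * promote write.
      */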
1524 noinline
1525 static struct promote_op *promote_alloc(struct bch_fs *c,
1526                                                struct bvec_iter iter,
1527                                                struct bkey_s_c k,
1528                                                struct extent_ptr_decoded *pick,
1529                                                struct bch_io_opts opts,
1530                                                unsigned flags,
1531                                                struct bch_read_bio **rbio,
1532                                                bool *bounce,
1533                                                bool *read_full)
1534 {
1535         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1536         /* data might have to be decompressed in the write path: */
1537         unsigned sectors = promote_full
1538                 ? max(pick->crc.compressed_size, pick->crc.live_size)
1539                 : bvec_iter_sectors(iter);
1540         struct bpos pos = promote_full
1541                 ? bkey_start_pos(k.k)
1542                 : POS(k.k->p.inode, iter.bi_sector);
1543         struct promote_op *promote;
1544
1545         if (!should_promote(c, k, pos, opts, flags))
1546                 return NULL;
1547
1548         promote = __promote_alloc(c,
1549                                   k.k->type == KEY_TYPE_reflink_v
1550                                   ? BTREE_ID_reflink
1551                                   : BTREE_ID_extents,
1552                                   k, pos, pick, opts, sectors, rbio);
1553         if (!promote)
1554                 return NULL;
1555
1556         *bounce         = true;
1557         *read_full      = promote_full;
1558         return promote;
1559 }
1560
1561 /* Read */
1562
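     /*
      * Retry dispositions for reads (also the return codes of
      * __bch2_read_extent() in the synchronous BCH_READ_IN_RETRY path):
      * READ_RETRY_AVOID marks the pointer that failed so another replica is
      * picked, READ_RETRY retries as is, READ_ERR gives up.
      */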
1563 #define READ_RETRY_AVOID        1
1564 #define READ_RETRY              2
1565 #define READ_ERR                3
1566
1567 enum rbio_context {
1568         RBIO_CONTEXT_NULL,
1569         RBIO_CONTEXT_HIGHPRI,
1570         RBIO_CONTEXT_UNBOUND,
1571 };
1572
1573 static inline struct bch_read_bio *
1574 bch2_rbio_parent(struct bch_read_bio *rbio)
1575 {
1576         return rbio->split ? rbio->parent : rbio;
1577 }
1578
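     /*
      * Read completion work runs in one of the rbio_context levels above:
      * punt runs @fn directly if the rbio is already executing in a context
      * at least as permissive as the one requested, otherwise it queues the
      * work on @wq.
      */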
1579 __always_inline
1580 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1581                            enum rbio_context context,
1582                            struct workqueue_struct *wq)
1583 {
1584         if (context <= rbio->context) {
1585                 fn(&rbio->work);
1586         } else {
1587                 rbio->work.func         = fn;
1588                 rbio->context           = context;
1589                 queue_work(wq, &rbio->work);
1590         }
1591 }
1592
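     /*
      * Tear down a read bio: drop any promote op and bounce pages, and if this
      * was a split, free the clone and return the parent so the caller can
      * complete the original request.
      */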
1593 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1594 {
1595         BUG_ON(rbio->bounce && !rbio->split);
1596
1597         if (rbio->promote)
1598                 promote_free(rbio->c, rbio->promote);
1599         rbio->promote = NULL;
1600
1601         if (rbio->bounce)
1602                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1603
1604         if (rbio->split) {
1605                 struct bch_read_bio *parent = rbio->parent;
1606
1607                 if (rbio->kmalloc)
1608                         kfree(rbio);
1609                 else
1610                         bio_put(&rbio->bio);
1611
1612                 rbio = parent;
1613         }
1614
1615         return rbio;
1616 }
1617
1618 /*
1619  * Only called on a top level bch_read_bio to complete an entire read request,
1620  * not a split:
1621  */
1622 static void bch2_rbio_done(struct bch_read_bio *rbio)
1623 {
1624         if (rbio->start_time)
1625                 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1626                                        rbio->start_time);
1627         bio_endio(&rbio->bio);
1628 }
1629
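     /*
      * Retry path for BCH_READ_NODECODE reads (the data move path): re-look up
      * the extent at rbio->read_pos and, if it still matches the pointer we
      * originally picked, reissue the read; if it no longer exists, flag the
      * rbio as a hole.
      */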
1630 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1631                                      struct bvec_iter bvec_iter, u64 inode,
1632                                      struct bch_io_failures *failed,
1633                                      unsigned flags)
1634 {
1635         struct btree_trans trans;
1636         struct btree_iter *iter;
1637         struct bkey_buf sk;
1638         struct bkey_s_c k;
1639         int ret;
1640
1641         flags &= ~BCH_READ_LAST_FRAGMENT;
1642         flags |= BCH_READ_MUST_CLONE;
1643
1644         bch2_bkey_buf_init(&sk);
1645         bch2_trans_init(&trans, c, 0, 0);
1646
1647         iter = bch2_trans_get_iter(&trans, rbio->data_btree,
1648                                    rbio->read_pos, BTREE_ITER_SLOTS);
1649 retry:
1650         rbio->bio.bi_status = 0;
1651
1652         k = bch2_btree_iter_peek_slot(iter);
1653         if (bkey_err(k))
1654                 goto err;
1655
1656         bch2_bkey_buf_reassemble(&sk, c, k);
1657         k = bkey_i_to_s_c(sk.k);
1658         bch2_trans_unlock(&trans);
1659
1660         if (!bch2_bkey_matches_ptr(c, k,
1661                                    rbio->pick.ptr,
1662                                    rbio->data_pos.offset -
1663                                    rbio->pick.crc.offset)) {
1664                 /* extent we wanted to read no longer exists: */
1665                 rbio->hole = true;
1666                 goto out;
1667         }
1668
1669         ret = __bch2_read_extent(&trans, rbio, bvec_iter,
1670                                  rbio->read_pos,
1671                                  rbio->data_btree,
1672                                  k, 0, failed, flags);
1673         if (ret == READ_RETRY)
1674                 goto retry;
1675         if (ret)
1676                 goto err;
1677 out:
1678         bch2_rbio_done(rbio);
1679         bch2_trans_iter_put(&trans, iter);
1680         bch2_trans_exit(&trans);
1681         bch2_bkey_buf_exit(&sk, c);
1682         return;
1683 err:
1684         rbio->bio.bi_status = BLK_STS_IOERR;
1685         goto out;
1686 }
1687
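     /*
      * Work function for retried reads: reread with a fresh failure list,
      * avoiding the device that just failed when the disposition was
      * READ_RETRY_AVOID, and with promotion disabled for the retry.
      */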
1688 static void bch2_rbio_retry(struct work_struct *work)
1689 {
1690         struct bch_read_bio *rbio =
1691                 container_of(work, struct bch_read_bio, work);
1692         struct bch_fs *c        = rbio->c;
1693         struct bvec_iter iter   = rbio->bvec_iter;
1694         unsigned flags          = rbio->flags;
1695         u64 inode               = rbio->read_pos.inode;
1696         struct bch_io_failures failed = { .nr = 0 };
1697
1698         trace_read_retry(&rbio->bio);
1699
1700         if (rbio->retry == READ_RETRY_AVOID)
1701                 bch2_mark_io_failure(&failed, &rbio->pick);
1702
1703         rbio->bio.bi_status = 0;
1704
1705         rbio = bch2_rbio_free(rbio);
1706
1707         flags |= BCH_READ_IN_RETRY;
1708         flags &= ~BCH_READ_MAY_PROMOTE;
1709
1710         if (flags & BCH_READ_NODECODE) {
1711                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1712         } else {
1713                 flags &= ~BCH_READ_LAST_FRAGMENT;
1714                 flags |= BCH_READ_MUST_CLONE;
1715
1716                 __bch2_read(c, rbio, iter, inode, &failed, flags);
1717         }
1718 }
1719
1720 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1721                             blk_status_t error)
1722 {
1723         rbio->retry = retry;
1724
1725         if (rbio->flags & BCH_READ_IN_RETRY)
1726                 return;
1727
1728         if (retry == READ_ERR) {
1729                 rbio = bch2_rbio_free(rbio);
1730
1731                 rbio->bio.bi_status = error;
1732                 bch2_rbio_done(rbio);
1733         } else {
1734                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1735                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1736         }
1737 }
1738
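     /*
      * "Narrowing" crcs: having read and verified the full checksummed extent,
      * recompute the checksum over just the key's live range and update the
      * key, presumably so later reads of a partially overwritten extent don't
      * have to read and checksum data that's no longer live.
      */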
1739 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
1740                                    struct bch_read_bio *rbio)
1741 {
1742         struct bch_fs *c = rbio->c;
1743         u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
1744         struct bch_extent_crc_unpacked new_crc;
1745         struct btree_iter *iter = NULL;
1746         struct bkey_i *new;
1747         struct bkey_s_c k;
1748         int ret = 0;
1749
1750         if (crc_is_compressed(rbio->pick.crc))
1751                 return 0;
1752
1753         iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
1754                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
1755         k = bch2_btree_iter_peek_slot(iter);
1756         if ((ret = bkey_err(k)))
1757                 goto out;
1758
1759         if (bversion_cmp(k.k->version, rbio->version) ||
1760             !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
1761                 goto out;
1762
1763         /* Extent was merged? */
1764         if (bkey_start_offset(k.k) < data_offset ||
1765             k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1766                 goto out;
1767
1768         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1769                         rbio->pick.crc, NULL, &new_crc,
1770                         bkey_start_offset(k.k) - data_offset, k.k->size,
1771                         rbio->pick.crc.csum_type)) {
1772                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1773                 ret = 0;
1774                 goto out;
1775         }
1776
1777         /*
1778          * going to be temporarily appending another checksum entry:
1779          */
1780         new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
1781                                  sizeof(struct bch_extent_crc128));
1782         if ((ret = PTR_ERR_OR_ZERO(new)))
1783                 goto out;
1784
1785         bkey_reassemble(new, k);
1786
1787         if (!bch2_bkey_narrow_crcs(new, new_crc))
1788                 goto out;
1789
1790         bch2_trans_update(trans, iter, new, 0);
1791 out:
1792         bch2_trans_iter_put(trans, iter);
1793         return ret;
1794 }
1795
1796 static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1797 {
1798         bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
1799                       __bch2_rbio_narrow_crcs(&trans, rbio));
1800 }
1801
1802 /* Inner part that may run in process context */
1803 static void __bch2_read_endio(struct work_struct *work)
1804 {
1805         struct bch_read_bio *rbio =
1806                 container_of(work, struct bch_read_bio, work);
1807         struct bch_fs *c        = rbio->c;
1808         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1809         struct bio *src         = &rbio->bio;
1810         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1811         struct bvec_iter dst_iter = rbio->bvec_iter;
1812         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1813         struct nonce nonce = extent_nonce(rbio->version, crc);
1814         struct bch_csum csum;
1815
1816         /* Reset iterator for checksumming and copying bounced data: */
1817         if (rbio->bounce) {
1818                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1819                 src->bi_iter.bi_idx             = 0;
1820                 src->bi_iter.bi_bvec_done       = 0;
1821         } else {
1822                 src->bi_iter                    = rbio->bvec_iter;
1823         }
1824
1825         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1826         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1827                 goto csum_err;
1828
1829         if (unlikely(rbio->narrow_crcs))
1830                 bch2_rbio_narrow_crcs(rbio);
1831
1832         if (rbio->flags & BCH_READ_NODECODE)
1833                 goto nodecode;
1834
1835         /* Adjust crc to point to subset of data we want: */
1836         crc.offset     += rbio->offset_into_extent;
1837         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1838
1839         if (crc_is_compressed(crc)) {
1840                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1841                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1842                         goto decompression_err;
1843         } else {
1844                 /* don't need to decrypt the entire bio: */
1845                 nonce = nonce_add(nonce, crc.offset << 9);
1846                 bio_advance(src, crc.offset << 9);
1847
1848                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1849                 src->bi_iter.bi_size = dst_iter.bi_size;
1850
1851                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1852
1853                 if (rbio->bounce) {
1854                         struct bvec_iter src_iter = src->bi_iter;
1855                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1856                 }
1857         }
1858
1859         if (rbio->promote) {
1860                 /*
1861                  * Re-encrypt the data we decrypted, so it's consistent with
1862                  * rbio->crc:
1863                  */
1864                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1865                 promote_start(rbio->promote, rbio);
1866                 rbio->promote = NULL;
1867         }
1868 nodecode:
1869         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1870                 rbio = bch2_rbio_free(rbio);
1871                 bch2_rbio_done(rbio);
1872         }
1873         return;
1874 csum_err:
1875         /*
1876          * Checksum error: if the bio wasn't bounced, we may have been
1877          * reading into buffers owned by userspace (that userspace can
1878          * scribble over) - retry the read, bouncing it this time:
1879          */
1880         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1881                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1882                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1883                 return;
1884         }
1885
1886         bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
1887                 "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1888                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1889                 csum.hi, csum.lo, crc.csum_type);
1890         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1891         return;
1892 decompression_err:
1893         bch_err_inum_ratelimited(c, rbio->read_pos.inode,
1894                                  "decompression error");
1895         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1896         return;
1897 }
1898
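     /*
      * bio completion handler for reads: account device latency, turn IO
      * errors and stale cached pointers into retries, then punt the
      * checksum/decrypt/decompress work in __bch2_read_endio() to a suitable
      * context.
      */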
1899 static void bch2_read_endio(struct bio *bio)
1900 {
1901         struct bch_read_bio *rbio =
1902                 container_of(bio, struct bch_read_bio, bio);
1903         struct bch_fs *c        = rbio->c;
1904         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1905         struct workqueue_struct *wq = NULL;
1906         enum rbio_context context = RBIO_CONTEXT_NULL;
1907
1908         if (rbio->have_ioref) {
1909                 bch2_latency_acct(ca, rbio->submit_time, READ);
1910                 percpu_ref_put(&ca->io_ref);
1911         }
1912
1913         if (!rbio->split)
1914                 rbio->bio.bi_end_io = rbio->end_io;
1915
1916         if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
1917                                     rbio->read_pos.inode,
1918                                     rbio->read_pos.offset,
1919                                     "data read error: %s",
1920                                bch2_blk_status_to_str(bio->bi_status))) {
1921                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1922                 return;
1923         }
1924
1925         if (rbio->pick.ptr.cached &&
1926             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1927              ptr_stale(ca, &rbio->pick.ptr))) {
1928                 atomic_long_inc(&c->read_realloc_races);
1929
1930                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1931                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1932                 else
1933                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1934                 return;
1935         }
1936
1937         if (rbio->narrow_crcs ||
1938             crc_is_compressed(rbio->pick.crc) ||
1939             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1940                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1941         else if (rbio->pick.crc.csum_type)
1942                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1943
1944         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1945 }
1946
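     /*
      * Resolve a reflink pointer: look up the indirect extent in the reflink
      * btree at the pointer's index plus the current offset into the extent,
      * replace orig_k with it, and adjust *offset_into_extent to be relative
      * to the indirect extent.
      */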
1947 int __bch2_read_indirect_extent(struct btree_trans *trans,
1948                                 unsigned *offset_into_extent,
1949                                 struct bkey_buf *orig_k)
1950 {
1951         struct btree_iter *iter;
1952         struct bkey_s_c k;
1953         u64 reflink_offset;
1954         int ret;
1955
1956         reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
1957                 *offset_into_extent;
1958
1959         iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
1960                                    POS(0, reflink_offset),
1961                                    BTREE_ITER_SLOTS);
1962         k = bch2_btree_iter_peek_slot(iter);
1963         ret = bkey_err(k);
1964         if (ret)
1965                 goto err;
1966
1967         if (k.k->type != KEY_TYPE_reflink_v &&
1968             k.k->type != KEY_TYPE_indirect_inline_data) {
1969                 bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
1970                                 "pointer to nonexistent indirect extent");
1971                 ret = -EIO;
1972                 goto err;
1973         }
1974
1975         *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
1976         bch2_bkey_buf_reassemble(orig_k, trans->c, k);
1977 err:
1978         bch2_trans_iter_put(trans, iter);
1979         return ret;
1980 }
1981
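     /*
      * Read (part of) a single extent k into the range of orig described by
      * iter: decide whether to bounce, clone or reuse the original bio,
      * allocate a promote op if wanted, then build and submit the read bio.
      * With BCH_READ_IN_RETRY the read is issued synchronously and the result
      * is returned as one of the READ_* codes above.
      */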
1982 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
1983                        struct bvec_iter iter, struct bpos read_pos,
1984                        enum btree_id data_btree, struct bkey_s_c k,
1985                        unsigned offset_into_extent,
1986                        struct bch_io_failures *failed, unsigned flags)
1987 {
1988         struct bch_fs *c = trans->c;
1989         struct extent_ptr_decoded pick;
1990         struct bch_read_bio *rbio = NULL;
1991         struct bch_dev *ca;
1992         struct promote_op *promote = NULL;
1993         bool bounce = false, read_full = false, narrow_crcs = false;
1994         struct bpos data_pos = bkey_start_pos(k.k);
1995         int pick_ret;
1996
1997         if (bkey_extent_is_inline_data(k.k)) {
1998                 unsigned bytes = min_t(unsigned, iter.bi_size,
1999                                        bkey_inline_data_bytes(k.k));
2000
2001                 swap(iter.bi_size, bytes);
2002                 memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
2003                 swap(iter.bi_size, bytes);
2004                 bio_advance_iter(&orig->bio, &iter, bytes);
2005                 zero_fill_bio_iter(&orig->bio, iter);
2006                 goto out_read_done;
2007         }
2008
2009         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
2010
2011         /* hole or reservation - just zero fill: */
2012         if (!pick_ret)
2013                 goto hole;
2014
2015         if (pick_ret < 0) {
2016                 bch_err_inum_ratelimited(c, k.k->p.inode,
2017                                          "no device to read from");
2018                 goto err;
2019         }
2020
2021         if (pick_ret > 0)
2022                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
2023
2024         if (flags & BCH_READ_NODECODE) {
2025                 /*
2026                  * can happen if we retry, and the extent we were going to read
2027                  * has been merged in the meantime:
2028                  */
2029                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
2030                         goto hole;
2031
2032                 iter.bi_size    = pick.crc.compressed_size << 9;
2033                 goto get_bio;
2034         }
2035
2036         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
2037             bio_flagged(&orig->bio, BIO_CHAIN))
2038                 flags |= BCH_READ_MUST_CLONE;
2039
2040         narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
2041                 bch2_can_narrow_extent_crcs(k, pick.crc);
2042
2043         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
2044                 flags |= BCH_READ_MUST_BOUNCE;
2045
2046         EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
2047
2048         if (crc_is_compressed(pick.crc) ||
2049             (pick.crc.csum_type != BCH_CSUM_NONE &&
2050              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2051               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
2052                (flags & BCH_READ_USER_MAPPED)) ||
2053               (flags & BCH_READ_MUST_BOUNCE)))) {
2054                 read_full = true;
2055                 bounce = true;
2056         }
2057
2058         if (orig->opts.promote_target)
2059                 promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
2060                                         &rbio, &bounce, &read_full);
2061
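             /*
              * read_full not set: adjust the pointer and crc so the IO covers
              * exactly the range being read.
              */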
2062         if (!read_full) {
2063                 EBUG_ON(crc_is_compressed(pick.crc));
2064                 EBUG_ON(pick.crc.csum_type &&
2065                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2066                          bvec_iter_sectors(iter) != pick.crc.live_size ||
2067                          pick.crc.offset ||
2068                          offset_into_extent));
2069
2070                 data_pos.offset += offset_into_extent;
2071                 pick.ptr.offset += pick.crc.offset +
2072                         offset_into_extent;
2073                 offset_into_extent              = 0;
2074                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
2075                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
2076                 pick.crc.offset                 = 0;
2077                 pick.crc.live_size              = bvec_iter_sectors(iter);
2079         }
2080 get_bio:
2081         if (rbio) {
2082                 /*
2083                  * The promote path already allocated a bounce rbio for us:
2084                  * it needs a bio big enough to hold the uncompressed data
2085                  * for the write path, but we're not going to use all of it
2086                  * here:
2087                  */
2088                 EBUG_ON(rbio->bio.bi_iter.bi_size <
2089                        pick.crc.compressed_size << 9);
2090                 rbio->bio.bi_iter.bi_size =
2091                         pick.crc.compressed_size << 9;
2092         } else if (bounce) {
2093                 unsigned sectors = pick.crc.compressed_size;
2094
2095                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
2096                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
2097                                                   &c->bio_read_split),
2098                                  orig->opts);
2099
2100                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
2101                 rbio->bounce    = true;
2102                 rbio->split     = true;
2103         } else if (flags & BCH_READ_MUST_CLONE) {
2104                 /*
2105                  * Have to clone if there were any splits, due to error
2106                  * reporting issues: if a split errored and retrying didn't
2107                  * work, then when it reports the error to its parent (us) we
2108                  * don't know whether the error came from our bio (in which
2109                  * case we should retry) or from the whole bio, in which case
2110                  * we don't want to retry and lose the error.
2111                  */
2112                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
2113                                                 &c->bio_read_split),
2114                                  orig->opts);
2115                 rbio->bio.bi_iter = iter;
2116                 rbio->split     = true;
2117         } else {
2118                 rbio = orig;
2119                 rbio->bio.bi_iter = iter;
2120                 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
2121         }
2122
2123         EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
2124
2125         rbio->c                 = c;
2126         rbio->submit_time       = local_clock();
2127         if (rbio->split)
2128                 rbio->parent    = orig;
2129         else
2130                 rbio->end_io    = orig->bio.bi_end_io;
2131         rbio->bvec_iter         = iter;
2132         rbio->offset_into_extent= offset_into_extent;
2133         rbio->flags             = flags;
2134         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
2135         rbio->narrow_crcs       = narrow_crcs;
2136         rbio->hole              = 0;
2137         rbio->retry             = 0;
2138         rbio->context           = 0;
2139         /* XXX: only initialize this if needed */
2140         rbio->devs_have         = bch2_bkey_devs(k);
2141         rbio->pick              = pick;
2142         rbio->read_pos          = read_pos;
2143         rbio->data_btree        = data_btree;
2144         rbio->data_pos          = data_pos;
2145         rbio->version           = k.k->version;
2146         rbio->promote           = promote;
2147         INIT_WORK(&rbio->work, NULL);
2148
2149         rbio->bio.bi_opf        = orig->bio.bi_opf;
2150         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
2151         rbio->bio.bi_end_io     = bch2_read_endio;
2152
2153         if (rbio->bounce)
2154                 trace_read_bounce(&rbio->bio);
2155
2156         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
2157
2158         /*
2159          * If it's being moved internally, we don't want to flag it as a cache
2160          * hit:
2161          */
2162         if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
2163                 bch2_bucket_io_time_reset(trans, pick.ptr.dev,
2164                         PTR_BUCKET_NR(ca, &pick.ptr), READ);
2165
2166         if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
2167                 bio_inc_remaining(&orig->bio);
2168                 trace_read_split(&orig->bio);
2169         }
2170
2171         if (!rbio->pick.idx) {
2172                 if (!rbio->have_ioref) {
2173                         bch_err_inum_ratelimited(c, k.k->p.inode,
2174                                                  "no device to read from");
2175                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2176                         goto out;
2177                 }
2178
2179                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
2180                              bio_sectors(&rbio->bio));
2181                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
2182
2183                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2184                         submit_bio(&rbio->bio);
2185                 else
2186                         submit_bio_wait(&rbio->bio);
2187         } else {
2188                 /* Attempting reconstruct read: */
2189                 if (bch2_ec_read_extent(c, rbio)) {
2190                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2191                         goto out;
2192                 }
2193
2194                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2195                         bio_endio(&rbio->bio);
2196         }
2197 out:
2198         if (likely(!(flags & BCH_READ_IN_RETRY))) {
2199                 return 0;
2200         } else {
2201                 int ret;
2202
2203                 rbio->context = RBIO_CONTEXT_UNBOUND;
2204                 bch2_read_endio(&rbio->bio);
2205
2206                 ret = rbio->retry;
2207                 rbio = bch2_rbio_free(rbio);
2208
2209                 if (ret == READ_RETRY_AVOID) {
2210                         bch2_mark_io_failure(failed, &pick);
2211                         ret = READ_RETRY;
2212                 }
2213
2214                 if (!ret)
2215                         goto out_read_done;
2216
2217                 return ret;
2218         }
2219
2220 err:
2221         if (flags & BCH_READ_IN_RETRY)
2222                 return READ_ERR;
2223
2224         orig->bio.bi_status = BLK_STS_IOERR;
2225         goto out_read_done;
2226
2227 hole:
2228         /*
2229          * won't normally happen in the BCH_READ_NODECODE
2230          * (bch2_move_extent()) path, but if we retry and the extent we wanted
2231          * to read no longer exists we have to signal that:
2232          */
2233         if (flags & BCH_READ_NODECODE)
2234                 orig->hole = true;
2235
2236         zero_fill_bio_iter(&orig->bio, iter);
2237 out_read_done:
2238         if (flags & BCH_READ_LAST_FRAGMENT)
2239                 bch2_rbio_done(orig);
2240         return 0;
2241 }
2242
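     /*
      * Top level read path: walk the extents btree over the request's range,
      * resolving indirect (reflink) extents, and issue __bch2_read_extent()
      * for each fragment; restart from the top on transaction restart or read
      * retry.
      */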
2243 void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
2244                  struct bvec_iter bvec_iter, u64 inode,
2245                  struct bch_io_failures *failed, unsigned flags)
2246 {
2247         struct btree_trans trans;
2248         struct btree_iter *iter;
2249         struct bkey_buf sk;
2250         struct bkey_s_c k;
2251         int ret;
2252
2253         BUG_ON(flags & BCH_READ_NODECODE);
2254
2255         bch2_bkey_buf_init(&sk);
2256         bch2_trans_init(&trans, c, 0, 0);
2257 retry:
2258         bch2_trans_begin(&trans);
2259
2260         iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
2261                                    POS(inode, bvec_iter.bi_sector),
2262                                    BTREE_ITER_SLOTS);
2263         while (1) {
2264                 unsigned bytes, sectors, offset_into_extent;
2265                 enum btree_id data_btree = BTREE_ID_extents;
2266
2267                 bch2_btree_iter_set_pos(iter,
2268                                 POS(inode, bvec_iter.bi_sector));
2269
2270                 k = bch2_btree_iter_peek_slot(iter);
2271                 ret = bkey_err(k);
2272                 if (ret)
2273                         break;
2274
2275                 offset_into_extent = iter->pos.offset -
2276                         bkey_start_offset(k.k);
2277                 sectors = k.k->size - offset_into_extent;
2278
2279                 bch2_bkey_buf_reassemble(&sk, c, k);
2280
2281                 ret = bch2_read_indirect_extent(&trans, &data_btree,
2282                                         &offset_into_extent, &sk);
2283                 if (ret)
2284                         break;
2285
2286                 k = bkey_i_to_s_c(sk.k);
2287
2288                 /*
2289                  * With indirect extents, the amount of data to read is the min
2290                  * of the original extent and the indirect extent:
2291                  */
2292                 sectors = min(sectors, k.k->size - offset_into_extent);
2293
2294                 /*
2295                  * Unlock the iterator while the btree node's lock is still in
2296                  * cache, before doing the IO:
2297                  */
2298                 bch2_trans_unlock(&trans);
2299
2300                 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
2301                 swap(bvec_iter.bi_size, bytes);
2302
2303                 if (bvec_iter.bi_size == bytes)
2304                         flags |= BCH_READ_LAST_FRAGMENT;
2305
2306                 ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
2307                                          data_btree, k,
2308                                          offset_into_extent, failed, flags);
2309                 if (ret)
2310                         break;
2311
2312                 if (flags & BCH_READ_LAST_FRAGMENT)
2313                         break;
2314
2315                 swap(bvec_iter.bi_size, bytes);
2316                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
2317         }
2318         bch2_trans_iter_put(&trans, iter);
2319
2320         if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
2321                 goto retry;
2322
2323         if (ret) {
2324                 bch_err_inum_ratelimited(c, inode,
2325                                          "read error %i from btree lookup", ret);
2326                 rbio->bio.bi_status = BLK_STS_IOERR;
2327                 bch2_rbio_done(rbio);
2328         }
2329         bch2_trans_exit(&trans);
2330         bch2_bkey_buf_exit(&sk, c);
2331 }
2332
2333 void bch2_fs_io_exit(struct bch_fs *c)
2334 {
2335         if (c->promote_table.tbl)
2336                 rhashtable_destroy(&c->promote_table);
2337         mempool_exit(&c->bio_bounce_pages);
2338         bioset_exit(&c->bio_write);
2339         bioset_exit(&c->bio_read_split);
2340         bioset_exit(&c->bio_read);
2341 }
2342
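     /*
      * bio_read_split backs the bounced/cloned rbios allocated in
      * __bch2_read_extent(); bio_bounce_pages is sized to hold the larger of
      * a btree node and the maximum encoded extent.
      */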
2343 int bch2_fs_io_init(struct bch_fs *c)
2344 {
2345         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
2346                         BIOSET_NEED_BVECS) ||
2347             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
2348                         BIOSET_NEED_BVECS) ||
2349             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
2350                         BIOSET_NEED_BVECS) ||
2351             mempool_init_page_pool(&c->bio_bounce_pages,
2352                                    max_t(unsigned,
2353                                          c->opts.btree_node_size,
2354                                          c->sb.encoded_extent_max) /
2355                                    PAGE_SECTORS, 0) ||
2356             rhashtable_init(&c->promote_table, &bch_promote_params))
2357                 return -ENOMEM;
2358
2359         return 0;
2360 }