1 /*
2  * Some low level IO code, and hacks for various block layer limitations
3  *
4  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5  * Copyright 2012 Google, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "alloc_foreground.h"
10 #include "bset.h"
11 #include "btree_update.h"
12 #include "buckets.h"
13 #include "checksum.h"
14 #include "compress.h"
15 #include "clock.h"
16 #include "debug.h"
17 #include "disk_groups.h"
18 #include "error.h"
19 #include "extents.h"
20 #include "io.h"
21 #include "journal.h"
22 #include "keylist.h"
23 #include "move.h"
24 #include "rebalance.h"
25 #include "replicas.h"
26 #include "super.h"
27 #include "super-io.h"
28
29 #include <linux/blkdev.h>
30 #include <linux/random.h>
31
32 #include <trace/events/bcachefs.h>
33
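/*
 * Decide whether IO to @target should currently be avoided because its
 * devices look congested.  Each device's ->congested counter is first decayed
 * by the time since it was last bumped (roughly one unit per 4us, via the
 * >> 12 of elapsed nanoseconds), then the decision is made randomly: the
 * probability of returning true is total congestion / (nr devices *
 * CONGESTED_MAX), i.e. the average congestion fraction across the target.
 */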
34 static bool bch2_target_congested(struct bch_fs *c, u16 target)
35 {
36         const struct bch_devs_mask *devs;
37         unsigned d, nr = 0, total = 0;
38         u64 now = local_clock(), last;
39         s64 congested;
40         struct bch_dev *ca;
41
42         if (!target)
43                 return false;
44
45         rcu_read_lock();
46         devs = bch2_target_to_mask(c, target);
47         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
48                 ca = rcu_dereference(c->devs[d]);
49                 if (!ca)
50                         continue;
51
52                 congested = atomic_read(&ca->congested);
53                 last = READ_ONCE(ca->congested_last);
54                 if (time_after64(now, last))
55                         congested -= (now - last) >> 12;
56
57                 total += max(congested, 0LL);
58                 nr++;
59         }
60         rcu_read_unlock();
61
62         return bch2_rand_range(nr * CONGESTED_MAX) < total;
63 }
64
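/*
 * Bump or decay a device's congestion counter from one completed IO: the
 * latency the device is "capable" of comes from its tracked latency
 * quantiles, and the congestion threshold is 4x that for reads, 8x for
 * writes.  An IO over the threshold adds roughly latency_over * 4 /
 * latency_threshold, using a shift instead of a divide - e.g. with a ~1ms
 * (2^20 ns) threshold the shift is 18, so an IO 4ms over the threshold adds
 * (4 << 20) >> 18 = 16.  IOs under the threshold decay the counter by one.
 */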
65 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
66                                        u64 now, int rw)
67 {
68         u64 latency_capable =
69                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
70         /* ideally we'd be taking into account the device's variance here: */
71         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
72         s64 latency_over = io_latency - latency_threshold;
73
74         if (latency_threshold && latency_over > 0) {
75                 /*
76                  * bump up congested by approximately latency_over * 4 /
77                  * latency_threshold - we don't need much accuracy here so don't
78                  * bother with the divide:
79                  */
80                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
81                         atomic_add(latency_over >>
82                                    max_t(int, ilog2(latency_threshold) - 2, 0),
83                                    &ca->congested);
84
85                 ca->congested_last = now;
86         } else if (atomic_read(&ca->congested) > 0) {
87                 atomic_dec(&ca->congested);
88         }
89 }
90
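/*
 * Account the latency of a completed IO: cur_latency[rw] is an exponentially
 * weighted moving average (ewma_add() with a decay shift of 5, i.e. roughly a
 * 1/32 weight per new sample).  To keep the atomic cmpxchg off the fast path,
 * samples within +/-50% of the current average are only folded in on roughly
 * one call in 32, based on the low bits of the clock.  The sample also feeds
 * the congestion accounting and the full time_stats for this device.
 */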
91 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
92 {
93         atomic64_t *latency = &ca->cur_latency[rw];
94         u64 now = local_clock();
95         u64 io_latency = time_after64(now, submit_time)
96                 ? now - submit_time
97                 : 0;
98         u64 old, new, v = atomic64_read(latency);
99
100         do {
101                 old = v;
102
103                 /*
104                  * If the IO latency is reasonably close to the current
105                  * average, skip the update and the atomic operation most
106                  * of the time:
107                  */
108                 if (abs((int) (old - io_latency)) < (old >> 1) &&
109                     now & ~(~0 << 5))
110                         break;
111
112                 new = ewma_add(old, io_latency, 5);
113         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
114
115         bch2_congested_acct(ca, io_latency, now, rw);
116
117         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
118 }
119
120 /* Allocate, free from mempool: */
121
122 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
123 {
124         struct bio_vec *bv;
125         unsigned i;
126
127         bio_for_each_segment_all(bv, bio, i)
128                 if (bv->bv_page != ZERO_PAGE(0))
129                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
130         bio->bi_vcnt = 0;
131 }
132
133 static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
134                                     bool *using_mempool)
135 {
136         struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
137
138         if (likely(!*using_mempool)) {
139                 bv->bv_page = alloc_page(GFP_NOIO);
140                 if (unlikely(!bv->bv_page)) {
141                         mutex_lock(&c->bio_bounce_pages_lock);
142                         *using_mempool = true;
143                         goto pool_alloc;
144
145                 }
146         } else {
147 pool_alloc:
148                 bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
149         }
150
151         bv->bv_len = PAGE_SIZE;
152         bv->bv_offset = 0;
153 }
154
155 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
156                                size_t bytes)
157 {
158         bool using_mempool = false;
159
160         BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
161
162         bio->bi_iter.bi_size = bytes;
163
164         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
165                 bch2_bio_alloc_page_pool(c, bio, &using_mempool);
166
167         if (using_mempool)
168                 mutex_unlock(&c->bio_bounce_pages_lock);
169 }
170
171 void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
172                                     size_t bytes)
173 {
174         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
175                 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
176
177                 BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
178
179                 bv->bv_page = alloc_page(GFP_NOIO);
180                 if (!bv->bv_page) {
181                         /*
182                          * We've already allocated from the mempool; we can't
183                          * allocate from it again without freeing the pages we
184                          * already allocated, or else we could deadlock:
185                          */
186                         bch2_bio_free_pages_pool(c, bio);
187                         bch2_bio_alloc_pages_pool(c, bio, bytes);
188                         return;
189                 }
190
191                 bv->bv_len = PAGE_SIZE;
192                 bv->bv_offset = 0;
193                 bio->bi_vcnt++;
194         }
195
196         bio->bi_iter.bi_size = bytes;
197 }
198
199 /* Writes */
200
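/*
 * Submit the write to every device the key has a pointer to: all but the last
 * pointer get a clone of @wbio (bumping the parent's remaining count), the
 * last pointer reuses @wbio itself.  REQ_FUA is added when the journal
 * doesn't already flush that device.  If we couldn't get an io ref on a
 * device, its bio completes immediately with BLK_STS_REMOVED, so the endio
 * path marks that device as failed and the pointer is dropped at index
 * update time.
 */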
201 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
202                                enum bch_data_type type,
203                                const struct bkey_i *k)
204 {
205         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
206         const struct bch_extent_ptr *ptr;
207         struct bch_write_bio *n;
208         struct bch_dev *ca;
209
210         BUG_ON(c->opts.nochanges);
211
212         extent_for_each_ptr(e, ptr) {
213                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
214                        !c->devs[ptr->dev]);
215
216                 ca = bch_dev_bkey_exists(c, ptr->dev);
217
218                 if (ptr + 1 < &extent_entry_last(e)->ptr) {
219                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
220                                                    &ca->replica_set));
221
222                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
223                         n->bio.bi_private       = wbio->bio.bi_private;
224                         n->parent               = wbio;
225                         n->split                = true;
226                         n->bounce               = false;
227                         n->put_bio              = true;
228                         n->bio.bi_opf           = wbio->bio.bi_opf;
229                         bio_inc_remaining(&wbio->bio);
230                 } else {
231                         n = wbio;
232                         n->split                = false;
233                 }
234
235                 n->c                    = c;
236                 n->dev                  = ptr->dev;
237                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
238                 n->submit_time          = local_clock();
239                 n->bio.bi_iter.bi_sector = ptr->offset;
240
241                 if (!journal_flushes_device(ca))
242                         n->bio.bi_opf |= REQ_FUA;
243
244                 if (likely(n->have_ioref)) {
245                         this_cpu_add(ca->io_done->sectors[WRITE][type],
246                                      bio_sectors(&n->bio));
247
248                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
249                         submit_bio(&n->bio);
250                 } else {
251                         n->bio.bi_status        = BLK_STS_REMOVED;
252                         bio_endio(&n->bio);
253                 }
254         }
255 }
256
257 static void __bch2_write(struct closure *);
258
259 static void bch2_write_done(struct closure *cl)
260 {
261         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
262         struct bch_fs *c = op->c;
263
264         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
265                 op->error = bch2_journal_error(&c->journal);
266
267         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
268                 bch2_disk_reservation_put(c, &op->res);
269         percpu_ref_put(&c->writes);
270         bch2_keylist_free(&op->insert_keys, op->inline_keys);
271
272         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
273
274         closure_return(cl);
275 }
276
277 int bch2_write_index_default(struct bch_write_op *op)
278 {
279         struct keylist *keys = &op->insert_keys;
280         struct btree_iter iter;
281         int ret;
282
283         bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
284                              bkey_start_pos(&bch2_keylist_front(keys)->k),
285                              BTREE_ITER_INTENT);
286
287         ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
288                                         op_journal_seq(op),
289                                         BTREE_INSERT_NOFAIL|
290                                         BTREE_INSERT_USE_RESERVE);
291         bch2_btree_iter_unlock(&iter);
292
293         return ret;
294 }
295
296 /**
297  * __bch2_write_index - after a write, update the index to point to the new data
298  */
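/*
 * Pointers to devices that reported a write error (op->failed) are dropped
 * here; if no pointers remain the write fails with -EIO.  Surviving keys have
 * their replicas marked (unless BCH_WRITE_NOMARK_REPLICAS), are fed to
 * rebalance, and are then inserted via op->index_update_fn().
 */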
299 static void __bch2_write_index(struct bch_write_op *op)
300 {
301         struct bch_fs *c = op->c;
302         struct keylist *keys = &op->insert_keys;
303         struct bkey_s_extent e;
304         struct bch_extent_ptr *ptr;
305         struct bkey_i *src, *dst = keys->keys, *n, *k;
306         int ret;
307
308         for (src = keys->keys; src != keys->top; src = n) {
309                 n = bkey_next(src);
310                 bkey_copy(dst, src);
311
312                 e = bkey_i_to_s_extent(dst);
313
314                 bch2_extent_drop_ptrs(e, ptr,
315                         test_bit(ptr->dev, op->failed.d));
316
317                 if (!bch2_extent_nr_ptrs(e.c)) {
318                         ret = -EIO;
319                         goto err;
320                 }
321
322                 if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
323                         ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS,
324                                                       e.s_c);
325                         if (ret)
326                                 goto err;
327                 }
328
329                 dst = bkey_next(dst);
330         }
331
332         keys->top = dst;
333
334         /*
335          * probably not the ideal place to hook this in, but I don't
336          * particularly want to plumb io_opts all the way through the btree
337          * update stack right now
338          */
339         for_each_keylist_key(keys, k)
340                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
341
342         if (!bch2_keylist_empty(keys)) {
343                 u64 sectors_start = keylist_sectors(keys);
344                 int ret = op->index_update_fn(op);
345
346                 BUG_ON(keylist_sectors(keys) && !ret);
347
348                 op->written += sectors_start - keylist_sectors(keys);
349
350                 if (ret) {
351                         __bcache_io_error(c, "btree IO error %i", ret);
352                         op->error = ret;
353                 }
354         }
355 out:
356         bch2_open_buckets_put(c, &op->open_buckets);
357         return;
358 err:
359         keys->top = keys->keys;
360         op->error = ret;
361         goto out;
362 }
363
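/*
 * Closure callback: run the index update, then - for BCH_WRITE_FLUSH writes -
 * wait for the journal sequence containing the new keys to be flushed before
 * continuing to bch2_write_done().
 */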
364 static void bch2_write_index(struct closure *cl)
365 {
366         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
367         struct bch_fs *c = op->c;
368
369         __bch2_write_index(op);
370
371         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
372                 bch2_journal_flush_seq_async(&c->journal,
373                                              *op_journal_seq(op),
374                                              cl);
375                 continue_at(cl, bch2_write_done, index_update_wq(op));
376         } else {
377                 continue_at_nobarrier(cl, bch2_write_done, NULL);
378         }
379 }
380
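/*
 * Per-replica write completion: on an IO error, mark the device in op->failed
 * so the index update can drop that pointer; account IO latency, free bounce
 * pages, and either complete the parent bio (for split clones) or drop the
 * write op's closure ref.
 */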
381 static void bch2_write_endio(struct bio *bio)
382 {
383         struct closure *cl              = bio->bi_private;
384         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
385         struct bch_write_bio *wbio      = to_wbio(bio);
386         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
387         struct bch_fs *c                = wbio->c;
388         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
389
390         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
391                 set_bit(wbio->dev, op->failed.d);
392
393         if (wbio->have_ioref) {
394                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
395                 percpu_ref_put(&ca->io_ref);
396         }
397
398         if (wbio->bounce)
399                 bch2_bio_free_pages_pool(c, bio);
400
401         if (wbio->put_bio)
402                 bio_put(bio);
403
404         if (parent)
405                 bio_endio(&parent->bio);
406         else
407                 closure_put(cl);
408 }
409
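/*
 * Append a new extent key to op->insert_keys for the chunk just written:
 * advance op->pos by the uncompressed size, record the crc entry, and add
 * pointers for the sectors just allocated from @wp.
 */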
410 static void init_append_extent(struct bch_write_op *op,
411                                struct write_point *wp,
412                                struct bversion version,
413                                struct bch_extent_crc_unpacked crc)
414 {
415         struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
416
417         op->pos.offset += crc.uncompressed_size;
418         e->k.p = op->pos;
419         e->k.size = crc.uncompressed_size;
420         e->k.version = version;
421         bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
422
423         bch2_extent_crc_append(e, crc);
424         bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size);
425
426         bch2_keylist_push(&op->insert_keys);
427 }
428
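/*
 * Allocate a bounce bio covering as much of @src as the write point has room
 * for.  Pages come from the normal allocator; if that fails we fall back to
 * the bounce-page mempool, which can only safely cover up to
 * c->sb.encoded_extent_max worth of data.  *page_alloc_failed is set if we
 * ended up with fewer pages than we wanted.
 */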
429 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
430                                         struct write_point *wp,
431                                         struct bio *src,
432                                         bool *page_alloc_failed)
433 {
434         struct bch_write_bio *wbio;
435         struct bio *bio;
436         unsigned output_available =
437                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
438         unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
439
440         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
441         wbio                    = wbio_init(bio);
442         wbio->bounce            = true;
443         wbio->put_bio           = true;
444         /* copy WRITE_SYNC flag */
445         wbio->bio.bi_opf        = src->bi_opf;
446
447         /*
448          * We can't use mempool for more than c->sb.encoded_extent_max
449          * worth of pages, but we'd like to allocate more if we can:
450          */
451         while (bio->bi_iter.bi_size < output_available) {
452                 unsigned len = min_t(unsigned, PAGE_SIZE,
453                                      output_available - bio->bi_iter.bi_size);
454                 struct page *p;
455
456                 p = alloc_page(GFP_NOIO);
457                 if (!p) {
458                         unsigned pool_max =
459                                 min_t(unsigned, output_available,
460                                       c->sb.encoded_extent_max << 9);
461
462                         if (bio_sectors(bio) < pool_max)
463                                 bch2_bio_alloc_pages_pool(c, bio, pool_max);
464                         break;
465                 }
466
467                 bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
468                         .bv_page        = p,
469                         .bv_len         = len,
470                         .bv_offset      = 0,
471                 };
472                 bio->bi_iter.bi_size += len;
473         }
474
475         *page_alloc_failed = bio->bi_vcnt < pages;
476         return bio;
477 }
478
479 static int bch2_write_rechecksum(struct bch_fs *c,
480                                  struct bch_write_op *op,
481                                  unsigned new_csum_type)
482 {
483         struct bio *bio = &op->wbio.bio;
484         struct bch_extent_crc_unpacked new_crc;
485         int ret;
486
487         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
488
489         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
490             bch2_csum_type_is_encryption(new_csum_type))
491                 new_csum_type = op->crc.csum_type;
492
493         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
494                                   NULL, &new_crc,
495                                   op->crc.offset, op->crc.live_size,
496                                   new_csum_type);
497         if (ret)
498                 return ret;
499
500         bio_advance(bio, op->crc.offset << 9);
501         bio->bi_iter.bi_size = op->crc.live_size << 9;
502         op->crc = new_crc;
503         return 0;
504 }
505
506 static int bch2_write_decrypt(struct bch_write_op *op)
507 {
508         struct bch_fs *c = op->c;
509         struct nonce nonce = extent_nonce(op->version, op->crc);
510         struct bch_csum csum;
511
512         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
513                 return 0;
514
515         /*
516          * If we need to decrypt data in the write path, we'll no longer be able
517          * to verify the existing checksum (poly1305 mac, in this case) after
518          * it's decrypted - this is the last point we'll be able to reverify the
519          * checksum:
520          */
521         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
522         if (bch2_crc_cmp(op->crc.csum, csum))
523                 return -EIO;
524
525         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
526         op->crc.csum_type = 0;
527         op->crc.csum = (struct bch_csum) { 0, 0 };
528         return 0;
529 }
530
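/*
 * For writes of data that is already encoded (checksummed/compressed, i.e.
 * BCH_WRITE_DATA_ENCODED), decide how much work is needed: DO_WRITE means the
 * extent can be written out as is (possibly after rechecksumming to the
 * target checksum type); OK means the data should go through the normal write
 * path (for encoded data, after being decompressed/rechecksummed/decrypted
 * here as needed); the error returns indicate a checksum mismatch or a
 * decompression failure.
 */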
531 static enum prep_encoded_ret {
532         PREP_ENCODED_OK,
533         PREP_ENCODED_ERR,
534         PREP_ENCODED_CHECKSUM_ERR,
535         PREP_ENCODED_DO_WRITE,
536 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
537 {
538         struct bch_fs *c = op->c;
539         struct bio *bio = &op->wbio.bio;
540
541         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
542                 return PREP_ENCODED_OK;
543
544         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
545
546         /* Can we just write the entire extent as is? */
547         if (op->crc.uncompressed_size == op->crc.live_size &&
548             op->crc.compressed_size <= wp->sectors_free &&
549             op->crc.compression_type == op->compression_type) {
550                 if (!op->crc.compression_type &&
551                     op->csum_type != op->crc.csum_type &&
552                     bch2_write_rechecksum(c, op, op->csum_type))
553                         return PREP_ENCODED_CHECKSUM_ERR;
554
555                 return PREP_ENCODED_DO_WRITE;
556         }
557
558         /*
559          * If the data is compressed and we couldn't write the entire extent as
560          * is, we have to decompress it:
561          */
562         if (op->crc.compression_type) {
563                 struct bch_csum csum;
564
565                 if (bch2_write_decrypt(op))
566                         return PREP_ENCODED_CHECKSUM_ERR;
567
568                 /* Last point we can still verify checksum: */
569                 csum = bch2_checksum_bio(c, op->crc.csum_type,
570                                          extent_nonce(op->version, op->crc),
571                                          bio);
572                 if (bch2_crc_cmp(op->crc.csum, csum))
573                         return PREP_ENCODED_CHECKSUM_ERR;
574
575                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
576                         return PREP_ENCODED_ERR;
577         }
578
579         /*
580          * No longer have compressed data after this point - data might be
581          * encrypted:
582          */
583
584         /*
585          * If the data is checksummed and we're only writing a subset,
586          * rechecksum and adjust bio to point to currently live data:
587          */
588         if ((op->crc.live_size != op->crc.uncompressed_size ||
589              op->crc.csum_type != op->csum_type) &&
590             bch2_write_rechecksum(c, op, op->csum_type))
591                 return PREP_ENCODED_CHECKSUM_ERR;
592
593                 /*
594                  * If we're compressing the data, or changing its encryption, it has to be decrypted:
595                  */
596         if ((op->compression_type ||
597              bch2_csum_type_is_encryption(op->crc.csum_type) !=
598              bch2_csum_type_is_encryption(op->csum_type)) &&
599             bch2_write_decrypt(op))
600                 return PREP_ENCODED_CHECKSUM_ERR;
601
602         return PREP_ENCODED_OK;
603 }
604
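/*
 * The core of the write path: carve off as much of the source bio as will fit
 * in @wp, optionally bouncing it so it can be compressed/encrypted/
 * checksummed, and append one extent key per chunk.  Returns > 0 ("more") if
 * the source bio still has data left - __bch2_write() will then allocate
 * another write point and call this again - or a negative error.
 */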
605 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
606 {
607         struct bch_fs *c = op->c;
608         struct bio *src = &op->wbio.bio, *dst = src;
609         struct bvec_iter saved_iter;
610         struct bkey_i *key_to_write;
611         unsigned key_to_write_offset = op->insert_keys.top_p -
612                 op->insert_keys.keys_p;
613         unsigned total_output = 0;
614         bool bounce = false, page_alloc_failed = false;
615         int ret, more = 0;
616
617         BUG_ON(!bio_sectors(src));
618
619         switch (bch2_write_prep_encoded_data(op, wp)) {
620         case PREP_ENCODED_OK:
621                 break;
622         case PREP_ENCODED_ERR:
623                 ret = -EIO;
624                 goto err;
625         case PREP_ENCODED_CHECKSUM_ERR:
626                 goto csum_err;
627         case PREP_ENCODED_DO_WRITE:
628                 init_append_extent(op, wp, op->version, op->crc);
629                 goto do_write;
630         }
631
632         if (op->compression_type ||
633             (op->csum_type &&
634              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
635             (bch2_csum_type_is_encryption(op->csum_type) &&
636              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
637                 dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
638                 bounce = true;
639         }
640
641         saved_iter = dst->bi_iter;
642
643         do {
644                 struct bch_extent_crc_unpacked crc =
645                         (struct bch_extent_crc_unpacked) { 0 };
646                 struct bversion version = op->version;
647                 size_t dst_len, src_len;
648
649                 if (page_alloc_failed &&
650                     bio_sectors(dst) < wp->sectors_free &&
651                     bio_sectors(dst) < c->sb.encoded_extent_max)
652                         break;
653
654                 BUG_ON(op->compression_type &&
655                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
656                        bch2_csum_type_is_encryption(op->crc.csum_type));
657                 BUG_ON(op->compression_type && !bounce);
658
659                 crc.compression_type = op->compression_type
660                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
661                                              op->compression_type)
662                         : 0;
663                 if (!crc.compression_type) {
664                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
665                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
666
667                         if (op->csum_type)
668                                 dst_len = min_t(unsigned, dst_len,
669                                                 c->sb.encoded_extent_max << 9);
670
671                         if (bounce) {
672                                 swap(dst->bi_iter.bi_size, dst_len);
673                                 bio_copy_data(dst, src);
674                                 swap(dst->bi_iter.bi_size, dst_len);
675                         }
676
677                         src_len = dst_len;
678                 }
679
680                 BUG_ON(!src_len || !dst_len);
681
682                 if (bch2_csum_type_is_encryption(op->csum_type)) {
683                         if (bversion_zero(version)) {
684                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
685                         } else {
686                                 crc.nonce = op->nonce;
687                                 op->nonce += src_len >> 9;
688                         }
689                 }
690
691                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
692                     !crc.compression_type &&
693                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
694                     bch2_csum_type_is_encryption(op->csum_type)) {
695                         /*
696                          * Note: when we're using rechecksum(), we need to be
697                          * checksumming @src because it has all the data our
698                          * existing checksum covers - if we bounced (because we
699                          * were trying to compress), @dst will only have the
700                          * part of the data the new checksum will cover.
701                          *
702                          * But normally we want to be checksumming post bounce,
703                          * because part of the reason for bouncing is so the
704                          * data can't be modified (by userspace) while it's in
705                          * flight.
706                          */
707                         if (bch2_rechecksum_bio(c, src, version, op->crc,
708                                         &crc, &op->crc,
709                                         src_len >> 9,
710                                         bio_sectors(src) - (src_len >> 9),
711                                         op->csum_type))
712                                 goto csum_err;
713                 } else {
714                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
715                             bch2_rechecksum_bio(c, src, version, op->crc,
716                                         NULL, &op->crc,
717                                         src_len >> 9,
718                                         bio_sectors(src) - (src_len >> 9),
719                                         op->crc.csum_type))
720                                 goto csum_err;
721
722                         crc.compressed_size     = dst_len >> 9;
723                         crc.uncompressed_size   = src_len >> 9;
724                         crc.live_size           = src_len >> 9;
725
726                         swap(dst->bi_iter.bi_size, dst_len);
727                         bch2_encrypt_bio(c, op->csum_type,
728                                          extent_nonce(version, crc), dst);
729                         crc.csum = bch2_checksum_bio(c, op->csum_type,
730                                          extent_nonce(version, crc), dst);
731                         crc.csum_type = op->csum_type;
732                         swap(dst->bi_iter.bi_size, dst_len);
733                 }
734
735                 init_append_extent(op, wp, version, crc);
736
737                 if (dst != src)
738                         bio_advance(dst, dst_len);
739                 bio_advance(src, src_len);
740                 total_output += dst_len;
741         } while (dst->bi_iter.bi_size &&
742                  src->bi_iter.bi_size &&
743                  wp->sectors_free &&
744                  !bch2_keylist_realloc(&op->insert_keys,
745                                       op->inline_keys,
746                                       ARRAY_SIZE(op->inline_keys),
747                                       BKEY_EXTENT_U64s_MAX));
748
749         more = src->bi_iter.bi_size != 0;
750
751         dst->bi_iter = saved_iter;
752
753         if (!bounce && more) {
754                 dst = bio_split(src, total_output >> 9,
755                                 GFP_NOIO, &c->bio_write);
756                 wbio_init(dst)->put_bio = true;
757         }
758
759         dst->bi_iter.bi_size = total_output;
760
761         /* Free unneeded pages after compressing: */
762         if (bounce)
763                 while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
764                         mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
765                                      &c->bio_bounce_pages);
766 do_write:
767         /* might have done a realloc... */
768
769         key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
770
771         dst->bi_end_io  = bch2_write_endio;
772         dst->bi_private = &op->cl;
773         bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
774
775         closure_get(dst->bi_private);
776
777         bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
778                                   key_to_write);
779         return more;
780 csum_err:
781         bch_err(c, "error verifying existing checksum while "
782                 "rewriting existing data (memory corruption?)");
783         ret = -EIO;
784 err:
785         if (bounce) {
786                 bch2_bio_free_pages_pool(c, dst);
787                 bio_put(dst);
788         }
789
790         return ret;
791 }
792
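/*
 * Main write loop: allocate space from a write point and write extents until
 * the whole op has been written.  If we'd have to block - on space
 * allocation, or because the keylist or open bucket list is full - drop to
 * flush_io: wait for outstanding IO, do the index update for what's been
 * written so far, and retry.
 */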
793 static void __bch2_write(struct closure *cl)
794 {
795         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
796         struct bch_fs *c = op->c;
797         struct write_point *wp;
798         int ret;
799 again:
800         do {
801                 /* +1 for possible cache device: */
802                 if (op->open_buckets.nr + op->nr_replicas + 1 >
803                     ARRAY_SIZE(op->open_buckets.v))
804                         goto flush_io;
805
806                 if (bch2_keylist_realloc(&op->insert_keys,
807                                         op->inline_keys,
808                                         ARRAY_SIZE(op->inline_keys),
809                                         BKEY_EXTENT_U64s_MAX))
810                         goto flush_io;
811
812                 wp = bch2_alloc_sectors_start(c,
813                         op->target,
814                         op->write_point,
815                         &op->devs_have,
816                         op->nr_replicas,
817                         op->nr_replicas_required,
818                         op->alloc_reserve,
819                         op->flags,
820                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
821                 EBUG_ON(!wp);
822
823                 if (unlikely(IS_ERR(wp))) {
824                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
825                                 ret = PTR_ERR(wp);
826                                 goto err;
827                         }
828
829                         goto flush_io;
830                 }
831
832                 ret = bch2_write_extent(op, wp);
833
834                 bch2_open_bucket_get(c, wp, &op->open_buckets);
835                 bch2_alloc_sectors_done(c, wp);
836
837                 if (ret < 0)
838                         goto err;
839         } while (ret);
840
841         continue_at(cl, bch2_write_index, index_update_wq(op));
842         return;
843 err:
844         op->error = ret;
845
846         continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
847                     ? bch2_write_index
848                     : bch2_write_done, index_update_wq(op));
849         return;
850 flush_io:
851         closure_sync(cl);
852
853         if (!bch2_keylist_empty(&op->insert_keys)) {
854                 __bch2_write_index(op);
855
856                 if (op->error) {
857                         continue_at_nobarrier(cl, bch2_write_done, NULL);
858                         return;
859                 }
860         }
861
862         goto again;
863 }
864
865 /**
866  * bch2_write - handle a write to a cache device or flash only volume
867  *
868  * This is the starting point for any data to end up in a cache device; it could
869  * be from a normal write, or a writeback write, or a write to a flash only
870  * volume - it's also used by the moving garbage collector to compact data in
871  * mostly empty buckets.
872  *
873  * It first writes the data to the cache, creating a list of keys to be inserted
874  * (if the data won't fit in a single open bucket, there will be multiple keys);
875  * after the data is written the keys are inserted into the btree and
876  * journalled by the index update path (see bch2_write_index()).
877  *
878  * If op->discard is true, instead of inserting the data it invalidates the
879  * region of the cache represented by op->bio and op->inode.
880  */
881 void bch2_write(struct closure *cl)
882 {
883         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
884         struct bch_fs *c = op->c;
885
886         BUG_ON(!op->nr_replicas);
887         BUG_ON(!op->write_point.v);
888         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
889         BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
890
891         op->start_time = local_clock();
892
893         memset(&op->failed, 0, sizeof(op->failed));
894
895         bch2_keylist_init(&op->insert_keys, op->inline_keys);
896         wbio_init(&op->wbio.bio)->put_bio = false;
897
898         if (c->opts.nochanges ||
899             !percpu_ref_tryget(&c->writes)) {
900                 __bcache_io_error(c, "read only");
901                 op->error = -EROFS;
902                 if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
903                         bch2_disk_reservation_put(c, &op->res);
904                 closure_return(cl);
905                 return;
906         }
907
908         bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
909
910         continue_at_nobarrier(cl, __bch2_write, NULL);
911 }
912
913 /* Cache promotion on read */
914
915 struct promote_op {
916         struct closure          cl;
917         u64                     start_time;
918
919         struct rhash_head       hash;
920         struct bpos             pos;
921
922         struct migrate_write    write;
923         struct bio_vec          bi_inline_vecs[0]; /* must be last */
924 };
925
926 static const struct rhashtable_params bch_promote_params = {
927         .head_offset    = offsetof(struct promote_op, hash),
928         .key_offset     = offsetof(struct promote_op, pos),
929         .key_len        = sizeof(struct bpos),
930 };
931
932 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
933                                   struct bpos pos,
934                                   struct bch_io_opts opts,
935                                   unsigned flags)
936 {
937         if (!opts.promote_target)
938                 return false;
939
940         if (!(flags & BCH_READ_MAY_PROMOTE))
941                 return false;
942
943         if (percpu_ref_is_dying(&c->writes))
944                 return false;
945
946         if (!bkey_extent_is_data(k.k))
947                 return false;
948
949         if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
950                 return false;
951
952         if (bch2_target_congested(c, opts.promote_target))
953                 return false;
954
955         if (rhashtable_lookup_fast(&c->promote_table, &pos,
956                                    bch_promote_params))
957                 return false;
958
959         return true;
960 }
961
962 static void promote_free(struct bch_fs *c, struct promote_op *op)
963 {
964         int ret;
965
966         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
967                                      bch_promote_params);
968         BUG_ON(ret);
969         percpu_ref_put(&c->writes);
970         kfree(op);
971 }
972
973 static void promote_done(struct closure *cl)
974 {
975         struct promote_op *op =
976                 container_of(cl, struct promote_op, cl);
977         struct bch_fs *c = op->write.op.c;
978
979         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
980                                op->start_time);
981
982         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
983         promote_free(c, op);
984 }
985
986 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
987 {
988         struct bch_fs *c = rbio->c;
989         struct closure *cl = &op->cl;
990         struct bio *bio = &op->write.op.wbio.bio;
991
992         trace_promote(&rbio->bio);
993
994         /* we now own pages: */
995         BUG_ON(!rbio->bounce);
996         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
997
998         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
999                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1000         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1001
1002         bch2_migrate_read_done(&op->write, rbio);
1003
1004         closure_init(cl, NULL);
1005         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1006         closure_return_with_destructor(cl, promote_done);
1007 }
1008
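/*
 * Set up a cache promotion: allocate the promote op (and, when the extent is
 * too big for the bounce mempool, a separate kmalloc'd read bio), register it
 * in the promote hash table so the same extent isn't promoted twice
 * concurrently, and initialize the migrate write that will rewrite the data
 * to the promote target.
 */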
1009 noinline
1010 static struct promote_op *__promote_alloc(struct bch_fs *c,
1011                                           struct bpos pos,
1012                                           struct extent_ptr_decoded *pick,
1013                                           struct bch_io_opts opts,
1014                                           unsigned rbio_sectors,
1015                                           struct bch_read_bio **rbio)
1016 {
1017         struct promote_op *op = NULL;
1018         struct bio *bio;
1019         unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
1020         /* data might have to be decompressed in the write path: */
1021         unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
1022                                            PAGE_SECTORS);
1023         int ret;
1024
1025         if (!percpu_ref_tryget(&c->writes))
1026                 return NULL;
1027
1028         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
1029                      GFP_NOIO);
1030         if (!op)
1031                 goto err;
1032
1033         op->start_time = local_clock();
1034         op->pos = pos;
1035
1036         /*
1037          * promotes require bouncing, but if the extent isn't
1038          * checksummed/compressed it might be too big for the mempool:
1039          */
1040         if (rbio_sectors > c->sb.encoded_extent_max) {
1041                 *rbio = kzalloc(sizeof(struct bch_read_bio) +
1042                                 sizeof(struct bio_vec) * rbio_pages,
1043                                 GFP_NOIO);
1044                 if (!*rbio)
1045                         goto err;
1046
1047                 rbio_init(&(*rbio)->bio, opts);
1048                 bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs,
1049                          rbio_pages);
1050
1051                 (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
1052                 bch2_bio_map(&(*rbio)->bio, NULL);
1053
1054                 if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
1055                         goto err;
1056
1057                 (*rbio)->bounce         = true;
1058                 (*rbio)->split          = true;
1059                 (*rbio)->kmalloc        = true;
1060         }
1061
1062         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1063                                           bch_promote_params))
1064                 goto err;
1065
1066         bio = &op->write.op.wbio.bio;
1067         bio_init(bio, bio->bi_inline_vecs, wbio_pages);
1068
1069         ret = bch2_migrate_write_init(c, &op->write,
1070                         writepoint_hashed((unsigned long) current),
1071                         opts,
1072                         DATA_PROMOTE,
1073                         (struct data_opts) {
1074                                 .target = opts.promote_target
1075                         },
1076                         bkey_s_c_null);
1077         BUG_ON(ret);
1078
1079         return op;
1080 err:
1081         if (*rbio)
1082                 bio_free_pages(&(*rbio)->bio);
1083         kfree(*rbio);
1084         *rbio = NULL;
1085         kfree(op);
1086         percpu_ref_put(&c->writes);
1087         return NULL;
1088 }
1089
1090 static inline struct promote_op *promote_alloc(struct bch_fs *c,
1091                                                struct bvec_iter iter,
1092                                                struct bkey_s_c k,
1093                                                struct extent_ptr_decoded *pick,
1094                                                struct bch_io_opts opts,
1095                                                unsigned flags,
1096                                                struct bch_read_bio **rbio,
1097                                                bool *bounce,
1098                                                bool *read_full)
1099 {
1100         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1101         unsigned sectors = promote_full
1102                 ? pick->crc.compressed_size
1103                 : bvec_iter_sectors(iter);
1104         struct bpos pos = promote_full
1105                 ? bkey_start_pos(k.k)
1106                 : POS(k.k->p.inode, iter.bi_sector);
1107         struct promote_op *promote;
1108
1109         if (!should_promote(c, k, pos, opts, flags))
1110                 return NULL;
1111
1112         promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
1113         if (!promote)
1114                 return NULL;
1115
1116         *bounce         = true;
1117         *read_full      = promote_full;
1118         return promote;
1119 }
1120
1121 /* Read */
1122
1123 #define READ_RETRY_AVOID        1
1124 #define READ_RETRY              2
1125 #define READ_ERR                3
1126
1127 enum rbio_context {
1128         RBIO_CONTEXT_NULL,
1129         RBIO_CONTEXT_HIGHPRI,
1130         RBIO_CONTEXT_UNBOUND,
1131 };
1132
1133 static inline struct bch_read_bio *
1134 bch2_rbio_parent(struct bch_read_bio *rbio)
1135 {
1136         return rbio->split ? rbio->parent : rbio;
1137 }
1138
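/*
 * Read completion work may need a different context (e.g. decompression or
 * decryption can't run in interrupt context): if the current rbio context is
 * already at least @context, run @fn directly, otherwise punt it to @wq and
 * record the new context.
 */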
1139 __always_inline
1140 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1141                            enum rbio_context context,
1142                            struct workqueue_struct *wq)
1143 {
1144         if (context <= rbio->context) {
1145                 fn(&rbio->work);
1146         } else {
1147                 rbio->work.func         = fn;
1148                 rbio->context           = context;
1149                 queue_work(wq, &rbio->work);
1150         }
1151 }
1152
1153 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1154 {
1155         BUG_ON(rbio->bounce && !rbio->split);
1156
1157         if (rbio->promote)
1158                 promote_free(rbio->c, rbio->promote);
1159         rbio->promote = NULL;
1160
1161         if (rbio->bounce)
1162                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1163
1164         if (rbio->split) {
1165                 struct bch_read_bio *parent = rbio->parent;
1166
1167                 if (rbio->kmalloc)
1168                         kfree(rbio);
1169                 else
1170                         bio_put(&rbio->bio);
1171
1172                 rbio = parent;
1173         }
1174
1175         return rbio;
1176 }
1177
1178 static void bch2_rbio_done(struct bch_read_bio *rbio)
1179 {
1180         bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1181                                rbio->start_time);
1182         bio_endio(&rbio->bio);
1183 }
1184
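/*
 * Read retry paths: the nodecode variant is for BCH_READ_NODECODE reads
 * (where the data is returned without being decrypted/decompressed) and just
 * re-reads the same extent, marking the rbio as a hole if that extent no
 * longer exists; the normal retry below re-walks the extents btree for the
 * byte range being read.
 */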
1185 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1186                                      struct bvec_iter bvec_iter, u64 inode,
1187                                      struct bch_io_failures *failed,
1188                                      unsigned flags)
1189 {
1190         struct btree_iter iter;
1191         BKEY_PADDED(k) tmp;
1192         struct bkey_s_c k;
1193         int ret;
1194
1195         flags &= ~BCH_READ_LAST_FRAGMENT;
1196
1197         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
1198                              rbio->pos, BTREE_ITER_SLOTS);
1199 retry:
1200         rbio->bio.bi_status = 0;
1201
1202         k = bch2_btree_iter_peek_slot(&iter);
1203         if (btree_iter_err(k)) {
1204                 bch2_btree_iter_unlock(&iter);
1205                 goto err;
1206         }
1207
1208         bkey_reassemble(&tmp.k, k);
1209         k = bkey_i_to_s_c(&tmp.k);
1210         bch2_btree_iter_unlock(&iter);
1211
1212         if (!bkey_extent_is_data(k.k) ||
1213             !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
1214                                      rbio->pick.ptr,
1215                                      rbio->pos.offset -
1216                                      rbio->pick.crc.offset)) {
1217                 /* extent we wanted to read no longer exists: */
1218                 rbio->hole = true;
1219                 goto out;
1220         }
1221
1222         ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1223         if (ret == READ_RETRY)
1224                 goto retry;
1225         if (ret)
1226                 goto err;
1227         goto out;
1228 err:
1229         rbio->bio.bi_status = BLK_STS_IOERR;
1230 out:
1231         bch2_rbio_done(rbio);
1232 }
1233
1234 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1235                             struct bvec_iter bvec_iter, u64 inode,
1236                             struct bch_io_failures *failed, unsigned flags)
1237 {
1238         struct btree_iter iter;
1239         struct bkey_s_c k;
1240         int ret;
1241
1242         flags &= ~BCH_READ_LAST_FRAGMENT;
1243         flags |= BCH_READ_MUST_CLONE;
1244 retry:
1245         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
1246                            POS(inode, bvec_iter.bi_sector),
1247                            BTREE_ITER_SLOTS, k) {
1248                 BKEY_PADDED(k) tmp;
1249                 unsigned bytes;
1250
1251                 bkey_reassemble(&tmp.k, k);
1252                 k = bkey_i_to_s_c(&tmp.k);
1253                 bch2_btree_iter_unlock(&iter);
1254
1255                 bytes = min_t(unsigned, bvec_iter.bi_size,
1256                               (k.k->p.offset - bvec_iter.bi_sector) << 9);
1257                 swap(bvec_iter.bi_size, bytes);
1258
1259                 ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1260                 switch (ret) {
1261                 case READ_RETRY:
1262                         goto retry;
1263                 case READ_ERR:
1264                         goto err;
1265                 }
1266
1267                 if (bytes == bvec_iter.bi_size)
1268                         goto out;
1269
1270                 swap(bvec_iter.bi_size, bytes);
1271                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1272         }
1273
1274         /*
1275          * If we get here, it better have been because there was an error
1276          * reading a btree node
1277          */
1278         ret = bch2_btree_iter_unlock(&iter);
1279         BUG_ON(!ret);
1280         __bcache_io_error(c, "btree IO error %i", ret);
1281 err:
1282         rbio->bio.bi_status = BLK_STS_IOERR;
1283 out:
1284         bch2_rbio_done(rbio);
1285 }
1286
1287 static void bch2_rbio_retry(struct work_struct *work)
1288 {
1289         struct bch_read_bio *rbio =
1290                 container_of(work, struct bch_read_bio, work);
1291         struct bch_fs *c        = rbio->c;
1292         struct bvec_iter iter   = rbio->bvec_iter;
1293         unsigned flags          = rbio->flags;
1294         u64 inode               = rbio->pos.inode;
1295         struct bch_io_failures failed = { .nr = 0 };
1296
1297         trace_read_retry(&rbio->bio);
1298
1299         if (rbio->retry == READ_RETRY_AVOID)
1300                 bch2_mark_io_failure(&failed, &rbio->pick);
1301
1302         rbio->bio.bi_status = 0;
1303
1304         rbio = bch2_rbio_free(rbio);
1305
1306         flags |= BCH_READ_IN_RETRY;
1307         flags &= ~BCH_READ_MAY_PROMOTE;
1308
1309         if (flags & BCH_READ_NODECODE)
1310                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1311         else
1312                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1313 }
1314
1315 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1316                             blk_status_t error)
1317 {
1318         rbio->retry = retry;
1319
1320         if (rbio->flags & BCH_READ_IN_RETRY)
1321                 return;
1322
1323         if (retry == READ_ERR) {
1324                 rbio = bch2_rbio_free(rbio);
1325
1326                 rbio->bio.bi_status = error;
1327                 bch2_rbio_done(rbio);
1328         } else {
1329                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1330                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1331         }
1332 }
1333
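/*
 * Opportunistically rewrite the extent we just read with a narrowed checksum:
 * if the existing crc covers more data than is currently live (e.g. the
 * extent was partially overwritten), recompute the checksum over just the
 * live region from the data we just read and verified, and update the key -
 * best effort only (BTREE_INSERT_NOWAIT, bailing out if the extent changed).
 */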
1334 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1335 {
1336         struct bch_fs *c = rbio->c;
1337         struct btree_iter iter;
1338         struct bkey_s_c k;
1339         struct bkey_i_extent *e;
1340         BKEY_PADDED(k) new;
1341         struct bch_extent_crc_unpacked new_crc;
1342         unsigned offset;
1343         int ret;
1344
1345         if (rbio->pick.crc.compression_type)
1346                 return;
1347
1348         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos,
1349                              BTREE_ITER_INTENT);
1350 retry:
1351         k = bch2_btree_iter_peek(&iter);
1352         if (IS_ERR_OR_NULL(k.k))
1353                 goto out;
1354
1355         if (!bkey_extent_is_data(k.k))
1356                 goto out;
1357
1358         bkey_reassemble(&new.k, k);
1359         e = bkey_i_to_extent(&new.k);
1360
1361         if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
1362                                      rbio->pick.ptr,
1363                                      rbio->pos.offset -
1364                                      rbio->pick.crc.offset) ||
1365             bversion_cmp(e->k.version, rbio->version))
1366                 goto out;
1367
1368         /* Extent was merged? */
1369         if (bkey_start_offset(&e->k) < rbio->pos.offset ||
1370             e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
1371                 goto out;
1372
1373         /* The extent might have been partially overwritten since we read it: */
1374         offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
1375
1376         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1377                                 rbio->pick.crc, NULL, &new_crc,
1378                                 offset, e->k.size,
1379                                 rbio->pick.crc.csum_type)) {
1380                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1381                 goto out;
1382         }
1383
1384         if (!bch2_extent_narrow_crcs(e, new_crc))
1385                 goto out;
1386
1387         ret = bch2_btree_insert_at(c, NULL, NULL,
1388                                    BTREE_INSERT_ATOMIC|
1389                                    BTREE_INSERT_NOFAIL|
1390                                    BTREE_INSERT_NOWAIT,
1391                                    BTREE_INSERT_ENTRY(&iter, &e->k_i));
1392         if (ret == -EINTR)
1393                 goto retry;
1394 out:
1395         bch2_btree_iter_unlock(&iter);
1396 }
1397
1398 static bool should_narrow_crcs(struct bkey_s_c k,
1399                                struct extent_ptr_decoded *pick,
1400                                unsigned flags)
1401 {
1402         return !(flags & BCH_READ_IN_RETRY) &&
1403                 bkey_extent_is_data(k.k) &&
1404                 bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
1405 }
1406
1407 /* Inner part that may run in process context */
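/*
 * Verify the checksum over the full (bounced) extent, optionally narrow the
 * stored crcs, then decrypt and decompress (or just copy) the live range into
 * the caller's bio.  On a checksum mismatch the read is retried - bounced, if
 * it previously read into user-mapped pages that userspace could have
 * scribbled on - or retried on a different device.
 */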
1408 static void __bch2_read_endio(struct work_struct *work)
1409 {
1410         struct bch_read_bio *rbio =
1411                 container_of(work, struct bch_read_bio, work);
1412         struct bch_fs *c        = rbio->c;
1413         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1414         struct bio *src         = &rbio->bio;
1415         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1416         struct bvec_iter dst_iter = rbio->bvec_iter;
1417         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1418         struct nonce nonce = extent_nonce(rbio->version, crc);
1419         struct bch_csum csum;
1420
1421         /* Reset iterator for checksumming and copying bounced data: */
1422         if (rbio->bounce) {
1423                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1424                 src->bi_iter.bi_idx             = 0;
1425                 src->bi_iter.bi_bvec_done       = 0;
1426         } else {
1427                 src->bi_iter                    = rbio->bvec_iter;
1428         }
1429
1430         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1431         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1432                 goto csum_err;
1433
1434         if (unlikely(rbio->narrow_crcs))
1435                 bch2_rbio_narrow_crcs(rbio);
1436
1437         if (rbio->flags & BCH_READ_NODECODE)
1438                 goto nodecode;
1439
1440         /* Adjust crc to point to subset of data we want: */
1441         crc.offset     += rbio->bvec_iter.bi_sector - rbio->pos.offset;
1442         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1443
1444         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1445                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1446                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1447                         goto decompression_err;
1448         } else {
1449                 /* don't need to decrypt the entire bio: */
1450                 nonce = nonce_add(nonce, crc.offset << 9);
1451                 bio_advance(src, crc.offset << 9);
1452
1453                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1454                 src->bi_iter.bi_size = dst_iter.bi_size;
1455
1456                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1457
1458                 if (rbio->bounce) {
1459                         struct bvec_iter src_iter = src->bi_iter;
1460                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1461                 }
1462         }
1463
1464         if (rbio->promote) {
1465                 /*
1466                  * Re-encrypt the data we decrypted, so that it's consistent
1467                  * with rbio->crc:
1468                  */
1469                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1470                 promote_start(rbio->promote, rbio);
1471                 rbio->promote = NULL;
1472         }
1473 nodecode:
1474         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1475                 rbio = bch2_rbio_free(rbio);
1476                 bch2_rbio_done(rbio);
1477         }
1478         return;
1479 csum_err:
1480         /*
1481          * Checksum error: if the bio wasn't bounced, we may have been
1482          * reading into buffers owned by userspace (that userspace can
1483          * scribble over) - retry the read, bouncing it this time:
1484          */
1485         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1486                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1487                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1488                 return;
1489         }
1490
1491         bch2_dev_io_error(ca,
1492                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1493                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1494                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1495                 csum.hi, csum.lo, crc.csum_type);
1496         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1497         return;
1498 decompression_err:
1499         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1500                           rbio->pos.inode,
1501                           (u64) rbio->bvec_iter.bi_sector);
1502         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1503         return;
1504 }
1505
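/*
 * Read completion, light half: called from bio completion (possibly in
 * interrupt context).  Accounts IO latency, handles outright IO errors and
 * stale cached pointers, then hands off to __bch2_read_endio() via
 * bch2_rbio_punt() - using a workqueue when there's checksumming, decryption
 * or decompression work to do.
 */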
1506 static void bch2_read_endio(struct bio *bio)
1507 {
1508         struct bch_read_bio *rbio =
1509                 container_of(bio, struct bch_read_bio, bio);
1510         struct bch_fs *c        = rbio->c;
1511         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1512         struct workqueue_struct *wq = NULL;
1513         enum rbio_context context = RBIO_CONTEXT_NULL;
1514
1515         if (rbio->have_ioref) {
1516                 bch2_latency_acct(ca, rbio->submit_time, READ);
1517                 percpu_ref_put(&ca->io_ref);
1518         }
1519
1520         if (!rbio->split)
1521                 rbio->bio.bi_end_io = rbio->end_io;
1522
1523         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1524                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1525                 return;
1526         }
1527
1528         if (rbio->pick.ptr.cached &&
1529             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1530              ptr_stale(ca, &rbio->pick.ptr))) {
1531                 atomic_long_inc(&c->read_realloc_races);
1532
1533                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1534                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1535                 else
1536                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1537                 return;
1538         }
1539
1540         if (rbio->narrow_crcs ||
1541             rbio->pick.crc.compression_type ||
1542             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1543                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1544         else if (rbio->pick.crc.csum_type)
1545                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1546
1547         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1548 }
1549
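/*
 * Read a single extent (or fragment of one): pick a device to read from, work
 * out whether the read has to be bounced and/or read in full (because of
 * compression, encryption, or checksums covering more than we're reading),
 * optionally set up a promote, then allocate/clone an rbio and submit it.
 *
 * Returns 0 normally; in the BCH_READ_IN_RETRY case the read is done
 * synchronously and a READ_* retry code is returned instead.
 */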
1550 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1551                        struct bvec_iter iter, struct bkey_s_c k,
1552                        struct bch_io_failures *failed, unsigned flags)
1553 {
1554         struct extent_ptr_decoded pick;
1555         struct bch_read_bio *rbio = NULL;
1556         struct bch_dev *ca;
1557         struct promote_op *promote = NULL;
1558         bool bounce = false, read_full = false, narrow_crcs = false;
1559         struct bpos pos = bkey_start_pos(k.k);
1560         int pick_ret;
1561
1562         pick_ret = bch2_extent_pick_ptr(c, k, failed, &pick);
1563
1564         /* hole or reservation - just zero fill: */
1565         if (!pick_ret)
1566                 goto hole;
1567
1568         if (pick_ret < 0)
1569                 goto no_device;
1570
1571         if (pick_ret > 0)
1572                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1573
1574         if (flags & BCH_READ_NODECODE) {
1575                 /*
1576                  * This can happen if we retry, and the extent we were going to
1577                  * read has been merged in the meantime:
1578                  */
1579                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1580                         goto hole;
1581
1582                 iter.bi_sector  = pos.offset;
1583                 iter.bi_size    = pick.crc.compressed_size << 9;
1584                 goto noclone;
1585         }
1586
1587         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1588             bio_flagged(&orig->bio, BIO_CHAIN))
1589                 flags |= BCH_READ_MUST_CLONE;
1590
1591         narrow_crcs = should_narrow_crcs(k, &pick, flags);
1592
1593         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1594                 flags |= BCH_READ_MUST_BOUNCE;
1595
1596         EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
1597                 k.k->p.offset < bvec_iter_end_sector(iter));
1598
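        /*
         * Read the full extent, into a bounce buffer, if it's compressed, or
         * if it's checksummed and either we're only reading part of it, it's
         * encrypted and destined for user-mapped memory, or bouncing has
         * already been forced:
         */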
1599         if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
1600             (pick.crc.csum_type != BCH_CSUM_NONE &&
1601              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1602               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
1603                (flags & BCH_READ_USER_MAPPED)) ||
1604               (flags & BCH_READ_MUST_BOUNCE)))) {
1605                 read_full = true;
1606                 bounce = true;
1607         }
1608
1609         promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
1610                                 &rbio, &bounce, &read_full);
1611
1612         if (!read_full) {
1613                 EBUG_ON(pick.crc.compression_type);
1614                 EBUG_ON(pick.crc.csum_type &&
1615                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1616                          bvec_iter_sectors(iter) != pick.crc.live_size ||
1617                          pick.crc.offset ||
1618                          iter.bi_sector != pos.offset));
1619
1620                 pick.ptr.offset += pick.crc.offset +
1621                         (iter.bi_sector - pos.offset);
1622                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
1623                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
1624                 pick.crc.offset                 = 0;
1625                 pick.crc.live_size              = bvec_iter_sectors(iter);
1626                 pos.offset                      = iter.bi_sector;
1627         }
1628
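        /*
         * Get an rbio to do the read with: reuse the bounce rbio that
         * promote_alloc() may have set up, allocate a fresh bounce rbio, clone
         * the original bio, or (noclone) just use the original rbio directly:
         */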
1629         if (rbio) {
1630                 /* promote already allocated bounce rbio */
1631         } else if (bounce) {
1632                 unsigned sectors = pick.crc.compressed_size;
1633
1634                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
1635                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
1636                                                   &c->bio_read_split),
1637                                  orig->opts);
1638
1639                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1640                 rbio->bounce    = true;
1641                 rbio->split     = true;
1642         } else if (flags & BCH_READ_MUST_CLONE) {
1643                 /*
1644                  * Have to clone if there were any splits, because of how
1645                  * error reporting works: if a split errored and retrying
1646                  * didn't fix it, the error it reports to its parent (us)
1647                  * doesn't tell us whether it came from our fragment (in which
1648                  * case we should retry) or from the bio as a whole (in which
1649                  * case retrying would just lose the error).
1650                  */
1651                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
1652                                                 &c->bio_read_split),
1653                                  orig->opts);
1654                 rbio->bio.bi_iter = iter;
1655                 rbio->split     = true;
1656         } else {
1657 noclone:
1658                 rbio = orig;
1659                 rbio->bio.bi_iter = iter;
1660                 BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1661         }
1662
1663         BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
1664
1665         rbio->c                 = c;
1666         rbio->submit_time       = local_clock();
1667         if (rbio->split)
1668                 rbio->parent    = orig;
1669         else
1670                 rbio->end_io    = orig->bio.bi_end_io;
1671         rbio->bvec_iter         = iter;
1672         rbio->flags             = flags;
1673         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
1674         rbio->narrow_crcs       = narrow_crcs;
1675         rbio->hole              = 0;
1676         rbio->retry             = 0;
1677         rbio->context           = 0;
1678         rbio->devs_have         = bch2_bkey_devs(k);
1679         rbio->pick              = pick;
1680         rbio->pos               = pos;
1681         rbio->version           = k.k->version;
1682         rbio->promote           = promote;
1683         INIT_WORK(&rbio->work, NULL);
1684
1685         rbio->bio.bi_opf        = orig->bio.bi_opf;
1686         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
1687         rbio->bio.bi_end_io     = bch2_read_endio;
1688
1689         if (rbio->bounce)
1690                 trace_read_bounce(&rbio->bio);
1691
1692         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1693
1694         if (!rbio->have_ioref)
1695                 goto no_device_postclone;
1696
1697         percpu_down_read_preempt_disable(&c->usage_lock);
1698         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
1699         percpu_up_read_preempt_enable(&c->usage_lock);
1700
1701         this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
1702                      bio_sectors(&rbio->bio));
1703
1704         bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
1705
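        /*
         * Normal path: submit asynchronously, bumping the parent's remaining
         * count if more fragments will follow.  In the BCH_READ_IN_RETRY path
         * we submit synchronously, run the endio by hand, and return a READ_*
         * code so the caller can decide how to retry.
         */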
1706         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1707                 if (!(flags & BCH_READ_LAST_FRAGMENT)) {
1708                         bio_inc_remaining(&orig->bio);
1709                         trace_read_split(&orig->bio);
1710                 }
1711
1712                 submit_bio(&rbio->bio);
1713                 return 0;
1714         } else {
1715                 int ret;
1716
1717                 submit_bio_wait(&rbio->bio);
1718
1719                 rbio->context = RBIO_CONTEXT_UNBOUND;
1720                 bch2_read_endio(&rbio->bio);
1721
1722                 ret = rbio->retry;
1723                 rbio = bch2_rbio_free(rbio);
1724
1725                 if (ret == READ_RETRY_AVOID) {
1726                         bch2_mark_io_failure(failed, &pick);
1727                         ret = READ_RETRY;
1728                 }
1729
1730                 return ret;
1731         }
1732
1733 no_device_postclone:
1734         if (!rbio->split)
1735                 rbio->bio.bi_end_io = rbio->end_io;
1736         bch2_rbio_free(rbio);
1737 no_device:
1738         __bcache_io_error(c, "no device to read from");
1739
1740         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1741                 orig->bio.bi_status = BLK_STS_IOERR;
1742
1743                 if (flags & BCH_READ_LAST_FRAGMENT)
1744                         bch2_rbio_done(orig);
1745                 return 0;
1746         } else {
1747                 return READ_ERR;
1748         }
1749
1750 hole:
1751         /*
1752          * Holes won't normally happen in the BCH_READ_NODECODE
1753          * (bch2_move_extent()) path, but if we retry and the extent we wanted
1754          * to read no longer exists, we have to signal that:
1755          */
1756         if (flags & BCH_READ_NODECODE)
1757                 orig->hole = true;
1758
1759         zero_fill_bio_iter(&orig->bio, iter);
1760
1761         if (flags & BCH_READ_LAST_FRAGMENT)
1762                 bch2_rbio_done(orig);
1763         return 0;
1764 }
1765
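/*
 * Entry point for a normal (user-mapped, non-retry) read: walk the extents
 * btree over the bio's range, issuing a read for each extent fragment and
 * marking the last one so completion is signalled exactly once.
 */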
1766 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
1767 {
1768         struct btree_iter iter;
1769         struct bkey_s_c k;
1770         unsigned flags = BCH_READ_RETRY_IF_STALE|
1771                 BCH_READ_MAY_PROMOTE|
1772                 BCH_READ_USER_MAPPED;
1773         int ret;
1774
1775         BUG_ON(rbio->_state);
1776         BUG_ON(flags & BCH_READ_NODECODE);
1777         BUG_ON(flags & BCH_READ_IN_RETRY);
1778
1779         rbio->c = c;
1780         rbio->start_time = local_clock();
1781
1782         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
1783                            POS(inode, rbio->bio.bi_iter.bi_sector),
1784                            BTREE_ITER_SLOTS, k) {
1785                 BKEY_PADDED(k) tmp;
1786                 unsigned bytes;
1787
1788                 /*
1789                  * Unlock the iterator while the btree node's lock is still in
1790                  * cache, before doing the IO:
1791                  */
1792                 bkey_reassemble(&tmp.k, k);
1793                 k = bkey_i_to_s_c(&tmp.k);
1794                 bch2_btree_iter_unlock(&iter);
1795
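                /*
                 * Temporarily restrict the bio's iterator to just the part
                 * covered by this extent, issue the read, then (unless this
                 * was the last fragment) restore the full size and advance
                 * past what we just read:
                 */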
1796                 bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
1797                               (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
1798                 swap(rbio->bio.bi_iter.bi_size, bytes);
1799
1800                 if (rbio->bio.bi_iter.bi_size == bytes)
1801                         flags |= BCH_READ_LAST_FRAGMENT;
1802
1803                 bch2_read_extent(c, rbio, k, flags);
1804
1805                 if (flags & BCH_READ_LAST_FRAGMENT)
1806                         return;
1807
1808                 swap(rbio->bio.bi_iter.bi_size, bytes);
1809                 bio_advance(&rbio->bio, bytes);
1810         }
1811
1812         /*
1813          * If we get here, it must have been because there was an error
1814          * reading a btree node:
1815          */
1816         ret = bch2_btree_iter_unlock(&iter);
1817         BUG_ON(!ret);
1818         bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
1819         bch2_rbio_done(rbio);
1820 }
1821
1822 void bch2_fs_io_exit(struct bch_fs *c)
1823 {
1824         if (c->promote_table.tbl)
1825                 rhashtable_destroy(&c->promote_table);
1826         mempool_exit(&c->bio_bounce_pages);
1827         bioset_exit(&c->bio_write);
1828         bioset_exit(&c->bio_read_split);
1829         bioset_exit(&c->bio_read);
1830 }
1831
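/*
 * Set up the read/write biosets, the bounce-page mempool (sized for the larger
 * of a btree node and the maximum encoded extent), and the promote rhashtable.
 */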
1832 int bch2_fs_io_init(struct bch_fs *c)
1833 {
1834         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
1835                         BIOSET_NEED_BVECS) ||
1836             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
1837                         BIOSET_NEED_BVECS) ||
1838             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
1839                         BIOSET_NEED_BVECS) ||
1840             mempool_init_page_pool(&c->bio_bounce_pages,
1841                                    max_t(unsigned,
1842                                          c->opts.btree_node_size,
1843                                          c->sb.encoded_extent_max) /
1844                                    PAGE_SECTORS, 0) ||
1845             rhashtable_init(&c->promote_table, &bch_promote_params))
1846                 return -ENOMEM;
1847
1848         return 0;
1849 }