1 /*
2  * Some low level IO code, and hacks for various block layer limitations
3  *
4  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5  * Copyright 2012 Google, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "alloc.h"
10 #include "bset.h"
11 #include "btree_update.h"
12 #include "buckets.h"
13 #include "checksum.h"
14 #include "compress.h"
15 #include "clock.h"
16 #include "debug.h"
17 #include "error.h"
18 #include "extents.h"
19 #include "io.h"
20 #include "journal.h"
21 #include "keylist.h"
22 #include "move.h"
23 #include "super.h"
24 #include "super-io.h"
25
26 #include <linux/blkdev.h>
27 #include <linux/random.h>
28
29 #include <trace/events/bcachefs.h>
30
31 /* Allocate, free from mempool: */
32
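/*
 * Track per-device IO latency as an exponentially weighted moving average
 * (weight 2^-6), updated locklessly with cmpxchg; most samples close to the
 * current average skip the update entirely (see the comment in the loop).
 */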
33 void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw)
34 {
35         u64 now = local_clock();
36         unsigned io_latency = (now >> 10) - submit_time_us;
37         atomic_t *latency = &ca->latency[rw];
38         unsigned old, new, v = atomic_read(latency);
39
40         do {
41                 old = v;
42
43                 /*
44                  * If the IO latency was reasonably close to the current
45                  * running average, skip the update and atomic op most of the
46                  * time (when close, only ~1/32 of samples force an update):
47                  */
48                 if (abs((int) (old - io_latency)) < (old >> 1) &&
49                     now & ~(~0 << 5))
50                         break;
51
52                 new = ewma_add((u64) old, io_latency, 6);
53         } while ((v = atomic_cmpxchg(latency, old, new)) != old);
54 }
55
56 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
57 {
58         struct bio_vec *bv;
59         unsigned i;
60
61         bio_for_each_segment_all(bv, bio, i)
62                 if (bv->bv_page != ZERO_PAGE(0))
63                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
64         bio->bi_vcnt = 0;
65 }
66
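/*
 * Add one page to @bio, preferring plain alloc_page(); on failure, fall back
 * to the c->bio_bounce_pages mempool. bio_bounce_pages_lock is taken on the
 * first mempool allocation and released by the caller once the bio is fully
 * populated (see bch2_bio_alloc_pages_pool() below) - presumably so only one
 * bio at a time draws from the shared pool.
 */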
67 static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
68                                     bool *using_mempool)
69 {
70         struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
71
72         if (likely(!*using_mempool)) {
73                 bv->bv_page = alloc_page(GFP_NOIO);
74                 if (unlikely(!bv->bv_page)) {
75                         mutex_lock(&c->bio_bounce_pages_lock);
76                         *using_mempool = true;
77                         goto pool_alloc;
78
79                 }
80         } else {
81 pool_alloc:
82                 bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
83         }
84
85         bv->bv_len = PAGE_SIZE;
86         bv->bv_offset = 0;
87 }
88
89 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
90                                size_t bytes)
91 {
92         bool using_mempool = false;
93
94         BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
95
96         bio->bi_iter.bi_size = bytes;
97
98         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
99                 bch2_bio_alloc_page_pool(c, bio, &using_mempool);
100
101         if (using_mempool)
102                 mutex_unlock(&c->bio_bounce_pages_lock);
103 }
104
105 void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
106                                     size_t bytes)
107 {
108         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
109                 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
110
111                 BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
112
113                 bv->bv_page = alloc_page(GFP_NOIO);
114                 if (!bv->bv_page) {
115                         /*
116                          * We already allocated from the mempool; we can't
117                          * allocate from it again without first freeing the
118                          * pages we already allocated, or we could deadlock:
119                          */
120                         bch2_bio_free_pages_pool(c, bio);
121                         bch2_bio_alloc_pages_pool(c, bio, bytes);
122                         return;
123                 }
124
125                 bv->bv_len = PAGE_SIZE;
126                 bv->bv_offset = 0;
127                 bio->bi_vcnt++;
128         }
129
130         bio->bi_iter.bi_size = bytes;
131 }
132
133 /* Writes */
134
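/*
 * Submit a write to every device pointer in @k: each pointer except the last
 * gets a clone of @wbio, the last reuses @wbio itself. Sectors written are
 * accounted per device, REQ_FUA is added for devices the journal doesn't
 * flush, and pointers to devices we can't get an io ref on are completed
 * immediately with BLK_STS_REMOVED.
 */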
135 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
136                                enum bch_data_type type,
137                                const struct bkey_i *k)
138 {
139         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
140         const struct bch_extent_ptr *ptr;
141         struct bch_write_bio *n;
142         struct bch_dev *ca;
143
144         BUG_ON(c->opts.nochanges);
145
146         extent_for_each_ptr(e, ptr) {
147                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
148                        !c->devs[ptr->dev]);
149
150                 ca = bch_dev_bkey_exists(c, ptr->dev);
151
152                 if (ptr + 1 < &extent_entry_last(e)->ptr) {
153                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
154                                                    &ca->replica_set));
155
156                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
157                         n->bio.bi_private       = wbio->bio.bi_private;
158                         n->parent               = wbio;
159                         n->split                = true;
160                         n->bounce               = false;
161                         n->put_bio              = true;
162                         n->bio.bi_opf           = wbio->bio.bi_opf;
163                         bio_inc_remaining(&wbio->bio);
164                 } else {
165                         n = wbio;
166                         n->split                = false;
167                 }
168
169                 n->c                    = c;
170                 n->ca                   = ca;
171                 n->submit_time_us       = local_clock_us();
172                 n->bio.bi_iter.bi_sector = ptr->offset;
173
174                 if (!journal_flushes_device(ca))
175                         n->bio.bi_opf |= REQ_FUA;
176
177                 if (likely(percpu_ref_tryget(&ca->io_ref))) {
178                         this_cpu_add(ca->io_done->sectors[WRITE][type],
179                                      bio_sectors(&n->bio));
180
181                         n->have_io_ref          = true;
182                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
183                         submit_bio(&n->bio);
184                 } else {
185                         n->have_io_ref          = false;
186                         n->bio.bi_status        = BLK_STS_REMOVED;
187                         bio_endio(&n->bio);
188                 }
189         }
190 }
191
192 static void __bch2_write(struct closure *);
193
194 static void bch2_write_done(struct closure *cl)
195 {
196         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
197
198         BUG_ON(!(op->flags & BCH_WRITE_DONE));
199
200         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
201                 op->error = bch2_journal_error(&op->c->journal);
202
203         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
204                 bch2_disk_reservation_put(op->c, &op->res);
205         percpu_ref_put(&op->c->writes);
206         bch2_keylist_free(&op->insert_keys, op->inline_keys);
207         op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
208
209         closure_return(cl);
210 }
211
212 static u64 keylist_sectors(struct keylist *keys)
213 {
214         struct bkey_i *k;
215         u64 ret = 0;
216
217         for_each_keylist_key(keys, k)
218                 ret += k->k.size;
219
220         return ret;
221 }
222
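/*
 * Default index update hook: insert the completed keys in op->insert_keys
 * into the extents btree, using the op's disk reservation and journal
 * sequence number.
 */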
223 int bch2_write_index_default(struct bch_write_op *op)
224 {
225         struct keylist *keys = &op->insert_keys;
226         struct btree_iter iter;
227         int ret;
228
229         bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
230                              bkey_start_pos(&bch2_keylist_front(keys)->k),
231                              BTREE_ITER_INTENT);
232
233         ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
234                                        NULL, op_journal_seq(op),
235                                        BTREE_INSERT_NOFAIL);
236         bch2_btree_iter_unlock(&iter);
237
238         return ret;
239 }
240
241 /**
242  * bch2_write_index - after a write, update the index to point to the new data
243  */
244 static void bch2_write_index(struct closure *cl)
245 {
246         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
247         struct bch_fs *c = op->c;
248         struct keylist *keys = &op->insert_keys;
249         struct bkey_s_extent e;
250         struct bch_extent_ptr *ptr;
251         struct bkey_i *src, *dst = keys->keys, *n;
252         int ret;
253
254         op->flags |= BCH_WRITE_LOOPED;
255
256         for (src = keys->keys; src != keys->top; src = n) {
257                 n = bkey_next(src);
258                 bkey_copy(dst, src);
259
260                 e = bkey_i_to_s_extent(dst);
261                 extent_for_each_ptr_backwards(e, ptr)
262                         if (test_bit(ptr->dev, op->failed.d))
263                                 bch2_extent_drop_ptr(e, ptr);
264
265                 if (!bch2_extent_nr_ptrs(e.c)) {
266                         ret = -EIO;
267                         goto err;
268                 }
269
270                 if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
271                         ret = bch2_check_mark_super(c, BCH_DATA_USER,
272                                                     bch2_extent_devs(e.c));
273                         if (ret)
274                                 goto err;
275                 }
276
277                 dst = bkey_next(dst);
278         }
279
280         keys->top = dst;
281
282         if (!bch2_keylist_empty(keys)) {
283                 u64 sectors_start = keylist_sectors(keys);
284                 int ret = op->index_update_fn(op);
285
286                 BUG_ON(keylist_sectors(keys) && !ret);
287
288                 op->written += sectors_start - keylist_sectors(keys);
289
290                 if (ret) {
291                         __bcache_io_error(c, "btree IO error %i", ret);
292                         op->error = ret;
293                 }
294         }
295 out:
296         bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
297
298         if (!(op->flags & BCH_WRITE_DONE))
299                 continue_at(cl, __bch2_write, op->io_wq);
300
301         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
302                 bch2_journal_flush_seq_async(&c->journal,
303                                              *op_journal_seq(op),
304                                              cl);
305                 continue_at(cl, bch2_write_done, index_update_wq(op));
306         } else {
307                 continue_at_nobarrier(cl, bch2_write_done, NULL);
308         }
309         return;
310 err:
311         keys->top = keys->keys;
312         op->error = ret;
313         op->flags |= BCH_WRITE_DONE;
314         goto out;
315 }
316
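/*
 * Per-replica write completion: account latency, mark the device in
 * op->failed on IO error, drop the io ref and any bounce pages, then either
 * complete the parent bio (for split clones) or drop the write op's closure
 * ref.
 */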
317 static void bch2_write_endio(struct bio *bio)
318 {
319         struct closure *cl              = bio->bi_private;
320         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
321         struct bch_write_bio *wbio      = to_wbio(bio);
322         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
323         struct bch_fs *c                = wbio->c;
324         struct bch_dev *ca              = wbio->ca;
325
326         bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
327
328         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
329                 set_bit(ca->dev_idx, op->failed.d);
330
331         if (wbio->have_io_ref)
332                 percpu_ref_put(&ca->io_ref);
333
334         if (wbio->bounce)
335                 bch2_bio_free_pages_pool(c, bio);
336
337         if (wbio->put_bio)
338                 bio_put(bio);
339
340         if (parent)
341                 bio_endio(&parent->bio);
342         else
343                 closure_put(cl);
344 }
345
346 static void init_append_extent(struct bch_write_op *op,
347                                struct write_point *wp,
348                                struct bversion version,
349                                struct bch_extent_crc_unpacked crc)
350 {
351         struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
352
353         op->pos.offset += crc.uncompressed_size;
354         e->k.p = op->pos;
355         e->k.size = crc.uncompressed_size;
356         e->k.version = version;
357         bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
358
359         bch2_extent_crc_append(e, crc);
360         bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size);
361
362         bch2_keylist_push(&op->insert_keys);
363 }
364
365 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
366                                         struct write_point *wp,
367                                         struct bio *src,
368                                         bool *page_alloc_failed)
369 {
370         struct bch_write_bio *wbio;
371         struct bio *bio;
372         unsigned output_available =
373                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
374         unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
375
376         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
377         wbio                    = wbio_init(bio);
378         wbio->bounce            = true;
379         wbio->put_bio           = true;
380         /* copy WRITE_SYNC flag */
381         wbio->bio.bi_opf        = src->bi_opf;
382
383         /*
384          * We can't use mempool for more than c->sb.encoded_extent_max
385          * worth of pages, but we'd like to allocate more if we can:
386          */
387         while (bio->bi_iter.bi_size < output_available) {
388                 unsigned len = min_t(unsigned, PAGE_SIZE,
389                                      output_available - bio->bi_iter.bi_size);
390                 struct page *p;
391
392                 p = alloc_page(GFP_NOIO);
393                 if (!p) {
394                         unsigned pool_max =
395                                 min_t(unsigned, output_available,
396                                       c->sb.encoded_extent_max << 9);
397
398                         if (bio_sectors(bio) < pool_max)
399                                 bch2_bio_alloc_pages_pool(c, bio, pool_max);
400                         break;
401                 }
402
403                 bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
404                         .bv_page        = p,
405                         .bv_len         = len,
406                         .bv_offset      = 0,
407                 };
408                 bio->bi_iter.bi_size += len;
409         }
410
411         *page_alloc_failed = bio->bi_vcnt < pages;
412         return bio;
413 }
414
415 static int bch2_write_rechecksum(struct bch_fs *c,
416                                  struct bch_write_op *op,
417                                  unsigned new_csum_type)
418 {
419         struct bio *bio = &op->wbio.bio;
420         struct bch_extent_crc_unpacked new_crc;
421         int ret;
422
423         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
424
425         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
426             bch2_csum_type_is_encryption(new_csum_type))
427                 new_csum_type = op->crc.csum_type;
428
429         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
430                                   NULL, &new_crc,
431                                   op->crc.offset, op->crc.live_size,
432                                   new_csum_type);
433         if (ret)
434                 return ret;
435
436         bio_advance(bio, op->crc.offset << 9);
437         bio->bi_iter.bi_size = op->crc.live_size << 9;
438         op->crc = new_crc;
439         return 0;
440 }
441
442 static int bch2_write_decrypt(struct bch_write_op *op)
443 {
444         struct bch_fs *c = op->c;
445         struct nonce nonce = extent_nonce(op->version, op->crc);
446         struct bch_csum csum;
447
448         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
449                 return 0;
450
451         /*
452          * If we need to decrypt data in the write path, we'll no longer be able
453          * to verify the existing checksum (poly1305 mac, in this case) after
454          * it's decrypted - this is the last point we'll be able to reverify the
455          * checksum:
456          */
457         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
458         if (bch2_crc_cmp(op->crc.csum, csum))
459                 return -EIO;
460
461         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
462         op->crc.csum_type = 0;
463         op->crc.csum = (struct bch_csum) { 0, 0 };
464         return 0;
465 }
466
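/*
 * Decide what to do with data that arrived already encoded (checksummed,
 * possibly compressed and/or encrypted - BCH_WRITE_DATA_ENCODED): write it
 * out as is (PREP_ENCODED_DO_WRITE), decompress/decrypt/rechecksum it so the
 * normal write path can re-encode it (PREP_ENCODED_OK), or fail with a
 * checksum or decompression error.
 */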
467 static enum prep_encoded_ret {
468         PREP_ENCODED_OK,
469         PREP_ENCODED_ERR,
470         PREP_ENCODED_CHECKSUM_ERR,
471         PREP_ENCODED_DO_WRITE,
472 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
473 {
474         struct bch_fs *c = op->c;
475         struct bio *bio = &op->wbio.bio;
476
477         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
478                 return PREP_ENCODED_OK;
479
480         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
481
482         /* Can we just write the entire extent as is? */
483         if (op->crc.uncompressed_size == op->crc.live_size &&
484             op->crc.compressed_size <= wp->sectors_free &&
485             op->crc.compression_type == op->compression_type) {
486                 if (!op->crc.compression_type &&
487                     op->csum_type != op->crc.csum_type &&
488                     bch2_write_rechecksum(c, op, op->csum_type))
489                         return PREP_ENCODED_CHECKSUM_ERR;
490
491                 return PREP_ENCODED_DO_WRITE;
492         }
493
494         /*
495          * If the data is compressed and we couldn't write the entire extent as
496          * is, we have to decompress it:
497          */
498         if (op->crc.compression_type) {
499                 struct bch_csum csum;
500
501                 if (bch2_write_decrypt(op))
502                         return PREP_ENCODED_CHECKSUM_ERR;
503
504                 /* Last point we can still verify checksum: */
505                 csum = bch2_checksum_bio(c, op->crc.csum_type,
506                                          extent_nonce(op->version, op->crc),
507                                          bio);
508                 if (bch2_crc_cmp(op->crc.csum, csum))
509                         return PREP_ENCODED_CHECKSUM_ERR;
510
511                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
512                         return PREP_ENCODED_ERR;
513         }
514
515         /*
516          * No longer have compressed data after this point - data might be
517          * encrypted:
518          */
519
520         /*
521          * If the data is checksummed and we're only writing a subset,
522          * rechecksum and adjust bio to point to currently live data:
523          */
524         if ((op->crc.live_size != op->crc.uncompressed_size ||
525              op->crc.csum_type != op->csum_type) &&
526             bch2_write_rechecksum(c, op, op->csum_type))
527                 return PREP_ENCODED_CHECKSUM_ERR;
528
529         /*
530          * If we want to compress the data, it has to be decrypted:
531          */
532         if ((op->compression_type ||
533              bch2_csum_type_is_encryption(op->crc.csum_type) !=
534              bch2_csum_type_is_encryption(op->csum_type)) &&
535             bch2_write_decrypt(op))
536                 return PREP_ENCODED_CHECKSUM_ERR;
537
538         return PREP_ENCODED_OK;
539 }
540
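/*
 * Fill one or more extents from the source bio: optionally bounce, compress,
 * checksum and encrypt the data in chunks bounded by the write point's free
 * sectors (and encoded_extent_max when checksumming), appending a key to
 * op->insert_keys for each chunk, then submit the result to all replicas.
 * Returns > 0 if there's input left to write (the caller loops), 0 if done,
 * or a negative error.
 */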
541 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
542 {
543         struct bch_fs *c = op->c;
544         struct bio *src = &op->wbio.bio, *dst = src;
545         struct bvec_iter saved_iter;
546         struct bkey_i *key_to_write;
547         unsigned key_to_write_offset = op->insert_keys.top_p -
548                 op->insert_keys.keys_p;
549         unsigned total_output = 0;
550         bool bounce = false, page_alloc_failed = false;
551         int ret, more = 0;
552
553         BUG_ON(!bio_sectors(src));
554
555         switch (bch2_write_prep_encoded_data(op, wp)) {
556         case PREP_ENCODED_OK:
557                 break;
558         case PREP_ENCODED_ERR:
559                 ret = -EIO;
560                 goto err;
561         case PREP_ENCODED_CHECKSUM_ERR:
562                 goto csum_err;
563         case PREP_ENCODED_DO_WRITE:
564                 init_append_extent(op, wp, op->version, op->crc);
565                 goto do_write;
566         }
567
568         if (op->compression_type ||
569             (op->csum_type &&
570              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
571             (bch2_csum_type_is_encryption(op->csum_type) &&
572              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
573                 dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
574                 bounce = true;
575         }
576
577         saved_iter = dst->bi_iter;
578
579         do {
580                 struct bch_extent_crc_unpacked crc =
581                         (struct bch_extent_crc_unpacked) { 0 };
582                 struct bversion version = op->version;
583                 size_t dst_len, src_len;
584
585                 if (page_alloc_failed &&
586                     bio_sectors(dst) < wp->sectors_free &&
587                     bio_sectors(dst) < c->sb.encoded_extent_max)
588                         break;
589
590                 BUG_ON(op->compression_type &&
591                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
592                        bch2_csum_type_is_encryption(op->crc.csum_type));
593                 BUG_ON(op->compression_type && !bounce);
594
595                 crc.compression_type = op->compression_type
596                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
597                                              op->compression_type)
598                         : 0;
599                 if (!crc.compression_type) {
600                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
601                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
602
603                         if (op->csum_type)
604                                 dst_len = min_t(unsigned, dst_len,
605                                                 c->sb.encoded_extent_max << 9);
606
607                         if (bounce) {
608                                 swap(dst->bi_iter.bi_size, dst_len);
609                                 bio_copy_data(dst, src);
610                                 swap(dst->bi_iter.bi_size, dst_len);
611                         }
612
613                         src_len = dst_len;
614                 }
615
616                 BUG_ON(!src_len || !dst_len);
617
618                 if (bch2_csum_type_is_encryption(op->csum_type)) {
619                         if (bversion_zero(version)) {
620                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
621                         } else {
622                                 crc.nonce = op->nonce;
623                                 op->nonce += src_len >> 9;
624                         }
625                 }
626
627                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
628                     !crc.compression_type &&
629                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
630                     bch2_csum_type_is_encryption(op->csum_type)) {
631                         /*
632                          * Note: when we're using rechecksum(), we need to be
633                          * checksumming @src because it has all the data our
634                          * existing checksum covers - if we bounced (because we
635                          * were trying to compress), @dst will only have the
636                          * part of the data the new checksum will cover.
637                          *
638                          * But normally we want to be checksumming post bounce,
639                          * because part of the reason for bouncing is so the
640                          * data can't be modified (by userspace) while it's in
641                          * flight.
642                          */
643                         if (bch2_rechecksum_bio(c, src, version, op->crc,
644                                         &crc, &op->crc,
645                                         src_len >> 9,
646                                         bio_sectors(src) - (src_len >> 9),
647                                         op->csum_type))
648                                 goto csum_err;
649                 } else {
650                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
651                             bch2_rechecksum_bio(c, src, version, op->crc,
652                                         NULL, &op->crc,
653                                         src_len >> 9,
654                                         bio_sectors(src) - (src_len >> 9),
655                                         op->crc.csum_type))
656                                 goto csum_err;
657
658                         crc.compressed_size     = dst_len >> 9;
659                         crc.uncompressed_size   = src_len >> 9;
660                         crc.live_size           = src_len >> 9;
661
662                         swap(dst->bi_iter.bi_size, dst_len);
663                         bch2_encrypt_bio(c, op->csum_type,
664                                          extent_nonce(version, crc), dst);
665                         crc.csum = bch2_checksum_bio(c, op->csum_type,
666                                          extent_nonce(version, crc), dst);
667                         crc.csum_type = op->csum_type;
668                         swap(dst->bi_iter.bi_size, dst_len);
669                 }
670
671                 init_append_extent(op, wp, version, crc);
672
673                 if (dst != src)
674                         bio_advance(dst, dst_len);
675                 bio_advance(src, src_len);
676                 total_output += dst_len;
677         } while (dst->bi_iter.bi_size &&
678                  src->bi_iter.bi_size &&
679                  wp->sectors_free &&
680                  !bch2_keylist_realloc(&op->insert_keys,
681                                       op->inline_keys,
682                                       ARRAY_SIZE(op->inline_keys),
683                                       BKEY_EXTENT_U64s_MAX));
684
685         more = src->bi_iter.bi_size != 0;
686
687         dst->bi_iter = saved_iter;
688
689         if (!bounce && more) {
690                 dst = bio_split(src, total_output >> 9,
691                                 GFP_NOIO, &c->bio_write);
692                 wbio_init(dst)->put_bio = true;
693         }
694
695         dst->bi_iter.bi_size = total_output;
696
697         /* Free unneeded pages after compressing: */
698         if (bounce)
699                 while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
700                         mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
701                                      &c->bio_bounce_pages);
702 do_write:
703         /* might have done a realloc... */
704
705         key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
706
707         dst->bi_end_io  = bch2_write_endio;
708         dst->bi_private = &op->cl;
709         bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
710
711         closure_get(dst->bi_private);
712
713         bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
714                                   key_to_write);
715         return more;
716 csum_err:
717         bch_err(c, "error verifying existing checksum while "
718                 "rewriting existing data (memory corruption?)");
719         ret = -EIO;
720 err:
721         if (bounce) {
722                 bch2_bio_free_pages_pool(c, dst);
723                 bio_put(dst);
724         }
725
726         return ret;
727 }
728
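/*
 * Main write loop: allocate space from a write point, write out one extent's
 * worth of data, and repeat until the input is consumed or we run out of room
 * for open buckets/keys, at which point we punt to bch2_write_index to insert
 * what we have so far.
 */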
729 static void __bch2_write(struct closure *cl)
730 {
731         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
732         struct bch_fs *c = op->c;
733         struct write_point *wp;
734         int ret;
735
736         do {
737                 if (op->open_buckets_nr + op->nr_replicas >
738                     ARRAY_SIZE(op->open_buckets))
739                         continue_at(cl, bch2_write_index, index_update_wq(op));
740
741                 /* for the device pointers and 1 for the checksum */
742                 if (bch2_keylist_realloc(&op->insert_keys,
743                                         op->inline_keys,
744                                         ARRAY_SIZE(op->inline_keys),
745                                         BKEY_EXTENT_U64s_MAX))
746                         continue_at(cl, bch2_write_index, index_update_wq(op));
747
748                 wp = bch2_alloc_sectors_start(c,
749                         op->devs,
750                         op->write_point,
751                         &op->devs_have,
752                         op->nr_replicas,
753                         op->nr_replicas_required,
754                         op->alloc_reserve,
755                         op->flags,
756                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
757                 EBUG_ON(!wp);
758
759                 if (unlikely(IS_ERR(wp))) {
760                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
761                                 ret = PTR_ERR(wp);
762                                 goto err;
763                         }
764
765                         /*
766                          * If we already have some keys, we must insert them
767                          * first before allocating another open bucket. We
768                          * only hit this case if op->open_buckets_nr > 1.
769                          */
770                         if (!bch2_keylist_empty(&op->insert_keys))
771                                 continue_at(cl, bch2_write_index,
772                                             index_update_wq(op));
773
774                         /*
775                          * If we've looped, we're running from a workqueue -
776                          * not the bch2_write() caller's context - and we don't
777                          * want to block the workqueue:
778                          */
779                         if (op->flags & BCH_WRITE_LOOPED)
780                                 continue_at(cl, __bch2_write, op->io_wq);
781
782                         /*
783                          * Otherwise, we do want to block the caller on alloc
784                          * failure instead of letting it queue up more and more
785                          * writes:
786                          * XXX: this technically needs a try_to_freeze() -
787                          * except that that's not safe because caller may have
788                          * issued other IO... hmm..
789                          */
790                         closure_sync(cl);
791                         continue;
792                 }
793
794                 ret = bch2_write_extent(op, wp);
795
796                 BUG_ON(op->open_buckets_nr + wp->nr_ptrs_can_use >
797                        ARRAY_SIZE(op->open_buckets));
798                 bch2_open_bucket_get(c, wp,
799                                      &op->open_buckets_nr,
800                                      op->open_buckets);
801                 bch2_alloc_sectors_done(c, wp);
802
803                 if (ret < 0)
804                         goto err;
805         } while (ret);
806
807         op->flags |= BCH_WRITE_DONE;
808         continue_at(cl, bch2_write_index, index_update_wq(op));
809 err:
810         /*
811          * Right now we can only error here if we went RO - the
812          * allocation failed, but we already checked for -ENOSPC when we
813          * got our reservation.
814          *
815          * XXX capacity might have changed, but we don't check for that
816          * yet:
817          */
818         op->error = ret;
819         op->flags |= BCH_WRITE_DONE;
820
821         /*
822          * No reason not to insert keys for whatever data was successfully
823          * written (especially for a cmpxchg operation that's moving data
824          * around)
825          */
826         continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
827                     ? bch2_write_index
828                     : bch2_write_done, index_update_wq(op));
829 }
830
831 /**
832  * bch2_write - handle a write to a cache device or flash only volume
833  *
834  * This is the starting point for any data to end up in a cache device; it could
835  * be from a normal write, or a writeback write, or a write to a flash only
836  * volume - it's also used by the moving garbage collector to compact data in
837  * mostly empty buckets.
838  *
839  * It first writes the data to the cache, creating a list of keys to be inserted
840  * (if the data won't fit in a single open bucket, there will be multiple keys);
841  * after the data is written it journals the keys, and after the keys have been
842  * added to the next journal write they're inserted into the btree.
843  *
844  * If op->discard is true, instead of inserting the data it invalidates the
845  * region of the cache represented by op->bio and op->inode.
846  */
847 void bch2_write(struct closure *cl)
848 {
849         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
850         struct bch_fs *c = op->c;
851
852         BUG_ON(!op->nr_replicas);
853         BUG_ON(!op->write_point.v);
854         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
855         BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
856
857         memset(&op->failed, 0, sizeof(op->failed));
858
859         bch2_keylist_init(&op->insert_keys, op->inline_keys);
860         wbio_init(&op->wbio.bio)->put_bio = false;
861
862         if (c->opts.nochanges ||
863             !percpu_ref_tryget(&c->writes)) {
864                 __bcache_io_error(c, "read only");
865                 op->error = -EROFS;
866                 if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
867                         bch2_disk_reservation_put(c, &op->res);
868                 closure_return(cl);
869         }
870
871         bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
872
873         continue_at_nobarrier(cl, __bch2_write, NULL);
874 }
875
876 /* Cache promotion on read */
877
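/*
 * When a read comes from a slower tier (see should_promote()), the bounced
 * read data is rewritten to the fastest tier as a cached copy via a
 * background write op: promote_alloc() sets the op up while the read is in
 * flight, promote_start() kicks off the write from the read completion.
 */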
878 struct promote_op {
879         struct closure          cl;
880         struct migrate_write    write;
881         struct bio_vec          bi_inline_vecs[0]; /* must be last */
882 };
883
884 static void promote_done(struct closure *cl)
885 {
886         struct promote_op *op =
887                 container_of(cl, struct promote_op, cl);
888         struct bch_fs *c = op->write.op.c;
889
890         percpu_ref_put(&c->writes);
891         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
892         kfree(op);
893 }
894
895 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
896 {
897         struct bch_fs *c = rbio->c;
898         struct closure *cl = &op->cl;
899         struct bio *bio = &op->write.op.wbio.bio;
900
901         BUG_ON(!rbio->split || !rbio->bounce);
902
903         if (!percpu_ref_tryget(&c->writes))
904                 return;
905
906         trace_promote(&rbio->bio);
907
908         /* we now own pages: */
909         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
910         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
911         rbio->promote = NULL;
912
913         bch2_write_op_init(&op->write.op, c);
914         op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
915         op->write.op.compression_type =
916                 bch2_compression_opt_to_type(rbio->opts.compression);
917
918         op->write.move_dev      = -1;
919         op->write.op.devs       = c->fastest_devs;
920         op->write.op.write_point = writepoint_hashed((unsigned long) current);
921         op->write.op.flags      |= BCH_WRITE_ALLOC_NOWAIT;
922         op->write.op.flags      |= BCH_WRITE_CACHED;
923
924         bch2_migrate_write_init(&op->write, rbio);
925
926         closure_init(cl, NULL);
927         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
928         closure_return_with_destructor(cl, promote_done);
929 }
930
931 /*
932  * XXX: multiple promotes can race with each other, wastefully. Keep a list of
933  * outstanding promotes?
934  */
935 static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
936 {
937         struct promote_op *op;
938         struct bio *bio;
939         /* data might have to be decompressed in the write path: */
940         unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
941                                       PAGE_SECTORS);
942
943         BUG_ON(!rbio->bounce);
944         BUG_ON(pages < rbio->bio.bi_vcnt);
945
946         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages,
947                      GFP_NOIO);
948         if (!op)
949                 return NULL;
950
951         bio = &op->write.op.wbio.bio;
952         bio_init(bio, bio->bi_inline_vecs, pages);
953
954         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
955                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
956
957         return op;
958 }
959
960 /* only promote if we're not reading from the fastest tier: */
961 static bool should_promote(struct bch_fs *c,
962                            struct extent_pick_ptr *pick, unsigned flags)
963 {
964         if (!(flags & BCH_READ_MAY_PROMOTE))
965                 return false;
966
967         if (percpu_ref_is_dying(&c->writes))
968                 return false;
969
970         return c->fastest_tier &&
971                 c->fastest_tier < c->tiers + pick->ca->mi.tier;
972 }
973
974 /* Read */
975
976 static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *,
977                                      struct bvec_iter, u64,
978                                      struct bch_devs_mask *, unsigned);
979
980 #define READ_RETRY_AVOID        1
981 #define READ_RETRY              2
982 #define READ_ERR                3
983
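/*
 * Read completion work may need to run in progressively "heavier" contexts;
 * bch2_rbio_punt() only queues work when the target context is heavier than
 * the one we're already in, otherwise it runs the function directly.
 */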
984 enum rbio_context {
985         RBIO_CONTEXT_NULL,
986         RBIO_CONTEXT_HIGHPRI,
987         RBIO_CONTEXT_UNBOUND,
988 };
989
990 static inline struct bch_read_bio *
991 bch2_rbio_parent(struct bch_read_bio *rbio)
992 {
993         return rbio->split ? rbio->parent : rbio;
994 }
995
996 __always_inline
997 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
998                            enum rbio_context context,
999                            struct workqueue_struct *wq)
1000 {
1001         if (context <= rbio->context) {
1002                 fn(&rbio->work);
1003         } else {
1004                 rbio->work.func         = fn;
1005                 rbio->context           = context;
1006                 queue_work(wq, &rbio->work);
1007         }
1008 }
1009
1010 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1011 {
1012         struct bch_read_bio *parent = rbio->parent;
1013
1014         BUG_ON(!rbio->split);
1015
1016         if (rbio->promote)
1017                 kfree(rbio->promote);
1018         if (rbio->bounce)
1019                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1020         bio_put(&rbio->bio);
1021
1022         return parent;
1023 }
1024
1025 static void bch2_rbio_done(struct bch_read_bio *rbio)
1026 {
1027         if (rbio->promote)
1028                 kfree(rbio->promote);
1029         rbio->promote = NULL;
1030
1031         if (rbio->split)
1032                 rbio = bch2_rbio_free(rbio);
1033         bio_endio(&rbio->bio);
1034 }
1035
1036 static void bch2_rbio_retry(struct work_struct *work)
1037 {
1038         struct bch_read_bio *rbio =
1039                 container_of(work, struct bch_read_bio, work);
1040         struct bch_fs *c                = rbio->c;
1041         struct bvec_iter iter           = rbio->bvec_iter;
1042         unsigned flags                  = rbio->flags;
1043         u64 inode                       = rbio->pos.inode;
1044         struct bch_devs_mask avoid;
1045
1046         trace_read_retry(&rbio->bio);
1047
1048         memset(&avoid, 0, sizeof(avoid));
1049
1050         if (rbio->retry == READ_RETRY_AVOID)
1051                 __set_bit(rbio->pick.ca->dev_idx, avoid.d);
1052
1053         if (rbio->promote)
1054                 kfree(rbio->promote);
1055         rbio->promote = NULL;
1056
1057         if (rbio->split)
1058                 rbio = bch2_rbio_free(rbio);
1059         else
1060                 rbio->bio.bi_status = 0;
1061
1062         if (!(flags & BCH_READ_NODECODE))
1063                 flags |= BCH_READ_MUST_CLONE;
1064         flags |= BCH_READ_IN_RETRY;
1065         flags &= ~BCH_READ_MAY_PROMOTE;
1066
1067         if (flags & BCH_READ_NODECODE)
1068                 bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags);
1069         else
1070                 __bch2_read(c, rbio, iter, inode, &avoid, flags);
1071 }
1072
1073 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1074                             blk_status_t error)
1075 {
1076         rbio->retry = retry;
1077
1078         if (rbio->flags & BCH_READ_IN_RETRY)
1079                 return;
1080
1081         if (retry == READ_ERR) {
1082                 bch2_rbio_parent(rbio)->bio.bi_status = error;
1083                 bch2_rbio_done(rbio);
1084         } else {
1085                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1086                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1087         }
1088 }
1089
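/*
 * After a successful read we may be able to "narrow" the extent's stored crc
 * so it only covers the live portion of the extent: re-look up the extent,
 * verify it still matches what we read, recompute the checksum over the live
 * range, and do an atomic btree update (bailing out quietly if anything
 * changed underneath us).
 */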
1090 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1091 {
1092         struct bch_fs *c = rbio->c;
1093         struct btree_iter iter;
1094         struct bkey_s_c k;
1095         struct bkey_i_extent *e;
1096         BKEY_PADDED(k) new;
1097         struct bch_extent_crc_unpacked new_crc;
1098         unsigned offset;
1099         int ret;
1100
1101         if (rbio->pick.crc.compression_type)
1102                 return;
1103
1104         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos,
1105                              BTREE_ITER_INTENT);
1106 retry:
1107         k = bch2_btree_iter_peek(&iter);
1108         if (IS_ERR_OR_NULL(k.k))
1109                 goto out;
1110
1111         if (!bkey_extent_is_data(k.k))
1112                 goto out;
1113
1114         bkey_reassemble(&new.k, k);
1115         e = bkey_i_to_extent(&new.k);
1116
1117         if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
1118                                      rbio->pick.ptr,
1119                                      rbio->pos.offset -
1120                                      rbio->pick.crc.offset) ||
1121             bversion_cmp(e->k.version, rbio->version))
1122                 goto out;
1123
1124         /* Extent was merged? */
1125         if (bkey_start_offset(&e->k) < rbio->pos.offset ||
1126             e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
1127                 goto out;
1128
1129         /* The extent might have been partially overwritten since we read it: */
1130         offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
1131
1132         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1133                                 rbio->pick.crc, NULL, &new_crc,
1134                                 offset, e->k.size,
1135                                 rbio->pick.crc.csum_type)) {
1136                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1137                 goto out;
1138         }
1139
1140         if (!bch2_extent_narrow_crcs(e, new_crc))
1141                 goto out;
1142
1143         ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
1144                                    BTREE_INSERT_ATOMIC|
1145                                    BTREE_INSERT_NOFAIL|
1146                                    BTREE_INSERT_NOWAIT,
1147                                    BTREE_INSERT_ENTRY(&iter, &e->k_i));
1148         if (ret == -EINTR)
1149                 goto retry;
1150 out:
1151         bch2_btree_iter_unlock(&iter);
1152 }
1153
1154 static bool should_narrow_crcs(struct bkey_s_c_extent e,
1155                                struct extent_pick_ptr *pick,
1156                                unsigned flags)
1157 {
1158         return !(flags & BCH_READ_IN_RETRY) &&
1159                 bch2_can_narrow_extent_crcs(e, pick->crc);
1160 }
1161
1162 /* Inner part that may run in process context */
1163 static void __bch2_read_endio(struct work_struct *work)
1164 {
1165         struct bch_read_bio *rbio =
1166                 container_of(work, struct bch_read_bio, work);
1167         struct bch_fs *c = rbio->c;
1168         struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio;
1169         struct bvec_iter dst_iter = rbio->bvec_iter;
1170         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1171         struct nonce nonce = extent_nonce(rbio->version, crc);
1172         struct bch_csum csum;
1173
1174         /* Reset iterator for checksumming and copying bounced data: */
1175         if (rbio->bounce) {
1176                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1177                 src->bi_iter.bi_idx             = 0;
1178                 src->bi_iter.bi_bvec_done       = 0;
1179         } else {
1180                 src->bi_iter                    = rbio->bvec_iter;
1181         }
1182
1183         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1184         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1185                 goto csum_err;
1186
1187         if (unlikely(rbio->narrow_crcs))
1188                 bch2_rbio_narrow_crcs(rbio);
1189
1190         if (rbio->flags & BCH_READ_NODECODE)
1191                 goto nodecode;
1192
1193         /* Adjust crc to point to subset of data we want: */
1194         crc.offset     += rbio->bvec_iter.bi_sector - rbio->pos.offset;
1195         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1196
1197         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1198                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1199                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1200                         goto decompression_err;
1201         } else {
1202                 /* don't need to decrypt the entire bio: */
1203                 nonce = nonce_add(nonce, crc.offset << 9);
1204                 bio_advance(src, crc.offset << 9);
1205
1206                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1207                 src->bi_iter.bi_size = dst_iter.bi_size;
1208
1209                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1210
1211                 if (rbio->bounce) {
1212                         struct bvec_iter src_iter = src->bi_iter;
1213                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1214                 }
1215         }
1216
1217         if (rbio->promote) {
1218                 /*
1219                  * Re encrypt data we decrypted, so it's consistent with
1220                  * rbio->crc:
1221                  */
1222                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1223                 promote_start(rbio->promote, rbio);
1224         }
1225 nodecode:
1226         if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
1227                 bch2_rbio_done(rbio);
1228         return;
1229 csum_err:
1230         /*
1231          * Checksum error: if the bio wasn't bounced, we may have been
1232          * reading into buffers owned by userspace (that userspace can
1233          * scribble over) - retry the read, bouncing it this time:
1234          */
1235         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1236                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1237                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1238                 return;
1239         }
1240
1241         bch2_dev_io_error(rbio->pick.ca,
1242                 "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
1243                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1244                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1245                 csum.hi, csum.lo, crc.csum_type);
1246         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1247         return;
1248 decompression_err:
1249         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1250                           rbio->pos.inode,
1251                           (u64) rbio->bvec_iter.bi_sector);
1252         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1253         return;
1254 }
1255
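/*
 * Bio completion for reads: account latency, drop the device io ref, retry
 * (avoiding this device) on IO error, retry if a cached pointer went stale
 * under us, and otherwise punt checksum/decrypt/decompress work to a suitable
 * workqueue context via bch2_rbio_punt().
 */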
1256 static void bch2_read_endio(struct bio *bio)
1257 {
1258         struct bch_read_bio *rbio =
1259                 container_of(bio, struct bch_read_bio, bio);
1260         struct bch_fs *c = rbio->c;
1261         struct workqueue_struct *wq = NULL;
1262         enum rbio_context context = RBIO_CONTEXT_NULL;
1263
1264         bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ);
1265
1266         percpu_ref_put(&rbio->pick.ca->io_ref);
1267
1268         if (!rbio->split)
1269                 rbio->bio.bi_end_io = rbio->end_io;
1270
1271         if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
1272                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1273                 return;
1274         }
1275
1276         if (rbio->pick.ptr.cached &&
1277             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1278              ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) {
1279                 atomic_long_inc(&c->read_realloc_races);
1280
1281                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1282                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1283                 else
1284                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1285                 return;
1286         }
1287
1288         if (rbio->narrow_crcs ||
1289             rbio->pick.crc.compression_type ||
1290             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1291                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1292         else if (rbio->pick.crc.csum_type)
1293                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1294
1295         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1296 }
1297
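/*
 * Set up and submit a read of @e via @pick: decide whether to read directly
 * into the caller's bio, clone it, or bounce into newly allocated pages (when
 * decompression, decryption, crc narrowing or promotion is needed). With
 * BCH_READ_IN_RETRY the read is done synchronously and the retry disposition
 * (READ_RETRY, READ_RETRY_AVOID, READ_ERR or 0) is returned.
 */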
1298 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1299                        struct bvec_iter iter, struct bkey_s_c_extent e,
1300                        struct extent_pick_ptr *pick, unsigned flags)
1301 {
1302         struct bch_read_bio *rbio;
1303         bool split = false, bounce = false, read_full = false;
1304         bool promote = false, narrow_crcs = false;
1305         struct bpos pos = bkey_start_pos(e.k);
1306         int ret = 0;
1307
1308         lg_local_lock(&c->usage_lock);
1309         bucket_io_clock_reset(c, pick->ca,
1310                         PTR_BUCKET_NR(pick->ca, &pick->ptr), READ);
1311         lg_local_unlock(&c->usage_lock);
1312
1313         narrow_crcs = should_narrow_crcs(e, pick, flags);
1314
1315         if (flags & BCH_READ_NODECODE) {
1316                 BUG_ON(iter.bi_size < pick->crc.compressed_size << 9);
1317                 iter.bi_size = pick->crc.compressed_size << 9;
1318                 goto noclone;
1319         }
1320
1321         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1322                 flags |= BCH_READ_MUST_BOUNCE;
1323
1324         EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector ||
1325                 e.k->p.offset < bvec_iter_end_sector(iter));
1326
1327         if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
1328             (pick->crc.csum_type != BCH_CSUM_NONE &&
1329              (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
1330               (bch2_csum_type_is_encryption(pick->crc.csum_type) &&
1331                (flags & BCH_READ_USER_MAPPED)) ||
1332               (flags & BCH_READ_MUST_BOUNCE)))) {
1333                 read_full = true;
1334                 bounce = true;
1335         }
1336
1337         promote = should_promote(c, pick, flags);
1338         /* could also set read_full */
1339         if (promote)
1340                 bounce = true;
1341
1342         if (!read_full) {
1343                 EBUG_ON(pick->crc.compression_type);
1344                 EBUG_ON(pick->crc.csum_type &&
1345                         (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
1346                          bvec_iter_sectors(iter) != pick->crc.live_size ||
1347                          pick->crc.offset ||
1348                          iter.bi_sector != pos.offset));
1349
1350                 pick->ptr.offset += pick->crc.offset +
1351                         (iter.bi_sector - pos.offset);
1352                 pick->crc.compressed_size       = bvec_iter_sectors(iter);
1353                 pick->crc.uncompressed_size     = bvec_iter_sectors(iter);
1354                 pick->crc.offset                = 0;
1355                 pick->crc.live_size             = bvec_iter_sectors(iter);
1356                 pos.offset                      = iter.bi_sector;
1357         }
1358
1359         if (bounce) {
1360                 unsigned sectors = pick->crc.compressed_size;
1361
1362                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
1363                                         DIV_ROUND_UP(sectors, PAGE_SECTORS),
1364                                         &c->bio_read_split),
1365                                  orig->opts);
1366
1367                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1368                 split = true;
1369         } else if (flags & BCH_READ_MUST_CLONE) {
1370                 /*
1371                  * We have to clone if there were any splits, due to error
1372                  * reporting issues: if a split errored and retrying didn't
1373                  * work, then when it reports the error to its parent (us)
1374                  * we don't know whether the error was from our bio (and we
1375                  * should retry) or from the whole bio (in which case we
1376                  * don't want to retry and lose the error).
1377                  */
1378                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
1379                                                 &c->bio_read_split),
1380                                  orig->opts);
1381                 rbio->bio.bi_iter = iter;
1382                 split = true;
1383         } else {
1384 noclone:
1385                 rbio = orig;
1386                 rbio->bio.bi_iter = iter;
1387                 split = false;
1388                 BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1389         }
1390
1391         BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size);
1392
1393         rbio->c                 = c;
1394         if (split)
1395                 rbio->parent    = orig;
1396         else
1397                 rbio->end_io    = orig->bio.bi_end_io;
1398         rbio->bvec_iter         = iter;
1399         rbio->submit_time_us    = local_clock_us();
1400         rbio->flags             = flags;
1401         rbio->bounce            = bounce;
1402         rbio->split             = split;
1403         rbio->narrow_crcs       = narrow_crcs;
1404         rbio->retry             = 0;
1405         rbio->context           = 0;
1406         rbio->devs_have         = bch2_extent_devs(e);
1407         rbio->pick              = *pick;
1408         rbio->pos               = pos;
1409         rbio->version           = e.k->version;
1410         rbio->promote           = promote ? promote_alloc(rbio) : NULL;
1411         INIT_WORK(&rbio->work, NULL);
1412
1413         bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
1414         rbio->bio.bi_opf        = orig->bio.bi_opf;
1415         rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
1416         rbio->bio.bi_end_io     = bch2_read_endio;
1417
1418         if (bounce)
1419                 trace_read_bounce(&rbio->bio);
1420
1421         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1422         this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
1423                      bio_sectors(&rbio->bio));
1424
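        /*
         * Normal reads are submitted asynchronously and complete via
         * bch2_read_endio(); in the BCH_READ_IN_RETRY case the bio is
         * submitted synchronously, the endio runs in this (unbound) context,
         * and the retry disposition is returned to the caller.
         */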
1425         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1426                 submit_bio(&rbio->bio);
1427         } else {
1428                 submit_bio_wait(&rbio->bio);
1429
1430                 rbio->context = RBIO_CONTEXT_UNBOUND;
1431                 bch2_read_endio(&rbio->bio);
1432
1433                 ret = rbio->retry;
1434                 if (rbio->split)
1435                         rbio = bch2_rbio_free(rbio);
1436                 if (!ret)
1437                         bch2_rbio_done(rbio);
1438         }
1439
1440         return ret;
1441 }
1442
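/*
 * Retry path for BCH_READ_NODECODE reads: re-look up the extent, check that
 * it still matches the pointer and position the original read was against,
 * and resubmit via __bch2_read_extent(); if the extent has changed underneath
 * us, fail the bio with BLK_STS_AGAIN so the read can be retried at a higher
 * level.
 */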
1443 static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1444                                      struct bvec_iter bvec_iter, u64 inode,
1445                                      struct bch_devs_mask *avoid, unsigned flags)
1446 {
1447         struct extent_pick_ptr pick;
1448         struct btree_iter iter;
1449         BKEY_PADDED(k) tmp;
1450         struct bkey_s_c k;
1451         int ret;
1452
1453         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
1454                              POS(inode, bvec_iter.bi_sector),
1455                              BTREE_ITER_SLOTS);
1456 retry:
1457         k = bch2_btree_iter_peek_slot(&iter);
1458         if (btree_iter_err(k)) {
1459                 bch2_btree_iter_unlock(&iter);
1460                 goto err;
1461         }
1462
1463         bkey_reassemble(&tmp.k, k);
1464         k = bkey_i_to_s_c(&tmp.k);
1465         bch2_btree_iter_unlock(&iter);
1466
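        /*
         * Check that the extent we re-looked up still matches what the
         * original read was against: same pointer, same offset, and it still
         * starts where our bio starts - otherwise it's changed under us.
         */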
1467         if (!bkey_extent_is_data(k.k) ||
1468             !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
1469                                      rbio->pick.ptr,
1470                                      rbio->pos.offset -
1471                                      rbio->pick.crc.offset) ||
1472             bkey_start_offset(k.k) != bvec_iter.bi_sector)
1473                 goto err;
1474
1475         bch2_extent_pick_ptr(c, k, avoid, &pick);
1476         if (IS_ERR(pick.ca)) {
1477                 bcache_io_error(c, &rbio->bio, "no device to read from");
1478                 bio_endio(&rbio->bio);
1479                 return;
1480         }
1481
1482         if (!pick.ca)
1483                 goto err;
1484
1485         if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) {
1486                 percpu_ref_put(&pick.ca->io_ref);
1487                 goto err;
1489         }
1490
1491         ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k),
1492                                  &pick, flags);
1493         switch (ret) {
1494         case READ_RETRY_AVOID:
1495                 __set_bit(pick.ca->dev_idx, avoid->d);
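                /* fall through: also retry, now avoiding that device */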
1496         case READ_RETRY:
1497                 goto retry;
1498         case READ_ERR:
1499                 bio_endio(&rbio->bio);
1500                 return;
1501         }
1502
1503         return;
1504 err:
1505         /*
1506          * The extent we wanted to read no longer exists, or it was
1507          * merged or partially overwritten (and so may now be bigger
1508          * than the memory that was originally allocated); signal the
1509          * caller to retry via BLK_STS_AGAIN.
1510          */
1511         rbio->bio.bi_status = BLK_STS_AGAIN;
1512         bio_endio(&rbio->bio);
1513         return;
1514 }
1515
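/*
 * Entry point for reads that decode extents normally (BCH_READ_NODECODE must
 * not be set here): walk the extents overlapping the requested range, pick a
 * pointer to read from for each one, and hand each fragment to
 * __bch2_read_extent().
 *
 * Rough usage sketch only - actual callers live elsewhere in the tree and may
 * differ in detail:
 *
 *	struct bch_read_bio *rbio = rbio_init(bio, opts);
 *	__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL, flags);
 */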
1516 void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
1517                  struct bvec_iter bvec_iter, u64 inode,
1518                  struct bch_devs_mask *avoid, unsigned flags)
1519 {
1520         struct btree_iter iter;
1521         struct bkey_s_c k;
1522         int ret;
1523
1524         EBUG_ON(flags & BCH_READ_NODECODE);
1525 retry:
1526         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
1527                            POS(inode, bvec_iter.bi_sector),
1528                            BTREE_ITER_SLOTS, k) {
1529                 BKEY_PADDED(k) tmp;
1530                 struct extent_pick_ptr pick;
1531                 struct bvec_iter fragment;
1532
1533                 /*
1534                  * Unlock the iterator while the btree node's lock is still in
1535                  * cache, before doing the IO:
1536                  */
1537                 bkey_reassemble(&tmp.k, k);
1538                 k = bkey_i_to_s_c(&tmp.k);
1539                 bch2_btree_iter_unlock(&iter);
1540
1541                 bch2_extent_pick_ptr(c, k, avoid, &pick);
1542                 if (IS_ERR(pick.ca)) {
1543                         bcache_io_error(c, &rbio->bio, "no device to read from");
1544                         bio_endio(&rbio->bio);
1545                         return;
1546                 }
1547
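                /*
                 * Each loop iteration handles one extent: the fragment is the
                 * part of the remaining read this extent covers, with its
                 * size converted from sectors to bytes.
                 */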
1548                 fragment = bvec_iter;
1549                 fragment.bi_size = (min_t(u64, k.k->p.offset,
1550                                           bvec_iter_end_sector(bvec_iter)) -
1551                                     bvec_iter.bi_sector) << 9;
1552
1553                 if (pick.ca) {
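                        /*
                         * If this extent doesn't cover the whole remaining
                         * read, the read is being split: bump the parent
                         * bio's remaining count so it only completes once
                         * every fragment has, and force cloning (see the
                         * comment in __bch2_read_extent()).
                         */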
1554                         if (fragment.bi_size != bvec_iter.bi_size) {
1555                                 bio_inc_remaining(&rbio->bio);
1556                                 flags |= BCH_READ_MUST_CLONE;
1557                                 trace_read_split(&rbio->bio);
1558                         }
1559
1560                         ret = __bch2_read_extent(c, rbio, fragment,
1561                                                  bkey_s_c_to_extent(k),
1562                                                  &pick, flags);
1563                         switch (ret) {
1564                         case READ_RETRY_AVOID:
1565                                 __set_bit(pick.ca->dev_idx, avoid->d);
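                                /* fall through: also retry, now avoiding that device */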
1566                         case READ_RETRY:
1567                                 goto retry;
1568                         case READ_ERR:
1569                                 rbio->bio.bi_status = BLK_STS_IOERR;
1570                                 bio_endio(&rbio->bio);
1571                                 return;
1572                         }
1573                 } else {
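                        /*
                         * No pointer to read from - this part of the range
                         * has no data (e.g. a hole), so just zero fill it:
                         */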
1574                         zero_fill_bio_iter(&rbio->bio, fragment);
1575
1576                         if (fragment.bi_size == bvec_iter.bi_size)
1577                                 bio_endio(&rbio->bio);
1578                 }
1579
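                /*
                 * Done if that fragment was the last part of the read;
                 * otherwise advance past it and carry on with the next
                 * extent:
                 */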
1580                 if (fragment.bi_size == bvec_iter.bi_size)
1581                         return;
1582
1583                 bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size);
1584         }
1585
1586         /*
1587          * If we get here, it better have been because there was an error
1588          * reading a btree node
1589          */
1590         ret = bch2_btree_iter_unlock(&iter);
1591         BUG_ON(!ret);
1592         bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
1593         bio_endio(&rbio->bio);
1594 }