git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/io.c

   1 /*
   2  * Some low level IO code, and hacks for various block layer limitations
   3  *
   4  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
   5  * Copyright 2012 Google, Inc.
   6  */
   7
   8 #include "bcachefs.h"
   9 #include "alloc_foreground.h"
  10 #include "bset.h"
  11 #include "btree_update.h"
  12 #include "buckets.h"
  13 #include "checksum.h"
  14 #include "compress.h"
  15 #include "clock.h"
  16 #include "debug.h"
  17 #include "disk_groups.h"
  18 #include "ec.h"
  19 #include "error.h"
  20 #include "extents.h"
  21 #include "io.h"
  22 #include "journal.h"
  23 #include "keylist.h"
  24 #include "move.h"
  25 #include "rebalance.h"
  26 #include "super.h"
  27 #include "super-io.h"
  28
  29 #include <linux/blkdev.h>
  30 #include <linux/random.h>
  31
  32 #include <trace/events/bcachefs.h>
  33
  34 static bool bch2_target_congested(struct bch_fs *c, u16 target)
  35 {
  36         const struct bch_devs_mask *devs;
  37         unsigned d, nr = 0, total = 0;
  38         u64 now = local_clock(), last;
  39         s64 congested;
  40         struct bch_dev *ca;
  41
  42         if (!target)
  43                 return false;
  44
  45         rcu_read_lock();
  46         devs = bch2_target_to_mask(c, target);
  47         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
  48                 ca = rcu_dereference(c->devs[d]);
  49                 if (!ca)
  50                         continue;
  51
  52                 congested = atomic_read(&ca->congested);
  53                 last = READ_ONCE(ca->congested_last);
  54                 if (time_after64(now, last))
  55                         congested -= (now - last) >> 12;
  56
  57                 total += max(congested, 0LL);
  58                 nr++;
  59         }
  60         rcu_read_unlock();
  61
  62         return bch2_rand_range(nr * CONGESTED_MAX) < total;
  63 }
  64
  65 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
  66                                        u64 now, int rw)
  67 {
  68         u64 latency_capable =
  69                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
  70         /* ideally we'd be taking into account the device's variance here: */
  71         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
  72         s64 latency_over = io_latency - latency_threshold;
  73
  74         if (latency_threshold && latency_over > 0) {
  75                 /*
  76                  * bump up congested by approximately latency_over * 4 /
  77                  * latency_threshold - we don't need much accuracy here so don't
  78                  * bother with the divide:
  79                  */
  80                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
  81                         atomic_add(latency_over >>
  82                                    max_t(int, ilog2(latency_threshold) - 2, 0),
  83                                    &ca->congested);
  84
  85                 ca->congested_last = now;
  86         } else if (atomic_read(&ca->congested) > 0) {
  87                 atomic_dec(&ca->congested);
  88         }
  89 }
  90
  91 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
  92 {
  93         atomic64_t *latency = &ca->cur_latency[rw];
  94         u64 now = local_clock();
  95         u64 io_latency = time_after64(now, submit_time)
  96                 ? now - submit_time
  97                 : 0;
  98         u64 old, new, v = atomic64_read(latency);
  99
 100         do {
 101                 old = v;
 102
 103                 /*
 104                  * If the io latency was reasonably close to the current
 105                  * latency, skip doing the update and atomic operation - most of
 106                  * the time:
 107                  */
 108                 if (abs((int) (old - io_latency)) < (old >> 1) &&
 109                     now & ~(~0 << 5))
 110                         break;
 111
 112                 new = ewma_add(old, io_latency, 5);
 113         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
 114
 115         bch2_congested_acct(ca, io_latency, now, rw);
 116
 117         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
 118 }
 119
 120 /* Allocate, free from mempool: */
 121
 122 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
 123 {
 124         struct bio_vec *bv;
 125         unsigned i;
 126
 127         bio_for_each_segment_all(bv, bio, i)
 128                 if (bv->bv_page != ZERO_PAGE(0))
 129                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
 130         bio->bi_vcnt = 0;
 131 }
 132
 133 static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
 134                                     bool *using_mempool)
 135 {
 136         struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
 137
 138         if (likely(!*using_mempool)) {
 139                 bv->bv_page = alloc_page(GFP_NOIO);
 140                 if (unlikely(!bv->bv_page)) {
 141                         mutex_lock(&c->bio_bounce_pages_lock);
 142                         *using_mempool = true;
 143                         goto pool_alloc;
 144
 145                 }
 146         } else {
 147 pool_alloc:
 148                 bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
 149         }
 150
 151         bv->bv_len = PAGE_SIZE;
 152         bv->bv_offset = 0;
 153 }
 154
 155 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 156                                size_t bytes)
 157 {
 158         bool using_mempool = false;
 159
 160         BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
 161
 162         bio->bi_iter.bi_size = bytes;
 163
 164         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
 165                 bch2_bio_alloc_page_pool(c, bio, &using_mempool);
 166
 167         if (using_mempool)
 168                 mutex_unlock(&c->bio_bounce_pages_lock);
 169 }
 170
 171 void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
 172                                     size_t bytes)
 173 {
 174         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
 175                 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
 176
 177                 BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
 178
 179                 bv->bv_page = alloc_page(GFP_NOIO);
 180                 if (!bv->bv_page) {
 181                         /*
 182                          * We already allocated from mempool, we can't allocate from it again
 183                          * without freeing the pages we already allocated or else we could
 184                          * deadlock:
 185                          */
 186                         bch2_bio_free_pages_pool(c, bio);
 187                         bch2_bio_alloc_pages_pool(c, bio, bytes);
 188                         return;
 189                 }
 190
 191                 bv->bv_len = PAGE_SIZE;
 192                 bv->bv_offset = 0;
 193                 bio->bi_vcnt++;
 194         }
 195
 196         bio->bi_iter.bi_size = bytes;
 197 }
 198
 199 /* Writes */
 200
 201 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 202                                enum bch_data_type type,
 203                                const struct bkey_i *k)
 204 {
 205         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
 206         const struct bch_extent_ptr *ptr;
 207         struct bch_write_bio *n;
 208         struct bch_dev *ca;
 209
 210         BUG_ON(c->opts.nochanges);
 211
 212         bkey_for_each_ptr(ptrs, ptr) {
 213                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
 214                        !c->devs[ptr->dev]);
 215
 216                 ca = bch_dev_bkey_exists(c, ptr->dev);
 217
 218                 if (to_entry(ptr + 1) < ptrs.end) {
 219                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
 220                                                    &ca->replica_set));
 221
 222                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
 223                         n->bio.bi_private       = wbio->bio.bi_private;
 224                         n->parent               = wbio;
 225                         n->split                = true;
 226                         n->bounce               = false;
 227                         n->put_bio              = true;
 228                         n->bio.bi_opf           = wbio->bio.bi_opf;
 229                         bio_inc_remaining(&wbio->bio);
 230                 } else {
 231                         n = wbio;
 232                         n->split                = false;
 233                 }
 234
 235                 n->c                    = c;
 236                 n->dev                  = ptr->dev;
 237                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
 238                 n->submit_time          = local_clock();
 239                 n->bio.bi_iter.bi_sector = ptr->offset;
 240
 241                 if (!journal_flushes_device(ca))
 242                         n->bio.bi_opf |= REQ_FUA;
 243
 244                 if (likely(n->have_ioref)) {
 245                         this_cpu_add(ca->io_done->sectors[WRITE][type],
 246                                      bio_sectors(&n->bio));
 247
 248                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
 249                         submit_bio(&n->bio);
 250                 } else {
 251                         n->bio.bi_status        = BLK_STS_REMOVED;
 252                         bio_endio(&n->bio);
 253                 }
 254         }
 255 }
 256
 257 static void __bch2_write(struct closure *);
 258
 259 static void bch2_write_done(struct closure *cl)
 260 {
 261         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 262         struct bch_fs *c = op->c;
 263
 264         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
 265                 op->error = bch2_journal_error(&c->journal);
 266
 267         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
 268                 bch2_disk_reservation_put(c, &op->res);
 269         percpu_ref_put(&c->writes);
 270         bch2_keylist_free(&op->insert_keys, op->inline_keys);
 271
 272         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
 273
 274         closure_return(cl);
 275 }
 276
 277 int bch2_write_index_default(struct bch_write_op *op)
 278 {
 279         struct bch_fs *c = op->c;
 280         struct btree_trans trans;
 281         struct btree_iter *iter;
 282         struct keylist *keys = &op->insert_keys;
 283         int ret;
 284
 285         BUG_ON(bch2_keylist_empty(keys));
 286         bch2_verify_keylist_sorted(keys);
 287
 288         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
 289
 290         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
 291                                    bkey_start_pos(&bch2_keylist_front(keys)->k),
 292                                    BTREE_ITER_INTENT);
 293
 294         do {
 295                 BKEY_PADDED(k) split;
 296
 297                 bkey_copy(&split.k, bch2_keylist_front(keys));
 298
 299                 bch2_extent_trim_atomic(&split.k, iter);
 300
 301                 bch2_trans_update(&trans,
 302                                   BTREE_INSERT_ENTRY(iter, &split.k));
 303
 304                 ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op),
 305                                         BTREE_INSERT_NOFAIL|
 306                                         BTREE_INSERT_USE_RESERVE);
 307                 if (ret)
 308                         break;
 309
 310                 if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0)
 311                         bch2_cut_front(iter->pos, bch2_keylist_front(keys));
 312                 else
 313                         bch2_keylist_pop_front(keys);
 314         } while (!bch2_keylist_empty(keys));
 315
 316         bch2_trans_exit(&trans);
 317
 318         return ret;
 319 }
 320
 321 /**
 322  * bch_write_index - after a write, update index to point to new data
 323  */
 324 static void __bch2_write_index(struct bch_write_op *op)
 325 {
 326         struct bch_fs *c = op->c;
 327         struct keylist *keys = &op->insert_keys;
 328         struct bch_extent_ptr *ptr;
 329         struct bkey_i *src, *dst = keys->keys, *n, *k;
 330         unsigned dev;
 331         int ret;
 332
 333         for (src = keys->keys; src != keys->top; src = n) {
 334                 n = bkey_next(src);
 335                 bkey_copy(dst, src);
 336
 337                 bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
 338                         test_bit(ptr->dev, op->failed.d));
 339
 340                 if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
 341                         ret = -EIO;
 342                         goto err;
 343                 }
 344
 345                 dst = bkey_next(dst);
 346         }
 347
 348         keys->top = dst;
 349
 350         /*
 351          * probably not the ideal place to hook this in, but I don't
 352          * particularly want to plumb io_opts all the way through the btree
 353          * update stack right now
 354          */
 355         for_each_keylist_key(keys, k)
 356                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
 357
 358         if (!bch2_keylist_empty(keys)) {
 359                 u64 sectors_start = keylist_sectors(keys);
 360                 int ret = op->index_update_fn(op);
 361
 362                 BUG_ON(keylist_sectors(keys) && !ret);
 363
 364                 op->written += sectors_start - keylist_sectors(keys);
 365
 366                 if (ret) {
 367                         __bcache_io_error(c, "btree IO error %i", ret);
 368                         op->error = ret;
 369                 }
 370         }
 371 out:
 372         /* If some a bucket wasn't written, we can't erasure code it: */
 373         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
 374                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
 375
 376         bch2_open_buckets_put(c, &op->open_buckets);
 377         return;
 378 err:
 379         keys->top = keys->keys;
 380         op->error = ret;
 381         goto out;
 382 }
 383
 384 static void bch2_write_index(struct closure *cl)
 385 {
 386         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 387         struct bch_fs *c = op->c;
 388
 389         __bch2_write_index(op);
 390
 391         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
 392                 bch2_journal_flush_seq_async(&c->journal,
 393                                              *op_journal_seq(op),
 394                                              cl);
 395                 continue_at(cl, bch2_write_done, index_update_wq(op));
 396         } else {
 397                 continue_at_nobarrier(cl, bch2_write_done, NULL);
 398         }
 399 }
 400
 401 static void bch2_write_endio(struct bio *bio)
 402 {
 403         struct closure *cl              = bio->bi_private;
 404         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
 405         struct bch_write_bio *wbio      = to_wbio(bio);
 406         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
 407         struct bch_fs *c                = wbio->c;
 408         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
 409
 410         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
 411                 set_bit(wbio->dev, op->failed.d);
 412
 413         if (wbio->have_ioref) {
 414                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
 415                 percpu_ref_put(&ca->io_ref);
 416         }
 417
 418         if (wbio->bounce)
 419                 bch2_bio_free_pages_pool(c, bio);
 420
 421         if (wbio->put_bio)
 422                 bio_put(bio);
 423
 424         if (parent)
 425                 bio_endio(&parent->bio);
 426         else
 427                 closure_put(cl);
 428 }
 429
 430 static void init_append_extent(struct bch_write_op *op,
 431                                struct write_point *wp,
 432                                struct bversion version,
 433                                struct bch_extent_crc_unpacked crc)
 434 {
 435         struct bch_fs *c = op->c;
 436         struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
 437         struct extent_ptr_decoded p = { .crc = crc };
 438         struct open_bucket *ob;
 439         unsigned i;
 440
 441         op->pos.offset += crc.uncompressed_size;
 442         e->k.p          = op->pos;
 443         e->k.size       = crc.uncompressed_size;
 444         e->k.version    = version;
 445
 446         BUG_ON(crc.compressed_size > wp->sectors_free);
 447         wp->sectors_free -= crc.compressed_size;
 448
 449         open_bucket_for_each(c, &wp->ptrs, ob, i) {
 450                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 451
 452                 p.ptr = ob->ptr;
 453                 p.ptr.cached = !ca->mi.durability ||
 454                         (op->flags & BCH_WRITE_CACHED) != 0;
 455                 p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
 456                 bch2_extent_ptr_decoded_append(e, &p);
 457
 458                 BUG_ON(crc.compressed_size > ob->sectors_free);
 459                 ob->sectors_free -= crc.compressed_size;
 460         }
 461
 462         bch2_keylist_push(&op->insert_keys);
 463 }
 464
 465 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
 466                                         struct write_point *wp,
 467                                         struct bio *src,
 468                                         bool *page_alloc_failed,
 469                                         void *buf)
 470 {
 471         struct bch_write_bio *wbio;
 472         struct bio *bio;
 473         unsigned output_available =
 474                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
 475         unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
 476
 477         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
 478         wbio                    = wbio_init(bio);
 479         wbio->put_bio           = true;
 480         /* copy WRITE_SYNC flag */
 481         wbio->bio.bi_opf        = src->bi_opf;
 482
 483         if (buf) {
 484                 bio->bi_iter.bi_size = output_available;
 485                 bch2_bio_map(bio, buf);
 486                 return bio;
 487         }
 488
 489         wbio->bounce            = true;
 490
 491         /*
 492          * We can't use mempool for more than c->sb.encoded_extent_max
 493          * worth of pages, but we'd like to allocate more if we can:
 494          */
 495         while (bio->bi_iter.bi_size < output_available) {
 496                 unsigned len = min_t(unsigned, PAGE_SIZE,
 497                                      output_available - bio->bi_iter.bi_size);
 498                 struct page *p;
 499
 500                 p = alloc_page(GFP_NOIO);
 501                 if (!p) {
 502                         unsigned pool_max =
 503                                 min_t(unsigned, output_available,
 504                                       c->sb.encoded_extent_max << 9);
 505
 506                         if (bio_sectors(bio) < pool_max)
 507                                 bch2_bio_alloc_pages_pool(c, bio, pool_max);
 508                         break;
 509                 }
 510
 511                 bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
 512                         .bv_page        = p,
 513                         .bv_len         = len,
 514                         .bv_offset      = 0,
 515                 };
 516                 bio->bi_iter.bi_size += len;
 517         }
 518
 519         *page_alloc_failed = bio->bi_vcnt < pages;
 520         return bio;
 521 }
 522
 523 static int bch2_write_rechecksum(struct bch_fs *c,
 524                                  struct bch_write_op *op,
 525                                  unsigned new_csum_type)
 526 {
 527         struct bio *bio = &op->wbio.bio;
 528         struct bch_extent_crc_unpacked new_crc;
 529         int ret;
 530
 531         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
 532
 533         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
 534             bch2_csum_type_is_encryption(new_csum_type))
 535                 new_csum_type = op->crc.csum_type;
 536
 537         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
 538                                   NULL, &new_crc,
 539                                   op->crc.offset, op->crc.live_size,
 540                                   new_csum_type);
 541         if (ret)
 542                 return ret;
 543
 544         bio_advance(bio, op->crc.offset << 9);
 545         bio->bi_iter.bi_size = op->crc.live_size << 9;
 546         op->crc = new_crc;
 547         return 0;
 548 }
 549
 550 static int bch2_write_decrypt(struct bch_write_op *op)
 551 {
 552         struct bch_fs *c = op->c;
 553         struct nonce nonce = extent_nonce(op->version, op->crc);
 554         struct bch_csum csum;
 555
 556         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
 557                 return 0;
 558
 559         /*
 560          * If we need to decrypt data in the write path, we'll no longer be able
 561          * to verify the existing checksum (poly1305 mac, in this case) after
 562          * it's decrypted - this is the last point we'll be able to reverify the
 563          * checksum:
 564          */
 565         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
 566         if (bch2_crc_cmp(op->crc.csum, csum))
 567                 return -EIO;
 568
 569         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
 570         op->crc.csum_type = 0;
 571         op->crc.csum = (struct bch_csum) { 0, 0 };
 572         return 0;
 573 }
 574
 575 static enum prep_encoded_ret {
 576         PREP_ENCODED_OK,
 577         PREP_ENCODED_ERR,
 578         PREP_ENCODED_CHECKSUM_ERR,
 579         PREP_ENCODED_DO_WRITE,
 580 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
 581 {
 582         struct bch_fs *c = op->c;
 583         struct bio *bio = &op->wbio.bio;
 584
 585         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
 586                 return PREP_ENCODED_OK;
 587
 588         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
 589
 590         /* Can we just write the entire extent as is? */
 591         if (op->crc.uncompressed_size == op->crc.live_size &&
 592             op->crc.compressed_size <= wp->sectors_free &&
 593             op->crc.compression_type == op->compression_type) {
 594                 if (!op->crc.compression_type &&
 595                     op->csum_type != op->crc.csum_type &&
 596                     bch2_write_rechecksum(c, op, op->csum_type))
 597                         return PREP_ENCODED_CHECKSUM_ERR;
 598
 599                 return PREP_ENCODED_DO_WRITE;
 600         }
 601
 602         /*
 603          * If the data is compressed and we couldn't write the entire extent as
 604          * is, we have to decompress it:
 605          */
 606         if (op->crc.compression_type) {
 607                 struct bch_csum csum;
 608
 609                 if (bch2_write_decrypt(op))
 610                         return PREP_ENCODED_CHECKSUM_ERR;
 611
 612                 /* Last point we can still verify checksum: */
 613                 csum = bch2_checksum_bio(c, op->crc.csum_type,
 614                                          extent_nonce(op->version, op->crc),
 615                                          bio);
 616                 if (bch2_crc_cmp(op->crc.csum, csum))
 617                         return PREP_ENCODED_CHECKSUM_ERR;
 618
 619                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
 620                         return PREP_ENCODED_ERR;
 621         }
 622
 623         /*
 624          * No longer have compressed data after this point - data might be
 625          * encrypted:
 626          */
 627
 628         /*
 629          * If the data is checksummed and we're only writing a subset,
 630          * rechecksum and adjust bio to point to currently live data:
 631          */
 632         if ((op->crc.live_size != op->crc.uncompressed_size ||
 633              op->crc.csum_type != op->csum_type) &&
 634             bch2_write_rechecksum(c, op, op->csum_type))
 635                 return PREP_ENCODED_CHECKSUM_ERR;
 636
 637         /*
 638          * If we want to compress the data, it has to be decrypted:
 639          */
 640         if ((op->compression_type ||
 641              bch2_csum_type_is_encryption(op->crc.csum_type) !=
 642              bch2_csum_type_is_encryption(op->csum_type)) &&
 643             bch2_write_decrypt(op))
 644                 return PREP_ENCODED_CHECKSUM_ERR;
 645
 646         return PREP_ENCODED_OK;
 647 }
 648
 649 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 650 {
 651         struct bch_fs *c = op->c;
 652         struct bio *src = &op->wbio.bio, *dst = src;
 653         struct bvec_iter saved_iter;
 654         struct bkey_i *key_to_write;
 655         void *ec_buf;
 656         unsigned key_to_write_offset = op->insert_keys.top_p -
 657                 op->insert_keys.keys_p;
 658         unsigned total_output = 0, total_input = 0;
 659         bool bounce = false;
 660         bool page_alloc_failed = false;
 661         int ret, more = 0;
 662
 663         BUG_ON(!bio_sectors(src));
 664
 665         ec_buf = bch2_writepoint_ec_buf(c, wp);
 666
 667         switch (bch2_write_prep_encoded_data(op, wp)) {
 668         case PREP_ENCODED_OK:
 669                 break;
 670         case PREP_ENCODED_ERR:
 671                 ret = -EIO;
 672                 goto err;
 673         case PREP_ENCODED_CHECKSUM_ERR:
 674                 goto csum_err;
 675         case PREP_ENCODED_DO_WRITE:
 676                 if (ec_buf) {
 677                         dst = bch2_write_bio_alloc(c, wp, src,
 678                                                    &page_alloc_failed,
 679                                                    ec_buf);
 680                         bio_copy_data(dst, src);
 681                         bounce = true;
 682                 }
 683                 init_append_extent(op, wp, op->version, op->crc);
 684                 goto do_write;
 685         }
 686
 687         if (ec_buf ||
 688             op->compression_type ||
 689             (op->csum_type &&
 690              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
 691             (bch2_csum_type_is_encryption(op->csum_type) &&
 692              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
 693                 dst = bch2_write_bio_alloc(c, wp, src,
 694                                            &page_alloc_failed,
 695                                            ec_buf);
 696                 bounce = true;
 697         }
 698
 699         saved_iter = dst->bi_iter;
 700
 701         do {
 702                 struct bch_extent_crc_unpacked crc =
 703                         (struct bch_extent_crc_unpacked) { 0 };
 704                 struct bversion version = op->version;
 705                 size_t dst_len, src_len;
 706
 707                 if (page_alloc_failed &&
 708                     bio_sectors(dst) < wp->sectors_free &&
 709                     bio_sectors(dst) < c->sb.encoded_extent_max)
 710                         break;
 711
 712                 BUG_ON(op->compression_type &&
 713                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
 714                        bch2_csum_type_is_encryption(op->crc.csum_type));
 715                 BUG_ON(op->compression_type && !bounce);
 716
 717                 crc.compression_type = op->compression_type
 718                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
 719                                              op->compression_type)
 720                         : 0;
 721                 if (!crc.compression_type) {
 722                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
 723                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
 724
 725                         if (op->csum_type)
 726                                 dst_len = min_t(unsigned, dst_len,
 727                                                 c->sb.encoded_extent_max << 9);
 728
 729                         if (bounce) {
 730                                 swap(dst->bi_iter.bi_size, dst_len);
 731                                 bio_copy_data(dst, src);
 732                                 swap(dst->bi_iter.bi_size, dst_len);
 733                         }
 734
 735                         src_len = dst_len;
 736                 }
 737
 738                 BUG_ON(!src_len || !dst_len);
 739
 740                 if (bch2_csum_type_is_encryption(op->csum_type)) {
 741                         if (bversion_zero(version)) {
 742                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
 743                         } else {
 744                                 crc.nonce = op->nonce;
 745                                 op->nonce += src_len >> 9;
 746                         }
 747                 }
 748
 749                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
 750                     !crc.compression_type &&
 751                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
 752                     bch2_csum_type_is_encryption(op->csum_type)) {
 753                         /*
 754                          * Note: when we're using rechecksum(), we need to be
 755                          * checksumming @src because it has all the data our
 756                          * existing checksum covers - if we bounced (because we
 757                          * were trying to compress), @dst will only have the
 758                          * part of the data the new checksum will cover.
 759                          *
 760                          * But normally we want to be checksumming post bounce,
 761                          * because part of the reason for bouncing is so the
 762                          * data can't be modified (by userspace) while it's in
 763                          * flight.
 764                          */
 765                         if (bch2_rechecksum_bio(c, src, version, op->crc,
 766                                         &crc, &op->crc,
 767                                         src_len >> 9,
 768                                         bio_sectors(src) - (src_len >> 9),
 769                                         op->csum_type))
 770                                 goto csum_err;
 771                 } else {
 772                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
 773                             bch2_rechecksum_bio(c, src, version, op->crc,
 774                                         NULL, &op->crc,
 775                                         src_len >> 9,
 776                                         bio_sectors(src) - (src_len >> 9),
 777                                         op->crc.csum_type))
 778                                 goto csum_err;
 779
 780                         crc.compressed_size     = dst_len >> 9;
 781                         crc.uncompressed_size   = src_len >> 9;
 782                         crc.live_size           = src_len >> 9;
 783
 784                         swap(dst->bi_iter.bi_size, dst_len);
 785                         bch2_encrypt_bio(c, op->csum_type,
 786                                          extent_nonce(version, crc), dst);
 787                         crc.csum = bch2_checksum_bio(c, op->csum_type,
 788                                          extent_nonce(version, crc), dst);
 789                         crc.csum_type = op->csum_type;
 790                         swap(dst->bi_iter.bi_size, dst_len);
 791                 }
 792
 793                 init_append_extent(op, wp, version, crc);
 794
 795                 if (dst != src)
 796                         bio_advance(dst, dst_len);
 797                 bio_advance(src, src_len);
 798                 total_output    += dst_len;
 799                 total_input     += src_len;
 800         } while (dst->bi_iter.bi_size &&
 801                  src->bi_iter.bi_size &&
 802                  wp->sectors_free &&
 803                  !bch2_keylist_realloc(&op->insert_keys,
 804                                       op->inline_keys,
 805                                       ARRAY_SIZE(op->inline_keys),
 806                                       BKEY_EXTENT_U64s_MAX));
 807
 808         more = src->bi_iter.bi_size != 0;
 809
 810         dst->bi_iter = saved_iter;
 811
 812         if (dst == src && more) {
 813                 BUG_ON(total_output != total_input);
 814
 815                 dst = bio_split(src, total_input >> 9,
 816                                 GFP_NOIO, &c->bio_write);
 817                 wbio_init(dst)->put_bio = true;
 818                 /* copy WRITE_SYNC flag */
 819                 dst->bi_opf             = src->bi_opf;
 820         }
 821
 822         dst->bi_iter.bi_size = total_output;
 823
 824         /* Free unneeded pages after compressing: */
 825         if (to_wbio(dst)->bounce)
 826                 while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
 827                         mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
 828                                      &c->bio_bounce_pages);
 829 do_write:
 830         /* might have done a realloc... */
 831
 832         key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
 833
 834         bch2_ec_add_backpointer(c, wp,
 835                                 bkey_start_pos(&key_to_write->k),
 836                                 total_input >> 9);
 837
 838         dst->bi_end_io  = bch2_write_endio;
 839         dst->bi_private = &op->cl;
 840         bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
 841
 842         closure_get(dst->bi_private);
 843
 844         bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
 845                                   key_to_write);
 846         return more;
 847 csum_err:
 848         bch_err(c, "error verifying existing checksum while "
 849                 "rewriting existing data (memory corruption?)");
 850         ret = -EIO;
 851 err:
 852         if (to_wbio(dst)->bounce)
 853                 bch2_bio_free_pages_pool(c, dst);
 854         if (to_wbio(dst)->put_bio)
 855                 bio_put(dst);
 856
 857         return ret;
 858 }
 859
 860 static void __bch2_write(struct closure *cl)
 861 {
 862         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 863         struct bch_fs *c = op->c;
 864         struct write_point *wp;
 865         int ret;
 866 again:
 867         memset(&op->failed, 0, sizeof(op->failed));
 868
 869         do {
 870                 /* +1 for possible cache device: */
 871                 if (op->open_buckets.nr + op->nr_replicas + 1 >
 872                     ARRAY_SIZE(op->open_buckets.v))
 873                         goto flush_io;
 874
 875                 if (bch2_keylist_realloc(&op->insert_keys,
 876                                         op->inline_keys,
 877                                         ARRAY_SIZE(op->inline_keys),
 878                                         BKEY_EXTENT_U64s_MAX))
 879                         goto flush_io;
 880
 881                 wp = bch2_alloc_sectors_start(c,
 882                         op->target,
 883                         op->opts.erasure_code,
 884                         op->write_point,
 885                         &op->devs_have,
 886                         op->nr_replicas,
 887                         op->nr_replicas_required,
 888                         op->alloc_reserve,
 889                         op->flags,
 890                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
 891                 EBUG_ON(!wp);
 892
 893                 if (unlikely(IS_ERR(wp))) {
 894                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
 895                                 ret = PTR_ERR(wp);
 896                                 goto err;
 897                         }
 898
 899                         goto flush_io;
 900                 }
 901
 902                 ret = bch2_write_extent(op, wp);
 903
 904                 bch2_open_bucket_get(c, wp, &op->open_buckets);
 905                 bch2_alloc_sectors_done(c, wp);
 906
 907                 if (ret < 0)
 908                         goto err;
 909         } while (ret);
 910
 911         continue_at(cl, bch2_write_index, index_update_wq(op));
 912         return;
 913 err:
 914         op->error = ret;
 915
 916         continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
 917                     ? bch2_write_index
 918                     : bch2_write_done, index_update_wq(op));
 919         return;
 920 flush_io:
 921         closure_sync(cl);
 922
 923         if (!bch2_keylist_empty(&op->insert_keys)) {
 924                 __bch2_write_index(op);
 925
 926                 if (op->error) {
 927                         continue_at_nobarrier(cl, bch2_write_done, NULL);
 928                         return;
 929                 }
 930         }
 931
 932         goto again;
 933 }
 934
 935 /**
 936  * bch_write - handle a write to a cache device or flash only volume
 937  *
 938  * This is the starting point for any data to end up in a cache device; it could
 939  * be from a normal write, or a writeback write, or a write to a flash only
 940  * volume - it's also used by the moving garbage collector to compact data in
 941  * mostly empty buckets.
 942  *
 943  * It first writes the data to the cache, creating a list of keys to be inserted
 944  * (if the data won't fit in a single open bucket, there will be multiple keys);
 945  * after the data is written it calls bch_journal, and after the keys have been
 946  * added to the next journal write they're inserted into the btree.
 947  *
 948  * If op->discard is true, instead of inserting the data it invalidates the
 949  * region of the cache represented by op->bio and op->inode.
 950  */
 951 void bch2_write(struct closure *cl)
 952 {
 953         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 954         struct bch_fs *c = op->c;
 955
 956         BUG_ON(!op->nr_replicas);
 957         BUG_ON(!op->write_point.v);
 958         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
 959         BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
 960
 961         op->start_time = local_clock();
 962
 963         bch2_keylist_init(&op->insert_keys, op->inline_keys);
 964         wbio_init(&op->wbio.bio)->put_bio = false;
 965
 966         if (c->opts.nochanges ||
 967             !percpu_ref_tryget(&c->writes)) {
 968                 __bcache_io_error(c, "read only");
 969                 op->error = -EROFS;
 970                 if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
 971                         bch2_disk_reservation_put(c, &op->res);
 972                 closure_return(cl);
 973                 return;
 974         }
 975
 976         bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
 977
 978         continue_at_nobarrier(cl, __bch2_write, NULL);
 979 }
 980
 981 /* Cache promotion on read */
 982
 983 struct promote_op {
 984         struct closure          cl;
 985         struct rcu_head         rcu;
 986         u64                     start_time;
 987
 988         struct rhash_head       hash;
 989         struct bpos             pos;
 990
 991         struct migrate_write    write;
 992         struct bio_vec          bi_inline_vecs[0]; /* must be last */
 993 };
 994
 995 static const struct rhashtable_params bch_promote_params = {
 996         .head_offset    = offsetof(struct promote_op, hash),
 997         .key_offset     = offsetof(struct promote_op, pos),
 998         .key_len        = sizeof(struct bpos),
 999 };
1000
1001 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1002                                   struct bpos pos,
1003                                   struct bch_io_opts opts,
1004                                   unsigned flags)
1005 {
1006         if (!opts.promote_target)
1007                 return false;
1008
1009         if (!(flags & BCH_READ_MAY_PROMOTE))
1010                 return false;
1011
1012         if (percpu_ref_is_dying(&c->writes))
1013                 return false;
1014
1015         if (!bkey_extent_is_data(k.k))
1016                 return false;
1017
1018         if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
1019                 return false;
1020
1021         if (bch2_target_congested(c, opts.promote_target))
1022                 return false;
1023
1024         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1025                                    bch_promote_params))
1026                 return false;
1027
1028         return true;
1029 }
1030
1031 static void promote_free(struct bch_fs *c, struct promote_op *op)
1032 {
1033         int ret;
1034
1035         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1036                                      bch_promote_params);
1037         BUG_ON(ret);
1038         percpu_ref_put(&c->writes);
1039         kfree_rcu(op, rcu);
1040 }
1041
1042 static void promote_done(struct closure *cl)
1043 {
1044         struct promote_op *op =
1045                 container_of(cl, struct promote_op, cl);
1046         struct bch_fs *c = op->write.op.c;
1047
1048         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1049                                op->start_time);
1050
1051         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1052         promote_free(c, op);
1053 }
1054
1055 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1056 {
1057         struct bch_fs *c = rbio->c;
1058         struct closure *cl = &op->cl;
1059         struct bio *bio = &op->write.op.wbio.bio;
1060
1061         trace_promote(&rbio->bio);
1062
1063         /* we now own pages: */
1064         BUG_ON(!rbio->bounce);
1065         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1066
1067         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1068                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1069         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1070
1071         bch2_migrate_read_done(&op->write, rbio);
1072
1073         closure_init(cl, NULL);
1074         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1075         closure_return_with_destructor(cl, promote_done);
1076 }
1077
1078 noinline
1079 static struct promote_op *__promote_alloc(struct bch_fs *c,
1080                                           struct bpos pos,
1081                                           struct extent_ptr_decoded *pick,
1082                                           struct bch_io_opts opts,
1083                                           unsigned rbio_sectors,
1084                                           struct bch_read_bio **rbio)
1085 {
1086         struct promote_op *op = NULL;
1087         struct bio *bio;
1088         unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
1089         /* data might have to be decompressed in the write path: */
1090         unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
1091                                            PAGE_SECTORS);
1092         int ret;
1093
1094         if (!percpu_ref_tryget(&c->writes))
1095                 return NULL;
1096
1097         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
1098                      GFP_NOIO);
1099         if (!op)
1100                 goto err;
1101
1102         op->start_time = local_clock();
1103         op->pos = pos;
1104
1105         /*
1106          * promotes require bouncing, but if the extent isn't
1107          * checksummed/compressed it might be too big for the mempool:
1108          */
1109         if (rbio_sectors > c->sb.encoded_extent_max) {
1110                 *rbio = kzalloc(sizeof(struct bch_read_bio) +
1111                                 sizeof(struct bio_vec) * rbio_pages,
1112                                 GFP_NOIO);
1113                 if (!*rbio)
1114                         goto err;
1115
1116                 rbio_init(&(*rbio)->bio, opts);
1117                 bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs,
1118                          rbio_pages);
1119
1120                 (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
1121                 bch2_bio_map(&(*rbio)->bio, NULL);
1122
1123                 if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
1124                         goto err;
1125
1126                 (*rbio)->bounce         = true;
1127                 (*rbio)->split          = true;
1128                 (*rbio)->kmalloc        = true;
1129         }
1130
1131         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1132                                           bch_promote_params))
1133                 goto err;
1134
1135         bio = &op->write.op.wbio.bio;
1136         bio_init(bio, bio->bi_inline_vecs, wbio_pages);
1137
1138         ret = bch2_migrate_write_init(c, &op->write,
1139                         writepoint_hashed((unsigned long) current),
1140                         opts,
1141                         DATA_PROMOTE,
1142                         (struct data_opts) {
1143                                 .target = opts.promote_target
1144                         },
1145                         bkey_s_c_null);
1146         BUG_ON(ret);
1147
1148         return op;
1149 err:
1150         if (*rbio)
1151                 bio_free_pages(&(*rbio)->bio);
1152         kfree(*rbio);
1153         *rbio = NULL;
1154         kfree(op);
1155         percpu_ref_put(&c->writes);
1156         return NULL;
1157 }
1158
1159 static inline struct promote_op *promote_alloc(struct bch_fs *c,
1160                                                struct bvec_iter iter,
1161                                                struct bkey_s_c k,
1162                                                struct extent_ptr_decoded *pick,
1163                                                struct bch_io_opts opts,
1164                                                unsigned flags,
1165                                                struct bch_read_bio **rbio,
1166                                                bool *bounce,
1167                                                bool *read_full)
1168 {
1169         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1170         unsigned sectors = promote_full
1171                 ? pick->crc.compressed_size
1172                 : bvec_iter_sectors(iter);
1173         struct bpos pos = promote_full
1174                 ? bkey_start_pos(k.k)
1175                 : POS(k.k->p.inode, iter.bi_sector);
1176         struct promote_op *promote;
1177
1178         if (!should_promote(c, k, pos, opts, flags))
1179                 return NULL;
1180
1181         promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
1182         if (!promote)
1183                 return NULL;
1184
1185         *bounce         = true;
1186         *read_full      = promote_full;
1187         return promote;
1188 }
1189
1190 /* Read */
1191
1192 #define READ_RETRY_AVOID        1
1193 #define READ_RETRY              2
1194 #define READ_ERR                3
1195
1196 enum rbio_context {
1197         RBIO_CONTEXT_NULL,
1198         RBIO_CONTEXT_HIGHPRI,
1199         RBIO_CONTEXT_UNBOUND,
1200 };
1201
1202 static inline struct bch_read_bio *
1203 bch2_rbio_parent(struct bch_read_bio *rbio)
1204 {
1205         return rbio->split ? rbio->parent : rbio;
1206 }
1207
1208 __always_inline
1209 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1210                            enum rbio_context context,
1211                            struct workqueue_struct *wq)
1212 {
1213         if (context <= rbio->context) {
1214                 fn(&rbio->work);
1215         } else {
1216                 rbio->work.func         = fn;
1217                 rbio->context           = context;
1218                 queue_work(wq, &rbio->work);
1219         }
1220 }
1221
1222 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1223 {
1224         BUG_ON(rbio->bounce && !rbio->split);
1225
1226         if (rbio->promote)
1227                 promote_free(rbio->c, rbio->promote);
1228         rbio->promote = NULL;
1229
1230         if (rbio->bounce)
1231                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1232
1233         if (rbio->split) {
1234                 struct bch_read_bio *parent = rbio->parent;
1235
1236                 if (rbio->kmalloc)
1237                         kfree(rbio);
1238                 else
1239                         bio_put(&rbio->bio);
1240
1241                 rbio = parent;
1242         }
1243
1244         return rbio;
1245 }
1246
1247 static void bch2_rbio_done(struct bch_read_bio *rbio)
1248 {
1249         bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1250                                rbio->start_time);
1251         bio_endio(&rbio->bio);
1252 }
1253
1254 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1255                                      struct bvec_iter bvec_iter, u64 inode,
1256                                      struct bch_io_failures *failed,
1257                                      unsigned flags)
1258 {
1259         struct btree_trans trans;
1260         struct btree_iter *iter;
1261         BKEY_PADDED(k) tmp;
1262         struct bkey_s_c k;
1263         int ret;
1264
1265         flags &= ~BCH_READ_LAST_FRAGMENT;
1266
1267         bch2_trans_init(&trans, c, 0, 0);
1268
1269         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1270                                    rbio->pos, BTREE_ITER_SLOTS);
1271 retry:
1272         rbio->bio.bi_status = 0;
1273
1274         k = bch2_btree_iter_peek_slot(iter);
1275         if (bkey_err(k))
1276                 goto err;
1277
1278         bkey_reassemble(&tmp.k, k);
1279         k = bkey_i_to_s_c(&tmp.k);
1280         bch2_trans_unlock(&trans);
1281
1282         if (!bkey_extent_is_data(k.k) ||
1283             !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
1284                                      rbio->pick.ptr,
1285                                      rbio->pos.offset -
1286                                      rbio->pick.crc.offset)) {
1287                 /* extent we wanted to read no longer exists: */
1288                 rbio->hole = true;
1289                 goto out;
1290         }
1291
1292         ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1293         if (ret == READ_RETRY)
1294                 goto retry;
1295         if (ret)
1296                 goto err;
1297 out:
1298         bch2_rbio_done(rbio);
1299         bch2_trans_exit(&trans);
1300         return;
1301 err:
1302         rbio->bio.bi_status = BLK_STS_IOERR;
1303         goto out;
1304 }
1305
1306 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1307                             struct bvec_iter bvec_iter, u64 inode,
1308                             struct bch_io_failures *failed, unsigned flags)
1309 {
1310         struct btree_trans trans;
1311         struct btree_iter *iter;
1312         struct bkey_s_c k;
1313         int ret;
1314
1315         bch2_trans_init(&trans, c, 0, 0);
1316
1317         flags &= ~BCH_READ_LAST_FRAGMENT;
1318         flags |= BCH_READ_MUST_CLONE;
1319 retry:
1320         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1321                            POS(inode, bvec_iter.bi_sector),
1322                            BTREE_ITER_SLOTS, k, ret) {
1323                 BKEY_PADDED(k) tmp;
1324                 unsigned bytes;
1325
1326                 bkey_reassemble(&tmp.k, k);
1327                 k = bkey_i_to_s_c(&tmp.k);
1328                 bch2_trans_unlock(&trans);
1329
1330                 bytes = min_t(unsigned, bvec_iter.bi_size,
1331                               (k.k->p.offset - bvec_iter.bi_sector) << 9);
1332                 swap(bvec_iter.bi_size, bytes);
1333
1334                 ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1335                 switch (ret) {
1336                 case READ_RETRY:
1337                         goto retry;
1338                 case READ_ERR:
1339                         goto err;
1340                 };
1341
1342                 if (bytes == bvec_iter.bi_size)
1343                         goto out;
1344
1345                 swap(bvec_iter.bi_size, bytes);
1346                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1347         }
1348
1349         /*
1350          * If we get here, it better have been because there was an error
1351          * reading a btree node
1352          */
1353         BUG_ON(!ret);
1354         __bcache_io_error(c, "btree IO error: %i", ret);
1355 err:
1356         rbio->bio.bi_status = BLK_STS_IOERR;
1357 out:
1358         bch2_trans_exit(&trans);
1359         bch2_rbio_done(rbio);
1360 }
1361
1362 static void bch2_rbio_retry(struct work_struct *work)
1363 {
1364         struct bch_read_bio *rbio =
1365                 container_of(work, struct bch_read_bio, work);
1366         struct bch_fs *c        = rbio->c;
1367         struct bvec_iter iter   = rbio->bvec_iter;
1368         unsigned flags          = rbio->flags;
1369         u64 inode               = rbio->pos.inode;
1370         struct bch_io_failures failed = { .nr = 0 };
1371
1372         trace_read_retry(&rbio->bio);
1373
1374         if (rbio->retry == READ_RETRY_AVOID)
1375                 bch2_mark_io_failure(&failed, &rbio->pick);
1376
1377         rbio->bio.bi_status = 0;
1378
1379         rbio = bch2_rbio_free(rbio);
1380
1381         flags |= BCH_READ_IN_RETRY;
1382         flags &= ~BCH_READ_MAY_PROMOTE;
1383
1384         if (flags & BCH_READ_NODECODE)
1385                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1386         else
1387                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1388 }
1389
1390 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1391                             blk_status_t error)
1392 {
1393         rbio->retry = retry;
1394
1395         if (rbio->flags & BCH_READ_IN_RETRY)
1396                 return;
1397
1398         if (retry == READ_ERR) {
1399                 rbio = bch2_rbio_free(rbio);
1400
1401                 rbio->bio.bi_status = error;
1402                 bch2_rbio_done(rbio);
1403         } else {
1404                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1405                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1406         }
1407 }
1408
1409 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1410 {
1411         struct bch_fs *c = rbio->c;
1412         struct btree_trans trans;
1413         struct btree_iter *iter;
1414         struct bkey_s_c k;
1415         struct bkey_i_extent *e;
1416         BKEY_PADDED(k) new;
1417         struct bch_extent_crc_unpacked new_crc;
1418         u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
1419         int ret;
1420
1421         if (rbio->pick.crc.compression_type)
1422                 return;
1423
1424         bch2_trans_init(&trans, c, 0, 0);
1425 retry:
1426         bch2_trans_begin(&trans);
1427
1428         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
1429                                    BTREE_ITER_INTENT);
1430         k = bch2_btree_iter_peek(iter);
1431         if (IS_ERR_OR_NULL(k.k))
1432                 goto out;
1433
1434         if (!bkey_extent_is_data(k.k))
1435                 goto out;
1436
1437         bkey_reassemble(&new.k, k);
1438         e = bkey_i_to_extent(&new.k);
1439
1440         if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
1441                                      rbio->pick.ptr, data_offset) ||
1442             bversion_cmp(e->k.version, rbio->version))
1443                 goto out;
1444
1445         /* Extent was merged? */
1446         if (bkey_start_offset(&e->k) < data_offset ||
1447             e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1448                 goto out;
1449
1450         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1451                         rbio->pick.crc, NULL, &new_crc,
1452                         bkey_start_offset(&e->k) - data_offset, e->k.size,
1453                         rbio->pick.crc.csum_type)) {
1454                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1455                 goto out;
1456         }
1457
1458         if (!bch2_extent_narrow_crcs(e, new_crc))
1459                 goto out;
1460
1461         bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i));
1462         ret = bch2_trans_commit(&trans, NULL, NULL,
1463                                 BTREE_INSERT_ATOMIC|
1464                                 BTREE_INSERT_NOFAIL|
1465                                 BTREE_INSERT_NOWAIT);
1466         if (ret == -EINTR)
1467                 goto retry;
1468 out:
1469         bch2_trans_exit(&trans);
1470 }
1471
1472 static bool should_narrow_crcs(struct bkey_s_c k,
1473                                struct extent_ptr_decoded *pick,
1474                                unsigned flags)
1475 {
1476         return !(flags & BCH_READ_IN_RETRY) &&
1477                 bkey_extent_is_data(k.k) &&
1478                 bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
1479 }
1480
1481 /* Inner part that may run in process context */
1482 static void __bch2_read_endio(struct work_struct *work)
1483 {
1484         struct bch_read_bio *rbio =
1485                 container_of(work, struct bch_read_bio, work);
1486         struct bch_fs *c        = rbio->c;
1487         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1488         struct bio *src         = &rbio->bio;
1489         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1490         struct bvec_iter dst_iter = rbio->bvec_iter;
1491         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1492         struct nonce nonce = extent_nonce(rbio->version, crc);
1493         struct bch_csum csum;
1494
1495         /* Reset iterator for checksumming and copying bounced data: */
1496         if (rbio->bounce) {
1497                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1498                 src->bi_iter.bi_idx             = 0;
1499                 src->bi_iter.bi_bvec_done       = 0;
1500         } else {
1501                 src->bi_iter                    = rbio->bvec_iter;
1502         }
1503
1504         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1505         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1506                 goto csum_err;
1507
1508         if (unlikely(rbio->narrow_crcs))
1509                 bch2_rbio_narrow_crcs(rbio);
1510
1511         if (rbio->flags & BCH_READ_NODECODE)
1512                 goto nodecode;
1513
1514         /* Adjust crc to point to subset of data we want: */
1515         crc.offset     += rbio->bvec_iter.bi_sector - rbio->pos.offset;
1516         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1517
1518         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1519                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1520                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1521                         goto decompression_err;
1522         } else {
1523                 /* don't need to decrypt the entire bio: */
1524                 nonce = nonce_add(nonce, crc.offset << 9);
1525                 bio_advance(src, crc.offset << 9);
1526
1527                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1528                 src->bi_iter.bi_size = dst_iter.bi_size;
1529
1530                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1531
1532                 if (rbio->bounce) {
1533                         struct bvec_iter src_iter = src->bi_iter;
1534                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1535                 }
1536         }
1537
1538         if (rbio->promote) {
1539                 /*
1540                  * Re encrypt data we decrypted, so it's consistent with
1541                  * rbio->crc:
1542                  */
1543                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1544                 promote_start(rbio->promote, rbio);
1545                 rbio->promote = NULL;
1546         }
1547 nodecode:
1548         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1549                 rbio = bch2_rbio_free(rbio);
1550                 bch2_rbio_done(rbio);
1551         }
1552         return;
1553 csum_err:
1554         /*
1555          * Checksum error: if the bio wasn't bounced, we may have been
1556          * reading into buffers owned by userspace (that userspace can
1557          * scribble over) - retry the read, bouncing it this time:
1558          */
1559         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1560                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1561                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1562                 return;
1563         }
1564
1565         bch2_dev_io_error(ca,
1566                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1567                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1568                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1569                 csum.hi, csum.lo, crc.csum_type);
1570         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1571         return;
1572 decompression_err:
1573         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1574                           rbio->pos.inode,
1575                           (u64) rbio->bvec_iter.bi_sector);
1576         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1577         return;
1578 }
1579
1580 static void bch2_read_endio(struct bio *bio)
1581 {
1582         struct bch_read_bio *rbio =
1583                 container_of(bio, struct bch_read_bio, bio);
1584         struct bch_fs *c        = rbio->c;
1585         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1586         struct workqueue_struct *wq = NULL;
1587         enum rbio_context context = RBIO_CONTEXT_NULL;
1588
1589         if (rbio->have_ioref) {
1590                 bch2_latency_acct(ca, rbio->submit_time, READ);
1591                 percpu_ref_put(&ca->io_ref);
1592         }
1593
1594         if (!rbio->split)
1595                 rbio->bio.bi_end_io = rbio->end_io;
1596
1597         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1598                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1599                 return;
1600         }
1601
1602         if (rbio->pick.ptr.cached &&
1603             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1604              ptr_stale(ca, &rbio->pick.ptr))) {
1605                 atomic_long_inc(&c->read_realloc_races);
1606
1607                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1608                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1609                 else
1610                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1611                 return;
1612         }
1613
1614         if (rbio->narrow_crcs ||
1615             rbio->pick.crc.compression_type ||
1616             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1617                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1618         else if (rbio->pick.crc.csum_type)
1619                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1620
1621         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1622 }
1623
1624 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1625                        struct bvec_iter iter, struct bkey_s_c k,
1626                        struct bch_io_failures *failed, unsigned flags)
1627 {
1628         struct extent_ptr_decoded pick;
1629         struct bch_read_bio *rbio = NULL;
1630         struct bch_dev *ca;
1631         struct promote_op *promote = NULL;
1632         bool bounce = false, read_full = false, narrow_crcs = false;
1633         struct bpos pos = bkey_start_pos(k.k);
1634         int pick_ret;
1635
1636         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1637
1638         /* hole or reservation - just zero fill: */
1639         if (!pick_ret)
1640                 goto hole;
1641
1642         if (pick_ret < 0) {
1643                 __bcache_io_error(c, "no device to read from");
1644                 goto err;
1645         }
1646
1647         if (pick_ret > 0)
1648                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1649
1650         if (flags & BCH_READ_NODECODE) {
1651                 /*
1652                  * can happen if we retry, and the extent we were going to read
1653                  * has been merged in the meantime:
1654                  */
1655                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1656                         goto hole;
1657
1658                 iter.bi_sector  = pos.offset;
1659                 iter.bi_size    = pick.crc.compressed_size << 9;
1660                 goto noclone;
1661         }
1662
1663         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1664             bio_flagged(&orig->bio, BIO_CHAIN))
1665                 flags |= BCH_READ_MUST_CLONE;
1666
1667         narrow_crcs = should_narrow_crcs(k, &pick, flags);
1668
1669         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1670                 flags |= BCH_READ_MUST_BOUNCE;
1671
1672         EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
1673                 k.k->p.offset < bvec_iter_end_sector(iter));
1674
1675         if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
1676             (pick.crc.csum_type != BCH_CSUM_NONE &&
1677              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1678               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
1679                (flags & BCH_READ_USER_MAPPED)) ||
1680               (flags & BCH_READ_MUST_BOUNCE)))) {
1681                 read_full = true;
1682                 bounce = true;
1683         }
1684
1685         promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
1686                                 &rbio, &bounce, &read_full);
1687
1688         if (!read_full) {
1689                 EBUG_ON(pick.crc.compression_type);
1690                 EBUG_ON(pick.crc.csum_type &&
1691                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1692                          bvec_iter_sectors(iter) != pick.crc.live_size ||
1693                          pick.crc.offset ||
1694                          iter.bi_sector != pos.offset));
1695
1696                 pick.ptr.offset += pick.crc.offset +
1697                         (iter.bi_sector - pos.offset);
1698                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
1699                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
1700                 pick.crc.offset                 = 0;
1701                 pick.crc.live_size              = bvec_iter_sectors(iter);
1702                 pos.offset                      = iter.bi_sector;
1703         }
1704
1705         if (rbio) {
1706                 /* promote already allocated bounce rbio */
1707         } else if (bounce) {
1708                 unsigned sectors = pick.crc.compressed_size;
1709
1710                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
1711                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
1712                                                   &c->bio_read_split),
1713                                  orig->opts);
1714
1715                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1716                 rbio->bounce    = true;
1717                 rbio->split     = true;
1718         } else if (flags & BCH_READ_MUST_CLONE) {
1719                 /*
1720                  * Have to clone if there were any splits, due to error
1721                  * reporting issues (if a split errored, and retrying didn't
1722                  * work, when it reports the error to its parent (us) we don't
1723                  * know if the error was from our bio, and we should retry, or
1724                  * from the whole bio, in which case we don't want to retry and
1725                  * lose the error)
1726                  */
1727                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
1728                                                 &c->bio_read_split),
1729                                  orig->opts);
1730                 rbio->bio.bi_iter = iter;
1731                 rbio->split     = true;
1732         } else {
1733 noclone:
1734                 rbio = orig;
1735                 rbio->bio.bi_iter = iter;
1736                 BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1737         }
1738
1739         BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
1740
1741         rbio->c                 = c;
1742         rbio->submit_time       = local_clock();
1743         if (rbio->split)
1744                 rbio->parent    = orig;
1745         else
1746                 rbio->end_io    = orig->bio.bi_end_io;
1747         rbio->bvec_iter         = iter;
1748         rbio->flags             = flags;
1749         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
1750         rbio->narrow_crcs       = narrow_crcs;
1751         rbio->hole              = 0;
1752         rbio->retry             = 0;
1753         rbio->context           = 0;
1754         rbio->devs_have         = bch2_bkey_devs(k);
1755         rbio->pick              = pick;
1756         rbio->pos               = pos;
1757         rbio->version           = k.k->version;
1758         rbio->promote           = promote;
1759         INIT_WORK(&rbio->work, NULL);
1760
1761         rbio->bio.bi_opf        = orig->bio.bi_opf;
1762         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
1763         rbio->bio.bi_end_io     = bch2_read_endio;
1764
1765         if (rbio->bounce)
1766                 trace_read_bounce(&rbio->bio);
1767
1768         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1769
1770         percpu_down_read_preempt_disable(&c->mark_lock);
1771         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
1772         percpu_up_read_preempt_enable(&c->mark_lock);
1773
1774         if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
1775                 bio_inc_remaining(&orig->bio);
1776                 trace_read_split(&orig->bio);
1777         }
1778
1779         if (!rbio->pick.idx) {
1780                 if (!rbio->have_ioref) {
1781                         __bcache_io_error(c, "no device to read from");
1782                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1783                         goto out;
1784                 }
1785
1786                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
1787                              bio_sectors(&rbio->bio));
1788                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
1789
1790                 if (likely(!(flags & BCH_READ_IN_RETRY)))
1791                         submit_bio(&rbio->bio);
1792                 else
1793                         submit_bio_wait(&rbio->bio);
1794         } else {
1795                 /* Attempting reconstruct read: */
1796                 if (bch2_ec_read_extent(c, rbio)) {
1797                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1798                         goto out;
1799                 }
1800
1801                 if (likely(!(flags & BCH_READ_IN_RETRY)))
1802                         bio_endio(&rbio->bio);
1803         }
1804 out:
1805         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1806                 return 0;
1807         } else {
1808                 int ret;
1809
1810                 rbio->context = RBIO_CONTEXT_UNBOUND;
1811                 bch2_read_endio(&rbio->bio);
1812
1813                 ret = rbio->retry;
1814                 rbio = bch2_rbio_free(rbio);
1815
1816                 if (ret == READ_RETRY_AVOID) {
1817                         bch2_mark_io_failure(failed, &pick);
1818                         ret = READ_RETRY;
1819                 }
1820
1821                 return ret;
1822         }
1823
1824 err:
1825         if (flags & BCH_READ_IN_RETRY)
1826                 return READ_ERR;
1827
1828         orig->bio.bi_status = BLK_STS_IOERR;
1829         goto out_read_done;
1830
1831 hole:
1832         /*
1833          * won't normally happen in the BCH_READ_NODECODE
1834          * (bch2_move_extent()) path, but if we retry and the extent we wanted
1835          * to read no longer exists we have to signal that:
1836          */
1837         if (flags & BCH_READ_NODECODE)
1838                 orig->hole = true;
1839
1840         zero_fill_bio_iter(&orig->bio, iter);
1841 out_read_done:
1842         if (flags & BCH_READ_LAST_FRAGMENT)
1843                 bch2_rbio_done(orig);
1844         return 0;
1845 }
1846
1847 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
1848 {
1849         struct btree_trans trans;
1850         struct btree_iter *iter;
1851         struct bkey_s_c k;
1852         unsigned flags = BCH_READ_RETRY_IF_STALE|
1853                 BCH_READ_MAY_PROMOTE|
1854                 BCH_READ_USER_MAPPED;
1855         int ret;
1856
1857         bch2_trans_init(&trans, c, 0, 0);
1858
1859         BUG_ON(rbio->_state);
1860         BUG_ON(flags & BCH_READ_NODECODE);
1861         BUG_ON(flags & BCH_READ_IN_RETRY);
1862
1863         rbio->c = c;
1864         rbio->start_time = local_clock();
1865
1866         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1867                            POS(inode, rbio->bio.bi_iter.bi_sector),
1868                            BTREE_ITER_SLOTS, k, ret) {
1869                 BKEY_PADDED(k) tmp;
1870                 unsigned bytes;
1871
1872                 /*
1873                  * Unlock the iterator while the btree node's lock is still in
1874                  * cache, before doing the IO:
1875                  */
1876                 bkey_reassemble(&tmp.k, k);
1877                 k = bkey_i_to_s_c(&tmp.k);
1878                 bch2_trans_unlock(&trans);
1879
1880                 bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
1881                               (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
1882                 swap(rbio->bio.bi_iter.bi_size, bytes);
1883
1884                 if (rbio->bio.bi_iter.bi_size == bytes)
1885                         flags |= BCH_READ_LAST_FRAGMENT;
1886
1887                 bch2_read_extent(c, rbio, k, flags);
1888
1889                 if (flags & BCH_READ_LAST_FRAGMENT)
1890                         return;
1891
1892                 swap(rbio->bio.bi_iter.bi_size, bytes);
1893                 bio_advance(&rbio->bio, bytes);
1894         }
1895
1896         /*
1897          * If we get here, it better have been because there was an error
1898          * reading a btree node
1899          */
1900         BUG_ON(!ret);
1901         bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
1902
1903         bch2_trans_exit(&trans);
1904         bch2_rbio_done(rbio);
1905 }
1906
1907 void bch2_fs_io_exit(struct bch_fs *c)
1908 {
1909         if (c->promote_table.tbl)
1910                 rhashtable_destroy(&c->promote_table);
1911         mempool_exit(&c->bio_bounce_pages);
1912         bioset_exit(&c->bio_write);
1913         bioset_exit(&c->bio_read_split);
1914         bioset_exit(&c->bio_read);
1915 }
1916
1917 int bch2_fs_io_init(struct bch_fs *c)
1918 {
1919         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
1920                         BIOSET_NEED_BVECS) ||
1921             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
1922                         BIOSET_NEED_BVECS) ||
1923             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
1924                         BIOSET_NEED_BVECS) ||
1925             mempool_init_page_pool(&c->bio_bounce_pages,
1926                                    max_t(unsigned,
1927                                          c->opts.btree_node_size,
1928                                          c->sb.encoded_extent_max) /
1929                                    PAGE_SECTORS, 0) ||
1930             rhashtable_init(&c->promote_table, &bch_promote_params))
1931                 return -ENOMEM;
1932
1933         return 0;
1934 }