]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/io.c
Update bcachefs sources to d372ddcbfa bcachefs: Reorganize extents.c
[bcachefs-tools-debian] / libbcachefs / io.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "alloc_foreground.h"
11 #include "bkey_on_stack.h"
12 #include "bset.h"
13 #include "btree_update.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "compress.h"
17 #include "clock.h"
18 #include "debug.h"
19 #include "disk_groups.h"
20 #include "ec.h"
21 #include "error.h"
22 #include "extent_update.h"
23 #include "inode.h"
24 #include "io.h"
25 #include "journal.h"
26 #include "keylist.h"
27 #include "move.h"
28 #include "rebalance.h"
29 #include "super.h"
30 #include "super-io.h"
31
32 #include <linux/blkdev.h>
33 #include <linux/random.h>
34
35 #include <trace/events/bcachefs.h>
36
37 static bool bch2_target_congested(struct bch_fs *c, u16 target)
38 {
39         const struct bch_devs_mask *devs;
40         unsigned d, nr = 0, total = 0;
41         u64 now = local_clock(), last;
42         s64 congested;
43         struct bch_dev *ca;
44
45         if (!target)
46                 return false;
47
48         rcu_read_lock();
49         devs = bch2_target_to_mask(c, target);
50         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
51                 ca = rcu_dereference(c->devs[d]);
52                 if (!ca)
53                         continue;
54
55                 congested = atomic_read(&ca->congested);
56                 last = READ_ONCE(ca->congested_last);
57                 if (time_after64(now, last))
58                         congested -= (now - last) >> 12;
59
60                 total += max(congested, 0LL);
61                 nr++;
62         }
63         rcu_read_unlock();
64
65         return bch2_rand_range(nr * CONGESTED_MAX) < total;
66 }
67
68 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
69                                        u64 now, int rw)
70 {
71         u64 latency_capable =
72                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
73         /* ideally we'd be taking into account the device's variance here: */
74         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
75         s64 latency_over = io_latency - latency_threshold;
76
77         if (latency_threshold && latency_over > 0) {
78                 /*
79                  * bump up congested by approximately latency_over * 4 /
80                  * latency_threshold - we don't need much accuracy here so don't
81                  * bother with the divide:
82                  */
83                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
84                         atomic_add(latency_over >>
85                                    max_t(int, ilog2(latency_threshold) - 2, 0),
86                                    &ca->congested);
87
88                 ca->congested_last = now;
89         } else if (atomic_read(&ca->congested) > 0) {
90                 atomic_dec(&ca->congested);
91         }
92 }
93
94 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
95 {
96         atomic64_t *latency = &ca->cur_latency[rw];
97         u64 now = local_clock();
98         u64 io_latency = time_after64(now, submit_time)
99                 ? now - submit_time
100                 : 0;
101         u64 old, new, v = atomic64_read(latency);
102
103         do {
104                 old = v;
105
106                 /*
107                  * If the io latency was reasonably close to the current
108                  * latency, skip doing the update and atomic operation - most of
109                  * the time:
110                  */
111                 if (abs((int) (old - io_latency)) < (old >> 1) &&
112                     now & ~(~0 << 5))
113                         break;
114
115                 new = ewma_add(old, io_latency, 5);
116         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
117
118         bch2_congested_acct(ca, io_latency, now, rw);
119
120         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
121 }
122
123 /* Allocate, free from mempool: */
124
125 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
126 {
127         struct bvec_iter_all iter;
128         struct bio_vec *bv;
129
130         bio_for_each_segment_all(bv, bio, iter)
131                 if (bv->bv_page != ZERO_PAGE(0))
132                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
133         bio->bi_vcnt = 0;
134 }
135
136 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
137 {
138         struct page *page;
139
140         if (likely(!*using_mempool)) {
141                 page = alloc_page(GFP_NOIO);
142                 if (unlikely(!page)) {
143                         mutex_lock(&c->bio_bounce_pages_lock);
144                         *using_mempool = true;
145                         goto pool_alloc;
146
147                 }
148         } else {
149 pool_alloc:
150                 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
151         }
152
153         return page;
154 }
155
156 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
157                                size_t size)
158 {
159         bool using_mempool = false;
160
161         while (size) {
162                 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
163                 unsigned len = min(PAGE_SIZE, size);
164
165                 BUG_ON(!bio_add_page(bio, page, len, 0));
166                 size -= len;
167         }
168
169         if (using_mempool)
170                 mutex_unlock(&c->bio_bounce_pages_lock);
171 }
172
173 /* Extent update path: */
174
175 static int sum_sector_overwrites(struct btree_trans *trans,
176                                  struct btree_iter *extent_iter,
177                                  struct bkey_i *new,
178                                  bool may_allocate,
179                                  bool *maybe_extending,
180                                  s64 *delta)
181 {
182         struct btree_iter *iter;
183         struct bkey_s_c old;
184         int ret = 0;
185
186         *maybe_extending = true;
187         *delta = 0;
188
189         iter = bch2_trans_copy_iter(trans, extent_iter);
190         if (IS_ERR(iter))
191                 return PTR_ERR(iter);
192
193         for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
194                 if (!may_allocate &&
195                     bch2_bkey_nr_ptrs_fully_allocated(old) <
196                     bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
197                         ret = -ENOSPC;
198                         break;
199                 }
200
201                 *delta += (min(new->k.p.offset,
202                               old.k->p.offset) -
203                           max(bkey_start_offset(&new->k),
204                               bkey_start_offset(old.k))) *
205                         (bkey_extent_is_allocation(&new->k) -
206                          bkey_extent_is_allocation(old.k));
207
208                 if (bkey_cmp(old.k->p, new->k.p) >= 0) {
209                         /*
210                          * Check if there's already data above where we're
211                          * going to be writing to - this means we're definitely
212                          * not extending the file:
213                          *
214                          * Note that it's not sufficient to check if there's
215                          * data up to the sector offset we're going to be
216                          * writing to, because i_size could be up to one block
217                          * less:
218                          */
219                         if (!bkey_cmp(old.k->p, new->k.p))
220                                 old = bch2_btree_iter_next(iter);
221
222                         if (old.k && !bkey_err(old) &&
223                             old.k->p.inode == extent_iter->pos.inode &&
224                             bkey_extent_is_data(old.k))
225                                 *maybe_extending = false;
226
227                         break;
228                 }
229         }
230
231         bch2_trans_iter_put(trans, iter);
232         return ret;
233 }
234
235 int bch2_extent_update(struct btree_trans *trans,
236                        struct btree_iter *iter,
237                        struct bkey_i *k,
238                        struct disk_reservation *disk_res,
239                        u64 *journal_seq,
240                        u64 new_i_size,
241                        s64 *i_sectors_delta)
242 {
243         /* this must live until after bch2_trans_commit(): */
244         struct bkey_inode_buf inode_p;
245         bool extending = false;
246         s64 delta = 0;
247         int ret;
248
249         ret = bch2_extent_trim_atomic(k, iter);
250         if (ret)
251                 return ret;
252
253         ret = sum_sector_overwrites(trans, iter, k,
254                         disk_res && disk_res->sectors != 0,
255                         &extending, &delta);
256         if (ret)
257                 return ret;
258
259         new_i_size = extending
260                 ? min(k->k.p.offset << 9, new_i_size)
261                 : 0;
262
263         if (delta || new_i_size) {
264                 struct btree_iter *inode_iter;
265                 struct bch_inode_unpacked inode_u;
266
267                 inode_iter = bch2_inode_peek(trans, &inode_u,
268                                 k->k.p.inode, BTREE_ITER_INTENT);
269                 if (IS_ERR(inode_iter))
270                         return PTR_ERR(inode_iter);
271
272                 /*
273                  * XXX:
274                  * writeback can race a bit with truncate, because truncate
275                  * first updates the inode then truncates the pagecache. This is
276                  * ugly, but lets us preserve the invariant that the in memory
277                  * i_size is always >= the on disk i_size.
278                  *
279                 BUG_ON(new_i_size > inode_u.bi_size &&
280                        (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
281                  */
282                 BUG_ON(new_i_size > inode_u.bi_size && !extending);
283
284                 if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
285                     new_i_size > inode_u.bi_size)
286                         inode_u.bi_size = new_i_size;
287                 else
288                         new_i_size = 0;
289
290                 inode_u.bi_sectors += delta;
291
292                 if (delta || new_i_size) {
293                         bch2_inode_pack(&inode_p, &inode_u);
294                         bch2_trans_update(trans, inode_iter,
295                                           &inode_p.inode.k_i);
296                 }
297
298                 bch2_trans_iter_put(trans, inode_iter);
299         }
300
301         bch2_trans_update(trans, iter, k);
302
303         ret = bch2_trans_commit(trans, disk_res, journal_seq,
304                                 BTREE_INSERT_NOCHECK_RW|
305                                 BTREE_INSERT_NOFAIL|
306                                 BTREE_INSERT_ATOMIC|
307                                 BTREE_INSERT_USE_RESERVE);
308         if (!ret && i_sectors_delta)
309                 *i_sectors_delta += delta;
310
311         return ret;
312 }
313
314 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
315                    struct bpos end, u64 *journal_seq,
316                    s64 *i_sectors_delta)
317 {
318         struct bch_fs *c        = trans->c;
319         unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
320         struct bkey_s_c k;
321         int ret = 0, ret2 = 0;
322
323         while ((k = bch2_btree_iter_peek(iter)).k &&
324                bkey_cmp(iter->pos, end) < 0) {
325                 struct disk_reservation disk_res =
326                         bch2_disk_reservation_init(c, 0);
327                 struct bkey_i delete;
328
329                 ret = bkey_err(k);
330                 if (ret)
331                         goto btree_err;
332
333                 bkey_init(&delete.k);
334                 delete.k.p = iter->pos;
335
336                 /* create the biggest key we can */
337                 bch2_key_resize(&delete.k, max_sectors);
338                 bch2_cut_back(end, &delete);
339
340                 bch2_trans_begin_updates(trans);
341
342                 ret = bch2_extent_update(trans, iter, &delete,
343                                 &disk_res, journal_seq,
344                                 0, i_sectors_delta);
345                 bch2_disk_reservation_put(c, &disk_res);
346 btree_err:
347                 if (ret == -EINTR) {
348                         ret2 = ret;
349                         ret = 0;
350                 }
351                 if (ret)
352                         break;
353         }
354
355         if (bkey_cmp(iter->pos, end) > 0) {
356                 bch2_btree_iter_set_pos(iter, end);
357                 ret = bch2_btree_iter_traverse(iter);
358         }
359
360         return ret ?: ret2;
361 }
362
363 int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
364                 u64 *journal_seq, s64 *i_sectors_delta)
365 {
366         struct btree_trans trans;
367         struct btree_iter *iter;
368         int ret = 0;
369
370         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
371         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
372                                    POS(inum, start),
373                                    BTREE_ITER_INTENT);
374
375         ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
376                              journal_seq, i_sectors_delta);
377         bch2_trans_exit(&trans);
378
379         if (ret == -EINTR)
380                 ret = 0;
381
382         return ret;
383 }
384
385 int bch2_write_index_default(struct bch_write_op *op)
386 {
387         struct bch_fs *c = op->c;
388         struct bkey_on_stack sk;
389         struct keylist *keys = &op->insert_keys;
390         struct bkey_i *k = bch2_keylist_front(keys);
391         struct btree_trans trans;
392         struct btree_iter *iter;
393         int ret;
394
395         bkey_on_stack_init(&sk);
396         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
397
398         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
399                                    bkey_start_pos(&k->k),
400                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
401
402         do {
403                 k = bch2_keylist_front(keys);
404
405                 bkey_on_stack_realloc(&sk, c, k->k.u64s);
406                 bkey_copy(sk.k, k);
407                 bch2_cut_front(iter->pos, sk.k);
408
409                 bch2_trans_begin_updates(&trans);
410
411                 ret = bch2_extent_update(&trans, iter, sk.k,
412                                          &op->res, op_journal_seq(op),
413                                          op->new_i_size, &op->i_sectors_delta);
414                 if (ret == -EINTR)
415                         continue;
416                 if (ret)
417                         break;
418
419                 if (bkey_cmp(iter->pos, k->k.p) >= 0)
420                         bch2_keylist_pop_front(keys);
421         } while (!bch2_keylist_empty(keys));
422
423         bch2_trans_exit(&trans);
424         bkey_on_stack_exit(&sk, c);
425
426         return ret;
427 }
428
429 /* Writes */
430
431 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
432                                enum bch_data_type type,
433                                const struct bkey_i *k)
434 {
435         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
436         const struct bch_extent_ptr *ptr;
437         struct bch_write_bio *n;
438         struct bch_dev *ca;
439
440         BUG_ON(c->opts.nochanges);
441
442         bkey_for_each_ptr(ptrs, ptr) {
443                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
444                        !c->devs[ptr->dev]);
445
446                 ca = bch_dev_bkey_exists(c, ptr->dev);
447
448                 if (to_entry(ptr + 1) < ptrs.end) {
449                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
450                                                    &ca->replica_set));
451
452                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
453                         n->bio.bi_private       = wbio->bio.bi_private;
454                         n->parent               = wbio;
455                         n->split                = true;
456                         n->bounce               = false;
457                         n->put_bio              = true;
458                         n->bio.bi_opf           = wbio->bio.bi_opf;
459                         bio_inc_remaining(&wbio->bio);
460                 } else {
461                         n = wbio;
462                         n->split                = false;
463                 }
464
465                 n->c                    = c;
466                 n->dev                  = ptr->dev;
467                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
468                 n->submit_time          = local_clock();
469                 n->bio.bi_iter.bi_sector = ptr->offset;
470
471                 if (!journal_flushes_device(ca))
472                         n->bio.bi_opf |= REQ_FUA;
473
474                 if (likely(n->have_ioref)) {
475                         this_cpu_add(ca->io_done->sectors[WRITE][type],
476                                      bio_sectors(&n->bio));
477
478                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
479                         submit_bio(&n->bio);
480                 } else {
481                         n->bio.bi_status        = BLK_STS_REMOVED;
482                         bio_endio(&n->bio);
483                 }
484         }
485 }
486
487 static void __bch2_write(struct closure *);
488
489 static void bch2_write_done(struct closure *cl)
490 {
491         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
492         struct bch_fs *c = op->c;
493
494         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
495                 op->error = bch2_journal_error(&c->journal);
496
497         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
498                 bch2_disk_reservation_put(c, &op->res);
499         percpu_ref_put(&c->writes);
500         bch2_keylist_free(&op->insert_keys, op->inline_keys);
501
502         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
503
504         if (op->end_io)
505                 op->end_io(op);
506         if (cl->parent)
507                 closure_return(cl);
508         else
509                 closure_debug_destroy(cl);
510 }
511
512 /**
513  * bch_write_index - after a write, update index to point to new data
514  */
515 static void __bch2_write_index(struct bch_write_op *op)
516 {
517         struct bch_fs *c = op->c;
518         struct keylist *keys = &op->insert_keys;
519         struct bch_extent_ptr *ptr;
520         struct bkey_i *src, *dst = keys->keys, *n, *k;
521         unsigned dev;
522         int ret;
523
524         for (src = keys->keys; src != keys->top; src = n) {
525                 n = bkey_next(src);
526
527                 if (bkey_extent_is_direct_data(&src->k)) {
528                         bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
529                                             test_bit(ptr->dev, op->failed.d));
530
531                         if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
532                                 ret = -EIO;
533                                 goto err;
534                         }
535                 }
536
537                 if (dst != src)
538                         memmove_u64s_down(dst, src, src->u64s);
539                 dst = bkey_next(dst);
540         }
541
542         keys->top = dst;
543
544         /*
545          * probably not the ideal place to hook this in, but I don't
546          * particularly want to plumb io_opts all the way through the btree
547          * update stack right now
548          */
549         for_each_keylist_key(keys, k)
550                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
551
552         if (!bch2_keylist_empty(keys)) {
553                 u64 sectors_start = keylist_sectors(keys);
554                 int ret = op->index_update_fn(op);
555
556                 BUG_ON(ret == -EINTR);
557                 BUG_ON(keylist_sectors(keys) && !ret);
558
559                 op->written += sectors_start - keylist_sectors(keys);
560
561                 if (ret) {
562                         __bcache_io_error(c, "btree IO error %i", ret);
563                         op->error = ret;
564                 }
565         }
566 out:
567         /* If some a bucket wasn't written, we can't erasure code it: */
568         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
569                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
570
571         bch2_open_buckets_put(c, &op->open_buckets);
572         return;
573 err:
574         keys->top = keys->keys;
575         op->error = ret;
576         goto out;
577 }
578
579 static void bch2_write_index(struct closure *cl)
580 {
581         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
582         struct bch_fs *c = op->c;
583
584         __bch2_write_index(op);
585
586         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
587                 bch2_journal_flush_seq_async(&c->journal,
588                                              *op_journal_seq(op),
589                                              cl);
590                 continue_at(cl, bch2_write_done, index_update_wq(op));
591         } else {
592                 continue_at_nobarrier(cl, bch2_write_done, NULL);
593         }
594 }
595
596 static void bch2_write_endio(struct bio *bio)
597 {
598         struct closure *cl              = bio->bi_private;
599         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
600         struct bch_write_bio *wbio      = to_wbio(bio);
601         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
602         struct bch_fs *c                = wbio->c;
603         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
604
605         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
606                 set_bit(wbio->dev, op->failed.d);
607
608         if (wbio->have_ioref) {
609                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
610                 percpu_ref_put(&ca->io_ref);
611         }
612
613         if (wbio->bounce)
614                 bch2_bio_free_pages_pool(c, bio);
615
616         if (wbio->put_bio)
617                 bio_put(bio);
618
619         if (parent)
620                 bio_endio(&parent->bio);
621         else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
622                 closure_put(cl);
623         else
624                 continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
625 }
626
627 static void init_append_extent(struct bch_write_op *op,
628                                struct write_point *wp,
629                                struct bversion version,
630                                struct bch_extent_crc_unpacked crc)
631 {
632         struct bch_fs *c = op->c;
633         struct bkey_i_extent *e;
634         struct open_bucket *ob;
635         unsigned i;
636
637         BUG_ON(crc.compressed_size > wp->sectors_free);
638         wp->sectors_free -= crc.compressed_size;
639         op->pos.offset += crc.uncompressed_size;
640
641         e = bkey_extent_init(op->insert_keys.top);
642         e->k.p          = op->pos;
643         e->k.size       = crc.uncompressed_size;
644         e->k.version    = version;
645
646         if (crc.csum_type ||
647             crc.compression_type ||
648             crc.nonce)
649                 bch2_extent_crc_append(&e->k_i, crc);
650
651         open_bucket_for_each(c, &wp->ptrs, ob, i) {
652                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
653                 union bch_extent_entry *end =
654                         bkey_val_end(bkey_i_to_s(&e->k_i));
655
656                 end->ptr = ob->ptr;
657                 end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
658                 end->ptr.cached = !ca->mi.durability ||
659                         (op->flags & BCH_WRITE_CACHED) != 0;
660                 end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
661
662                 e->k.u64s++;
663
664                 BUG_ON(crc.compressed_size > ob->sectors_free);
665                 ob->sectors_free -= crc.compressed_size;
666         }
667
668         bch2_keylist_push(&op->insert_keys);
669 }
670
671 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
672                                         struct write_point *wp,
673                                         struct bio *src,
674                                         bool *page_alloc_failed,
675                                         void *buf)
676 {
677         struct bch_write_bio *wbio;
678         struct bio *bio;
679         unsigned output_available =
680                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
681         unsigned pages = DIV_ROUND_UP(output_available +
682                                       (buf
683                                        ? ((unsigned long) buf & (PAGE_SIZE - 1))
684                                        : 0), PAGE_SIZE);
685
686         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
687         wbio                    = wbio_init(bio);
688         wbio->put_bio           = true;
689         /* copy WRITE_SYNC flag */
690         wbio->bio.bi_opf        = src->bi_opf;
691
692         if (buf) {
693                 bch2_bio_map(bio, buf, output_available);
694                 return bio;
695         }
696
697         wbio->bounce            = true;
698
699         /*
700          * We can't use mempool for more than c->sb.encoded_extent_max
701          * worth of pages, but we'd like to allocate more if we can:
702          */
703         bch2_bio_alloc_pages_pool(c, bio,
704                                   min_t(unsigned, output_available,
705                                         c->sb.encoded_extent_max << 9));
706
707         if (bio->bi_iter.bi_size < output_available)
708                 *page_alloc_failed =
709                         bch2_bio_alloc_pages(bio,
710                                              output_available -
711                                              bio->bi_iter.bi_size,
712                                              GFP_NOFS) != 0;
713
714         return bio;
715 }
716
717 static int bch2_write_rechecksum(struct bch_fs *c,
718                                  struct bch_write_op *op,
719                                  unsigned new_csum_type)
720 {
721         struct bio *bio = &op->wbio.bio;
722         struct bch_extent_crc_unpacked new_crc;
723         int ret;
724
725         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
726
727         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
728             bch2_csum_type_is_encryption(new_csum_type))
729                 new_csum_type = op->crc.csum_type;
730
731         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
732                                   NULL, &new_crc,
733                                   op->crc.offset, op->crc.live_size,
734                                   new_csum_type);
735         if (ret)
736                 return ret;
737
738         bio_advance(bio, op->crc.offset << 9);
739         bio->bi_iter.bi_size = op->crc.live_size << 9;
740         op->crc = new_crc;
741         return 0;
742 }
743
744 static int bch2_write_decrypt(struct bch_write_op *op)
745 {
746         struct bch_fs *c = op->c;
747         struct nonce nonce = extent_nonce(op->version, op->crc);
748         struct bch_csum csum;
749
750         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
751                 return 0;
752
753         /*
754          * If we need to decrypt data in the write path, we'll no longer be able
755          * to verify the existing checksum (poly1305 mac, in this case) after
756          * it's decrypted - this is the last point we'll be able to reverify the
757          * checksum:
758          */
759         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
760         if (bch2_crc_cmp(op->crc.csum, csum))
761                 return -EIO;
762
763         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
764         op->crc.csum_type = 0;
765         op->crc.csum = (struct bch_csum) { 0, 0 };
766         return 0;
767 }
768
769 static enum prep_encoded_ret {
770         PREP_ENCODED_OK,
771         PREP_ENCODED_ERR,
772         PREP_ENCODED_CHECKSUM_ERR,
773         PREP_ENCODED_DO_WRITE,
774 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
775 {
776         struct bch_fs *c = op->c;
777         struct bio *bio = &op->wbio.bio;
778
779         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
780                 return PREP_ENCODED_OK;
781
782         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
783
784         /* Can we just write the entire extent as is? */
785         if (op->crc.uncompressed_size == op->crc.live_size &&
786             op->crc.compressed_size <= wp->sectors_free &&
787             op->crc.compression_type == op->compression_type) {
788                 if (!op->crc.compression_type &&
789                     op->csum_type != op->crc.csum_type &&
790                     bch2_write_rechecksum(c, op, op->csum_type))
791                         return PREP_ENCODED_CHECKSUM_ERR;
792
793                 return PREP_ENCODED_DO_WRITE;
794         }
795
796         /*
797          * If the data is compressed and we couldn't write the entire extent as
798          * is, we have to decompress it:
799          */
800         if (op->crc.compression_type) {
801                 struct bch_csum csum;
802
803                 if (bch2_write_decrypt(op))
804                         return PREP_ENCODED_CHECKSUM_ERR;
805
806                 /* Last point we can still verify checksum: */
807                 csum = bch2_checksum_bio(c, op->crc.csum_type,
808                                          extent_nonce(op->version, op->crc),
809                                          bio);
810                 if (bch2_crc_cmp(op->crc.csum, csum))
811                         return PREP_ENCODED_CHECKSUM_ERR;
812
813                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
814                         return PREP_ENCODED_ERR;
815         }
816
817         /*
818          * No longer have compressed data after this point - data might be
819          * encrypted:
820          */
821
822         /*
823          * If the data is checksummed and we're only writing a subset,
824          * rechecksum and adjust bio to point to currently live data:
825          */
826         if ((op->crc.live_size != op->crc.uncompressed_size ||
827              op->crc.csum_type != op->csum_type) &&
828             bch2_write_rechecksum(c, op, op->csum_type))
829                 return PREP_ENCODED_CHECKSUM_ERR;
830
831         /*
832          * If we want to compress the data, it has to be decrypted:
833          */
834         if ((op->compression_type ||
835              bch2_csum_type_is_encryption(op->crc.csum_type) !=
836              bch2_csum_type_is_encryption(op->csum_type)) &&
837             bch2_write_decrypt(op))
838                 return PREP_ENCODED_CHECKSUM_ERR;
839
840         return PREP_ENCODED_OK;
841 }
842
843 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
844                              struct bio **_dst)
845 {
846         struct bch_fs *c = op->c;
847         struct bio *src = &op->wbio.bio, *dst = src;
848         struct bvec_iter saved_iter;
849         void *ec_buf;
850         struct bpos ec_pos = op->pos;
851         unsigned total_output = 0, total_input = 0;
852         bool bounce = false;
853         bool page_alloc_failed = false;
854         int ret, more = 0;
855
856         BUG_ON(!bio_sectors(src));
857
858         ec_buf = bch2_writepoint_ec_buf(c, wp);
859
860         switch (bch2_write_prep_encoded_data(op, wp)) {
861         case PREP_ENCODED_OK:
862                 break;
863         case PREP_ENCODED_ERR:
864                 ret = -EIO;
865                 goto err;
866         case PREP_ENCODED_CHECKSUM_ERR:
867                 goto csum_err;
868         case PREP_ENCODED_DO_WRITE:
869                 /* XXX look for bug here */
870                 if (ec_buf) {
871                         dst = bch2_write_bio_alloc(c, wp, src,
872                                                    &page_alloc_failed,
873                                                    ec_buf);
874                         bio_copy_data(dst, src);
875                         bounce = true;
876                 }
877                 init_append_extent(op, wp, op->version, op->crc);
878                 goto do_write;
879         }
880
881         if (ec_buf ||
882             op->compression_type ||
883             (op->csum_type &&
884              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
885             (bch2_csum_type_is_encryption(op->csum_type) &&
886              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
887                 dst = bch2_write_bio_alloc(c, wp, src,
888                                            &page_alloc_failed,
889                                            ec_buf);
890                 bounce = true;
891         }
892
893         saved_iter = dst->bi_iter;
894
895         do {
896                 struct bch_extent_crc_unpacked crc =
897                         (struct bch_extent_crc_unpacked) { 0 };
898                 struct bversion version = op->version;
899                 size_t dst_len, src_len;
900
901                 if (page_alloc_failed &&
902                     bio_sectors(dst) < wp->sectors_free &&
903                     bio_sectors(dst) < c->sb.encoded_extent_max)
904                         break;
905
906                 BUG_ON(op->compression_type &&
907                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
908                        bch2_csum_type_is_encryption(op->crc.csum_type));
909                 BUG_ON(op->compression_type && !bounce);
910
911                 crc.compression_type = op->compression_type
912                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
913                                              op->compression_type)
914                         : 0;
915                 if (!crc.compression_type) {
916                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
917                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
918
919                         if (op->csum_type)
920                                 dst_len = min_t(unsigned, dst_len,
921                                                 c->sb.encoded_extent_max << 9);
922
923                         if (bounce) {
924                                 swap(dst->bi_iter.bi_size, dst_len);
925                                 bio_copy_data(dst, src);
926                                 swap(dst->bi_iter.bi_size, dst_len);
927                         }
928
929                         src_len = dst_len;
930                 }
931
932                 BUG_ON(!src_len || !dst_len);
933
934                 if (bch2_csum_type_is_encryption(op->csum_type)) {
935                         if (bversion_zero(version)) {
936                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
937                         } else {
938                                 crc.nonce = op->nonce;
939                                 op->nonce += src_len >> 9;
940                         }
941                 }
942
943                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
944                     !crc.compression_type &&
945                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
946                     bch2_csum_type_is_encryption(op->csum_type)) {
947                         /*
948                          * Note: when we're using rechecksum(), we need to be
949                          * checksumming @src because it has all the data our
950                          * existing checksum covers - if we bounced (because we
951                          * were trying to compress), @dst will only have the
952                          * part of the data the new checksum will cover.
953                          *
954                          * But normally we want to be checksumming post bounce,
955                          * because part of the reason for bouncing is so the
956                          * data can't be modified (by userspace) while it's in
957                          * flight.
958                          */
959                         if (bch2_rechecksum_bio(c, src, version, op->crc,
960                                         &crc, &op->crc,
961                                         src_len >> 9,
962                                         bio_sectors(src) - (src_len >> 9),
963                                         op->csum_type))
964                                 goto csum_err;
965                 } else {
966                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
967                             bch2_rechecksum_bio(c, src, version, op->crc,
968                                         NULL, &op->crc,
969                                         src_len >> 9,
970                                         bio_sectors(src) - (src_len >> 9),
971                                         op->crc.csum_type))
972                                 goto csum_err;
973
974                         crc.compressed_size     = dst_len >> 9;
975                         crc.uncompressed_size   = src_len >> 9;
976                         crc.live_size           = src_len >> 9;
977
978                         swap(dst->bi_iter.bi_size, dst_len);
979                         bch2_encrypt_bio(c, op->csum_type,
980                                          extent_nonce(version, crc), dst);
981                         crc.csum = bch2_checksum_bio(c, op->csum_type,
982                                          extent_nonce(version, crc), dst);
983                         crc.csum_type = op->csum_type;
984                         swap(dst->bi_iter.bi_size, dst_len);
985                 }
986
987                 init_append_extent(op, wp, version, crc);
988
989                 if (dst != src)
990                         bio_advance(dst, dst_len);
991                 bio_advance(src, src_len);
992                 total_output    += dst_len;
993                 total_input     += src_len;
994         } while (dst->bi_iter.bi_size &&
995                  src->bi_iter.bi_size &&
996                  wp->sectors_free &&
997                  !bch2_keylist_realloc(&op->insert_keys,
998                                       op->inline_keys,
999                                       ARRAY_SIZE(op->inline_keys),
1000                                       BKEY_EXTENT_U64s_MAX));
1001
1002         more = src->bi_iter.bi_size != 0;
1003
1004         dst->bi_iter = saved_iter;
1005
1006         if (dst == src && more) {
1007                 BUG_ON(total_output != total_input);
1008
1009                 dst = bio_split(src, total_input >> 9,
1010                                 GFP_NOIO, &c->bio_write);
1011                 wbio_init(dst)->put_bio = true;
1012                 /* copy WRITE_SYNC flag */
1013                 dst->bi_opf             = src->bi_opf;
1014         }
1015
1016         dst->bi_iter.bi_size = total_output;
1017 do_write:
1018         /* might have done a realloc... */
1019         bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
1020
1021         *_dst = dst;
1022         return more;
1023 csum_err:
1024         bch_err(c, "error verifying existing checksum while "
1025                 "rewriting existing data (memory corruption?)");
1026         ret = -EIO;
1027 err:
1028         if (to_wbio(dst)->bounce)
1029                 bch2_bio_free_pages_pool(c, dst);
1030         if (to_wbio(dst)->put_bio)
1031                 bio_put(dst);
1032
1033         return ret;
1034 }
1035
1036 static void __bch2_write(struct closure *cl)
1037 {
1038         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1039         struct bch_fs *c = op->c;
1040         struct write_point *wp;
1041         struct bio *bio;
1042         bool skip_put = true;
1043         int ret;
1044 again:
1045         memset(&op->failed, 0, sizeof(op->failed));
1046
1047         do {
1048                 struct bkey_i *key_to_write;
1049                 unsigned key_to_write_offset = op->insert_keys.top_p -
1050                         op->insert_keys.keys_p;
1051
1052                 /* +1 for possible cache device: */
1053                 if (op->open_buckets.nr + op->nr_replicas + 1 >
1054                     ARRAY_SIZE(op->open_buckets.v))
1055                         goto flush_io;
1056
1057                 if (bch2_keylist_realloc(&op->insert_keys,
1058                                         op->inline_keys,
1059                                         ARRAY_SIZE(op->inline_keys),
1060                                         BKEY_EXTENT_U64s_MAX))
1061                         goto flush_io;
1062
1063                 wp = bch2_alloc_sectors_start(c,
1064                         op->target,
1065                         op->opts.erasure_code,
1066                         op->write_point,
1067                         &op->devs_have,
1068                         op->nr_replicas,
1069                         op->nr_replicas_required,
1070                         op->alloc_reserve,
1071                         op->flags,
1072                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
1073                 EBUG_ON(!wp);
1074
1075                 if (unlikely(IS_ERR(wp))) {
1076                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
1077                                 ret = PTR_ERR(wp);
1078                                 goto err;
1079                         }
1080
1081                         goto flush_io;
1082                 }
1083
1084                 bch2_open_bucket_get(c, wp, &op->open_buckets);
1085                 ret = bch2_write_extent(op, wp, &bio);
1086                 bch2_alloc_sectors_done(c, wp);
1087
1088                 if (ret < 0)
1089                         goto err;
1090
1091                 if (ret)
1092                         skip_put = false;
1093
1094                 bio->bi_end_io  = bch2_write_endio;
1095                 bio->bi_private = &op->cl;
1096                 bio->bi_opf |= REQ_OP_WRITE;
1097
1098                 if (!skip_put)
1099                         closure_get(bio->bi_private);
1100                 else
1101                         op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
1102
1103                 key_to_write = (void *) (op->insert_keys.keys_p +
1104                                          key_to_write_offset);
1105
1106                 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
1107                                           key_to_write);
1108         } while (ret);
1109
1110         if (!skip_put)
1111                 continue_at(cl, bch2_write_index, index_update_wq(op));
1112         return;
1113 err:
1114         op->error = ret;
1115
1116         continue_at(cl, bch2_write_index, index_update_wq(op));
1117         return;
1118 flush_io:
1119         closure_sync(cl);
1120
1121         if (!bch2_keylist_empty(&op->insert_keys)) {
1122                 __bch2_write_index(op);
1123
1124                 if (op->error) {
1125                         continue_at_nobarrier(cl, bch2_write_done, NULL);
1126                         return;
1127                 }
1128         }
1129
1130         goto again;
1131 }
1132
1133 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
1134 {
1135         struct closure *cl = &op->cl;
1136         struct bio *bio = &op->wbio.bio;
1137         struct bvec_iter iter;
1138         struct bkey_i_inline_data *id;
1139         unsigned sectors;
1140         int ret;
1141
1142         ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
1143                                    ARRAY_SIZE(op->inline_keys),
1144                                    BKEY_U64s + DIV_ROUND_UP(data_len, 8));
1145         if (ret) {
1146                 op->error = ret;
1147                 goto err;
1148         }
1149
1150         sectors = bio_sectors(bio);
1151         op->pos.offset += sectors;
1152
1153         id = bkey_inline_data_init(op->insert_keys.top);
1154         id->k.p         = op->pos;
1155         id->k.version   = op->version;
1156         id->k.size      = sectors;
1157
1158         iter = bio->bi_iter;
1159         iter.bi_size = data_len;
1160         memcpy_from_bio(id->v.data, bio, iter);
1161
1162         while (data_len & 7)
1163                 id->v.data[data_len++] = '\0';
1164         set_bkey_val_bytes(&id->k, data_len);
1165         bch2_keylist_push(&op->insert_keys);
1166
1167         op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
1168         continue_at_nobarrier(cl, bch2_write_index, NULL);
1169         return;
1170 err:
1171         bch2_write_done(&op->cl);
1172 }
1173
1174 /**
1175  * bch_write - handle a write to a cache device or flash only volume
1176  *
1177  * This is the starting point for any data to end up in a cache device; it could
1178  * be from a normal write, or a writeback write, or a write to a flash only
1179  * volume - it's also used by the moving garbage collector to compact data in
1180  * mostly empty buckets.
1181  *
1182  * It first writes the data to the cache, creating a list of keys to be inserted
1183  * (if the data won't fit in a single open bucket, there will be multiple keys);
1184  * after the data is written it calls bch_journal, and after the keys have been
1185  * added to the next journal write they're inserted into the btree.
1186  *
1187  * If op->discard is true, instead of inserting the data it invalidates the
1188  * region of the cache represented by op->bio and op->inode.
1189  */
1190 void bch2_write(struct closure *cl)
1191 {
1192         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1193         struct bio *bio = &op->wbio.bio;
1194         struct bch_fs *c = op->c;
1195         unsigned data_len;
1196
1197         BUG_ON(!op->nr_replicas);
1198         BUG_ON(!op->write_point.v);
1199         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
1200
1201         op->start_time = local_clock();
1202         bch2_keylist_init(&op->insert_keys, op->inline_keys);
1203         wbio_init(bio)->put_bio = false;
1204
1205         if (bio_sectors(bio) & (c->opts.block_size - 1)) {
1206                 __bcache_io_error(c, "misaligned write");
1207                 op->error = -EIO;
1208                 goto err;
1209         }
1210
1211         if (c->opts.nochanges ||
1212             !percpu_ref_tryget(&c->writes)) {
1213                 __bcache_io_error(c, "read only");
1214                 op->error = -EROFS;
1215                 goto err;
1216         }
1217
1218         bch2_increment_clock(c, bio_sectors(bio), WRITE);
1219
1220         data_len = min_t(u64, bio->bi_iter.bi_size,
1221                          op->new_i_size - (op->pos.offset << 9));
1222
1223         if (data_len <= min(block_bytes(c) / 2, 1024U)) {
1224                 bch2_write_data_inline(op, data_len);
1225                 return;
1226         }
1227
1228         continue_at_nobarrier(cl, __bch2_write, NULL);
1229         return;
1230 err:
1231         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
1232                 bch2_disk_reservation_put(c, &op->res);
1233         if (op->end_io)
1234                 op->end_io(op);
1235         if (cl->parent)
1236                 closure_return(cl);
1237         else
1238                 closure_debug_destroy(cl);
1239 }
1240
1241 /* Cache promotion on read */
1242
1243 struct promote_op {
1244         struct closure          cl;
1245         struct rcu_head         rcu;
1246         u64                     start_time;
1247
1248         struct rhash_head       hash;
1249         struct bpos             pos;
1250
1251         struct migrate_write    write;
1252         struct bio_vec          bi_inline_vecs[0]; /* must be last */
1253 };
1254
1255 static const struct rhashtable_params bch_promote_params = {
1256         .head_offset    = offsetof(struct promote_op, hash),
1257         .key_offset     = offsetof(struct promote_op, pos),
1258         .key_len        = sizeof(struct bpos),
1259 };
1260
1261 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1262                                   struct bpos pos,
1263                                   struct bch_io_opts opts,
1264                                   unsigned flags)
1265 {
1266         if (!(flags & BCH_READ_MAY_PROMOTE))
1267                 return false;
1268
1269         if (!opts.promote_target)
1270                 return false;
1271
1272         if (bch2_bkey_has_target(c, k, opts.promote_target))
1273                 return false;
1274
1275         if (bch2_target_congested(c, opts.promote_target)) {
1276                 /* XXX trace this */
1277                 return false;
1278         }
1279
1280         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1281                                    bch_promote_params))
1282                 return false;
1283
1284         return true;
1285 }
1286
1287 static void promote_free(struct bch_fs *c, struct promote_op *op)
1288 {
1289         int ret;
1290
1291         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1292                                      bch_promote_params);
1293         BUG_ON(ret);
1294         percpu_ref_put(&c->writes);
1295         kfree_rcu(op, rcu);
1296 }
1297
1298 static void promote_done(struct closure *cl)
1299 {
1300         struct promote_op *op =
1301                 container_of(cl, struct promote_op, cl);
1302         struct bch_fs *c = op->write.op.c;
1303
1304         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1305                                op->start_time);
1306
1307         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1308         promote_free(c, op);
1309 }
1310
1311 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1312 {
1313         struct bch_fs *c = rbio->c;
1314         struct closure *cl = &op->cl;
1315         struct bio *bio = &op->write.op.wbio.bio;
1316
1317         trace_promote(&rbio->bio);
1318
1319         /* we now own pages: */
1320         BUG_ON(!rbio->bounce);
1321         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1322
1323         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1324                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1325         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1326
1327         bch2_migrate_read_done(&op->write, rbio);
1328
1329         closure_init(cl, NULL);
1330         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1331         closure_return_with_destructor(cl, promote_done);
1332 }
1333
1334 static struct promote_op *__promote_alloc(struct bch_fs *c,
1335                                           enum btree_id btree_id,
1336                                           struct bpos pos,
1337                                           struct extent_ptr_decoded *pick,
1338                                           struct bch_io_opts opts,
1339                                           unsigned sectors,
1340                                           struct bch_read_bio **rbio)
1341 {
1342         struct promote_op *op = NULL;
1343         struct bio *bio;
1344         unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
1345         int ret;
1346
1347         if (!percpu_ref_tryget(&c->writes))
1348                 return NULL;
1349
1350         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
1351         if (!op)
1352                 goto err;
1353
1354         op->start_time = local_clock();
1355         op->pos = pos;
1356
1357         /*
1358          * We don't use the mempool here because extents that aren't
1359          * checksummed or compressed can be too big for the mempool:
1360          */
1361         *rbio = kzalloc(sizeof(struct bch_read_bio) +
1362                         sizeof(struct bio_vec) * pages,
1363                         GFP_NOIO);
1364         if (!*rbio)
1365                 goto err;
1366
1367         rbio_init(&(*rbio)->bio, opts);
1368         bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
1369
1370         if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
1371                                  GFP_NOIO))
1372                 goto err;
1373
1374         (*rbio)->bounce         = true;
1375         (*rbio)->split          = true;
1376         (*rbio)->kmalloc        = true;
1377
1378         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1379                                           bch_promote_params))
1380                 goto err;
1381
1382         bio = &op->write.op.wbio.bio;
1383         bio_init(bio, bio->bi_inline_vecs, pages);
1384
1385         ret = bch2_migrate_write_init(c, &op->write,
1386                         writepoint_hashed((unsigned long) current),
1387                         opts,
1388                         DATA_PROMOTE,
1389                         (struct data_opts) {
1390                                 .target = opts.promote_target
1391                         },
1392                         btree_id,
1393                         bkey_s_c_null);
1394         BUG_ON(ret);
1395
1396         return op;
1397 err:
1398         if (*rbio)
1399                 bio_free_pages(&(*rbio)->bio);
1400         kfree(*rbio);
1401         *rbio = NULL;
1402         kfree(op);
1403         percpu_ref_put(&c->writes);
1404         return NULL;
1405 }
1406
1407 noinline
1408 static struct promote_op *promote_alloc(struct bch_fs *c,
1409                                                struct bvec_iter iter,
1410                                                struct bkey_s_c k,
1411                                                struct extent_ptr_decoded *pick,
1412                                                struct bch_io_opts opts,
1413                                                unsigned flags,
1414                                                struct bch_read_bio **rbio,
1415                                                bool *bounce,
1416                                                bool *read_full)
1417 {
1418         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1419         /* data might have to be decompressed in the write path: */
1420         unsigned sectors = promote_full
1421                 ? max(pick->crc.compressed_size, pick->crc.live_size)
1422                 : bvec_iter_sectors(iter);
1423         struct bpos pos = promote_full
1424                 ? bkey_start_pos(k.k)
1425                 : POS(k.k->p.inode, iter.bi_sector);
1426         struct promote_op *promote;
1427
1428         if (!should_promote(c, k, pos, opts, flags))
1429                 return NULL;
1430
1431         promote = __promote_alloc(c,
1432                                   k.k->type == KEY_TYPE_reflink_v
1433                                   ? BTREE_ID_REFLINK
1434                                   : BTREE_ID_EXTENTS,
1435                                   pos, pick, opts, sectors, rbio);
1436         if (!promote)
1437                 return NULL;
1438
1439         *bounce         = true;
1440         *read_full      = promote_full;
1441         return promote;
1442 }
1443
1444 /* Read */
1445
1446 #define READ_RETRY_AVOID        1
1447 #define READ_RETRY              2
1448 #define READ_ERR                3
1449
1450 enum rbio_context {
1451         RBIO_CONTEXT_NULL,
1452         RBIO_CONTEXT_HIGHPRI,
1453         RBIO_CONTEXT_UNBOUND,
1454 };
1455
1456 static inline struct bch_read_bio *
1457 bch2_rbio_parent(struct bch_read_bio *rbio)
1458 {
1459         return rbio->split ? rbio->parent : rbio;
1460 }
1461
1462 __always_inline
1463 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1464                            enum rbio_context context,
1465                            struct workqueue_struct *wq)
1466 {
1467         if (context <= rbio->context) {
1468                 fn(&rbio->work);
1469         } else {
1470                 rbio->work.func         = fn;
1471                 rbio->context           = context;
1472                 queue_work(wq, &rbio->work);
1473         }
1474 }
1475
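     /*
      * Tear down an rbio: drop any promote op, return bounce pages to the
      * pool, and if this was a split free the clone (kfree() or bio_put()
      * depending on how it was allocated) and return the parent for the
      * caller to finish.
      */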
1476 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1477 {
1478         BUG_ON(rbio->bounce && !rbio->split);
1479
1480         if (rbio->promote)
1481                 promote_free(rbio->c, rbio->promote);
1482         rbio->promote = NULL;
1483
1484         if (rbio->bounce)
1485                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1486
1487         if (rbio->split) {
1488                 struct bch_read_bio *parent = rbio->parent;
1489
1490                 if (rbio->kmalloc)
1491                         kfree(rbio);
1492                 else
1493                         bio_put(&rbio->bio);
1494
1495                 rbio = parent;
1496         }
1497
1498         return rbio;
1499 }
1500
1501 /*
1502  * Only called on a top level bch_read_bio to complete an entire read request,
1503  * not a split:
1504  */
1505 static void bch2_rbio_done(struct bch_read_bio *rbio)
1506 {
1507         if (rbio->start_time)
1508                 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1509                                        rbio->start_time);
1510         bio_endio(&rbio->bio);
1511 }
1512
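     /*
      * Retry path for BCH_READ_NODECODE reads: re-look up the extent at
      * rbio->pos and, if it still matches the pointer we originally read from,
      * reissue the read; if the extent no longer exists, flag the rbio as a
      * hole rather than returning an error.
      */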
1513 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1514                                      struct bvec_iter bvec_iter, u64 inode,
1515                                      struct bch_io_failures *failed,
1516                                      unsigned flags)
1517 {
1518         struct btree_trans trans;
1519         struct btree_iter *iter;
1520         struct bkey_on_stack sk;
1521         struct bkey_s_c k;
1522         int ret;
1523
1524         flags &= ~BCH_READ_LAST_FRAGMENT;
1525         flags |= BCH_READ_MUST_CLONE;
1526
1527         bkey_on_stack_init(&sk);
1528         bch2_trans_init(&trans, c, 0, 0);
1529
1530         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1531                                    rbio->pos, BTREE_ITER_SLOTS);
1532 retry:
1533         rbio->bio.bi_status = 0;
1534
1535         k = bch2_btree_iter_peek_slot(iter);
1536         if (bkey_err(k))
1537                 goto err;
1538
1539         bkey_on_stack_realloc(&sk, c, k.k->u64s);
1540         bkey_reassemble(sk.k, k);
1541         k = bkey_i_to_s_c(sk.k);
1542         bch2_trans_unlock(&trans);
1543
1544         if (!bch2_bkey_matches_ptr(c, k,
1545                                    rbio->pick.ptr,
1546                                    rbio->pos.offset -
1547                                    rbio->pick.crc.offset)) {
1548                 /* extent we wanted to read no longer exists: */
1549                 rbio->hole = true;
1550                 goto out;
1551         }
1552
1553         ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
1554         if (ret == READ_RETRY)
1555                 goto retry;
1556         if (ret)
1557                 goto err;
1558 out:
1559         bch2_rbio_done(rbio);
1560         bch2_trans_exit(&trans);
1561         bkey_on_stack_exit(&sk, c);
1562         return;
1563 err:
1564         rbio->bio.bi_status = BLK_STS_IOERR;
1565         goto out;
1566 }
1567
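     /*
      * Retry path for normal reads: re-walk the extents btree over the
      * remaining part of the request, resolving indirect extents, and reissue
      * the read one extent at a time with BCH_READ_MUST_CLONE forced.
      */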
1568 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1569                             struct bvec_iter bvec_iter, u64 inode,
1570                             struct bch_io_failures *failed, unsigned flags)
1571 {
1572         struct btree_trans trans;
1573         struct btree_iter *iter;
1574         struct bkey_on_stack sk;
1575         struct bkey_s_c k;
1576         int ret;
1577
1578         flags &= ~BCH_READ_LAST_FRAGMENT;
1579         flags |= BCH_READ_MUST_CLONE;
1580
1581         bkey_on_stack_init(&sk);
1582         bch2_trans_init(&trans, c, 0, 0);
1583 retry:
1584         bch2_trans_begin(&trans);
1585
1586         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1587                            POS(inode, bvec_iter.bi_sector),
1588                            BTREE_ITER_SLOTS, k, ret) {
1589                 unsigned bytes, sectors, offset_into_extent;
1590
1591                 bkey_on_stack_realloc(&sk, c, k.k->u64s);
1592                 bkey_reassemble(sk.k, k);
1593                 k = bkey_i_to_s_c(sk.k);
1594
1595                 offset_into_extent = iter->pos.offset -
1596                         bkey_start_offset(k.k);
1597                 sectors = k.k->size - offset_into_extent;
1598
1599                 ret = bch2_read_indirect_extent(&trans,
1600                                         &offset_into_extent, sk.k);
1601                 if (ret)
1602                         break;
1603
1604                 sectors = min(sectors, k.k->size - offset_into_extent);
1605
1606                 bch2_trans_unlock(&trans);
1607
1608                 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
1609                 swap(bvec_iter.bi_size, bytes);
1610
1611                 ret = __bch2_read_extent(c, rbio, bvec_iter, k,
1612                                 offset_into_extent, failed, flags);
1613                 switch (ret) {
1614                 case READ_RETRY:
1615                         goto retry;
1616                 case READ_ERR:
1617                         goto err;
1618                 }
1619
1620                 if (bytes == bvec_iter.bi_size)
1621                         goto out;
1622
1623                 swap(bvec_iter.bi_size, bytes);
1624                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1625         }
1626
1627         if (ret == -EINTR)
1628                 goto retry;
1629         /*
1630          * If we get here, it had better have been because there was an error
1631          * reading a btree node
1632          */
1633         BUG_ON(!ret);
1634         __bcache_io_error(c, "btree IO error: %i", ret);
1635 err:
1636         rbio->bio.bi_status = BLK_STS_IOERR;
1637 out:
1638         bch2_trans_exit(&trans);
1639         bkey_on_stack_exit(&sk, c);
1640         bch2_rbio_done(rbio);
1641 }
1642
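     /*
      * Work item that retries a failed read: for READ_RETRY_AVOID the device
      * that failed is recorded so that another replica gets picked, then the
      * request is resubmitted through the nodecode or normal retry path with
      * promotion disabled.
      */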
1643 static void bch2_rbio_retry(struct work_struct *work)
1644 {
1645         struct bch_read_bio *rbio =
1646                 container_of(work, struct bch_read_bio, work);
1647         struct bch_fs *c        = rbio->c;
1648         struct bvec_iter iter   = rbio->bvec_iter;
1649         unsigned flags          = rbio->flags;
1650         u64 inode               = rbio->pos.inode;
1651         struct bch_io_failures failed = { .nr = 0 };
1652
1653         trace_read_retry(&rbio->bio);
1654
1655         if (rbio->retry == READ_RETRY_AVOID)
1656                 bch2_mark_io_failure(&failed, &rbio->pick);
1657
1658         rbio->bio.bi_status = 0;
1659
1660         rbio = bch2_rbio_free(rbio);
1661
1662         flags |= BCH_READ_IN_RETRY;
1663         flags &= ~BCH_READ_MAY_PROMOTE;
1664
1665         if (flags & BCH_READ_NODECODE)
1666                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1667         else
1668                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1669 }
1670
1671 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1672                             blk_status_t error)
1673 {
1674         rbio->retry = retry;
1675
1676         if (rbio->flags & BCH_READ_IN_RETRY)
1677                 return;
1678
1679         if (retry == READ_ERR) {
1680                 rbio = bch2_rbio_free(rbio);
1681
1682                 rbio->bio.bi_status = error;
1683                 bch2_rbio_done(rbio);
1684         } else {
1685                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1686                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1687         }
1688 }
1689
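     /*
      * Checksum narrowing: if the checksum we just verified covers more data
      * than the extent currently references (e.g. because the extent was
      * partially overwritten), compute a new checksum over just the live
      * portion and update the extent, so that future reads needn't read and
      * checksum the extra data. Best effort only - the update is done with
      * BTREE_INSERT_NOWAIT and failures other than -EINTR are ignored.
      */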
1690 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1691 {
1692         struct bch_fs *c = rbio->c;
1693         struct btree_trans trans;
1694         struct btree_iter *iter;
1695         struct bkey_s_c k;
1696         struct bkey_on_stack new;
1697         struct bch_extent_crc_unpacked new_crc;
1698         u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
1699         int ret;
1700
1701         if (rbio->pick.crc.compression_type)
1702                 return;
1703
1704         bkey_on_stack_init(&new);
1705         bch2_trans_init(&trans, c, 0, 0);
1706 retry:
1707         bch2_trans_begin(&trans);
1708
1709         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
1710                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
1711         k = bch2_btree_iter_peek_slot(iter);
1712         if (IS_ERR_OR_NULL(k.k))
1713                 goto out;
1714
1715         bkey_on_stack_realloc(&new, c, k.k->u64s);
1716         bkey_reassemble(new.k, k);
1717         k = bkey_i_to_s_c(new.k);
1718
1719         if (bversion_cmp(k.k->version, rbio->version) ||
1720             !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
1721                 goto out;
1722
1723         /* Extent was merged? */
1724         if (bkey_start_offset(k.k) < data_offset ||
1725             k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1726                 goto out;
1727
1728         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1729                         rbio->pick.crc, NULL, &new_crc,
1730                         bkey_start_offset(k.k) - data_offset, k.k->size,
1731                         rbio->pick.crc.csum_type)) {
1732                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1733                 goto out;
1734         }
1735
1736         if (!bch2_bkey_narrow_crcs(new.k, new_crc))
1737                 goto out;
1738
1739         bch2_trans_update(&trans, iter, new.k);
1740         ret = bch2_trans_commit(&trans, NULL, NULL,
1741                                 BTREE_INSERT_ATOMIC|
1742                                 BTREE_INSERT_NOFAIL|
1743                                 BTREE_INSERT_NOWAIT);
1744         if (ret == -EINTR)
1745                 goto retry;
1746 out:
1747         bch2_trans_exit(&trans);
1748         bkey_on_stack_exit(&new, c);
1749 }
1750
1751 /* Inner part that may run in process context */
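     /*
      * This verifies the checksum against the data that was read, optionally
      * narrows the extent's checksum, then decrypts and/or uncompresses into
      * the destination bio (copying out of the bounce buffer where needed),
      * and finally kicks off the promote write if one was set up.
      */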
1752 static void __bch2_read_endio(struct work_struct *work)
1753 {
1754         struct bch_read_bio *rbio =
1755                 container_of(work, struct bch_read_bio, work);
1756         struct bch_fs *c        = rbio->c;
1757         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1758         struct bio *src         = &rbio->bio;
1759         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1760         struct bvec_iter dst_iter = rbio->bvec_iter;
1761         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1762         struct nonce nonce = extent_nonce(rbio->version, crc);
1763         struct bch_csum csum;
1764
1765         /* Reset iterator for checksumming and copying bounced data: */
1766         if (rbio->bounce) {
1767                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1768                 src->bi_iter.bi_idx             = 0;
1769                 src->bi_iter.bi_bvec_done       = 0;
1770         } else {
1771                 src->bi_iter                    = rbio->bvec_iter;
1772         }
1773
1774         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1775         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1776                 goto csum_err;
1777
1778         if (unlikely(rbio->narrow_crcs))
1779                 bch2_rbio_narrow_crcs(rbio);
1780
1781         if (rbio->flags & BCH_READ_NODECODE)
1782                 goto nodecode;
1783
1784         /* Adjust crc to point to subset of data we want: */
1785         crc.offset     += rbio->offset_into_extent;
1786         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1787
1788         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1789                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1790                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1791                         goto decompression_err;
1792         } else {
1793                 /* don't need to decrypt the entire bio: */
1794                 nonce = nonce_add(nonce, crc.offset << 9);
1795                 bio_advance(src, crc.offset << 9);
1796
1797                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1798                 src->bi_iter.bi_size = dst_iter.bi_size;
1799
1800                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1801
1802                 if (rbio->bounce) {
1803                         struct bvec_iter src_iter = src->bi_iter;
1804                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1805                 }
1806         }
1807
1808         if (rbio->promote) {
1809                 /*
1810                  * Re-encrypt the data we decrypted, so it's consistent with
1811                  * rbio->crc:
1812                  */
1813                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1814                 promote_start(rbio->promote, rbio);
1815                 rbio->promote = NULL;
1816         }
1817 nodecode:
1818         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1819                 rbio = bch2_rbio_free(rbio);
1820                 bch2_rbio_done(rbio);
1821         }
1822         return;
1823 csum_err:
1824         /*
1825          * Checksum error: if the bio wasn't bounced, we may have been
1826          * reading into buffers owned by userspace (that userspace can
1827          * scribble over) - retry the read, bouncing it this time:
1828          */
1829         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1830                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1831                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1832                 return;
1833         }
1834
1835         bch2_dev_io_error(ca,
1836                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1837                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1838                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1839                 csum.hi, csum.lo, crc.csum_type);
1840         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1841         return;
1842 decompression_err:
1843         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1844                           rbio->pos.inode,
1845                           (u64) rbio->bvec_iter.bi_sector);
1846         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1847         return;
1848 }
1849
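     /*
      * bio completion callback: account read latency and drop the device ref
      * (if we took one), check for IO errors and for the pointer having gone
      * stale underneath us (for cached data), then punt the rest of completion
      * to a context that can checksum, decrypt or decompress as needed.
      */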
1850 static void bch2_read_endio(struct bio *bio)
1851 {
1852         struct bch_read_bio *rbio =
1853                 container_of(bio, struct bch_read_bio, bio);
1854         struct bch_fs *c        = rbio->c;
1855         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1856         struct workqueue_struct *wq = NULL;
1857         enum rbio_context context = RBIO_CONTEXT_NULL;
1858
1859         if (rbio->have_ioref) {
1860                 bch2_latency_acct(ca, rbio->submit_time, READ);
1861                 percpu_ref_put(&ca->io_ref);
1862         }
1863
1864         if (!rbio->split)
1865                 rbio->bio.bi_end_io = rbio->end_io;
1866
1867         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1868                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1869                 return;
1870         }
1871
1872         if (rbio->pick.ptr.cached &&
1873             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1874              ptr_stale(ca, &rbio->pick.ptr))) {
1875                 atomic_long_inc(&c->read_realloc_races);
1876
1877                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1878                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1879                 else
1880                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1881                 return;
1882         }
1883
1884         if (rbio->narrow_crcs ||
1885             rbio->pick.crc.compression_type ||
1886             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1887                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1888         else if (rbio->pick.crc.csum_type)
1889                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1890
1891         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1892 }
1893
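     /*
      * Resolve a reflink pointer: look up the indirect extent it points to in
      * the reflink btree, adjust *offset_into_extent to be relative to that
      * extent, and copy it into orig_k for the caller to read from.
      */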
1894 int __bch2_read_indirect_extent(struct btree_trans *trans,
1895                                 unsigned *offset_into_extent,
1896                                 struct bkey_i *orig_k)
1897 {
1898         struct btree_iter *iter;
1899         struct bkey_s_c k;
1900         u64 reflink_offset;
1901         int ret;
1902
1903         reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
1904                 *offset_into_extent;
1905
1906         iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
1907                                    POS(0, reflink_offset),
1908                                    BTREE_ITER_SLOTS);
1909         ret = PTR_ERR_OR_ZERO(iter);
1910         if (ret)
1911                 return ret;
1912
1913         k = bch2_btree_iter_peek_slot(iter);
1914         ret = bkey_err(k);
1915         if (ret)
1916                 goto err;
1917
1918         if (k.k->type != KEY_TYPE_reflink_v) {
1919                 __bcache_io_error(trans->c,
1920                                 "pointer to nonexistent indirect extent");
1921                 ret = -EIO;
1922                 goto err;
1923         }
1924
1925         *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
1926         bkey_reassemble(orig_k, k);
1927 err:
1928         bch2_trans_iter_put(trans, iter);
1929         return ret;
1930 }
1931
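     /*
      * Read a single extent: inline data is copied directly; otherwise pick a
      * device to read from (avoiding any in @failed), decide whether to
      * bounce, read the full extent and/or promote it, set up the original
      * rbio, a clone or a bounce rbio, and submit the bio - or attempt a
      * reconstruct read for erasure coded extents. With BCH_READ_IN_RETRY the
      * read is done synchronously and the READ_* result is returned to the
      * caller.
      */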
1932 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1933                        struct bvec_iter iter, struct bkey_s_c k,
1934                        unsigned offset_into_extent,
1935                        struct bch_io_failures *failed, unsigned flags)
1936 {
1937         struct extent_ptr_decoded pick;
1938         struct bch_read_bio *rbio = NULL;
1939         struct bch_dev *ca;
1940         struct promote_op *promote = NULL;
1941         bool bounce = false, read_full = false, narrow_crcs = false;
1942         struct bpos pos = bkey_start_pos(k.k);
1943         int pick_ret;
1944
1945         if (k.k->type == KEY_TYPE_inline_data) {
1946                 struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
1947                 unsigned bytes = min_t(unsigned, iter.bi_size,
1948                                        bkey_val_bytes(d.k));
1949
1950                 swap(iter.bi_size, bytes);
1951                 memcpy_to_bio(&orig->bio, iter, d.v->data);
1952                 swap(iter.bi_size, bytes);
1953                 bio_advance_iter(&orig->bio, &iter, bytes);
1954                 zero_fill_bio_iter(&orig->bio, iter);
1955                 goto out_read_done;
1956         }
1957
1958         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1959
1960         /* hole or reservation - just zero fill: */
1961         if (!pick_ret)
1962                 goto hole;
1963
1964         if (pick_ret < 0) {
1965                 __bcache_io_error(c, "no device to read from");
1966                 goto err;
1967         }
1968
1969         if (pick_ret > 0)
1970                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1971
1972         if (flags & BCH_READ_NODECODE) {
1973                 /*
1974                  * This can happen if we retry, and the extent we were going to read
1975                  * has been merged in the meantime:
1976                  */
1977                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1978                         goto hole;
1979
1980                 iter.bi_size    = pick.crc.compressed_size << 9;
1981                 goto noclone;
1982         }
1983
1984         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1985             bio_flagged(&orig->bio, BIO_CHAIN))
1986                 flags |= BCH_READ_MUST_CLONE;
1987
1988         narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
1989                 bch2_can_narrow_extent_crcs(k, pick.crc);
1990
1991         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1992                 flags |= BCH_READ_MUST_BOUNCE;
1993
1994         EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
1995
1996         if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
1997             (pick.crc.csum_type != BCH_CSUM_NONE &&
1998              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1999               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
2000                (flags & BCH_READ_USER_MAPPED)) ||
2001               (flags & BCH_READ_MUST_BOUNCE)))) {
2002                 read_full = true;
2003                 bounce = true;
2004         }
2005
2006         if (orig->opts.promote_target)
2007                 promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
2008                                         &rbio, &bounce, &read_full);
2009
2010         if (!read_full) {
2011                 EBUG_ON(pick.crc.compression_type);
2012                 EBUG_ON(pick.crc.csum_type &&
2013                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2014                          bvec_iter_sectors(iter) != pick.crc.live_size ||
2015                          pick.crc.offset ||
2016                          offset_into_extent));
2017
2018                 pos.offset += offset_into_extent;
2019                 pick.ptr.offset += pick.crc.offset +
2020                         offset_into_extent;
2021                 offset_into_extent              = 0;
2022                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
2023                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
2024                 pick.crc.offset                 = 0;
2025                 pick.crc.live_size              = bvec_iter_sectors(iter);
2027         }
2028
2029         if (rbio) {
2030                 /*
2031                  * the promote path already allocated a bounce rbio: the promote
2032                  * op needs a bio big enough for uncompressing the data in the
2033                  * write path, but we're not going to use all of it
2034                  * here:
2035                  */
2036                 EBUG_ON(rbio->bio.bi_iter.bi_size <
2037                        pick.crc.compressed_size << 9);
2038                 rbio->bio.bi_iter.bi_size =
2039                         pick.crc.compressed_size << 9;
2040         } else if (bounce) {
2041                 unsigned sectors = pick.crc.compressed_size;
2042
2043                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
2044                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
2045                                                   &c->bio_read_split),
2046                                  orig->opts);
2047
2048                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
2049                 rbio->bounce    = true;
2050                 rbio->split     = true;
2051         } else if (flags & BCH_READ_MUST_CLONE) {
2052                 /*
2053                  * We have to clone if there were any splits, because of error
2054                  * reporting: if a split errored and retrying didn't work, then
2055                  * when it reports the error to its parent (us) we can't tell
2056                  * whether the error was from our part of the bio (and we should
2057                  * retry) or from the whole bio (in which case we don't want to
2058                  * retry and lose the error)
2059                  */
2060                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
2061                                                 &c->bio_read_split),
2062                                  orig->opts);
2063                 rbio->bio.bi_iter = iter;
2064                 rbio->split     = true;
2065         } else {
2066 noclone:
2067                 rbio = orig;
2068                 rbio->bio.bi_iter = iter;
2069                 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
2070         }
2071
2072         EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
2073
2074         rbio->c                 = c;
2075         rbio->submit_time       = local_clock();
2076         if (rbio->split)
2077                 rbio->parent    = orig;
2078         else
2079                 rbio->end_io    = orig->bio.bi_end_io;
2080         rbio->bvec_iter         = iter;
2081         rbio->offset_into_extent = offset_into_extent;
2082         rbio->flags             = flags;
2083         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
2084         rbio->narrow_crcs       = narrow_crcs;
2085         rbio->hole              = 0;
2086         rbio->retry             = 0;
2087         rbio->context           = 0;
2088         /* XXX: only initialize this if needed */
2089         rbio->devs_have         = bch2_bkey_devs(k);
2090         rbio->pick              = pick;
2091         rbio->pos               = pos;
2092         rbio->version           = k.k->version;
2093         rbio->promote           = promote;
2094         INIT_WORK(&rbio->work, NULL);
2095
2096         rbio->bio.bi_opf        = orig->bio.bi_opf;
2097         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
2098         rbio->bio.bi_end_io     = bch2_read_endio;
2099
2100         if (rbio->bounce)
2101                 trace_read_bounce(&rbio->bio);
2102
2103         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
2104
2105         rcu_read_lock();
2106         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
2107         rcu_read_unlock();
2108
2109         if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
2110                 bio_inc_remaining(&orig->bio);
2111                 trace_read_split(&orig->bio);
2112         }
2113
2114         if (!rbio->pick.idx) {
2115                 if (!rbio->have_ioref) {
2116                         __bcache_io_error(c, "no device to read from");
2117                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2118                         goto out;
2119                 }
2120
2121                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
2122                              bio_sectors(&rbio->bio));
2123                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
2124
2125                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2126                         submit_bio(&rbio->bio);
2127                 else
2128                         submit_bio_wait(&rbio->bio);
2129         } else {
2130                 /* Attempting reconstruct read: */
2131                 if (bch2_ec_read_extent(c, rbio)) {
2132                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2133                         goto out;
2134                 }
2135
2136                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2137                         bio_endio(&rbio->bio);
2138         }
2139 out:
2140         if (likely(!(flags & BCH_READ_IN_RETRY))) {
2141                 return 0;
2142         } else {
2143                 int ret;
2144
2145                 rbio->context = RBIO_CONTEXT_UNBOUND;
2146                 bch2_read_endio(&rbio->bio);
2147
2148                 ret = rbio->retry;
2149                 rbio = bch2_rbio_free(rbio);
2150
2151                 if (ret == READ_RETRY_AVOID) {
2152                         bch2_mark_io_failure(failed, &pick);
2153                         ret = READ_RETRY;
2154                 }
2155
2156                 return ret;
2157         }
2158
2159 err:
2160         if (flags & BCH_READ_IN_RETRY)
2161                 return READ_ERR;
2162
2163         orig->bio.bi_status = BLK_STS_IOERR;
2164         goto out_read_done;
2165
2166 hole:
2167         /*
2168          * This won't normally happen in the BCH_READ_NODECODE
2169          * (bch2_move_extent()) path, but if we retry and the extent we wanted
2170          * to read no longer exists, we have to signal that:
2171          */
2172         if (flags & BCH_READ_NODECODE)
2173                 orig->hole = true;
2174
2175         zero_fill_bio_iter(&orig->bio, iter);
2176 out_read_done:
2177         if (flags & BCH_READ_LAST_FRAGMENT)
2178                 bch2_rbio_done(orig);
2179         return 0;
2180 }
2181
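     /*
      * Top level read path: walk the extents btree starting at the request's
      * sector, resolving indirect extents as we go, and issue a read for each
      * extent covering part of the request, marking the final one
      * BCH_READ_LAST_FRAGMENT. -EINTR (a transaction restart) retries from
      * the top.
      */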
2182 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
2183 {
2184         struct btree_trans trans;
2185         struct btree_iter *iter;
2186         struct bkey_on_stack sk;
2187         struct bkey_s_c k;
2188         unsigned flags = BCH_READ_RETRY_IF_STALE|
2189                 BCH_READ_MAY_PROMOTE|
2190                 BCH_READ_USER_MAPPED;
2191         int ret;
2192
2193         BUG_ON(rbio->_state);
2194         BUG_ON(flags & BCH_READ_NODECODE);
2195         BUG_ON(flags & BCH_READ_IN_RETRY);
2196
2197         rbio->c = c;
2198         rbio->start_time = local_clock();
2199
2200         bkey_on_stack_init(&sk);
2201         bch2_trans_init(&trans, c, 0, 0);
2202 retry:
2203         bch2_trans_begin(&trans);
2204
2205         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
2206                                    POS(inode, rbio->bio.bi_iter.bi_sector),
2207                                    BTREE_ITER_SLOTS);
2208         while (1) {
2209                 unsigned bytes, sectors, offset_into_extent;
2210
2211                 bch2_btree_iter_set_pos(iter,
2212                                 POS(inode, rbio->bio.bi_iter.bi_sector));
2213
2214                 k = bch2_btree_iter_peek_slot(iter);
2215                 ret = bkey_err(k);
2216                 if (ret)
2217                         goto err;
2218
2219                 offset_into_extent = iter->pos.offset -
2220                         bkey_start_offset(k.k);
2221                 sectors = k.k->size - offset_into_extent;
2222
2223                 bkey_on_stack_realloc(&sk, c, k.k->u64s);
2224                 bkey_reassemble(sk.k, k);
2225                 k = bkey_i_to_s_c(sk.k);
2226
2227                 ret = bch2_read_indirect_extent(&trans,
2228                                         &offset_into_extent, sk.k);
2229                 if (ret)
2230                         goto err;
2231
2232                 /*
2233                  * With indirect extents, the amount of data to read is the min
2234                  * of the original extent and the indirect extent:
2235                  */
2236                 sectors = min(sectors, k.k->size - offset_into_extent);
2237
2238                 /*
2239                  * Unlock the iterator while the btree node's lock is still in
2240                  * cache, before doing the IO:
2241                  */
2242                 bch2_trans_unlock(&trans);
2243
2244                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
2245                 swap(rbio->bio.bi_iter.bi_size, bytes);
2246
2247                 if (rbio->bio.bi_iter.bi_size == bytes)
2248                         flags |= BCH_READ_LAST_FRAGMENT;
2249
2250                 bch2_read_extent(c, rbio, k, offset_into_extent, flags);
2251
2252                 if (flags & BCH_READ_LAST_FRAGMENT)
2253                         break;
2254
2255                 swap(rbio->bio.bi_iter.bi_size, bytes);
2256                 bio_advance(&rbio->bio, bytes);
2257         }
2258 out:
2259         bch2_trans_exit(&trans);
2260         bkey_on_stack_exit(&sk, c);
2261         return;
2262 err:
2263         if (ret == -EINTR)
2264                 goto retry;
2265
2266         bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
2267         bch2_rbio_done(rbio);
2268         goto out;
2269 }
2270
2271 void bch2_fs_io_exit(struct bch_fs *c)
2272 {
2273         if (c->promote_table.tbl)
2274                 rhashtable_destroy(&c->promote_table);
2275         mempool_exit(&c->bio_bounce_pages);
2276         bioset_exit(&c->bio_write);
2277         bioset_exit(&c->bio_read_split);
2278         bioset_exit(&c->bio_read);
2279 }
2280
2281 int bch2_fs_io_init(struct bch_fs *c)
2282 {
2283         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
2284                         BIOSET_NEED_BVECS) ||
2285             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
2286                         BIOSET_NEED_BVECS) ||
2287             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
2288                         BIOSET_NEED_BVECS) ||
2289             mempool_init_page_pool(&c->bio_bounce_pages,
2290                                    max_t(unsigned,
2291                                          c->opts.btree_node_size,
2292                                          c->sb.encoded_extent_max) /
2293                                    PAGE_SECTORS, 0) ||
2294             rhashtable_init(&c->promote_table, &bch_promote_params))
2295                 return -ENOMEM;
2296
2297         return 0;
2298 }