libbcachefs/io.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcachefs.h"
10 #include "alloc_foreground.h"
11 #include "bkey_on_stack.h"
12 #include "bset.h"
13 #include "btree_update.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "compress.h"
17 #include "clock.h"
18 #include "debug.h"
19 #include "disk_groups.h"
20 #include "ec.h"
21 #include "error.h"
22 #include "extent_update.h"
23 #include "inode.h"
24 #include "io.h"
25 #include "journal.h"
26 #include "keylist.h"
27 #include "move.h"
28 #include "rebalance.h"
29 #include "super.h"
30 #include "super-io.h"
31
32 #include <linux/blkdev.h>
33 #include <linux/random.h>
34
35 #include <trace/events/bcachefs.h>
36
37 static bool bch2_target_congested(struct bch_fs *c, u16 target)
38 {
39         const struct bch_devs_mask *devs;
40         unsigned d, nr = 0, total = 0;
41         u64 now = local_clock(), last;
42         s64 congested;
43         struct bch_dev *ca;
44
45         if (!target)
46                 return false;
47
48         rcu_read_lock();
49         devs = bch2_target_to_mask(c, target);
50         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
51                 ca = rcu_dereference(c->devs[d]);
52                 if (!ca)
53                         continue;
54
55                 congested = atomic_read(&ca->congested);
56                 last = READ_ONCE(ca->congested_last);
57                 if (time_after64(now, last))
58                         congested -= (now - last) >> 12;
59
60                 total += max(congested, 0LL);
61                 nr++;
62         }
63         rcu_read_unlock();
64
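        /*
         * Return true with probability total / (nr * CONGESTED_MAX) - i.e.
         * proportional to how congested the target's devices are on average:
         */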
65         return bch2_rand_range(nr * CONGESTED_MAX) < total;
66 }
67
68 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
69                                        u64 now, int rw)
70 {
71         u64 latency_capable =
72                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
73         /* ideally we'd be taking into account the device's variance here: */
74         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
75         s64 latency_over = io_latency - latency_threshold;
76
77         if (latency_threshold && latency_over > 0) {
78                 /*
79                  * bump up congested by approximately latency_over * 4 /
80                  * latency_threshold - we don't need much accuracy here so don't
81                  * bother with the divide:
82                  */
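                /*
                 * e.g. with latency_threshold == 64 and latency_over == 32:
                 * ilog2(64) - 2 == 4, so we add 32 >> 4 == 2, which matches
                 * latency_over * 4 / latency_threshold == 32 * 4 / 64 == 2.
                 */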
83                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
84                         atomic_add(latency_over >>
85                                    max_t(int, ilog2(latency_threshold) - 2, 0),
86                                    &ca->congested);
87
88                 ca->congested_last = now;
89         } else if (atomic_read(&ca->congested) > 0) {
90                 atomic_dec(&ca->congested);
91         }
92 }
93
94 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
95 {
96         atomic64_t *latency = &ca->cur_latency[rw];
97         u64 now = local_clock();
98         u64 io_latency = time_after64(now, submit_time)
99                 ? now - submit_time
100                 : 0;
101         u64 old, new, v = atomic64_read(latency);
102
103         do {
104                 old = v;
105
106                 /*
107                  * If the io latency was reasonably close to the current
108                  * latency, skip doing the update and atomic operation - most of
109                  * the time:
110                  */
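                /*
                 * ~(~0 << 5) == 31: even when the latency hasn't moved much
                 * we still fall through and update the EWMA roughly one call
                 * in 32, whenever the low five bits of the clock are zero.
                 */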
111                 if (abs((int) (old - io_latency)) < (old >> 1) &&
112                     now & ~(~0 << 5))
113                         break;
114
115                 new = ewma_add(old, io_latency, 5);
116         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
117
118         bch2_congested_acct(ca, io_latency, now, rw);
119
120         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
121 }
122
123 /* Allocate, free from mempool: */
124
125 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
126 {
127         struct bvec_iter_all iter;
128         struct bio_vec *bv;
129
130         bio_for_each_segment_all(bv, bio, iter)
131                 if (bv->bv_page != ZERO_PAGE(0))
132                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
133         bio->bi_vcnt = 0;
134 }
135
136 static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
137 {
138         struct page *page;
139
140         if (likely(!*using_mempool)) {
141                 page = alloc_page(GFP_NOIO);
142                 if (unlikely(!page)) {
143                         mutex_lock(&c->bio_bounce_pages_lock);
144                         *using_mempool = true;
145                         goto pool_alloc;
146
147                 }
148         } else {
149 pool_alloc:
150                 page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
151         }
152
153         return page;
154 }
155
156 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
157                                size_t size)
158 {
159         bool using_mempool = false;
160
161         while (size) {
162                 struct page *page = __bio_alloc_page_pool(c, &using_mempool);
163                 unsigned len = min(PAGE_SIZE, size);
164
165                 BUG_ON(!bio_add_page(bio, page, len, 0));
166                 size -= len;
167         }
168
169         if (using_mempool)
170                 mutex_unlock(&c->bio_bounce_pages_lock);
171 }
172
173 /* Extent update path: */
174
175 static int sum_sector_overwrites(struct btree_trans *trans,
176                                  struct btree_iter *extent_iter,
177                                  struct bkey_i *new,
178                                  bool may_allocate,
179                                  bool *maybe_extending,
180                                  s64 *delta)
181 {
182         struct btree_iter *iter;
183         struct bkey_s_c old;
184         int ret = 0;
185
186         *maybe_extending = true;
187         *delta = 0;
188
189         iter = bch2_trans_copy_iter(trans, extent_iter);
190         if (IS_ERR(iter))
191                 return PTR_ERR(iter);
192
193         for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
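                /*
                 * If we don't have a disk reservation, the overwrite can't
                 * require any new space: the existing extent must already
                 * have at least as many fully allocated pointers as the new
                 * key has allocated pointers.
                 */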
194                 if (!may_allocate &&
195                     bch2_bkey_nr_ptrs_fully_allocated(old) <
196                     bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
197                         ret = -ENOSPC;
198                         break;
199                 }
200
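                /*
                 * delta accumulates the signed overlap, in sectors, between
                 * the new key and each existing key: positive when we're
                 * turning unallocated space into allocated space, negative
                 * for the reverse, zero when both or neither are allocated.
                 */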
201                 *delta += (min(new->k.p.offset,
202                               old.k->p.offset) -
203                           max(bkey_start_offset(&new->k),
204                               bkey_start_offset(old.k))) *
205                         (bkey_extent_is_allocation(&new->k) -
206                          bkey_extent_is_allocation(old.k));
207
208                 if (bkey_cmp(old.k->p, new->k.p) >= 0) {
209                         /*
210                          * Check if there's already data above where we're
211                          * going to be writing to - this means we're definitely
212                          * not extending the file:
213                          *
214                          * Note that it's not sufficient to check if there's
215                          * data up to the sector offset we're going to be
216                          * writing to, because i_size could be up to one block
217                          * less:
218                          */
219                         if (!bkey_cmp(old.k->p, new->k.p))
220                                 old = bch2_btree_iter_next(iter);
221
222                         if (old.k && !bkey_err(old) &&
223                             old.k->p.inode == extent_iter->pos.inode &&
224                             bkey_extent_is_data(old.k))
225                                 *maybe_extending = false;
226
227                         break;
228                 }
229         }
230
231         bch2_trans_iter_put(trans, iter);
232         return ret;
233 }
234
235 int bch2_extent_update(struct btree_trans *trans,
236                        struct btree_iter *iter,
237                        struct bkey_i *k,
238                        struct disk_reservation *disk_res,
239                        u64 *journal_seq,
240                        u64 new_i_size,
241                        s64 *i_sectors_delta)
242 {
243         /* this must live until after bch2_trans_commit(): */
244         struct bkey_inode_buf inode_p;
245         bool extending = false;
246         s64 delta = 0;
247         int ret;
248
249         ret = bch2_extent_trim_atomic(k, iter);
250         if (ret)
251                 return ret;
252
253         ret = sum_sector_overwrites(trans, iter, k,
254                         disk_res && disk_res->sectors != 0,
255                         &extending, &delta);
256         if (ret)
257                 return ret;
258
259         new_i_size = extending
260                 ? min(k->k.p.offset << 9, new_i_size)
261                 : 0;
262
263         if (delta || new_i_size) {
264                 struct btree_iter *inode_iter;
265                 struct bch_inode_unpacked inode_u;
266
267                 inode_iter = bch2_inode_peek(trans, &inode_u,
268                                 k->k.p.inode, BTREE_ITER_INTENT);
269                 if (IS_ERR(inode_iter))
270                         return PTR_ERR(inode_iter);
271
272                 /*
273                  * XXX:
274                  * writeback can race a bit with truncate, because truncate
275                  * first updates the inode then truncates the pagecache. This is
276                  * ugly, but lets us preserve the invariant that the in memory
277                  * i_size is always >= the on disk i_size.
278                  *
279                 BUG_ON(new_i_size > inode_u.bi_size &&
280                        (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
281                  */
282                 BUG_ON(new_i_size > inode_u.bi_size && !extending);
283
284                 if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
285                     new_i_size > inode_u.bi_size)
286                         inode_u.bi_size = new_i_size;
287                 else
288                         new_i_size = 0;
289
290                 inode_u.bi_sectors += delta;
291
292                 if (delta || new_i_size) {
293                         bch2_inode_pack(&inode_p, &inode_u);
294                         bch2_trans_update(trans, inode_iter,
295                                           &inode_p.inode.k_i);
296                 }
297
298                 bch2_trans_iter_put(trans, inode_iter);
299         }
300
301         bch2_trans_update(trans, iter, k);
302
303         ret = bch2_trans_commit(trans, disk_res, journal_seq,
304                                 BTREE_INSERT_NOCHECK_RW|
305                                 BTREE_INSERT_NOFAIL|
306                                 BTREE_INSERT_USE_RESERVE);
307         if (!ret && i_sectors_delta)
308                 *i_sectors_delta += delta;
309
310         return ret;
311 }
312
313 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
314                    struct bpos end, u64 *journal_seq,
315                    s64 *i_sectors_delta)
316 {
317         struct bch_fs *c        = trans->c;
318         unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
319         struct bkey_s_c k;
320         int ret = 0, ret2 = 0;
321
322         while ((k = bch2_btree_iter_peek(iter)).k &&
323                bkey_cmp(iter->pos, end) < 0) {
324                 struct disk_reservation disk_res =
325                         bch2_disk_reservation_init(c, 0);
326                 struct bkey_i delete;
327
328                 bch2_trans_reset(trans, TRANS_RESET_MEM);
329
330                 ret = bkey_err(k);
331                 if (ret)
332                         goto btree_err;
333
334                 bkey_init(&delete.k);
335                 delete.k.p = iter->pos;
336
337                 /* create the biggest key we can */
338                 bch2_key_resize(&delete.k, max_sectors);
339                 bch2_cut_back(end, &delete);
340
341                 ret = bch2_extent_update(trans, iter, &delete,
342                                 &disk_res, journal_seq,
343                                 0, i_sectors_delta);
344                 bch2_disk_reservation_put(c, &disk_res);
345 btree_err:
346                 if (ret == -EINTR) {
347                         ret2 = ret;
348                         ret = 0;
349                 }
350                 if (ret)
351                         break;
352         }
353
354         if (bkey_cmp(iter->pos, end) > 0) {
355                 bch2_btree_iter_set_pos(iter, end);
356                 ret = bch2_btree_iter_traverse(iter);
357         }
358
359         return ret ?: ret2;
360 }
361
362 int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
363                 u64 *journal_seq, s64 *i_sectors_delta)
364 {
365         struct btree_trans trans;
366         struct btree_iter *iter;
367         int ret = 0;
368
369         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
370         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
371                                    POS(inum, start),
372                                    BTREE_ITER_INTENT);
373
374         ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
375                              journal_seq, i_sectors_delta);
376         bch2_trans_exit(&trans);
377
378         if (ret == -EINTR)
379                 ret = 0;
380
381         return ret;
382 }
383
384 int bch2_write_index_default(struct bch_write_op *op)
385 {
386         struct bch_fs *c = op->c;
387         struct bkey_on_stack sk;
388         struct keylist *keys = &op->insert_keys;
389         struct bkey_i *k = bch2_keylist_front(keys);
390         struct btree_trans trans;
391         struct btree_iter *iter;
392         int ret;
393
394         bkey_on_stack_init(&sk);
395         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
396
397         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
398                                    bkey_start_pos(&k->k),
399                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
400
401         do {
402                 bch2_trans_reset(&trans, TRANS_RESET_MEM);
403
404                 k = bch2_keylist_front(keys);
405
406                 bkey_on_stack_realloc(&sk, c, k->k.u64s);
407                 bkey_copy(sk.k, k);
408                 bch2_cut_front(iter->pos, sk.k);
409
410                 ret = bch2_extent_update(&trans, iter, sk.k,
411                                          &op->res, op_journal_seq(op),
412                                          op->new_i_size, &op->i_sectors_delta);
413                 if (ret == -EINTR)
414                         continue;
415                 if (ret)
416                         break;
417
418                 if (bkey_cmp(iter->pos, k->k.p) >= 0)
419                         bch2_keylist_pop_front(keys);
420         } while (!bch2_keylist_empty(keys));
421
422         bch2_trans_exit(&trans);
423         bkey_on_stack_exit(&sk, c);
424
425         return ret;
426 }
427
428 /* Writes */
429
430 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
431                                enum bch_data_type type,
432                                const struct bkey_i *k)
433 {
434         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
435         const struct bch_extent_ptr *ptr;
436         struct bch_write_bio *n;
437         struct bch_dev *ca;
438
439         BUG_ON(c->opts.nochanges);
440
441         bkey_for_each_ptr(ptrs, ptr) {
442                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
443                        !c->devs[ptr->dev]);
444
445                 ca = bch_dev_bkey_exists(c, ptr->dev);
446
447                 if (to_entry(ptr + 1) < ptrs.end) {
448                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
449                                                    &ca->replica_set));
450
451                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
452                         n->bio.bi_private       = wbio->bio.bi_private;
453                         n->parent               = wbio;
454                         n->split                = true;
455                         n->bounce               = false;
456                         n->put_bio              = true;
457                         n->bio.bi_opf           = wbio->bio.bi_opf;
458                         bio_inc_remaining(&wbio->bio);
459                 } else {
460                         n = wbio;
461                         n->split                = false;
462                 }
463
464                 n->c                    = c;
465                 n->dev                  = ptr->dev;
466                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
467                 n->submit_time          = local_clock();
468                 n->bio.bi_iter.bi_sector = ptr->offset;
469
470                 if (!journal_flushes_device(ca))
471                         n->bio.bi_opf |= REQ_FUA;
472
473                 if (likely(n->have_ioref)) {
474                         this_cpu_add(ca->io_done->sectors[WRITE][type],
475                                      bio_sectors(&n->bio));
476
477                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
478                         submit_bio(&n->bio);
479                 } else {
480                         n->bio.bi_status        = BLK_STS_REMOVED;
481                         bio_endio(&n->bio);
482                 }
483         }
484 }
485
486 static void __bch2_write(struct closure *);
487
488 static void bch2_write_done(struct closure *cl)
489 {
490         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
491         struct bch_fs *c = op->c;
492
493         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
494                 op->error = bch2_journal_error(&c->journal);
495
496         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
497                 bch2_disk_reservation_put(c, &op->res);
498         percpu_ref_put(&c->writes);
499         bch2_keylist_free(&op->insert_keys, op->inline_keys);
500
501         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
502
503         if (op->end_io) {
504                 EBUG_ON(cl->parent);
505                 closure_debug_destroy(cl);
506                 op->end_io(op);
507         } else {
508                 closure_return(cl);
509         }
510 }
511
512 /**
513  * __bch2_write_index - after a write, update the index to point to the new data
514  */
515 static void __bch2_write_index(struct bch_write_op *op)
516 {
517         struct bch_fs *c = op->c;
518         struct keylist *keys = &op->insert_keys;
519         struct bch_extent_ptr *ptr;
520         struct bkey_i *src, *dst = keys->keys, *n, *k;
521         unsigned dev;
522         int ret;
523
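        /*
         * Drop pointers to devices the write failed on; if that leaves a key
         * with no pointers at all the whole write fails, otherwise the
         * surviving keys are compacted towards the front of the keylist:
         */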
524         for (src = keys->keys; src != keys->top; src = n) {
525                 n = bkey_next(src);
526
527                 if (bkey_extent_is_direct_data(&src->k)) {
528                         bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
529                                             test_bit(ptr->dev, op->failed.d));
530
531                         if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
532                                 ret = -EIO;
533                                 goto err;
534                         }
535                 }
536
537                 if (dst != src)
538                         memmove_u64s_down(dst, src, src->u64s);
539                 dst = bkey_next(dst);
540         }
541
542         keys->top = dst;
543
544         /*
545          * probably not the ideal place to hook this in, but I don't
546          * particularly want to plumb io_opts all the way through the btree
547          * update stack right now
548          */
549         for_each_keylist_key(keys, k)
550                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
551
552         if (!bch2_keylist_empty(keys)) {
553                 u64 sectors_start = keylist_sectors(keys);
554                 int ret = op->index_update_fn(op);
555
556                 BUG_ON(ret == -EINTR);
557                 BUG_ON(keylist_sectors(keys) && !ret);
558
559                 op->written += sectors_start - keylist_sectors(keys);
560
561                 if (ret) {
562                         __bcache_io_error(c, "btree IO error %i", ret);
563                         op->error = ret;
564                 }
565         }
566 out:
567         /* If a bucket wasn't written, we can't erasure code it: */
568         for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
569                 bch2_open_bucket_write_error(c, &op->open_buckets, dev);
570
571         bch2_open_buckets_put(c, &op->open_buckets);
572         return;
573 err:
574         keys->top = keys->keys;
575         op->error = ret;
576         goto out;
577 }
578
579 static void bch2_write_index(struct closure *cl)
580 {
581         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
582         struct bch_fs *c = op->c;
583
584         __bch2_write_index(op);
585
586         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
587                 bch2_journal_flush_seq_async(&c->journal,
588                                              *op_journal_seq(op),
589                                              cl);
590                 continue_at(cl, bch2_write_done, index_update_wq(op));
591         } else {
592                 continue_at_nobarrier(cl, bch2_write_done, NULL);
593         }
594 }
595
596 static void bch2_write_endio(struct bio *bio)
597 {
598         struct closure *cl              = bio->bi_private;
599         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
600         struct bch_write_bio *wbio      = to_wbio(bio);
601         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
602         struct bch_fs *c                = wbio->c;
603         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
604
605         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
606                 set_bit(wbio->dev, op->failed.d);
607
608         if (wbio->have_ioref) {
609                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
610                 percpu_ref_put(&ca->io_ref);
611         }
612
613         if (wbio->bounce)
614                 bch2_bio_free_pages_pool(c, bio);
615
616         if (wbio->put_bio)
617                 bio_put(bio);
618
619         if (parent)
620                 bio_endio(&parent->bio);
621         else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
622                 closure_put(cl);
623         else
624                 continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
625 }
626
627 static void init_append_extent(struct bch_write_op *op,
628                                struct write_point *wp,
629                                struct bversion version,
630                                struct bch_extent_crc_unpacked crc)
631 {
632         struct bch_fs *c = op->c;
633         struct bkey_i_extent *e;
634         struct open_bucket *ob;
635         unsigned i;
636
637         BUG_ON(crc.compressed_size > wp->sectors_free);
638         wp->sectors_free -= crc.compressed_size;
639         op->pos.offset += crc.uncompressed_size;
640
641         e = bkey_extent_init(op->insert_keys.top);
642         e->k.p          = op->pos;
643         e->k.size       = crc.uncompressed_size;
644         e->k.version    = version;
645
646         if (crc.csum_type ||
647             crc.compression_type ||
648             crc.nonce)
649                 bch2_extent_crc_append(&e->k_i, crc);
650
651         open_bucket_for_each(c, &wp->ptrs, ob, i) {
652                 struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
653                 union bch_extent_entry *end =
654                         bkey_val_end(bkey_i_to_s(&e->k_i));
655
656                 end->ptr = ob->ptr;
657                 end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
658                 end->ptr.cached = !ca->mi.durability ||
659                         (op->flags & BCH_WRITE_CACHED) != 0;
660                 end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
661
662                 e->k.u64s++;
663
664                 BUG_ON(crc.compressed_size > ob->sectors_free);
665                 ob->sectors_free -= crc.compressed_size;
666         }
667
668         bch2_keylist_push(&op->insert_keys);
669 }
670
671 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
672                                         struct write_point *wp,
673                                         struct bio *src,
674                                         bool *page_alloc_failed,
675                                         void *buf)
676 {
677         struct bch_write_bio *wbio;
678         struct bio *bio;
679         unsigned output_available =
680                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
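        /*
         * If we were handed a buffer it may not start page aligned - account
         * for the offset into the first page when sizing the bio:
         */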
681         unsigned pages = DIV_ROUND_UP(output_available +
682                                       (buf
683                                        ? ((unsigned long) buf & (PAGE_SIZE - 1))
684                                        : 0), PAGE_SIZE);
685
686         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
687         wbio                    = wbio_init(bio);
688         wbio->put_bio           = true;
689         /* copy WRITE_SYNC flag */
690         wbio->bio.bi_opf        = src->bi_opf;
691
692         if (buf) {
693                 bch2_bio_map(bio, buf, output_available);
694                 return bio;
695         }
696
697         wbio->bounce            = true;
698
699         /*
700          * We can't use mempool for more than c->sb.encoded_extent_max
701          * worth of pages, but we'd like to allocate more if we can:
702          */
703         bch2_bio_alloc_pages_pool(c, bio,
704                                   min_t(unsigned, output_available,
705                                         c->sb.encoded_extent_max << 9));
706
707         if (bio->bi_iter.bi_size < output_available)
708                 *page_alloc_failed =
709                         bch2_bio_alloc_pages(bio,
710                                              output_available -
711                                              bio->bi_iter.bi_size,
712                                              GFP_NOFS) != 0;
713
714         return bio;
715 }
716
717 static int bch2_write_rechecksum(struct bch_fs *c,
718                                  struct bch_write_op *op,
719                                  unsigned new_csum_type)
720 {
721         struct bio *bio = &op->wbio.bio;
722         struct bch_extent_crc_unpacked new_crc;
723         int ret;
724
725         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
726
727         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
728             bch2_csum_type_is_encryption(new_csum_type))
729                 new_csum_type = op->crc.csum_type;
730
731         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
732                                   NULL, &new_crc,
733                                   op->crc.offset, op->crc.live_size,
734                                   new_csum_type);
735         if (ret)
736                 return ret;
737
738         bio_advance(bio, op->crc.offset << 9);
739         bio->bi_iter.bi_size = op->crc.live_size << 9;
740         op->crc = new_crc;
741         return 0;
742 }
743
744 static int bch2_write_decrypt(struct bch_write_op *op)
745 {
746         struct bch_fs *c = op->c;
747         struct nonce nonce = extent_nonce(op->version, op->crc);
748         struct bch_csum csum;
749
750         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
751                 return 0;
752
753         /*
754          * If we need to decrypt data in the write path, we'll no longer be able
755          * to verify the existing checksum (poly1305 mac, in this case) after
756          * it's decrypted - this is the last point we'll be able to reverify the
757          * checksum:
758          */
759         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
760         if (bch2_crc_cmp(op->crc.csum, csum))
761                 return -EIO;
762
763         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
764         op->crc.csum_type = 0;
765         op->crc.csum = (struct bch_csum) { 0, 0 };
766         return 0;
767 }
768
769 static enum prep_encoded_ret {
770         PREP_ENCODED_OK,
771         PREP_ENCODED_ERR,
772         PREP_ENCODED_CHECKSUM_ERR,
773         PREP_ENCODED_DO_WRITE,
774 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
775 {
776         struct bch_fs *c = op->c;
777         struct bio *bio = &op->wbio.bio;
778
779         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
780                 return PREP_ENCODED_OK;
781
782         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
783
784         /* Can we just write the entire extent as is? */
785         if (op->crc.uncompressed_size == op->crc.live_size &&
786             op->crc.compressed_size <= wp->sectors_free &&
787             op->crc.compression_type == op->compression_type) {
788                 if (!op->crc.compression_type &&
789                     op->csum_type != op->crc.csum_type &&
790                     bch2_write_rechecksum(c, op, op->csum_type))
791                         return PREP_ENCODED_CHECKSUM_ERR;
792
793                 return PREP_ENCODED_DO_WRITE;
794         }
795
796         /*
797          * If the data is compressed and we couldn't write the entire extent as
798          * is, we have to decompress it:
799          */
800         if (op->crc.compression_type) {
801                 struct bch_csum csum;
802
803                 if (bch2_write_decrypt(op))
804                         return PREP_ENCODED_CHECKSUM_ERR;
805
806                 /* Last point we can still verify checksum: */
807                 csum = bch2_checksum_bio(c, op->crc.csum_type,
808                                          extent_nonce(op->version, op->crc),
809                                          bio);
810                 if (bch2_crc_cmp(op->crc.csum, csum))
811                         return PREP_ENCODED_CHECKSUM_ERR;
812
813                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
814                         return PREP_ENCODED_ERR;
815         }
816
817         /*
818          * No longer have compressed data after this point - data might be
819          * encrypted:
820          */
821
822         /*
823          * If the data is checksummed and we're only writing a subset,
824          * rechecksum and adjust bio to point to currently live data:
825          */
826         if ((op->crc.live_size != op->crc.uncompressed_size ||
827              op->crc.csum_type != op->csum_type) &&
828             bch2_write_rechecksum(c, op, op->csum_type))
829                 return PREP_ENCODED_CHECKSUM_ERR;
830
831         /*
832          * If we want to compress the data, it has to be decrypted:
833          */
834         if ((op->compression_type ||
835              bch2_csum_type_is_encryption(op->crc.csum_type) !=
836              bch2_csum_type_is_encryption(op->csum_type)) &&
837             bch2_write_decrypt(op))
838                 return PREP_ENCODED_CHECKSUM_ERR;
839
840         return PREP_ENCODED_OK;
841 }
842
843 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
844                              struct bio **_dst)
845 {
846         struct bch_fs *c = op->c;
847         struct bio *src = &op->wbio.bio, *dst = src;
848         struct bvec_iter saved_iter;
849         void *ec_buf;
850         struct bpos ec_pos = op->pos;
851         unsigned total_output = 0, total_input = 0;
852         bool bounce = false;
853         bool page_alloc_failed = false;
854         int ret, more = 0;
855
856         BUG_ON(!bio_sectors(src));
857
858         ec_buf = bch2_writepoint_ec_buf(c, wp);
859
860         switch (bch2_write_prep_encoded_data(op, wp)) {
861         case PREP_ENCODED_OK:
862                 break;
863         case PREP_ENCODED_ERR:
864                 ret = -EIO;
865                 goto err;
866         case PREP_ENCODED_CHECKSUM_ERR:
867                 goto csum_err;
868         case PREP_ENCODED_DO_WRITE:
869                 /* XXX look for bug here */
870                 if (ec_buf) {
871                         dst = bch2_write_bio_alloc(c, wp, src,
872                                                    &page_alloc_failed,
873                                                    ec_buf);
874                         bio_copy_data(dst, src);
875                         bounce = true;
876                 }
877                 init_append_extent(op, wp, op->version, op->crc);
878                 goto do_write;
879         }
880
881         if (ec_buf ||
882             op->compression_type ||
883             (op->csum_type &&
884              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
885             (bch2_csum_type_is_encryption(op->csum_type) &&
886              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
887                 dst = bch2_write_bio_alloc(c, wp, src,
888                                            &page_alloc_failed,
889                                            ec_buf);
890                 bounce = true;
891         }
892
893         saved_iter = dst->bi_iter;
894
895         do {
896                 struct bch_extent_crc_unpacked crc =
897                         (struct bch_extent_crc_unpacked) { 0 };
898                 struct bversion version = op->version;
899                 size_t dst_len, src_len;
900
901                 if (page_alloc_failed &&
902                     bio_sectors(dst) < wp->sectors_free &&
903                     bio_sectors(dst) < c->sb.encoded_extent_max)
904                         break;
905
906                 BUG_ON(op->compression_type &&
907                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
908                        bch2_csum_type_is_encryption(op->crc.csum_type));
909                 BUG_ON(op->compression_type && !bounce);
910
911                 crc.compression_type = op->compression_type
912                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
913                                              op->compression_type)
914                         : 0;
915                 if (!crc.compression_type) {
916                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
917                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
918
919                         if (op->csum_type)
920                                 dst_len = min_t(unsigned, dst_len,
921                                                 c->sb.encoded_extent_max << 9);
922
923                         if (bounce) {
924                                 swap(dst->bi_iter.bi_size, dst_len);
925                                 bio_copy_data(dst, src);
926                                 swap(dst->bi_iter.bi_size, dst_len);
927                         }
928
929                         src_len = dst_len;
930                 }
931
932                 BUG_ON(!src_len || !dst_len);
933
934                 if (bch2_csum_type_is_encryption(op->csum_type)) {
935                         if (bversion_zero(version)) {
936                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
937                         } else {
938                                 crc.nonce = op->nonce;
939                                 op->nonce += src_len >> 9;
940                         }
941                 }
942
943                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
944                     !crc.compression_type &&
945                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
946                     bch2_csum_type_is_encryption(op->csum_type)) {
947                         /*
948                          * Note: when we're using rechecksum(), we need to be
949                          * checksumming @src because it has all the data our
950                          * existing checksum covers - if we bounced (because we
951                          * were trying to compress), @dst will only have the
952                          * part of the data the new checksum will cover.
953                          *
954                          * But normally we want to be checksumming post bounce,
955                          * because part of the reason for bouncing is so the
956                          * data can't be modified (by userspace) while it's in
957                          * flight.
958                          */
959                         if (bch2_rechecksum_bio(c, src, version, op->crc,
960                                         &crc, &op->crc,
961                                         src_len >> 9,
962                                         bio_sectors(src) - (src_len >> 9),
963                                         op->csum_type))
964                                 goto csum_err;
965                 } else {
966                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
967                             bch2_rechecksum_bio(c, src, version, op->crc,
968                                         NULL, &op->crc,
969                                         src_len >> 9,
970                                         bio_sectors(src) - (src_len >> 9),
971                                         op->crc.csum_type))
972                                 goto csum_err;
973
974                         crc.compressed_size     = dst_len >> 9;
975                         crc.uncompressed_size   = src_len >> 9;
976                         crc.live_size           = src_len >> 9;
977
978                         swap(dst->bi_iter.bi_size, dst_len);
979                         bch2_encrypt_bio(c, op->csum_type,
980                                          extent_nonce(version, crc), dst);
981                         crc.csum = bch2_checksum_bio(c, op->csum_type,
982                                          extent_nonce(version, crc), dst);
983                         crc.csum_type = op->csum_type;
984                         swap(dst->bi_iter.bi_size, dst_len);
985                 }
986
987                 init_append_extent(op, wp, version, crc);
988
989                 if (dst != src)
990                         bio_advance(dst, dst_len);
991                 bio_advance(src, src_len);
992                 total_output    += dst_len;
993                 total_input     += src_len;
994         } while (dst->bi_iter.bi_size &&
995                  src->bi_iter.bi_size &&
996                  wp->sectors_free &&
997                  !bch2_keylist_realloc(&op->insert_keys,
998                                       op->inline_keys,
999                                       ARRAY_SIZE(op->inline_keys),
1000                                       BKEY_EXTENT_U64s_MAX));
1001
1002         more = src->bi_iter.bi_size != 0;
1003
1004         dst->bi_iter = saved_iter;
1005
1006         if (dst == src && more) {
1007                 BUG_ON(total_output != total_input);
1008
1009                 dst = bio_split(src, total_input >> 9,
1010                                 GFP_NOIO, &c->bio_write);
1011                 wbio_init(dst)->put_bio = true;
1012                 /* copy WRITE_SYNC flag */
1013                 dst->bi_opf             = src->bi_opf;
1014         }
1015
1016         dst->bi_iter.bi_size = total_output;
1017 do_write:
1018         /* might have done a realloc... */
1019         bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
1020
1021         *_dst = dst;
1022         return more;
1023 csum_err:
1024         bch_err(c, "error verifying existing checksum while "
1025                 "rewriting existing data (memory corruption?)");
1026         ret = -EIO;
1027 err:
1028         if (to_wbio(dst)->bounce)
1029                 bch2_bio_free_pages_pool(c, dst);
1030         if (to_wbio(dst)->put_bio)
1031                 bio_put(dst);
1032
1033         return ret;
1034 }
1035
1036 static void __bch2_write(struct closure *cl)
1037 {
1038         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1039         struct bch_fs *c = op->c;
1040         struct write_point *wp;
1041         struct bio *bio;
1042         bool skip_put = true;
1043         int ret;
1044 again:
1045         memset(&op->failed, 0, sizeof(op->failed));
1046
1047         do {
1048                 struct bkey_i *key_to_write;
1049                 unsigned key_to_write_offset = op->insert_keys.top_p -
1050                         op->insert_keys.keys_p;
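                /*
                 * Note that we remember an offset into the keylist, not a
                 * pointer - bch2_keylist_realloc() below may move the keys:
                 */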
1051
1052                 /* +1 for possible cache device: */
1053                 if (op->open_buckets.nr + op->nr_replicas + 1 >
1054                     ARRAY_SIZE(op->open_buckets.v))
1055                         goto flush_io;
1056
1057                 if (bch2_keylist_realloc(&op->insert_keys,
1058                                         op->inline_keys,
1059                                         ARRAY_SIZE(op->inline_keys),
1060                                         BKEY_EXTENT_U64s_MAX))
1061                         goto flush_io;
1062
1063                 wp = bch2_alloc_sectors_start(c,
1064                         op->target,
1065                         op->opts.erasure_code,
1066                         op->write_point,
1067                         &op->devs_have,
1068                         op->nr_replicas,
1069                         op->nr_replicas_required,
1070                         op->alloc_reserve,
1071                         op->flags,
1072                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
1073                 EBUG_ON(!wp);
1074
1075                 if (unlikely(IS_ERR(wp))) {
1076                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
1077                                 ret = PTR_ERR(wp);
1078                                 goto err;
1079                         }
1080
1081                         goto flush_io;
1082                 }
1083
1084                 bch2_open_bucket_get(c, wp, &op->open_buckets);
1085                 ret = bch2_write_extent(op, wp, &bio);
1086                 bch2_alloc_sectors_done(c, wp);
1087
1088                 if (ret < 0)
1089                         goto err;
1090
1091                 if (ret)
1092                         skip_put = false;
1093
1094                 bio->bi_end_io  = bch2_write_endio;
1095                 bio->bi_private = &op->cl;
1096                 bio->bi_opf |= REQ_OP_WRITE;
1097
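                /*
                 * If the entire write was consumed by a single
                 * bch2_write_extent() call, skip_put stays true and we never
                 * take an extra closure ref per bio - bch2_write_endio() then
                 * kicks off the index update itself via
                 * BCH_WRITE_SKIP_CLOSURE_PUT.
                 */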
1098                 if (!skip_put)
1099                         closure_get(bio->bi_private);
1100                 else
1101                         op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
1102
1103                 key_to_write = (void *) (op->insert_keys.keys_p +
1104                                          key_to_write_offset);
1105
1106                 bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
1107                                           key_to_write);
1108         } while (ret);
1109
1110         if (!skip_put)
1111                 continue_at(cl, bch2_write_index, index_update_wq(op));
1112         return;
1113 err:
1114         op->error = ret;
1115
1116         continue_at(cl, bch2_write_index, index_update_wq(op));
1117         return;
1118 flush_io:
1119         closure_sync(cl);
1120
1121         if (!bch2_keylist_empty(&op->insert_keys)) {
1122                 __bch2_write_index(op);
1123
1124                 if (op->error) {
1125                         continue_at_nobarrier(cl, bch2_write_done, NULL);
1126                         return;
1127                 }
1128         }
1129
1130         goto again;
1131 }
1132
1133 static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
1134 {
1135         struct closure *cl = &op->cl;
1136         struct bio *bio = &op->wbio.bio;
1137         struct bvec_iter iter;
1138         struct bkey_i_inline_data *id;
1139         unsigned sectors;
1140         int ret;
1141
1142         bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
1143
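        /*
         * The data is stored inline in the bkey value: we need room for the
         * key itself plus data_len bytes rounded up to a whole number of
         * u64s (the tail is zero padded below):
         */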
1144         ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
1145                                    ARRAY_SIZE(op->inline_keys),
1146                                    BKEY_U64s + DIV_ROUND_UP(data_len, 8));
1147         if (ret) {
1148                 op->error = ret;
1149                 goto err;
1150         }
1151
1152         sectors = bio_sectors(bio);
1153         op->pos.offset += sectors;
1154
1155         id = bkey_inline_data_init(op->insert_keys.top);
1156         id->k.p         = op->pos;
1157         id->k.version   = op->version;
1158         id->k.size      = sectors;
1159
1160         iter = bio->bi_iter;
1161         iter.bi_size = data_len;
1162         memcpy_from_bio(id->v.data, bio, iter);
1163
1164         while (data_len & 7)
1165                 id->v.data[data_len++] = '\0';
1166         set_bkey_val_bytes(&id->k, data_len);
1167         bch2_keylist_push(&op->insert_keys);
1168
1169         op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
1170         continue_at_nobarrier(cl, bch2_write_index, NULL);
1171         return;
1172 err:
1173         bch2_write_done(&op->cl);
1174 }
1175
1176 /**
1177  * bch2_write - handle a write to a cache device or flash only volume
1178  *
1179  * This is the starting point for any data to end up in a cache device; it could
1180  * be from a normal write, or a writeback write, or a write to a flash only
1181  * volume - it's also used by the moving garbage collector to compact data in
1182  * mostly empty buckets.
1183  *
1184  * It first writes the data to the cache, creating a list of keys to be inserted
1185  * (if the data won't fit in a single open bucket, there will be multiple keys);
1186  * after the data is written it calls bch_journal, and after the keys have been
1187  * after the data is written the keys are journalled, and after the keys have been
1188  *
1189  * If op->discard is true, instead of inserting the data it invalidates the
1190  * region of the cache represented by op->bio and op->inode.
1191  */
1192 void bch2_write(struct closure *cl)
1193 {
1194         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
1195         struct bio *bio = &op->wbio.bio;
1196         struct bch_fs *c = op->c;
1197         unsigned data_len;
1198
1199         BUG_ON(!op->nr_replicas);
1200         BUG_ON(!op->write_point.v);
1201         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
1202
1203         op->start_time = local_clock();
1204         bch2_keylist_init(&op->insert_keys, op->inline_keys);
1205         wbio_init(bio)->put_bio = false;
1206
1207         if (bio_sectors(bio) & (c->opts.block_size - 1)) {
1208                 __bcache_io_error(c, "misaligned write");
1209                 op->error = -EIO;
1210                 goto err;
1211         }
1212
1213         if (c->opts.nochanges ||
1214             !percpu_ref_tryget(&c->writes)) {
1215                 __bcache_io_error(c, "read only");
1216                 op->error = -EROFS;
1217                 goto err;
1218         }
1219
1220         bch2_increment_clock(c, bio_sectors(bio), WRITE);
1221
1222         data_len = min_t(u64, bio->bi_iter.bi_size,
1223                          op->new_i_size - (op->pos.offset << 9));
1224
1225         if (c->opts.inline_data &&
1226             data_len <= min(block_bytes(c) / 2, 1024U)) {
1227                 bch2_write_data_inline(op, data_len);
1228                 return;
1229         }
1230
1231         continue_at_nobarrier(cl, __bch2_write, NULL);
1232         return;
1233 err:
1234         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
1235                 bch2_disk_reservation_put(c, &op->res);
1236
1237         if (op->end_io) {
1238                 EBUG_ON(cl->parent);
1239                 closure_debug_destroy(cl);
1240                 op->end_io(op);
1241         } else {
1242                 closure_return(cl);
1243         }
1244 }
1245
1246 /* Cache promotion on read */
1247
1248 struct promote_op {
1249         struct closure          cl;
1250         struct rcu_head         rcu;
1251         u64                     start_time;
1252
1253         struct rhash_head       hash;
1254         struct bpos             pos;
1255
1256         struct migrate_write    write;
1257         struct bio_vec          bi_inline_vecs[0]; /* must be last */
1258 };
1259
1260 static const struct rhashtable_params bch_promote_params = {
1261         .head_offset    = offsetof(struct promote_op, hash),
1262         .key_offset     = offsetof(struct promote_op, pos),
1263         .key_len        = sizeof(struct bpos),
1264 };
1265
1266 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
1267                                   struct bpos pos,
1268                                   struct bch_io_opts opts,
1269                                   unsigned flags)
1270 {
1271         if (!(flags & BCH_READ_MAY_PROMOTE))
1272                 return false;
1273
1274         if (!opts.promote_target)
1275                 return false;
1276
1277         if (bch2_bkey_has_target(c, k, opts.promote_target))
1278                 return false;
1279
1280         if (bch2_target_congested(c, opts.promote_target)) {
1281                 /* XXX trace this */
1282                 return false;
1283         }
1284
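        /*
         * Don't kick off a second promote for an extent we're already
         * promoting - promotes in flight are tracked in c->promote_table,
         * keyed by position:
         */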
1285         if (rhashtable_lookup_fast(&c->promote_table, &pos,
1286                                    bch_promote_params))
1287                 return false;
1288
1289         return true;
1290 }
1291
1292 static void promote_free(struct bch_fs *c, struct promote_op *op)
1293 {
1294         int ret;
1295
1296         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
1297                                      bch_promote_params);
1298         BUG_ON(ret);
1299         percpu_ref_put(&c->writes);
1300         kfree_rcu(op, rcu);
1301 }
1302
1303 static void promote_done(struct closure *cl)
1304 {
1305         struct promote_op *op =
1306                 container_of(cl, struct promote_op, cl);
1307         struct bch_fs *c = op->write.op.c;
1308
1309         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
1310                                op->start_time);
1311
1312         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
1313         promote_free(c, op);
1314 }
1315
1316 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
1317 {
1318         struct bch_fs *c = rbio->c;
1319         struct closure *cl = &op->cl;
1320         struct bio *bio = &op->write.op.wbio.bio;
1321
1322         trace_promote(&rbio->bio);
1323
1324         /* we now own pages: */
1325         BUG_ON(!rbio->bounce);
1326         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
1327
1328         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
1329                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1330         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1331
1332         bch2_migrate_read_done(&op->write, rbio);
1333
1334         closure_init(cl, NULL);
1335         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1336         closure_return_with_destructor(cl, promote_done);
1337 }
1338
1339 static struct promote_op *__promote_alloc(struct bch_fs *c,
1340                                           enum btree_id btree_id,
1341                                           struct bpos pos,
1342                                           struct extent_ptr_decoded *pick,
1343                                           struct bch_io_opts opts,
1344                                           unsigned sectors,
1345                                           struct bch_read_bio **rbio)
1346 {
1347         struct promote_op *op = NULL;
1348         struct bio *bio;
1349         unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
1350         int ret;
1351
1352         if (!percpu_ref_tryget(&c->writes))
1353                 return NULL;
1354
1355         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
1356         if (!op)
1357                 goto err;
1358
1359         op->start_time = local_clock();
1360         op->pos = pos;
1361
1362         /*
1363          * We don't use the mempool here because extents that aren't
1364          * checksummed or compressed can be too big for the mempool:
1365          */
1366         *rbio = kzalloc(sizeof(struct bch_read_bio) +
1367                         sizeof(struct bio_vec) * pages,
1368                         GFP_NOIO);
1369         if (!*rbio)
1370                 goto err;
1371
1372         rbio_init(&(*rbio)->bio, opts);
1373         bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
1374
1375         if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
1376                                  GFP_NOIO))
1377                 goto err;
1378
1379         (*rbio)->bounce         = true;
1380         (*rbio)->split          = true;
1381         (*rbio)->kmalloc        = true;
1382
1383         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1384                                           bch_promote_params))
1385                 goto err;
1386
1387         bio = &op->write.op.wbio.bio;
1388         bio_init(bio, bio->bi_inline_vecs, pages);
1389
1390         ret = bch2_migrate_write_init(c, &op->write,
1391                         writepoint_hashed((unsigned long) current),
1392                         opts,
1393                         DATA_PROMOTE,
1394                         (struct data_opts) {
1395                                 .target = opts.promote_target
1396                         },
1397                         btree_id,
1398                         bkey_s_c_null);
1399         BUG_ON(ret);
1400
1401         return op;
1402 err:
1403         if (*rbio)
1404                 bio_free_pages(&(*rbio)->bio);
1405         kfree(*rbio);
1406         *rbio = NULL;
1407         kfree(op);
1408         percpu_ref_put(&c->writes);
1409         return NULL;
1410 }
1411
1412 noinline
1413 static struct promote_op *promote_alloc(struct bch_fs *c,
1414                                                struct bvec_iter iter,
1415                                                struct bkey_s_c k,
1416                                                struct extent_ptr_decoded *pick,
1417                                                struct bch_io_opts opts,
1418                                                unsigned flags,
1419                                                struct bch_read_bio **rbio,
1420                                                bool *bounce,
1421                                                bool *read_full)
1422 {
1423         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1424         /* data might have to be decompressed in the write path: */
1425         unsigned sectors = promote_full
1426                 ? max(pick->crc.compressed_size, pick->crc.live_size)
1427                 : bvec_iter_sectors(iter);
1428         struct bpos pos = promote_full
1429                 ? bkey_start_pos(k.k)
1430                 : POS(k.k->p.inode, iter.bi_sector);
1431         struct promote_op *promote;
1432
1433         if (!should_promote(c, k, pos, opts, flags))
1434                 return NULL;
1435
1436         promote = __promote_alloc(c,
1437                                   k.k->type == KEY_TYPE_reflink_v
1438                                   ? BTREE_ID_REFLINK
1439                                   : BTREE_ID_EXTENTS,
1440                                   pos, pick, opts, sectors, rbio);
1441         if (!promote)
1442                 return NULL;
1443
1444         *bounce         = true;
1445         *read_full      = promote_full;
1446         return promote;
1447 }
1448
1449 /* Read */
1450
1451 #define READ_RETRY_AVOID        1
1452 #define READ_RETRY              2
1453 #define READ_ERR                3
1454
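/*
 * Context a rbio's completion work is currently running in, ordered from most
 * to least restrictive: bch2_rbio_punt() runs the work inline when the
 * requested context is already satisfied (context <= rbio->context), and
 * otherwise punts it to the given workqueue.
 */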
1455 enum rbio_context {
1456         RBIO_CONTEXT_NULL,
1457         RBIO_CONTEXT_HIGHPRI,
1458         RBIO_CONTEXT_UNBOUND,
1459 };
1460
1461 static inline struct bch_read_bio *
1462 bch2_rbio_parent(struct bch_read_bio *rbio)
1463 {
1464         return rbio->split ? rbio->parent : rbio;
1465 }
1466
1467 __always_inline
1468 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1469                            enum rbio_context context,
1470                            struct workqueue_struct *wq)
1471 {
1472         if (context <= rbio->context) {
1473                 fn(&rbio->work);
1474         } else {
1475                 rbio->work.func         = fn;
1476                 rbio->context           = context;
1477                 queue_work(wq, &rbio->work);
1478         }
1479 }
1480
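/*
 * Release the resources owned by a rbio - the promote op, bounce pages and,
 * for split rbios, the clone itself - and return the rbio that completion
 * should continue on: the parent for splits, otherwise the rbio we were given.
 */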
1481 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1482 {
1483         BUG_ON(rbio->bounce && !rbio->split);
1484
1485         if (rbio->promote)
1486                 promote_free(rbio->c, rbio->promote);
1487         rbio->promote = NULL;
1488
1489         if (rbio->bounce)
1490                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1491
1492         if (rbio->split) {
1493                 struct bch_read_bio *parent = rbio->parent;
1494
1495                 if (rbio->kmalloc)
1496                         kfree(rbio);
1497                 else
1498                         bio_put(&rbio->bio);
1499
1500                 rbio = parent;
1501         }
1502
1503         return rbio;
1504 }
1505
1506 /*
1507  * Only called on a top level bch_read_bio to complete an entire read request,
1508  * not a split:
1509  */
1510 static void bch2_rbio_done(struct bch_read_bio *rbio)
1511 {
1512         if (rbio->start_time)
1513                 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1514                                        rbio->start_time);
1515         bio_endio(&rbio->bio);
1516 }
1517
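/*
 * Retry a BCH_READ_NODECODE read (one that reads an extent verbatim, without
 * decrypting/decompressing): re-look up the extent at rbio->pos and only
 * reissue the read if the pointer we originally picked is still present;
 * otherwise the extent no longer exists and we signal that via rbio->hole.
 * bch2_read_retry() below is the equivalent for normal reads and re-walks the
 * extents btree for the whole remaining request.
 */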
1518 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1519                                      struct bvec_iter bvec_iter, u64 inode,
1520                                      struct bch_io_failures *failed,
1521                                      unsigned flags)
1522 {
1523         struct btree_trans trans;
1524         struct btree_iter *iter;
1525         struct bkey_on_stack sk;
1526         struct bkey_s_c k;
1527         int ret;
1528
1529         flags &= ~BCH_READ_LAST_FRAGMENT;
1530         flags |= BCH_READ_MUST_CLONE;
1531
1532         bkey_on_stack_init(&sk);
1533         bch2_trans_init(&trans, c, 0, 0);
1534
1535         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
1536                                    rbio->pos, BTREE_ITER_SLOTS);
1537 retry:
1538         rbio->bio.bi_status = 0;
1539
1540         k = bch2_btree_iter_peek_slot(iter);
1541         if (bkey_err(k))
1542                 goto err;
1543
1544         bkey_on_stack_reassemble(&sk, c, k);
1545         k = bkey_i_to_s_c(sk.k);
1546         bch2_trans_unlock(&trans);
1547
1548         if (!bch2_bkey_matches_ptr(c, k,
1549                                    rbio->pick.ptr,
1550                                    rbio->pos.offset -
1551                                    rbio->pick.crc.offset)) {
1552                 /* extent we wanted to read no longer exists: */
1553                 rbio->hole = true;
1554                 goto out;
1555         }
1556
1557         ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
1558         if (ret == READ_RETRY)
1559                 goto retry;
1560         if (ret)
1561                 goto err;
1562 out:
1563         bch2_rbio_done(rbio);
1564         bch2_trans_exit(&trans);
1565         bkey_on_stack_exit(&sk, c);
1566         return;
1567 err:
1568         rbio->bio.bi_status = BLK_STS_IOERR;
1569         goto out;
1570 }
1571
1572 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1573                             struct bvec_iter bvec_iter, u64 inode,
1574                             struct bch_io_failures *failed, unsigned flags)
1575 {
1576         struct btree_trans trans;
1577         struct btree_iter *iter;
1578         struct bkey_on_stack sk;
1579         struct bkey_s_c k;
1580         int ret;
1581
1582         flags &= ~BCH_READ_LAST_FRAGMENT;
1583         flags |= BCH_READ_MUST_CLONE;
1584
1585         bkey_on_stack_init(&sk);
1586         bch2_trans_init(&trans, c, 0, 0);
1587 retry:
1588         bch2_trans_begin(&trans);
1589
1590         for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
1591                            POS(inode, bvec_iter.bi_sector),
1592                            BTREE_ITER_SLOTS, k, ret) {
1593                 unsigned bytes, sectors, offset_into_extent;
1594
1595                 bkey_on_stack_reassemble(&sk, c, k);
1596                 k = bkey_i_to_s_c(sk.k);
1597
1598                 offset_into_extent = iter->pos.offset -
1599                         bkey_start_offset(k.k);
1600                 sectors = k.k->size - offset_into_extent;
1601
1602                 ret = bch2_read_indirect_extent(&trans,
1603                                         &offset_into_extent, sk.k);
1604                 if (ret)
1605                         break;
1606
1607                 sectors = min(sectors, k.k->size - offset_into_extent);
1608
1609                 bch2_trans_unlock(&trans);
1610
1611                 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
1612                 swap(bvec_iter.bi_size, bytes);
1613
1614                 ret = __bch2_read_extent(c, rbio, bvec_iter, k,
1615                                 offset_into_extent, failed, flags);
1616                 switch (ret) {
1617                 case READ_RETRY:
1618                         goto retry;
1619                 case READ_ERR:
1620                         goto err;
1621                 }
1622
1623                 if (bytes == bvec_iter.bi_size)
1624                         goto out;
1625
1626                 swap(bvec_iter.bi_size, bytes);
1627                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1628         }
1629
1630         if (ret == -EINTR)
1631                 goto retry;
1632         /*
1633          * If we get here, it better have been because there was an error
1634          * reading a btree node
1635          */
1636         BUG_ON(!ret);
1637         __bcache_io_error(c, "btree IO error: %i", ret);
1638 err:
1639         rbio->bio.bi_status = BLK_STS_IOERR;
1640 out:
1641         bch2_trans_exit(&trans);
1642         bkey_on_stack_exit(&sk, c);
1643         bch2_rbio_done(rbio);
1644 }
1645
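/*
 * Top level retry path, run out of a workqueue: drop the old read's state,
 * mark the device that failed (for READ_RETRY_AVOID) so a different replica is
 * picked, and reissue the read with BCH_READ_IN_RETRY set and promotion
 * disabled.
 */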
1646 static void bch2_rbio_retry(struct work_struct *work)
1647 {
1648         struct bch_read_bio *rbio =
1649                 container_of(work, struct bch_read_bio, work);
1650         struct bch_fs *c        = rbio->c;
1651         struct bvec_iter iter   = rbio->bvec_iter;
1652         unsigned flags          = rbio->flags;
1653         u64 inode               = rbio->pos.inode;
1654         struct bch_io_failures failed = { .nr = 0 };
1655
1656         trace_read_retry(&rbio->bio);
1657
1658         if (rbio->retry == READ_RETRY_AVOID)
1659                 bch2_mark_io_failure(&failed, &rbio->pick);
1660
1661         rbio->bio.bi_status = 0;
1662
1663         rbio = bch2_rbio_free(rbio);
1664
1665         flags |= BCH_READ_IN_RETRY;
1666         flags &= ~BCH_READ_MAY_PROMOTE;
1667
1668         if (flags & BCH_READ_NODECODE)
1669                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1670         else
1671                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1672 }
1673
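/*
 * Handle a failed read: in the synchronous retry path we only record the
 * status in rbio->retry for the caller; otherwise READ_ERR completes the
 * request with @error, and the retry cases are punted to bch2_rbio_retry() on
 * the unbound workqueue.
 */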
1674 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1675                             blk_status_t error)
1676 {
1677         rbio->retry = retry;
1678
1679         if (rbio->flags & BCH_READ_IN_RETRY)
1680                 return;
1681
1682         if (retry == READ_ERR) {
1683                 rbio = bch2_rbio_free(rbio);
1684
1685                 rbio->bio.bi_status = error;
1686                 bch2_rbio_done(rbio);
1687         } else {
1688                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1689                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1690         }
1691 }
1692
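/*
 * If we read an entire extent but only part of it is live, we can "narrow" the
 * crc in the extent key so it covers just the live range - presumably so
 * future reads don't have to read and checksum data that's no longer there.
 * This re-looks up the key, verifies it still points at the data we just read,
 * recomputes the checksum for the narrowed range (verifying the existing one
 * in the process), and commits the update - bailing out if the extent changed
 * in the meantime.
 */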
1693 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1694 {
1695         struct bch_fs *c = rbio->c;
1696         struct btree_trans trans;
1697         struct btree_iter *iter;
1698         struct bkey_s_c k;
1699         struct bkey_on_stack new;
1700         struct bch_extent_crc_unpacked new_crc;
1701         u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
1702         int ret;
1703
1704         if (rbio->pick.crc.compression_type)
1705                 return;
1706
1707         bkey_on_stack_init(&new);
1708         bch2_trans_init(&trans, c, 0, 0);
1709 retry:
1710         bch2_trans_begin(&trans);
1711
1712         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
1713                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
1714         k = bch2_btree_iter_peek_slot(iter);
1715         if (IS_ERR_OR_NULL(k.k))
1716                 goto out;
1717
1718         bkey_on_stack_reassemble(&new, c, k);
1719         k = bkey_i_to_s_c(new.k);
1720
1721         if (bversion_cmp(k.k->version, rbio->version) ||
1722             !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
1723                 goto out;
1724
1725         /* Extent was merged? */
1726         if (bkey_start_offset(k.k) < data_offset ||
1727             k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
1728                 goto out;
1729
1730         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1731                         rbio->pick.crc, NULL, &new_crc,
1732                         bkey_start_offset(k.k) - data_offset, k.k->size,
1733                         rbio->pick.crc.csum_type)) {
1734                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1735                 goto out;
1736         }
1737
1738         if (!bch2_bkey_narrow_crcs(new.k, new_crc))
1739                 goto out;
1740
1741         bch2_trans_update(&trans, iter, new.k);
1742         ret = bch2_trans_commit(&trans, NULL, NULL,
1743                                 BTREE_INSERT_NOFAIL|
1744                                 BTREE_INSERT_NOWAIT);
1745         if (ret == -EINTR)
1746                 goto retry;
1747 out:
1748         bch2_trans_exit(&trans);
1749         bkey_on_stack_exit(&new, c);
1750 }
1751
1752 /* Inner part that may run in process context */
1753 static void __bch2_read_endio(struct work_struct *work)
1754 {
1755         struct bch_read_bio *rbio =
1756                 container_of(work, struct bch_read_bio, work);
1757         struct bch_fs *c        = rbio->c;
1758         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1759         struct bio *src         = &rbio->bio;
1760         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1761         struct bvec_iter dst_iter = rbio->bvec_iter;
1762         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1763         struct nonce nonce = extent_nonce(rbio->version, crc);
1764         struct bch_csum csum;
1765
1766         /* Reset iterator for checksumming and copying bounced data: */
1767         if (rbio->bounce) {
1768                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1769                 src->bi_iter.bi_idx             = 0;
1770                 src->bi_iter.bi_bvec_done       = 0;
1771         } else {
1772                 src->bi_iter                    = rbio->bvec_iter;
1773         }
1774
1775         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1776         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1777                 goto csum_err;
1778
1779         if (unlikely(rbio->narrow_crcs))
1780                 bch2_rbio_narrow_crcs(rbio);
1781
1782         if (rbio->flags & BCH_READ_NODECODE)
1783                 goto nodecode;
1784
1785         /* Adjust crc to point to subset of data we want: */
1786         crc.offset     += rbio->offset_into_extent;
1787         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1788
1789         if (crc.compression_type != BCH_COMPRESSION_TYPE_none) {
1790                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1791                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1792                         goto decompression_err;
1793         } else {
1794                 /* don't need to decrypt the entire bio: */
1795                 nonce = nonce_add(nonce, crc.offset << 9);
1796                 bio_advance(src, crc.offset << 9);
1797
1798                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1799                 src->bi_iter.bi_size = dst_iter.bi_size;
1800
1801                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1802
1803                 if (rbio->bounce) {
1804                         struct bvec_iter src_iter = src->bi_iter;
1805                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1806                 }
1807         }
1808
1809         if (rbio->promote) {
1810                 /*
1811                  * Re-encrypt data we decrypted, so it's consistent with
1812                  * rbio->crc:
1813                  */
1814                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1815                 promote_start(rbio->promote, rbio);
1816                 rbio->promote = NULL;
1817         }
1818 nodecode:
1819         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1820                 rbio = bch2_rbio_free(rbio);
1821                 bch2_rbio_done(rbio);
1822         }
1823         return;
1824 csum_err:
1825         /*
1826          * Checksum error: if the bio wasn't bounced, we may have been
1827          * reading into buffers owned by userspace (that userspace can
1828          * scribble over) - retry the read, bouncing it this time:
1829          */
1830         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1831                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1832                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1833                 return;
1834         }
1835
1836         bch2_dev_io_error(ca,
1837                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1838                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1839                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1840                 csum.hi, csum.lo, crc.csum_type);
1841         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1842         return;
1843 decompression_err:
1844         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1845                           rbio->pos.inode,
1846                           (u64) rbio->bvec_iter.bi_sector);
1847         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1848         return;
1849 }
1850
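/*
 * Completion for the bio we submitted to the device: account latency, check
 * for device errors and stale cached pointers (both of which trigger a retry),
 * then hand the remaining work - checksum verification, decryption,
 * decompression, copying out of the bounce buffer - to __bch2_read_endio() in
 * a context where it's allowed to block, if necessary.
 */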
1851 static void bch2_read_endio(struct bio *bio)
1852 {
1853         struct bch_read_bio *rbio =
1854                 container_of(bio, struct bch_read_bio, bio);
1855         struct bch_fs *c        = rbio->c;
1856         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1857         struct workqueue_struct *wq = NULL;
1858         enum rbio_context context = RBIO_CONTEXT_NULL;
1859
1860         if (rbio->have_ioref) {
1861                 bch2_latency_acct(ca, rbio->submit_time, READ);
1862                 percpu_ref_put(&ca->io_ref);
1863         }
1864
1865         if (!rbio->split)
1866                 rbio->bio.bi_end_io = rbio->end_io;
1867
1868         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1869                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1870                 return;
1871         }
1872
1873         if (rbio->pick.ptr.cached &&
1874             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1875              ptr_stale(ca, &rbio->pick.ptr))) {
1876                 atomic_long_inc(&c->read_realloc_races);
1877
1878                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1879                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1880                 else
1881                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1882                 return;
1883         }
1884
1885         if (rbio->narrow_crcs ||
1886             rbio->pick.crc.compression_type ||
1887             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1888                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1889         else if (rbio->pick.crc.csum_type)
1890                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1891
1892         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1893 }
1894
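/*
 * Resolve a reflink pointer: look up the indirect extent it points to in the
 * reflink btree, return that key in @orig_k, and adjust *offset_into_extent to
 * be relative to the start of the indirect extent. A missing indirect extent
 * is reported as an IO error.
 */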
1895 int __bch2_read_indirect_extent(struct btree_trans *trans,
1896                                 unsigned *offset_into_extent,
1897                                 struct bkey_i *orig_k)
1898 {
1899         struct btree_iter *iter;
1900         struct bkey_s_c k;
1901         u64 reflink_offset;
1902         int ret;
1903
1904         reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
1905                 *offset_into_extent;
1906
1907         iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
1908                                    POS(0, reflink_offset),
1909                                    BTREE_ITER_SLOTS);
1910         ret = PTR_ERR_OR_ZERO(iter);
1911         if (ret)
1912                 return ret;
1913
1914         k = bch2_btree_iter_peek_slot(iter);
1915         ret = bkey_err(k);
1916         if (ret)
1917                 goto err;
1918
1919         if (k.k->type != KEY_TYPE_reflink_v) {
1920                 __bcache_io_error(trans->c,
1921                                 "pointer to nonexistent indirect extent");
1922                 ret = -EIO;
1923                 goto err;
1924         }
1925
1926         *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
1927         bkey_reassemble(orig_k, k);
1928 err:
1929         bch2_trans_iter_put(trans, iter);
1930         return ret;
1931 }
1932
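/*
 * Read a single extent (or the part of it covered by @iter): pick a replica to
 * read from, decide whether we need to bounce (for checksums, compression or
 * encryption) and whether to promote, then submit the read - or, when
 * rbio->pick.idx is set, attempt an erasure coded reconstruct read instead.
 * Inline extents are simply copied out of the key. With BCH_READ_IN_RETRY the
 * read is performed synchronously and a READ_* status is returned; otherwise
 * this returns 0 and completion runs via bch2_read_endio().
 */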
1933 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1934                        struct bvec_iter iter, struct bkey_s_c k,
1935                        unsigned offset_into_extent,
1936                        struct bch_io_failures *failed, unsigned flags)
1937 {
1938         struct extent_ptr_decoded pick;
1939         struct bch_read_bio *rbio = NULL;
1940         struct bch_dev *ca;
1941         struct promote_op *promote = NULL;
1942         bool bounce = false, read_full = false, narrow_crcs = false;
1943         struct bpos pos = bkey_start_pos(k.k);
1944         int pick_ret;
1945
1946         if (k.k->type == KEY_TYPE_inline_data) {
1947                 struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
1948                 unsigned bytes = min_t(unsigned, iter.bi_size,
1949                                        bkey_val_bytes(d.k));
1950
1951                 swap(iter.bi_size, bytes);
1952                 memcpy_to_bio(&orig->bio, iter, d.v->data);
1953                 swap(iter.bi_size, bytes);
1954                 bio_advance_iter(&orig->bio, &iter, bytes);
1955                 zero_fill_bio_iter(&orig->bio, iter);
1956                 goto out_read_done;
1957         }
1958
1959         pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
1960
1961         /* hole or reservation - just zero fill: */
1962         if (!pick_ret)
1963                 goto hole;
1964
1965         if (pick_ret < 0) {
1966                 __bcache_io_error(c, "no device to read from");
1967                 goto err;
1968         }
1969
1970         if (pick_ret > 0)
1971                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1972
1973         if (flags & BCH_READ_NODECODE) {
1974                 /*
1975                  * can happen if we retry, and the extent we were going to read
1976                  * has been merged in the meantime:
1977                  */
1978                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1979                         goto hole;
1980
1981                 iter.bi_size    = pick.crc.compressed_size << 9;
1982                 goto noclone;
1983         }
1984
1985         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1986             bio_flagged(&orig->bio, BIO_CHAIN))
1987                 flags |= BCH_READ_MUST_CLONE;
1988
1989         narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
1990                 bch2_can_narrow_extent_crcs(k, pick.crc);
1991
1992         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1993                 flags |= BCH_READ_MUST_BOUNCE;
1994
1995         EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
1996
1997         if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none ||
1998             (pick.crc.csum_type != BCH_CSUM_NONE &&
1999              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2000               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
2001                (flags & BCH_READ_USER_MAPPED)) ||
2002               (flags & BCH_READ_MUST_BOUNCE)))) {
2003                 read_full = true;
2004                 bounce = true;
2005         }
2006
2007         if (orig->opts.promote_target)
2008                 promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
2009                                         &rbio, &bounce, &read_full);
2010
2011         if (!read_full) {
2012                 EBUG_ON(pick.crc.compression_type);
2013                 EBUG_ON(pick.crc.csum_type &&
2014                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
2015                          bvec_iter_sectors(iter) != pick.crc.live_size ||
2016                          pick.crc.offset ||
2017                          offset_into_extent));
2018
2019                 pos.offset += offset_into_extent;
2020                 pick.ptr.offset += pick.crc.offset +
2021                         offset_into_extent;
2022                 offset_into_extent              = 0;
2023                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
2024                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
2025                 pick.crc.offset                 = 0;
2026                 pick.crc.live_size              = bvec_iter_sectors(iter);
2028         }
2029
2030         if (rbio) {
2031                 /*
2032                  * promote already allocated bounce rbio:
2033                  * promote needs to allocate a bio big enough for uncompressing
2034                  * data in the write path, but we're not going to use it all
2035                  * here:
2036                  */
2037                 EBUG_ON(rbio->bio.bi_iter.bi_size <
2038                        pick.crc.compressed_size << 9);
2039                 rbio->bio.bi_iter.bi_size =
2040                         pick.crc.compressed_size << 9;
2041         } else if (bounce) {
2042                 unsigned sectors = pick.crc.compressed_size;
2043
2044                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
2045                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
2046                                                   &c->bio_read_split),
2047                                  orig->opts);
2048
2049                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
2050                 rbio->bounce    = true;
2051                 rbio->split     = true;
2052         } else if (flags & BCH_READ_MUST_CLONE) {
2053                 /*
2054                  * Have to clone if there were any splits, due to error
2055                  * reporting issues: if a split errored and retrying didn't
2056                  * work, then when it reports the error to its parent (us) we
2057                  * can't tell whether the error was from just that split (in
2058                  * which case we should retry) or from the whole bio (in which
2059                  * case retrying would only lose the error).
2060                  */
2061                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
2062                                                 &c->bio_read_split),
2063                                  orig->opts);
2064                 rbio->bio.bi_iter = iter;
2065                 rbio->split     = true;
2066         } else {
2067 noclone:
2068                 rbio = orig;
2069                 rbio->bio.bi_iter = iter;
2070                 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
2071         }
2072
2073         EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
2074
2075         rbio->c                 = c;
2076         rbio->submit_time       = local_clock();
2077         if (rbio->split)
2078                 rbio->parent    = orig;
2079         else
2080                 rbio->end_io    = orig->bio.bi_end_io;
2081         rbio->bvec_iter         = iter;
2082         rbio->offset_into_extent = offset_into_extent;
2083         rbio->flags             = flags;
2084         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
2085         rbio->narrow_crcs       = narrow_crcs;
2086         rbio->hole              = 0;
2087         rbio->retry             = 0;
2088         rbio->context           = 0;
2089         /* XXX: only initialize this if needed */
2090         rbio->devs_have         = bch2_bkey_devs(k);
2091         rbio->pick              = pick;
2092         rbio->pos               = pos;
2093         rbio->version           = k.k->version;
2094         rbio->promote           = promote;
2095         INIT_WORK(&rbio->work, NULL);
2096
2097         rbio->bio.bi_opf        = orig->bio.bi_opf;
2098         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
2099         rbio->bio.bi_end_io     = bch2_read_endio;
2100
2101         if (rbio->bounce)
2102                 trace_read_bounce(&rbio->bio);
2103
2104         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
2105
2106         rcu_read_lock();
2107         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
2108         rcu_read_unlock();
2109
2110         if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
2111                 bio_inc_remaining(&orig->bio);
2112                 trace_read_split(&orig->bio);
2113         }
2114
2115         if (!rbio->pick.idx) {
2116                 if (!rbio->have_ioref) {
2117                         __bcache_io_error(c, "no device to read from");
2118                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2119                         goto out;
2120                 }
2121
2122                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
2123                              bio_sectors(&rbio->bio));
2124                 bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
2125
2126                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2127                         submit_bio(&rbio->bio);
2128                 else
2129                         submit_bio_wait(&rbio->bio);
2130         } else {
2131                 /* Attempting reconstruct read: */
2132                 if (bch2_ec_read_extent(c, rbio)) {
2133                         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
2134                         goto out;
2135                 }
2136
2137                 if (likely(!(flags & BCH_READ_IN_RETRY)))
2138                         bio_endio(&rbio->bio);
2139         }
2140 out:
2141         if (likely(!(flags & BCH_READ_IN_RETRY))) {
2142                 return 0;
2143         } else {
2144                 int ret;
2145
2146                 rbio->context = RBIO_CONTEXT_UNBOUND;
2147                 bch2_read_endio(&rbio->bio);
2148
2149                 ret = rbio->retry;
2150                 rbio = bch2_rbio_free(rbio);
2151
2152                 if (ret == READ_RETRY_AVOID) {
2153                         bch2_mark_io_failure(failed, &pick);
2154                         ret = READ_RETRY;
2155                 }
2156
2157                 return ret;
2158         }
2159
2160 err:
2161         if (flags & BCH_READ_IN_RETRY)
2162                 return READ_ERR;
2163
2164         orig->bio.bi_status = BLK_STS_IOERR;
2165         goto out_read_done;
2166
2167 hole:
2168         /*
2169          * won't normally happen in the BCH_READ_NODECODE
2170          * (bch2_move_extent()) path, but if we retry and the extent we wanted
2171          * to read no longer exists we have to signal that:
2172          */
2173         if (flags & BCH_READ_NODECODE)
2174                 orig->hole = true;
2175
2176         zero_fill_bio_iter(&orig->bio, iter);
2177 out_read_done:
2178         if (flags & BCH_READ_LAST_FRAGMENT)
2179                 bch2_rbio_done(orig);
2180         return 0;
2181 }
2182
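/*
 * Entry point for a normal (decoding, promotable, user mapped) read: walk the
 * extents overlapping the request, resolving indirect extents as we go, and
 * issue bch2_read_extent() for each fragment, marking the last one with
 * BCH_READ_LAST_FRAGMENT so the request is completed exactly once.
 */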
2183 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
2184 {
2185         struct btree_trans trans;
2186         struct btree_iter *iter;
2187         struct bkey_on_stack sk;
2188         struct bkey_s_c k;
2189         unsigned flags = BCH_READ_RETRY_IF_STALE|
2190                 BCH_READ_MAY_PROMOTE|
2191                 BCH_READ_USER_MAPPED;
2192         int ret;
2193
2194         BUG_ON(rbio->_state);
2195         BUG_ON(flags & BCH_READ_NODECODE);
2196         BUG_ON(flags & BCH_READ_IN_RETRY);
2197
2198         rbio->c = c;
2199         rbio->start_time = local_clock();
2200
2201         bkey_on_stack_init(&sk);
2202         bch2_trans_init(&trans, c, 0, 0);
2203 retry:
2204         bch2_trans_begin(&trans);
2205
2206         iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
2207                                    POS(inode, rbio->bio.bi_iter.bi_sector),
2208                                    BTREE_ITER_SLOTS);
2209         while (1) {
2210                 unsigned bytes, sectors, offset_into_extent;
2211
2212                 bch2_btree_iter_set_pos(iter,
2213                                 POS(inode, rbio->bio.bi_iter.bi_sector));
2214
2215                 k = bch2_btree_iter_peek_slot(iter);
2216                 ret = bkey_err(k);
2217                 if (ret)
2218                         goto err;
2219
2220                 offset_into_extent = iter->pos.offset -
2221                         bkey_start_offset(k.k);
2222                 sectors = k.k->size - offset_into_extent;
2223
2224                 bkey_on_stack_reassemble(&sk, c, k);
2225                 k = bkey_i_to_s_c(sk.k);
2226
2227                 ret = bch2_read_indirect_extent(&trans,
2228                                         &offset_into_extent, sk.k);
2229                 if (ret)
2230                         goto err;
2231
2232                 /*
2233                  * With indirect extents, the amount of data to read is the min
2234                  * of the original extent and the indirect extent:
2235                  */
2236                 sectors = min(sectors, k.k->size - offset_into_extent);
2237
2238                 /*
2239                  * Unlock the iterator while the btree node's lock is still in
2240                  * cache, before doing the IO:
2241                  */
2242                 bch2_trans_unlock(&trans);
2243
2244                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
2245                 swap(rbio->bio.bi_iter.bi_size, bytes);
2246
2247                 if (rbio->bio.bi_iter.bi_size == bytes)
2248                         flags |= BCH_READ_LAST_FRAGMENT;
2249
2250                 bch2_read_extent(c, rbio, k, offset_into_extent, flags);
2251
2252                 if (flags & BCH_READ_LAST_FRAGMENT)
2253                         break;
2254
2255                 swap(rbio->bio.bi_iter.bi_size, bytes);
2256                 bio_advance(&rbio->bio, bytes);
2257         }
2258 out:
2259         bch2_trans_exit(&trans);
2260         bkey_on_stack_exit(&sk, c);
2261         return;
2262 err:
2263         if (ret == -EINTR)
2264                 goto retry;
2265
2266         bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
2267         bch2_rbio_done(rbio);
2268         goto out;
2269 }
2270
2271 void bch2_fs_io_exit(struct bch_fs *c)
2272 {
2273         if (c->promote_table.tbl)
2274                 rhashtable_destroy(&c->promote_table);
2275         mempool_exit(&c->bio_bounce_pages);
2276         bioset_exit(&c->bio_write);
2277         bioset_exit(&c->bio_read_split);
2278         bioset_exit(&c->bio_read);
2279 }
2280
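/*
 * The biosets are initialized with a reserve of one bio each; the bounce page
 * mempool reserves max(btree_node_size, encoded_extent_max) worth of pages,
 * presumably so a single encoded extent or btree node can always be bounced
 * even under memory pressure.
 */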
2281 int bch2_fs_io_init(struct bch_fs *c)
2282 {
2283         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
2284                         BIOSET_NEED_BVECS) ||
2285             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
2286                         BIOSET_NEED_BVECS) ||
2287             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
2288                         BIOSET_NEED_BVECS) ||
2289             mempool_init_page_pool(&c->bio_bounce_pages,
2290                                    max_t(unsigned,
2291                                          c->opts.btree_node_size,
2292                                          c->sb.encoded_extent_max) /
2293                                    PAGE_SECTORS, 0) ||
2294             rhashtable_init(&c->promote_table, &bch_promote_params))
2295                 return -ENOMEM;
2296
2297         return 0;
2298 }