1 /*
2  * Some low level IO code, and hacks for various block layer limitations
3  *
4  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5  * Copyright 2012 Google, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "alloc_foreground.h"
10 #include "bset.h"
11 #include "btree_update.h"
12 #include "buckets.h"
13 #include "checksum.h"
14 #include "compress.h"
15 #include "clock.h"
16 #include "debug.h"
17 #include "disk_groups.h"
18 #include "error.h"
19 #include "extents.h"
20 #include "io.h"
21 #include "journal.h"
22 #include "keylist.h"
23 #include "move.h"
24 #include "rebalance.h"
25 #include "replicas.h"
26 #include "super.h"
27 #include "super-io.h"
28
29 #include <linux/blkdev.h>
30 #include <linux/random.h>
31
32 #include <trace/events/bcachefs.h>
33
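/*
 * Decide whether IO to @target should currently be avoided because its
 * devices look congested.  Each device's ->congested counter is first decayed
 * by the time since it was last bumped (roughly one unit per 4us, via the
 * >> 12 of elapsed nanoseconds), then the decision is made randomly: the
 * probability of returning true is total congestion / (nr devices *
 * CONGESTED_MAX), i.e. the average congestion fraction across the target.
 */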
34 static bool bch2_target_congested(struct bch_fs *c, u16 target)
35 {
36         const struct bch_devs_mask *devs;
37         unsigned d, nr = 0, total = 0;
38         u64 now = local_clock(), last;
39         s64 congested;
40         struct bch_dev *ca;
41
42         if (!target)
43                 return false;
44
45         rcu_read_lock();
46         devs = bch2_target_to_mask(c, target);
47         for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
48                 ca = rcu_dereference(c->devs[d]);
49                 if (!ca)
50                         continue;
51
52                 congested = atomic_read(&ca->congested);
53                 last = READ_ONCE(ca->congested_last);
54                 if (time_after64(now, last))
55                         congested -= (now - last) >> 12;
56
57                 total += max(congested, 0LL);
58                 nr++;
59         }
60         rcu_read_unlock();
61
62         return bch2_rand_range(nr * CONGESTED_MAX) < total;
63 }
64
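/*
 * Bump or decay a device's congestion counter from one completed IO: the
 * latency the device is "capable" of comes from its tracked latency
 * quantiles, and the congestion threshold is 4x that for reads, 8x for
 * writes.  An IO over the threshold adds roughly latency_over * 4 /
 * latency_threshold, using a shift instead of a divide - e.g. with a ~1ms
 * (2^20 ns) threshold the shift is 18, so an IO 4ms over the threshold adds
 * (4 << 20) >> 18 = 16.  IOs under the threshold decay the counter by one.
 */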
65 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
66                                        u64 now, int rw)
67 {
68         u64 latency_capable =
69                 ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
70         /* ideally we'd be taking into account the device's variance here: */
71         u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
72         s64 latency_over = io_latency - latency_threshold;
73
74         if (latency_threshold && latency_over > 0) {
75                 /*
76                  * bump up congested by approximately latency_over * 4 /
77                  * latency_threshold - we don't need much accuracy here so don't
78                  * bother with the divide:
79                  */
80                 if (atomic_read(&ca->congested) < CONGESTED_MAX)
81                         atomic_add(latency_over >>
82                                    max_t(int, ilog2(latency_threshold) - 2, 0),
83                                    &ca->congested);
84
85                 ca->congested_last = now;
86         } else if (atomic_read(&ca->congested) > 0) {
87                 atomic_dec(&ca->congested);
88         }
89 }
90
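/*
 * Account the latency of a completed IO: cur_latency[rw] is an exponentially
 * weighted moving average (ewma_add() with a decay shift of 5, i.e. roughly a
 * 1/32 weight per new sample).  To keep the atomic cmpxchg off the fast path,
 * samples within +/-50% of the current average are only folded in on roughly
 * one call in 32, based on the low bits of the clock.  The sample also feeds
 * the congestion accounting and the full time_stats for this device.
 */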
91 void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
92 {
93         atomic64_t *latency = &ca->cur_latency[rw];
94         u64 now = local_clock();
95         u64 io_latency = time_after64(now, submit_time)
96                 ? now - submit_time
97                 : 0;
98         u64 old, new, v = atomic64_read(latency);
99
100         do {
101                 old = v;
102
103                 /*
104                  * If the IO latency is reasonably close to the current
105                  * average, skip the update and the atomic operation most
106                  * of the time:
107                  */
108                 if (abs((int) (old - io_latency)) < (old >> 1) &&
109                     now & ~(~0 << 5))
110                         break;
111
112                 new = ewma_add(old, io_latency, 5);
113         } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
114
115         bch2_congested_acct(ca, io_latency, now, rw);
116
117         __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
118 }
119
120 /* Allocate, free from mempool: */
121
122 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
123 {
124         struct bio_vec *bv;
125         unsigned i;
126
127         bio_for_each_segment_all(bv, bio, i)
128                 if (bv->bv_page != ZERO_PAGE(0))
129                         mempool_free(bv->bv_page, &c->bio_bounce_pages);
130         bio->bi_vcnt = 0;
131 }
132
133 static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
134                                     bool *using_mempool)
135 {
136         struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
137
138         if (likely(!*using_mempool)) {
139                 bv->bv_page = alloc_page(GFP_NOIO);
140                 if (unlikely(!bv->bv_page)) {
141                         mutex_lock(&c->bio_bounce_pages_lock);
142                         *using_mempool = true;
143                         goto pool_alloc;
144
145                 }
146         } else {
147 pool_alloc:
148                 bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
149         }
150
151         bv->bv_len = PAGE_SIZE;
152         bv->bv_offset = 0;
153 }
154
155 void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
156                                size_t bytes)
157 {
158         bool using_mempool = false;
159
160         BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs);
161
162         bio->bi_iter.bi_size = bytes;
163
164         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
165                 bch2_bio_alloc_page_pool(c, bio, &using_mempool);
166
167         if (using_mempool)
168                 mutex_unlock(&c->bio_bounce_pages_lock);
169 }
170
171 void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio,
172                                     size_t bytes)
173 {
174         while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) {
175                 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
176
177                 BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
178
179                 bv->bv_page = alloc_page(GFP_NOIO);
180                 if (!bv->bv_page) {
181                         /*
182                          * We've already allocated from the mempool; we can't
183                          * allocate from it again without freeing the pages we
184                          * already allocated, or else we could deadlock:
185                          */
186                         bch2_bio_free_pages_pool(c, bio);
187                         bch2_bio_alloc_pages_pool(c, bio, bytes);
188                         return;
189                 }
190
191                 bv->bv_len = PAGE_SIZE;
192                 bv->bv_offset = 0;
193                 bio->bi_vcnt++;
194         }
195
196         bio->bi_iter.bi_size = bytes;
197 }
198
199 /* Writes */
200
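/*
 * Submit the write to every device the key has a pointer to: all but the last
 * pointer get a clone of @wbio (bumping the parent's remaining count), the
 * last pointer reuses @wbio itself.  REQ_FUA is added when the journal
 * doesn't already flush that device.  If we couldn't get an io ref on a
 * device, its bio completes immediately with BLK_STS_REMOVED, so the endio
 * path marks that device as failed and the pointer is dropped at index
 * update time.
 */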
201 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
202                                enum bch_data_type type,
203                                const struct bkey_i *k)
204 {
205         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
206         const struct bch_extent_ptr *ptr;
207         struct bch_write_bio *n;
208         struct bch_dev *ca;
209
210         BUG_ON(c->opts.nochanges);
211
212         extent_for_each_ptr(e, ptr) {
213                 BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
214                        !c->devs[ptr->dev]);
215
216                 ca = bch_dev_bkey_exists(c, ptr->dev);
217
218                 if (ptr + 1 < &extent_entry_last(e)->ptr) {
219                         n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
220                                                    &ca->replica_set));
221
222                         n->bio.bi_end_io        = wbio->bio.bi_end_io;
223                         n->bio.bi_private       = wbio->bio.bi_private;
224                         n->parent               = wbio;
225                         n->split                = true;
226                         n->bounce               = false;
227                         n->put_bio              = true;
228                         n->bio.bi_opf           = wbio->bio.bi_opf;
229                         bio_inc_remaining(&wbio->bio);
230                 } else {
231                         n = wbio;
232                         n->split                = false;
233                 }
234
235                 n->c                    = c;
236                 n->dev                  = ptr->dev;
237                 n->have_ioref           = bch2_dev_get_ioref(ca, WRITE);
238                 n->submit_time          = local_clock();
239                 n->bio.bi_iter.bi_sector = ptr->offset;
240
241                 if (!journal_flushes_device(ca))
242                         n->bio.bi_opf |= REQ_FUA;
243
244                 if (likely(n->have_ioref)) {
245                         this_cpu_add(ca->io_done->sectors[WRITE][type],
246                                      bio_sectors(&n->bio));
247
248                         bio_set_dev(&n->bio, ca->disk_sb.bdev);
249                         submit_bio(&n->bio);
250                 } else {
251                         n->bio.bi_status        = BLK_STS_REMOVED;
252                         bio_endio(&n->bio);
253                 }
254         }
255 }
256
257 static void __bch2_write(struct closure *);
258
259 static void bch2_write_done(struct closure *cl)
260 {
261         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
262         struct bch_fs *c = op->c;
263
264         if (!op->error && (op->flags & BCH_WRITE_FLUSH))
265                 op->error = bch2_journal_error(&c->journal);
266
267         if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
268                 bch2_disk_reservation_put(c, &op->res);
269         percpu_ref_put(&c->writes);
270         bch2_keylist_free(&op->insert_keys, op->inline_keys);
271
272         bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
273
274         closure_return(cl);
275 }
276
277 int bch2_write_index_default(struct bch_write_op *op)
278 {
279         struct keylist *keys = &op->insert_keys;
280         struct btree_iter iter;
281         int ret;
282
283         bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
284                              bkey_start_pos(&bch2_keylist_front(keys)->k),
285                              BTREE_ITER_INTENT);
286
287         ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
288                                         op_journal_seq(op),
289                                         BTREE_INSERT_NOFAIL|
290                                         BTREE_INSERT_USE_RESERVE);
291         bch2_btree_iter_unlock(&iter);
292
293         return ret;
294 }
295
296 /**
297  * __bch2_write_index - after a write, update the index to point to the new data
298  */
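/*
 * Pointers to devices that reported a write error (op->failed) are dropped
 * here; if no pointers remain the write fails with -EIO.  Surviving keys have
 * their replicas marked (unless BCH_WRITE_NOMARK_REPLICAS), are fed to
 * rebalance, and are then inserted via op->index_update_fn().
 */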
299 static void __bch2_write_index(struct bch_write_op *op)
300 {
301         struct bch_fs *c = op->c;
302         struct keylist *keys = &op->insert_keys;
303         struct bkey_s_extent e;
304         struct bch_extent_ptr *ptr;
305         struct bkey_i *src, *dst = keys->keys, *n, *k;
306         int ret;
307
308         for (src = keys->keys; src != keys->top; src = n) {
309                 n = bkey_next(src);
310                 bkey_copy(dst, src);
311
312                 e = bkey_i_to_s_extent(dst);
313
314                 bch2_extent_drop_ptrs(e, ptr,
315                         test_bit(ptr->dev, op->failed.d));
316
317                 if (!bch2_extent_nr_ptrs(e.c)) {
318                         ret = -EIO;
319                         goto err;
320                 }
321
322                 if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
323                         ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_EXTENTS,
324                                                       e.s_c);
325                         if (ret)
326                                 goto err;
327                 }
328
329                 dst = bkey_next(dst);
330         }
331
332         keys->top = dst;
333
334         /*
335          * probably not the ideal place to hook this in, but I don't
336          * particularly want to plumb io_opts all the way through the btree
337          * update stack right now
338          */
339         for_each_keylist_key(keys, k)
340                 bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
341
342         if (!bch2_keylist_empty(keys)) {
343                 u64 sectors_start = keylist_sectors(keys);
344                 int ret = op->index_update_fn(op);
345
346                 BUG_ON(keylist_sectors(keys) && !ret);
347
348                 op->written += sectors_start - keylist_sectors(keys);
349
350                 if (ret) {
351                         __bcache_io_error(c, "btree IO error %i", ret);
352                         op->error = ret;
353                 }
354         }
355 out:
356         bch2_open_buckets_put(c, &op->open_buckets);
357         return;
358 err:
359         keys->top = keys->keys;
360         op->error = ret;
361         goto out;
362 }
363
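/*
 * Closure callback: run the index update, then - for BCH_WRITE_FLUSH writes -
 * wait for the journal sequence containing the new keys to be flushed before
 * continuing to bch2_write_done().
 */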
364 static void bch2_write_index(struct closure *cl)
365 {
366         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
367         struct bch_fs *c = op->c;
368
369         __bch2_write_index(op);
370
371         if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
372                 bch2_journal_flush_seq_async(&c->journal,
373                                              *op_journal_seq(op),
374                                              cl);
375                 continue_at(cl, bch2_write_done, index_update_wq(op));
376         } else {
377                 continue_at_nobarrier(cl, bch2_write_done, NULL);
378         }
379 }
380
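/*
 * Per-replica write completion: on an IO error, mark the device in op->failed
 * so the index update can drop that pointer; account IO latency, free bounce
 * pages, and either complete the parent bio (for split clones) or drop the
 * write op's closure ref.
 */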
381 static void bch2_write_endio(struct bio *bio)
382 {
383         struct closure *cl              = bio->bi_private;
384         struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
385         struct bch_write_bio *wbio      = to_wbio(bio);
386         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
387         struct bch_fs *c                = wbio->c;
388         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
389
390         if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
391                 set_bit(wbio->dev, op->failed.d);
392
393         if (wbio->have_ioref) {
394                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
395                 percpu_ref_put(&ca->io_ref);
396         }
397
398         if (wbio->bounce)
399                 bch2_bio_free_pages_pool(c, bio);
400
401         if (wbio->put_bio)
402                 bio_put(bio);
403
404         if (parent)
405                 bio_endio(&parent->bio);
406         else
407                 closure_put(cl);
408 }
409
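/*
 * Append a new extent key to op->insert_keys for the chunk just written:
 * advance op->pos by the uncompressed size, record the crc entry, and add
 * pointers for the sectors just allocated from @wp.
 */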
410 static void init_append_extent(struct bch_write_op *op,
411                                struct write_point *wp,
412                                struct bversion version,
413                                struct bch_extent_crc_unpacked crc)
414 {
415         struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
416
417         op->pos.offset += crc.uncompressed_size;
418         e->k.p = op->pos;
419         e->k.size = crc.uncompressed_size;
420         e->k.version = version;
421         bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
422
423         bch2_extent_crc_append(e, crc);
424         bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size);
425
426         bch2_keylist_push(&op->insert_keys);
427 }
428
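/*
 * Allocate a bounce bio covering as much of @src as the write point has room
 * for.  Pages come from the normal allocator; if that fails we fall back to
 * the bounce-page mempool, which can only safely cover up to
 * c->sb.encoded_extent_max worth of data.  *page_alloc_failed is set if we
 * ended up with fewer pages than we wanted.
 */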
429 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
430                                         struct write_point *wp,
431                                         struct bio *src,
432                                         bool *page_alloc_failed)
433 {
434         struct bch_write_bio *wbio;
435         struct bio *bio;
436         unsigned output_available =
437                 min(wp->sectors_free << 9, src->bi_iter.bi_size);
438         unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE);
439
440         bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
441         wbio                    = wbio_init(bio);
442         wbio->bounce            = true;
443         wbio->put_bio           = true;
444         /* copy WRITE_SYNC flag */
445         wbio->bio.bi_opf        = src->bi_opf;
446
447         /*
448          * We can't use mempool for more than c->sb.encoded_extent_max
449          * worth of pages, but we'd like to allocate more if we can:
450          */
451         while (bio->bi_iter.bi_size < output_available) {
452                 unsigned len = min_t(unsigned, PAGE_SIZE,
453                                      output_available - bio->bi_iter.bi_size);
454                 struct page *p;
455
456                 p = alloc_page(GFP_NOIO);
457                 if (!p) {
458                         unsigned pool_max =
459                                 min_t(unsigned, output_available,
460                                       c->sb.encoded_extent_max << 9);
461
462                         if (bio_sectors(bio) < pool_max)
463                                 bch2_bio_alloc_pages_pool(c, bio, pool_max);
464                         break;
465                 }
466
467                 bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
468                         .bv_page        = p,
469                         .bv_len         = len,
470                         .bv_offset      = 0,
471                 };
472                 bio->bi_iter.bi_size += len;
473         }
474
475         *page_alloc_failed = bio->bi_vcnt < pages;
476         return bio;
477 }
478
479 static int bch2_write_rechecksum(struct bch_fs *c,
480                                  struct bch_write_op *op,
481                                  unsigned new_csum_type)
482 {
483         struct bio *bio = &op->wbio.bio;
484         struct bch_extent_crc_unpacked new_crc;
485         int ret;
486
487         /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
488
489         if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
490             bch2_csum_type_is_encryption(new_csum_type))
491                 new_csum_type = op->crc.csum_type;
492
493         ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
494                                   NULL, &new_crc,
495                                   op->crc.offset, op->crc.live_size,
496                                   new_csum_type);
497         if (ret)
498                 return ret;
499
500         bio_advance(bio, op->crc.offset << 9);
501         bio->bi_iter.bi_size = op->crc.live_size << 9;
502         op->crc = new_crc;
503         return 0;
504 }
505
506 static int bch2_write_decrypt(struct bch_write_op *op)
507 {
508         struct bch_fs *c = op->c;
509         struct nonce nonce = extent_nonce(op->version, op->crc);
510         struct bch_csum csum;
511
512         if (!bch2_csum_type_is_encryption(op->crc.csum_type))
513                 return 0;
514
515         /*
516          * If we need to decrypt data in the write path, we'll no longer be able
517          * to verify the existing checksum (poly1305 mac, in this case) after
518          * it's decrypted - this is the last point we'll be able to reverify the
519          * checksum:
520          */
521         csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
522         if (bch2_crc_cmp(op->crc.csum, csum))
523                 return -EIO;
524
525         bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
526         op->crc.csum_type = 0;
527         op->crc.csum = (struct bch_csum) { 0, 0 };
528         return 0;
529 }
530
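/*
 * For writes of data that is already encoded (checksummed/compressed, i.e.
 * BCH_WRITE_DATA_ENCODED), decide how much work is needed: DO_WRITE means the
 * extent can be written out as is (possibly after rechecksumming to the
 * target checksum type); OK means the data should go through the normal write
 * path (for encoded data, after being decompressed/rechecksummed/decrypted
 * here as needed); the error returns indicate a checksum mismatch or a
 * decompression failure.
 */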
531 static enum prep_encoded_ret {
532         PREP_ENCODED_OK,
533         PREP_ENCODED_ERR,
534         PREP_ENCODED_CHECKSUM_ERR,
535         PREP_ENCODED_DO_WRITE,
536 } bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
537 {
538         struct bch_fs *c = op->c;
539         struct bio *bio = &op->wbio.bio;
540
541         if (!(op->flags & BCH_WRITE_DATA_ENCODED))
542                 return PREP_ENCODED_OK;
543
544         BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
545
546         /* Can we just write the entire extent as is? */
547         if (op->crc.uncompressed_size == op->crc.live_size &&
548             op->crc.compressed_size <= wp->sectors_free &&
549             op->crc.compression_type == op->compression_type) {
550                 if (!op->crc.compression_type &&
551                     op->csum_type != op->crc.csum_type &&
552                     bch2_write_rechecksum(c, op, op->csum_type))
553                         return PREP_ENCODED_CHECKSUM_ERR;
554
555                 return PREP_ENCODED_DO_WRITE;
556         }
557
558         /*
559          * If the data is compressed and we couldn't write the entire extent as
560          * is, we have to decompress it:
561          */
562         if (op->crc.compression_type) {
563                 struct bch_csum csum;
564
565                 if (bch2_write_decrypt(op))
566                         return PREP_ENCODED_CHECKSUM_ERR;
567
568                 /* Last point we can still verify checksum: */
569                 csum = bch2_checksum_bio(c, op->crc.csum_type,
570                                          extent_nonce(op->version, op->crc),
571                                          bio);
572                 if (bch2_crc_cmp(op->crc.csum, csum))
573                         return PREP_ENCODED_CHECKSUM_ERR;
574
575                 if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
576                         return PREP_ENCODED_ERR;
577         }
578
579         /*
580          * No longer have compressed data after this point - data might be
581          * encrypted:
582          */
583
584         /*
585          * If the data is checksummed and we're only writing a subset,
586          * rechecksum and adjust bio to point to currently live data:
587          */
588         if ((op->crc.live_size != op->crc.uncompressed_size ||
589              op->crc.csum_type != op->csum_type) &&
590             bch2_write_rechecksum(c, op, op->csum_type))
591                 return PREP_ENCODED_CHECKSUM_ERR;
592
593                 /*
594                  * If we're compressing the data, or changing its encryption, it has to be decrypted:
595                  */
596         if ((op->compression_type ||
597              bch2_csum_type_is_encryption(op->crc.csum_type) !=
598              bch2_csum_type_is_encryption(op->csum_type)) &&
599             bch2_write_decrypt(op))
600                 return PREP_ENCODED_CHECKSUM_ERR;
601
602         return PREP_ENCODED_OK;
603 }
604
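/*
 * The core of the write path: carve off as much of the source bio as will fit
 * in @wp, optionally bouncing it so it can be compressed/encrypted/
 * checksummed, and append one extent key per chunk.  Returns > 0 ("more") if
 * the source bio still has data left - __bch2_write() will then allocate
 * another write point and call this again - or a negative error.
 */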
605 static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
606 {
607         struct bch_fs *c = op->c;
608         struct bio *src = &op->wbio.bio, *dst = src;
609         struct bvec_iter saved_iter;
610         struct bkey_i *key_to_write;
611         unsigned key_to_write_offset = op->insert_keys.top_p -
612                 op->insert_keys.keys_p;
613         unsigned total_output = 0;
614         bool bounce = false, page_alloc_failed = false;
615         int ret, more = 0;
616
617         BUG_ON(!bio_sectors(src));
618
619         switch (bch2_write_prep_encoded_data(op, wp)) {
620         case PREP_ENCODED_OK:
621                 break;
622         case PREP_ENCODED_ERR:
623                 ret = -EIO;
624                 goto err;
625         case PREP_ENCODED_CHECKSUM_ERR:
626                 goto csum_err;
627         case PREP_ENCODED_DO_WRITE:
628                 init_append_extent(op, wp, op->version, op->crc);
629                 goto do_write;
630         }
631
632         if (op->compression_type ||
633             (op->csum_type &&
634              !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
635             (bch2_csum_type_is_encryption(op->csum_type) &&
636              !(op->flags & BCH_WRITE_PAGES_OWNED))) {
637                 dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
638                 bounce = true;
639         }
640
641         saved_iter = dst->bi_iter;
642
643         do {
644                 struct bch_extent_crc_unpacked crc =
645                         (struct bch_extent_crc_unpacked) { 0 };
646                 struct bversion version = op->version;
647                 size_t dst_len, src_len;
648
649                 if (page_alloc_failed &&
650                     bio_sectors(dst) < wp->sectors_free &&
651                     bio_sectors(dst) < c->sb.encoded_extent_max)
652                         break;
653
654                 BUG_ON(op->compression_type &&
655                        (op->flags & BCH_WRITE_DATA_ENCODED) &&
656                        bch2_csum_type_is_encryption(op->crc.csum_type));
657                 BUG_ON(op->compression_type && !bounce);
658
659                 crc.compression_type = op->compression_type
660                         ?  bch2_bio_compress(c, dst, &dst_len, src, &src_len,
661                                              op->compression_type)
662                         : 0;
663                 if (!crc.compression_type) {
664                         dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
665                         dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
666
667                         if (op->csum_type)
668                                 dst_len = min_t(unsigned, dst_len,
669                                                 c->sb.encoded_extent_max << 9);
670
671                         if (bounce) {
672                                 swap(dst->bi_iter.bi_size, dst_len);
673                                 bio_copy_data(dst, src);
674                                 swap(dst->bi_iter.bi_size, dst_len);
675                         }
676
677                         src_len = dst_len;
678                 }
679
680                 BUG_ON(!src_len || !dst_len);
681
682                 if (bch2_csum_type_is_encryption(op->csum_type)) {
683                         if (bversion_zero(version)) {
684                                 version.lo = atomic64_inc_return(&c->key_version) + 1;
685                         } else {
686                                 crc.nonce = op->nonce;
687                                 op->nonce += src_len >> 9;
688                         }
689                 }
690
691                 if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
692                     !crc.compression_type &&
693                     bch2_csum_type_is_encryption(op->crc.csum_type) ==
694                     bch2_csum_type_is_encryption(op->csum_type)) {
695                         /*
696                          * Note: when we're using rechecksum(), we need to be
697                          * checksumming @src because it has all the data our
698                          * existing checksum covers - if we bounced (because we
699                          * were trying to compress), @dst will only have the
700                          * part of the data the new checksum will cover.
701                          *
702                          * But normally we want to be checksumming post bounce,
703                          * because part of the reason for bouncing is so the
704                          * data can't be modified (by userspace) while it's in
705                          * flight.
706                          */
707                         if (bch2_rechecksum_bio(c, src, version, op->crc,
708                                         &crc, &op->crc,
709                                         src_len >> 9,
710                                         bio_sectors(src) - (src_len >> 9),
711                                         op->csum_type))
712                                 goto csum_err;
713                 } else {
714                         if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
715                             bch2_rechecksum_bio(c, src, version, op->crc,
716                                         NULL, &op->crc,
717                                         src_len >> 9,
718                                         bio_sectors(src) - (src_len >> 9),
719                                         op->crc.csum_type))
720                                 goto csum_err;
721
722                         crc.compressed_size     = dst_len >> 9;
723                         crc.uncompressed_size   = src_len >> 9;
724                         crc.live_size           = src_len >> 9;
725
726                         swap(dst->bi_iter.bi_size, dst_len);
727                         bch2_encrypt_bio(c, op->csum_type,
728                                          extent_nonce(version, crc), dst);
729                         crc.csum = bch2_checksum_bio(c, op->csum_type,
730                                          extent_nonce(version, crc), dst);
731                         crc.csum_type = op->csum_type;
732                         swap(dst->bi_iter.bi_size, dst_len);
733                 }
734
735                 init_append_extent(op, wp, version, crc);
736
737                 if (dst != src)
738                         bio_advance(dst, dst_len);
739                 bio_advance(src, src_len);
740                 total_output += dst_len;
741         } while (dst->bi_iter.bi_size &&
742                  src->bi_iter.bi_size &&
743                  wp->sectors_free &&
744                  !bch2_keylist_realloc(&op->insert_keys,
745                                       op->inline_keys,
746                                       ARRAY_SIZE(op->inline_keys),
747                                       BKEY_EXTENT_U64s_MAX));
748
749         more = src->bi_iter.bi_size != 0;
750
751         dst->bi_iter = saved_iter;
752
753         if (!bounce && more) {
754                 dst = bio_split(src, total_output >> 9,
755                                 GFP_NOIO, &c->bio_write);
756                 wbio_init(dst)->put_bio = true;
757         }
758
759         dst->bi_iter.bi_size = total_output;
760
761         /* Free unneeded pages after compressing: */
762         if (bounce)
763                 while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
764                         mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
765                                      &c->bio_bounce_pages);
766 do_write:
767         /* might have done a realloc... */
768
769         key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
770
771         dst->bi_end_io  = bch2_write_endio;
772         dst->bi_private = &op->cl;
773         bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
774
775         closure_get(dst->bi_private);
776
777         bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
778                                   key_to_write);
779         return more;
780 csum_err:
781         bch_err(c, "error verifying existing checksum while "
782                 "rewriting existing data (memory corruption?)");
783         ret = -EIO;
784 err:
785         if (bounce) {
786                 bch2_bio_free_pages_pool(c, dst);
787                 bio_put(dst);
788         }
789
790         return ret;
791 }
792
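/*
 * Main write loop: allocate space from a write point and write extents until
 * the whole op has been written.  If we'd have to block - on space
 * allocation, or because the keylist or open bucket list is full - drop to
 * flush_io: wait for outstanding IO, do the index update for what's been
 * written so far, and retry.
 */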
793 static void __bch2_write(struct closure *cl)
794 {
795         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
796         struct bch_fs *c = op->c;
797         struct write_point *wp;
798         int ret;
799 again:
800         do {
801                 /* +1 for possible cache device: */
802                 if (op->open_buckets.nr + op->nr_replicas + 1 >
803                     ARRAY_SIZE(op->open_buckets.v))
804                         goto flush_io;
805
806                 if (bch2_keylist_realloc(&op->insert_keys,
807                                         op->inline_keys,
808                                         ARRAY_SIZE(op->inline_keys),
809                                         BKEY_EXTENT_U64s_MAX))
810                         goto flush_io;
811
812                 wp = bch2_alloc_sectors_start(c,
813                         op->target,
814                         op->write_point,
815                         &op->devs_have,
816                         op->nr_replicas,
817                         op->nr_replicas_required,
818                         op->alloc_reserve,
819                         op->flags,
820                         (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
821                 EBUG_ON(!wp);
822
823                 if (unlikely(IS_ERR(wp))) {
824                         if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
825                                 ret = PTR_ERR(wp);
826                                 goto err;
827                         }
828
829                         goto flush_io;
830                 }
831
832                 ret = bch2_write_extent(op, wp);
833
834                 bch2_open_bucket_get(c, wp, &op->open_buckets);
835                 bch2_alloc_sectors_done(c, wp);
836
837                 if (ret < 0)
838                         goto err;
839         } while (ret);
840
841         continue_at(cl, bch2_write_index, index_update_wq(op));
842         return;
843 err:
844         op->error = ret;
845
846         continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
847                     ? bch2_write_index
848                     : bch2_write_done, index_update_wq(op));
849         return;
850 flush_io:
851         closure_sync(cl);
852
853         if (!bch2_keylist_empty(&op->insert_keys)) {
854                 __bch2_write_index(op);
855
856                 if (op->error) {
857                         continue_at_nobarrier(cl, bch2_write_done, NULL);
858                         return;
859                 }
860         }
861
862         goto again;
863 }
864
865 /**
866  * bch2_write - handle a write to a cache device or flash only volume
867  *
868  * This is the starting point for any data to end up in a cache device; it could
869  * be from a normal write, or a writeback write, or a write to a flash only
870  * volume - it's also used by the moving garbage collector to compact data in
871  * mostly empty buckets.
872  *
873  * It first writes the data to the cache, creating a list of keys to be inserted
874  * (if the data won't fit in a single open bucket, there will be multiple keys);
875  * after the data is written the keys are inserted into the btree and
876  * journalled by the index update path (see bch2_write_index()).
877  *
878  * If op->discard is true, instead of inserting the data it invalidates the
879  * region of the cache represented by op->bio and op->inode.
880  */
881 void bch2_write(struct closure *cl)
882 {
883         struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
884         struct bch_fs *c = op->c;
885
886         BUG_ON(!op->nr_replicas);
887         BUG_ON(!op->write_point.v);
888         BUG_ON(!bkey_cmp(op->pos, POS_MAX));
889         BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX);
890
891         op->start_time = local_clock();
892
893         memset(&op->failed, 0, sizeof(op->failed));
894
895         bch2_keylist_init(&op->insert_keys, op->inline_keys);
896         wbio_init(&op->wbio.bio)->put_bio = false;
897
898         if (c->opts.nochanges ||
899             !percpu_ref_tryget(&c->writes)) {
900                 __bcache_io_error(c, "read only");
901                 op->error = -EROFS;
902                 if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
903                         bch2_disk_reservation_put(c, &op->res);
904                 closure_return(cl);
905                 return;
906         }
907
908         bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
909
910         continue_at_nobarrier(cl, __bch2_write, NULL);
911 }
912
913 /* Cache promotion on read */
914
915 struct promote_op {
916         struct closure          cl;
917         u64                     start_time;
918
919         struct rhash_head       hash;
920         struct bpos             pos;
921
922         struct migrate_write    write;
923         struct bio_vec          bi_inline_vecs[0]; /* must be last */
924 };
925
926 static const struct rhashtable_params bch_promote_params = {
927         .head_offset    = offsetof(struct promote_op, hash),
928         .key_offset     = offsetof(struct promote_op, pos),
929         .key_len        = sizeof(struct bpos),
930 };
931
932 static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
933                                   struct bpos pos,
934                                   struct bch_io_opts opts,
935                                   unsigned flags)
936 {
937         if (!opts.promote_target)
938                 return false;
939
940         if (!(flags & BCH_READ_MAY_PROMOTE))
941                 return false;
942
943         if (percpu_ref_is_dying(&c->writes))
944                 return false;
945
946         if (!bkey_extent_is_data(k.k))
947                 return false;
948
949         if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target))
950                 return false;
951
952         if (bch2_target_congested(c, opts.promote_target))
953                 return false;
954
955         if (rhashtable_lookup_fast(&c->promote_table, &pos,
956                                    bch_promote_params))
957                 return false;
958
959         return true;
960 }
961
962 static void promote_free(struct bch_fs *c, struct promote_op *op)
963 {
964         int ret;
965
966         ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
967                                      bch_promote_params);
968         BUG_ON(ret);
969         percpu_ref_put(&c->writes);
970         kfree(op);
971 }
972
973 static void promote_done(struct closure *cl)
974 {
975         struct promote_op *op =
976                 container_of(cl, struct promote_op, cl);
977         struct bch_fs *c = op->write.op.c;
978
979         bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
980                                op->start_time);
981
982         bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
983         promote_free(c, op);
984 }
985
986 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
987 {
988         struct bch_fs *c = rbio->c;
989         struct closure *cl = &op->cl;
990         struct bio *bio = &op->write.op.wbio.bio;
991
992         trace_promote(&rbio->bio);
993
994         /* we now own pages: */
995         BUG_ON(!rbio->bounce);
996         BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
997
998         memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
999                sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
1000         swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
1001
1002         bch2_migrate_read_done(&op->write, rbio);
1003
1004         closure_init(cl, NULL);
1005         closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
1006         closure_return_with_destructor(cl, promote_done);
1007 }
1008
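/*
 * Set up a cache promotion: allocate the promote op (and, when the extent is
 * too big for the bounce mempool, a separate kmalloc'd read bio), register it
 * in the promote hash table so the same extent isn't promoted twice
 * concurrently, and initialize the migrate write that will rewrite the data
 * to the promote target.
 */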
1009 noinline
1010 static struct promote_op *__promote_alloc(struct bch_fs *c,
1011                                           struct bpos pos,
1012                                           struct extent_ptr_decoded *pick,
1013                                           struct bch_io_opts opts,
1014                                           unsigned rbio_sectors,
1015                                           struct bch_read_bio **rbio)
1016 {
1017         struct promote_op *op = NULL;
1018         struct bio *bio;
1019         unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS);
1020         /* data might have to be decompressed in the write path: */
1021         unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size,
1022                                            PAGE_SECTORS);
1023         int ret;
1024
1025         if (!percpu_ref_tryget(&c->writes))
1026                 return NULL;
1027
1028         op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages,
1029                      GFP_NOIO);
1030         if (!op)
1031                 goto err;
1032
1033         op->start_time = local_clock();
1034         op->pos = pos;
1035
1036         /*
1037          * promotes require bouncing, but if the extent isn't
1038          * checksummed/compressed it might be too big for the mempool:
1039          */
1040         if (rbio_sectors > c->sb.encoded_extent_max) {
1041                 *rbio = kzalloc(sizeof(struct bch_read_bio) +
1042                                 sizeof(struct bio_vec) * rbio_pages,
1043                                 GFP_NOIO);
1044                 if (!*rbio)
1045                         goto err;
1046
1047                 rbio_init(&(*rbio)->bio, opts);
1048                 bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs,
1049                          rbio_pages);
1050
1051                 (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
1052                 bch2_bio_map(&(*rbio)->bio, NULL);
1053
1054                 if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
1055                         goto err;
1056
1057                 (*rbio)->bounce         = true;
1058                 (*rbio)->split          = true;
1059                 (*rbio)->kmalloc        = true;
1060         }
1061
1062         if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
1063                                           bch_promote_params))
1064                 goto err;
1065
1066         bio = &op->write.op.wbio.bio;
1067         bio_init(bio, bio->bi_inline_vecs, wbio_pages);
1068
1069         ret = bch2_migrate_write_init(c, &op->write,
1070                         writepoint_hashed((unsigned long) current),
1071                         opts,
1072                         DATA_PROMOTE,
1073                         (struct data_opts) {
1074                                 .target = opts.promote_target
1075                         },
1076                         bkey_s_c_null);
1077         BUG_ON(ret);
1078
1079         return op;
1080 err:
1081         if (*rbio)
1082                 bio_free_pages(&(*rbio)->bio);
1083         kfree(*rbio);
1084         *rbio = NULL;
1085         kfree(op);
1086         percpu_ref_put(&c->writes);
1087         return NULL;
1088 }
1089
1090 static inline struct promote_op *promote_alloc(struct bch_fs *c,
1091                                                struct bvec_iter iter,
1092                                                struct bkey_s_c k,
1093                                                struct extent_ptr_decoded *pick,
1094                                                struct bch_io_opts opts,
1095                                                unsigned flags,
1096                                                struct bch_read_bio **rbio,
1097                                                bool *bounce,
1098                                                bool *read_full)
1099 {
1100         bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
1101         unsigned sectors = promote_full
1102                 ? pick->crc.compressed_size
1103                 : bvec_iter_sectors(iter);
1104         struct bpos pos = promote_full
1105                 ? bkey_start_pos(k.k)
1106                 : POS(k.k->p.inode, iter.bi_sector);
1107         struct promote_op *promote;
1108
1109         if (!should_promote(c, k, pos, opts, flags))
1110                 return NULL;
1111
1112         promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
1113         if (!promote)
1114                 return NULL;
1115
1116         *bounce         = true;
1117         *read_full      = promote_full;
1118         return promote;
1119 }
1120
1121 /* Read */
1122
1123 #define READ_RETRY_AVOID        1
1124 #define READ_RETRY              2
1125 #define READ_ERR                3
1126
1127 enum rbio_context {
1128         RBIO_CONTEXT_NULL,
1129         RBIO_CONTEXT_HIGHPRI,
1130         RBIO_CONTEXT_UNBOUND,
1131 };
1132
1133 static inline struct bch_read_bio *
1134 bch2_rbio_parent(struct bch_read_bio *rbio)
1135 {
1136         return rbio->split ? rbio->parent : rbio;
1137 }
1138
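/*
 * Read completion work may need a different context (e.g. decompression or
 * decryption can't run in interrupt context): if the current rbio context is
 * already at least @context, run @fn directly, otherwise punt it to @wq and
 * record the new context.
 */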
1139 __always_inline
1140 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
1141                            enum rbio_context context,
1142                            struct workqueue_struct *wq)
1143 {
1144         if (context <= rbio->context) {
1145                 fn(&rbio->work);
1146         } else {
1147                 rbio->work.func         = fn;
1148                 rbio->context           = context;
1149                 queue_work(wq, &rbio->work);
1150         }
1151 }
1152
1153 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
1154 {
1155         BUG_ON(rbio->bounce && !rbio->split);
1156
1157         if (rbio->promote)
1158                 promote_free(rbio->c, rbio->promote);
1159         rbio->promote = NULL;
1160
1161         if (rbio->bounce)
1162                 bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
1163
1164         if (rbio->split) {
1165                 struct bch_read_bio *parent = rbio->parent;
1166
1167                 if (rbio->kmalloc)
1168                         kfree(rbio);
1169                 else
1170                         bio_put(&rbio->bio);
1171
1172                 rbio = parent;
1173         }
1174
1175         return rbio;
1176 }
1177
1178 static void bch2_rbio_done(struct bch_read_bio *rbio)
1179 {
1180         bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
1181                                rbio->start_time);
1182         bio_endio(&rbio->bio);
1183 }
1184
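/*
 * Read retry paths: the nodecode variant is for BCH_READ_NODECODE reads
 * (where the data is returned without being decrypted/decompressed) and just
 * re-reads the same extent, marking the rbio as a hole if that extent no
 * longer exists; the normal retry below re-walks the extents btree for the
 * byte range being read.
 */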
1185 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
1186                                      struct bvec_iter bvec_iter, u64 inode,
1187                                      struct bch_io_failures *failed,
1188                                      unsigned flags)
1189 {
1190         struct btree_iter iter;
1191         BKEY_PADDED(k) tmp;
1192         struct bkey_s_c k;
1193         int ret;
1194
1195         flags &= ~BCH_READ_LAST_FRAGMENT;
1196
1197         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
1198                              rbio->pos, BTREE_ITER_SLOTS);
1199 retry:
1200         rbio->bio.bi_status = 0;
1201
1202         k = bch2_btree_iter_peek_slot(&iter);
1203         if (btree_iter_err(k)) {
1204                 bch2_btree_iter_unlock(&iter);
1205                 goto err;
1206         }
1207
1208         bkey_reassemble(&tmp.k, k);
1209         k = bkey_i_to_s_c(&tmp.k);
1210         bch2_btree_iter_unlock(&iter);
1211
1212         if (!bkey_extent_is_data(k.k) ||
1213             !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
1214                                      rbio->pick.ptr,
1215                                      rbio->pos.offset -
1216                                      rbio->pick.crc.offset)) {
1217                 /* extent we wanted to read no longer exists: */
1218                 rbio->hole = true;
1219                 goto out;
1220         }
1221
1222         ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1223         if (ret == READ_RETRY)
1224                 goto retry;
1225         if (ret)
1226                 goto err;
1227         goto out;
1228 err:
1229         rbio->bio.bi_status = BLK_STS_IOERR;
1230 out:
1231         bch2_rbio_done(rbio);
1232 }
1233
1234 static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
1235                             struct bvec_iter bvec_iter, u64 inode,
1236                             struct bch_io_failures *failed, unsigned flags)
1237 {
1238         struct btree_iter iter;
1239         struct bkey_s_c k;
1240         int ret;
1241
1242         flags &= ~BCH_READ_LAST_FRAGMENT;
1243         flags |= BCH_READ_MUST_CLONE;
1244 retry:
1245         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
1246                            POS(inode, bvec_iter.bi_sector),
1247                            BTREE_ITER_SLOTS, k) {
1248                 BKEY_PADDED(k) tmp;
1249                 unsigned bytes;
1250
1251                 bkey_reassemble(&tmp.k, k);
1252                 k = bkey_i_to_s_c(&tmp.k);
1253                 bch2_btree_iter_unlock(&iter);
1254
1255                 bytes = min_t(unsigned, bvec_iter.bi_size,
1256                               (k.k->p.offset - bvec_iter.bi_sector) << 9);
1257                 swap(bvec_iter.bi_size, bytes);
1258
1259                 ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
1260                 switch (ret) {
1261                 case READ_RETRY:
1262                         goto retry;
1263                 case READ_ERR:
1264                         goto err;
1265                 }
1266
1267                 if (bytes == bvec_iter.bi_size)
1268                         goto out;
1269
1270                 swap(bvec_iter.bi_size, bytes);
1271                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1272         }
1273
1274         /*
1275          * If we get here, it better have been because there was an error
1276          * reading a btree node
1277          */
1278         ret = bch2_btree_iter_unlock(&iter);
1279         BUG_ON(!ret);
1280         __bcache_io_error(c, "btree IO error %i", ret);
1281 err:
1282         rbio->bio.bi_status = BLK_STS_IOERR;
1283 out:
1284         bch2_rbio_done(rbio);
1285 }
1286
1287 static void bch2_rbio_retry(struct work_struct *work)
1288 {
1289         struct bch_read_bio *rbio =
1290                 container_of(work, struct bch_read_bio, work);
1291         struct bch_fs *c        = rbio->c;
1292         struct bvec_iter iter   = rbio->bvec_iter;
1293         unsigned flags          = rbio->flags;
1294         u64 inode               = rbio->pos.inode;
1295         struct bch_io_failures failed = { .nr = 0 };
1296
1297         trace_read_retry(&rbio->bio);
1298
1299         if (rbio->retry == READ_RETRY_AVOID)
1300                 bch2_mark_io_failure(&failed, &rbio->pick);
1301
1302         rbio->bio.bi_status = 0;
1303
1304         rbio = bch2_rbio_free(rbio);
1305
1306         flags |= BCH_READ_IN_RETRY;
1307         flags &= ~BCH_READ_MAY_PROMOTE;
1308
1309         if (flags & BCH_READ_NODECODE)
1310                 bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
1311         else
1312                 bch2_read_retry(c, rbio, iter, inode, &failed, flags);
1313 }
1314
1315 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
1316                             blk_status_t error)
1317 {
1318         rbio->retry = retry;
1319
1320         if (rbio->flags & BCH_READ_IN_RETRY)
1321                 return;
1322
1323         if (retry == READ_ERR) {
1324                 rbio = bch2_rbio_free(rbio);
1325
1326                 rbio->bio.bi_status = error;
1327                 bch2_rbio_done(rbio);
1328         } else {
1329                 bch2_rbio_punt(rbio, bch2_rbio_retry,
1330                                RBIO_CONTEXT_UNBOUND, system_unbound_wq);
1331         }
1332 }
1333
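/*
 * Opportunistically rewrite the extent we just read with a narrowed checksum:
 * if the existing crc covers more data than is currently live (e.g. the
 * extent was partially overwritten), recompute the checksum over just the
 * live region from the data we just read and verified, and update the key -
 * best effort only (BTREE_INSERT_NOWAIT, bailing out if the extent changed).
 */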
1334 static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
1335 {
1336         struct bch_fs *c = rbio->c;
1337         struct btree_iter iter;
1338         struct bkey_s_c k;
1339         struct bkey_i_extent *e;
1340         BKEY_PADDED(k) new;
1341         struct bch_extent_crc_unpacked new_crc;
1342         unsigned offset;
1343         int ret;
1344
1345         if (rbio->pick.crc.compression_type)
1346                 return;
1347
1348         bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos,
1349                              BTREE_ITER_INTENT);
1350 retry:
1351         k = bch2_btree_iter_peek(&iter);
1352         if (IS_ERR_OR_NULL(k.k))
1353                 goto out;
1354
1355         if (!bkey_extent_is_data(k.k))
1356                 goto out;
1357
1358         bkey_reassemble(&new.k, k);
1359         e = bkey_i_to_extent(&new.k);
1360
1361         if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
1362                                      rbio->pick.ptr,
1363                                      rbio->pos.offset -
1364                                      rbio->pick.crc.offset) ||
1365             bversion_cmp(e->k.version, rbio->version))
1366                 goto out;
1367
1368         /* Extent was merged? */
1369         if (bkey_start_offset(&e->k) < rbio->pos.offset ||
1370             e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size)
1371                 goto out;
1372
1373         /* The extent might have been partially overwritten since we read it: */
1374         offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset);
1375
1376         if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
1377                                 rbio->pick.crc, NULL, &new_crc,
1378                                 offset, e->k.size,
1379                                 rbio->pick.crc.csum_type)) {
1380                 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
1381                 goto out;
1382         }
1383
1384         if (!bch2_extent_narrow_crcs(e, new_crc))
1385                 goto out;
1386
1387         ret = bch2_btree_insert_at(c, NULL, NULL,
1388                                    BTREE_INSERT_ATOMIC|
1389                                    BTREE_INSERT_NOFAIL|
1390                                    BTREE_INSERT_NOWAIT,
1391                                    BTREE_INSERT_ENTRY(&iter, &e->k_i));
1392         if (ret == -EINTR)
1393                 goto retry;
1394 out:
1395         bch2_btree_iter_unlock(&iter);
1396 }
1397
1398 static bool should_narrow_crcs(struct bkey_s_c k,
1399                                struct extent_ptr_decoded *pick,
1400                                unsigned flags)
1401 {
1402         return !(flags & BCH_READ_IN_RETRY) &&
1403                 bkey_extent_is_data(k.k) &&
1404                 bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
1405 }
1406
1407 /* Inner part that may run in process context */
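/*
 * Verify the checksum over the full (bounced) extent, optionally narrow the
 * stored crcs, then decrypt and decompress (or just copy) the live range into
 * the caller's bio.  On a checksum mismatch the read is retried - bounced, if
 * it previously read into user-mapped pages that userspace could have
 * scribbled on - or retried on a different device.
 */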
1408 static void __bch2_read_endio(struct work_struct *work)
1409 {
1410         struct bch_read_bio *rbio =
1411                 container_of(work, struct bch_read_bio, work);
1412         struct bch_fs *c        = rbio->c;
1413         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1414         struct bio *src         = &rbio->bio;
1415         struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
1416         struct bvec_iter dst_iter = rbio->bvec_iter;
1417         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
1418         struct nonce nonce = extent_nonce(rbio->version, crc);
1419         struct bch_csum csum;
1420
1421         /* Reset iterator for checksumming and copying bounced data: */
1422         if (rbio->bounce) {
1423                 src->bi_iter.bi_size            = crc.compressed_size << 9;
1424                 src->bi_iter.bi_idx             = 0;
1425                 src->bi_iter.bi_bvec_done       = 0;
1426         } else {
1427                 src->bi_iter                    = rbio->bvec_iter;
1428         }
1429
1430         csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
1431         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
1432                 goto csum_err;
1433
1434         if (unlikely(rbio->narrow_crcs))
1435                 bch2_rbio_narrow_crcs(rbio);
1436
1437         if (rbio->flags & BCH_READ_NODECODE)
1438                 goto nodecode;
1439
1440         /* Adjust crc to point to subset of data we want: */
1441         crc.offset     += rbio->bvec_iter.bi_sector - rbio->pos.offset;
1442         crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
1443
1444         if (crc.compression_type != BCH_COMPRESSION_NONE) {
1445                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1446                 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
1447                         goto decompression_err;
1448         } else {
1449                 /* don't need to decrypt the entire bio: */
1450                 nonce = nonce_add(nonce, crc.offset << 9);
1451                 bio_advance(src, crc.offset << 9);
1452
1453                 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
1454                 src->bi_iter.bi_size = dst_iter.bi_size;
1455
1456                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1457
1458                 if (rbio->bounce) {
1459                         struct bvec_iter src_iter = src->bi_iter;
1460                         bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
1461                 }
1462         }
1463
1464         if (rbio->promote) {
1465                 /*
1466                  * Re-encrypt the data we decrypted, so that it's consistent
1467                  * with rbio->crc:
1468                  */
1469                 bch2_encrypt_bio(c, crc.csum_type, nonce, src);
1470                 promote_start(rbio->promote, rbio);
1471                 rbio->promote = NULL;
1472         }
1473 nodecode:
1474         if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
1475                 rbio = bch2_rbio_free(rbio);
1476                 bch2_rbio_done(rbio);
1477         }
1478         return;
1479 csum_err:
1480         /*
1481          * Checksum error: if the bio wasn't bounced, we may have been
1482          * reading into buffers owned by userspace (that userspace can
1483          * scribble over) - retry the read, bouncing it this time:
1484          */
1485         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
1486                 rbio->flags |= BCH_READ_MUST_BOUNCE;
1487                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
1488                 return;
1489         }
1490
1491         bch2_dev_io_error(ca,
1492                 "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
1493                 rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
1494                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
1495                 csum.hi, csum.lo, crc.csum_type);
1496         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
1497         return;
1498 decompression_err:
1499         __bcache_io_error(c, "decompression error, inode %llu offset %llu",
1500                           rbio->pos.inode,
1501                           (u64) rbio->bvec_iter.bi_sector);
1502         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
1503         return;
1504 }
1505
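/*
 * Read completion, light half: called from bio completion (possibly in
 * interrupt context).  Accounts IO latency, handles outright IO errors and
 * stale cached pointers, then hands off to __bch2_read_endio() via
 * bch2_rbio_punt() - using a workqueue when there's checksumming, decryption
 * or decompression work to do.
 */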
1506 static void bch2_read_endio(struct bio *bio)
1507 {
1508         struct bch_read_bio *rbio =
1509                 container_of(bio, struct bch_read_bio, bio);
1510         struct bch_fs *c        = rbio->c;
1511         struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
1512         struct workqueue_struct *wq = NULL;
1513         enum rbio_context context = RBIO_CONTEXT_NULL;
1514
1515         if (rbio->have_ioref) {
1516                 bch2_latency_acct(ca, rbio->submit_time, READ);
1517                 percpu_ref_put(&ca->io_ref);
1518         }
1519
1520         if (!rbio->split)
1521                 rbio->bio.bi_end_io = rbio->end_io;
1522
1523         if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
1524                 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
1525                 return;
1526         }
1527
1528         if (rbio->pick.ptr.cached &&
1529             (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
1530              ptr_stale(ca, &rbio->pick.ptr))) {
1531                 atomic_long_inc(&c->read_realloc_races);
1532
1533                 if (rbio->flags & BCH_READ_RETRY_IF_STALE)
1534                         bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
1535                 else
1536                         bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
1537                 return;
1538         }
1539
1540         if (rbio->narrow_crcs ||
1541             rbio->pick.crc.compression_type ||
1542             bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
1543                 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
1544         else if (rbio->pick.crc.csum_type)
1545                 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
1546
1547         bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
1548 }
1549
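/*
 * Read a single extent (or fragment of one): pick a device to read from, work
 * out whether the read has to be bounced and/or read in full (because of
 * compression, encryption, or checksums covering more than we're reading),
 * optionally set up a promote, then allocate/clone an rbio and submit it.
 *
 * Returns 0 normally; in the BCH_READ_IN_RETRY case the read is done
 * synchronously and a READ_* retry code is returned instead.
 */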
1550 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
1551                        struct bvec_iter iter, struct bkey_s_c k,
1552                        struct bch_io_failures *failed, unsigned flags)
1553 {
1554         struct extent_ptr_decoded pick;
1555         struct bch_read_bio *rbio = NULL;
1556         struct bch_dev *ca;
1557         struct promote_op *promote = NULL;
1558         bool bounce = false, read_full = false, narrow_crcs = false;
1559         struct bpos pos = bkey_start_pos(k.k);
1560         int pick_ret;
1561
1562         pick_ret = bch2_extent_pick_ptr(c, k, failed, &pick);
1563
1564         /* hole or reservation - just zero fill: */
1565         if (!pick_ret)
1566                 goto hole;
1567
1568         if (pick_ret < 0)
1569                 goto no_device;
1570
1571         if (pick_ret > 0)
1572                 ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1573
1574         if (flags & BCH_READ_NODECODE) {
1575                 /*
1576                  * This can happen if we retry, and the extent we were going to
1577                  * read has been merged in the meantime:
1578                  */
1579                 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
1580                         goto hole;
1581
1582                 iter.bi_sector  = pos.offset;
1583                 iter.bi_size    = pick.crc.compressed_size << 9;
1584                 goto noclone;
1585         }
1586
1587         if (!(flags & BCH_READ_LAST_FRAGMENT) ||
1588             bio_flagged(&orig->bio, BIO_CHAIN))
1589                 flags |= BCH_READ_MUST_CLONE;
1590
1591         narrow_crcs = should_narrow_crcs(k, &pick, flags);
1592
1593         if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
1594                 flags |= BCH_READ_MUST_BOUNCE;
1595
1596         EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
1597                 k.k->p.offset < bvec_iter_end_sector(iter));
1598
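        /*
         * Read the full extent, into a bounce buffer, if it's compressed, or
         * if it's checksummed and either we're only reading part of it, it's
         * encrypted and destined for user-mapped memory, or bouncing has
         * already been forced:
         */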
1599         if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
1600             (pick.crc.csum_type != BCH_CSUM_NONE &&
1601              (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1602               (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
1603                (flags & BCH_READ_USER_MAPPED)) ||
1604               (flags & BCH_READ_MUST_BOUNCE)))) {
1605                 read_full = true;
1606                 bounce = true;
1607         }
1608
1609         promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
1610                                 &rbio, &bounce, &read_full);
1611
1612         if (!read_full) {
1613                 EBUG_ON(pick.crc.compression_type);
1614                 EBUG_ON(pick.crc.csum_type &&
1615                         (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1616                          bvec_iter_sectors(iter) != pick.crc.live_size ||
1617                          pick.crc.offset ||
1618                          iter.bi_sector != pos.offset));
1619
1620                 pick.ptr.offset += pick.crc.offset +
1621                         (iter.bi_sector - pos.offset);
1622                 pick.crc.compressed_size        = bvec_iter_sectors(iter);
1623                 pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
1624                 pick.crc.offset                 = 0;
1625                 pick.crc.live_size              = bvec_iter_sectors(iter);
1626                 pos.offset                      = iter.bi_sector;
1627         }
1628
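        /*
         * Get an rbio to do the read with: reuse the bounce rbio that
         * promote_alloc() may have set up, allocate a fresh bounce rbio, clone
         * the original bio, or (noclone) just use the original rbio directly:
         */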
1629         if (rbio) {
1630                 /* promote already allocated bounce rbio */
1631         } else if (bounce) {
1632                 unsigned sectors = pick.crc.compressed_size;
1633
1634                 rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
1635                                                   DIV_ROUND_UP(sectors, PAGE_SECTORS),
1636                                                   &c->bio_read_split),
1637                                  orig->opts);
1638
1639                 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1640                 rbio->bounce    = true;
1641                 rbio->split     = true;
1642         } else if (flags & BCH_READ_MUST_CLONE) {
1643                 /*
1644                  * Have to clone if there were any splits, because of how
1645                  * error reporting works: if a split errored and retrying
1646                  * didn't fix it, the error it reports to its parent (us)
1647                  * doesn't tell us whether it came from our fragment (in which
1648                  * case we should retry) or from the bio as a whole (in which
1649                  * case retrying would just lose the error).
1650                  */
1651                 rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
1652                                                 &c->bio_read_split),
1653                                  orig->opts);
1654                 rbio->bio.bi_iter = iter;
1655                 rbio->split     = true;
1656         } else {
1657 noclone:
1658                 rbio = orig;
1659                 rbio->bio.bi_iter = iter;
1660                 BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1661         }
1662
1663         BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
1664
1665         rbio->c                 = c;
1666         rbio->submit_time       = local_clock();
1667         if (rbio->split)
1668                 rbio->parent    = orig;
1669         else
1670                 rbio->end_io    = orig->bio.bi_end_io;
1671         rbio->bvec_iter         = iter;
1672         rbio->flags             = flags;
1673         rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
1674         rbio->narrow_crcs       = narrow_crcs;
1675         rbio->hole              = 0;
1676         rbio->retry             = 0;
1677         rbio->context           = 0;
1678         rbio->devs_have         = bch2_bkey_devs(k);
1679         rbio->pick              = pick;
1680         rbio->pos               = pos;
1681         rbio->version           = k.k->version;
1682         rbio->promote           = promote;
1683         INIT_WORK(&rbio->work, NULL);
1684
1685         rbio->bio.bi_opf        = orig->bio.bi_opf;
1686         rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
1687         rbio->bio.bi_end_io     = bch2_read_endio;
1688
1689         if (rbio->bounce)
1690                 trace_read_bounce(&rbio->bio);
1691
1692         bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1693
1694         if (!rbio->have_ioref)
1695                 goto no_device_postclone;
1696
1697         percpu_down_read_preempt_disable(&c->usage_lock);
1698         bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
1699         percpu_up_read_preempt_enable(&c->usage_lock);
1700
1701         this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
1702                      bio_sectors(&rbio->bio));
1703
1704         bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
1705
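        /*
         * Normal path: submit asynchronously, bumping the parent's remaining
         * count if more fragments will follow.  In the BCH_READ_IN_RETRY path
         * we submit synchronously, run the endio by hand, and return a READ_*
         * code so the caller can decide how to retry.
         */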
1706         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1707                 if (!(flags & BCH_READ_LAST_FRAGMENT)) {
1708                         bio_inc_remaining(&orig->bio);
1709                         trace_read_split(&orig->bio);
1710                 }
1711
1712                 submit_bio(&rbio->bio);
1713                 return 0;
1714         } else {
1715                 int ret;
1716
1717                 submit_bio_wait(&rbio->bio);
1718
1719                 rbio->context = RBIO_CONTEXT_UNBOUND;
1720                 bch2_read_endio(&rbio->bio);
1721
1722                 ret = rbio->retry;
1723                 rbio = bch2_rbio_free(rbio);
1724
1725                 if (ret == READ_RETRY_AVOID) {
1726                         bch2_mark_io_failure(failed, &pick);
1727                         ret = READ_RETRY;
1728                 }
1729
1730                 return ret;
1731         }
1732
1733 no_device_postclone:
1734         if (!rbio->split)
1735                 rbio->bio.bi_end_io = rbio->end_io;
1736         bch2_rbio_free(rbio);
1737 no_device:
1738         __bcache_io_error(c, "no device to read from");
1739
1740         if (likely(!(flags & BCH_READ_IN_RETRY))) {
1741                 orig->bio.bi_status = BLK_STS_IOERR;
1742
1743                 if (flags & BCH_READ_LAST_FRAGMENT)
1744                         bch2_rbio_done(orig);
1745                 return 0;
1746         } else {
1747                 return READ_ERR;
1748         }
1749
1750 hole:
1751         /*
1752          * Holes won't normally happen in the BCH_READ_NODECODE
1753          * (bch2_move_extent()) path, but if we retry and the extent we wanted
1754          * to read no longer exists, we have to signal that:
1755          */
1756         if (flags & BCH_READ_NODECODE)
1757                 orig->hole = true;
1758
1759         zero_fill_bio_iter(&orig->bio, iter);
1760
1761         if (flags & BCH_READ_LAST_FRAGMENT)
1762                 bch2_rbio_done(orig);
1763         return 0;
1764 }
1765
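/*
 * Entry point for a normal (user-mapped, non-retry) read: walk the extents
 * btree over the bio's range, issuing a read for each extent fragment and
 * marking the last one so completion is signalled exactly once.
 */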
1766 void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
1767 {
1768         struct btree_iter iter;
1769         struct bkey_s_c k;
1770         unsigned flags = BCH_READ_RETRY_IF_STALE|
1771                 BCH_READ_MAY_PROMOTE|
1772                 BCH_READ_USER_MAPPED;
1773         int ret;
1774
1775         BUG_ON(rbio->_state);
1776         BUG_ON(flags & BCH_READ_NODECODE);
1777         BUG_ON(flags & BCH_READ_IN_RETRY);
1778
1779         rbio->c = c;
1780         rbio->start_time = local_clock();
1781
1782         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
1783                            POS(inode, rbio->bio.bi_iter.bi_sector),
1784                            BTREE_ITER_SLOTS, k) {
1785                 BKEY_PADDED(k) tmp;
1786                 unsigned bytes;
1787
1788                 /*
1789                  * Unlock the iterator while the btree node's lock is still in
1790                  * cache, before doing the IO:
1791                  */
1792                 bkey_reassemble(&tmp.k, k);
1793                 k = bkey_i_to_s_c(&tmp.k);
1794                 bch2_btree_iter_unlock(&iter);
1795
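                /*
                 * Temporarily restrict the bio's iterator to just the part
                 * covered by this extent, issue the read, then (unless this
                 * was the last fragment) restore the full size and advance
                 * past what we just read:
                 */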
1796                 bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
1797                               (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
1798                 swap(rbio->bio.bi_iter.bi_size, bytes);
1799
1800                 if (rbio->bio.bi_iter.bi_size == bytes)
1801                         flags |= BCH_READ_LAST_FRAGMENT;
1802
1803                 bch2_read_extent(c, rbio, k, flags);
1804
1805                 if (flags & BCH_READ_LAST_FRAGMENT)
1806                         return;
1807
1808                 swap(rbio->bio.bi_iter.bi_size, bytes);
1809                 bio_advance(&rbio->bio, bytes);
1810         }
1811
1812         /*
1813          * If we get here, it must have been because there was an error
1814          * reading a btree node:
1815          */
1816         ret = bch2_btree_iter_unlock(&iter);
1817         BUG_ON(!ret);
1818         bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
1819         bch2_rbio_done(rbio);
1820 }
1821
1822 void bch2_fs_io_exit(struct bch_fs *c)
1823 {
1824         if (c->promote_table.tbl)
1825                 rhashtable_destroy(&c->promote_table);
1826         mempool_exit(&c->bio_bounce_pages);
1827         bioset_exit(&c->bio_write);
1828         bioset_exit(&c->bio_read_split);
1829         bioset_exit(&c->bio_read);
1830 }
1831
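/*
 * Set up the read/write biosets, the bounce-page mempool (sized for the larger
 * of a btree node and the maximum encoded extent), and the promote rhashtable.
 */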
1832 int bch2_fs_io_init(struct bch_fs *c)
1833 {
1834         if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
1835                         BIOSET_NEED_BVECS) ||
1836             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
1837                         BIOSET_NEED_BVECS) ||
1838             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
1839                         BIOSET_NEED_BVECS) ||
1840             mempool_init_page_pool(&c->bio_bounce_pages,
1841                                    max_t(unsigned,
1842                                          c->opts.btree_node_size,
1843                                          c->sb.encoded_extent_max) /
1844                                    PAGE_SECTORS, 0) ||
1845             rhashtable_init(&c->promote_table, &bch_promote_params))
1846                 return -ENOMEM;
1847
1848         return 0;
1849 }