git.sesse.net Git - bcachefs-tools-debian/blob - libbcache/request.c

   1 /*
   2  * Handle a read or a write request and decide what to do with it.
   3  *
   4  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
   5  * Copyright 2012 Google, Inc.
   6  *
   7  * Main pieces here:
   8  *
   9  * 1) Data insert path, via bch_data_insert() -- writes data to cache and
  10  *    updates extents btree
  11  * 2) Read path, via bch_read() -- for now only used by bcachefs and ioctl
  12  *    interface
  13  * 3) Read path, via cache_lookup() and struct search -- used by block device
  14  *    make_request functions
  15  * 4) Cache promotion -- used by bch_read() and cache_lookup() to copy data to
  16  *    the cache, either from a backing device or a cache device in a higher tier
  17  *
  18  * One tricky thing that comes up is a race condition where a bucket may be
  19  * re-used while reads from it are still in flight. To guard against this, we
  20  * save the ptr that is being read and check if it is stale once the read
  21  * completes. If the ptr is stale, the read is retried.
  22  *
  23  * #2 and #3 will be unified further in the future.
  24  */
  25
  26 #include "bcache.h"
  27 #include "blockdev.h"
  28 #include "btree_update.h"
  29 #include "btree_iter.h"
  30 #include "clock.h"
  31 #include "debug.h"
  32 #include "error.h"
  33 #include "extents.h"
  34 #include "io.h"
  35 #include "journal.h"
  36 #include "keybuf.h"
  37 #include "request.h"
  38 #include "writeback.h"
  39 #include "stats.h"
  40
  41 #include <linux/module.h>
  42 #include <linux/hash.h>
  43 #include <linux/random.h>
  44 #include <linux/backing-dev.h>
  45
  46 #include <trace/events/bcache.h>
  47
  48 #define CUTOFF_CACHE_ADD        10
  49 #define CUTOFF_CACHE_READA      15
  50
  51 /* Congested? */
  52
  53 unsigned bch_get_congested(struct bch_fs *c)
  54 {
  55         int i;
  56         long rand;
  57
  58         if (!c->congested_read_threshold_us &&
  59             !c->congested_write_threshold_us)
  60                 return 0;
  61
  62         i = (local_clock_us() - c->congested_last_us) / 1024;
  63         if (i < 0)
  64                 return 0;
  65
  66         i += atomic_read(&c->congested);
  67         if (i >= 0)
  68                 return 0;
  69
  70         i += CONGESTED_MAX;
  71
  72         if (i > 0)
  73                 i = fract_exp_two(i, 6);
  74
  75         rand = get_random_int();
  76         i -= bitmap_weight(&rand, BITS_PER_LONG);
  77
  78         return i > 0 ? i : 1;
  79 }
  80
  81 static void add_sequential(struct task_struct *t)
  82 {
  83         t->sequential_io_avg = ewma_add(t->sequential_io_avg,
  84                                         t->sequential_io, 3);
  85         t->sequential_io = 0;
  86 }
  87
  88 static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
  89 {
  90         return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
  91 }
  92
  93 static bool check_should_bypass(struct cached_dev *dc, struct bio *bio, int rw)
  94 {
  95         struct bch_fs *c = dc->disk.c;
  96         unsigned mode = BDEV_CACHE_MODE(dc->disk_sb.sb);
  97         unsigned sectors, congested = bch_get_congested(c);
  98         struct task_struct *task = current;
  99         struct io *i;
 100
 101         if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 102             sectors_available(c) * 100 < c->capacity * CUTOFF_CACHE_ADD ||
 103             (bio_op(bio) == REQ_OP_DISCARD))
 104                 goto skip;
 105
 106         if (mode == CACHE_MODE_NONE ||
 107             (mode == CACHE_MODE_WRITEAROUND &&
 108              op_is_write(bio_op(bio))))
 109                 goto skip;
 110
 111         if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
 112             bio_sectors(bio) & (c->sb.block_size - 1)) {
 113                 pr_debug("skipping unaligned io");
 114                 goto skip;
 115         }
 116
 117         if (bypass_torture_test(dc)) {
 118                 if ((get_random_int() & 3) == 3)
 119                         goto skip;
 120                 else
 121                         goto rescale;
 122         }
 123
 124         if (!congested && !dc->sequential_cutoff)
 125                 goto rescale;
 126
 127         if (!congested &&
 128             mode == CACHE_MODE_WRITEBACK &&
 129             op_is_write(bio_op(bio)) &&
 130             (bio->bi_opf & REQ_SYNC))
 131                 goto rescale;
 132
 133         spin_lock(&dc->io_lock);
 134
 135         hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
 136                 if (i->last == bio->bi_iter.bi_sector &&
 137                     time_before(jiffies, i->last_io))
 138                         goto found;
 139
 140         i = list_first_entry(&dc->io_lru, struct io, lru);
 141
 142         add_sequential(task);
 143         i->sequential = 0;
 144 found:
 145         if (i->sequential + bio->bi_iter.bi_size > i->sequential)
 146                 i->sequential   += bio->bi_iter.bi_size;
 147
 148         i->last                  = bio_end_sector(bio);
 149         i->last_io               = jiffies + msecs_to_jiffies(5000);
 150         task->sequential_io      = i->sequential;
 151
 152         hlist_del(&i->hash);
 153         hlist_add_head(&i->hash, iohash(dc, i->last));
 154         list_move_tail(&i->lru, &dc->io_lru);
 155
 156         spin_unlock(&dc->io_lock);
 157
 158         sectors = max(task->sequential_io,
 159                       task->sequential_io_avg) >> 9;
 160
 161         if (dc->sequential_cutoff &&
 162             sectors >= dc->sequential_cutoff >> 9) {
 163                 trace_bcache_bypass_sequential(bio);
 164                 goto skip;
 165         }
 166
 167         if (congested && sectors >= congested) {
 168                 trace_bcache_bypass_congested(bio);
 169                 goto skip;
 170         }
 171
 172 rescale:
 173         return false;
 174 skip:
 175         bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
 176         return true;
 177 }
 178
 179 /* Common code for the make_request functions */
 180
 181 /**
 182  * request_endio - endio function for backing device bios
 183  */
 184 static void request_endio(struct bio *bio)
 185 {
 186         struct closure *cl = bio->bi_private;
 187
 188         if (bio->bi_error) {
 189                 struct search *s = container_of(cl, struct search, cl);
 190                 s->iop.error = bio->bi_error;
 191                 /* Only cache read errors are recoverable */
 192                 s->recoverable = false;
 193         }
 194
 195         bio_put(bio);
 196         closure_put(cl);
 197 }
 198
 199 static void bio_complete(struct search *s)
 200 {
 201         if (s->orig_bio) {
 202                 generic_end_io_acct(bio_data_dir(s->orig_bio),
 203                                     &s->d->disk->part0, s->start_time);
 204
 205                 trace_bcache_request_end(s->d, s->orig_bio);
 206                 s->orig_bio->bi_error = s->iop.error;
 207                 bio_endio(s->orig_bio);
 208                 s->orig_bio = NULL;
 209         }
 210 }
 211
 212 static void do_bio_hook(struct search *s, struct bio *orig_bio)
 213 {
 214         int rw = bio_data_dir(orig_bio);
 215         struct bio *bio = rw ? &s->wbio.bio : &s->rbio.bio;
 216
 217         bio_init(bio);
 218         __bio_clone_fast(bio, orig_bio);
 219         bio->bi_end_io          = request_endio;
 220         bio->bi_private         = &s->cl;
 221
 222         bio_cnt_set(bio, 3);
 223 }
 224
 225 static void search_free(struct closure *cl)
 226 {
 227         struct search *s = container_of(cl, struct search, cl);
 228
 229         bio_complete(s);
 230
 231         if (s->iop.bio)
 232                 bio_put(&s->iop.bio->bio);
 233
 234         closure_debug_destroy(cl);
 235         mempool_free(s, &s->d->c->search);
 236 }
 237
 238 static inline struct search *search_alloc(struct bio *bio,
 239                                           struct bcache_device *d)
 240 {
 241         struct search *s;
 242
 243         s = mempool_alloc(&d->c->search, GFP_NOIO);
 244
 245         closure_init(&s->cl, NULL);
 246         do_bio_hook(s, bio);
 247
 248         s->orig_bio             = bio;
 249         s->d                    = d;
 250         s->recoverable          = 1;
 251         s->bypass               = 0;
 252         s->write                = op_is_write(bio_op(bio));
 253         s->read_dirty_data      = 0;
 254         s->cache_miss           = 0;
 255         s->start_time           = jiffies;
 256         s->inode                = bcache_dev_inum(d);
 257
 258         s->iop.c                = d->c;
 259         s->iop.bio              = NULL;
 260         s->iop.error            = 0;
 261
 262         return s;
 263 }
 264
 265 /* Cached devices */
 266
 267 static void cached_dev_bio_complete(struct closure *cl)
 268 {
 269         struct search *s = container_of(cl, struct search, cl);
 270         struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 271
 272         search_free(cl);
 273         cached_dev_put(dc);
 274 }
 275
 276 /* Process reads */
 277
 278 static void cached_dev_read_error(struct closure *cl)
 279 {
 280         struct search *s = container_of(cl, struct search, cl);
 281         struct bio *bio = &s->rbio.bio;
 282
 283         if (s->recoverable) {
 284                 /* Read bucket invalidate races are handled here, also plain
 285                  * old IO errors from the cache that can be retried from the
 286                  * backing device (reads of clean data) */
 287                 trace_bcache_read_retry(s->orig_bio);
 288
 289                 s->iop.error = 0;
 290                 do_bio_hook(s, s->orig_bio);
 291
 292                 /* XXX: invalidate cache, don't count twice */
 293
 294                 closure_bio_submit(bio, cl);
 295         }
 296
 297         continue_at(cl, cached_dev_bio_complete, NULL);
 298 }
 299
 300 static void cached_dev_read_done(struct closure *cl)
 301 {
 302         struct search *s = container_of(cl, struct search, cl);
 303         struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 304
 305         if (dc->verify && s->recoverable && !s->read_dirty_data)
 306                 bch_data_verify(dc, s->orig_bio);
 307
 308         continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
 309 }
 310
 311 static void cached_dev_read_done_bh(struct closure *cl)
 312 {
 313         struct search *s = container_of(cl, struct search, cl);
 314         struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 315
 316         bch_mark_cache_accounting(s->iop.c, dc, !s->cache_miss, s->bypass);
 317         trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass);
 318
 319         if (s->iop.error)
 320                 continue_at_nobarrier(cl, cached_dev_read_error, s->iop.c->wq);
 321         else if (dc->verify)
 322                 continue_at_nobarrier(cl, cached_dev_read_done, s->iop.c->wq);
 323         else
 324                 continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
 325 }
 326
 327 /**
 328  * __cache_promote -- insert result of read bio into cache
 329  *
 330  * Used for backing devices and flash-only volumes.
 331  *
 332  * @orig_bio must actually be a bbio with a valid key.
 333  */
 334 void __cache_promote(struct bch_fs *c, struct bch_read_bio *orig_bio,
 335                      struct bkey_s_c old,
 336                      struct bkey_s_c new,
 337                      unsigned write_flags)
 338 {
 339 #if 0
 340         struct cache_promote_op *op;
 341         struct bio *bio;
 342         unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
 343
 344         /* XXX: readahead? */
 345
 346         op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
 347         if (!op)
 348                 goto out_submit;
 349
 350         /* clone the bbio */
 351         memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
 352
 353         bio = &op->bio.bio.bio;
 354         bio_init(bio);
 355         bio_get(bio);
 356         bio->bi_bdev            = orig_bio->bio.bi_bdev;
 357         bio->bi_iter.bi_sector  = orig_bio->bio.bi_iter.bi_sector;
 358         bio->bi_iter.bi_size    = orig_bio->bio.bi_iter.bi_size;
 359         bio->bi_end_io          = cache_promote_endio;
 360         bio->bi_private         = &op->cl;
 361         bio->bi_io_vec          = bio->bi_inline_vecs;
 362         bch_bio_map(bio, NULL);
 363
 364         if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
 365                 goto out_free;
 366
 367         orig_bio->ca = NULL;
 368
 369         closure_init(&op->cl, &c->cl);
 370         op->orig_bio            = &orig_bio->bio;
 371         op->stale               = 0;
 372
 373         bch_write_op_init(&op->iop, c, &op->bio, &c->promote_write_point,
 374                           new, old,
 375                           BCH_WRITE_ALLOC_NOWAIT|write_flags);
 376         op->iop.nr_replicas = 1;
 377
 378         //bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key);
 379         //bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k);
 380
 381         trace_bcache_promote(&orig_bio->bio);
 382
 383         op->bio.bio.submit_time_us = local_clock_us();
 384         closure_bio_submit(bio, &op->cl);
 385
 386         continue_at(&op->cl, cache_promote_write, c->wq);
 387 out_free:
 388         kfree(op);
 389 out_submit:
 390         generic_make_request(&orig_bio->bio);
 391 #endif
 392 }
 393
 394 /**
 395  * cached_dev_cache_miss - populate cache with data from backing device
 396  *
 397  * We don't write to the cache if s->bypass is set.
 398  */
 399 static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s,
 400                                  struct bio *bio, unsigned sectors)
 401 {
 402         int ret;
 403         unsigned reada = 0;
 404         struct bio *miss;
 405         BKEY_PADDED(key) replace;
 406
 407         s->cache_miss = 1;
 408
 409         if (s->bypass)
 410                 goto nopromote;
 411 #if 0
 412         struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 413
 414         /* XXX: broken */
 415         if (!(bio->bi_opf & REQ_RAHEAD) &&
 416             !(bio->bi_opf & REQ_META) &&
 417             ((u64) sectors_available(dc->disk.c) * 100 <
 418              (u64) iter->c->capacity * CUTOFF_CACHE_READA))
 419                 reada = min_t(sector_t, dc->readahead >> 9,
 420                               bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
 421 #endif
 422         sectors = min(sectors, bio_sectors(bio) + reada);
 423
 424         replace.key.k = KEY(s->inode,
 425                             bio->bi_iter.bi_sector + sectors,
 426                             sectors);
 427
 428         ret = bch_btree_insert_check_key(iter, &replace.key);
 429         if (ret == -EINTR)
 430                 return ret;
 431
 432         miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
 433
 434         miss->bi_end_io         = request_endio;
 435         miss->bi_private        = &s->cl;
 436
 437         //to_bbio(miss)->key.k = KEY(s->inode,
 438         //                         bio_end_sector(miss),
 439         //                         bio_sectors(miss));
 440         to_rbio(miss)->ca = NULL;
 441
 442         closure_get(&s->cl);
 443         __cache_promote(s->iop.c, to_rbio(miss),
 444                         bkey_i_to_s_c(&replace.key),
 445                         bkey_to_s_c(&KEY(replace.key.k.p.inode,
 446                                          replace.key.k.p.offset,
 447                                          replace.key.k.size)),
 448                         BCH_WRITE_CACHED);
 449
 450         return 0;
 451 nopromote:
 452         miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
 453
 454         miss->bi_end_io         = request_endio;
 455         miss->bi_private        = &s->cl;
 456         closure_bio_submit(miss, &s->cl);
 457
 458         return 0;
 459 }
 460
 461 static void cached_dev_read(struct cached_dev *dc, struct search *s)
 462 {
 463         struct bch_fs *c = s->iop.c;
 464         struct closure *cl = &s->cl;
 465         struct bio *bio = &s->rbio.bio;
 466         struct btree_iter iter;
 467         struct bkey_s_c k;
 468         int ret;
 469
 470         bch_increment_clock(c, bio_sectors(bio), READ);
 471
 472         for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
 473                                 POS(s->inode, bio->bi_iter.bi_sector), k) {
 474                 BKEY_PADDED(k) tmp;
 475                 struct extent_pick_ptr pick;
 476                 unsigned sectors, bytes;
 477                 bool is_last;
 478 retry:
 479                 bkey_reassemble(&tmp.k, k);
 480                 bch_btree_iter_unlock(&iter);
 481                 k = bkey_i_to_s_c(&tmp.k);
 482
 483                 bch_extent_pick_ptr(c, k, &pick);
 484                 if (IS_ERR(pick.ca)) {
 485                         bcache_io_error(c, bio, "no device to read from");
 486                         goto out;
 487                 }
 488
 489                 sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
 490                         bio->bi_iter.bi_sector;
 491                 bytes = sectors << 9;
 492                 is_last = bytes == bio->bi_iter.bi_size;
 493                 swap(bio->bi_iter.bi_size, bytes);
 494
 495                 if (pick.ca) {
 496                         PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
 497                                 c->prio_clock[READ].hand;
 498
 499                         if (!bkey_extent_is_cached(k.k))
 500                                 s->read_dirty_data = true;
 501
 502                         bch_read_extent(c, &s->rbio, k, &pick,
 503                                         BCH_READ_FORCE_BOUNCE|
 504                                         BCH_READ_RETRY_IF_STALE|
 505                                         (!s->bypass ? BCH_READ_PROMOTE : 0)|
 506                                         (is_last ? BCH_READ_IS_LAST : 0));
 507                 } else {
 508                         /* not present (hole), or stale cached data */
 509                         if (cached_dev_cache_miss(&iter, s, bio, sectors)) {
 510                                 k = bch_btree_iter_peek_with_holes(&iter);
 511                                 if (btree_iter_err(k))
 512                                         break;
 513                                 goto retry;
 514                         }
 515                 }
 516
 517                 swap(bio->bi_iter.bi_size, bytes);
 518                 bio_advance(bio, bytes);
 519
 520                 if (is_last) {
 521                         bch_btree_iter_unlock(&iter);
 522                         goto out;
 523                 }
 524         }
 525
 526         /*
 527          * If we get here, it better have been because there was an error
 528          * reading a btree node
 529          */
 530         ret = bch_btree_iter_unlock(&iter);
 531         BUG_ON(!ret);
 532         bcache_io_error(c, bio, "btree IO error %i", ret);
 533 out:
 534         continue_at(cl, cached_dev_read_done_bh, NULL);
 535 }
 536
 537 /* Process writes */
 538
 539 static void cached_dev_write_complete(struct closure *cl)
 540 {
 541         struct search *s = container_of(cl, struct search, cl);
 542         struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 543
 544         up_read_non_owner(&dc->writeback_lock);
 545         cached_dev_bio_complete(cl);
 546 }
 547
 548 static void cached_dev_write(struct cached_dev *dc, struct search *s)
 549 {
 550         struct closure *cl = &s->cl;
 551         struct bio *bio = &s->wbio.bio;
 552         bool writeback = false;
 553         bool bypass = s->bypass;
 554         struct bkey insert_key = KEY(s->inode,
 555                                      bio_end_sector(bio),
 556                                      bio_sectors(bio));
 557         unsigned flags = BCH_WRITE_DISCARD_ON_ERROR;
 558
 559         down_read_non_owner(&dc->writeback_lock);
 560         if (bch_keybuf_check_overlapping(&dc->writeback_keys,
 561                                          bkey_start_pos(&insert_key),
 562                                          insert_key.p)) {
 563                 /*
 564                  * We overlap with some dirty data undergoing background
 565                  * writeback, force this write to writeback
 566                  */
 567                 bypass = false;
 568                 writeback = true;
 569         }
 570
 571         /*
 572          * Discards aren't _required_ to do anything, so skipping if
 573          * check_overlapping returned true is ok
 574          *
 575          * But check_overlapping drops dirty keys for which io hasn't started,
 576          * so we still want to call it.
 577          */
 578         if (bio_op(bio) == REQ_OP_DISCARD)
 579                 bypass = true;
 580
 581         if (should_writeback(dc, bio, BDEV_CACHE_MODE(dc->disk_sb.sb),
 582                              bypass)) {
 583                 bypass = false;
 584                 writeback = true;
 585         }
 586
 587         if (bypass) {
 588                 /*
 589                  * If this is a bypass-write (as opposed to a discard), send
 590                  * it down to the backing device. If this is a discard, only
 591                  * send it to the backing device if the backing device
 592                  * supports discards. Otherwise, we simply discard the key
 593                  * range from the cache and don't touch the backing device.
 594                  */
 595                 if ((bio_op(bio) != REQ_OP_DISCARD) ||
 596                     blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev)))
 597                         closure_bio_submit(s->orig_bio, cl);
 598         } else if (writeback) {
 599                 bch_writeback_add(dc);
 600
 601                 if (bio->bi_opf & REQ_PREFLUSH) {
 602                         /* Also need to send a flush to the backing device */
 603                         struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
 604                                                              &dc->disk.bio_split);
 605
 606                         flush->bi_bdev  = bio->bi_bdev;
 607                         flush->bi_end_io = request_endio;
 608                         flush->bi_private = cl;
 609                         bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH);
 610
 611                         closure_bio_submit(flush, cl);
 612                 }
 613         } else {
 614                 struct bio *writethrough =
 615                         bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
 616
 617                 closure_bio_submit(writethrough, cl);
 618
 619                 flags |= BCH_WRITE_CACHED;
 620                 flags |= BCH_WRITE_ALLOC_NOWAIT;
 621         }
 622
 623         if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
 624                 flags |= BCH_WRITE_FLUSH;
 625         if (bypass)
 626                 flags |= BCH_WRITE_DISCARD;
 627
 628         bch_write_op_init(&s->iop, dc->disk.c, &s->wbio,
 629                           (struct disk_reservation) { 0 },
 630                           foreground_write_point(dc->disk.c,
 631                                         (unsigned long) current),
 632                           bkey_start_pos(&insert_key),
 633                           NULL, flags);
 634
 635         closure_call(&s->iop.cl, bch_write, NULL, cl);
 636         continue_at(cl, cached_dev_write_complete, NULL);
 637 }
 638
 639 /* Cached devices - read & write stuff */
 640
 641 static void __cached_dev_make_request(struct request_queue *q, struct bio *bio)
 642 {
 643         struct search *s;
 644         struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
 645         struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 646         int rw = bio_data_dir(bio);
 647
 648         generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
 649
 650         bio->bi_bdev = dc->disk_sb.bdev;
 651         bio->bi_iter.bi_sector += le64_to_cpu(dc->disk_sb.sb->data_offset);
 652
 653         if (cached_dev_get(dc)) {
 654                 struct bio *clone;
 655
 656                 s = search_alloc(bio, d);
 657                 trace_bcache_request_start(s->d, bio);
 658
 659                 clone = rw ? &s->wbio.bio : &s->rbio.bio;
 660
 661                 if (!bio->bi_iter.bi_size) {
 662                         if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
 663                                 bch_journal_flush_async(&s->iop.c->journal,
 664                                                         &s->cl);
 665
 666                         /*
 667                          * If it's a flush, we send the flush to the backing
 668                          * device too
 669                          */
 670                         closure_bio_submit(clone, &s->cl);
 671
 672                         continue_at(&s->cl, cached_dev_bio_complete, NULL);
 673                 } else {
 674                         s->bypass = check_should_bypass(dc, bio, rw);
 675
 676                         if (rw)
 677                                 cached_dev_write(dc, s);
 678                         else
 679                                 cached_dev_read(dc, s);
 680                 }
 681         } else {
 682                 if ((bio_op(bio) == REQ_OP_DISCARD) &&
 683                     !blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev)))
 684                         bio_endio(bio);
 685                 else
 686                         generic_make_request(bio);
 687         }
 688 }
 689
 690 static blk_qc_t cached_dev_make_request(struct request_queue *q,
 691                                         struct bio *bio)
 692 {
 693         __cached_dev_make_request(q, bio);
 694         return BLK_QC_T_NONE;
 695 }
 696
 697 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
 698                             unsigned int cmd, unsigned long arg)
 699 {
 700         struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 701         return __blkdev_driver_ioctl(dc->disk_sb.bdev, mode, cmd, arg);
 702 }
 703
 704 static int cached_dev_congested(void *data, int bits)
 705 {
 706         struct bcache_device *d = data;
 707         struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 708         struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);
 709         int ret = 0;
 710
 711         if (bdi_congested(&q->backing_dev_info, bits))
 712                 return 1;
 713
 714         if (cached_dev_get(dc)) {
 715                 ret |= bch_congested(d->c, bits);
 716                 cached_dev_put(dc);
 717         }
 718
 719         return ret;
 720 }
 721
 722 void bch_cached_dev_request_init(struct cached_dev *dc)
 723 {
 724         struct gendisk *g = dc->disk.disk;
 725
 726         g->queue->make_request_fn               = cached_dev_make_request;
 727         g->queue->backing_dev_info.congested_fn = cached_dev_congested;
 728         dc->disk.ioctl                          = cached_dev_ioctl;
 729 }
 730
 731 /* Blockdev volumes */
 732
 733 static void __blockdev_volume_make_request(struct request_queue *q,
 734                                            struct bio *bio)
 735 {
 736         struct search *s;
 737         struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
 738         int rw = bio_data_dir(bio);
 739
 740         generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
 741
 742         trace_bcache_request_start(d, bio);
 743
 744         s = search_alloc(bio, d);
 745
 746         if (!bio->bi_iter.bi_size) {
 747                 if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
 748                         bch_journal_flush_async(&s->iop.c->journal,
 749                                                 &s->cl);
 750
 751                 continue_at(&s->cl, search_free, NULL);
 752         } else if (rw) {
 753                 struct disk_reservation res = { 0 };
 754                 unsigned flags = 0;
 755
 756                 if (bio_op(bio) != REQ_OP_DISCARD &&
 757                     bch_disk_reservation_get(d->c, &res, bio_sectors(bio), 0)) {
 758                         s->iop.error = -ENOSPC;
 759                         continue_at(&s->cl, search_free, NULL);
 760                         return;
 761                 }
 762
 763                 if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
 764                         flags |= BCH_WRITE_FLUSH;
 765                 if (bio_op(bio) == REQ_OP_DISCARD)
 766                         flags |= BCH_WRITE_DISCARD;
 767
 768                 bch_write_op_init(&s->iop, d->c, &s->wbio, res,
 769                                   foreground_write_point(d->c,
 770                                                 (unsigned long) current),
 771                                   POS(s->inode, bio->bi_iter.bi_sector),
 772                                   NULL, flags);
 773
 774                 closure_call(&s->iop.cl, bch_write, NULL, &s->cl);
 775         } else {
 776                 closure_get(&s->cl);
 777                 bch_read(d->c, &s->rbio, bcache_dev_inum(d));
 778         }
 779         continue_at(&s->cl, search_free, NULL);
 780 }
 781
 782 static blk_qc_t blockdev_volume_make_request(struct request_queue *q,
 783                                              struct bio *bio)
 784 {
 785         __blockdev_volume_make_request(q, bio);
 786         return BLK_QC_T_NONE;
 787 }
 788
 789 static int blockdev_volume_ioctl(struct bcache_device *d, fmode_t mode,
 790                                  unsigned int cmd, unsigned long arg)
 791 {
 792         return -ENOTTY;
 793 }
 794
 795 static int blockdev_volume_congested(void *data, int bits)
 796 {
 797         struct bcache_device *d = data;
 798
 799         return bch_congested(d->c, bits);
 800 }
 801
 802 void bch_blockdev_volume_request_init(struct bcache_device *d)
 803 {
 804         struct gendisk *g = d->disk;
 805
 806         g->queue->make_request_fn               = blockdev_volume_make_request;
 807         g->queue->backing_dev_info.congested_fn = blockdev_volume_congested;
 808         d->ioctl                                = blockdev_volume_ioctl;
 809 }