libbcache/super.c
1 /*
2  * bcache setup/teardown code, and some metadata io - read a superblock and
3  * figure out what to do with it.
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcache.h"
10 #include "blockdev.h"
11 #include "alloc.h"
12 #include "btree_cache.h"
13 #include "btree_gc.h"
14 #include "btree_update.h"
15 #include "btree_io.h"
16 #include "chardev.h"
17 #include "checksum.h"
18 #include "clock.h"
19 #include "compress.h"
20 #include "debug.h"
21 #include "error.h"
22 #include "fs.h"
23 #include "fs-gc.h"
24 #include "inode.h"
25 #include "io.h"
26 #include "journal.h"
27 #include "keylist.h"
28 #include "move.h"
29 #include "migrate.h"
30 #include "movinggc.h"
31 #include "notify.h"
32 #include "stats.h"
33 #include "super.h"
34 #include "super-io.h"
35 #include "tier.h"
36 #include "writeback.h"
37
38 #include <linux/backing-dev.h>
39 #include <linux/blkdev.h>
40 #include <linux/debugfs.h>
41 #include <linux/device.h>
42 #include <linux/genhd.h>
43 #include <linux/idr.h>
44 #include <linux/kthread.h>
45 #include <linux/module.h>
46 #include <linux/percpu.h>
47 #include <linux/random.h>
48 #include <linux/reboot.h>
49 #include <linux/sysfs.h>
50 #include <crypto/hash.h>
51
52 #include <trace/events/bcache.h>
53
54 MODULE_LICENSE("GPL");
55 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
56
57 static const uuid_le invalid_uuid = {
58         .b = {
59                 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
60                 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
61         }
62 };
63
64 static struct kset *bcache_kset;
65 struct mutex bch_register_lock;
66 LIST_HEAD(bch_fs_list);
67
68 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
69 struct workqueue_struct *bcache_io_wq;
70 struct crypto_shash *bch_sha256;
71
72 static void bch_dev_stop(struct cache *);
73 static int bch_dev_online(struct cache *);
74
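/*
 * Congestion callback for the cache set's backing_dev_info: reads can go to
 * any member device, so for sync/read congestion we check every device;
 * writes only go to tier 0, so for write congestion only tier 0 devices are
 * checked.
 */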
75 static int bch_congested_fn(void *data, int bdi_bits)
76 {
77         struct backing_dev_info *bdi;
78         struct cache_set *c = data;
79         struct cache *ca;
80         unsigned i;
81         int ret = 0;
82
83         rcu_read_lock();
84         if (bdi_bits & (1 << WB_sync_congested)) {
85                 /* Reads - check all devices: */
86                 for_each_cache_rcu(ca, c, i) {
87                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
88
89                         if (bdi_congested(bdi, bdi_bits)) {
90                                 ret = 1;
91                                 break;
92                         }
93                 }
94         } else {
95                 /* Writes only go to tier 0: */
96                 group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
97                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
98
99                         if (bdi_congested(bdi, bdi_bits)) {
100                                 ret = 1;
101                                 break;
102                         }
103                 }
104         }
105         rcu_read_unlock();
106
107         return ret;
108 }
109
110 /* Cache set RO/RW: */
111
112 /*
113  * For startup/shutdown of RW stuff, the dependencies are:
114  *
115  * - foreground writes depend on copygc and tiering (to free up space)
116  *
117  * - copygc and tiering depend on mark and sweep gc (they actually probably
118  *   don't because they either reserve ahead of time or don't block if
119  *   allocations fail, but allocations can require mark and sweep gc to run
120  *   because of generation number wraparound)
121  *
122  * - all of the above depends on the allocator threads
123  *
124  * - allocator depends on the journal (when it rewrites prios and gens)
125  */
126
127 static void __bch_fs_read_only(struct cache_set *c)
128 {
129         struct cache *ca;
130         unsigned i;
131
132         c->tiering_pd.rate.rate = UINT_MAX;
133         bch_ratelimit_reset(&c->tiering_pd.rate);
134         bch_tiering_read_stop(c);
135
136         for_each_cache(ca, c, i)
137                 bch_moving_gc_stop(ca);
138
139         bch_gc_thread_stop(c);
140
141         bch_btree_flush(c);
142
143         for_each_cache(ca, c, i)
144                 bch_dev_allocator_stop(ca);
145
146         /*
147          * Write a journal entry after flushing the btree, so we don't end up
148          * replaying everything we just flushed:
149          */
150         if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
151                 int ret;
152
153                 bch_journal_flush_async(&c->journal, NULL);
154                 ret = bch_journal_meta(&c->journal);
155                 BUG_ON(ret && !bch_journal_error(&c->journal));
156         }
157
158         cancel_delayed_work_sync(&c->journal.write_work);
159         cancel_delayed_work_sync(&c->journal.reclaim_work);
160 }
161
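/*
 * Release callback for &c->writes (set up via percpu_ref_init() in
 * bch_fs_alloc()): runs once percpu_ref_kill() has been issued and the last
 * write reference has been dropped.
 */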
162 static void bch_writes_disabled(struct percpu_ref *writes)
163 {
164         struct cache_set *c = container_of(writes, struct cache_set, writes);
165
166         set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
167         wake_up(&bch_read_only_wait);
168 }
169
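/*
 * Worker that actually takes the filesystem read-only: queued by
 * bch_fs_read_only(), it waits for outstanding writes (unless this is an
 * emergency shutdown), tears down the RW machinery via __bch_fs_read_only(),
 * and marks the superblock clean if no errors occurred.
 */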
170 static void bch_fs_read_only_work(struct work_struct *work)
171 {
172         struct cache_set *c =
173                 container_of(work, struct cache_set, read_only_work);
174
175         percpu_ref_put(&c->writes);
176
177         del_timer(&c->foreground_write_wakeup);
178         cancel_delayed_work(&c->pd_controllers_update);
179
180         c->foreground_write_pd.rate.rate = UINT_MAX;
181         bch_wake_delayed_writes((unsigned long) c);
182
183         if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
184                 /*
185                  * If we're not doing an emergency shutdown, we want to wait on
186                  * outstanding writes to complete so they don't see spurious
187                  * errors due to shutting down the allocator:
188                  */
189                 wait_event(bch_read_only_wait,
190                            test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
191
192                 __bch_fs_read_only(c);
193
194                 if (!bch_journal_error(&c->journal) &&
195                     !test_bit(BCH_FS_ERROR, &c->flags)) {
196                         mutex_lock(&c->sb_lock);
197                         SET_BCH_SB_CLEAN(c->disk_sb, true);
198                         bch_write_super(c);
199                         mutex_unlock(&c->sb_lock);
200                 }
201         } else {
202                 /*
203                  * If we are doing an emergency shutdown, outstanding writes may
204                  * hang until we shut down the allocator, so we don't want to wait
205                  * on outstanding writes before shutting everything down - but
206                  * we do need to wait on them before returning and signalling
207                  * that going RO is complete:
208                  */
209                 __bch_fs_read_only(c);
210
211                 wait_event(bch_read_only_wait,
212                            test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
213         }
214
215         bch_notify_fs_read_only(c);
216         trace_fs_read_only_done(c);
217
218         set_bit(BCH_FS_RO_COMPLETE, &c->flags);
219         wake_up(&bch_read_only_wait);
220 }
221
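/*
 * Begin the transition to read-only; returns false if a transition was
 * already in progress. The rest of the work happens asynchronously in
 * bch_fs_read_only_work().
 */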
222 bool bch_fs_read_only(struct cache_set *c)
223 {
224         if (test_and_set_bit(BCH_FS_RO, &c->flags))
225                 return false;
226
227         trace_fs_read_only(c);
228
229         percpu_ref_get(&c->writes);
230
231         /*
232          * Block new foreground-end write operations from starting - any new
233          * writes will return -EROFS:
234          *
235          * (This is really blocking new _allocations_ - writes to previously
236          * allocated space can still happen until the allocator is stopped in
237          * bch_dev_allocator_stop()).
238          */
239         percpu_ref_kill(&c->writes);
240
241         queue_work(system_freezable_wq, &c->read_only_work);
242         return true;
243 }
244
245 bool bch_fs_emergency_read_only(struct cache_set *c)
246 {
247         bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
248
249         bch_fs_read_only(c);
250         bch_journal_halt(&c->journal);
251
252         wake_up(&bch_read_only_wait);
253         return ret;
254 }
255
256 void bch_fs_read_only_sync(struct cache_set *c)
257 {
258         /* so we don't race with bch_fs_read_write() */
259         lockdep_assert_held(&bch_register_lock);
260
261         bch_fs_read_only(c);
262
263         wait_event(bch_read_only_wait,
264                    test_bit(BCH_FS_RO_COMPLETE, &c->flags) &&
265                    test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
266 }
267
268 static const char *__bch_fs_read_write(struct cache_set *c)
269 {
270         struct cache *ca;
271         const char *err;
272         unsigned i;
273
274         lockdep_assert_held(&bch_register_lock);
275
276         err = "error starting allocator thread";
277         for_each_cache(ca, c, i)
278                 if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
279                     bch_dev_allocator_start(ca)) {
280                         percpu_ref_put(&ca->ref);
281                         goto err;
282                 }
283
284         err = "error starting btree GC thread";
285         if (bch_gc_thread_start(c))
286                 goto err;
287
288         for_each_cache(ca, c, i) {
289                 if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
290                         continue;
291
292                 err = "error starting moving GC thread";
293                 if (bch_moving_gc_thread_start(ca)) {
294                         percpu_ref_put(&ca->ref);
295                         goto err;
296                 }
297         }
298
299         err = "error starting tiering thread";
300         if (bch_tiering_read_start(c))
301                 goto err;
302
303         schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
304
305         return NULL;
306 err:
307         __bch_fs_read_only(c);
308         return err;
309 }
310
311 const char *bch_fs_read_write(struct cache_set *c)
312 {
313         const char *err;
314
315         lockdep_assert_held(&bch_register_lock);
316
317         if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags))
318                 return NULL;
319
320         err = __bch_fs_read_write(c);
321         if (err)
322                 return err;
323
324         percpu_ref_reinit(&c->writes);
325
326         clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
327         clear_bit(BCH_FS_EMERGENCY_RO, &c->flags);
328         clear_bit(BCH_FS_RO_COMPLETE, &c->flags);
329         clear_bit(BCH_FS_RO, &c->flags);
330         return NULL;
331 }
332
333 /* Cache set startup/shutdown: */
334
335 static void bch_fs_free(struct cache_set *c)
336 {
337         del_timer_sync(&c->foreground_write_wakeup);
338         cancel_delayed_work_sync(&c->pd_controllers_update);
339         cancel_work_sync(&c->read_only_work);
340         cancel_work_sync(&c->bio_submit_work);
341         cancel_work_sync(&c->read_retry_work);
342
343         bch_fs_encryption_free(c);
344         bch_btree_cache_free(c);
345         bch_journal_free(&c->journal);
346         bch_io_clock_exit(&c->io_clock[WRITE]);
347         bch_io_clock_exit(&c->io_clock[READ]);
348         bch_compress_free(c);
349         bch_fs_blockdev_exit(c);
350         bdi_destroy(&c->bdi);
351         lg_lock_free(&c->bucket_stats_lock);
352         free_percpu(c->bucket_stats_percpu);
353         mempool_exit(&c->btree_bounce_pool);
354         mempool_exit(&c->bio_bounce_pages);
355         bioset_exit(&c->bio_write);
356         bioset_exit(&c->bio_read_split);
357         bioset_exit(&c->bio_read);
358         bioset_exit(&c->btree_read_bio);
359         mempool_exit(&c->btree_interior_update_pool);
360         mempool_exit(&c->btree_reserve_pool);
361         mempool_exit(&c->fill_iter);
362         percpu_ref_exit(&c->writes);
363
364         if (c->copygc_wq)
365                 destroy_workqueue(c->copygc_wq);
366         if (c->wq)
367                 destroy_workqueue(c->wq);
368
369         kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
370         free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
371         kfree(c);
372         module_put(THIS_MODULE);
373 }
374
375 /*
376  * Final phase of shutdown (conceptually __bch_fs_stop4): block devices are
377  * closed, and now we can finally free the cache_set
378  */
379 void bch_fs_release(struct kobject *kobj)
380 {
381         struct cache_set *c = container_of(kobj, struct cache_set, kobj);
382         struct completion *stop_completion = c->stop_completion;
383
384         bch_notify_fs_stopped(c);
385         bch_info(c, "stopped");
386
387         bch_fs_free(c);
388
389         if (stop_completion)
390                 complete(stop_completion);
391 }
392
393 /*
394  * All activity on the cache_set should have stopped now - close devices:
395  */
396 static void __bch_fs_stop3(struct closure *cl)
397 {
398         struct cache_set *c = container_of(cl, struct cache_set, cl);
399         struct cache *ca;
400         unsigned i;
401
402         mutex_lock(&bch_register_lock);
403         for_each_cache(ca, c, i)
404                 bch_dev_stop(ca);
405
406         list_del(&c->list);
407         mutex_unlock(&bch_register_lock);
408
409         closure_debug_destroy(&c->cl);
410         kobject_put(&c->kobj);
411 }
412
413 /*
414  * Openers (i.e. block devices) should have exited; shut down all userspace
415  * interfaces and wait for &c->cl to hit 0
416  */
417 static void __bch_fs_stop2(struct closure *cl)
418 {
419         struct cache_set *c = container_of(cl, struct cache_set, caching);
420
421         bch_debug_exit_cache_set(c);
422         bch_fs_chardev_exit(c);
423
424         if (c->kobj.state_in_sysfs)
425                 kobject_del(&c->kobj);
426
427         bch_cache_accounting_destroy(&c->accounting);
428
429         kobject_put(&c->time_stats);
430         kobject_put(&c->opts_dir);
431         kobject_put(&c->internal);
432
433         mutex_lock(&bch_register_lock);
434         bch_fs_read_only_sync(c);
435         mutex_unlock(&bch_register_lock);
436
437         closure_return(cl);
438 }
439
440 /*
441  * First phase of the shutdown process that's kicked off by bch_fs_stop(); we
442  * haven't waited for anything to stop yet, we're just punting to process
443  * context to shut down block devices:
444  */
445 static void __bch_fs_stop1(struct closure *cl)
446 {
447         struct cache_set *c = container_of(cl, struct cache_set, caching);
448
449         bch_blockdevs_stop(c);
450
451         continue_at(cl, __bch_fs_stop2, system_wq);
452 }
453
454 void bch_fs_stop(struct cache_set *c)
455 {
456         if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags))
457                 closure_queue(&c->caching);
458 }
459
460 void bch_fs_stop_sync(struct cache_set *c)
461 {
462         DECLARE_COMPLETION_ONSTACK(complete);
463
464         c->stop_completion = &complete;
465         bch_fs_stop(c);
466         closure_put(&c->cl);
467
468         /* Killable? */
469         wait_for_completion(&complete);
470 }
471
472 /* Stop, detaching from backing devices: */
473 void bch_fs_detach(struct cache_set *c)
474 {
475         if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
476                 bch_fs_stop(c);
477 }
478
479 static unsigned bch_fs_nr_devices(struct cache_set *c)
480 {
481         struct bch_sb_field_members *mi;
482         unsigned i, nr = 0;
483
484         mutex_lock(&c->sb_lock);
485         mi = bch_sb_get_members(c->disk_sb);
486
487         for (i = 0; i < c->disk_sb->nr_devices; i++)
488                 if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
489                         nr++;
490
491         mutex_unlock(&c->sb_lock);
492
493         return nr;
494 }
495
496 static unsigned bch_fs_nr_online_devices(struct cache_set *c)
497 {
498         unsigned i, nr = 0;
499
500         for (i = 0; i < c->sb.nr_devices; i++)
501                 if (c->cache[i])
502                         nr++;
503
504         return nr;
505 }
506
507 #define alloc_bucket_pages(gfp, ca)                     \
508         ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
509
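/*
 * Allocate a new cache_set and initialize it from @sb and @opts: set up
 * locks, lists and default tunables, copy in the superblock fields, and
 * allocate the workqueues, mempools and biosets needed at runtime.
 */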
510 static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
511 {
512         struct cache_set *c;
513         unsigned iter_size, journal_entry_bytes;
514
515         c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
516         if (!c)
517                 return NULL;
518
519         __module_get(THIS_MODULE);
520
521         c->minor                = -1;
522
523         mutex_init(&c->sb_lock);
524         INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
525         mutex_init(&c->btree_cache_lock);
526         mutex_init(&c->bucket_lock);
527         mutex_init(&c->btree_root_lock);
528         INIT_WORK(&c->read_only_work, bch_fs_read_only_work);
529
530         init_rwsem(&c->gc_lock);
531
532 #define BCH_TIME_STAT(name, frequency_units, duration_units)            \
533         spin_lock_init(&c->name##_time.lock);
534         BCH_TIME_STATS()
535 #undef BCH_TIME_STAT
536
537         bch_open_buckets_init(c);
538         bch_tiering_init_cache_set(c);
539
540         INIT_LIST_HEAD(&c->list);
541         INIT_LIST_HEAD(&c->cached_devs);
542         INIT_LIST_HEAD(&c->btree_cache);
543         INIT_LIST_HEAD(&c->btree_cache_freeable);
544         INIT_LIST_HEAD(&c->btree_cache_freed);
545
546         INIT_LIST_HEAD(&c->btree_interior_update_list);
547         mutex_init(&c->btree_reserve_cache_lock);
548         mutex_init(&c->btree_interior_update_lock);
549
550         mutex_init(&c->bio_bounce_pages_lock);
551         INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
552         spin_lock_init(&c->bio_submit_lock);
553         bio_list_init(&c->read_retry_list);
554         spin_lock_init(&c->read_retry_lock);
555         INIT_WORK(&c->read_retry_work, bch_read_retry_work);
556         mutex_init(&c->zlib_workspace_lock);
557
558         seqcount_init(&c->gc_pos_lock);
559
560         c->prio_clock[READ].hand = 1;
561         c->prio_clock[READ].min_prio = 0;
562         c->prio_clock[WRITE].hand = 1;
563         c->prio_clock[WRITE].min_prio = 0;
564
565         c->congested_read_threshold_us  = 2000;
566         c->congested_write_threshold_us = 20000;
567         c->error_limit  = 16 << IO_ERROR_SHIFT;
568         init_waitqueue_head(&c->writeback_wait);
569
570         c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
571
572         c->copy_gc_enabled = 1;
573         c->tiering_enabled = 1;
574         c->tiering_percent = 10;
575
576         c->foreground_target_percent = 20;
577
578         c->journal.write_time   = &c->journal_write_time;
579         c->journal.delay_time   = &c->journal_delay_time;
580         c->journal.blocked_time = &c->journal_blocked_time;
581         c->journal.flush_seq_time = &c->journal_flush_seq_time;
582
583         mutex_init(&c->uevent_lock);
584
585         mutex_lock(&c->sb_lock);
586
587         if (bch_sb_to_cache_set(c, sb)) {
588                 mutex_unlock(&c->sb_lock);
589                 goto err;
590         }
591
592         mutex_unlock(&c->sb_lock);
593
594         scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
595
596         bch_opts_apply(&c->opts, bch_sb_opts(sb));
597         bch_opts_apply(&c->opts, opts);
598
599         c->opts.nochanges       |= c->opts.noreplay;
600         c->opts.read_only       |= c->opts.nochanges;
601
602         c->block_bits           = ilog2(c->sb.block_size);
603
604         if (bch_fs_init_fault("fs_alloc"))
605                 goto err;
606
607         iter_size = (btree_blocks(c) + 1) * 2 *
608                 sizeof(struct btree_node_iter_set);
609
610         journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
611
612         if (!(c->wq = alloc_workqueue("bcache",
613                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
614             !(c->copygc_wq = alloc_workqueue("bcache_copygc",
615                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
616             percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
617             mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
618                                       sizeof(struct btree_reserve)) ||
619             mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
620                                       sizeof(struct btree_interior_update)) ||
621             mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
622             bioset_init(&c->btree_read_bio, 1, 0) ||
623             bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
624             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
625             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
626             mempool_init_page_pool(&c->bio_bounce_pages,
627                                    max_t(unsigned,
628                                          c->sb.btree_node_size,
629                                          BCH_ENCODED_EXTENT_MAX) /
630                                    PAGE_SECTORS, 0) ||
631             !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
632             lg_lock_init(&c->bucket_stats_lock) ||
633             mempool_init_page_pool(&c->btree_bounce_pool, 1,
634                                    ilog2(btree_pages(c))) ||
635             bdi_setup_and_register(&c->bdi, "bcache") ||
636             bch_fs_blockdev_init(c) ||
637             bch_io_clock_init(&c->io_clock[READ]) ||
638             bch_io_clock_init(&c->io_clock[WRITE]) ||
639             bch_journal_alloc(&c->journal, journal_entry_bytes) ||
640             bch_btree_cache_alloc(c) ||
641             bch_fs_encryption_init(c) ||
642             bch_compress_init(c) ||
643             bch_check_set_has_compressed_data(c, c->opts.compression))
644                 goto err;
645
646         c->bdi.ra_pages         = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
647         c->bdi.congested_fn     = bch_congested_fn;
648         c->bdi.congested_data   = c;
649
650         /*
651          * Now that all allocations have succeeded, init various refcounty
652          * things that let us shut down:
653          */
654         closure_init(&c->cl, NULL);
655
656         c->kobj.kset = bcache_kset;
657         kobject_init(&c->kobj, &bch_fs_ktype);
658         kobject_init(&c->internal, &bch_fs_internal_ktype);
659         kobject_init(&c->opts_dir, &bch_fs_opts_dir_ktype);
660         kobject_init(&c->time_stats, &bch_fs_time_stats_ktype);
661
662         bch_cache_accounting_init(&c->accounting, &c->cl);
663
664         closure_init(&c->caching, &c->cl);
665         set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
666
667         continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
668         return c;
669 err:
670         bch_fs_free(c);
671         return NULL;
672 }
673
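/*
 * Make the cache set visible to userspace: add it to bch_fs_list, register
 * the character device, create the sysfs kobjects and bring each member
 * device online in sysfs.
 */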
674 static int bch_fs_online(struct cache_set *c)
675 {
676         struct cache *ca;
677         unsigned i;
678         int ret;
679
680         lockdep_assert_held(&bch_register_lock);
681
682         if (!list_empty(&c->list))
683                 return 0;
684
685         list_add(&c->list, &bch_fs_list);
686
687         ret = bch_fs_chardev_init(c);
688         if (ret)
689                 return ret;
690
691         if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
692             kobject_add(&c->internal, &c->kobj, "internal") ||
693             kobject_add(&c->opts_dir, &c->kobj, "options") ||
694             kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
695             bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
696                 return -1;
697
698         for_each_cache(ca, c, i)
699                 if (bch_dev_online(ca)) {
700                         percpu_ref_put(&ca->ref);
701                         return -1;
702                 }
703
704         return 0;
705 }
706
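/*
 * Bring the filesystem up: for an existing filesystem, read the journal and
 * priorities, read the btree roots, run initial mark and sweep, replay the
 * journal and run fsck; for a new filesystem, allocate journal buckets,
 * btree roots and the root directory inode. Then go read-write (unless
 * read_only) and write out the superblock.
 */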
707 static const char *bch_fs_start(struct cache_set *c)
708 {
709         const char *err = "cannot allocate memory";
710         struct bch_sb_field_members *mi;
711         struct cache *ca;
712         unsigned i, id;
713         time64_t now;
714         LIST_HEAD(journal);
715         struct jset *j;
716         int ret = -EINVAL;
717
718         lockdep_assert_held(&bch_register_lock);
719         BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags));
720
721         /* We don't want bch_fatal_error() to free underneath us */
722         closure_get(&c->caching);
723
724         /*
725          * Make sure that each cache object's mi is up to date before
726          * we start testing it.
727          */
728         for_each_cache(ca, c, i)
729                 bch_sb_from_cache_set(c, ca);
730
731         if (BCH_SB_INITIALIZED(c->disk_sb)) {
732                 ret = bch_journal_read(c, &journal);
733                 if (ret)
734                         goto err;
735
736                 pr_debug("btree_journal_read() done");
737
738                 j = &list_entry(journal.prev, struct journal_replay, list)->j;
739
740                 err = "error reading priorities";
741                 for_each_cache(ca, c, i) {
742                         ret = bch_prio_read(ca);
743                         if (ret) {
744                                 percpu_ref_put(&ca->ref);
745                                 goto err;
746                         }
747                 }
748
749                 c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
750                 c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
751
752                 for_each_cache(ca, c, i) {
753                         bch_recalc_min_prio(ca, READ);
754                         bch_recalc_min_prio(ca, WRITE);
755                 }
756
757                 for (id = 0; id < BTREE_ID_NR; id++) {
758                         unsigned level;
759                         struct bkey_i *k;
760
761                         err = "bad btree root";
762                         k = bch_journal_find_btree_root(c, j, id, &level);
763                         if (!k && id == BTREE_ID_EXTENTS)
764                                 goto err;
765                         if (!k) {
766                                 pr_debug("missing btree root: %d", id);
767                                 continue;
768                         }
769
770                         err = "error reading btree root";
771                         if (bch_btree_root_read(c, id, k, level))
772                                 goto err;
773                 }
774
775                 bch_verbose(c, "starting mark and sweep:");
776
777                 err = "error in recovery";
778                 if (bch_initial_gc(c, &journal))
779                         goto err;
780
781                 if (c->opts.noreplay)
782                         goto recovery_done;
783
784                 bch_verbose(c, "mark and sweep done");
785
786                 /*
787                  * bch_journal_start() can't happen sooner, or btree_gc_finish()
788                  * will give spurious errors about oldest_gen > bucket_gen -
789                  * this is a hack but oh well.
790                  */
791                 bch_journal_start(c);
792
793                 err = "error starting allocator thread";
794                 for_each_cache(ca, c, i)
795                         if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
796                             bch_dev_allocator_start(ca)) {
797                                 percpu_ref_put(&ca->ref);
798                                 goto err;
799                         }
800
801                 bch_verbose(c, "starting journal replay:");
802
803                 err = "journal replay failed";
804                 ret = bch_journal_replay(c, &journal);
805                 if (ret)
806                         goto err;
807
808                 bch_verbose(c, "journal replay done");
809
810                 if (c->opts.norecovery)
811                         goto recovery_done;
812
813                 bch_verbose(c, "starting fsck:");
814                 err = "error in fsck";
815                 ret = bch_fsck(c, !c->opts.nofsck);
816                 if (ret)
817                         goto err;
818
819                 bch_verbose(c, "fsck done");
820         } else {
821                 struct bch_inode_unpacked inode;
822                 struct bkey_inode_buf packed_inode;
823                 struct closure cl;
824
825                 closure_init_stack(&cl);
826
827                 bch_notice(c, "initializing new filesystem");
828
829                 err = "unable to allocate journal buckets";
830                 for_each_cache(ca, c, i)
831                         if (bch_dev_journal_alloc(ca)) {
832                                 percpu_ref_put(&ca->ref);
833                                 goto err;
834                         }
835
836                 bch_initial_gc(c, NULL);
837
838                 /*
839                  * journal_res_get() will crash if called before this has
840                  * set up the journal.pin FIFO and journal.cur pointer:
841                  */
842                 bch_journal_start(c);
843                 bch_journal_set_replay_done(&c->journal);
844
845                 err = "error starting allocator thread";
846                 for_each_cache(ca, c, i)
847                         if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
848                             bch_dev_allocator_start(ca)) {
849                                 percpu_ref_put(&ca->ref);
850                                 goto err;
851                         }
852
853                 err = "cannot allocate new btree root";
854                 for (id = 0; id < BTREE_ID_NR; id++)
855                         if (bch_btree_root_alloc(c, id, &cl)) {
856                                 closure_sync(&cl);
857                                 goto err;
858                         }
859
860                 /* Wait for new btree roots to be written: */
861                 closure_sync(&cl);
862
863                 bch_inode_init(c, &inode, 0, 0,
864                                S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
865                 inode.inum = BCACHE_ROOT_INO;
866
867                 bch_inode_pack(&packed_inode, &inode);
868
869                 err = "error creating root directory";
870                 if (bch_btree_insert(c, BTREE_ID_INODES,
871                                      &packed_inode.inode.k_i,
872                                      NULL, NULL, NULL, 0))
873                         goto err;
874
875                 err = "error writing first journal entry";
876                 if (bch_journal_meta(&c->journal))
877                         goto err;
878         }
879 recovery_done:
880         if (c->opts.read_only) {
881                 bch_fs_read_only_sync(c);
882         } else {
883                 err = __bch_fs_read_write(c);
884                 if (err)
885                         goto err;
886         }
887
888         mutex_lock(&c->sb_lock);
889         mi = bch_sb_get_members(c->disk_sb);
890         now = ktime_get_seconds();
891
892         rcu_read_lock();
893         for_each_cache_rcu(ca, c, i)
894                 mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
895         rcu_read_unlock();
896
897         SET_BCH_SB_INITIALIZED(c->disk_sb, true);
898         SET_BCH_SB_CLEAN(c->disk_sb, false);
899         c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
900
901         bch_write_super(c);
902         mutex_unlock(&c->sb_lock);
903
904         err = "dynamic fault";
905         if (bch_fs_init_fault("fs_start"))
906                 goto err;
907
908         err = "error creating kobject";
909         if (bch_fs_online(c))
910                 goto err;
911
912         err = "can't bring up blockdev volumes";
913         if (bch_blockdev_volumes_start(c))
914                 goto err;
915
916         bch_debug_init_cache_set(c);
917         set_bit(BCH_FS_RUNNING, &c->flags);
918         bch_attach_backing_devs(c);
919
920         bch_notify_fs_read_write(c);
921         err = NULL;
922 out:
923         bch_journal_entries_free(&journal);
924         closure_put(&c->caching);
925         return err;
926 err:
927         switch (ret) {
928         case BCH_FSCK_ERRORS_NOT_FIXED:
929                 bch_err(c, "filesystem contains errors: please report this to the developers");
930                 pr_cont("mount with -o fix_errors to repair");
931                 err = "fsck error";
932                 break;
933         case BCH_FSCK_REPAIR_UNIMPLEMENTED:
934                 bch_err(c, "filesystem contains errors: please report this to the developers");
935                 pr_cont("repair unimplemented: inform the developers so that it can be added");
936                 err = "fsck error";
937                 break;
938         case BCH_FSCK_REPAIR_IMPOSSIBLE:
939                 bch_err(c, "filesystem contains errors, but repair impossible");
940                 err = "fsck error";
941                 break;
942         case BCH_FSCK_UNKNOWN_VERSION:
943                 err = "unknown metadata version";
944                 break;
945         case -ENOMEM:
946                 err = "cannot allocate memory";
947                 break;
948         case -EIO:
949                 err = "IO error";
950                 break;
951         }
952
953         BUG_ON(!err);
954         set_bit(BCH_FS_ERROR, &c->flags);
955         goto out;
956 }
957
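/* Check that a device's superblock is compatible with this cache set: */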
958 static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c)
959 {
960         struct bch_sb_field_members *sb_mi;
961
962         sb_mi = bch_sb_get_members(sb);
963         if (!sb_mi)
964                 return "Invalid superblock: member info area missing";
965
966         if (le16_to_cpu(sb->block_size) != c->sb.block_size)
967                 return "mismatched block size";
968
969         if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
970             BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
971                 return "new cache bucket_size is too small";
972
973         return NULL;
974 }
975
976 static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
977 {
978         struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
979         struct bch_sb_field_members *dev_mi = bch_sb_get_members(sb);
980         uuid_le dev_uuid = dev_mi->members[sb->dev_idx].uuid;
981         const char *err;
982
983         err = bch_dev_may_add(sb, c);
984         if (err)
985                 return err;
986
987         if (bch_is_zero(&dev_uuid, sizeof(dev_uuid)))
988                 return "device has been removed";
989
990         /*
991          * When attaching an existing device, the cache set superblock must
992          * already contain member_info with a matching UUID
993          */
994         if (sb->dev_idx >= c->disk_sb->nr_devices ||
995             memcmp(&mi->members[sb->dev_idx].uuid,
996                    &dev_uuid, sizeof(uuid_le)))
997                 return "cache sb does not match set";
998
999         return NULL;
1000 }
1001
1002 /* Cache device */
1003
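/*
 * Transition a member device to read-only: stop moving GC and the allocator,
 * drop the device from the journal device group, and persist the new member
 * state in the superblock. Forces the whole filesystem read-only first if
 * this device cannot be removed without doing so.
 */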
1004 bool bch_dev_read_only(struct cache *ca)
1005 {
1006         struct cache_set *c = ca->set;
1007         struct bch_sb_field_members *mi;
1008         char buf[BDEVNAME_SIZE];
1009
1010         bdevname(ca->disk_sb.bdev, buf);
1011
1012         lockdep_assert_held(&bch_register_lock);
1013
1014         if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
1015                 return false;
1016
1017         if (!bch_dev_may_remove(ca)) {
1018                 bch_err(c, "required member %s going RO, forcing fs RO", buf);
1019                 bch_fs_read_only_sync(c);
1020         }
1021
1022         trace_bcache_cache_read_only(ca);
1023
1024         bch_moving_gc_stop(ca);
1025
1026         /*
1027          * This stops new data writes (e.g. to existing open data
1028          * buckets) and then waits for all existing writes to
1029          * complete.
1030          */
1031         bch_dev_allocator_stop(ca);
1032
1033         bch_dev_group_remove(&c->journal.devs, ca);
1034
1035         /*
1036          * Device data write barrier -- no non-meta-data writes should
1037          * occur after this point.  However, writes to btree buckets,
1038          * journal buckets, and the superblock can still occur.
1039          */
1040         trace_bcache_cache_read_only_done(ca);
1041
1042         bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
1043         bch_notify_dev_read_only(ca);
1044
1045         mutex_lock(&c->sb_lock);
1046         mi = bch_sb_get_members(c->disk_sb);
1047         SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
1048                              BCH_MEMBER_STATE_RO);
1049         bch_write_super(c);
1050         mutex_unlock(&c->sb_lock);
1051         return true;
1052 }
1053
1054 static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
1055 {
1056         lockdep_assert_held(&bch_register_lock);
1057
1058         if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
1059                 return NULL;
1060
1061         if (test_bit(BCH_DEV_REMOVING, &ca->flags))
1062                 return "removing";
1063
1064         trace_bcache_cache_read_write(ca);
1065
1066         if (bch_dev_allocator_start(ca))
1067                 return "error starting allocator thread";
1068
1069         if (bch_moving_gc_thread_start(ca))
1070                 return "error starting moving GC thread";
1071
1072         bch_dev_group_add(&c->journal.devs, ca);
1073
1074         wake_up_process(c->tiering_read);
1075
1076         bch_notify_dev_read_write(ca);
1077         trace_bcache_cache_read_write_done(ca);
1078
1079         return NULL;
1080 }
1081
1082 const char *bch_dev_read_write(struct cache *ca)
1083 {
1084         struct cache_set *c = ca->set;
1085         struct bch_sb_field_members *mi;
1086         const char *err;
1087
1088         err = __bch_dev_read_write(c, ca);
1089         if (err)
1090                 return err;
1091
1092         mutex_lock(&c->sb_lock);
1093         mi = bch_sb_get_members(c->disk_sb);
1094         SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
1095                              BCH_MEMBER_STATE_ACTIVE);
1096         bch_write_super(c);
1097         mutex_unlock(&c->sb_lock);
1098
1099         return NULL;
1100 }
1101
1102 /*
1103  * bch_dev_stop has already returned, so we no longer hold the register
1104  * lock at the point this is called.
1105  */
1106
1107 void bch_dev_release(struct kobject *kobj)
1108 {
1109         struct cache *ca = container_of(kobj, struct cache, kobj);
1110
1111         percpu_ref_exit(&ca->ref);
1112         kfree(ca);
1113 }
1114
1115 static void bch_dev_free_work(struct work_struct *work)
1116 {
1117         struct cache *ca = container_of(work, struct cache, free_work);
1118         struct cache_set *c = ca->set;
1119         unsigned i;
1120
1121         cancel_work_sync(&ca->io_error_work);
1122
1123         if (c && c->kobj.state_in_sysfs) {
1124                 char buf[12];
1125
1126                 sprintf(buf, "cache%u", ca->dev_idx);
1127                 sysfs_remove_link(&c->kobj, buf);
1128         }
1129
1130         if (ca->kobj.state_in_sysfs)
1131                 kobject_del(&ca->kobj);
1132
1133         bch_free_super(&ca->disk_sb);
1134
1135         /*
1136          * bch_dev_stop can be called in the middle of initialization
1137          * of the struct cache object.
1138          * As such, not all the sub-structures may be initialized.
1139          * However, they were zeroed when the object was allocated.
1140          */
1141
1142         bch_journal_free_cache(ca);
1143         free_percpu(ca->sectors_written);
1144         bioset_exit(&ca->replica_set);
1145         free_percpu(ca->bucket_stats_percpu);
1146         free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1147         kfree(ca->prio_buckets);
1148         kfree(ca->bio_prio);
1149         kfree(ca->journal.bio);
1150         vfree(ca->buckets);
1151         vfree(ca->oldest_gens);
1152         free_heap(&ca->heap);
1153         free_fifo(&ca->free_inc);
1154
1155         for (i = 0; i < RESERVE_NR; i++)
1156                 free_fifo(&ca->free[i]);
1157
1158         kobject_put(&ca->kobj);
1159
1160         if (c)
1161                 kobject_put(&c->kobj);
1162 }
1163
1164 static void bch_dev_percpu_ref_release(struct percpu_ref *ref)
1165 {
1166         struct cache *ca = container_of(ref, struct cache, ref);
1167
1168         schedule_work(&ca->free_work);
1169 }
1170
1171 static void bch_dev_free_rcu(struct rcu_head *rcu)
1172 {
1173         struct cache *ca = container_of(rcu, struct cache, free_rcu);
1174
1175         /*
1176          * This decrements the ref count to ca, and once the ref count
1177          * is 0 (outstanding bios to the ca also incremented it and
1178          * decrement it on completion/error), bch_dev_percpu_ref_release
1179          * is called, and that eventually results in bch_dev_free_work
1180          * being called, which in turn results in bch_dev_release being
1181          * called.
1182          *
1183          * In particular, these functions won't be called until there are no
1184          * bios outstanding (the per-cpu ref counts are all 0), so it
1185          * is safe to remove the actual sysfs device at that point,
1186          * and that can indicate success to the user.
1187          */
1188
1189         percpu_ref_kill(&ca->ref);
1190 }
1191
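/*
 * Detach the device from the cache set's c->cache[] array and kick off
 * teardown: after an RCU grace period the device's percpu ref is killed,
 * which eventually ends in bch_dev_free_work()/bch_dev_release().
 */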
1192 static void bch_dev_stop(struct cache *ca)
1193 {
1194         struct cache_set *c = ca->set;
1195
1196         lockdep_assert_held(&bch_register_lock);
1197
1198         if (c) {
1199                 BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
1200                 rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
1201         }
1202
1203         call_rcu(&ca->free_rcu, bch_dev_free_rcu);
1204 }
1205
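/*
 * Worker that completes device removal: migrate data and metadata off the
 * device (or flag the data bad on a forced removal), then drop the device
 * from the journal, stop it, and clear its slot in the superblock's member
 * array.
 */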
1206 static void bch_dev_remove_work(struct work_struct *work)
1207 {
1208         struct cache *ca = container_of(work, struct cache, remove_work);
1209         struct bch_sb_field_members *mi;
1210         struct cache_set *c = ca->set;
1211         char name[BDEVNAME_SIZE];
1212         bool force = test_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
1213         unsigned dev_idx = ca->dev_idx;
1214
1215         bdevname(ca->disk_sb.bdev, name);
1216
1217         /*
1218          * Device should already be RO, now migrate data off:
1219          *
1220          * XXX: locking is sketchy, bch_dev_read_write() has to check
1221          * BCH_DEV_REMOVING bit
1222          */
1223         if (!ca->mi.has_data) {
1224                 /* Nothing to do: */
1225         } else if (!bch_move_data_off_device(ca)) {
1226                 mutex_lock(&c->sb_lock);
1227                 mi = bch_sb_get_members(c->disk_sb);
1228                 SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
1229
1230                 bch_write_super(c);
1231                 mutex_unlock(&c->sb_lock);
1232         } else if (force) {
1233                 bch_flag_data_bad(ca);
1234
1235                 mutex_lock(&c->sb_lock);
1236                 mi = bch_sb_get_members(c->disk_sb);
1237                 SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
1238
1239                 bch_write_super(c);
1240                 mutex_unlock(&c->sb_lock);
1241         } else {
1242                 bch_err(c, "Remove of %s failed, unable to migrate data off",
1243                         name);
1244                 clear_bit(BCH_DEV_REMOVING, &ca->flags);
1245                 return;
1246         }
1247
1248         /* Now metadata: */
1249
1250         if (!ca->mi.has_metadata) {
1251                 /* Nothing to do: */
1252         } else if (!bch_move_meta_data_off_device(ca)) {
1253                 mutex_lock(&c->sb_lock);
1254                 mi = bch_sb_get_members(c->disk_sb);
1255                 SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
1256
1257                 bch_write_super(c);
1258                 mutex_unlock(&c->sb_lock);
1259         } else {
1260                 bch_err(c, "Remove of %s failed, unable to migrate metadata off",
1261                         name);
1262                 clear_bit(BCH_DEV_REMOVING, &ca->flags);
1263                 return;
1264         }
1265
1266         /*
1267          * Ok, really doing the remove:
1268          * Drop device's prio pointer before removing it from superblock:
1269          */
1270         bch_notify_dev_removed(ca);
1271
1272         spin_lock(&c->journal.lock);
1273         c->journal.prio_buckets[dev_idx] = 0;
1274         spin_unlock(&c->journal.lock);
1275
1276         bch_journal_meta(&c->journal);
1277
1278         /*
1279          * Stop device before removing it from the cache set's list of devices -
1280          * and get our own ref on cache set since ca is going away:
1281          */
1282         closure_get(&c->cl);
1283
1284         mutex_lock(&bch_register_lock);
1285         bch_dev_stop(ca);
1286
1287         /*
1288          * RCU barrier between dropping the device from c->cache and dropping it from
1289          * member info:
1290          */
1291         synchronize_rcu();
1292
1293         lockdep_assert_held(&bch_register_lock);
1294
1295         /*
1296          * Free this device's slot in the bch_member array - all pointers to
1297          * this device must be gone:
1298          */
1299         mutex_lock(&c->sb_lock);
1300         mi = bch_sb_get_members(c->disk_sb);
1301         memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
1302
1303         bch_write_super(c);
1304         mutex_unlock(&c->sb_lock);
1305
1306         mutex_unlock(&bch_register_lock);
1307
1308         closure_put(&c->cl);
1309 }
1310
1311 bool bch_dev_remove(struct cache *ca, bool force)
1312 {
1313         mutex_lock(&bch_register_lock);
1314
1315         if (test_bit(BCH_DEV_REMOVING, &ca->flags)) {
                     mutex_unlock(&bch_register_lock);
1316                 return false;
             }
1317
1318         if (!bch_dev_may_remove(ca)) {
1319                 bch_err(ca->set, "Can't remove last device in tier %u",
1320                         ca->mi.tier);
1321                 bch_notify_dev_remove_failed(ca);
                     mutex_unlock(&bch_register_lock);
1322                 return false;
1323         }
1324
1325         /* First, go RO before we try to migrate data off: */
1326         bch_dev_read_only(ca);
1327
1328         if (force)
1329                 set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
1330         set_bit(BCH_DEV_REMOVING, &ca->flags);
1331         bch_notify_dev_removing(ca);
1332
1333         mutex_unlock(&bch_register_lock);
1334
1335         /* Migrate the data and finish removal asynchronously: */
1336
1337         queue_work(system_long_wq, &ca->remove_work);
1338         return true;
1339 }
1340
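/* Create the sysfs links between a member device and its cache set: */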
1341 static int bch_dev_online(struct cache *ca)
1342 {
1343         char buf[12];
1344
1345         lockdep_assert_held(&bch_register_lock);
1346
1347         sprintf(buf, "cache%u", ca->dev_idx);
1348
1349         if (kobject_add(&ca->kobj,
1350                         &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
1351                         "bcache") ||
1352             sysfs_create_link(&ca->kobj, &ca->set->kobj, "set") ||
1353             sysfs_create_link(&ca->set->kobj, &ca->kobj, buf))
1354                 return -1;
1355
1356         return 0;
1357 }
1358
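/*
 * Allocate a struct cache for one member device: take over the superblock
 * handle, copy in the member info, size and allocate the freelists, bucket
 * arrays and per-cpu stats, and attach the device to the cache set.
 */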
1359 static const char *bch_dev_alloc(struct bcache_superblock *sb,
1360                                  struct cache_set *c,
1361                                  struct cache **ret)
1362 {
1363         struct bch_member *member;
1364         size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
1365         size_t heap_size;
1366         unsigned i;
1367         const char *err = "cannot allocate memory";
1368         struct cache *ca;
1369
1370         if (c->sb.nr_devices == 1)
1371                 bdevname(sb->bdev, c->name);
1372
1373         if (bch_fs_init_fault("dev_alloc"))
1374                 return err;
1375
1376         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1377         if (!ca)
1378                 return err;
1379
1380         if (percpu_ref_init(&ca->ref, bch_dev_percpu_ref_release,
1381                             0, GFP_KERNEL)) {
1382                 kfree(ca);
1383                 return err;
1384         }
1385
1386         kobject_init(&ca->kobj, &bch_dev_ktype);
1387
1388         spin_lock_init(&ca->self.lock);
1389         ca->self.nr_devices = 1;
1390         rcu_assign_pointer(ca->self.d[0].dev, ca);
1391         ca->dev_idx = sb->sb->dev_idx;
1392
1393         INIT_WORK(&ca->free_work, bch_dev_free_work);
1394         INIT_WORK(&ca->remove_work, bch_dev_remove_work);
1395         spin_lock_init(&ca->freelist_lock);
1396         spin_lock_init(&ca->prio_buckets_lock);
1397         mutex_init(&ca->heap_lock);
1398         bch_moving_init_cache(ca);
1399
1400         ca->disk_sb = *sb;
1401         ca->disk_sb.bdev->bd_holder = ca;
1402         memset(sb, 0, sizeof(*sb));
1403
1404         INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
1405
1406         err = "dynamic fault";
1407         if (bch_fs_init_fault("dev_alloc"))
1408                 goto err;
1409
1410         member = bch_sb_get_members(ca->disk_sb.sb)->members +
1411                 ca->disk_sb.sb->dev_idx;
1412
1413         ca->mi = cache_mi_to_cpu_mi(member);
1414         ca->uuid = member->uuid;
1415         ca->bucket_bits = ilog2(ca->mi.bucket_size);
1416
1417         /* XXX: tune these */
1418         movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
1419         reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
1420         /*
1421          * free_inc must be smaller than the copygc reserve: if it was bigger,
1422          * one copygc iteration might not make enough buckets available to fill
1423          * up free_inc and allow the allocator to make forward progress
1424          */
1425         free_inc_reserve = movinggc_reserve / 2;
1426         heap_size = movinggc_reserve * 8;
1427
1428         if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1429             !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
1430             !init_fifo(&ca->free[RESERVE_MOVINGGC],
1431                        movinggc_reserve, GFP_KERNEL) ||
1432             !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1433             !init_fifo(&ca->free_inc,   free_inc_reserve, GFP_KERNEL) ||
1434             !init_heap(&ca->heap,       heap_size, GFP_KERNEL) ||
1435             !(ca->oldest_gens   = vzalloc(sizeof(u8) *
1436                                           ca->mi.nbuckets)) ||
1437             !(ca->buckets       = vzalloc(sizeof(struct bucket) *
1438                                           ca->mi.nbuckets)) ||
1439             !(ca->prio_buckets  = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1440                                           2, GFP_KERNEL)) ||
1441             !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1442             !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
1443             !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
1444             bioset_init(&ca->replica_set, 4,
1445                         offsetof(struct bch_write_bio, bio)) ||
1446             !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
1447             bch_journal_init_cache(ca))
1448                 goto err;
1449
1450         ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1451
1452         total_reserve = ca->free_inc.size;
1453         for (i = 0; i < RESERVE_NR; i++)
1454                 total_reserve += ca->free[i].size;
1455         pr_debug("%zu buckets reserved", total_reserve);
1456
1457         ca->copygc_write_point.group = &ca->self;
1458         ca->tiering_write_point.group = &ca->self;
1459
1460         /*
1461          * Increase journal write timeout if flushes to this device are
1462          * expensive:
1463          */
1464         if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) &&
1465             journal_flushes_device(ca))
1466                 c->journal.write_delay_ms =
1467                         max(c->journal.write_delay_ms, 1000U);
1468
1469         kobject_get(&c->kobj);
1470         ca->set = c;
1471
1472         kobject_get(&ca->kobj);
1473         rcu_assign_pointer(c->cache[ca->dev_idx], ca);
1474
1475         mutex_lock(&c->sb_lock);
1476
1477         if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb->seq))
1478                 bch_sb_to_cache_set(c, ca->disk_sb.sb);
1479
1480         mutex_unlock(&c->sb_lock);
1481
1482         err = "error creating kobject";
1483         if (c->kobj.state_in_sysfs &&
1484             bch_dev_online(ca))
1485                 goto err;
1486
1487         if (ret)
1488                 *ret = ca;
1489         else
1490                 kobject_put(&ca->kobj);
1491         return NULL;
1492 err:
1493         bch_dev_stop(ca);
1494         return err;
1495 }
1496
1497 static struct cache_set *bch_fs_lookup(uuid_le uuid)
1498 {
1499         struct cache_set *c;
1500
1501         lockdep_assert_held(&bch_register_lock);
1502
1503         list_for_each_entry(c, &bch_fs_list, list)
1504                 if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
1505                         return c;
1506
1507         return NULL;
1508 }
1509
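/*
 * Hot-add a device to a running cache set: validate its superblock, find a
 * free slot in the member array (or grow it), allocate the struct cache,
 * write out the updated superblock, allocate journal buckets, and bring the
 * device read-write if its member state is active.
 */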
1510 int bch_dev_add(struct cache_set *c, const char *path)
1511 {
1512         struct bcache_superblock sb;
1513         const char *err;
1514         struct cache *ca;
1515         struct bch_sb_field *f;
1516         struct bch_sb_field_members *mi, *dev_mi;
1517         struct bch_member saved_mi;
1518         unsigned dev_idx, nr_devices, u64s;
1519         int ret = -EINVAL;
1520
1521         mutex_lock(&bch_register_lock);
1522
1523         err = bch_read_super(&sb, c->opts, path);
1524         if (err)
1525                 goto err_unlock_register;
1526
1527         err = bch_validate_cache_super(&sb);
1528         if (err)
1529                 goto err_unlock_register;
1530
1531         mutex_lock(&c->sb_lock);
1532
1533         err = bch_dev_may_add(sb.sb, c);
1534         if (err)
1535                 goto err_unlock;
1536
1537         /*
1538          * Preserve the old cache member information (esp. tier)
1539          * before we start bashing the disk stuff.
1540          */
1541         dev_mi = bch_sb_get_members(sb.sb);
1542         saved_mi = dev_mi->members[sb.sb->dev_idx];
1543         saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
1544
1545         down_read(&c->gc_lock);
1546
1547         if (dynamic_fault("bcache:add:no_slot"))
1548                 goto no_slot;
1549
1550         if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
1551                 goto no_slot;
1552
1553         mi = bch_sb_get_members(c->disk_sb);
1554         for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
1555                 if (dev_idx >= c->sb.nr_devices ||
1556                     bch_is_zero(mi->members[dev_idx].uuid.b,
1557                                  sizeof(uuid_le)))
1558                         goto have_slot;
1559 no_slot:
1560         up_read(&c->gc_lock);
1561
1562         err = "no slots available in superblock";
1563         ret = -ENOSPC;
1564         goto err_unlock;
1565
1566 have_slot:
1567         up_read(&c->gc_lock);
1568
1569         nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
1570         u64s = (sizeof(struct bch_sb_field_members) +
1571                 sizeof(struct bch_member) * nr_devices) / sizeof(u64);
1572         err = "no space in superblock for member info";
1573
1574         f = bch_fs_sb_field_resize(c, &mi->field, u64s);
1575         if (!f)
1576                 goto err_unlock;
1577
1578         mi = container_of(f, struct bch_sb_field_members, field);
1579
1580         f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s);
1581         if (!f)
1582                 goto err_unlock;
1583
1584         dev_mi = container_of(f, struct bch_sb_field_members, field);
1585         memcpy(dev_mi, mi, u64s * sizeof(u64));
1586         dev_mi->members[dev_idx] = saved_mi;
1587
1588         sb.sb->dev_idx          = dev_idx;
1589         sb.sb->nr_devices       = nr_devices;
1590
1591         if (bch_fs_mi_update(c, dev_mi->members, nr_devices)) {
1592                 err = "cannot allocate memory";
1593                 ret = -ENOMEM;
1594                 goto err_unlock;
1595         }
1596
1597         /* commit new member info */
1598         memcpy(mi, dev_mi, u64s * sizeof(u64));
1599         c->disk_sb->nr_devices  = nr_devices;
1600         c->sb.nr_devices        = nr_devices;
1601
1602         err = bch_dev_alloc(&sb, c, &ca);
1603         if (err)
1604                 goto err_unlock;
1605
1606         bch_write_super(c);
1607
1608         err = "journal alloc failed";
1609         if (bch_dev_journal_alloc(ca))
1610                 goto err_put;
1611
1612         bch_notify_dev_added(ca);
1613
1614         if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
1615                 err = __bch_dev_read_write(c, ca);
1616                 if (err)
1617                         goto err_put;
1618         }
1619
1620         kobject_put(&ca->kobj);
1621         mutex_unlock(&c->sb_lock);
1622         mutex_unlock(&bch_register_lock);
1623         return 0;
1624 err_put:
1625         bch_dev_stop(ca);
1626 err_unlock:
1627         mutex_unlock(&c->sb_lock);
1628 err_unlock_register:
1629         mutex_unlock(&bch_register_lock);
1630         bch_free_super(&sb);
1631
1632         bch_err(c, "Unable to add device: %s", err);
1633         return ret ?: -EINVAL;
1634 }
1635
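/*
 * bch_fs_open - open a cache set from an explicit list of member devices
 *
 * All superblocks are read and validated under bch_register_lock so that the
 * exclusive opens are atomic with the duplicate-UUID check. The cache set is
 * allocated from the first superblock, every device is attached, and the
 * filesystem is only started once all members listed in the superblock are
 * present. On success, if @ret is non-NULL the caller receives a reference
 * to the new cache_set.
 */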
1636 const char *bch_fs_open(char * const *devices, unsigned nr_devices,
1637                         struct bch_opts opts, struct cache_set **ret)
1638 {
1639         const char *err;
1640         struct cache_set *c = NULL;
1641         struct bcache_superblock *sb;
1643         unsigned i;
1644
1647         if (!nr_devices)
1648                 return "need at least one device";
1649
1650         if (!try_module_get(THIS_MODULE))
1651                 return "module unloading";
1652
1653         err = "cannot allocate memory";
1654         sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
1655         if (!sb)
1656                 goto err;
1657
1658         /*
1659          * bch_read_super() needs to happen under register_lock, so that the
1660          * exclusive open is atomic with adding the new cache set to the list of
1661          * cache sets:
1662          */
1663         mutex_lock(&bch_register_lock);
1664
1665         for (i = 0; i < nr_devices; i++) {
1666                 err = bch_read_super(&sb[i], opts, devices[i]);
1667                 if (err)
1668                         goto err_unlock;
1669
1670                 err = "attempting to register backing device";
1671                 if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
1672                         goto err_unlock;
1673
1674                 err = bch_validate_cache_super(&sb[i]);
1675                 if (err)
1676                         goto err_unlock;
1677         }
1678
1679         err = "cache set already registered";
1680         if (bch_fs_lookup(sb[0].sb->uuid))
1681                 goto err_unlock;
1682
1683         err = "cannot allocate memory";
1684         c = bch_fs_alloc(sb[0].sb, opts);
1685         if (!c)
1686                 goto err_unlock;
1687
1688         for (i = 0; i < nr_devices; i++) {
1689                 err = bch_dev_alloc(&sb[i], c, NULL);
1690                 if (err)
1691                         goto err_unlock;
1692         }
1693
1694         err = "insufficient devices";
1695         if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c))
1696                 goto err_unlock;
1697
1698         err = bch_fs_start(c);
1699         if (err)
1700                 goto err_unlock;
1701
1702         err = "error creating kobject";
1703         if (bch_fs_online(c))
1704                 goto err_unlock;
1705
1706         if (ret) {
1707                 closure_get(&c->cl);
1708                 *ret = c;
1709         }
1710
1711         mutex_unlock(&bch_register_lock);
1712
1713         err = NULL;
1714 out:
1715         kfree(sb);
1716         module_put(THIS_MODULE);
1717         if (err)
1718                 c = NULL;
1719         return err;
1720 err_unlock:
1721         if (c)
1722                 bch_fs_stop(c);
1723         mutex_unlock(&bch_register_lock);
1724 err:
1725         for (i = 0; i < nr_devices; i++)
1726                 bch_free_super(&sb[i]);
1727         goto out;
1728 }
1729
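/*
 * Register a single cache device whose superblock has already been read:
 * attach it to the existing cache set with a matching UUID, or allocate a
 * new one. The set is only started once all of its member devices have
 * appeared; until then bch_fs_online() just creates its kobject. On error,
 * the cache set is torn down only if it was allocated here.
 */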
1730 static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
1731                                   struct bch_opts opts)
1732 {
1733         char name[BDEVNAME_SIZE];
1734         const char *err;
1735         struct cache_set *c;
1736         bool allocated_cache_set = false;
1737
1738         err = bch_validate_cache_super(sb);
1739         if (err)
1740                 return err;
1741
1742         bdevname(sb->bdev, name);
1743
1744         c = bch_fs_lookup(sb->sb->uuid);
1745         if (c) {
1746                 err = bch_dev_in_fs(sb->sb, c);
1747                 if (err)
1748                         return err;
1749         } else {
1750                 c = bch_fs_alloc(sb->sb, opts);
1751                 if (!c)
1752                         return "cannot allocate memory";
1753
1754                 allocated_cache_set = true;
1755         }
1756
1757         err = bch_dev_alloc(sb, c, NULL);
1758         if (err)
1759                 goto err;
1760
1761         if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) {
1762                 err = bch_fs_start(c);
1763                 if (err)
1764                         goto err;
1765         } else {
1766                 err = "error creating kobject";
1767                 if (bch_fs_online(c))
1768                         goto err;
1769         }
1770
1771         bch_info(c, "started");
1772         return NULL;
1773 err:
1774         if (allocated_cache_set)
1775                 bch_fs_stop(c);
1776         return err;
1777 }
1778
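/*
 * bch_fs_open_incremental - register one device by path
 *
 * Entry point for incremental (one-device-at-a-time) registration: reads the
 * superblock and dispatches to either the backing-device path or
 * __bch_fs_open_incremental() for cache devices.
 */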
1779 const char *bch_fs_open_incremental(const char *path)
1780 {
1781         struct bcache_superblock sb;
1782         struct bch_opts opts = bch_opts_empty();
1783         const char *err;
1784
1785         mutex_lock(&bch_register_lock);
1786
1787         err = bch_read_super(&sb, opts, path);
1788         if (err)
1789                 goto err;
1790
1791         if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
1792                 err = bch_backing_dev_register(&sb);
1793         else
1794                 err = __bch_fs_open_incremental(&sb, opts);
1795
1796         bch_free_super(&sb);
1797 err:
1798         mutex_unlock(&bch_register_lock);
1799         return err;
1800 }
1801
1802 /* Global interfaces/init */
1803
1804 #define kobj_attribute_write(n, fn)                                     \
1805         static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
1806
1807 #define kobj_attribute_rw(n, show, store)                               \
1808         static struct kobj_attribute ksysfs_##n =                       \
1809                 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1810
1811 static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1812                                const char *, size_t);
1813
1814 kobj_attribute_write(register,          register_bcache);
1815 kobj_attribute_write(register_quiet,    register_bcache);
1816
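/*
 * Write handler for the "register" and "register_quiet" sysfs attributes
 * created in bcache_init(); both currently share this handler. Userspace
 * writes a device path to trigger registration, e.g. (illustrative path):
 *
 *   echo /dev/sdb > /sys/fs/bcache/register
 */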
1817 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1818                                const char *buffer, size_t size)
1819 {
1820         ssize_t ret = -EINVAL;
1821         const char *err = "cannot allocate memory";
1822         char *path = NULL;
1823
1824         if (!try_module_get(THIS_MODULE))
1825                 return -EBUSY;
1826
1827         if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
1828                 goto err;
1829
1830         err = bch_fs_open_incremental(strim(path));
1831         if (err)
1832                 goto err;
1833
1834         ret = size;
1835 out:
1836         kfree(path);
1837         module_put(THIS_MODULE);
1838         return ret;
1839 err:
1840         pr_err("error opening %s: %s\n", path, err);
1841         goto out;
1842 }
1843
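/*
 * Reboot notifier: on shutdown, halt or power-off, flip every registered
 * cache set read-only and wait for that to finish, so the on-disk state is
 * consistent before the machine goes down. Also exercised via the "reboot"
 * sysfs attribute below for testing.
 */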
1844 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
1845 {
1846         if (code == SYS_DOWN ||
1847             code == SYS_HALT ||
1848             code == SYS_POWER_OFF) {
1849                 struct cache_set *c;
1850
1851                 mutex_lock(&bch_register_lock);
1852
1853                 if (!list_empty(&bch_fs_list))
1854                         pr_info("Setting all devices read only:\n");
1855
1856                 list_for_each_entry(c, &bch_fs_list, list)
1857                         bch_fs_read_only(c);
1858
1859                 list_for_each_entry(c, &bch_fs_list, list)
1860                         bch_fs_read_only_sync(c);
1861
1862                 mutex_unlock(&bch_register_lock);
1863         }
1864
1865         return NOTIFY_DONE;
1866 }
1867
1868 static struct notifier_block reboot = {
1869         .notifier_call  = bcache_reboot,
1870         .priority       = INT_MAX, /* before any real devices */
1871 };
1872
1873 static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
1874                            const char *buffer, size_t size)
1875 {
1876         bcache_reboot(NULL, SYS_DOWN, NULL);
1877         return size;
1878 }
1879
1880 kobj_attribute_write(reboot,            reboot_test);
1881
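/*
 * Tear down global state in the reverse order it was set up; this is also
 * the error path for bcache_init(), so every step must tolerate its
 * counterpart not having run.
 */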
1882 static void bcache_exit(void)
1883 {
1884         bch_debug_exit();
1885         bch_fs_exit();
1886         bch_blockdev_exit();
1887         bch_chardev_exit();
1888         if (bcache_kset)
1889                 kset_unregister(bcache_kset);
1890         if (bcache_io_wq)
1891                 destroy_workqueue(bcache_io_wq);
1892         if (!IS_ERR_OR_NULL(bch_sha256))
1893                 crypto_free_shash(bch_sha256);
1894         unregister_reboot_notifier(&reboot);
1895 }
1896
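/*
 * Module init: set up global state (register lock, reboot notifier, sha256
 * shash, I/O workqueue), create the /sys/fs/bcache kset and its sysfs
 * attributes, then initialize the chardev, blockdev, fs and debug
 * subsystems. Any failure unwinds through bcache_exit().
 */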
1897 static int __init bcache_init(void)
1898 {
1899         static const struct attribute *files[] = {
1900                 &ksysfs_register.attr,
1901                 &ksysfs_register_quiet.attr,
1902                 &ksysfs_reboot.attr,
1903                 NULL
1904         };
1905
1906         mutex_init(&bch_register_lock);
1907         register_reboot_notifier(&reboot);
1908         closure_debug_init();
1909         bkey_pack_test();
1910
1911         bch_sha256 = crypto_alloc_shash("sha256", 0, 0);
1912         if (IS_ERR(bch_sha256))
1913                 goto err;
1914
1915         if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
1916             !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
1917             sysfs_create_files(&bcache_kset->kobj, files) ||
1918             bch_chardev_init() ||
1919             bch_blockdev_init() ||
1920             bch_fs_init() ||
1921             bch_debug_init())
1922                 goto err;
1923
1924         return 0;
1925 err:
1926         bcache_exit();
1927         return -ENOMEM;
1928 }
1929
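/*
 * Expand each BCH_DEBUG_PARAMS() entry into a bool module parameter
 * (mode 0644), tunable at runtime under /sys/module/<module>/parameters/.
 */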
1930 #define BCH_DEBUG_PARAM(name, description)                      \
1931         bool bch_##name;                                        \
1932         module_param_named(name, bch_##name, bool, 0644);       \
1933         MODULE_PARM_DESC(name, description);
1934 BCH_DEBUG_PARAMS()
1935 #undef BCH_DEBUG_PARAM
1936
1937 module_exit(bcache_exit);
1938 module_init(bcache_init);