1 /*
2  * bcache setup/teardown code, and some metadata io - read a superblock and
3  * figure out what to do with it.
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcache.h"
10 #include "blockdev.h"
11 #include "alloc.h"
12 #include "btree_cache.h"
13 #include "btree_gc.h"
14 #include "btree_update.h"
15 #include "btree_io.h"
16 #include "chardev.h"
17 #include "checksum.h"
18 #include "clock.h"
19 #include "compress.h"
20 #include "debug.h"
21 #include "error.h"
22 #include "fs.h"
23 #include "fs-gc.h"
24 #include "inode.h"
25 #include "io.h"
26 #include "journal.h"
27 #include "keylist.h"
28 #include "move.h"
29 #include "migrate.h"
30 #include "movinggc.h"
31 #include "notify.h"
32 #include "stats.h"
33 #include "super.h"
34 #include "super-io.h"
35 #include "tier.h"
36 #include "writeback.h"
37
38 #include <linux/backing-dev.h>
39 #include <linux/blkdev.h>
40 #include <linux/debugfs.h>
41 #include <linux/device.h>
42 #include <linux/genhd.h>
43 #include <linux/idr.h>
44 #include <linux/kthread.h>
45 #include <linux/module.h>
46 #include <linux/percpu.h>
47 #include <linux/random.h>
48 #include <linux/reboot.h>
49 #include <linux/sysfs.h>
50 #include <crypto/hash.h>
51
52 #include <trace/events/bcache.h>
53
54 MODULE_LICENSE("GPL");
55 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
56
57 static const uuid_le invalid_uuid = {
58         .b = {
59                 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
60                 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
61         }
62 };
63
64 static struct kset *bcache_kset;
65 static LIST_HEAD(bch_fs_list);
66 static DEFINE_MUTEX(bch_fs_list_lock);
67
68 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
69 struct workqueue_struct *bcache_io_wq;
70 struct crypto_shash *bch_sha256;
71
72 static void bch_dev_free(struct bch_dev *);
73 static int bch_dev_alloc(struct bch_fs *, unsigned);
74 static int bch_dev_sysfs_online(struct bch_dev *);
75 static void __bch_dev_read_only(struct bch_fs *, struct bch_dev *);
76
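/*
 * Map a block device to the filesystem that has it open as a member device;
 * takes a ref on the filesystem's closure, which the caller is expected to
 * drop:
 */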
77 struct bch_fs *bch_bdev_to_fs(struct block_device *bdev)
78 {
79         struct bch_fs *c;
80         struct bch_dev *ca;
81         unsigned i;
82
83         mutex_lock(&bch_fs_list_lock);
84         rcu_read_lock();
85
86         list_for_each_entry(c, &bch_fs_list, list)
87                 for_each_member_device_rcu(ca, c, i)
88                         if (ca->disk_sb.bdev == bdev) {
89                                 closure_get(&c->cl);
90                                 goto found;
91                         }
92         c = NULL;
93 found:
94         rcu_read_unlock();
95         mutex_unlock(&bch_fs_list_lock);
96
97         return c;
98 }
99
100 static struct bch_fs *__bch_uuid_to_fs(uuid_le uuid)
101 {
102         struct bch_fs *c;
103
104         lockdep_assert_held(&bch_fs_list_lock);
105
106         list_for_each_entry(c, &bch_fs_list, list)
107                 if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
108                         return c;
109
110         return NULL;
111 }
112
113 struct bch_fs *bch_uuid_to_fs(uuid_le uuid)
114 {
115         struct bch_fs *c;
116
117         mutex_lock(&bch_fs_list_lock);
118         c = __bch_uuid_to_fs(uuid);
119         if (c)
120                 closure_get(&c->cl);
121         mutex_unlock(&bch_fs_list_lock);
122
123         return c;
124 }
125
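/*
 * Congestion check used by the BDI congested_fn callback below: reads check
 * every readable member device, writes only check the fastest tier:
 */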
126 int bch_congested(struct bch_fs *c, int bdi_bits)
127 {
128         struct backing_dev_info *bdi;
129         struct bch_dev *ca;
130         unsigned i;
131         int ret = 0;
132
133         if (bdi_bits & (1 << WB_sync_congested)) {
134                 /* Reads - check all devices: */
135                 for_each_readable_member(ca, c, i) {
136                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
137
138                         if (bdi_congested(bdi, bdi_bits)) {
139                                 ret = 1;
140                                 break;
141                         }
142                 }
143         } else {
144                 /* Writes prefer fastest tier: */
145                 struct bch_tier *tier = READ_ONCE(c->fastest_tier);
146                 struct dev_group *grp = tier ? &tier->devs : &c->all_devs;
147
148                 rcu_read_lock();
149                 group_for_each_dev(ca, grp, i) {
150                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
151
152                         if (bdi_congested(bdi, bdi_bits)) {
153                                 ret = 1;
154                                 break;
155                         }
156                 }
157                 rcu_read_unlock();
158         }
159
160         return ret;
161 }
162
163 static int bch_congested_fn(void *data, int bdi_bits)
164 {
165         struct bch_fs *c = data;
166
167         return bch_congested(c, bdi_bits);
168 }
169
170 /* Filesystem RO/RW: */
171
172 /*
173  * For startup/shutdown of RW stuff, the dependencies are:
174  *
175  * - foreground writes depend on copygc and tiering (to free up space)
176  *
177  * - copygc and tiering depend on mark and sweep gc (in practice they probably
178  *   don't, because they either reserve space ahead of time or don't block if
179  *   allocations fail; but allocations can require mark and sweep gc to run
180  *   because of generation number wraparound)
181  *
182  * - all of the above depends on the allocator threads
183  *
184  * - allocator depends on the journal (when it rewrites prios and gens)
185  */
186
187 static void __bch_fs_read_only(struct bch_fs *c)
188 {
189         struct bch_dev *ca;
190         unsigned i;
191
192         bch_tiering_stop(c);
193
194         for_each_member_device(ca, c, i)
195                 bch_moving_gc_stop(ca);
196
197         bch_gc_thread_stop(c);
198
199         bch_btree_flush(c);
200
201         for_each_member_device(ca, c, i)
202                 bch_dev_allocator_stop(ca);
203
204         bch_fs_journal_stop(&c->journal);
205 }
206
207 static void bch_writes_disabled(struct percpu_ref *writes)
208 {
209         struct bch_fs *c = container_of(writes, struct bch_fs, writes);
210
211         set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
212         wake_up(&bch_read_only_wait);
213 }
214
215 void bch_fs_read_only(struct bch_fs *c)
216 {
217         mutex_lock(&c->state_lock);
218         if (c->state != BCH_FS_STARTING &&
219             c->state != BCH_FS_RW)
220                 goto out;
221
222         if (test_bit(BCH_FS_ERROR, &c->flags))
223                 goto out;
224
225         trace_fs_read_only(c);
226
227         /*
228          * Block new foreground-end write operations from starting - any new
229          * writes will return -EROFS:
230          *
231          * (This is really blocking new _allocations_; writes to previously
232          * allocated space can still happen until we stop the allocator in
233          * bch_dev_allocator_stop()).
234          */
235         percpu_ref_kill(&c->writes);
236
237         del_timer(&c->foreground_write_wakeup);
238         cancel_delayed_work(&c->pd_controllers_update);
239
240         c->foreground_write_pd.rate.rate = UINT_MAX;
241         bch_wake_delayed_writes((unsigned long) c);
242
243         /*
244          * If we're not doing an emergency shutdown, we want to wait on
245          * outstanding writes to complete so they don't see spurious errors due
246          * to shutting down the allocator:
247          *
248          * If we are doing an emergency shutdown, outstanding writes may
249          * hang until we shut down the allocator, so we don't want to wait
250          * on outstanding writes before shutting everything down - but
251          * we do need to wait on them before returning and signalling
252          * that going RO is complete:
253          */
254         wait_event(bch_read_only_wait,
255                    test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
256                    test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
257
258         __bch_fs_read_only(c);
259
260         wait_event(bch_read_only_wait,
261                    test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
262
263         clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
264
265         if (!bch_journal_error(&c->journal) &&
266             !test_bit(BCH_FS_ERROR, &c->flags)) {
267                 mutex_lock(&c->sb_lock);
268                 SET_BCH_SB_CLEAN(c->disk_sb, true);
269                 bch_write_super(c);
270                 mutex_unlock(&c->sb_lock);
271         }
272
273         c->state = BCH_FS_RO;
274         bch_notify_fs_read_only(c);
275         trace_fs_read_only_done(c);
276 out:
277         mutex_unlock(&c->state_lock);
278 }
279
280 static void bch_fs_read_only_work(struct work_struct *work)
281 {
282         struct bch_fs *c =
283                 container_of(work, struct bch_fs, read_only_work);
284
285         bch_fs_read_only(c);
286 }
287
288 static void bch_fs_read_only_async(struct bch_fs *c)
289 {
290         queue_work(system_long_wq, &c->read_only_work);
291 }
292
293 bool bch_fs_emergency_read_only(struct bch_fs *c)
294 {
295         bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
296
297         bch_fs_read_only_async(c);
298         bch_journal_halt(&c->journal);
299
300         wake_up(&bch_read_only_wait);
301         return ret;
302 }
303
304 const char *bch_fs_read_write(struct bch_fs *c)
305 {
306         struct bch_dev *ca;
307         const char *err = NULL;
308         unsigned i;
309
310         mutex_lock(&c->state_lock);
311         if (c->state != BCH_FS_STARTING &&
312             c->state != BCH_FS_RO)
313                 goto out;
314
315         err = "error starting allocator thread";
316         for_each_rw_member(ca, c, i)
317                 if (bch_dev_allocator_start(ca)) {
318                         percpu_ref_put(&ca->io_ref);
319                         goto err;
320                 }
321
322         err = "error starting btree GC thread";
323         if (bch_gc_thread_start(c))
324                 goto err;
325
326         err = "error starting moving GC thread";
327         for_each_rw_member(ca, c, i)
328                 if (bch_moving_gc_start(ca)) {
329                         percpu_ref_put(&ca->io_ref);
330                         goto err;
331                 }
332
333         err = "error starting tiering thread";
334         if (bch_tiering_start(c))
335                 goto err;
336
337         schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
338
339         if (c->state != BCH_FS_STARTING)
340                 percpu_ref_reinit(&c->writes);
341
342         c->state = BCH_FS_RW;
343         err = NULL;
344 out:
345         mutex_unlock(&c->state_lock);
346         return err;
347 err:
348         __bch_fs_read_only(c);
349         goto out;
350 }
351
352 /* Filesystem startup/shutdown: */
353
354 static void bch_fs_free(struct bch_fs *c)
355 {
356         bch_fs_encryption_exit(c);
357         bch_fs_btree_exit(c);
358         bch_fs_journal_exit(&c->journal);
359         bch_io_clock_exit(&c->io_clock[WRITE]);
360         bch_io_clock_exit(&c->io_clock[READ]);
361         bch_fs_compress_exit(c);
362         bch_fs_blockdev_exit(c);
363         bdi_destroy(&c->bdi);
364         lg_lock_free(&c->usage_lock);
365         free_percpu(c->usage_percpu);
366         mempool_exit(&c->btree_bounce_pool);
367         mempool_exit(&c->bio_bounce_pages);
368         bioset_exit(&c->bio_write);
369         bioset_exit(&c->bio_read_split);
370         bioset_exit(&c->bio_read);
371         bioset_exit(&c->btree_read_bio);
372         mempool_exit(&c->btree_interior_update_pool);
373         mempool_exit(&c->btree_reserve_pool);
374         mempool_exit(&c->fill_iter);
375         percpu_ref_exit(&c->writes);
376
377         if (c->copygc_wq)
378                 destroy_workqueue(c->copygc_wq);
379         if (c->wq)
380                 destroy_workqueue(c->wq);
381
382         free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
383         kfree(c);
384         module_put(THIS_MODULE);
385 }
386
387 static void bch_fs_exit(struct bch_fs *c)
388 {
389         unsigned i;
390
391         del_timer_sync(&c->foreground_write_wakeup);
392         cancel_delayed_work_sync(&c->pd_controllers_update);
393         cancel_work_sync(&c->read_only_work);
394         cancel_work_sync(&c->bio_submit_work);
395         cancel_work_sync(&c->read_retry_work);
396
397         for (i = 0; i < c->sb.nr_devices; i++)
398                 if (c->devs[i])
399                         bch_dev_free(c->devs[i]);
400
401         closure_debug_destroy(&c->cl);
402         kobject_put(&c->kobj);
403 }
404
405 static void bch_fs_offline(struct bch_fs *c)
406 {
407         struct bch_dev *ca;
408         unsigned i;
409
410         mutex_lock(&bch_fs_list_lock);
411         list_del(&c->list);
412         mutex_unlock(&bch_fs_list_lock);
413
414         for_each_member_device(ca, c, i)
415                 if (ca->kobj.state_in_sysfs &&
416                     ca->disk_sb.bdev)
417                         sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
418                                           "bcache");
419
420         if (c->kobj.state_in_sysfs)
421                 kobject_del(&c->kobj);
422
423         bch_fs_debug_exit(c);
424         bch_fs_chardev_exit(c);
425
426         bch_cache_accounting_destroy(&c->accounting);
427
428         kobject_put(&c->time_stats);
429         kobject_put(&c->opts_dir);
430         kobject_put(&c->internal);
431
432         __bch_fs_read_only(c);
433 }
434
435 /*
436  * Final phase of shutdown (logically __bch_fs_stop4) - block devices are
437  * closed, now we can finally free the filesystem:
438  */
439 void bch_fs_release(struct kobject *kobj)
440 {
441         struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
442
443         bch_notify_fs_stopped(c);
444         bch_fs_free(c);
445 }
446
447 /*
448  * All activity on the filesystem should have stopped now - close devices:
449  */
450 static void __bch_fs_stop3(struct closure *cl)
451 {
452         struct bch_fs *c = container_of(cl, struct bch_fs, cl);
453
454         bch_fs_exit(c);
455 }
456
457 /*
458  * Openers (i.e. block devices) should have exited; shut down all userspace
459  * interfaces and wait for &c->cl to hit 0
460  */
461 static void __bch_fs_stop2(struct closure *cl)
462 {
463         struct bch_fs *c = container_of(cl, struct bch_fs, caching);
464
465         bch_fs_offline(c);
466
467         closure_return(cl);
468 }
469
470 /*
471  * First phase of the shutdown process that's kicked off by bch_fs_stop_async();
472  * we haven't waited for anything to stop yet; we're just punting to process
473  * context to shut down block devices:
474  */
475 static void __bch_fs_stop1(struct closure *cl)
476 {
477         struct bch_fs *c = container_of(cl, struct bch_fs, caching);
478
479         bch_blockdevs_stop(c);
480
481         continue_at(cl, __bch_fs_stop2, system_wq);
482 }
483
484 void bch_fs_stop_async(struct bch_fs *c)
485 {
486         mutex_lock(&c->state_lock);
487         if (c->state != BCH_FS_STOPPING) {
488                 c->state = BCH_FS_STOPPING;
489                 closure_queue(&c->caching);
490         }
491         mutex_unlock(&c->state_lock);
492 }
493
494 void bch_fs_stop(struct bch_fs *c)
495 {
496         mutex_lock(&c->state_lock);
497         BUG_ON(c->state == BCH_FS_STOPPING);
498         c->state = BCH_FS_STOPPING;
499         mutex_unlock(&c->state_lock);
500
501         bch_blockdevs_stop(c);
502
503         closure_sync(&c->caching);
504         closure_debug_destroy(&c->caching);
505
506         bch_fs_offline(c);
507
508         closure_put(&c->cl);
509         closure_sync(&c->cl);
510
511         bch_fs_exit(c);
512 }
513
514 /* Stop, detaching from backing devices: */
515 void bch_fs_detach(struct bch_fs *c)
516 {
517         if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
518                 bch_fs_stop_async(c);
519 }
520
521 #define alloc_bucket_pages(gfp, ca)                     \
522         ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
523
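/*
 * Allocate and initialize an in-memory filesystem (struct bch_fs) from an
 * on-disk superblock; returns NULL on allocation or initialization failure:
 */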
524 static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
525 {
526         struct bch_sb_field_members *mi;
527         struct bch_fs *c;
528         unsigned i, iter_size, journal_entry_bytes;
529
530         c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL);
531         if (!c)
532                 return NULL;
533
534         __module_get(THIS_MODULE);
535
536         c->minor                = -1;
537
538         mutex_init(&c->state_lock);
539         mutex_init(&c->sb_lock);
540         INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
541         mutex_init(&c->btree_cache_lock);
542         mutex_init(&c->bucket_lock);
543         mutex_init(&c->btree_root_lock);
544         INIT_WORK(&c->read_only_work, bch_fs_read_only_work);
545
546         init_rwsem(&c->gc_lock);
547
548 #define BCH_TIME_STAT(name, frequency_units, duration_units)            \
549         spin_lock_init(&c->name##_time.lock);
550         BCH_TIME_STATS()
551 #undef BCH_TIME_STAT
552
553         bch_fs_allocator_init(c);
554         bch_fs_tiering_init(c);
555
556         INIT_LIST_HEAD(&c->list);
557         INIT_LIST_HEAD(&c->cached_devs);
558         INIT_LIST_HEAD(&c->btree_cache);
559         INIT_LIST_HEAD(&c->btree_cache_freeable);
560         INIT_LIST_HEAD(&c->btree_cache_freed);
561
562         INIT_LIST_HEAD(&c->btree_interior_update_list);
563         mutex_init(&c->btree_reserve_cache_lock);
564         mutex_init(&c->btree_interior_update_lock);
565
566         mutex_init(&c->bio_bounce_pages_lock);
567         INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
568         spin_lock_init(&c->bio_submit_lock);
569         bio_list_init(&c->read_retry_list);
570         spin_lock_init(&c->read_retry_lock);
571         INIT_WORK(&c->read_retry_work, bch_read_retry_work);
572         mutex_init(&c->zlib_workspace_lock);
573
574         seqcount_init(&c->gc_pos_lock);
575
576         c->prio_clock[READ].hand = 1;
577         c->prio_clock[READ].min_prio = 0;
578         c->prio_clock[WRITE].hand = 1;
579         c->prio_clock[WRITE].min_prio = 0;
580
581         c->congested_read_threshold_us  = 2000;
582         c->congested_write_threshold_us = 20000;
583         c->error_limit  = 16 << IO_ERROR_SHIFT;
584         init_waitqueue_head(&c->writeback_wait);
585
586         c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
587
588         c->copy_gc_enabled = 1;
589         c->tiering_enabled = 1;
590         c->tiering_percent = 10;
591
592         c->foreground_target_percent = 20;
593
594         c->journal.write_time   = &c->journal_write_time;
595         c->journal.delay_time   = &c->journal_delay_time;
596         c->journal.blocked_time = &c->journal_blocked_time;
597         c->journal.flush_seq_time = &c->journal_flush_seq_time;
598
599         mutex_init(&c->uevent_lock);
600
601         mutex_lock(&c->sb_lock);
602
603         if (bch_sb_to_fs(c, sb)) {
604                 mutex_unlock(&c->sb_lock);
605                 goto err;
606         }
607
608         mutex_unlock(&c->sb_lock);
609
610         scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
611
612         bch_opts_apply(&c->opts, bch_sb_opts(sb));
613         bch_opts_apply(&c->opts, opts);
614
615         c->opts.nochanges       |= c->opts.noreplay;
616         c->opts.read_only       |= c->opts.nochanges;
617
618         c->block_bits           = ilog2(c->sb.block_size);
619
620         if (bch_fs_init_fault("fs_alloc"))
621                 goto err;
622
623         iter_size = (btree_blocks(c) + 1) * 2 *
624                 sizeof(struct btree_node_iter_set);
625
626         journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
627
628         if (!(c->wq = alloc_workqueue("bcache",
629                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
630             !(c->copygc_wq = alloc_workqueue("bcache_copygc",
631                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
632             percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
633             mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
634                                       sizeof(struct btree_reserve)) ||
635             mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
636                                       sizeof(struct btree_interior_update)) ||
637             mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
638             bioset_init(&c->btree_read_bio, 1, 0) ||
639             bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
640             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
641             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
642             mempool_init_page_pool(&c->bio_bounce_pages,
643                                    max_t(unsigned,
644                                          c->sb.btree_node_size,
645                                          BCH_ENCODED_EXTENT_MAX) /
646                                    PAGE_SECTORS, 0) ||
647             !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
648             lg_lock_init(&c->usage_lock) ||
649             mempool_init_page_pool(&c->btree_bounce_pool, 1,
650                                    ilog2(btree_pages(c))) ||
651             bdi_setup_and_register(&c->bdi, "bcache") ||
652             bch_fs_blockdev_init(c) ||
653             bch_io_clock_init(&c->io_clock[READ]) ||
654             bch_io_clock_init(&c->io_clock[WRITE]) ||
655             bch_fs_journal_init(&c->journal, journal_entry_bytes) ||
656             bch_fs_btree_init(c) ||
657             bch_fs_encryption_init(c) ||
658             bch_fs_compress_init(c) ||
659             bch_check_set_has_compressed_data(c, c->opts.compression))
660                 goto err;
661
662         c->bdi.ra_pages         = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
663         c->bdi.congested_fn     = bch_congested_fn;
664         c->bdi.congested_data   = c;
665
666         mi = bch_sb_get_members(c->disk_sb);
667         for (i = 0; i < c->sb.nr_devices; i++)
668                 if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
669                     bch_dev_alloc(c, i))
670                         goto err;
671
672         /*
673          * Now that all allocations have succeeded, init various refcounty
674          * things that let us shutdown:
675          */
676         closure_init(&c->cl, NULL);
677
678         c->kobj.kset = bcache_kset;
679         kobject_init(&c->kobj, &bch_fs_ktype);
680         kobject_init(&c->internal, &bch_fs_internal_ktype);
681         kobject_init(&c->opts_dir, &bch_fs_opts_dir_ktype);
682         kobject_init(&c->time_stats, &bch_fs_time_stats_ktype);
683
684         bch_cache_accounting_init(&c->accounting, &c->cl);
685
686         closure_init(&c->caching, &c->cl);
687         set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
688
689         closure_get(&c->cl);
690         continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
691         return c;
692 err:
693         bch_fs_free(c);
694         return NULL;
695 }
696
697 static const char *__bch_fs_online(struct bch_fs *c)
698 {
699         struct bch_dev *ca;
700         const char *err = NULL;
701         unsigned i;
702         int ret;
703
704         lockdep_assert_held(&bch_fs_list_lock);
705
706         if (!list_empty(&c->list))
707                 return NULL;
708
709         if (__bch_uuid_to_fs(c->sb.uuid))
710                 return "filesystem UUID already open";
711
712         ret = bch_fs_chardev_init(c);
713         if (ret)
714                 return "error creating character device";
715
716         bch_fs_debug_init(c);
717
718         if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
719             kobject_add(&c->internal, &c->kobj, "internal") ||
720             kobject_add(&c->opts_dir, &c->kobj, "options") ||
721             kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
722             bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
723                 return "error creating sysfs objects";
724
725         mutex_lock(&c->state_lock);
726
727         err = "error creating sysfs objects";
728         __for_each_member_device(ca, c, i)
729                 if (bch_dev_sysfs_online(ca))
730                         goto err;
731
732         err = "can't bring up blockdev volumes";
733         if (bch_blockdev_volumes_start(c))
734                 goto err;
735
736         bch_attach_backing_devs(c);
737
738         list_add(&c->list, &bch_fs_list);
739         err = NULL;
740 err:
741         mutex_unlock(&c->state_lock);
742         return err;
743 }
744
745 static const char *bch_fs_online(struct bch_fs *c)
746 {
747         const char *err;
748
749         mutex_lock(&bch_fs_list_lock);
750         err = __bch_fs_online(c);
751         mutex_unlock(&bch_fs_list_lock);
752
753         return err;
754 }
755
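/*
 * Bring the filesystem up: for an existing filesystem, read the journal,
 * recover btree roots, run mark and sweep gc, replay the journal and fsck;
 * for a new filesystem, allocate journal buckets, btree roots and the root
 * inode. Returns an error string, or NULL on success:
 */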
756 static const char *__bch_fs_start(struct bch_fs *c)
757 {
758         const char *err = "cannot allocate memory";
759         struct bch_sb_field_members *mi;
760         struct bch_dev *ca;
761         unsigned i, id;
762         time64_t now;
763         LIST_HEAD(journal);
764         struct jset *j;
765         int ret = -EINVAL;
766
767         BUG_ON(c->state != BCH_FS_STARTING);
768
769         mutex_lock(&c->sb_lock);
770         for_each_online_member(ca, c, i)
771                 bch_sb_from_fs(c, ca);
772         mutex_unlock(&c->sb_lock);
773
774         if (BCH_SB_INITIALIZED(c->disk_sb)) {
775                 ret = bch_journal_read(c, &journal);
776                 if (ret)
777                         goto err;
778
779                 j = &list_entry(journal.prev, struct journal_replay, list)->j;
780
781                 c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
782                 c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
783
784                 err = "error reading priorities";
785                 for_each_readable_member(ca, c, i) {
786                         ret = bch_prio_read(ca);
787                         if (ret) {
788                                 percpu_ref_put(&ca->io_ref);
789                                 goto err;
790                         }
791                 }
792
793                 for (id = 0; id < BTREE_ID_NR; id++) {
794                         unsigned level;
795                         struct bkey_i *k;
796
797                         err = "bad btree root";
798                         k = bch_journal_find_btree_root(c, j, id, &level);
799                         if (!k && id == BTREE_ID_EXTENTS)
800                                 goto err;
801                         if (!k) {
802                                 pr_debug("missing btree root: %d", id);
803                                 continue;
804                         }
805
806                         err = "error reading btree root";
807                         if (bch_btree_root_read(c, id, k, level))
808                                 goto err;
809                 }
810
811                 bch_verbose(c, "starting mark and sweep:");
812
813                 err = "error in recovery";
814                 if (bch_initial_gc(c, &journal))
815                         goto err;
816
817                 if (c->opts.noreplay)
818                         goto recovery_done;
819
820                 bch_verbose(c, "mark and sweep done");
821
822                 /*
823                  * bch_journal_start() can't happen sooner, or btree_gc_finish()
824                  * will give spurious errors about oldest_gen > bucket_gen -
825                  * this is a hack but oh well.
826                  */
827                 bch_journal_start(c);
828
829                 err = "error starting allocator thread";
830                 for_each_rw_member(ca, c, i)
831                         if (bch_dev_allocator_start(ca)) {
832                                 percpu_ref_put(&ca->io_ref);
833                                 goto err;
834                         }
835
836                 bch_verbose(c, "starting journal replay:");
837
838                 err = "journal replay failed";
839                 ret = bch_journal_replay(c, &journal);
840                 if (ret)
841                         goto err;
842
843                 bch_verbose(c, "journal replay done");
844
845                 if (c->opts.norecovery)
846                         goto recovery_done;
847
848                 bch_verbose(c, "starting fsck:");
849                 err = "error in fsck";
850                 ret = bch_fsck(c, !c->opts.nofsck);
851                 if (ret)
852                         goto err;
853
854                 bch_verbose(c, "fsck done");
855         } else {
856                 struct bch_inode_unpacked inode;
857                 struct bkey_inode_buf packed_inode;
858                 struct closure cl;
859
860                 closure_init_stack(&cl);
861
862                 bch_notice(c, "initializing new filesystem");
863
864                 bch_initial_gc(c, NULL);
865
866                 err = "unable to allocate journal buckets";
867                 for_each_rw_member(ca, c, i)
868                         if (bch_dev_journal_alloc(ca)) {
869                                 percpu_ref_put(&ca->io_ref);
870                                 goto err;
871                         }
872
873                 /*
874                  * journal_res_get() will crash if called before this has
875                  * set up the journal.pin FIFO and journal.cur pointer:
876                  */
877                 bch_journal_start(c);
878                 bch_journal_set_replay_done(&c->journal);
879
880                 err = "error starting allocator thread";
881                 for_each_rw_member(ca, c, i)
882                         if (bch_dev_allocator_start(ca)) {
883                                 percpu_ref_put(&ca->io_ref);
884                                 goto err;
885                         }
886
887                 err = "cannot allocate new btree root";
888                 for (id = 0; id < BTREE_ID_NR; id++)
889                         if (bch_btree_root_alloc(c, id, &cl)) {
890                                 closure_sync(&cl);
891                                 goto err;
892                         }
893
894                 /* Wait for new btree roots to be written: */
895                 closure_sync(&cl);
896
897                 bch_inode_init(c, &inode, 0, 0,
898                                S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
899                 inode.inum = BCACHE_ROOT_INO;
900
901                 bch_inode_pack(&packed_inode, &inode);
902
903                 err = "error creating root directory";
904                 if (bch_btree_insert(c, BTREE_ID_INODES,
905                                      &packed_inode.inode.k_i,
906                                      NULL, NULL, NULL, 0))
907                         goto err;
908
909                 err = "error writing first journal entry";
910                 if (bch_journal_meta(&c->journal))
911                         goto err;
912         }
913 recovery_done:
914         err = "dynamic fault";
915         if (bch_fs_init_fault("fs_start"))
916                 goto err;
917
918         if (c->opts.read_only) {
919                 bch_fs_read_only(c);
920         } else {
921                 err = bch_fs_read_write(c);
922                 if (err)
923                         goto err;
924         }
925
926         mutex_lock(&c->sb_lock);
927         mi = bch_sb_get_members(c->disk_sb);
928         now = ktime_get_seconds();
929
930         for_each_member_device(ca, c, i)
931                 mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
932
933         SET_BCH_SB_INITIALIZED(c->disk_sb, true);
934         SET_BCH_SB_CLEAN(c->disk_sb, false);
935         c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
936
937         bch_write_super(c);
938         mutex_unlock(&c->sb_lock);
939
940         err = NULL;
941 out:
942         bch_journal_entries_free(&journal);
943         return err;
944 err:
945         switch (ret) {
946         case BCH_FSCK_ERRORS_NOT_FIXED:
947                 bch_err(c, "filesystem contains errors: please report this to the developers");
948                 pr_cont("mount with -o fix_errors to repair");
949                 err = "fsck error";
950                 break;
951         case BCH_FSCK_REPAIR_UNIMPLEMENTED:
952                 bch_err(c, "filesystem contains errors: please report this to the developers");
953                 pr_cont("repair unimplemented: inform the developers so that it can be added");
954                 err = "fsck error";
955                 break;
956         case BCH_FSCK_REPAIR_IMPOSSIBLE:
957                 bch_err(c, "filesystem contains errors, but repair impossible");
958                 err = "fsck error";
959                 break;
960         case BCH_FSCK_UNKNOWN_VERSION:
961                 err = "unknown metadata version";
962                 break;
963         case -ENOMEM:
964                 err = "cannot allocate memory";
965                 break;
966         case -EIO:
967                 err = "IO error";
968                 break;
969         }
970
971         BUG_ON(!err);
972         set_bit(BCH_FS_ERROR, &c->flags);
973         goto out;
974 }
975
976 const char *bch_fs_start(struct bch_fs *c)
977 {
978         return __bch_fs_start(c) ?: bch_fs_online(c);
979 }
980
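/*
 * Check whether a device's superblock is compatible with this filesystem
 * before adding it (member info present, matching block size, bucket size
 * large enough for btree nodes):
 */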
981 static const char *bch_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
982 {
983         struct bch_sb_field_members *sb_mi;
984
985         sb_mi = bch_sb_get_members(sb);
986         if (!sb_mi)
987                 return "Invalid superblock: member info area missing";
988
989         if (le16_to_cpu(sb->block_size) != c->sb.block_size)
990                 return "mismatched block size";
991
992         if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
993             BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
994                 return "new cache bucket size is too small";
995
996         return NULL;
997 }
998
999 static const char *bch_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
1000 {
1001         struct bch_sb *newest =
1002                 le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
1003         struct bch_sb_field_members *mi = bch_sb_get_members(newest);
1004
1005         if (uuid_le_cmp(fs->uuid, sb->uuid))
1006                 return "device not a member of filesystem";
1007
1008         if (sb->dev_idx >= newest->nr_devices)
1009                 return "device has invalid dev_idx";
1010
1011         if (bch_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
1012                 return "device has been removed";
1013
1014         if (fs->block_size != sb->block_size)
1015                 return "mismatched block size";
1016
1017         return NULL;
1018 }
1019
1020 /* Device startup/shutdown: */
1021
1022 void bch_dev_release(struct kobject *kobj)
1023 {
1024         struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
1025
1026         kfree(ca);
1027 }
1028
1029 static void bch_dev_free(struct bch_dev *ca)
1030 {
1031         unsigned i;
1032
1033         cancel_work_sync(&ca->io_error_work);
1034
1035         if (ca->kobj.state_in_sysfs &&
1036             ca->disk_sb.bdev)
1037                 sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
1038                                   "bcache");
1039
1040         if (ca->kobj.state_in_sysfs)
1041                 kobject_del(&ca->kobj);
1042
1043         bch_free_super(&ca->disk_sb);
1044         bch_dev_journal_exit(ca);
1045
1046         free_percpu(ca->sectors_written);
1047         bioset_exit(&ca->replica_set);
1048         free_percpu(ca->usage_percpu);
1049         free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1050         kfree(ca->prio_buckets);
1051         kfree(ca->bio_prio);
1052         vfree(ca->buckets);
1053         vfree(ca->oldest_gens);
1054         free_heap(&ca->heap);
1055         free_fifo(&ca->free_inc);
1056
1057         for (i = 0; i < RESERVE_NR; i++)
1058                 free_fifo(&ca->free[i]);
1059
1060         percpu_ref_exit(&ca->io_ref);
1061         percpu_ref_exit(&ca->ref);
1062         kobject_put(&ca->kobj);
1063 }
1064
1065 static void bch_dev_io_ref_release(struct percpu_ref *ref)
1066 {
1067         struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
1068
1069         complete(&ca->offline_complete);
1070 }
1071
1072 static void __bch_dev_offline(struct bch_dev *ca)
1073 {
1074         struct bch_fs *c = ca->fs;
1075
1076         lockdep_assert_held(&c->state_lock);
1077
1078         __bch_dev_read_only(ca->fs, ca);
1079
1080         reinit_completion(&ca->offline_complete);
1081         percpu_ref_kill(&ca->io_ref);
1082         wait_for_completion(&ca->offline_complete);
1083
1084         if (ca->kobj.state_in_sysfs) {
1085                 struct kobject *block =
1086                         &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
1087
1088                 sysfs_remove_link(block, "bcache");
1089                 sysfs_remove_link(&ca->kobj, "block");
1090         }
1091
1092         bch_free_super(&ca->disk_sb);
1093         bch_dev_journal_exit(ca);
1094 }
1095
1096 static void bch_dev_ref_release(struct percpu_ref *ref)
1097 {
1098         struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
1099
1100         complete(&ca->stop_complete);
1101 }
1102
1103 static void bch_dev_stop(struct bch_dev *ca)
1104 {
1105         struct bch_fs *c = ca->fs;
1106
1107         lockdep_assert_held(&c->state_lock);
1108
1109         BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
1110         rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
1111
1112         synchronize_rcu();
1113
1114         reinit_completion(&ca->stop_complete);
1115         percpu_ref_kill(&ca->ref);
1116         wait_for_completion(&ca->stop_complete);
1117 }
1118
1119 static int bch_dev_sysfs_online(struct bch_dev *ca)
1120 {
1121         struct bch_fs *c = ca->fs;
1122         int ret;
1123
1124         if (!c->kobj.state_in_sysfs)
1125                 return 0;
1126
1127         if (!ca->kobj.state_in_sysfs) {
1128                 ret = kobject_add(&ca->kobj, &ca->fs->kobj,
1129                                   "dev-%u", ca->dev_idx);
1130                 if (ret)
1131                         return ret;
1132         }
1133
1134         if (ca->disk_sb.bdev) {
1135                 struct kobject *block =
1136                         &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
1137
1138                 ret = sysfs_create_link(block, &ca->kobj, "bcache");
1139                 if (ret)
1140                         return ret;
1141                 ret = sysfs_create_link(&ca->kobj, block, "block");
1142                 if (ret)
1143                         return ret;
1144         }
1145
1146         return 0;
1147 }
1148
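/*
 * Allocate the in-memory state (struct bch_dev) for member slot dev_idx and
 * size its freelists and reserves; the device isn't online (has no backing
 * block device) until __bch_dev_online():
 */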
1149 static int bch_dev_alloc(struct bch_fs *c, unsigned dev_idx)
1150 {
1151         struct bch_member *member;
1152         size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
1153         size_t heap_size;
1154         unsigned i;
1155         struct bch_dev *ca;
1156
1157         if (bch_fs_init_fault("dev_alloc"))
1158                 return -ENOMEM;
1159
1160         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1161         if (!ca)
1162                 return -ENOMEM;
1163
1164         kobject_init(&ca->kobj, &bch_dev_ktype);
1165         init_completion(&ca->stop_complete);
1166         init_completion(&ca->offline_complete);
1167
1168         spin_lock_init(&ca->self.lock);
1169         ca->self.nr = 1;
1170         rcu_assign_pointer(ca->self.d[0].dev, ca);
1171         ca->dev_idx = dev_idx;
1172
1173         spin_lock_init(&ca->freelist_lock);
1174         spin_lock_init(&ca->prio_buckets_lock);
1175         mutex_init(&ca->heap_lock);
1176         bch_dev_moving_gc_init(ca);
1177
1178         INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
1179
1180         if (bch_fs_init_fault("dev_alloc"))
1181                 goto err;
1182
1183         member = bch_sb_get_members(c->disk_sb)->members + dev_idx;
1184
1185         ca->mi = bch_mi_to_cpu(member);
1186         ca->uuid = member->uuid;
1187         ca->bucket_bits = ilog2(ca->mi.bucket_size);
1188         scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
1189
1190         /* XXX: tune these */
1191         movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
1192         reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
1193         /*
1194          * free_inc must be smaller than the copygc reserve: if it were bigger,
1195          * one copygc iteration might not make enough buckets available to fill
1196          * up free_inc and allow the allocator to make forward progress
1197          */
1198         free_inc_reserve = movinggc_reserve / 2;
1199         heap_size = movinggc_reserve * 8;
1200
1201         if (percpu_ref_init(&ca->ref, bch_dev_ref_release,
1202                             0, GFP_KERNEL) ||
1203             percpu_ref_init(&ca->io_ref, bch_dev_io_ref_release,
1204                             PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
1205             !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1206             !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
1207             !init_fifo(&ca->free[RESERVE_MOVINGGC],
1208                        movinggc_reserve, GFP_KERNEL) ||
1209             !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1210             !init_fifo(&ca->free_inc,   free_inc_reserve, GFP_KERNEL) ||
1211             !init_heap(&ca->heap,       heap_size, GFP_KERNEL) ||
1212             !(ca->oldest_gens   = vzalloc(sizeof(u8) *
1213                                           ca->mi.nbuckets)) ||
1214             !(ca->buckets       = vzalloc(sizeof(struct bucket) *
1215                                           ca->mi.nbuckets)) ||
1216             !(ca->prio_buckets  = kzalloc(sizeof(u64) * prio_buckets(ca) *
1217                                           2, GFP_KERNEL)) ||
1218             !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1219             !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
1220             !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
1221             bioset_init(&ca->replica_set, 4,
1222                         offsetof(struct bch_write_bio, bio)) ||
1223             !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
1224                 goto err;
1225
1226         ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1227
1228         total_reserve = ca->free_inc.size;
1229         for (i = 0; i < RESERVE_NR; i++)
1230                 total_reserve += ca->free[i].size;
1231
1232         ca->copygc_write_point.group = &ca->self;
1233         ca->tiering_write_point.group = &ca->self;
1234
1235         ca->fs = c;
1236         rcu_assign_pointer(c->devs[ca->dev_idx], ca);
1237
1238         if (bch_dev_sysfs_online(ca))
1239                 pr_warn("error creating sysfs objects");
1240
1241         return 0;
1242 err:
1243         bch_dev_free(ca);
1244         return -ENOMEM;
1245 }
1246
1247 static int __bch_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
1248 {
1249         struct bch_dev *ca;
1250         int ret;
1251
1252         lockdep_assert_held(&c->sb_lock);
1253
1254         if (le64_to_cpu(sb->sb->seq) >
1255             le64_to_cpu(c->disk_sb->seq))
1256                 bch_sb_to_fs(c, sb->sb);
1257
1258         BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
1259                !c->devs[sb->sb->dev_idx]);
1260
1261         ca = c->devs[sb->sb->dev_idx];
1262         if (ca->disk_sb.bdev) {
1263                 bch_err(c, "already have device online in slot %u",
1264                         sb->sb->dev_idx);
1265                 return -EINVAL;
1266         }
1267
1268         ret = bch_dev_journal_init(ca, sb->sb);
1269         if (ret)
1270                 return ret;
1271
1272         /*
1273          * Increase journal write timeout if flushes to this device are
1274          * expensive:
1275          */
1276         if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
1277             journal_flushes_device(ca))
1278                 c->journal.write_delay_ms =
1279                         max(c->journal.write_delay_ms, 1000U);
1280
1281         /* Commit: */
1282         ca->disk_sb = *sb;
1283         if (sb->mode & FMODE_EXCL)
1284                 ca->disk_sb.bdev->bd_holder = ca;
1285         memset(sb, 0, sizeof(*sb));
1286
1287         if (c->sb.nr_devices == 1)
1288                 bdevname(ca->disk_sb.bdev, c->name);
1289         bdevname(ca->disk_sb.bdev, ca->name);
1290
1291         if (bch_dev_sysfs_online(ca))
1292                 pr_warn("error creating sysfs objects");
1293
1294         lg_local_lock(&c->usage_lock);
1295         if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
1296                 bch_mark_dev_metadata(ca->fs, ca);
1297         lg_local_unlock(&c->usage_lock);
1298
1299         percpu_ref_reinit(&ca->io_ref);
1300         return 0;
1301 }
1302
1303 /* Device management: */
1304
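/*
 * Check whether enough member devices are present to start, given the
 * BCH_FORCE_IF_* degraded mode flags:
 */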
1305 bool bch_fs_may_start(struct bch_fs *c, int flags)
1306 {
1307         struct bch_sb_field_members *mi;
1308         unsigned meta_missing = 0;
1309         unsigned data_missing = 0;
1310         bool degraded = false;
1311         unsigned i;
1312
1313         mutex_lock(&c->sb_lock);
1314         mi = bch_sb_get_members(c->disk_sb);
1315
1316         for (i = 0; i < c->disk_sb->nr_devices; i++)
1317                 if (!c->devs[i] &&
1318                     !bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
1319                         degraded = true;
1320                         if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
1321                                 meta_missing++;
1322                         if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
1323                                 data_missing++;
1324                 }
1325         mutex_unlock(&c->sb_lock);
1326
1327         if (degraded &&
1328             !(flags & BCH_FORCE_IF_DEGRADED))
1329                 return false;
1330
1331         if (meta_missing &&
1332             !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
1333                 return false;
1334
1335         if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
1336             !(flags & BCH_FORCE_IF_METADATA_LOST))
1337                 return false;
1338
1339         if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
1340                 return false;
1341
1342         if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
1343             !(flags & BCH_FORCE_IF_DATA_LOST))
1344                 return false;
1345
1346         return true;
1347 }
1348
1349 bool bch_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
1350                            enum bch_member_state new_state, int flags)
1351 {
1352         lockdep_assert_held(&c->state_lock);
1353
1354         if (new_state == BCH_MEMBER_STATE_RW)
1355                 return true;
1356
1357         if (ca->mi.has_data &&
1358             !(flags & BCH_FORCE_IF_DATA_DEGRADED))
1359                 return false;
1360
1361         if (ca->mi.has_data &&
1362             c->sb.data_replicas_have <= 1 &&
1363             !(flags & BCH_FORCE_IF_DATA_LOST))
1364                 return false;
1365
1366         if (ca->mi.has_metadata &&
1367             !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
1368                 return false;
1369
1370         if (ca->mi.has_metadata &&
1371             c->sb.meta_replicas_have <= 1 &&
1372             !(flags & BCH_FORCE_IF_METADATA_LOST))
1373                 return false;
1374
1375         return true;
1376 }
1377
1378 static void __bch_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
1379 {
1380         bch_moving_gc_stop(ca);
1381
1382         /*
1383          * This stops new data writes (e.g. to existing open data
1384          * buckets) and then waits for all existing writes to
1385          * complete.
1386          */
1387         bch_dev_allocator_stop(ca);
1388
1389         bch_dev_group_remove(&c->journal.devs, ca);
1390 }
1391
1392 static const char *__bch_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
1393 {
1394         lockdep_assert_held(&c->state_lock);
1395
1396         BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
1397
1398         trace_bcache_cache_read_write(ca);
1399
1400         if (bch_dev_allocator_start(ca))
1401                 return "error starting allocator thread";
1402
1403         if (bch_moving_gc_start(ca))
1404                 return "error starting moving GC thread";
1405
1406         if (bch_tiering_start(c))
1407                 return "error starting tiering thread";
1408
1409         bch_notify_dev_read_write(ca);
1410         trace_bcache_cache_read_write_done(ca);
1411
1412         return NULL;
1413 }
1414
1415 int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1416                         enum bch_member_state new_state, int flags)
1417 {
1418         struct bch_sb_field_members *mi;
1419
1420         if (ca->mi.state == new_state)
1421                 return 0;
1422
1423         if (!bch_dev_state_allowed(c, ca, new_state, flags))
1424                 return -EINVAL;
1425
1426         if (new_state == BCH_MEMBER_STATE_RW) {
1427                 if (__bch_dev_read_write(c, ca))
1428                         return -ENOMEM;
1429         } else {
1430                 __bch_dev_read_only(c, ca);
1431         }
1432
1433         bch_notice(ca, "%s", bch_dev_state[new_state]);
1434
1435         mutex_lock(&c->sb_lock);
1436         mi = bch_sb_get_members(c->disk_sb);
1437         SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
1438         bch_write_super(c);
1439         mutex_unlock(&c->sb_lock);
1440
1441         return 0;
1442 }
1443
1444 int bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
1445                       enum bch_member_state new_state, int flags)
1446 {
1447         int ret;
1448
1449         mutex_lock(&c->state_lock);
1450         ret = __bch_dev_set_state(c, ca, new_state, flags);
1451         mutex_unlock(&c->state_lock);
1452
1453         return ret;
1454 }
1455
1456 #if 0
1457 int bch_dev_migrate_from(struct bch_fs *c, struct bch_dev *ca)
1458 {
1459         /* First, go RO before we try to migrate data off: */
1460         ret = bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, flags);
1461         if (ret)
1462                 return ret;
1463
1464         bch_notify_dev_removing(ca);
1465
1466         /* Migrate data, metadata off device: */
1467
1468         ret = bch_move_data_off_device(ca);
1469         if (ret && !(flags & BCH_FORCE_IF_DATA_LOST)) {
1470                 bch_err(c, "Remove of %s failed, unable to migrate data off",
1471                         name);
1472                 return ret;
1473         }
1474
1475         if (ret)
1476                 ret = bch_flag_data_bad(ca);
1477         if (ret) {
1478                 bch_err(c, "Remove of %s failed, unable to migrate data off",
1479                         name);
1480                 return ret;
1481         }
1482
1483         ret = bch_move_metadata_off_device(ca);
1484         if (ret)
1485                 return ret;
1486 }
1487 #endif
1488
1489 /* Device add/removal: */
1490
1491 static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
1492 {
1493         struct bch_sb_field_members *mi;
1494         unsigned dev_idx = ca->dev_idx;
1495         int ret;
1496
1497         if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1498                 bch_err(ca, "Cannot remove RW device");
1499                 bch_notify_dev_remove_failed(ca);
1500                 return -EINVAL;
1501         }
1502
1503         if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
1504                 bch_err(ca, "Cannot remove without losing data");
1505                 bch_notify_dev_remove_failed(ca);
1506                 return -EINVAL;
1507         }
1508
1509         /*
1510          * XXX: verify that dev_idx is really not in use anymore, anywhere
1511          *
1512          * flag_data_bad() does not check btree pointers
1513          */
1514         ret = bch_flag_data_bad(ca);
1515         if (ret) {
1516                 bch_err(ca, "Remove failed");
1517                 return ret;
1518         }
1519
1520         if (ca->mi.has_data || ca->mi.has_metadata) {
1521                 bch_err(ca, "Can't remove, still has data");
1522                 return ret;
1523         }
1524
1525         /*
1526          * Ok, really doing the remove:
1527          * Drop device's prio pointer before removing it from superblock:
1528          */
1529         bch_notify_dev_removed(ca);
1530
1531         spin_lock(&c->journal.lock);
1532         c->journal.prio_buckets[dev_idx] = 0;
1533         spin_unlock(&c->journal.lock);
1534
1535         bch_journal_meta(&c->journal);
1536
1537         __bch_dev_offline(ca);
1538         bch_dev_stop(ca);
1539         bch_dev_free(ca);
1540
1541         /*
1542          * Free this device's slot in the bch_member array - all pointers to
1543          * this device must be gone:
1544          */
1545         mutex_lock(&c->sb_lock);
1546         mi = bch_sb_get_members(c->disk_sb);
1547         memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
1548
1549         bch_write_super(c);
1550
1551         mutex_unlock(&c->sb_lock);
1552
1553         return 0;
1554 }
1555
1556 int bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
1557 {
1558         int ret;
1559
1560         mutex_lock(&c->state_lock);
1561         percpu_ref_put(&ca->ref);
1562         ret = __bch_dev_remove(c, ca, flags);
1563         mutex_unlock(&c->state_lock);
1564
1565         return ret;
1566 }
1567
1568 int bch_dev_add(struct bch_fs *c, const char *path)
1569 {
1570         struct bcache_superblock sb;
1571         const char *err;
1572         struct bch_dev *ca = NULL;
1573         struct bch_sb_field_members *mi, *dev_mi;
1574         struct bch_member saved_mi;
1575         unsigned dev_idx, nr_devices, u64s;
1576         int ret = -EINVAL;
1577
1578         err = bch_read_super(&sb, bch_opts_empty(), path);
1579         if (err)
1580                 return -EINVAL;
1581
1582         err = bch_validate_cache_super(&sb);
1583         if (err)
1584                 return -EINVAL;
1585
1586         err = bch_dev_may_add(sb.sb, c);
1587         if (err)
1588                 return -EINVAL;
1589
1590         mutex_lock(&c->state_lock);
1591         mutex_lock(&c->sb_lock);
1592
1593         /*
1594          * Preserve the old cache member information (esp. tier)
1595          * before we start bashing the disk stuff.
1596          */
1597         dev_mi = bch_sb_get_members(sb.sb);
1598         saved_mi = dev_mi->members[sb.sb->dev_idx];
1599         saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
1600
1601         if (dynamic_fault("bcache:add:no_slot"))
1602                 goto no_slot;
1603
1604         mi = bch_sb_get_members(c->disk_sb);
1605         for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
1606                 if (dev_idx >= c->sb.nr_devices ||
1607                     bch_is_zero(mi->members[dev_idx].uuid.b,
1608                                  sizeof(uuid_le)))
1609                         goto have_slot;
1610 no_slot:
1611         err = "no slots available in superblock";
1612         ret = -ENOSPC;
1613         goto err_unlock;
1614
1615 have_slot:
1616         nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
1617         u64s = (sizeof(struct bch_sb_field_members) +
1618                 sizeof(struct bch_member) * nr_devices) / sizeof(u64);
1619         err = "no space in superblock for member info";
1620
1621         mi = bch_fs_sb_resize_members(c, u64s);
1622         if (!mi)
1623                 goto err_unlock;
1624
1625         dev_mi = bch_sb_resize_members(&sb, u64s);
1626         if (!dev_mi)
1627                 goto err_unlock;
1628
1629         memcpy(dev_mi, mi, u64s * sizeof(u64));
1630         dev_mi->members[dev_idx] = saved_mi;
1631
1632         sb.sb->uuid             = c->disk_sb->uuid;
1633         sb.sb->dev_idx          = dev_idx;
1634         sb.sb->nr_devices       = nr_devices;
1635
1636         /* commit new member info */
1637         memcpy(mi, dev_mi, u64s * sizeof(u64));
1638         c->disk_sb->nr_devices  = nr_devices;
1639         c->sb.nr_devices        = nr_devices;
1640
1641         if (bch_dev_alloc(c, dev_idx)) {
1642                 err = "cannot allocate memory";
1643                 ret = -ENOMEM;
1644                 goto err_unlock;
1645         }
1646
1647         if (__bch_dev_online(c, &sb)) {
1648                 err = "__bch_dev_online() error";
1649                 ret = -ENOMEM;
1650                 goto err_unlock;
1651         }
1652
1653         bch_write_super(c);
1654         mutex_unlock(&c->sb_lock);
1655
1656         ca = c->devs[dev_idx];
1657         if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1658                 err = "journal alloc failed";
1659                 if (bch_dev_journal_alloc(ca))
1660                         goto err;
1661
1662                 err = __bch_dev_read_write(c, ca);
1663                 if (err)
1664                         goto err;
1665         }
1666
1667         bch_notify_dev_added(ca);
1668         mutex_unlock(&c->state_lock);
1669         return 0;
1670 err_unlock:
1671         mutex_unlock(&c->sb_lock);
1672 err:
1673         mutex_unlock(&c->state_lock);
1674         bch_free_super(&sb);
1675
1676         bch_err(c, "Unable to add device: %s", err);
1677         return ret ?: -EINVAL;
1678 }
1679
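/*
 * Bring a member device that is currently offline back online, given its
 * path: read its superblock, check that it belongs to @c, then attach it.
 */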
1680 int bch_dev_online(struct bch_fs *c, const char *path)
1681 {
1682         struct bcache_superblock sb = { 0 };
1683         const char *err;
1684
1685         mutex_lock(&c->state_lock);
1686
1687         err = bch_read_super(&sb, bch_opts_empty(), path);
1688         if (err)
1689                 goto err;
1690
1691         err = bch_dev_in_fs(c->disk_sb, sb.sb);
1692         if (err)
1693                 goto err;
1694
1695         mutex_lock(&c->sb_lock);
1696         if (__bch_dev_online(c, &sb)) {
1697                 mutex_unlock(&c->sb_lock);
1698                 goto err;
1699         }
1700         mutex_unlock(&c->sb_lock);
1701
1702         mutex_unlock(&c->state_lock);
1703         return 0;
1704 err:
1705         mutex_unlock(&c->state_lock);
1706         bch_free_super(&sb);
1707         bch_err(c, "error bringing %s online: %s", path, err);
1708         return -EINVAL;
1709 }
1710
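/*
 * Take a member device offline without removing it; only allowed if the
 * filesystem can keep running with this device in the failed state.
 */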
1711 int bch_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
1712 {
1713         mutex_lock(&c->state_lock);
1714
1715         if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
1716                 bch_err(ca, "Cannot offline required disk");
1717                 mutex_unlock(&c->state_lock);
1718                 return -EINVAL;
1719         }
1720
1721         __bch_dev_read_only(c, ca);
1722         __bch_dev_offline(ca);
1723
1724         mutex_unlock(&c->state_lock);
1725         return 0;
1726 }
1727
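/*
 * Migrate all data and metadata off @ca. The device must not be RW, and the
 * migration only succeeds once the device no longer holds any data or
 * metadata.
 */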
1728 int bch_dev_migrate(struct bch_fs *c, struct bch_dev *ca)
1729 {
1730         int ret;
1731
1732         mutex_lock(&c->state_lock);
1733
1734         if (ca->mi.state == BCH_MEMBER_STATE_RW) {
1735                 bch_err(ca, "Cannot migrate data off RW device");
1736                 mutex_unlock(&c->state_lock);
1737                 return -EINVAL;
1738         }
1739
1740         mutex_unlock(&c->state_lock);
1741
1742         ret = bch_move_data_off_device(ca);
1743         if (ret) {
1744                 bch_err(ca, "Error migrating data: %i", ret);
1745                 return ret;
1746         }
1747
1748         ret = bch_move_metadata_off_device(ca);
1749         if (ret) {
1750                 bch_err(ca, "Error migrating metadata: %i", ret);
1751                 return ret;
1752         }
1753
1754         if (ca->mi.has_data || ca->mi.has_metadata) {
1755                 bch_err(ca, "Migrate error: data still present");
1756                 return -EINVAL;
1757         }
1758
1759         return 0;
1760 }
1761
1762 /* Filesystem open: */
1763
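/*
 * Open a filesystem from an explicit list of device paths: read and validate
 * every superblock, treat the copy with the highest sequence number as the
 * most recent, check the other devices against it, then allocate the
 * filesystem, bring all devices online and (unless the nostart option was
 * given) start it.
 */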
1764 const char *bch_fs_open(char * const *devices, unsigned nr_devices,
1765                         struct bch_opts opts, struct bch_fs **ret)
1766 {
1767         const char *err;
1768         struct bch_fs *c = NULL;
1769         struct bcache_superblock *sb;
1770         unsigned i, best_sb = 0;
1771
1772         if (!nr_devices)
1773                 return "need at least one device";
1774
1775         if (!try_module_get(THIS_MODULE))
1776                 return "module unloading";
1777
1778         err = "cannot allocate memory";
1779         sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
1780         if (!sb)
1781                 goto err;
1782
1783         for (i = 0; i < nr_devices; i++) {
1784                 err = bch_read_super(&sb[i], opts, devices[i]);
1785                 if (err)
1786                         goto err;
1787
1788                 err = "attempting to register backing device";
1789                 if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
1790                         goto err;
1791
1792                 err = bch_validate_cache_super(&sb[i]);
1793                 if (err)
1794                         goto err;
1795         }
1796
1797         for (i = 1; i < nr_devices; i++)
1798                 if (le64_to_cpu(sb[i].sb->seq) >
1799                     le64_to_cpu(sb[best_sb].sb->seq))
1800                         best_sb = i;
1801
1802         for (i = 0; i < nr_devices; i++) {
1803                 err = bch_dev_in_fs(sb[best_sb].sb, sb[i].sb);
1804                 if (err)
1805                         goto err;
1806         }
1807
1808         err = "cannot allocate memory";
1809         c = bch_fs_alloc(sb[best_sb].sb, opts);
1810         if (!c)
1811                 goto err;
1812
1813         err = "__bch_dev_online() error";
1814         mutex_lock(&c->sb_lock);
1815         for (i = 0; i < nr_devices; i++)
1816                 if (__bch_dev_online(c, &sb[i])) {
1817                         mutex_unlock(&c->sb_lock);
1818                         goto err;
1819                 }
1820         mutex_unlock(&c->sb_lock);
1821
1822         err = "insufficient devices";
1823         if (!bch_fs_may_start(c, 0))
1824                 goto err;
1825
1826         if (!c->opts.nostart) {
1827                 err = __bch_fs_start(c);
1828                 if (err)
1829                         goto err;
1830         }
1831
1832         err = bch_fs_online(c);
1833         if (err)
1834                 goto err;
1835
1836         if (ret)
1837                 *ret = c;
1838         else
1839                 closure_put(&c->cl);
1840
1841         err = NULL;
1842 out:
1843         kfree(sb);
1844         module_put(THIS_MODULE);
1845         if (err)
1846                 c = NULL;
1847         return err;
1848 err:
1849         if (c)
1850                 bch_fs_stop(c);
1851
1852         for (i = 0; i < nr_devices; i++)
1853                 bch_free_super(&sb[i]);
1854         goto out;
1855 }
1856
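/*
 * Register a single device: if a filesystem with a matching UUID already
 * exists the device joins it, otherwise a new filesystem is allocated. The
 * filesystem is started once enough of its member devices are present.
 */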
1857 static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
1858                                              struct bch_opts opts)
1859 {
1860         const char *err;
1861         struct bch_fs *c;
1862         bool allocated_fs = false;
1863
1864         err = bch_validate_cache_super(sb);
1865         if (err)
1866                 return err;
1867
1868         mutex_lock(&bch_fs_list_lock);
1869         c = __bch_uuid_to_fs(sb->sb->uuid);
1870         if (c) {
1871                 closure_get(&c->cl);
1872
1873                 err = bch_dev_in_fs(c->disk_sb, sb->sb);
1874                 if (err)
1875                         goto err;
1876         } else {
1877                 c = bch_fs_alloc(sb->sb, opts);
1878                 err = "cannot allocate memory";
1879                 if (!c)
1880                         goto err;
1881
1882                 allocated_fs = true;
1883         }
1884
1885         err = "__bch_dev_online() error";
1886
1887         mutex_lock(&c->sb_lock);
1888         if (__bch_dev_online(c, sb)) {
1889                 mutex_unlock(&c->sb_lock);
1890                 goto err;
1891         }
1892         mutex_unlock(&c->sb_lock);
1893
1894         if (!c->opts.nostart && bch_fs_may_start(c, 0)) {
1895                 err = __bch_fs_start(c);
1896                 if (err)
1897                         goto err;
1898         }
1899
1900         err = __bch_fs_online(c);
1901         if (err)
1902                 goto err;
1903
1904         closure_put(&c->cl);
1905         mutex_unlock(&bch_fs_list_lock);
1906
1907         return NULL;
1908 err:
1909         mutex_unlock(&bch_fs_list_lock);
1910
1911         if (allocated_fs)
1912                 bch_fs_stop(c);
1913         else if (c)
1914                 closure_put(&c->cl);
1915
1916         return err;
1917 }
1918
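/*
 * Read the superblock at @path and register it: backing devices go through
 * bch_backing_dev_register(), cache/member devices through
 * __bch_fs_open_incremental().
 */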
1919 const char *bch_fs_open_incremental(const char *path)
1920 {
1921         struct bcache_superblock sb;
1922         struct bch_opts opts = bch_opts_empty();
1923         const char *err;
1924
1925         err = bch_read_super(&sb, opts, path);
1926         if (err)
1927                 return err;
1928
1929         if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) {
1930                 mutex_lock(&bch_fs_list_lock);
1931                 err = bch_backing_dev_register(&sb);
1932                 mutex_unlock(&bch_fs_list_lock);
1933         } else {
1934                 err = __bch_fs_open_incremental(&sb, opts);
1935         }
1936
1937         bch_free_super(&sb);
1938
1939         return err;
1940 }
1941
1942 /* Global interfaces/init */
1943
1944 #define kobj_attribute_write(n, fn)                                     \
1945         static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
1946
1947 #define kobj_attribute_rw(n, show, store)                               \
1948         static struct kobj_attribute ksysfs_##n =                       \
1949                 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1950
1951 static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1952                                const char *, size_t);
1953
1954 kobj_attribute_write(register,          register_bcache);
1955 kobj_attribute_write(register_quiet,    register_bcache);
1956
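/*
 * sysfs handler for /sys/fs/bcache/register and register_quiet: the device
 * path written to either file (e.g. "echo /dev/sdb > /sys/fs/bcache/register")
 * is registered incrementally.
 */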
1957 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1958                                const char *buffer, size_t size)
1959 {
1960         ssize_t ret = -EINVAL;
1961         const char *err = "cannot allocate memory";
1962         char *path = NULL;
1963
1964         if (!try_module_get(THIS_MODULE))
1965                 return -EBUSY;
1966
1967         if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
1968                 goto err;
1969
1970         err = bch_fs_open_incremental(strim(path));
1971         if (err)
1972                 goto err;
1973
1974         ret = size;
1975 out:
1976         kfree(path);
1977         module_put(THIS_MODULE);
1978         return ret;
1979 err:
1980         pr_err("error opening %s: %s", path, err);
1981         goto out;
1982 }
1983
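/*
 * Reboot notifier: on shutdown, halt or power off, put every registered
 * filesystem into read-only mode, first kicked off asynchronously for all of
 * them, then completed synchronously, before the block devices go away.
 */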
1984 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
1985 {
1986         if (code == SYS_DOWN ||
1987             code == SYS_HALT ||
1988             code == SYS_POWER_OFF) {
1989                 struct bch_fs *c;
1990
1991                 mutex_lock(&bch_fs_list_lock);
1992
1993                 if (!list_empty(&bch_fs_list))
1994                         pr_info("Setting all devices read only:");
1995
1996                 list_for_each_entry(c, &bch_fs_list, list)
1997                         bch_fs_read_only_async(c);
1998
1999                 list_for_each_entry(c, &bch_fs_list, list)
2000                         bch_fs_read_only(c);
2001
2002                 mutex_unlock(&bch_fs_list_lock);
2003         }
2004
2005         return NOTIFY_DONE;
2006 }
2007
2008 static struct notifier_block reboot = {
2009         .notifier_call  = bcache_reboot,
2010         .priority       = INT_MAX, /* before any real devices */
2011 };
2012
2013 static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
2014                            const char *buffer, size_t size)
2015 {
2016         bcache_reboot(NULL, SYS_DOWN, NULL);
2017         return size;
2018 }
2019
2020 kobj_attribute_write(reboot,            reboot_test);
2021
2022 static void bcache_exit(void)
2023 {
2024         bch_debug_exit();
2025         bch_vfs_exit();
2026         bch_blockdev_exit();
2027         bch_chardev_exit();
2028         if (bcache_kset)
2029                 kset_unregister(bcache_kset);
2030         if (bcache_io_wq)
2031                 destroy_workqueue(bcache_io_wq);
2032         if (!IS_ERR_OR_NULL(bch_sha256))
2033                 crypto_free_shash(bch_sha256);
2034         unregister_reboot_notifier(&reboot);
2035 }
2036
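/*
 * Module init: register the reboot notifier, allocate the sha256 transform
 * and the io workqueue, create the /sys/fs/bcache kset and its attribute
 * files, and initialize the chardev, blockdev, vfs and debug subsystems; on
 * any failure, bcache_exit() tears down whatever was set up.
 */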
2037 static int __init bcache_init(void)
2038 {
2039         static const struct attribute *files[] = {
2040                 &ksysfs_register.attr,
2041                 &ksysfs_register_quiet.attr,
2042                 &ksysfs_reboot.attr,
2043                 NULL
2044         };
2045
2046         register_reboot_notifier(&reboot);
2047         closure_debug_init();
2048         bkey_pack_test();
2049
2050         bch_sha256 = crypto_alloc_shash("sha256", 0, 0);
2051         if (IS_ERR(bch_sha256))
2052                 goto err;
2053
2054         if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
2055             !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
2056             sysfs_create_files(&bcache_kset->kobj, files) ||
2057             bch_chardev_init() ||
2058             bch_blockdev_init() ||
2059             bch_vfs_init() ||
2060             bch_debug_init())
2061                 goto err;
2062
2063         return 0;
2064 err:
2065         bcache_exit();
2066         return -ENOMEM;
2067 }
2068
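/* Expose each entry of BCH_DEBUG_PARAMS() as a boolean module parameter. */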
2069 #define BCH_DEBUG_PARAM(name, description)                      \
2070         bool bch_##name;                                        \
2071         module_param_named(name, bch_##name, bool, 0644);       \
2072         MODULE_PARM_DESC(name, description);
2073 BCH_DEBUG_PARAMS()
2074 #undef BCH_DEBUG_PARAM
2075
2076 module_exit(bcache_exit);
2077 module_init(bcache_init);