2 * bcache setup/teardown code, and some metadata io - read a superblock and
3 * figure out what to do with it.
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
12 #include "btree_cache.h"
14 #include "btree_update.h"
36 #include "writeback.h"
38 #include <linux/backing-dev.h>
39 #include <linux/blkdev.h>
40 #include <linux/debugfs.h>
41 #include <linux/device.h>
42 #include <linux/genhd.h>
43 #include <linux/idr.h>
44 #include <linux/kthread.h>
45 #include <linux/module.h>
46 #include <linux/percpu.h>
47 #include <linux/random.h>
48 #include <linux/reboot.h>
49 #include <linux/sysfs.h>
50 #include <crypto/hash.h>
52 #include <trace/events/bcache.h>
54 MODULE_LICENSE("GPL");
55 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
57 static const uuid_le invalid_uuid = {
59 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
60 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
64 static struct kset *bcache_kset;
65 struct mutex bch_register_lock;
66 LIST_HEAD(bch_fs_list);
68 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
69 struct workqueue_struct *bcache_io_wq;
70 struct crypto_shash *bch_sha256;
72 static void bch_dev_stop(struct cache *);
73 static int bch_dev_online(struct cache *);
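/*
 * Congestion callback for the cache set's backing_dev_info (hooked up in
 * bch_fs_alloc() via bdi.congested_fn): for reads/sync we check every member
 * device's bdi, while writes only consult the tier 0 devices.
 */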
75 static int bch_congested_fn(void *data, int bdi_bits)
77 struct backing_dev_info *bdi;
78 struct cache_set *c = data;
84 if (bdi_bits & (1 << WB_sync_congested)) {
85 /* Reads - check all devices: */
86 for_each_cache_rcu(ca, c, i) {
87 bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
89 if (bdi_congested(bdi, bdi_bits)) {
95 /* Writes only go to tier 0: */
96 group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
97 bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
99 if (bdi_congested(bdi, bdi_bits)) {
110 /* Cache set RO/RW: */
113 * For startup/shutdown of RW stuff, the dependencies are:
115 * - foreground writes depend on copygc and tiering (to free up space)
117 * - copygc and tiering depend on mark and sweep gc (they actually probably
118 * don't because they either reserve ahead of time or don't block if
119 * allocations fail, but allocations can require mark and sweep gc to run
120 * because of generation number wraparound)
122 * - all of the above depends on the allocator threads
124 * - allocator depends on the journal (when it rewrites prios and gens)
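/*
 * __bch_fs_read_only() unwinds in roughly the reverse of that order: stop
 * tiering and copygc, then btree gc, then the allocator threads, and finally
 * flush the journal so replay won't redo the btree writes we just forced out.
 */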
127 static void __bch_fs_read_only(struct cache_set *c)
132 c->tiering_pd.rate.rate = UINT_MAX;
133 bch_ratelimit_reset(&c->tiering_pd.rate);
134 bch_tiering_read_stop(c);
136 for_each_cache(ca, c, i)
137 bch_moving_gc_stop(ca);
139 bch_gc_thread_stop(c);
143 for_each_cache(ca, c, i)
144 bch_dev_allocator_stop(ca);
147 * Write a journal entry after flushing the btree, so we don't end up
148 * replaying everything we just flushed:
150 if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
153 bch_journal_flush_async(&c->journal, NULL);
154 ret = bch_journal_meta(&c->journal);
155 BUG_ON(ret && !bch_journal_error(&c->journal));
158 cancel_delayed_work_sync(&c->journal.write_work);
159 cancel_delayed_work_sync(&c->journal.reclaim_work);
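/*
 * Release callback for c->writes (registered with percpu_ref_init() in
 * bch_fs_alloc()): runs once the last writer reference goes away after
 * percpu_ref_kill(), letting bch_fs_read_only_work() proceed.
 */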
162 static void bch_writes_disabled(struct percpu_ref *writes)
164 struct cache_set *c = container_of(writes, struct cache_set, writes);
166 set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
167 wake_up(&bch_read_only_wait);
170 static void bch_fs_read_only_work(struct work_struct *work)
172 struct cache_set *c =
173 container_of(work, struct cache_set, read_only_work);
175 percpu_ref_put(&c->writes);
177 del_timer(&c->foreground_write_wakeup);
178 cancel_delayed_work(&c->pd_controllers_update);
180 c->foreground_write_pd.rate.rate = UINT_MAX;
181 bch_wake_delayed_writes((unsigned long) c);
183 if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
185 * If we're not doing an emergency shutdown, we want to wait on
186 * outstanding writes to complete so they don't see spurious
187 * errors due to shutting down the allocator:
189 wait_event(bch_read_only_wait,
190 test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
192 __bch_fs_read_only(c);
194 if (!bch_journal_error(&c->journal) &&
195 !test_bit(BCH_FS_ERROR, &c->flags)) {
196 mutex_lock(&c->sb_lock);
197 SET_BCH_SB_CLEAN(c->disk_sb, true);
199 mutex_unlock(&c->sb_lock);
203	 * If we are doing an emergency shutdown, outstanding writes may
204	 * hang until we shut down the allocator, so we don't want to wait
205	 * on outstanding writes before shutting everything down - but
206	 * we do need to wait on them before returning and signalling
207	 * that going RO is complete:
209 __bch_fs_read_only(c);
211 wait_event(bch_read_only_wait,
212 test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
215 bch_notify_fs_read_only(c);
216 trace_fs_read_only_done(c);
218 set_bit(BCH_FS_RO_COMPLETE, &c->flags);
219 wake_up(&bch_read_only_wait);
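/*
 * bch_fs_read_only() only kicks off the transition; the actual teardown runs
 * asynchronously in bch_fs_read_only_work(). Callers that need to wait use
 * bch_fs_read_only_sync().
 */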
222 bool bch_fs_read_only(struct cache_set *c)
224 if (test_and_set_bit(BCH_FS_RO, &c->flags))
227 trace_fs_read_only(c);
229 percpu_ref_get(&c->writes);
232 * Block new foreground-end write operations from starting - any new
233 * writes will return -EROFS:
235	 * (This is really blocking new _allocations_ - writes to previously
236 * allocated space can still happen until stopping the allocator in
237 * bch_dev_allocator_stop()).
239 percpu_ref_kill(&c->writes);
241 queue_work(system_freezable_wq, &c->read_only_work);
245 bool bch_fs_emergency_read_only(struct cache_set *c)
247 bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
250 bch_journal_halt(&c->journal);
252 wake_up(&bch_read_only_wait);
256 void bch_fs_read_only_sync(struct cache_set *c)
258 /* so we don't race with bch_fs_read_write() */
259 lockdep_assert_held(&bch_register_lock);
263 wait_event(bch_read_only_wait,
264 test_bit(BCH_FS_RO_COMPLETE, &c->flags) &&
265 test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
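/* Restart, in dependency order, the threads that __bch_fs_read_only() stopped: */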
268 static const char *__bch_fs_read_write(struct cache_set *c)
274 lockdep_assert_held(&bch_register_lock);
276 err = "error starting allocator thread";
277 for_each_cache(ca, c, i)
278 if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
279 bch_dev_allocator_start(ca)) {
280 percpu_ref_put(&ca->ref);
284 err = "error starting btree GC thread";
285 if (bch_gc_thread_start(c))
288 for_each_cache(ca, c, i) {
289 if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
292 err = "error starting moving GC thread";
293 if (bch_moving_gc_thread_start(ca)) {
294 percpu_ref_put(&ca->ref);
299 err = "error starting tiering thread";
300 if (bch_tiering_read_start(c))
303 schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
307 __bch_fs_read_only(c);
311 const char *bch_fs_read_write(struct cache_set *c)
315 lockdep_assert_held(&bch_register_lock);
317 if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags))
320 err = __bch_fs_read_write(c);
324 percpu_ref_reinit(&c->writes);
326 clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
327 clear_bit(BCH_FS_EMERGENCY_RO, &c->flags);
328 clear_bit(BCH_FS_RO_COMPLETE, &c->flags);
329 clear_bit(BCH_FS_RO, &c->flags);
333 /* Cache set startup/shutdown: */
335 static void bch_fs_free(struct cache_set *c)
337 del_timer_sync(&c->foreground_write_wakeup);
338 cancel_delayed_work_sync(&c->pd_controllers_update);
339 cancel_work_sync(&c->read_only_work);
340 cancel_work_sync(&c->bio_submit_work);
341 cancel_work_sync(&c->read_retry_work);
343 bch_fs_encryption_free(c);
344 bch_btree_cache_free(c);
345 bch_journal_free(&c->journal);
346 bch_io_clock_exit(&c->io_clock[WRITE]);
347 bch_io_clock_exit(&c->io_clock[READ]);
348 bch_compress_free(c);
349 bch_fs_blockdev_exit(c);
350 bdi_destroy(&c->bdi);
351 lg_lock_free(&c->bucket_stats_lock);
352 free_percpu(c->bucket_stats_percpu);
353 mempool_exit(&c->btree_bounce_pool);
354 mempool_exit(&c->bio_bounce_pages);
355 bioset_exit(&c->bio_write);
356 bioset_exit(&c->bio_read_split);
357 bioset_exit(&c->bio_read);
358 bioset_exit(&c->btree_read_bio);
359 mempool_exit(&c->btree_interior_update_pool);
360 mempool_exit(&c->btree_reserve_pool);
361 mempool_exit(&c->fill_iter);
362 percpu_ref_exit(&c->writes);
365 destroy_workqueue(c->copygc_wq);
367 destroy_workqueue(c->wq);
369 kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
370 free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
372 module_put(THIS_MODULE);
376 * should be __bch_fs_stop4 - block devices are closed, now we can finally
379 void bch_fs_release(struct kobject *kobj)
381 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
382 struct completion *stop_completion = c->stop_completion;
384 bch_notify_fs_stopped(c);
385 bch_info(c, "stopped");
390 complete(stop_completion);
394 * All activity on the cache_set should have stopped now - close devices:
396 static void __bch_fs_stop3(struct closure *cl)
398 struct cache_set *c = container_of(cl, struct cache_set, cl);
402 mutex_lock(&bch_register_lock);
403 for_each_cache(ca, c, i)
407 mutex_unlock(&bch_register_lock);
409 closure_debug_destroy(&c->cl);
410 kobject_put(&c->kobj);
414	 * Openers (i.e. block devices) should have exited; shut down all userspace
415	 * interfaces and wait for &c->cl to hit 0
417 static void __bch_fs_stop2(struct closure *cl)
419 struct cache_set *c = container_of(cl, struct cache_set, caching);
421 bch_debug_exit_cache_set(c);
422 bch_fs_chardev_exit(c);
424 if (c->kobj.state_in_sysfs)
425 kobject_del(&c->kobj);
427 bch_cache_accounting_destroy(&c->accounting);
429 kobject_put(&c->time_stats);
430 kobject_put(&c->opts_dir);
431 kobject_put(&c->internal);
433 mutex_lock(&bch_register_lock);
434 bch_fs_read_only_sync(c);
435 mutex_unlock(&bch_register_lock);
441 * First phase of the shutdown process that's kicked off by bch_fs_stop(); we
442 * haven't waited for anything to stop yet, we're just punting to process
443 * context to shut down block devices:
445 static void __bch_fs_stop1(struct closure *cl)
447 struct cache_set *c = container_of(cl, struct cache_set, caching);
449 bch_blockdevs_stop(c);
451 continue_at(cl, __bch_fs_stop2, system_wq);
454 void bch_fs_stop(struct cache_set *c)
456 if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags))
457 closure_queue(&c->caching);
460 void bch_fs_stop_sync(struct cache_set *c)
462 DECLARE_COMPLETION_ONSTACK(complete);
464 c->stop_completion = &complete;
469 wait_for_completion(&complete);
472 /* Stop, detaching from backing devices: */
473 void bch_fs_detach(struct cache_set *c)
475 if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
479 static unsigned bch_fs_nr_devices(struct cache_set *c)
481 struct bch_sb_field_members *mi;
484 mutex_lock(&c->sb_lock);
485 mi = bch_sb_get_members(c->disk_sb);
487 for (i = 0; i < c->disk_sb->nr_devices; i++)
488 if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
491 mutex_unlock(&c->sb_lock);
496 static unsigned bch_fs_nr_online_devices(struct cache_set *c)
500 for (i = 0; i < c->sb.nr_devices; i++)
507 #define alloc_bucket_pages(gfp, ca) \
508 ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
510 static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
513 unsigned iter_size, journal_entry_bytes;
515 c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
519 __module_get(THIS_MODULE);
523 mutex_init(&c->sb_lock);
524 INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
525 mutex_init(&c->btree_cache_lock);
526 mutex_init(&c->bucket_lock);
527 mutex_init(&c->btree_root_lock);
528 INIT_WORK(&c->read_only_work, bch_fs_read_only_work);
530 init_rwsem(&c->gc_lock);
532 #define BCH_TIME_STAT(name, frequency_units, duration_units) \
533 spin_lock_init(&c->name##_time.lock);
537 bch_open_buckets_init(c);
538 bch_tiering_init_cache_set(c);
540 INIT_LIST_HEAD(&c->list);
541 INIT_LIST_HEAD(&c->cached_devs);
542 INIT_LIST_HEAD(&c->btree_cache);
543 INIT_LIST_HEAD(&c->btree_cache_freeable);
544 INIT_LIST_HEAD(&c->btree_cache_freed);
546 INIT_LIST_HEAD(&c->btree_interior_update_list);
547 mutex_init(&c->btree_reserve_cache_lock);
548 mutex_init(&c->btree_interior_update_lock);
550 mutex_init(&c->bio_bounce_pages_lock);
551 INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
552 spin_lock_init(&c->bio_submit_lock);
553 bio_list_init(&c->read_retry_list);
554 spin_lock_init(&c->read_retry_lock);
555 INIT_WORK(&c->read_retry_work, bch_read_retry_work);
556 mutex_init(&c->zlib_workspace_lock);
558 seqcount_init(&c->gc_pos_lock);
560 c->prio_clock[READ].hand = 1;
561 c->prio_clock[READ].min_prio = 0;
562 c->prio_clock[WRITE].hand = 1;
563 c->prio_clock[WRITE].min_prio = 0;
565 c->congested_read_threshold_us = 2000;
566 c->congested_write_threshold_us = 20000;
567 c->error_limit = 16 << IO_ERROR_SHIFT;
568 init_waitqueue_head(&c->writeback_wait);
570 c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
572 c->copy_gc_enabled = 1;
573 c->tiering_enabled = 1;
574 c->tiering_percent = 10;
576 c->foreground_target_percent = 20;
578 c->journal.write_time = &c->journal_write_time;
579 c->journal.delay_time = &c->journal_delay_time;
580 c->journal.blocked_time = &c->journal_blocked_time;
581 c->journal.flush_seq_time = &c->journal_flush_seq_time;
583 mutex_init(&c->uevent_lock);
585 mutex_lock(&c->sb_lock);
587 if (bch_sb_to_cache_set(c, sb)) {
588 mutex_unlock(&c->sb_lock);
592 mutex_unlock(&c->sb_lock);
594 scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
596 bch_opts_apply(&c->opts, bch_sb_opts(sb));
597 bch_opts_apply(&c->opts, opts);
599 c->opts.nochanges |= c->opts.noreplay;
600 c->opts.read_only |= c->opts.nochanges;
602 c->block_bits = ilog2(c->sb.block_size);
604 if (bch_fs_init_fault("fs_alloc"))
607 iter_size = (btree_blocks(c) + 1) * 2 *
608 sizeof(struct btree_node_iter_set);
610 journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
612 if (!(c->wq = alloc_workqueue("bcache",
613 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
614 !(c->copygc_wq = alloc_workqueue("bcache_copygc",
615 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
616 percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
617 mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
618 sizeof(struct btree_reserve)) ||
619 mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
620 sizeof(struct btree_interior_update)) ||
621 mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
622 bioset_init(&c->btree_read_bio, 1, 0) ||
623 bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
624 bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
625 bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
626 mempool_init_page_pool(&c->bio_bounce_pages,
628 c->sb.btree_node_size,
629 BCH_ENCODED_EXTENT_MAX) /
631 !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
632 lg_lock_init(&c->bucket_stats_lock) ||
633 mempool_init_page_pool(&c->btree_bounce_pool, 1,
634 ilog2(btree_pages(c))) ||
635 bdi_setup_and_register(&c->bdi, "bcache") ||
636 bch_fs_blockdev_init(c) ||
637 bch_io_clock_init(&c->io_clock[READ]) ||
638 bch_io_clock_init(&c->io_clock[WRITE]) ||
639 bch_journal_alloc(&c->journal, journal_entry_bytes) ||
640 bch_btree_cache_alloc(c) ||
641 bch_fs_encryption_init(c) ||
642 bch_compress_init(c) ||
643 bch_check_set_has_compressed_data(c, c->opts.compression))
646 c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
647 c->bdi.congested_fn = bch_congested_fn;
648 c->bdi.congested_data = c;
651 * Now that all allocations have succeeded, init various refcounty
652	 * things that let us shut down:
654 closure_init(&c->cl, NULL);
656 c->kobj.kset = bcache_kset;
657 kobject_init(&c->kobj, &bch_fs_ktype);
658 kobject_init(&c->internal, &bch_fs_internal_ktype);
659 kobject_init(&c->opts_dir, &bch_fs_opts_dir_ktype);
660 kobject_init(&c->time_stats, &bch_fs_time_stats_ktype);
662 bch_cache_accounting_init(&c->accounting, &c->cl);
664 closure_init(&c->caching, &c->cl);
665 set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
667 continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
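/*
 * Add a newly allocated cache set to bch_fs_list and expose it to userspace:
 * chardev, sysfs kobjects, and a sysfs entry per member device.
 */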
674 static int bch_fs_online(struct cache_set *c)
680 lockdep_assert_held(&bch_register_lock);
682 if (!list_empty(&c->list))
685 list_add(&c->list, &bch_fs_list);
687 ret = bch_fs_chardev_init(c);
691 if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
692 kobject_add(&c->internal, &c->kobj, "internal") ||
693 kobject_add(&c->opts_dir, &c->kobj, "options") ||
694 kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
695 bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
698 for_each_cache(ca, c, i)
699 if (bch_dev_online(ca)) {
700 percpu_ref_put(&ca->ref);
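/*
 * Start a cache set: either recover an existing filesystem (journal read,
 * prio read, btree roots, initial gc, journal replay, optional fsck) or
 * initialize a new one (journal buckets, fresh btree roots, root inode),
 * then go read-write unless opts.read_only is set.
 */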
707 static const char *bch_fs_start(struct cache_set *c)
709 const char *err = "cannot allocate memory";
710 struct bch_sb_field_members *mi;
718 lockdep_assert_held(&bch_register_lock);
719 BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags));
721 /* We don't want bch_fatal_error() to free underneath us */
722 closure_get(&c->caching);
725 * Make sure that each cache object's mi is up to date before
726 * we start testing it.
728 for_each_cache(ca, c, i)
729 bch_sb_from_cache_set(c, ca);
731 if (BCH_SB_INITIALIZED(c->disk_sb)) {
732 ret = bch_journal_read(c, &journal);
736 pr_debug("btree_journal_read() done");
738 j = &list_entry(journal.prev, struct journal_replay, list)->j;
740 err = "error reading priorities";
741 for_each_cache(ca, c, i) {
742 ret = bch_prio_read(ca);
744 percpu_ref_put(&ca->ref);
749 c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
750 c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
752 for_each_cache(ca, c, i) {
753 bch_recalc_min_prio(ca, READ);
754 bch_recalc_min_prio(ca, WRITE);
757 for (id = 0; id < BTREE_ID_NR; id++) {
761 err = "bad btree root";
762 k = bch_journal_find_btree_root(c, j, id, &level);
763 if (!k && id == BTREE_ID_EXTENTS)
766 pr_debug("missing btree root: %d", id);
770 err = "error reading btree root";
771 if (bch_btree_root_read(c, id, k, level))
775 bch_verbose(c, "starting mark and sweep:");
777 err = "error in recovery";
778 if (bch_initial_gc(c, &journal))
781 if (c->opts.noreplay)
784 bch_verbose(c, "mark and sweep done");
787 * bch_journal_start() can't happen sooner, or btree_gc_finish()
788 * will give spurious errors about oldest_gen > bucket_gen -
789 * this is a hack but oh well.
791 bch_journal_start(c);
793 err = "error starting allocator thread";
794 for_each_cache(ca, c, i)
795 if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
796 bch_dev_allocator_start(ca)) {
797 percpu_ref_put(&ca->ref);
801 bch_verbose(c, "starting journal replay:");
803 err = "journal replay failed";
804 ret = bch_journal_replay(c, &journal);
808 bch_verbose(c, "journal replay done");
810 if (c->opts.norecovery)
813 bch_verbose(c, "starting fsck:");
814 err = "error in fsck";
815 ret = bch_fsck(c, !c->opts.nofsck);
819 bch_verbose(c, "fsck done");
821 struct bch_inode_unpacked inode;
822 struct bkey_inode_buf packed_inode;
825 closure_init_stack(&cl);
827 bch_notice(c, "initializing new filesystem");
829 err = "unable to allocate journal buckets";
830 for_each_cache(ca, c, i)
831 if (bch_dev_journal_alloc(ca)) {
832 percpu_ref_put(&ca->ref);
836 bch_initial_gc(c, NULL);
839 * journal_res_get() will crash if called before this has
840 * set up the journal.pin FIFO and journal.cur pointer:
842 bch_journal_start(c);
843 bch_journal_set_replay_done(&c->journal);
845 err = "error starting allocator thread";
846 for_each_cache(ca, c, i)
847 if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
848 bch_dev_allocator_start(ca)) {
849 percpu_ref_put(&ca->ref);
853 err = "cannot allocate new btree root";
854 for (id = 0; id < BTREE_ID_NR; id++)
855 if (bch_btree_root_alloc(c, id, &cl)) {
860 /* Wait for new btree roots to be written: */
863 bch_inode_init(c, &inode, 0, 0,
864 S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
865 inode.inum = BCACHE_ROOT_INO;
867 bch_inode_pack(&packed_inode, &inode);
869 err = "error creating root directory";
870 if (bch_btree_insert(c, BTREE_ID_INODES,
871 &packed_inode.inode.k_i,
872 NULL, NULL, NULL, 0))
875 err = "error writing first journal entry";
876 if (bch_journal_meta(&c->journal))
880 if (c->opts.read_only) {
881 bch_fs_read_only_sync(c);
883 err = __bch_fs_read_write(c);
888 mutex_lock(&c->sb_lock);
889 mi = bch_sb_get_members(c->disk_sb);
890 now = ktime_get_seconds();
893 for_each_cache_rcu(ca, c, i)
894 mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
897 SET_BCH_SB_INITIALIZED(c->disk_sb, true);
898 SET_BCH_SB_CLEAN(c->disk_sb, false);
899 c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
902 mutex_unlock(&c->sb_lock);
904 err = "dynamic fault";
905 if (bch_fs_init_fault("fs_start"))
908 err = "error creating kobject";
909 if (bch_fs_online(c))
912 err = "can't bring up blockdev volumes";
913 if (bch_blockdev_volumes_start(c))
916 bch_debug_init_cache_set(c);
917 set_bit(BCH_FS_RUNNING, &c->flags);
918 bch_attach_backing_devs(c);
920 bch_notify_fs_read_write(c);
923 bch_journal_entries_free(&journal);
924 closure_put(&c->caching);
928 case BCH_FSCK_ERRORS_NOT_FIXED:
929 bch_err(c, "filesystem contains errors: please report this to the developers");
930 pr_cont("mount with -o fix_errors to repair");
933 case BCH_FSCK_REPAIR_UNIMPLEMENTED:
934 bch_err(c, "filesystem contains errors: please report this to the developers");
935 pr_cont("repair unimplemented: inform the developers so that it can be added");
938 case BCH_FSCK_REPAIR_IMPOSSIBLE:
939 bch_err(c, "filesystem contains errors, but repair impossible");
942 case BCH_FSCK_UNKNOWN_VERSION:
943 err = "unknown metadata version";;
946 err = "cannot allocate memory";
954 set_bit(BCH_FS_ERROR, &c->flags);
958 static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c)
960 struct bch_sb_field_members *sb_mi;
962 sb_mi = bch_sb_get_members(sb);
964 return "Invalid superblock: member info area missing";
966 if (le16_to_cpu(sb->block_size) != c->sb.block_size)
967 return "mismatched block size";
969 if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
970 BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
971 return "new cache bucket_size is too small";
976 static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
978 struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
979 struct bch_sb_field_members *dev_mi = bch_sb_get_members(sb);
980 uuid_le dev_uuid = dev_mi->members[sb->dev_idx].uuid;
983 err = bch_dev_may_add(sb, c);
987 if (bch_is_zero(&dev_uuid, sizeof(dev_uuid)))
988 return "device has been removed";
991 * When attaching an existing device, the cache set superblock must
992 * already contain member_info with a matching UUID
994 if (sb->dev_idx >= c->disk_sb->nr_devices ||
995 memcmp(&mi->members[sb->dev_idx].uuid,
996 &dev_uuid, sizeof(uuid_le)))
997 return "cache sb does not match set";
1004 bool bch_dev_read_only(struct cache *ca)
1006 struct cache_set *c = ca->set;
1007 struct bch_sb_field_members *mi;
1008 char buf[BDEVNAME_SIZE];
1010 bdevname(ca->disk_sb.bdev, buf);
1012 lockdep_assert_held(&bch_register_lock);
1014 if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
1017 if (!bch_dev_may_remove(ca)) {
1018 bch_err(c, "required member %s going RO, forcing fs RO", buf);
1019 bch_fs_read_only_sync(c);
1022 trace_bcache_cache_read_only(ca);
1024 bch_moving_gc_stop(ca);
1027 * This stops new data writes (e.g. to existing open data
1028 * buckets) and then waits for all existing writes to
1031 bch_dev_allocator_stop(ca);
1033 bch_dev_group_remove(&c->journal.devs, ca);
1036 * Device data write barrier -- no non-meta-data writes should
1037 * occur after this point. However, writes to btree buckets,
1038 * journal buckets, and the superblock can still occur.
1040 trace_bcache_cache_read_only_done(ca);
1042 bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
1043 bch_notify_dev_read_only(ca);
1045 mutex_lock(&c->sb_lock);
1046 mi = bch_sb_get_members(c->disk_sb);
1047 SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
1048 BCH_MEMBER_STATE_RO);
1050 mutex_unlock(&c->sb_lock);
1054 static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
1056 lockdep_assert_held(&bch_register_lock);
1058 if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
1061 if (test_bit(BCH_DEV_REMOVING, &ca->flags))
1064 trace_bcache_cache_read_write(ca);
1066 if (bch_dev_allocator_start(ca))
1067 return "error starting allocator thread";
1069 if (bch_moving_gc_thread_start(ca))
1070 return "error starting moving GC thread";
1072 bch_dev_group_add(&c->journal.devs, ca);
1074 wake_up_process(c->tiering_read);
1076 bch_notify_dev_read_write(ca);
1077 trace_bcache_cache_read_write_done(ca);
1082 const char *bch_dev_read_write(struct cache *ca)
1084 struct cache_set *c = ca->set;
1085 struct bch_sb_field_members *mi;
1088 err = __bch_dev_read_write(c, ca);
1092 mutex_lock(&c->sb_lock);
1093 mi = bch_sb_get_members(c->disk_sb);
1094 SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
1095 BCH_MEMBER_STATE_ACTIVE);
1097 mutex_unlock(&c->sb_lock);
1103 * bch_dev_stop has already returned, so we no longer hold the register
1104 * lock at the point this is called.
1107 void bch_dev_release(struct kobject *kobj)
1109 struct cache *ca = container_of(kobj, struct cache, kobj);
1111 percpu_ref_exit(&ca->ref);
1115 static void bch_dev_free_work(struct work_struct *work)
1117 struct cache *ca = container_of(work, struct cache, free_work);
1118 struct cache_set *c = ca->set;
1121 cancel_work_sync(&ca->io_error_work);
1123 if (c && c->kobj.state_in_sysfs) {
1126 sprintf(buf, "cache%u", ca->dev_idx);
1127 sysfs_remove_link(&c->kobj, buf);
1130 if (ca->kobj.state_in_sysfs)
1131 kobject_del(&ca->kobj);
1133 bch_free_super(&ca->disk_sb);
1136 * bch_dev_stop can be called in the middle of initialization
1137 * of the struct cache object.
1138 * As such, not all the sub-structures may be initialized.
1139 * However, they were zeroed when the object was allocated.
1142 bch_journal_free_cache(ca);
1143 free_percpu(ca->sectors_written);
1144 bioset_exit(&ca->replica_set);
1145 free_percpu(ca->bucket_stats_percpu);
1146 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1147 kfree(ca->prio_buckets);
1148 kfree(ca->bio_prio);
1149 kfree(ca->journal.bio);
1151 vfree(ca->oldest_gens);
1152 free_heap(&ca->heap);
1153 free_fifo(&ca->free_inc);
1155 for (i = 0; i < RESERVE_NR; i++)
1156 free_fifo(&ca->free[i]);
1158 kobject_put(&ca->kobj);
1161 kobject_put(&c->kobj);
1164 static void bch_dev_percpu_ref_release(struct percpu_ref *ref)
1166 struct cache *ca = container_of(ref, struct cache, ref);
1168 schedule_work(&ca->free_work);
1171 static void bch_dev_free_rcu(struct rcu_head *rcu)
1173 struct cache *ca = container_of(rcu, struct cache, free_rcu);
1176 * This decrements the ref count to ca, and once the ref count
1177	 * is 0 (outstanding bios to the ca also increment it and
1178 * decrement it on completion/error), bch_dev_percpu_ref_release
1179 * is called, and that eventually results in bch_dev_free_work
1180 * being called, which in turn results in bch_dev_release being
1183 * In particular, these functions won't be called until there are no
1184 * bios outstanding (the per-cpu ref counts are all 0), so it
1185 * is safe to remove the actual sysfs device at that point,
1186 * and that can indicate success to the user.
1189 percpu_ref_kill(&ca->ref);
1192 static void bch_dev_stop(struct cache *ca)
1194 struct cache_set *c = ca->set;
1196 lockdep_assert_held(&bch_register_lock);
1199 BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
1200 rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
1203 call_rcu(&ca->free_rcu, bch_dev_free_rcu);
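/*
 * Asynchronous device removal, queued from bch_dev_remove(): migrate data and
 * metadata off the device (or flag the data bad when forced), drop its slot
 * from the superblock's member array, and tear the struct cache down.
 */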
1206 static void bch_dev_remove_work(struct work_struct *work)
1208 struct cache *ca = container_of(work, struct cache, remove_work);
1209 struct bch_sb_field_members *mi;
1210 struct cache_set *c = ca->set;
1211 char name[BDEVNAME_SIZE];
1212 bool force = test_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
1213 unsigned dev_idx = ca->dev_idx;
1215 bdevname(ca->disk_sb.bdev, name);
1218 * Device should already be RO, now migrate data off:
1220 * XXX: locking is sketchy, bch_dev_read_write() has to check
1221 * BCH_DEV_REMOVING bit
1223 if (!ca->mi.has_data) {
1224 /* Nothing to do: */
1225 } else if (!bch_move_data_off_device(ca)) {
1226 mutex_lock(&c->sb_lock);
1227 mi = bch_sb_get_members(c->disk_sb);
1228 SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
1231 mutex_unlock(&c->sb_lock);
1233 bch_flag_data_bad(ca);
1235 mutex_lock(&c->sb_lock);
1236 mi = bch_sb_get_members(c->disk_sb);
1237 SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
1240 mutex_unlock(&c->sb_lock);
1242 bch_err(c, "Remove of %s failed, unable to migrate data off",
1244 clear_bit(BCH_DEV_REMOVING, &ca->flags);
1250 if (!ca->mi.has_metadata) {
1251 /* Nothing to do: */
1252 } else if (!bch_move_meta_data_off_device(ca)) {
1253 mutex_lock(&c->sb_lock);
1254 mi = bch_sb_get_members(c->disk_sb);
1255 SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
1258 mutex_unlock(&c->sb_lock);
1260 bch_err(c, "Remove of %s failed, unable to migrate metadata off",
1262 clear_bit(BCH_DEV_REMOVING, &ca->flags);
1267 * Ok, really doing the remove:
1268 * Drop device's prio pointer before removing it from superblock:
1270 bch_notify_dev_removed(ca);
1272 spin_lock(&c->journal.lock);
1273 c->journal.prio_buckets[dev_idx] = 0;
1274 spin_unlock(&c->journal.lock);
1276 bch_journal_meta(&c->journal);
1279 * Stop device before removing it from the cache set's list of devices -
1280 * and get our own ref on cache set since ca is going away:
1282 closure_get(&c->cl);
1284 mutex_lock(&bch_register_lock);
1288	 * RCU barrier between dropping from c->cache and dropping from
1293 lockdep_assert_held(&bch_register_lock);
1296 * Free this device's slot in the bch_member array - all pointers to
1297 * this device must be gone:
1299 mutex_lock(&c->sb_lock);
1300 mi = bch_sb_get_members(c->disk_sb);
1301 memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
1304 mutex_unlock(&c->sb_lock);
1306 mutex_unlock(&bch_register_lock);
1308 closure_put(&c->cl);
1311 bool bch_dev_remove(struct cache *ca, bool force)
1313 mutex_lock(&bch_register_lock);
1315 if (test_bit(BCH_DEV_REMOVING, &ca->flags))
1318 if (!bch_dev_may_remove(ca)) {
1319 bch_err(ca->set, "Can't remove last device in tier %u",
1321 bch_notify_dev_remove_failed(ca);
1325 /* First, go RO before we try to migrate data off: */
1326 bch_dev_read_only(ca);
1329 set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
1330 set_bit(BCH_DEV_REMOVING, &ca->flags);
1331 bch_notify_dev_removing(ca);
1333 mutex_unlock(&bch_register_lock);
1335 /* Migrate the data and finish removal asynchronously: */
1337 queue_work(system_long_wq, &ca->remove_work);
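/* Create the sysfs links between a member device and its cache set: */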
1341 static int bch_dev_online(struct cache *ca)
1345 lockdep_assert_held(&bch_register_lock);
1347 sprintf(buf, "cache%u", ca->dev_idx);
1349 if (kobject_add(&ca->kobj,
1350 &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
1352 sysfs_create_link(&ca->kobj, &ca->set->kobj, "set") ||
1353 sysfs_create_link(&ca->set->kobj, &ca->kobj, buf))
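/*
 * Allocate a struct cache from a validated superblock: size the freelists,
 * reserves and bucket arrays from the member info, then attach the device to
 * the cache set.
 */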
1359 static const char *bch_dev_alloc(struct bcache_superblock *sb,
1360 struct cache_set *c,
1363 struct bch_member *member;
1364 size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
1367 const char *err = "cannot allocate memory";
1370 if (c->sb.nr_devices == 1)
1371 bdevname(sb->bdev, c->name);
1373 if (bch_fs_init_fault("dev_alloc"))
1376 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1380 if (percpu_ref_init(&ca->ref, bch_dev_percpu_ref_release,
1386 kobject_init(&ca->kobj, &bch_dev_ktype);
1388 spin_lock_init(&ca->self.lock);
1389 ca->self.nr_devices = 1;
1390 rcu_assign_pointer(ca->self.d[0].dev, ca);
1391 ca->dev_idx = sb->sb->dev_idx;
1393 INIT_WORK(&ca->free_work, bch_dev_free_work);
1394 INIT_WORK(&ca->remove_work, bch_dev_remove_work);
1395 spin_lock_init(&ca->freelist_lock);
1396 spin_lock_init(&ca->prio_buckets_lock);
1397 mutex_init(&ca->heap_lock);
1398 bch_moving_init_cache(ca);
1401 ca->disk_sb.bdev->bd_holder = ca;
1402 memset(sb, 0, sizeof(*sb));
1404 INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
1406 err = "dynamic fault";
1407 if (bch_fs_init_fault("dev_alloc"))
1410 member = bch_sb_get_members(ca->disk_sb.sb)->members +
1411 ca->disk_sb.sb->dev_idx;
1413 ca->mi = cache_mi_to_cpu_mi(member);
1414 ca->uuid = member->uuid;
1415 ca->bucket_bits = ilog2(ca->mi.bucket_size);
1417 /* XXX: tune these */
1418 movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
1419 reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
1421 * free_inc must be smaller than the copygc reserve: if it was bigger,
1422 * one copygc iteration might not make enough buckets available to fill
1423 * up free_inc and allow the allocator to make forward progress
1425 free_inc_reserve = movinggc_reserve / 2;
1426 heap_size = movinggc_reserve * 8;
1428 if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1429 !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
1430 !init_fifo(&ca->free[RESERVE_MOVINGGC],
1431 movinggc_reserve, GFP_KERNEL) ||
1432 !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1433 !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
1434 !init_heap(&ca->heap, heap_size, GFP_KERNEL) ||
1435 !(ca->oldest_gens = vzalloc(sizeof(u8) *
1436 ca->mi.nbuckets)) ||
1437 !(ca->buckets = vzalloc(sizeof(struct bucket) *
1438 ca->mi.nbuckets)) ||
1439 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1441 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1442 !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
1443 !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
1444 bioset_init(&ca->replica_set, 4,
1445 offsetof(struct bch_write_bio, bio)) ||
1446 !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
1447 bch_journal_init_cache(ca))
1450 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1452 total_reserve = ca->free_inc.size;
1453 for (i = 0; i < RESERVE_NR; i++)
1454 total_reserve += ca->free[i].size;
1455 pr_debug("%zu buckets reserved", total_reserve);
1457 ca->copygc_write_point.group = &ca->self;
1458 ca->tiering_write_point.group = &ca->self;
1461 * Increase journal write timeout if flushes to this device are
1464 if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) &&
1465 journal_flushes_device(ca))
1466 c->journal.write_delay_ms =
1467 max(c->journal.write_delay_ms, 1000U);
1469 kobject_get(&c->kobj);
1472 kobject_get(&ca->kobj);
1473 rcu_assign_pointer(c->cache[ca->dev_idx], ca);
1475 mutex_lock(&c->sb_lock);
1477 if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb->seq))
1478 bch_sb_to_cache_set(c, ca->disk_sb.sb);
1480 mutex_unlock(&c->sb_lock);
1482 err = "error creating kobject";
1483 if (c->kobj.state_in_sysfs &&
1490 kobject_put(&ca->kobj);
1497 static struct cache_set *bch_fs_lookup(uuid_le uuid)
1499 struct cache_set *c;
1501 lockdep_assert_held(&bch_register_lock);
1503 list_for_each_entry(c, &bch_fs_list, list)
1504 if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
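/*
 * Hot-add a device to a running cache set: read and validate its superblock,
 * find a free slot in the member array (growing it if needed), copy the
 * member info into both superblocks, then allocate the device and bring it
 * read-write if appropriate.
 */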
1510 int bch_dev_add(struct cache_set *c, const char *path)
1512 struct bcache_superblock sb;
1515 struct bch_sb_field *f;
1516 struct bch_sb_field_members *mi, *dev_mi;
1517 struct bch_member saved_mi;
1518 unsigned dev_idx, nr_devices, u64s;
1521 mutex_lock(&bch_register_lock);
1523 err = bch_read_super(&sb, c->opts, path);
1525 goto err_unlock_register;
1527 err = bch_validate_cache_super(&sb);
1529 goto err_unlock_register;
1531 mutex_lock(&c->sb_lock);
1533 err = bch_dev_may_add(sb.sb, c);
1538 * Preserve the old cache member information (esp. tier)
1539 * before we start bashing the disk stuff.
1541 dev_mi = bch_sb_get_members(sb.sb);
1542 saved_mi = dev_mi->members[sb.sb->dev_idx];
1543 saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
1545 down_read(&c->gc_lock);
1547 if (dynamic_fault("bcache:add:no_slot"))
1550 if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
1553 mi = bch_sb_get_members(c->disk_sb);
1554 for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
1555 if (dev_idx >= c->sb.nr_devices ||
1556 bch_is_zero(mi->members[dev_idx].uuid.b,
1560 up_read(&c->gc_lock);
1562 err = "no slots available in superblock";
1567 up_read(&c->gc_lock);
1569 nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
1570 u64s = (sizeof(struct bch_sb_field_members) +
1571 sizeof(struct bch_member) * nr_devices) / sizeof(u64);
1572 err = "no space in superblock for member info";
1574 f = bch_fs_sb_field_resize(c, &mi->field, u64s);
1578 mi = container_of(f, struct bch_sb_field_members, field);
1580 f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s);
1584 dev_mi = container_of(f, struct bch_sb_field_members, field);
1585 memcpy(dev_mi, mi, u64s * sizeof(u64));
1586 dev_mi->members[dev_idx] = saved_mi;
1588 sb.sb->dev_idx = dev_idx;
1589 sb.sb->nr_devices = nr_devices;
1591 if (bch_fs_mi_update(c, dev_mi->members, nr_devices)) {
1592 err = "cannot allocate memory";
1597 /* commit new member info */
1598 memcpy(mi, dev_mi, u64s * sizeof(u64));
1599 c->disk_sb->nr_devices = nr_devices;
1600 c->sb.nr_devices = nr_devices;
1602 err = bch_dev_alloc(&sb, c, &ca);
1608 err = "journal alloc failed";
1609 if (bch_dev_journal_alloc(ca))
1612 bch_notify_dev_added(ca);
1614 if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
1615 err = __bch_dev_read_write(c, ca);
1620 kobject_put(&ca->kobj);
1621 mutex_unlock(&c->sb_lock);
1622 mutex_unlock(&bch_register_lock);
1627 mutex_unlock(&c->sb_lock);
1628 err_unlock_register:
1629 mutex_unlock(&bch_register_lock);
1630 bch_free_super(&sb);
1632 bch_err(c, "Unable to add device: %s", err);
1633 return ret ?: -EINVAL;
1636 const char *bch_fs_open(char * const *devices, unsigned nr_devices,
1637 struct bch_opts opts, struct cache_set **ret)
1640 struct cache_set *c = NULL;
1641 struct bcache_superblock *sb;
1645 memset(&uuid, 0, sizeof(uuid_le));
1648 return "need at least one device";
1650 if (!try_module_get(THIS_MODULE))
1651 return "module unloading";
1653 err = "cannot allocate memory";
1654 sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
1659 * bch_read_super() needs to happen under register_lock, so that the
1660 * exclusive open is atomic with adding the new cache set to the list of
1663 mutex_lock(&bch_register_lock);
1665 for (i = 0; i < nr_devices; i++) {
1666 err = bch_read_super(&sb[i], opts, devices[i]);
1670 err = "attempting to register backing device";
1671 if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
1674 err = bch_validate_cache_super(&sb[i]);
1679 err = "cache set already registered";
1680 if (bch_fs_lookup(sb->sb->uuid))
1683 err = "cannot allocate memory";
1684 c = bch_fs_alloc(sb[0].sb, opts);
1688 for (i = 0; i < nr_devices; i++) {
1689 err = bch_dev_alloc(&sb[i], c, NULL);
1694 err = "insufficient devices";
1695 if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c))
1698 err = bch_fs_start(c);
1702 err = "error creating kobject";
1703 if (bch_fs_online(c))
1707 closure_get(&c->cl);
1711 mutex_unlock(&bch_register_lock);
1716 module_put(THIS_MODULE);
1723 mutex_unlock(&bch_register_lock);
1725 for (i = 0; i < nr_devices; i++)
1726 bch_free_super(&sb[i]);
1730 static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
1731 struct bch_opts opts)
1733 char name[BDEVNAME_SIZE];
1735 struct cache_set *c;
1736 bool allocated_cache_set = false;
1738 err = bch_validate_cache_super(sb);
1742 bdevname(sb->bdev, name);
1744 c = bch_fs_lookup(sb->sb->uuid);
1746 err = bch_dev_in_fs(sb->sb, c);
1750 c = bch_fs_alloc(sb->sb, opts);
1752 return "cannot allocate memory";
1754 allocated_cache_set = true;
1757 err = bch_dev_alloc(sb, c, NULL);
1761 if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) {
1762 err = bch_fs_start(c);
1766 err = "error creating kobject";
1767 if (bch_fs_online(c))
1771 bch_info(c, "started");
1774 if (allocated_cache_set)
1779 const char *bch_fs_open_incremental(const char *path)
1781 struct bcache_superblock sb;
1782 struct bch_opts opts = bch_opts_empty();
1785 mutex_lock(&bch_register_lock);
1787 err = bch_read_super(&sb, opts, path);
1791 if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
1792 err = bch_backing_dev_register(&sb);
1794 err = __bch_fs_open_incremental(&sb, opts);
1796 bch_free_super(&sb);
1798 mutex_unlock(&bch_register_lock);
1802 /* Global interfaces/init */
1804 #define kobj_attribute_write(n, fn) \
1805 static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
1807 #define kobj_attribute_rw(n, show, store) \
1808 static struct kobj_attribute ksysfs_##n = \
1809 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
1811 static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1812 const char *, size_t);
1814 kobj_attribute_write(register, register_bcache);
1815 kobj_attribute_write(register_quiet, register_bcache);
1817 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1818 const char *buffer, size_t size)
1820 ssize_t ret = -EINVAL;
1821 const char *err = "cannot allocate memory";
1824 if (!try_module_get(THIS_MODULE))
1827 if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
1830 err = bch_fs_open_incremental(strim(path));
1837 module_put(THIS_MODULE);
1840 pr_err("error opening %s: %s", path, err);
1844 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
1846 if (code == SYS_DOWN ||
1848 code == SYS_POWER_OFF) {
1849 struct cache_set *c;
1851 mutex_lock(&bch_register_lock);
1853 if (!list_empty(&bch_fs_list))
1854 pr_info("Setting all devices read only:");
1856 list_for_each_entry(c, &bch_fs_list, list)
1857 bch_fs_read_only(c);
1859 list_for_each_entry(c, &bch_fs_list, list)
1860 bch_fs_read_only_sync(c);
1862 mutex_unlock(&bch_register_lock);
1868 static struct notifier_block reboot = {
1869 .notifier_call = bcache_reboot,
1870 .priority = INT_MAX, /* before any real devices */
1873 static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
1874 const char *buffer, size_t size)
1876 bcache_reboot(NULL, SYS_DOWN, NULL);
1880 kobj_attribute_write(reboot, reboot_test);
1882 static void bcache_exit(void)
1886 bch_blockdev_exit();
1889 kset_unregister(bcache_kset);
1891 destroy_workqueue(bcache_io_wq);
1892 if (!IS_ERR_OR_NULL(bch_sha256))
1893 crypto_free_shash(bch_sha256);
1894 unregister_reboot_notifier(&reboot);
1897 static int __init bcache_init(void)
1899 static const struct attribute *files[] = {
1900 &ksysfs_register.attr,
1901 &ksysfs_register_quiet.attr,
1902 &ksysfs_reboot.attr,
1906 mutex_init(&bch_register_lock);
1907 register_reboot_notifier(&reboot);
1908 closure_debug_init();
1911 bch_sha256 = crypto_alloc_shash("sha256", 0, 0);
1912 if (IS_ERR(bch_sha256))
1915 if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
1916 !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
1917 sysfs_create_files(&bcache_kset->kobj, files) ||
1918 bch_chardev_init() ||
1919 bch_blockdev_init() ||
1930 #define BCH_DEBUG_PARAM(name, description) \
1932 module_param_named(name, bch_##name, bool, 0644); \
1933 MODULE_PARM_DESC(name, description);
1935 #undef BCH_DEBUG_PARAM
1937 module_exit(bcache_exit);
1938 module_init(bcache_init);