/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "btree_cache.h"
#include "btree_update.h"

#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
#include <crypto/hash.h>

#include <trace/events/bcachefs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

#define KTYPE(type)							\
struct kobj_type type ## _ktype = {					\
	.release	= type ## _release,				\
	.sysfs_ops	= &type ## _sysfs_ops,				\
	.default_attrs	= type ## _files				\
}
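
/*
 * For example, KTYPE(bch2_fs) below expands to a kobj_type wired up to the
 * bch2_fs_release/bch2_fs_sysfs_ops/bch2_fs_files symbols defined elsewhere
 * in this file and in sysfs.c:
 *
 *	struct kobj_type bch2_fs_ktype = {
 *		.release	= bch2_fs_release,
 *		.sysfs_ops	= &bch2_fs_sysfs_ops,
 *		.default_attrs	= bch2_fs_files,
 *	};
 */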

static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);

static void bch2_fs_internal_release(struct kobject *k)
{
}

static void bch2_fs_opts_dir_release(struct kobject *k)
{
}

static void bch2_fs_time_stats_release(struct kobject *k)
{
}

static KTYPE(bch2_fs);
static KTYPE(bch2_fs_internal);
static KTYPE(bch2_fs_opts_dir);
static KTYPE(bch2_fs_time_stats);
static KTYPE(bch2_dev);

static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);

static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);

static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_dev *);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
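
/*
 * Answers "which filesystem owns this block device?" by walking every member
 * device of every filesystem on bch_fs_list, under bch_fs_list_lock.
 */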

struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev)
{
	struct bch_fs *c;
	struct bch_dev *ca;
	unsigned i;

	mutex_lock(&bch_fs_list_lock);

	list_for_each_entry(c, &bch_fs_list, list)
		for_each_member_device_rcu(ca, c, i)
			if (ca->disk_sb.bdev == bdev) {
				closure_get(&c->cl);
				goto found;
			}
	c = NULL;
found:
	mutex_unlock(&bch_fs_list_lock);

	return c;
}

static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid)
{
	struct bch_fs *c;

	lockdep_assert_held(&bch_fs_list_lock);

	list_for_each_entry(c, &bch_fs_list, list)
		if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
			return c;

	return NULL;
}

struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
{
	struct bch_fs *c;

	mutex_lock(&bch_fs_list_lock);
	c = __bch2_uuid_to_fs(uuid);
	mutex_unlock(&bch_fs_list_lock);

	return c;
}
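
/*
 * Congestion is reported differently for reads and writes: reads can be
 * served by any device with a copy of the data, so all readable members are
 * checked; writes go to the fastest tier first, so only that tier's devices
 * (or all devices, if there are no tiers) matter.
 */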

int bch2_congested(struct bch_fs *c, int bdi_bits)
{
	struct backing_dev_info *bdi;
	struct bch_dev *ca;
	unsigned i;
	int ret = 0;

	if (bdi_bits & (1 << WB_sync_congested)) {
		/* Reads - check all devices: */
		for_each_readable_member(ca, c, i) {
			bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);

			if (bdi_congested(bdi, bdi_bits)) {
				ret = 1;
				break;
			}
		}
	} else {
		/* Writes prefer fastest tier: */
		struct bch_tier *tier = READ_ONCE(c->fastest_tier);
		struct dev_group *grp = tier ? &tier->devs : &c->all_devs;

		rcu_read_lock();
		group_for_each_dev(ca, grp, i) {
			bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);

			if (bdi_congested(bdi, bdi_bits)) {
				ret = 1;
				break;
			}
		}
		rcu_read_unlock();
	}

	return ret;
}

static int bch2_congested_fn(void *data, int bdi_bits)
{
	struct bch_fs *c = data;

	return bch2_congested(c, bdi_bits);
}

/* Filesystem RO/RW: */

/*
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and tiering (to free up space)
 *
 * - copygc and tiering depend on mark and sweep gc (they actually probably
 *   don't, because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */
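
/*
 * Hence the teardown order in __bch2_fs_read_only() below is the reverse of
 * that dependency chain: tiering first, then copygc, then gc, then the
 * allocator threads, and the journal last.
 */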

static void __bch2_fs_read_only(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	bch2_tiering_stop(c);

	for_each_member_device(ca, c, i)
		bch2_moving_gc_stop(ca);

	bch2_gc_thread_stop(c);

	for_each_member_device(ca, c, i)
		bch2_dev_allocator_stop(ca);

	bch2_fs_journal_stop(&c->journal);
}

static void bch2_writes_disabled(struct percpu_ref *writes)
{
	struct bch_fs *c = container_of(writes, struct bch_fs, writes);

	set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
	wake_up(&bch_read_only_wait);
}

void bch2_fs_read_only(struct bch_fs *c)
{
	mutex_lock(&c->state_lock);
	if (c->state != BCH_FS_STARTING &&
	    c->state != BCH_FS_RW)
		goto out;

	if (test_bit(BCH_FS_ERROR, &c->flags))
		goto out;

	/*
	 * Block new foreground-end write operations from starting - any new
	 * writes will return -EROFS:
	 *
	 * (This is really blocking new _allocations_; writes to previously
	 * allocated space can still happen until stopping the allocator in
	 * bch2_dev_allocator_stop()).
	 */
	percpu_ref_kill(&c->writes);

	del_timer(&c->foreground_write_wakeup);
	cancel_delayed_work(&c->pd_controllers_update);

	c->foreground_write_pd.rate.rate = UINT_MAX;
	bch2_wake_delayed_writes((unsigned long) c);

	/*
	 * If we're not doing an emergency shutdown, we want to wait on
	 * outstanding writes to complete so they don't see spurious errors due
	 * to shutting down the allocator:
	 *
	 * If we are doing an emergency shutdown, outstanding writes may hang
	 * until we shut down the allocator, so we don't want to wait on
	 * outstanding writes before shutting everything down - but we do need
	 * to wait on them before returning and signalling that going RO is
	 * complete:
	 */
	wait_event(bch_read_only_wait,
		   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
		   test_bit(BCH_FS_EMERGENCY_RO, &c->flags));

	__bch2_fs_read_only(c);

	wait_event(bch_read_only_wait,
		   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));

	clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);

	if (!bch2_journal_error(&c->journal) &&
	    !test_bit(BCH_FS_ERROR, &c->flags)) {
		mutex_lock(&c->sb_lock);
		SET_BCH_SB_CLEAN(c->disk_sb, true);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	c->state = BCH_FS_RO;
out:
	mutex_unlock(&c->state_lock);
}

static void bch2_fs_read_only_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, read_only_work);

	bch2_fs_read_only(c);
}

static void bch2_fs_read_only_async(struct bch_fs *c)
{
	queue_work(system_long_wq, &c->read_only_work);
}

bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);

	bch2_fs_read_only_async(c);
	bch2_journal_halt(&c->journal);

	wake_up(&bch_read_only_wait);
	return ret;
}
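
/*
 * Going RW starts things in dependency order - allocator threads first, then
 * btree gc, moving gc and tiering - the mirror image of __bch2_fs_read_only().
 */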

const char *bch2_fs_read_write(struct bch_fs *c)
{
	struct bch_dev *ca;
	const char *err = NULL;
	unsigned i;

	mutex_lock(&c->state_lock);
	if (c->state != BCH_FS_STARTING &&
	    c->state != BCH_FS_RO)
		goto out;

	err = "error starting allocator thread";
	for_each_rw_member(ca, c, i)
		if (bch2_dev_allocator_start(ca)) {
			percpu_ref_put(&ca->io_ref);
			goto err;
		}

	err = "error starting btree GC thread";
	if (bch2_gc_thread_start(c))
		goto err;

	err = "error starting moving GC thread";
	for_each_rw_member(ca, c, i)
		if (bch2_moving_gc_start(ca)) {
			percpu_ref_put(&ca->io_ref);
			goto err;
		}

	err = "error starting tiering thread";
	if (bch2_tiering_start(c))
		goto err;

	schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);

	if (c->state != BCH_FS_STARTING)
		percpu_ref_reinit(&c->writes);

	c->state = BCH_FS_RW;
	err = NULL;
out:
	mutex_unlock(&c->state_lock);
	return err;
err:
	__bch2_fs_read_only(c);
	goto out;
}

/* Filesystem startup/shutdown: */

static void bch2_fs_free(struct bch_fs *c)
{
	bch2_fs_encryption_exit(c);
	bch2_fs_btree_exit(c);
	bch2_fs_journal_exit(&c->journal);
	bch2_io_clock_exit(&c->io_clock[WRITE]);
	bch2_io_clock_exit(&c->io_clock[READ]);
	bch2_fs_compress_exit(c);
	bdi_destroy(&c->bdi);
	lg_lock_free(&c->usage_lock);
	free_percpu(c->usage_percpu);
	mempool_exit(&c->btree_bounce_pool);
	mempool_exit(&c->bio_bounce_pages);
	bioset_exit(&c->bio_write);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
	bioset_exit(&c->btree_read_bio);
	mempool_exit(&c->btree_interior_update_pool);
	mempool_exit(&c->btree_reserve_pool);
	mempool_exit(&c->fill_iter);
	percpu_ref_exit(&c->writes);

	if (c->copygc_wq)
		destroy_workqueue(c->copygc_wq);
	if (c->wq)
		destroy_workqueue(c->wq);

	free_pages((unsigned long) c->disk_sb, c->disk_sb_order);

	module_put(THIS_MODULE);
}

static void bch2_fs_exit(struct bch_fs *c)
{
	unsigned i;

	del_timer_sync(&c->foreground_write_wakeup);
	cancel_delayed_work_sync(&c->pd_controllers_update);
	cancel_work_sync(&c->read_only_work);
	cancel_work_sync(&c->read_retry_work);

	for (i = 0; i < c->sb.nr_devices; i++)
		if (c->devs[i])
			bch2_dev_free(c->devs[i]);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void bch2_fs_offline(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	mutex_lock(&bch_fs_list_lock);
	list_del(&c->list);
	mutex_unlock(&bch_fs_list_lock);

	for_each_member_device(ca, c, i)
		if (ca->kobj.state_in_sysfs &&
		    ca->disk_sb.bdev)
			sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
					  "bcachefs");

	if (c->kobj.state_in_sysfs)
		kobject_del(&c->kobj);

	bch2_fs_debug_exit(c);
	bch2_fs_chardev_exit(c);

	kobject_put(&c->time_stats);
	kobject_put(&c->opts_dir);
	kobject_put(&c->internal);

	__bch2_fs_read_only(c);
}

static void bch2_fs_release(struct kobject *kobj)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

	bch2_fs_free(c);
	kfree(c);
}

void bch2_fs_stop(struct bch_fs *c)
{
	mutex_lock(&c->state_lock);
	BUG_ON(c->state == BCH_FS_STOPPING);
	c->state = BCH_FS_STOPPING;
	mutex_unlock(&c->state_lock);

	bch2_fs_offline(c);

	closure_sync(&c->cl);

	bch2_fs_exit(c);
}
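
/*
 * Builds the in-memory bch_fs from an on-disk superblock: pure allocation
 * and initialization, no I/O and no sysfs registration - that happens later,
 * in bch2_fs_online().
 */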

static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
	struct bch_sb_field_members *mi;
	struct bch_fs *c;
	unsigned i, iter_size;

	c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL);
	if (!c)
		return NULL;

	__module_get(THIS_MODULE);

	mutex_init(&c->state_lock);
	mutex_init(&c->sb_lock);
	mutex_init(&c->btree_cache_lock);
	mutex_init(&c->bucket_lock);
	mutex_init(&c->btree_root_lock);
	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

	init_rwsem(&c->gc_lock);

#define BCH_TIME_STAT(name, frequency_units, duration_units)		\
	spin_lock_init(&c->name##_time.lock);
	BCH_TIME_STATS()
#undef BCH_TIME_STAT

	bch2_fs_allocator_init(c);
	bch2_fs_tiering_init(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);

	INIT_LIST_HEAD(&c->btree_interior_update_list);
	mutex_init(&c->btree_reserve_cache_lock);
	mutex_init(&c->btree_interior_update_lock);

	mutex_init(&c->bio_bounce_pages_lock);
	bio_list_init(&c->read_retry_list);
	spin_lock_init(&c->read_retry_lock);
	INIT_WORK(&c->read_retry_work, bch2_read_retry_work);
	mutex_init(&c->zlib_workspace_lock);

	seqcount_init(&c->gc_pos_lock);

	c->prio_clock[READ].hand = 1;
	c->prio_clock[READ].min_prio = 0;
	c->prio_clock[WRITE].hand = 1;
	c->prio_clock[WRITE].min_prio = 0;

	init_waitqueue_head(&c->writeback_wait);
	c->writeback_pages_max = (256 << 10) / PAGE_SIZE;

	c->copy_gc_enabled = 1;
	c->tiering_enabled = 1;
	c->tiering_percent = 10;

	c->foreground_target_percent = 20;

	c->journal.write_time	= &c->journal_write_time;
	c->journal.delay_time	= &c->journal_delay_time;
	c->journal.blocked_time	= &c->journal_blocked_time;
	c->journal.flush_seq_time = &c->journal_flush_seq_time;

	mutex_lock(&c->sb_lock);

	if (bch2_sb_to_fs(c, sb)) {
		mutex_unlock(&c->sb_lock);
		goto err;
	}

	mutex_unlock(&c->sb_lock);

	scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);

	bch2_opts_apply(&c->opts, bch2_sb_opts(sb));
	bch2_opts_apply(&c->opts, opts);

	c->opts.nochanges	|= c->opts.noreplay;
	c->opts.read_only	|= c->opts.nochanges;

	c->block_bits = ilog2(c->sb.block_size);

	if (bch2_fs_init_fault("fs_alloc"))
		goto err;

	iter_size = (btree_blocks(c) + 1) * 2 *
		sizeof(struct btree_node_iter_set);

	if (!(c->wq = alloc_workqueue("bcachefs",
				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
	    !(c->copygc_wq = alloc_workqueue("bcache_copygc",
				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
	    percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
	    mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
				      sizeof(struct btree_reserve)) ||
	    mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
				      sizeof(struct btree_interior_update)) ||
	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
	    bioset_init(&c->btree_read_bio, 1, 0) ||
	    bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
	    bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
	    bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
	    mempool_init_page_pool(&c->bio_bounce_pages,
				   max_t(unsigned,
					 c->sb.btree_node_size,
					 BCH_ENCODED_EXTENT_MAX) /
				   PAGE_SECTORS, 0) ||
	    !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
	    lg_lock_init(&c->usage_lock) ||
	    mempool_init_page_pool(&c->btree_bounce_pool, 1,
				   ilog2(btree_pages(c))) ||
	    bdi_setup_and_register(&c->bdi, "bcachefs") ||
	    bch2_io_clock_init(&c->io_clock[READ]) ||
	    bch2_io_clock_init(&c->io_clock[WRITE]) ||
	    bch2_fs_journal_init(&c->journal) ||
	    bch2_fs_btree_init(c) ||
	    bch2_fs_encryption_init(c) ||
	    bch2_fs_compress_init(c) ||
	    bch2_check_set_has_compressed_data(c, c->opts.compression))
		goto err;

	c->bdi.ra_pages		= VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
	c->bdi.congested_fn	= bch2_congested_fn;
	c->bdi.congested_data	= c;

	mi = bch2_sb_get_members(c->disk_sb);
	for (i = 0; i < c->sb.nr_devices; i++)
		if (!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
		    bch2_dev_alloc(c, i))
			goto err;

	/*
	 * Now that all allocations have succeeded, init various refcounty
	 * things that let us shutdown:
	 */
	closure_init(&c->cl, NULL);

	c->kobj.kset = bcachefs_kset;
	kobject_init(&c->kobj, &bch2_fs_ktype);
	kobject_init(&c->internal, &bch2_fs_internal_ktype);
	kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
	kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);

	return c;
err:
	bch2_fs_free(c);
	kfree(c);
	return NULL;
}

static const char *__bch2_fs_online(struct bch_fs *c)
{
	struct bch_dev *ca;
	const char *err = NULL;
	unsigned i;
	int ret;

	lockdep_assert_held(&bch_fs_list_lock);

	if (!list_empty(&c->list))
		return NULL;

	if (__bch2_uuid_to_fs(c->sb.uuid))
		return "filesystem UUID already open";

	ret = bch2_fs_chardev_init(c);
	if (ret)
		return "error creating character device";

	bch2_fs_debug_init(c);

	if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
	    kobject_add(&c->internal, &c->kobj, "internal") ||
	    kobject_add(&c->opts_dir, &c->kobj, "options") ||
	    kobject_add(&c->time_stats, &c->kobj, "time_stats"))
		return "error creating sysfs objects";

	mutex_lock(&c->state_lock);

	err = "error creating sysfs objects";
	__for_each_member_device(ca, c, i)
		if (bch2_dev_sysfs_online(ca))
			goto err;

	list_add(&c->list, &bch_fs_list);
	err = NULL;
err:
	mutex_unlock(&c->state_lock);
	return err;
}

static const char *bch2_fs_online(struct bch_fs *c)
{
	const char *err;

	mutex_lock(&bch_fs_list_lock);
	err = __bch2_fs_online(c);
	mutex_unlock(&bch_fs_list_lock);

	return err;
}
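
/*
 * Startup has two paths: if the superblock says the filesystem was already
 * initialized, read the journal, recover btree roots, run mark and sweep gc,
 * replay the journal and optionally fsck; otherwise build a fresh filesystem
 * - allocate journal buckets and btree roots and create the root directory
 * inode.
 */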

static const char *__bch2_fs_start(struct bch_fs *c)
{
	const char *err = "cannot allocate memory";
	struct bch_sb_field_members *mi;
	struct bch_dev *ca;
	unsigned i;
	enum btree_id id;
	time64_t now;
	LIST_HEAD(journal);
	struct jset *j;
	int ret = 0;

	BUG_ON(c->state != BCH_FS_STARTING);

	mutex_lock(&c->sb_lock);
	for_each_online_member(ca, c, i)
		bch2_sb_from_fs(c, ca);
	mutex_unlock(&c->sb_lock);

	if (BCH_SB_INITIALIZED(c->disk_sb)) {
		ret = bch2_journal_read(c, &journal);
		if (ret)
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
		c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);

		err = "error reading priorities";
		for_each_readable_member(ca, c, i) {
			ret = bch2_prio_read(ca);
			if (ret) {
				percpu_ref_put(&ca->io_ref);
				goto err;
			}
		}

		for (id = 0; id < BTREE_ID_NR; id++) {
			unsigned level;
			struct bkey_i *k;

			err = "bad btree root";
			k = bch2_journal_find_btree_root(c, j, id, &level);
			if (!k && id == BTREE_ID_EXTENTS)
				goto err;
			if (!k) {
				pr_debug("missing btree root: %d", id);
				continue;
			}

			err = "error reading btree root";
			if (bch2_btree_root_read(c, id, k, level))
				goto err;
		}

		bch_verbose(c, "starting mark and sweep:");

		err = "error in recovery";
		ret = bch2_initial_gc(c, &journal);
		if (ret)
			goto err;

		if (c->opts.noreplay)
			goto recovery_done;

		bch_verbose(c, "mark and sweep done");

		/*
		 * bch2_journal_start() can't happen sooner, or btree_gc_finish()
		 * will give spurious errors about oldest_gen > bucket_gen -
		 * this is a hack but oh well.
		 */
		bch2_journal_start(c);

		err = "error starting allocator thread";
		for_each_rw_member(ca, c, i)
			if (bch2_dev_allocator_start(ca)) {
				percpu_ref_put(&ca->io_ref);
				goto err;
			}

		bch_verbose(c, "starting journal replay:");

		err = "journal replay failed";
		ret = bch2_journal_replay(c, &journal);
		if (ret)
			goto err;

		bch_verbose(c, "journal replay done");

		if (c->opts.norecovery)
			goto recovery_done;

		bch_verbose(c, "starting fsck:");
		err = "error in fsck";
		ret = bch2_fsck(c, !c->opts.nofsck);
		if (ret)
			goto err;

		for_each_rw_member(ca, c, i)
			if (ca->need_prio_write) {
				ret = bch2_prio_write(ca);
				if (ret) {
					percpu_ref_put(&ca->io_ref);
					goto err;
				}
			}

		bch_verbose(c, "fsck done");
	} else {
		struct bch_inode_unpacked inode;
		struct bkey_inode_buf packed_inode;
		struct closure cl;

		closure_init_stack(&cl);

		bch_notice(c, "initializing new filesystem");

		ret = bch2_initial_gc(c, &journal);
		if (ret)
			goto err;

		err = "unable to allocate journal buckets";
		for_each_rw_member(ca, c, i)
			if (bch2_dev_journal_alloc(ca)) {
				percpu_ref_put(&ca->io_ref);
				goto err;
			}

		/*
		 * journal_res_get() will crash if called before this has
		 * set up the journal.pin FIFO and journal.cur pointer:
		 */
		bch2_journal_start(c);
		bch2_journal_set_replay_done(&c->journal);

		err = "error starting allocator thread";
		for_each_rw_member(ca, c, i)
			if (bch2_dev_allocator_start(ca)) {
				percpu_ref_put(&ca->io_ref);
				goto err;
			}

		err = "cannot allocate new btree root";
		for (id = 0; id < BTREE_ID_NR; id++)
			if (bch2_btree_root_alloc(c, id, &cl)) {
				closure_sync(&cl);
				goto err;
			}

		/* Wait for new btree roots to be written: */
		closure_sync(&cl);

		bch2_inode_init(c, &inode, 0, 0,
				S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
		inode.inum = BCACHE_ROOT_INO;

		bch2_inode_pack(&packed_inode, &inode);

		err = "error creating root directory";
		if (bch2_btree_insert(c, BTREE_ID_INODES,
				      &packed_inode.inode.k_i,
				      NULL, NULL, NULL, 0))
			goto err;

		err = "error writing first journal entry";
		if (bch2_journal_meta(&c->journal))
			goto err;
	}
recovery_done:
	err = "dynamic fault";
	if (bch2_fs_init_fault("fs_start"))
		goto err;

	if (c->opts.read_only) {
		bch2_fs_read_only(c);
	} else {
		err = bch2_fs_read_write(c);
		if (err)
			goto err;
	}

	mutex_lock(&c->sb_lock);
	mi = bch2_sb_get_members(c->disk_sb);
	now = ktime_get_seconds();

	for_each_member_device(ca, c, i)
		mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);

	SET_BCH_SB_INITIALIZED(c->disk_sb, true);
	SET_BCH_SB_CLEAN(c->disk_sb, false);
	c->disk_sb->version = BCACHE_SB_VERSION_CDEV;

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	err = NULL;
out:
	bch2_journal_entries_free(&journal);
	return err;
err:
	switch (ret) {
	case BCH_FSCK_ERRORS_NOT_FIXED:
		bch_err(c, "filesystem contains errors: please report this to the developers");
		pr_cont("mount with -o fix_errors to repair");
		err = "fsck error";
		break;
	case BCH_FSCK_REPAIR_UNIMPLEMENTED:
		bch_err(c, "filesystem contains errors: please report this to the developers");
		pr_cont("repair unimplemented: inform the developers so that it can be added");
		err = "fsck error";
		break;
	case BCH_FSCK_REPAIR_IMPOSSIBLE:
		bch_err(c, "filesystem contains errors, but repair impossible");
		err = "fsck error";
		break;
	case BCH_FSCK_UNKNOWN_VERSION:
		err = "unknown metadata version";
		break;
	case -ENOMEM:
		err = "cannot allocate memory";
		break;
	}

	set_bit(BCH_FS_ERROR, &c->flags);
	goto out;
}

const char *bch2_fs_start(struct bch_fs *c)
{
	return __bch2_fs_start(c) ?: bch2_fs_online(c);
}

static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
	struct bch_sb_field_members *sb_mi;

	sb_mi = bch2_sb_get_members(sb);
	if (!sb_mi)
		return "Invalid superblock: member info area missing";

	if (le16_to_cpu(sb->block_size) != c->sb.block_size)
		return "mismatched block size";

	if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
	    BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
		return "new cache bucket size is too small";

	return NULL;
}

static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
{
	struct bch_sb *newest =
		le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
	struct bch_sb_field_members *mi = bch2_sb_get_members(newest);

	if (uuid_le_cmp(fs->uuid, sb->uuid))
		return "device not a member of filesystem";

	if (sb->dev_idx >= newest->nr_devices)
		return "device has invalid dev_idx";

	if (bch2_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
		return "device has been removed";

	if (fs->block_size != sb->block_size)
		return "mismatched block size";

	return NULL;
}

/* Device startup/shutdown: */
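
/*
 * A bch_dev has two percpu refcounts: ca->ref pins the bch_dev itself (it's
 * dropped for good in bch2_dev_stop()), while ca->io_ref gates I/O to the
 * device and is killed when the device is taken offline.
 */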

static void bch2_dev_release(struct kobject *kobj)
{
	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);

	kfree(ca);
}

static void bch2_dev_free(struct bch_dev *ca)
{
	unsigned i;

	cancel_work_sync(&ca->io_error_work);

	if (ca->kobj.state_in_sysfs &&
	    ca->disk_sb.bdev)
		sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
				  "bcachefs");

	if (ca->kobj.state_in_sysfs)
		kobject_del(&ca->kobj);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_journal_exit(ca);

	free_percpu(ca->sectors_written);
	bioset_exit(&ca->replica_set);
	free_percpu(ca->usage_percpu);
	kvpfree(ca->disk_buckets, bucket_bytes(ca));
	kfree(ca->prio_buckets);

	vfree(ca->buckets);
	vfree(ca->oldest_gens);
	free_heap(&ca->heap);
	free_fifo(&ca->free_inc);

	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);

	percpu_ref_exit(&ca->io_ref);
	percpu_ref_exit(&ca->ref);
	kobject_put(&ca->kobj);
}

static void bch2_dev_io_ref_release(struct percpu_ref *ref)
{
	struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);

	complete(&ca->offline_complete);
}

static void __bch2_dev_offline(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;

	lockdep_assert_held(&c->state_lock);

	__bch2_dev_read_only(ca->fs, ca);

	reinit_completion(&ca->offline_complete);
	percpu_ref_kill(&ca->io_ref);
	wait_for_completion(&ca->offline_complete);

	if (ca->kobj.state_in_sysfs) {
		struct kobject *block =
			&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;

		sysfs_remove_link(block, "bcachefs");
		sysfs_remove_link(&ca->kobj, "block");
	}

	bch2_free_super(&ca->disk_sb);
	bch2_dev_journal_exit(ca);
}

static void bch2_dev_ref_release(struct percpu_ref *ref)
{
	struct bch_dev *ca = container_of(ref, struct bch_dev, ref);

	complete(&ca->stop_complete);
}

static void bch2_dev_stop(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;

	lockdep_assert_held(&c->state_lock);

	BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
	rcu_assign_pointer(c->devs[ca->dev_idx], NULL);

	synchronize_rcu();

	reinit_completion(&ca->stop_complete);
	percpu_ref_kill(&ca->ref);
	wait_for_completion(&ca->stop_complete);
}

static int bch2_dev_sysfs_online(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;
	int ret;

	if (!c->kobj.state_in_sysfs)
		return 0;

	if (!ca->kobj.state_in_sysfs) {
		ret = kobject_add(&ca->kobj, &ca->fs->kobj,
				  "dev-%u", ca->dev_idx);
		if (ret)
			return ret;
	}

	if (ca->disk_sb.bdev) {
		struct kobject *block =
			&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;

		ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
		if (ret)
			return ret;
		ret = sysfs_create_link(&ca->kobj, block, "block");
		if (ret)
			return ret;
	}

	return 0;
}
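
/*
 * Creates the in-memory bch_dev for one member slot: the freelists and
 * reserves are sized from the member's bucket count, and the device starts
 * with io_ref dead - it's brought online separately, once a superblock and
 * block device are attached.
 */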

static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
	struct bch_member *member;
	size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
	size_t heap_size;
	struct bch_dev *ca = NULL;
	unsigned i;

	if (bch2_fs_init_fault("dev_alloc"))
		goto err;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca)
		goto err;

	kobject_init(&ca->kobj, &bch2_dev_ktype);
	init_completion(&ca->stop_complete);
	init_completion(&ca->offline_complete);

	spin_lock_init(&ca->self.lock);
	ca->self.nr = 1;
	rcu_assign_pointer(ca->self.d[0].dev, ca);
	ca->dev_idx = dev_idx;

	spin_lock_init(&ca->freelist_lock);
	spin_lock_init(&ca->prio_buckets_lock);
	mutex_init(&ca->heap_lock);
	mutex_init(&ca->prio_write_lock);
	bch2_dev_moving_gc_init(ca);

	INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work);

	if (bch2_fs_init_fault("dev_alloc"))
		goto err;

	member = bch2_sb_get_members(c->disk_sb)->members + dev_idx;

	ca->mi = bch2_mi_to_cpu(member);
	ca->uuid = member->uuid;
	ca->bucket_bits = ilog2(ca->mi.bucket_size);
	scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);

	/* XXX: tune these */
	movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
	reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
	/*
	 * free_inc must be smaller than the copygc reserve: if it was bigger,
	 * one copygc iteration might not make enough buckets available to fill
	 * up free_inc and allow the allocator to make forward progress
	 */
	free_inc_reserve = movinggc_reserve / 2;
	heap_size = movinggc_reserve * 8;

	if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
			    0, GFP_KERNEL) ||
	    percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
			    PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_MOVINGGC],
		       movinggc_reserve, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
	    !init_heap(&ca->heap, heap_size, GFP_KERNEL) ||
	    !(ca->oldest_gens = vzalloc(sizeof(u8) *
					ca->mi.nbuckets)) ||
	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
				    ca->mi.nbuckets)) ||
	    !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
	    !(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
	    !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
	    !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
	    bioset_init(&ca->replica_set, 4,
			offsetof(struct bch_write_bio, bio)) ||
	    !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
		goto err;

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	total_reserve = ca->free_inc.size;
	for (i = 0; i < RESERVE_NR; i++)
		total_reserve += ca->free[i].size;

	ca->copygc_write_point.group = &ca->self;
	ca->tiering_write_point.group = &ca->self;

	ca->fs = c;
	rcu_assign_pointer(c->devs[ca->dev_idx], ca);

	if (bch2_dev_sysfs_online(ca))
		pr_warn("error creating sysfs objects");

	return 0;
err:
	if (ca)
		bch2_dev_free(ca);
	return -ENOMEM;
}
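
/*
 * Attaches an opened superblock (and its block device) to the matching
 * bch_dev slot, adopting the incoming superblock if it has a newer seq, then
 * revives the device's io_ref.
 */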

static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
{
	struct bch_dev *ca;
	int ret;

	lockdep_assert_held(&c->sb_lock);

	if (le64_to_cpu(sb->sb->seq) >
	    le64_to_cpu(c->disk_sb->seq))
		bch2_sb_to_fs(c, sb->sb);

	BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
	       !c->devs[sb->sb->dev_idx]);

	ca = c->devs[sb->sb->dev_idx];
	if (ca->disk_sb.bdev) {
		bch_err(c, "already have device online in slot %u",
			sb->sb->dev_idx);
		return -EINVAL;
	}

	ret = bch2_dev_journal_init(ca, sb->sb);
	if (ret)
		return ret;

	/*
	 * Increase journal write timeout if flushes to this device are
	 * expensive:
	 */
	if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
	    journal_flushes_device(ca))
		c->journal.write_delay_ms =
			max(c->journal.write_delay_ms, 1000U);

	/* Commit: */
	ca->disk_sb = *sb;
	if (sb->mode & FMODE_EXCL)
		ca->disk_sb.bdev->bd_holder = ca;
	memset(sb, 0, sizeof(*sb));

	if (c->sb.nr_devices == 1)
		bdevname(ca->disk_sb.bdev, c->name);
	bdevname(ca->disk_sb.bdev, ca->name);

	if (bch2_dev_sysfs_online(ca))
		pr_warn("error creating sysfs objects");

	lg_local_lock(&c->usage_lock);
	if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
		bch2_mark_dev_metadata(ca->fs, ca);
	lg_local_unlock(&c->usage_lock);

	percpu_ref_reinit(&ca->io_ref);
	return 0;
}

/* Device management: */
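
/*
 * Decides whether a filesystem can start with some member devices missing:
 * missing metadata or data replicas are tolerated only when the
 * corresponding BCH_FORCE_IF_* flags are passed.
 */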

bool bch2_fs_may_start(struct bch_fs *c, int flags)
{
	struct bch_sb_field_members *mi;
	unsigned meta_missing = 0;
	unsigned data_missing = 0;
	bool degraded = false;
	unsigned i;

	mutex_lock(&c->sb_lock);
	mi = bch2_sb_get_members(c->disk_sb);

	for (i = 0; i < c->disk_sb->nr_devices; i++)
		if (!c->devs[i] &&
		    !bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
			degraded = true;
			if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
				meta_missing++;
			if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
				data_missing++;
		}
	mutex_unlock(&c->sb_lock);

	if (degraded &&
	    !(flags & BCH_FORCE_IF_DEGRADED))
		return false;

	if (meta_missing &&
	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
		return false;

	if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
	    !(flags & BCH_FORCE_IF_METADATA_LOST))
		return false;

	if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
		return false;

	if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
	    !(flags & BCH_FORCE_IF_DATA_LOST))
		return false;

	return true;
}

/*
 * Note: this function is also used by the error paths - when a particular
 * device sees an error, we call it to determine whether we can just set the
 * device RO, or - if this function returns false - we'll set the whole
 * filesystem RO:
 *
 * XXX: maybe we should be more explicit about whether we're changing state
 * because we got an error or what have you?
 */
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
			    enum bch_member_state new_state, int flags)
{
	lockdep_assert_held(&c->state_lock);

	if (new_state == BCH_MEMBER_STATE_RW)
		return true;

	if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
		return true;

	/*
	 * If the device is already offline - whatever is going on with it
	 * can't possibly make the FS need to go RO:
	 */
	if (!bch2_dev_is_online(ca))
		return true;

	if (ca->mi.has_data &&
	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
		return false;

	if (ca->mi.has_data &&
	    c->sb.data_replicas_have <= 1 &&
	    !(flags & BCH_FORCE_IF_DATA_LOST))
		return false;

	if (ca->mi.has_metadata &&
	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
		return false;

	if (ca->mi.has_metadata &&
	    c->sb.meta_replicas_have <= 1 &&
	    !(flags & BCH_FORCE_IF_METADATA_LOST))
		return false;

	return true;
}

static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
	bch2_moving_gc_stop(ca);

	/*
	 * This stops new data writes (e.g. to existing open data
	 * buckets) and then waits for all existing writes to
	 * complete.
	 */
	bch2_dev_allocator_stop(ca);

	bch2_dev_group_remove(&c->journal.devs, ca);
}

static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);

	if (bch2_dev_allocator_start(ca))
		return "error starting allocator thread";

	if (bch2_moving_gc_start(ca))
		return "error starting moving GC thread";

	if (bch2_tiering_start(c))
		return "error starting tiering thread";

	return NULL;
}

int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
			 enum bch_member_state new_state, int flags)
{
	struct bch_sb_field_members *mi;

	if (ca->mi.state == new_state)
		return 0;

	if (!bch2_dev_state_allowed(c, ca, new_state, flags))
		return -EINVAL;

	if (new_state == BCH_MEMBER_STATE_RW) {
		if (__bch2_dev_read_write(c, ca))
			return -ENOMEM;
	} else
		__bch2_dev_read_only(c, ca);

	bch_notice(ca, "%s", bch2_dev_state[new_state]);

	mutex_lock(&c->sb_lock);
	mi = bch2_sb_get_members(c->disk_sb);
	SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
		       enum bch_member_state new_state, int flags)
{
	int ret;

	mutex_lock(&c->state_lock);
	ret = __bch2_dev_set_state(c, ca, new_state, flags);
	mutex_unlock(&c->state_lock);

	return ret;
}

/* Device add/removal: */
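
/*
 * Removal sequence: the device must already be RO; its data is flagged bad
 * and verified gone, the journal is flushed, the device is taken offline and
 * stopped, and finally its member slot's uuid is zeroed in the superblock.
 */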

int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	struct bch_sb_field_members *mi;
	unsigned dev_idx = ca->dev_idx;
	int ret = -EINVAL;

	mutex_lock(&c->state_lock);

	percpu_ref_put(&ca->ref); /* XXX */

	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
		bch_err(ca, "Cannot remove RW device");
		goto err;
	}

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
		bch_err(ca, "Cannot remove without losing data");
		goto err;
	}

	/*
	 * XXX: verify that dev_idx is really not in use anymore, anywhere
	 *
	 * flag_data_bad() does not check btree pointers
	 */
	ret = bch2_flag_data_bad(ca);
	if (ret) {
		bch_err(ca, "Remove failed");
		goto err;
	}

	if (ca->mi.has_data || ca->mi.has_metadata) {
		bch_err(ca, "Remove failed, still has data");
		goto err;
	}

	/*
	 * Ok, really doing the remove:
	 * Drop device's prio pointer before removing it from superblock:
	 */
	spin_lock(&c->journal.lock);
	c->journal.prio_buckets[dev_idx] = 0;
	spin_unlock(&c->journal.lock);

	bch2_journal_meta(&c->journal);

	__bch2_dev_offline(ca);
	bch2_dev_stop(ca);

	/*
	 * Free this device's slot in the bch_member array - all pointers to
	 * this device must be gone:
	 */
	mutex_lock(&c->sb_lock);
	mi = bch2_sb_get_members(c->disk_sb);
	memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);
	mutex_unlock(&c->state_lock);
	return 0;
err:
	mutex_unlock(&c->state_lock);
	return ret;
}

int bch2_dev_add(struct bch_fs *c, const char *path)
{
	struct bcache_superblock sb;
	const char *err;
	struct bch_dev *ca = NULL;
	struct bch_sb_field_members *mi, *dev_mi;
	struct bch_member saved_mi;
	unsigned dev_idx, nr_devices, u64s;
	int ret = -EINVAL;

	err = bch2_read_super(&sb, bch2_opts_empty(), path);
	if (err)
		return -EINVAL;

	err = bch2_validate_cache_super(&sb);
	if (err)
		return -EINVAL;

	err = bch2_dev_may_add(sb.sb, c);
	if (err)
		return -EINVAL;

	mutex_lock(&c->state_lock);
	mutex_lock(&c->sb_lock);

	/*
	 * Preserve the old cache member information (esp. tier)
	 * before we start bashing the disk stuff.
	 */
	dev_mi = bch2_sb_get_members(sb.sb);
	saved_mi = dev_mi->members[sb.sb->dev_idx];
	saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());

	if (dynamic_fault("bcachefs:add:no_slot"))
		goto no_slot;

	mi = bch2_sb_get_members(c->disk_sb);
	for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
		if (dev_idx >= c->sb.nr_devices ||
		    bch2_is_zero(mi->members[dev_idx].uuid.b,
				 sizeof(uuid_le)))
			goto have_slot;
no_slot:
	err = "no slots available in superblock";
	ret = -ENOSPC;
	goto err_unlock;

have_slot:
	nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
	u64s = (sizeof(struct bch_sb_field_members) +
		sizeof(struct bch_member) * nr_devices) / sizeof(u64);
	err = "no space in superblock for member info";

	mi = bch2_fs_sb_resize_members(c, u64s);
	if (!mi)
		goto err_unlock;

	dev_mi = bch2_sb_resize_members(&sb, u64s);
	if (!dev_mi)
		goto err_unlock;

	memcpy(dev_mi, mi, u64s * sizeof(u64));
	dev_mi->members[dev_idx] = saved_mi;

	sb.sb->uuid		= c->disk_sb->uuid;
	sb.sb->dev_idx		= dev_idx;
	sb.sb->nr_devices	= nr_devices;

	/* commit new member info */
	memcpy(mi, dev_mi, u64s * sizeof(u64));
	c->disk_sb->nr_devices	= nr_devices;
	c->sb.nr_devices	= nr_devices;

	if (bch2_dev_alloc(c, dev_idx)) {
		err = "cannot allocate memory";
		ret = -ENOMEM;
		goto err_unlock;
	}

	if (__bch2_dev_online(c, &sb)) {
		err = "bch2_dev_online() error";
		ret = -ENOMEM;
		goto err_unlock;
	}

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	ca = c->devs[dev_idx];
	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
		err = "journal alloc failed";
		if (bch2_dev_journal_alloc(ca))
			goto err;

		err = __bch2_dev_read_write(c, ca);
		if (err)
			goto err;
	}

	mutex_unlock(&c->state_lock);
	return 0;
err_unlock:
	mutex_unlock(&c->sb_lock);
err:
	mutex_unlock(&c->state_lock);
	bch2_free_super(&sb);

	bch_err(c, "Unable to add device: %s", err);
	return ret ?: -EINVAL;
}

int bch2_dev_online(struct bch_fs *c, const char *path)
{
	struct bcache_superblock sb = { 0 };
	struct bch_dev *ca;
	unsigned dev_idx;
	const char *err;
	int ret;

	mutex_lock(&c->state_lock);

	err = bch2_read_super(&sb, bch2_opts_empty(), path);
	if (err)
		goto err;

	dev_idx = sb.sb->dev_idx;

	err = bch2_dev_in_fs(c->disk_sb, sb.sb);
	if (err)
		goto err;

	mutex_lock(&c->sb_lock);
	if (__bch2_dev_online(c, &sb)) {
		err = "__bch2_dev_online() error";
		mutex_unlock(&c->sb_lock);
		goto err;
	}
	mutex_unlock(&c->sb_lock);

	ca = c->devs[dev_idx];
	ret = bch2_prio_read(ca);
	if (ret) {
		err = "error reading priorities";
		goto err;
	}

	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
		err = __bch2_dev_read_write(c, ca);
		if (err)
			goto err;
	}

	mutex_unlock(&c->state_lock);
	return 0;
err:
	mutex_unlock(&c->state_lock);
	bch2_free_super(&sb);
	bch_err(c, "error bringing %s online: %s", path, err);
	return -EINVAL;
}

int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	mutex_lock(&c->state_lock);

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
		bch_err(ca, "Cannot offline required disk");
		mutex_unlock(&c->state_lock);
		return -EINVAL;
	}

	__bch2_dev_read_only(c, ca);
	__bch2_dev_offline(ca);

	mutex_unlock(&c->state_lock);
	return 0;
}
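
/*
 * Migrates all data, then all metadata, off a device that has already been
 * made RO; fails if anything is still referenced on the device afterwards.
 */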

int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
	int ret;

	mutex_lock(&c->state_lock);

	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
		bch_err(ca, "Cannot migrate data off RW device");
		mutex_unlock(&c->state_lock);
		return -EINVAL;
	}

	mutex_unlock(&c->state_lock);

	ret = bch2_move_data_off_device(ca);
	if (ret) {
		bch_err(ca, "Error migrating data: %i", ret);
		return ret;
	}

	ret = bch2_move_metadata_off_device(ca);
	if (ret) {
		bch_err(ca, "Error migrating metadata: %i", ret);
		return ret;
	}

	if (ca->mi.has_data || ca->mi.has_metadata) {
		bch_err(ca, "Migrate error: data still present");
		return -EINVAL;
	}

	return 0;
}

/* Filesystem open: */
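
/*
 * Assembles a filesystem from an explicit list of devices: every superblock
 * is read and validated, the one with the highest seq is treated as
 * authoritative, and all devices are brought online before the filesystem is
 * started.
 */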

const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
			 struct bch_opts opts, struct bch_fs **ret)
{
	const char *err;
	struct bch_fs *c = NULL;
	struct bcache_superblock *sb;
	unsigned i, best_sb = 0;

	if (!nr_devices)
		return "need at least one device";

	if (!try_module_get(THIS_MODULE))
		return "module unloading";

	err = "cannot allocate memory";
	sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
	if (!sb)
		goto err;

	for (i = 0; i < nr_devices; i++) {
		err = bch2_read_super(&sb[i], opts, devices[i]);
		if (err)
			goto err;

		err = "attempting to register backing device";
		if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
			goto err;

		err = bch2_validate_cache_super(&sb[i]);
		if (err)
			goto err;
	}

	for (i = 1; i < nr_devices; i++)
		if (le64_to_cpu(sb[i].sb->seq) >
		    le64_to_cpu(sb[best_sb].sb->seq))
			best_sb = i;

	for (i = 0; i < nr_devices; i++) {
		err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
		if (err)
			goto err;
	}

	err = "cannot allocate memory";
	c = bch2_fs_alloc(sb[best_sb].sb, opts);
	if (!c)
		goto err;

	err = "bch2_dev_online() error";
	mutex_lock(&c->sb_lock);
	for (i = 0; i < nr_devices; i++)
		if (__bch2_dev_online(c, &sb[i])) {
			mutex_unlock(&c->sb_lock);
			goto err;
		}
	mutex_unlock(&c->sb_lock);

	err = "insufficient devices";
	if (!bch2_fs_may_start(c, 0))
		goto err;

	if (!c->opts.nostart) {
		err = __bch2_fs_start(c);
		if (err)
			goto err;
	}

	err = bch2_fs_online(c);
	if (err)
		goto err;

	if (ret)
		*ret = c;
	else
		closure_put(&c->cl);

	err = NULL;
out:
	kfree(sb);
	module_put(THIS_MODULE);
	return err;
err:
	if (c)
		bch2_fs_stop(c);

	for (i = 0; i < nr_devices; i++)
		bch2_free_super(&sb[i]);
	goto out;
}
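
/*
 * Registers one device at a time: if a filesystem with a matching UUID is
 * already open the device joins it, otherwise a new bch_fs is allocated; the
 * filesystem is started once enough members are present (bch2_fs_may_start()).
 */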

static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb,
					      struct bch_opts opts)
{
	const char *err;
	struct bch_fs *c;
	bool allocated_fs = false;

	err = bch2_validate_cache_super(sb);
	if (err)
		return err;

	mutex_lock(&bch_fs_list_lock);
	c = __bch2_uuid_to_fs(sb->sb->uuid);
	if (c) {
		closure_get(&c->cl);

		err = bch2_dev_in_fs(c->disk_sb, sb->sb);
		if (err)
			goto err;
	} else {
		c = bch2_fs_alloc(sb->sb, opts);
		err = "cannot allocate memory";
		if (!c)
			goto err;

		allocated_fs = true;
	}

	err = "bch2_dev_online() error";

	mutex_lock(&c->sb_lock);
	if (__bch2_dev_online(c, sb)) {
		mutex_unlock(&c->sb_lock);
		goto err;
	}
	mutex_unlock(&c->sb_lock);

	if (!c->opts.nostart && bch2_fs_may_start(c, 0)) {
		err = __bch2_fs_start(c);
		if (err)
			goto err;
	}

	err = __bch2_fs_online(c);
	if (err)
		goto err;

	closure_put(&c->cl);
	mutex_unlock(&bch_fs_list_lock);

	return NULL;
err:
	mutex_unlock(&bch_fs_list_lock);

	if (allocated_fs)
		bch2_fs_stop(c);
	else if (c)
		closure_put(&c->cl);

	return err;
}

const char *bch2_fs_open_incremental(const char *path)
{
	struct bcache_superblock sb;
	struct bch_opts opts = bch2_opts_empty();
	const char *err;

	err = bch2_read_super(&sb, opts, path);
	if (err)
		return err;

	if (!__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
		err = __bch2_fs_open_incremental(&sb, opts);
	else
		err = "not a bcachefs superblock";

	bch2_free_super(&sb);

	return err;
}

/* Global interfaces/init */

static void bcachefs_exit(void)
{
	bch2_debug_exit();
	bch2_vfs_exit();
	bch2_chardev_exit();
	if (bcachefs_kset)
		kset_unregister(bcachefs_kset);
}

static int __init bcachefs_init(void)
{
	bch2_bkey_pack_test();

	if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
	    bch2_chardev_init() ||
	    bch2_vfs_init() ||
	    bch2_debug_init())
		goto err;

	return 0;
err:
	bcachefs_exit();
	return -ENOMEM;
}

#define BCH_DEBUG_PARAM(name, description)			\
	bool bch2_##name;					\
	module_param_named(name, bch2_##name, bool, 0644);	\
	MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

module_exit(bcachefs_exit);
module_init(bcachefs_init);