4 #include "btree_iter.h"
5 #include "btree_update.h"
11 #include "writeback.h"
13 #include <linux/kthread.h>
14 #include <linux/module.h>
15 #include <linux/random.h>
/*
 * File-scope state shared by the block device code in this file:
 *  - bch_blockdev_major: major number obtained from register_blkdev() in
 *    bch_blockdev_init() and released again in bch_blockdev_exit().
 *  - bch_blockdev_minor: IDA that hands out one minor number per
 *    bcache_device (see bcache_device_init() and bcache_device_free()).
 *  - uncached_devices: list of cached_dev structures that are registered
 *    but not currently on the cached_devs list of any cache set.
 *  - bch_search_cache: slab cache created with KMEM_CACHE(search, 0); it
 *    backs the c->search mempool set up in bch_fs_blockdev_init().
 */
17 static int bch_blockdev_major;
18 static DEFINE_IDA(bch_blockdev_minor);
19 static LIST_HEAD(uncached_devices);
20 static struct kmem_cache *bch_search_cache;
/*
 * Completion callback installed as bi_end_io on the backing device
 * superblock bio by bch_write_bdev_super().
 *
 * Recovers the cached_dev from bio->bi_private and drops one reference on
 * the dc->sb_write closure. No visible line inspects the completion status
 * of the bio (see the XXX note below), so write errors are not handled here.
 */
22 static void write_bdev_super_endio(struct bio *bio)
24 struct cached_dev *dc = bio->bi_private;
25 /* XXX: error checking */
27 closure_put(&dc->sb_write);
/*
 * Destructor for the dc->sb_write closure; bch_write_bdev_super() passes it
 * to closure_return_with_destructor().
 *
 * Maps the closure back to its cached_dev and releases the sb_write_mutex
 * semaphore with up(), pairing with the down() in bch_write_bdev_super().
 */
30 static void bch_write_bdev_super_unlock(struct closure *cl)
32 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
34 up(&dc->sb_write_mutex);
/*
 * Prepare a write of the backing device superblock held in dc->disk_sb.
 *
 * @dc:     cached device whose superblock is written
 * @parent: closure used as the parent of dc->sb_write in closure_init()
 *
 * Steps visible here:
 *  - take dc->sb_write_mutex with down(); it is released by
 *    bch_write_bdev_super_unlock(), the destructor handed to
 *    closure_return_with_destructor() on the last line
 *  - recompute sb->csum with csum_vstruct() using BCH_CSUM_CRC64 and a
 *    zero nonce
 *  - aim the preallocated bio (dc->disk_sb.bio) at sector sb->offset of
 *    dc->disk_sb.bdev, with a size of vstruct_bytes(sb) rounded up to the
 *    logical block size of that device
 *  - mark the bio as a REQ_OP_WRITE with WRITE_FUA and REQ_META
 *
 * NOTE(review): the lines that set bi_private and submit the bio are not
 * visible in this view; presumably they sit between bio_set_op_attrs() and
 * the closure return - confirm against the full file.
 */
37 void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
39 struct backingdev_sb *sb = dc->disk_sb.sb;
40 struct closure *cl = &dc->sb_write;
41 struct bio *bio = dc->disk_sb.bio;
43 down(&dc->sb_write_mutex);
44 closure_init(cl, parent);
46 sb->csum = csum_vstruct(NULL, BCH_CSUM_CRC64,
47 (struct nonce) { 0 }, sb).lo;
50 bio->bi_bdev = dc->disk_sb.bdev;
51 bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
52 bio->bi_iter.bi_size =
53 roundup(vstruct_bytes(sb),
54 bdev_logical_block_size(dc->disk_sb.bdev));
55 bio->bi_end_io = write_bdev_super_endio;
57 bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FUA|REQ_META);
62 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
/*
 * Check whether @bdev is already in use as a backing device.
 *
 * Compares @bdev against dc->disk_sb.bdev for every cached_dev on the
 * cached_devs list of every cache set on bch_fs_list, and then for every
 * cached_dev on uncached_devices.
 *
 * NOTE(review): the return statements are not visible in this view;
 * presumably a match returns true and falling through returns false.
 * No locking is visible around the list walks - confirm what the callers
 * are expected to hold.
 */
65 bool bch_is_open_backing_dev(struct block_device *bdev)
67 struct cache_set *c, *tc;
68 struct cached_dev *dc, *t;
70 list_for_each_entry_safe(c, tc, &bch_fs_list, list)
71 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
72 if (dc->disk_sb.bdev == bdev)
74 list_for_each_entry_safe(dc, t, &uncached_devices, list)
75 if (dc->disk_sb.bdev == bdev)
/*
 * Open handler for a bcache block device.
 *
 * Looks up the bcache_device stored in the private_data of the gendisk and
 * tests its BCACHE_DEV_CLOSING flag.
 *
 * NOTE(review): the statements that follow the flag test (the result for a
 * closing device and the success path) are not visible in this view.
 */
80 static int open_dev(struct block_device *b, fmode_t mode)
82 struct bcache_device *d = b->bd_disk->private_data;
84 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
/*
 * Release handler for a bcache block device; installed as .release in
 * bcache_ops.
 *
 * Fetches the bcache_device from the private_data of the gendisk.
 * NOTE(review): what is done with it is not visible in this view.
 */
91 static void release_dev(struct gendisk *b, fmode_t mode)
93 struct bcache_device *d = b->private_data;
/*
 * ioctl entry point for a bcache block device.
 *
 * Looks up the bcache_device behind the gendisk and forwards @mode, @cmd
 * and @arg unchanged to its d->ioctl callback, returning whatever that
 * callback returns.
 */
98 static int ioctl_dev(struct block_device *b, fmode_t mode,
99 unsigned int cmd, unsigned long arg)
101 struct bcache_device *d = b->bd_disk->private_data;
103 return d->ioctl(d, mode, cmd, arg);
/*
 * Block device operations table assigned to d->disk->fops in
 * bcache_device_init().
 *
 * Visible members: .release points at release_dev() and .owner is this
 * module. NOTE(review): any further members (for example the open and
 * ioctl hooks) are not visible in this view.
 */
106 static const struct block_device_operations bcache_ops = {
108 .release = release_dev,
110 .owner = THIS_MODULE,
/*
 * Begin shutting down a bcache block device.
 *
 * Atomically sets BCACHE_DEV_CLOSING and queues the d->cl closure only if
 * the flag was previously clear, so repeated calls queue the closure at
 * most once. The function attached to d->cl is chosen with set_closure_fn()
 * elsewhere in this file (cached_dev_flush or blockdev_volume_flush).
 */
113 void bch_blockdev_stop(struct bcache_device *d)
115 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
116 closure_queue(&d->cl);
/*
 * Remove the sysfs symlinks between a device and its cache set.
 *
 * Caller must hold bch_register_lock (checked by lockdep).
 *
 * Acts only when d->c is set and BCACHE_DEV_UNLINK_DONE was not already
 * set; the test_and_set_bit() makes the removal happen once until the flag
 * is cleared again (bcache_device_link() clears it). Removes the link named
 * d->name from the cache set kobject and the link named cache from the
 * device kobject.
 *
 * NOTE(review): any statements after the second sysfs_remove_link() are not
 * visible in this view.
 */
119 static void bcache_device_unlink(struct bcache_device *d)
121 lockdep_assert_held(&bch_register_lock);
123 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
124 sysfs_remove_link(&d->c->kobj, d->name);
125 sysfs_remove_link(&d->kobj, "cache");
/*
 * Create the sysfs symlinks between a device and its cache set.
 *
 * @d:    device to link
 * @c:    cache set to link against
 * @name: prefix for the link created under the cache set (the parameter
 *        declaration itself is on a line not visible in this view)
 *
 * Formats d->name as the prefix followed by the inode number of the device,
 * then creates a link named cache from the device to the cache set and a
 * link named d->name from the cache set to the device. The two calls are
 * joined with ||, so the second link is not attempted when the first one
 * fails; either failure triggers the WARN. Finally clears
 * BCACHE_DEV_UNLINK_DONE so that a later bcache_device_unlink() will remove
 * the links again.
 */
129 static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
132 snprintf(d->name, BCACHEDEVNAME_SIZE,
133 "%s%llu", name, bcache_dev_inum(d));
135 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
136 sysfs_create_link(&c->kobj, &d->kobj, d->name),
137 "Couldn't create device <-> cache set symlinks");
139 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
/*
 * Disconnect a device from the cache set it is attached to (d->c).
 *
 * Caller must hold bch_register_lock (checked by lockdep).
 *
 * When BCACHE_DEV_DETACHING is set, the inode of the device is removed from
 * the cache set with bch_inode_rm() while holding d->inode_lock. After
 * that the sysfs links are removed, the device is deleted from the
 * c->devices radix tree, and the reference on the c->caching closure taken
 * in bcache_device_attach() is dropped.
 *
 * NOTE(review): every visible statement dereferences d->c without a visible
 * NULL check, so callers presumably guarantee the device is attached -
 * confirm, in particular for the call from bcache_device_free(). Whether
 * d->c is reset afterwards is not visible in this view.
 */
142 static void bcache_device_detach(struct bcache_device *d)
144 lockdep_assert_held(&bch_register_lock);
146 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
147 mutex_lock(&d->inode_lock);
148 bch_inode_rm(d->c, bcache_dev_inum(d));
149 mutex_unlock(&d->inode_lock);
152 bcache_device_unlink(d);
154 radix_tree_delete(&d->c->devices, bcache_dev_inum(d));
156 closure_put(&d->c->caching);
/*
 * Register a device with cache set @c.
 *
 * Caller must hold bch_register_lock (checked by lockdep).
 *
 * Inserts @d into the c->devices radix tree keyed by the inode number of
 * the device, logging an error when the insert fails, and takes a reference
 * on the c->caching closure; bcache_device_detach() drops that reference.
 *
 * NOTE(review): the declaration of ret, the failure test, the assignment of
 * d->c and the return statements are not visible in this view.
 */
160 static int bcache_device_attach(struct bcache_device *d, struct cache_set *c)
164 lockdep_assert_held(&bch_register_lock);
166 ret = radix_tree_insert(&c->devices, bcache_dev_inum(d), d);
168 pr_err("radix_tree_insert() error for inum %llu",
174 closure_get(&c->caching);
/*
 * Tear down the generic part of a bcache block device.
 *
 * Caller must hold bch_register_lock (checked by lockdep).
 *
 * Visible steps, in order: log that the disk stopped, detach from the cache
 * set, delete the gendisk if it is up (GENHD_FL_UP), clean up its request
 * queue, return the minor number to bch_blockdev_minor, release the
 * bio_split bioset and destroy the closure debug state.
 *
 * NOTE(review): the pr_info() reads d->disk->disk_name unconditionally,
 * while the later statements test d->disk for NULL first. If d->disk can be
 * NULL here (for example after alloc_disk() failed in bcache_device_init())
 * the log line would dereference a NULL pointer - confirm and guard if so.
 * The guards around bcache_device_detach() and ida_simple_remove(), if any,
 * are on lines not visible in this view.
 */
179 static void bcache_device_free(struct bcache_device *d)
181 lockdep_assert_held(&bch_register_lock);
183 pr_info("%s stopped", d->disk->disk_name);
186 bcache_device_detach(d);
187 if (d->disk && d->disk->flags & GENHD_FL_UP)
188 del_gendisk(d->disk);
189 if (d->disk && d->disk->queue)
190 blk_cleanup_queue(d->disk->queue);
192 ida_simple_remove(&bch_blockdev_minor, d->disk->first_minor);
196 bioset_exit(&d->bio_split);
198 closure_debug_destroy(&d->cl);
/*
 * Set up the gendisk and request queue for a bcache block device.
 *
 * @d:          device to initialise
 * @block_size: value used for io_min and for the logical and physical
 *              block size limits of the queue
 * The capacity passed to set_capacity() comes from a parameter named
 * sectors whose declaration is on a line not visible in this view.
 *
 * Visible steps:
 *  - initialise d->inode_lock
 *  - allocate a minor from bch_blockdev_minor, passing 0 and MINORMASK + 1
 *    as the range bounds
 *  - allocate the gendisk and initialise the bio_split bioset, sized with
 *    the offset of the bio member inside struct bch_read_bio; on failure
 *    the minor is handed back to the IDA
 *  - name the disk bcache followed by the minor number and fill in major,
 *    first_minor, fops (bcache_ops) and private_data
 *  - allocate the request queue, call blk_queue_make_request() with a NULL
 *    function, and set the queue limits
 *  - mark the queue non-rotational and discard capable, clear
 *    QUEUE_FLAG_ADD_RANDOM, and call blk_queue_write_cache() with both
 *    flags true
 *
 * NOTE(review): the flag updates go through d->disk->queue, but the line
 * that stores q there is not visible in this view; the error returns and
 * the final return are not visible either. The NULL make_request function
 * is presumably replaced later by the request init helpers - confirm.
 */
201 static int bcache_device_init(struct bcache_device *d, unsigned block_size,
204 struct request_queue *q;
207 mutex_init(&d->inode_lock);
209 minor = ida_simple_get(&bch_blockdev_minor, 0, MINORMASK + 1, GFP_KERNEL);
211 pr_err("cannot allocate minor");
215 if (!(d->disk = alloc_disk(1)) ||
216 bioset_init(&d->bio_split, 4, offsetof(struct bch_read_bio, bio))) {
217 pr_err("cannot allocate disk");
218 ida_simple_remove(&bch_blockdev_minor, minor);
222 set_capacity(d->disk, sectors);
223 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
225 d->disk->major = bch_blockdev_major;
226 d->disk->first_minor = minor;
227 d->disk->fops = &bcache_ops;
228 d->disk->private_data = d;
230 q = blk_alloc_queue(GFP_KERNEL);
232 pr_err("cannot allocate queue");
236 blk_queue_make_request(q, NULL);
239 q->backing_dev_info.congested_data = d;
240 q->limits.max_hw_sectors = UINT_MAX;
241 q->limits.max_sectors = UINT_MAX;
242 q->limits.max_segment_size = UINT_MAX;
243 q->limits.max_segments = BIO_MAX_PAGES;
244 blk_queue_max_discard_sectors(q, UINT_MAX);
245 q->limits.discard_granularity = 512;
246 q->limits.io_min = block_size;
247 q->limits.logical_block_size = block_size;
248 q->limits.physical_block_size = block_size;
249 set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
250 clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
251 set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
253 blk_queue_write_cache(q, true, true);
/*
 * Recompute c->cached_dev_sectors as the sum of bdev_sectors() over the
 * backing block device of every cached_dev on c->cached_devs.
 *
 * NOTE(review): the declaration and initial value of the sectors
 * accumulator are on a line not visible in this view.
 */
260 static void calc_cached_dev_sectors(struct cache_set *c)
263 struct cached_dev *dc;
265 list_for_each_entry(dc, &c->cached_devs, list)
266 sectors += bdev_sectors(dc->disk_sb.bdev);
268 c->cached_dev_sectors = sectors;
/*
 * Bring a cached device online as a usable block device.
 *
 * Visible steps:
 *  - build uevent environment strings: CACHED_UUID from the disk_uuid in
 *    the superblock and CACHED_LABEL from a copy of the superblock label
 *    that is explicitly NUL terminated (buf has one byte more than
 *    BCH_SB_LABEL_SIZE for that purpose)
 *  - atomic_xchg() sets dc->running to 1; a non-zero previous value means
 *    the device was already running (the body of that branch is not
 *    visible in this view)
 *  - when the superblock state is not BDEV_STATE_NONE (together with a
 *    condition on a line not visible here), mark the superblock
 *    BDEV_STATE_STALE and write it out using a closure on the stack
 *  - register the bcache disk as holder of the backing device, emit a
 *    KOBJ_CHANGE uevent carrying the environment, and create the sysfs
 *    links named dev and bcache; a link failure is only logged at debug
 *    level
 *
 * NOTE(review): no visible line checks the kasprintf() results for NULL or
 * frees the strings afterwards, and the add_disk() call is not visible -
 * confirm against the full file.
 */
271 void bch_cached_dev_run(struct cached_dev *dc)
273 struct bcache_device *d = &dc->disk;
274 char buf[BCH_SB_LABEL_SIZE + 1];
277 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU",
278 dc->disk_sb.sb->disk_uuid.b),
283 memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
284 buf[BCH_SB_LABEL_SIZE] = '\0';
285 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
287 if (atomic_xchg(&dc->running, 1)) {
294 BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_NONE) {
297 closure_init_stack(&cl);
299 SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_STALE);
300 bch_write_bdev_super(dc, &cl);
305 bd_link_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
306 /* won't show up in the uevent file, use udevadm monitor -e instead
307 * only class / kset properties are persistent */
308 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
312 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
313 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
314 pr_debug("error creating sysfs link");
/*
 * Work function (installed with INIT_WORK on dc->detach) that completes a
 * detach started by bch_cached_dev_detach().
 *
 * Asserts with BUG_ON that BCACHE_DEV_DETACHING is set and that dc->count
 * has reached zero. Then, under bch_register_lock:
 *  - clears the 16 byte set_uuid in the superblock and sets the state to
 *    BDEV_STATE_NONE, then writes the superblock
 *  - detaches the device from the cache set and moves it onto
 *    uncached_devices
 *  - clears BCACHE_DEV_DETACHING and BCACHE_DEV_UNLINK_DONE
 * After unlocking it logs the change and drops the dc->disk.cl reference
 * taken in bch_cached_dev_detach().
 *
 * NOTE(review): the declaration of cl and any wait for the superblock write
 * to finish are on lines not visible in this view.
 */
317 static void cached_dev_detach_finish(struct work_struct *w)
319 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
320 char buf[BDEVNAME_SIZE];
323 closure_init_stack(&cl);
325 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
326 BUG_ON(atomic_read(&dc->count));
328 mutex_lock(&bch_register_lock);
330 memset(&dc->disk_sb.sb->set_uuid, 0, 16);
331 SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_NONE);
333 bch_write_bdev_super(dc, &cl);
336 bcache_device_detach(&dc->disk);
337 list_move(&dc->list, &uncached_devices);
339 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
340 clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
342 mutex_unlock(&bch_register_lock);
344 pr_info("Caching disabled for %s", bdevname(dc->disk_sb.bdev, buf));
346 /* Drop ref we took in cached_dev_detach() */
347 closure_put(&dc->disk.cl);
/*
 * Start detaching a cached device from its cache set.
 *
 * Caller must hold bch_register_lock (checked by lockdep).
 *
 * Tests BCACHE_DEV_CLOSING first, then uses test_and_set_bit() on
 * BCACHE_DEV_DETACHING so that only the first caller proceeds. That caller
 * takes a reference on dc->disk.cl (dropped in cached_dev_detach_finish()),
 * raises the writeback rate to UINT_MAX and queues writeback.
 *
 * NOTE(review): the statements executed when either flag test is true are
 * not visible in this view; presumably both are early returns. Raising the
 * rate presumably makes writeback drain dirty data as fast as possible
 * before the detach completes - confirm.
 */
350 void bch_cached_dev_detach(struct cached_dev *dc)
352 lockdep_assert_held(&bch_register_lock);
354 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
357 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
361 * Block the device from being closed and freed until we're finished
364 closure_get(&dc->disk.cl);
366 dc->writeback_pd.rate.rate = UINT_MAX;
367 bch_writeback_queue(dc);
/*
 * Attach backing device @dc to cache set @c.
 *
 * Caller must hold c->state_lock (checked by lockdep).
 *
 * Visible flow:
 *  - compare the set_uuid in the superblock (the rest of that comparison
 *    is not visible here) and refuse with an error log when the device is
 *    already attached, when the cache set is not running, or when the
 *    block size of the device is smaller than the block size of the set
 *  - look up an existing inode for the device by disk_uuid; a device whose
 *    superblock state is BDEV_STATE_DIRTY but has no inode is rejected
 *  - when the state is BDEV_STATE_STALE or BDEV_STATE_NONE (the first half
 *    of that condition is not visible here) the existing inode is removed
 *    with bch_inode_rm()
 *  - for a new attachment: initialise a blockdev inode carrying the uuid,
 *    label and creation and modification times, create it below
 *    BLOCKDEV_INODE_MAX, record the uuid of the set and BDEV_STATE_CLEAN in
 *    the superblock and write it out
 *  - otherwise: refresh i_mtime and update the inode in the inodes btree
 *  - count dirty sectors when the state is BDEV_STATE_DIRTY, attach the
 *    device, move it onto c->cached_devs and recompute the cached sector
 *    total
 *  - set dc->count to 1, start writeback, and for a dirty device set
 *    has_dirty and take one extra count reference
 *  - run the device and create the sysfs links with the bdev prefix
 *
 * NOTE(review): the declarations of ret, found and cl, all error returns
 * and the final return are on lines not visible in this view, and no
 * visible line checks the result of bch_btree_update().
 */
371 int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
373 __le64 rtime = cpu_to_le64(ktime_get_seconds());
374 char buf[BDEVNAME_SIZE];
378 lockdep_assert_held(&c->state_lock);
380 bdevname(dc->disk_sb.bdev, buf);
382 if (memcmp(&dc->disk_sb.sb->set_uuid,
388 pr_err("Can't attach %s: already attached", buf);
392 if (!bch_fs_running(c)) {
393 pr_err("Can't attach %s: not running", buf);
397 if (le16_to_cpu(dc->disk_sb.sb->block_size) < c->sb.block_size) {
399 pr_err("Couldn't attach %s: block size less than set's block size",
404 found = !bch_cached_dev_inode_find_by_uuid(c,
405 &dc->disk_sb.sb->disk_uuid,
408 if (!found && BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
409 pr_err("Couldn't find uuid for %s in set", buf);
414 (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE ||
415 BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE)) {
417 bch_inode_rm(c, bcache_dev_inum(&dc->disk));
420 /* Deadlocks since we're called via sysfs...
421 sysfs_remove_file(&dc->kobj, &sysfs_attach);
427 closure_init_stack(&cl);
429 bkey_inode_blockdev_init(&dc->disk.inode.k_i);
430 dc->disk.inode.k.type = BCH_INODE_BLOCKDEV;
431 SET_CACHED_DEV(&dc->disk.inode.v, true);
432 dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid;
433 memcpy(dc->disk.inode.v.i_label,
434 dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
435 dc->disk.inode.v.i_ctime = rtime;
436 dc->disk.inode.v.i_mtime = rtime;
438 ret = bch_inode_create(c, &dc->disk.inode.k_i,
439 0, BLOCKDEV_INODE_MAX,
440 &c->unused_inode_hint);
442 pr_err("Error %d, not caching %s", ret, buf);
446 pr_info("attached inode %llu", bcache_dev_inum(&dc->disk));
448 dc->disk_sb.sb->set_uuid = c->sb.uuid;
449 SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
451 bch_write_bdev_super(dc, &cl);
454 dc->disk.inode.v.i_mtime = rtime;
455 bch_btree_update(c, BTREE_ID_INODES,
456 &dc->disk.inode.k_i, NULL);
459 /* Count dirty sectors before attaching */
460 if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY)
461 bch_sectors_dirty_init(dc, c);
463 ret = bcache_device_attach(&dc->disk, c);
467 list_move(&dc->list, &c->cached_devs);
468 calc_cached_dev_sectors(c);
471 * dc->c must be set before dc->count != 0 - paired with the mb in
475 atomic_set(&dc->count, 1);
477 if (bch_cached_dev_writeback_start(dc))
480 if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
481 atomic_set(&dc->has_dirty, 1);
482 atomic_inc(&dc->count);
485 bch_cached_dev_run(dc);
486 bcache_device_link(&dc->disk, c, "bdev");
488 pr_info("Caching %s as %s on set %pU",
489 bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name,
490 dc->disk.c->sb.uuid.b);
/*
 * Try to attach every currently unattached backing device to cache set @c.
 *
 * Caller must hold both bch_register_lock and c->state_lock (checked by
 * lockdep).
 *
 * The _safe list iterator is needed because a successful
 * bch_cached_dev_attach() moves the entry from uncached_devices onto
 * c->cached_devs. The result of each attach attempt is ignored here;
 * bch_cached_dev_attach() logs its own failures.
 */
494 void bch_attach_backing_devs(struct cache_set *c)
496 struct cached_dev *dc, *t;
498 lockdep_assert_held(&bch_register_lock);
499 lockdep_assert_held(&c->state_lock);
501 list_for_each_entry_safe(dc, t, &uncached_devices, list)
502 bch_cached_dev_attach(dc, c);
/*
 * kobject release handler for a cached device.
 *
 * Recovers the cached_dev that embeds @kobj and drops the module reference
 * taken with __module_get() in bch_backing_dev_register().
 *
 * NOTE(review): the rest of the container_of() expression and the freeing
 * of the cached_dev itself are on lines not visible in this view.
 */
505 void bch_cached_dev_release(struct kobject *kobj)
507 struct cached_dev *dc = container_of(kobj, struct cached_dev,
510 module_put(THIS_MODULE);
/*
 * Final teardown stage for a cached device; cached_dev_flush() continues
 * into this function on system_wq.
 *
 * Stops and frees the writeback machinery, then under bch_register_lock
 * removes the holder link to the backing device (only when the device was
 * running) and frees the generic bcache_device state. Afterwards the
 * superblock resources are released with bch_free_super() and the reference
 * on the kobject of the device is dropped.
 *
 * NOTE(review): removal of dc from whichever device list it is on is not
 * visible in this view - confirm it happens before the unlock.
 */
513 static void cached_dev_free(struct closure *cl)
515 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
517 bch_cached_dev_writeback_stop(dc);
518 bch_cached_dev_writeback_free(dc);
520 mutex_lock(&bch_register_lock);
522 if (atomic_read(&dc->running))
523 bd_unlink_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
524 bcache_device_free(&dc->disk);
527 mutex_unlock(&bch_register_lock);
529 bch_free_super((void *) &dc->disk_sb);
531 kobject_put(&dc->disk.kobj);
/*
 * First teardown stage for a cached device; installed as the function of
 * dc->disk.cl in bch_backing_dev_register().
 *
 * Removes the sysfs links to the cache set under bch_register_lock,
 * destroys the accounting state, deletes the kobject of the device from
 * sysfs, and then continues with cached_dev_free() on system_wq.
 */
534 static void cached_dev_flush(struct closure *cl)
536 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
537 struct bcache_device *d = &dc->disk;
539 mutex_lock(&bch_register_lock);
540 bcache_device_unlink(d);
541 mutex_unlock(&bch_register_lock);
543 bch_cache_accounting_destroy(&dc->accounting);
544 kobject_del(&d->kobj);
546 continue_at(cl, cached_dev_free, system_wq);
/*
 * Initialise the cached device specific state of @dc and its block device.
 *
 * @dc:         cached device being set up; dc->disk_sb must already be
 *              populated because the backing queue and superblock are read
 * @block_size: forwarded to bcache_device_init()
 *
 * Visible steps:
 *  - set sequential_cutoff to 4 << 20
 *  - put every entry of dc->io on the io_lru list and add each one to the
 *    hash bucket at index RECENT_IO of dc->io_hash
 *  - derive stripe_size from the io_opt limit of the backing queue shifted
 *    right by 9 and, when it is non-zero, copy the
 *    raid_partial_stripes_expensive hint
 *  - create the bcache block device sized as the sector count of the
 *    backing partition minus the data_offset from the superblock
 *  - raise ra_pages of the new queue to at least that of the backing queue
 *  - initialise request handling and writeback
 *
 * NOTE(review): indexing io_hash at RECENT_IO assumes the array has more
 * than RECENT_IO buckets - confirm against the struct definition. The
 * declarations of ret and io and all return statements are on lines not
 * visible in this view.
 */
549 static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
553 struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);
555 dc->sequential_cutoff = 4 << 20;
557 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
558 list_add(&io->lru, &dc->io_lru);
559 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
562 dc->disk.stripe_size = q->limits.io_opt >> 9;
564 if (dc->disk.stripe_size)
565 dc->partial_stripes_expensive =
566 q->limits.raid_partial_stripes_expensive;
568 ret = bcache_device_init(&dc->disk, block_size,
569 dc->disk_sb.bdev->bd_part->nr_sects -
570 le64_to_cpu(dc->disk_sb.sb->data_offset));
574 dc->disk.disk->queue->backing_dev_info.ra_pages =
575 max(dc->disk.disk->queue->backing_dev_info.ra_pages,
576 q->backing_dev_info.ra_pages);
578 bch_cached_dev_request_init(dc);
579 ret = bch_cached_dev_writeback_init(dc);
586 /* Cached device - bcache superblock */
/*
 * Validate a backing device superblock and normalise a few of its fields.
 *
 * Dispatches on the superblock version:
 *  - BCACHE_SB_VERSION_BDEV: data_offset is overwritten with
 *    BDEV_DATA_START_DEFAULT
 *  - BCACHE_SB_VERSION_BDEV_WITH_OFFSET: a data_offset below
 *    BDEV_DATA_START_DEFAULT is rejected
 *  - the remaining visible return reports an unsupported version
 * Afterwards last_mount is set to the current time in seconds.
 *
 * Returns a static error string on failure. NOTE(review): the break
 * statements, the default label and the success return are on lines not
 * visible in this view; presumably success returns NULL - confirm.
 */
588 static const char *bdev_validate_super(struct backingdev_sb *sb)
590 switch (le64_to_cpu(sb->version)) {
591 case BCACHE_SB_VERSION_BDEV:
592 sb->data_offset = cpu_to_le64(BDEV_DATA_START_DEFAULT);
594 case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
595 if (le64_to_cpu(sb->data_offset) < BDEV_DATA_START_DEFAULT)
596 return "Bad data offset";
600 return"Unsupported superblock version";
603 sb->last_mount = cpu_to_le32(get_seconds());
/*
 * Register a backing device described by superblock handle @sb.
 *
 * Visible steps:
 *  - allocate a zeroed cached_dev and take a module reference
 *  - initialise its list heads, closure (with cached_dev_flush as the
 *    closure function), kobject, detach work item, superblock write
 *    semaphore, io lock and accounting
 *  - copy *sb into dc->disk_sb and then zero *sb, so the resources held by
 *    the handle now belong to the cached_dev; the backing block device gets
 *    dc as its bd_holder
 *  - validate the superblock, initialise the device with the superblock
 *    block_size shifted left by 9, add the kobject and the accounting
 *    kobjects
 *  - put the device on uncached_devices, try to attach it to every cache
 *    set on bch_fs_list, and run it immediately when the superblock state
 *    is BDEV_STATE_NONE or BDEV_STATE_STALE
 *  - bch_blockdev_stop() appears at the end, presumably on the error path
 *
 * Returns a static error string on failure. NOTE(review): the declarations
 * of err and c, the goto statements, the locking around the list
 * operations (bch_cached_dev_attach() asserts c->state_lock) and the
 * success return are on lines not visible in this view.
 */
608 const char *bch_backing_dev_register(struct bcache_superblock *sb)
610 char name[BDEVNAME_SIZE];
613 struct cached_dev *dc;
615 dc = kzalloc(sizeof(*dc), GFP_KERNEL);
617 return "cannot allocate memory";
619 __module_get(THIS_MODULE);
620 INIT_LIST_HEAD(&dc->list);
621 closure_init(&dc->disk.cl, NULL);
622 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
623 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
624 INIT_WORK(&dc->detach, cached_dev_detach_finish);
625 sema_init(&dc->sb_write_mutex, 1);
626 INIT_LIST_HEAD(&dc->io_lru);
627 spin_lock_init(&dc->io_lock);
628 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
630 memcpy(&dc->disk_sb, sb, sizeof(*sb));
631 dc->disk_sb.bdev->bd_holder = dc;
632 memset(sb, 0, sizeof(*sb));
634 err = bdev_validate_super(dc->disk_sb.sb);
638 if (cached_dev_init(dc, le16_to_cpu(dc->disk_sb.sb->block_size) << 9))
641 err = "error creating kobject";
642 if (kobject_add(&dc->disk.kobj,
643 &part_to_dev(dc->disk_sb.bdev->bd_part)->kobj,
647 err = "error accounting kobject";
648 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
651 pr_info("registered backing device %s",
652 bdevname(dc->disk_sb.bdev, name));
654 list_add(&dc->list, &uncached_devices);
655 list_for_each_entry(c, &bch_fs_list, list)
656 bch_cached_dev_attach(dc, c);
658 if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE ||
659 BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE)
660 bch_cached_dev_run(dc);
664 bch_blockdev_stop(&dc->disk);
668 /* Flash only volumes */
/*
 * kobject release handler for a flash only volume.
 *
 * Recovers the bcache_device that embeds @kobj.
 * NOTE(review): the rest of the container_of() expression and the freeing
 * of the device are on lines not visible in this view.
 */
670 void bch_blockdev_volume_release(struct kobject *kobj)
672 struct bcache_device *d = container_of(kobj, struct bcache_device,
/*
 * Final teardown stage for a flash only volume; blockdev_volume_flush()
 * continues into this function on system_wq.
 *
 * Frees the generic bcache_device state under bch_register_lock and then
 * drops the reference on the kobject of the device.
 */
677 static void blockdev_volume_free(struct closure *cl)
679 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
681 mutex_lock(&bch_register_lock);
682 bcache_device_free(d);
683 mutex_unlock(&bch_register_lock);
684 kobject_put(&d->kobj);
/*
 * First teardown stage for a flash only volume; installed as the function
 * of d->cl in blockdev_volume_run().
 *
 * Removes the sysfs links to the cache set under bch_register_lock, deletes
 * the kobject of the device from sysfs, and continues with
 * blockdev_volume_free() on system_wq.
 */
687 static void blockdev_volume_flush(struct closure *cl)
689 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
691 mutex_lock(&bch_register_lock);
692 bcache_device_unlink(d);
693 mutex_unlock(&bch_register_lock);
694 kobject_del(&d->kobj);
695 continue_at(cl, blockdev_volume_free, system_wq);
/*
 * Instantiate a block device for the flash only volume described by @inode
 * on cache set @c.
 *
 * Visible steps:
 *  - allocate a zeroed bcache_device and copy the inode key into d->inode
 *  - initialise its closure (with blockdev_volume_flush as the closure
 *    function) and its kobject
 *  - create the disk with block_bytes(c) as block size and the inode i_size
 *    shifted right by 9 as capacity
 *  - attach the device to the cache set and initialise request handling
 *  - add the kobject under the disk device with the name bcache and create
 *    the sysfs links with the volume prefix
 *  - kobject_put() appears at the end, presumably on the error path
 *
 * NOTE(review): the allocation failure check, the declaration of ret, the
 * error branches, the add_disk() call and the return statements are on
 * lines not visible in this view.
 */
698 static int blockdev_volume_run(struct cache_set *c,
699 struct bkey_s_c_inode_blockdev inode)
701 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
708 bkey_reassemble(&d->inode.k_i, inode.s_c);
710 closure_init(&d->cl, NULL);
711 set_closure_fn(&d->cl, blockdev_volume_flush, system_wq);
713 kobject_init(&d->kobj, &bch_blockdev_volume_ktype);
715 ret = bcache_device_init(d, block_bytes(c),
716 le64_to_cpu(inode.v->i_size) >> 9);
720 ret = bcache_device_attach(d, c);
724 bch_blockdev_volume_request_init(d);
727 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
730 bcache_device_link(d, c, "volume");
734 kobject_put(&d->kobj);
/*
 * Start a block device for every blockdev inode found on cache set @c.
 *
 * Checks bch_fs_running(c) first, then walks the inodes btree from POS_MIN.
 * Keys are tested against BLOCKDEV_INODE_MAX and against the
 * BCH_INODE_BLOCKDEV type; a key that passes is converted with
 * bkey_s_c_to_inode_blockdev() and handed to blockdev_volume_run(). The
 * iterator is unlocked after the walk.
 *
 * NOTE(review): the statements executed for each test (presumably an early
 * return, a break and a continue), the handling of ret, the declaration of
 * k and the final return are on lines not visible in this view.
 */
738 int bch_blockdev_volumes_start(struct cache_set *c)
740 struct btree_iter iter;
742 struct bkey_s_c_inode_blockdev inode;
745 if (!bch_fs_running(c))
748 for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
749 if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
752 if (k.k->type != BCH_INODE_BLOCKDEV)
755 inode = bkey_s_c_to_inode_blockdev(k);
757 ret = blockdev_volume_run(c, inode);
761 bch_btree_iter_unlock(&iter);
/*
 * Create a new flash only volume on cache set @c and start it.
 *
 * @c:    cache set that will hold the volume
 * @size: stored in the inode as i_size; blockdev_volume_run() shifts this
 *        value right by 9 to obtain the disk capacity, so it is presumably
 *        a byte count - confirm with callers
 *
 * Builds a blockdev inode with a random uuid and the current time as both
 * creation and modification time, creates it below BLOCKDEV_INODE_MAX using
 * c->unused_inode_hint, logs an error when creation fails, and otherwise
 * returns the result of blockdev_volume_run().
 *
 * NOTE(review): the declaration of ret, the failure test and the error
 * return are on lines not visible in this view.
 */
766 int bch_blockdev_volume_create(struct cache_set *c, u64 size)
768 __le64 rtime = cpu_to_le64(ktime_get_seconds());
769 struct bkey_i_inode_blockdev inode;
772 bkey_inode_blockdev_init(&inode.k_i);
773 get_random_bytes(&inode.v.i_uuid, sizeof(inode.v.i_uuid));
774 inode.v.i_ctime = rtime;
775 inode.v.i_mtime = rtime;
776 inode.v.i_size = cpu_to_le64(size);
778 ret = bch_inode_create(c, &inode.k_i, 0, BLOCKDEV_INODE_MAX,
779 &c->unused_inode_hint);
781 pr_err("Can't create volume: %d", ret);
785 return blockdev_volume_run(c, inode_blockdev_i_to_s_c(&inode));
/*
 * Stop or detach every block device registered with cache set @c.
 *
 * Under bch_register_lock, walks the c->devices radix tree. A device whose
 * inode is marked CACHED_DEV is detached with bch_cached_dev_detach() when
 * BCH_FS_DETACHING is set on the cache set; bch_blockdev_stop() is the
 * other visible action.
 *
 * NOTE(review): the declaration of slot, any RCU read locking around the
 * tree walk, and the else that presumably separates the detach call from
 * the stop call are on lines not visible in this view.
 */
788 void bch_blockdevs_stop(struct cache_set *c)
790 struct cached_dev *dc;
791 struct bcache_device *d;
792 struct radix_tree_iter iter;
795 mutex_lock(&bch_register_lock);
798 radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
799 d = radix_tree_deref_slot(slot);
801 if (CACHED_DEV(&d->inode.v) &&
802 test_bit(BCH_FS_DETACHING, &c->flags)) {
803 dc = container_of(d, struct cached_dev, disk);
804 bch_cached_dev_detach(dc);
806 bch_blockdev_stop(d);
811 mutex_unlock(&bch_register_lock);
/*
 * Per cache set teardown for the block device layer: releases the c->search
 * mempool that bch_fs_blockdev_init() set up.
 */
814 void bch_fs_blockdev_exit(struct cache_set *c)
816 mempool_exit(&c->search);
/*
 * Per cache set setup for the block device layer: initialises the c->search
 * mempool on top of bch_search_cache with a minimum of one element.
 *
 * Returns the result of mempool_init_slab_pool().
 */
819 int bch_fs_blockdev_init(struct cache_set *c)
821 return mempool_init_slab_pool(&c->search, 1, bch_search_cache);
/*
 * Module level teardown for the block device layer.
 *
 * Destroys the search slab cache and, when a major number was successfully
 * registered (bch_blockdev_major is not negative), unregisters the bcache
 * block device major.
 */
824 void bch_blockdev_exit(void)
826 kmem_cache_destroy(bch_search_cache);
828 if (bch_blockdev_major >= 0)
829 unregister_blkdev(bch_blockdev_major, "bcache");
832 int __init bch_blockdev_init(void)
834 bch_blockdev_major = register_blkdev(0, "bcache");
835 if (bch_blockdev_major < 0)
836 return bch_blockdev_major;
838 bch_search_cache = KMEM_CACHE(search, 0);
839 if (!bch_search_cache)