#include "bcache.h"
#include "blockdev.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "checksum.h"
#include "error.h"
#include "inode.h"
#include "request.h"
#include "super-io.h"
#include "writeback.h"

#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>

static int bch_blockdev_major;
static DEFINE_IDA(bch_blockdev_minor);
static LIST_HEAD(uncached_devices);
struct kmem_cache *bch_search_cache;

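/*
 * Backing-device superblock writes: bch_write_bdev_super() serializes
 * writers with sb_write_mutex, checksums the superblock and sets up an
 * FUA write bio; completion is tracked with the sb_write closure, whose
 * destructor releases the mutex.
 */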
static void write_bdev_super_endio(struct bio *bio)
{
	struct cached_dev *dc = bio->bi_private;
	/* XXX: error checking */

	closure_put(&dc->sb_write);
}

static void bch_write_bdev_super_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct backingdev_sb *sb = dc->disk_sb.sb;
	struct closure *cl = &dc->sb_write;
	struct bio *bio = dc->disk_sb.bio;

	down(&dc->sb_write_mutex);
	closure_init(cl, parent);

	sb->csum = csum_vstruct(NULL, BCH_CSUM_CRC64,
				(struct nonce) { 0 }, sb).lo;

	bio_reset(bio);
	bio->bi_bdev		= dc->disk_sb.bdev;
	bio->bi_iter.bi_sector	= le64_to_cpu(sb->offset);
	bio->bi_iter.bi_size	=
		roundup(vstruct_bytes(sb),
			bdev_logical_block_size(dc->disk_sb.bdev));
	bio->bi_end_io		= write_bdev_super_endio;
	bio->bi_private		= dc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FUA|REQ_META);
	bch_bio_map(bio, sb);

	closure_get(cl);

	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}

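/*
 * Returns true if @bdev is already open as the backing device of a
 * cached_dev, whether that device is attached to a cache set or still
 * sitting on the uncached_devices list.
 */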
bool bch_is_open_backing_dev(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->disk_sb.bdev == bdev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->disk_sb.bdev == bdev)
			return true;
	return false;
}

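/*
 * block_device_operations for the exposed bcacheN devices: opens take a
 * ref on the device closure (and are refused once BCACHE_DEV_CLOSING is
 * set), releases drop it, and ioctls are forwarded to the per-device
 * hook.
 */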
static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;

	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;

	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;

	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bch_blockdev_stop(struct bcache_device *d)
{
	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		closure_queue(&d->cl);
}

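/*
 * Create/remove the "cache" symlink under the device kobject and the
 * per-device link under the cache set kobject; BCACHE_DEV_UNLINK_DONE
 * guards against removing the links twice.
 */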
static void bcache_device_unlink(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
		sysfs_remove_link(&d->c->kobj, d->name);
		sysfs_remove_link(&d->kobj, "cache");
	}
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%llu", name, bcache_dev_inum(d));

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");

	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}

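/*
 * Attach/detach a device to/from a cache set: membership lives in the
 * set's device radix tree, keyed by inode number, with a ref on the
 * set's caching closure held while attached. A detach triggered via
 * BCACHE_DEV_DETACHING also removes the device's inode from the btree.
 */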
static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
		mutex_lock(&d->inode_lock);
		bch_inode_rm(d->c, bcache_dev_inum(d));
		mutex_unlock(&d->inode_lock);
	}

	bcache_device_unlink(d);

	radix_tree_delete(&d->c->devices, bcache_dev_inum(d));

	closure_put(&d->c->caching);
	d->c = NULL;
}

static int bcache_device_attach(struct bcache_device *d, struct cache_set *c)
{
	int ret;

	lockdep_assert_held(&bch_register_lock);

	ret = radix_tree_insert(&c->devices, bcache_dev_inum(d), d);
	if (ret) {
		pr_err("radix_tree_insert() error for inum %llu",
		       bcache_dev_inum(d));
		return ret;
	}

	d->c = c;
	closure_get(&c->caching);

	return ret;
}

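/*
 * Final teardown of a bcache_device: detach from the cache set if still
 * attached, then unwind del_gendisk()/blk_cleanup_queue()/put_disk() in
 * the reverse order of bcache_device_init() and release the minor.
 */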
static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk) {
		ida_simple_remove(&bch_blockdev_minor, d->disk->first_minor);
		put_disk(d->disk);
	}

	bioset_exit(&d->bio_split);

	closure_debug_destroy(&d->cl);
}

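/*
 * Common init for cached devices and flash-only volumes: allocate a
 * minor and a gendisk named bcache<minor>, then set up a request queue
 * with our block size, effectively unbounded I/O limits, and discard
 * and writeback caching enabled. The make_request function is left
 * NULL here and wired up later by the request-path init helpers.
 */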
static int bcache_device_init(struct bcache_device *d, unsigned block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	int minor;

	mutex_init(&d->inode_lock);

	minor = ida_simple_get(&bch_blockdev_minor, 0, MINORMASK + 1, GFP_KERNEL);
	if (minor < 0) {
		pr_err("cannot allocate minor");
		return minor;
	}

	if (!(d->disk = alloc_disk(1)) ||
	    bioset_init(&d->bio_split, 4, offsetof(struct bch_read_bio, bio))) {
		pr_err("cannot allocate disk");
		ida_simple_remove(&bch_blockdev_minor, minor);
		return -ENOMEM;
	}

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);

	d->disk->major		= bch_blockdev_major;
	d->disk->first_minor	= minor;
	d->disk->fops		= &bcache_ops;
	d->disk->private_data	= d;

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q) {
		pr_err("cannot allocate queue");
		return -ENOMEM;
	}

	blk_queue_make_request(q, NULL);
	d->disk->queue			= q;
	q->queuedata			= d;
	q->backing_dev_info.congested_data = d;
	q->limits.max_hw_sectors	= UINT_MAX;
	q->limits.max_sectors		= UINT_MAX;
	q->limits.max_segment_size	= UINT_MAX;
	q->limits.max_segments		= BIO_MAX_PAGES;
	blk_queue_max_discard_sectors(q, UINT_MAX);
	q->limits.discard_granularity	= 512;
	q->limits.io_min		= block_size;
	q->limits.logical_block_size	= block_size;
	q->limits.physical_block_size	= block_size;
	set_bit(QUEUE_FLAG_NONROT,	&d->disk->queue->queue_flags);
	clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD,	&d->disk->queue->queue_flags);

	blk_queue_write_cache(q, true, true);

	return 0;
}

/* Cached device */

static void calc_cached_dev_sectors(struct cache_set *c)
{
	u64 sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->disk_sb.bdev);

	c->cached_dev_sectors = sectors;
}

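/*
 * Bring a cached device online: mark a previously-used (non-NONE)
 * backing device stale if it isn't attached yet, register the gendisk,
 * emit a change uevent carrying the backing device's UUID and label,
 * and create the dev <-> bcache sysfs links.
 */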
void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[BCH_SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU",
			  dc->disk_sb.sb->disk_uuid.b),
		NULL,
		NULL,
	};

	memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
	buf[BCH_SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	if (atomic_xchg(&dc->running, 1)) {
		kfree(env[1]);
		kfree(env[2]);
		return;
	}

	if (!d->c &&
	    BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_NONE) {
		struct closure cl;

		closure_init_stack(&cl);

		SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
	/*
	 * Won't show up in the uevent file, use udevadm monitor -e instead;
	 * only class / kset properties are persistent.
	 */
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}

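/*
 * Workqueue half of a detach: runs once the last cached_dev ref is
 * gone, clears the set UUID from the backing superblock, moves the
 * device back to uncached_devices, and drops the closure ref taken in
 * bch_cached_dev_detach().
 */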
static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;

	closure_init_stack(&cl);

	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	memset(&dc->disk_sb.sb->set_uuid, 0, 16);
	SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->disk_sb.bdev, buf));

	/* Drop ref we took in cached_dev_detach() */
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return;

	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
		return;

	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */
	closure_get(&dc->disk.cl);

	dc->writeback_pd.rate.rate = UINT_MAX;
	bch_writeback_queue(dc);
	cached_dev_put(dc);
}

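/*
 * Attach a backing device to a cache set: validate that the set UUID
 * and block size match, look up (or create) the device's blockdev
 * inode, write the updated backing superblock, and start writeback and
 * the block device itself.
 */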
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	__le64 rtime = cpu_to_le64(ktime_get_seconds());
	char buf[BDEVNAME_SIZE];
	bool found;
	int ret;

	bdevname(dc->disk_sb.bdev, buf);

	if (memcmp(&dc->disk_sb.sb->set_uuid,
		   &c->sb.uuid,
		   sizeof(c->sb.uuid)))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return 0;

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (le16_to_cpu(dc->disk_sb.sb->block_size) < c->sb.block_size) {
		/* Will die */
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	found = !bch_cached_dev_inode_find_by_uuid(c,
					&dc->disk_sb.sb->disk_uuid,
					&dc->disk.inode);

	if (!found && BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
		pr_err("Couldn't find uuid for %s in set", buf);
		return -ENOENT;
	}

	if (found &&
	    (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE)) {
		found = false;
		bch_inode_rm(c, bcache_dev_inum(&dc->disk));
	}

	/* Deadlocks since we're called via sysfs...
	sysfs_remove_file(&dc->kobj, &sysfs_attach);
	 */

	if (!found) {
		struct closure cl;

		closure_init_stack(&cl);

		bkey_inode_blockdev_init(&dc->disk.inode.k_i);
		dc->disk.inode.k.type = BCH_INODE_BLOCKDEV;
		SET_CACHED_DEV(&dc->disk.inode.v, true);
		dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid;
		memcpy(dc->disk.inode.v.i_label,
		       dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
		dc->disk.inode.v.i_ctime = rtime;
		dc->disk.inode.v.i_mtime = rtime;

		ret = bch_inode_create(c, &dc->disk.inode.k_i,
				       0, BLOCKDEV_INODE_MAX,
				       &c->unused_inode_hint);
		if (ret) {
			pr_err("Error %d, not caching %s", ret, buf);
			return ret;
		}

		pr_info("attached inode %llu", bcache_dev_inum(&dc->disk));

		dc->disk_sb.sb->set_uuid = c->sb.uuid;
		SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		dc->disk.inode.v.i_mtime = rtime;
		bch_btree_update(c, BTREE_ID_INODES,
				 &dc->disk.inode.k_i, NULL);
	}

	/* Count dirty sectors before attaching */
	if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY)
		bch_sectors_dirty_init(dc, c);

	ret = bcache_device_attach(&dc->disk, c);
	if (ret)
		return ret;

	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
	smp_wmb();
	atomic_set(&dc->count, 1);

	if (bch_cached_dev_writeback_start(dc))
		return -ENOMEM;

	if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.uuid.b);
	return 0;
}

void bch_attach_backing_devs(struct cache_set *c)
{
	struct cached_dev *dc, *t;

	lockdep_assert_held(&bch_register_lock);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

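/*
 * Shutdown path for a cached device: cached_dev_flush() runs when the
 * device closure is queued, unlinks it from sysfs, then continues into
 * cached_dev_free() on the system workqueue for the final teardown.
 */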
static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	bch_cached_dev_writeback_stop(dc);
	bch_cached_dev_writeback_free(dc);

	mutex_lock(&bch_register_lock);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	bch_free_super((void *) &dc->disk_sb);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

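/*
 * Per-device initialization for a backing device: seed the recent-I/O
 * LRU (used with sequential_cutoff to detect sequential I/O), inherit
 * stripe geometry and readahead from the underlying queue, and size
 * the bcache device to everything past data_offset.
 */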
static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);

	dc->sequential_cutoff		= 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	dc->disk.stripe_size = q->limits.io_opt >> 9;

	if (dc->disk.stripe_size)
		dc->partial_stripes_expensive =
			q->limits.raid_partial_stripes_expensive;

	ret = bcache_device_init(&dc->disk, block_size,
			 dc->disk_sb.bdev->bd_part->nr_sects -
			 le64_to_cpu(dc->disk_sb.sb->data_offset));
	if (ret)
		return ret;

	dc->disk.disk->queue->backing_dev_info.ra_pages =
		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
		    q->backing_dev_info.ra_pages);

	bch_cached_dev_request_init(dc);
	ret = bch_cached_dev_writeback_init(dc);
	if (ret)
		return ret;

	return 0;
}

/* Cached device - bcache superblock */

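/*
 * Version-specific validation of a backing-device superblock: older
 * superblocks get the default data offset filled in, offset-carrying
 * ones are sanity checked. Returns an error string, or NULL on success.
 */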
static const char *bdev_validate_super(struct backingdev_sb *sb)
{
	switch (le64_to_cpu(sb->version)) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = cpu_to_le64(BDEV_DATA_START_DEFAULT);
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		if (le64_to_cpu(sb->data_offset) < BDEV_DATA_START_DEFAULT)
			return "Bad data offset";

		break;
	default:
		return "Unsupported superblock version";
	}

	sb->last_mount	= cpu_to_le32(get_seconds());

	return NULL;
}

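/*
 * Register a backing device from an already-read superblock: allocate
 * and initialize the cached_dev, take ownership of the superblock,
 * publish it in sysfs, then try to attach it to every registered cache
 * set. Returns an error string, or NULL on success.
 */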
const char *bch_backing_dev_register(struct bcache_superblock *sb)
{
	char name[BDEVNAME_SIZE];
	const char *err;
	struct cache_set *c;
	struct cached_dev *dc;

	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
	if (!dc)
		return "cannot allocate memory";

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	sema_init(&dc->sb_write_mutex, 1);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	memcpy(&dc->disk_sb, sb, sizeof(*sb));
	dc->disk_sb.bdev->bd_holder = dc;
	memset(sb, 0, sizeof(*sb));

	err = bdev_validate_super(dc->disk_sb.sb);
	if (err)
		goto err;

	err = "error initializing cached device";
	if (cached_dev_init(dc, le16_to_cpu(dc->disk_sb.sb->block_size) << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj,
			&part_to_dev(dc->disk_sb.bdev->bd_part)->kobj,
			"bcache"))
		goto err;

	err = "error accounting kobject";
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s",
		bdevname(dc->disk_sb.bdev, name));

	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return NULL;
err:
	bch_blockdev_stop(&dc->disk);
	return err;
}

/* Flash only volumes */

void bch_blockdev_volume_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void blockdev_volume_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	bcache_device_free(d);
	mutex_unlock(&bch_register_lock);
	kobject_put(&d->kobj);
}

static void blockdev_volume_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);
	kobject_del(&d->kobj);
	continue_at(cl, blockdev_volume_free, system_wq);
}

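/*
 * Instantiate a flash-only volume from its blockdev inode: set up the
 * bcache_device sized from i_size, attach it to the cache set, and
 * register the disk and its kobject.
 */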
static int blockdev_volume_run(struct cache_set *c,
			       struct bkey_s_c_inode_blockdev inode)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	int ret = -ENOMEM;

	if (!d)
		return ret;

	bkey_reassemble(&d->inode.k_i, inode.s_c);

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, blockdev_volume_flush, system_wq);

	kobject_init(&d->kobj, &bch_blockdev_volume_ktype);

	ret = bcache_device_init(d, block_bytes(c),
				 le64_to_cpu(inode.v->i_size) >> 9);
	if (ret)
		goto err;

	ret = bcache_device_attach(d, c);
	if (ret)
		goto err;

	bch_blockdev_volume_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache")) {
		ret = -ENOMEM;
		goto err;
	}

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return ret;
}

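/*
 * Scan the inodes btree at startup and bring up a volume for every
 * BCH_INODE_BLOCKDEV key in the blockdev inode range.
 */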
int bch_blockdev_volumes_start(struct cache_set *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_inode_blockdev inode;
	int ret = 0;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINVAL;

	for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
		if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
			break;

		if (k.k->type != BCH_INODE_BLOCKDEV)
			continue;

		inode = bkey_s_c_to_inode_blockdev(k);

		ret = blockdev_volume_run(c, inode);
		if (ret)
			break;
	}
	bch_btree_iter_unlock(&iter);

	return ret;
}

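/*
 * Create a new flash-only volume of @size bytes: allocate a blockdev
 * inode with a random UUID, insert it into the inodes btree, and bring
 * the volume online.
 */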
int bch_blockdev_volume_create(struct cache_set *c, u64 size)
{
	__le64 rtime = cpu_to_le64(ktime_get_seconds());
	struct bkey_i_inode_blockdev inode;
	int ret;

	bkey_inode_blockdev_init(&inode.k_i);
	get_random_bytes(&inode.v.i_uuid, sizeof(inode.v.i_uuid));
	inode.v.i_ctime = rtime;
	inode.v.i_mtime = rtime;
	inode.v.i_size = cpu_to_le64(size);

	ret = bch_inode_create(c, &inode.k_i, 0, BLOCKDEV_INODE_MAX,
			       &c->unused_inode_hint);
	if (ret) {
		pr_err("Can't create volume: %d", ret);
		return ret;
	}

	return blockdev_volume_run(c, inode_blockdev_i_to_s_c(&inode));
}

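/*
 * Stop all block devices on a cache set: cached devices are detached
 * when the set is being unregistered, so they can be re-attached later;
 * everything else is stopped outright.
 */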
void bch_blockdevs_stop(struct cache_set *c)
{
	struct cached_dev *dc;
	struct bcache_device *d;
	struct radix_tree_iter iter;
	void **slot;

	mutex_lock(&bch_register_lock);
	rcu_read_lock();

	radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
		d = radix_tree_deref_slot(slot);

		if (CACHED_DEV(&d->inode.v) &&
		    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
			dc = container_of(d, struct cached_dev, disk);
			bch_cached_dev_detach(dc);
		} else {
			bch_blockdev_stop(d);
		}
	}

	rcu_read_unlock();
	mutex_unlock(&bch_register_lock);
}

void bch_blockdev_exit(void)
{
	kmem_cache_destroy(bch_search_cache);

	if (bch_blockdev_major >= 0)
		unregister_blkdev(bch_blockdev_major, "bcache");
}

int __init bch_blockdev_init(void)
{
	bch_blockdev_major = register_blkdev(0, "bcache");
	if (bch_blockdev_major < 0)
		return bch_blockdev_major;

	bch_search_cache = KMEM_CACHE(search, 0);
	if (!bch_search_cache) {
		unregister_blkdev(bch_blockdev_major, "bcache");
		return -ENOMEM;
	}

	return 0;
}