libbcache/super.c
1 /*
2  * bcache setup/teardown code, and some metadata io - read a superblock and
3  * figure out what to do with it.
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8
9 #include "bcache.h"
10 #include "blockdev.h"
11 #include "alloc.h"
12 #include "btree_cache.h"
13 #include "btree_gc.h"
14 #include "btree_update.h"
15 #include "btree_io.h"
16 #include "chardev.h"
17 #include "checksum.h"
18 #include "clock.h"
19 #include "compress.h"
20 #include "debug.h"
21 #include "error.h"
22 #include "fs.h"
23 #include "fs-gc.h"
24 #include "inode.h"
25 #include "io.h"
26 #include "journal.h"
27 #include "keylist.h"
28 #include "move.h"
29 #include "migrate.h"
30 #include "movinggc.h"
31 #include "notify.h"
32 #include "stats.h"
33 #include "super.h"
34 #include "tier.h"
35 #include "writeback.h"
36
37 #include <linux/backing-dev.h>
38 #include <linux/blkdev.h>
39 #include <linux/debugfs.h>
40 #include <linux/genhd.h>
41 #include <linux/idr.h>
42 #include <linux/kthread.h>
43 #include <linux/module.h>
44 #include <linux/percpu.h>
45 #include <linux/random.h>
46 #include <linux/reboot.h>
47 #include <linux/sysfs.h>
48 #include <crypto/hash.h>
49
50 #include <trace/events/bcache.h>
51
52 MODULE_LICENSE("GPL");
53 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
54
55 static const uuid_le invalid_uuid = {
56         .b = {
57                 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
58                 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
59         }
60 };
61
62 static struct kset *bcache_kset;
63 struct mutex bch_register_lock;
64 LIST_HEAD(bch_cache_sets);
65
66 static int bch_chardev_major;
67 static struct class *bch_chardev_class;
68 static struct device *bch_chardev;
69 static DEFINE_IDR(bch_chardev_minor);
70 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
71 struct workqueue_struct *bcache_io_wq;
72 struct crypto_shash *bch_sha1;
73
74 static void bch_cache_stop(struct cache *);
75 static int bch_cache_online(struct cache *);
76
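/*
 * Helpers for checking whether a block device is already in use by a
 * registered cache set (or backing device), so that a duplicate
 * registration of the same device can be rejected.
 */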
77 static bool bch_is_open_cache(struct block_device *bdev)
78 {
79         struct cache_set *c;
80         struct cache *ca;
81         unsigned i;
82
83         rcu_read_lock();
84         list_for_each_entry(c, &bch_cache_sets, list)
85                 for_each_cache_rcu(ca, c, i)
86                         if (ca->disk_sb.bdev == bdev) {
87                                 rcu_read_unlock();
88                                 return true;
89                         }
90         rcu_read_unlock();
91         return false;
92 }
93
94 static bool bch_is_open(struct block_device *bdev)
95 {
96         lockdep_assert_held(&bch_register_lock);
97
98         return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
99 }
100
101 static const char *bch_blkdev_open(const char *path, void *holder,
102                                    struct block_device **ret)
103 {
104         struct block_device *bdev;
105         const char *err;
106
107         *ret = NULL;
108         bdev = blkdev_get_by_path(path, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
109                                   holder);
110
111         if (bdev == ERR_PTR(-EBUSY)) {
112                 bdev = lookup_bdev(path);
113                 if (IS_ERR(bdev))
114                         return "device busy";
115
116                 err = bch_is_open(bdev)
117                         ? "device already registered"
118                         : "device busy";
119
120                 bdput(bdev);
121                 return err;
122         }
123
124         if (IS_ERR(bdev))
125                 return "failed to open device";
126
127         bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
128
129         *ret = bdev;
130         return NULL;
131 }
132
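/*
 * Congested callback for the cache set's backing_dev_info: reads
 * (WB_sync_congested) may be serviced by any device, so all caches are
 * checked; writes only go to tier 0, so only that group is checked.
 */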
133 static int bch_congested_fn(void *data, int bdi_bits)
134 {
135         struct backing_dev_info *bdi;
136         struct cache_set *c = data;
137         struct cache *ca;
138         unsigned i;
139         int ret = 0;
140
141         rcu_read_lock();
142         if (bdi_bits & (1 << WB_sync_congested)) {
143                 /* Reads - check all devices: */
144                 for_each_cache_rcu(ca, c, i) {
145                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
146
147                         if (bdi_congested(bdi, bdi_bits)) {
148                                 ret = 1;
149                                 break;
150                         }
151                 }
152         } else {
153                 /* Writes only go to tier 0: */
154                 group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
155                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
156
157                         if (bdi_congested(bdi, bdi_bits)) {
158                                 ret = 1;
159                                 break;
160                         }
161                 }
162         }
163         rcu_read_unlock();
164
165         return ret;
166 }
167
168 /* Superblock */
169
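/* Convert an on-disk (little endian) cache_member to the CPU-native form: */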
170 static struct cache_member_cpu cache_mi_to_cpu_mi(struct cache_member *mi)
171 {
172         return (struct cache_member_cpu) {
173                 .nbuckets       = le64_to_cpu(mi->nbuckets),
174                 .first_bucket   = le16_to_cpu(mi->first_bucket),
175                 .bucket_size    = le16_to_cpu(mi->bucket_size),
176                 .state          = CACHE_STATE(mi),
177                 .tier           = CACHE_TIER(mi),
178                 .replication_set= CACHE_REPLICATION_SET(mi),
179                 .has_metadata   = CACHE_HAS_METADATA(mi),
180                 .has_data       = CACHE_HAS_DATA(mi),
181                 .replacement    = CACHE_REPLACEMENT(mi),
182                 .discard        = CACHE_DISCARD(mi),
183                 .valid          = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
184         };
185 }
186
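/*
 * Sanity check a superblock before using it: version, block and bucket
 * sizes, UUIDs, replica counts, btree node size, journal settings and
 * journal bucket layout. Returns an error string, or NULL if valid.
 */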
187 static const char *validate_cache_super(struct bcache_superblock *disk_sb)
188 {
189         struct cache_sb *sb = disk_sb->sb;
190         struct cache_member_cpu mi;
191         u16 block_size;
192         unsigned i;
193
194         switch (le64_to_cpu(sb->version)) {
195         case BCACHE_SB_VERSION_CDEV_V0:
196         case BCACHE_SB_VERSION_CDEV_WITH_UUID:
197         case BCACHE_SB_VERSION_CDEV_V2:
198         case BCACHE_SB_VERSION_CDEV_V3:
199                 break;
200         default:
201                 return "Unsupported superblock version";
202         }
203
204         if (CACHE_SET_SYNC(sb) &&
205             le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V3)
206                 return "Unsupported superblock version";
207
208         block_size = le16_to_cpu(sb->block_size);
209
210         if (!is_power_of_2(block_size) ||
211             block_size > PAGE_SECTORS)
212                 return "Bad block size";
213
214         if (bch_is_zero(sb->disk_uuid.b, sizeof(uuid_le)))
215                 return "Bad disk UUID";
216
217         if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
218                 return "Bad user UUID";
219
220         if (bch_is_zero(sb->set_uuid.b, sizeof(uuid_le)))
221                 return "Bad set UUID";
222
223         if (!sb->nr_in_set ||
224             sb->nr_in_set <= sb->nr_this_dev ||
225             sb->nr_in_set > MAX_CACHES_PER_SET)
226                 return "Bad cache device number in set";
227
228         if (!CACHE_SET_META_REPLICAS_WANT(sb) ||
229             CACHE_SET_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
230                 return "Invalid number of metadata replicas";
231
232         if (!CACHE_SET_META_REPLICAS_HAVE(sb) ||
233             CACHE_SET_META_REPLICAS_HAVE(sb) >
234             CACHE_SET_META_REPLICAS_WANT(sb))
235                 return "Invalid number of metadata replicas";
236
237         if (!CACHE_SET_DATA_REPLICAS_WANT(sb) ||
238             CACHE_SET_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
239                 return "Invalid number of data replicas";
240
241         if (!CACHE_SET_DATA_REPLICAS_HAVE(sb) ||
242             CACHE_SET_DATA_REPLICAS_HAVE(sb) >
243             CACHE_SET_DATA_REPLICAS_WANT(sb))
244                 return "Invalid number of data replicas";
245
246         if (CACHE_SB_CSUM_TYPE(sb) >= BCH_CSUM_NR)
247                 return "Invalid checksum type";
248
249         if (!CACHE_SET_BTREE_NODE_SIZE(sb))
250                 return "Btree node size not set";
251
252         if (!is_power_of_2(CACHE_SET_BTREE_NODE_SIZE(sb)))
253                 return "Btree node size not a power of two";
254
255         if (CACHE_SET_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
256                 return "Btree node size too large";
257
258         /* Default value, for old filesystems: */
259         if (!CACHE_SET_GC_RESERVE(sb))
260                 SET_CACHE_SET_GC_RESERVE(sb, 10);
261
262         if (CACHE_SET_GC_RESERVE(sb) < 5)
263                 return "gc reserve percentage too small";
264
265         if (!CACHE_SET_JOURNAL_ENTRY_SIZE(sb))
266                 SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb, 9);
267
268         /* 4 MB max: */
269         if (512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
270                 return "max journal entry size too big";
271
272         if (le16_to_cpu(sb->u64s) < bch_journal_buckets_offset(sb))
273                 return "Invalid superblock: member info area missing";
274
275         mi = cache_mi_to_cpu_mi(sb->members + sb->nr_this_dev);
276
277         if (mi.nbuckets > LONG_MAX)
278                 return "Too many buckets";
279
280         if (mi.nbuckets < 1 << 8)
281                 return "Not enough buckets";
282
283         if (!is_power_of_2(mi.bucket_size) ||
284             mi.bucket_size < PAGE_SECTORS ||
285             mi.bucket_size < block_size)
286                 return "Bad bucket size";
287
288         if (get_capacity(disk_sb->bdev->bd_disk) <
289             mi.bucket_size * mi.nbuckets)
290                 return "Invalid superblock: device too small";
291
292         if (le64_to_cpu(sb->offset) +
293             (__set_blocks(sb, le16_to_cpu(sb->u64s),
294                           block_size << 9) * block_size) >
295             mi.first_bucket * mi.bucket_size)
296                 return "Invalid superblock: first bucket comes before end of super";
297
298         for (i = 0; i < bch_nr_journal_buckets(sb); i++)
299                 if (journal_bucket(sb, i) <  mi.first_bucket ||
300                     journal_bucket(sb, i) >= mi.nbuckets)
301                         return "bad journal bucket";
302
303         return NULL;
304 }
305
306 void free_super(struct bcache_superblock *sb)
307 {
308         if (sb->bio)
309                 bio_put(sb->bio);
310         if (!IS_ERR_OR_NULL(sb->bdev))
311                 blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
312
313         free_pages((unsigned long) sb->sb, sb->page_order);
314         memset(sb, 0, sizeof(*sb));
315 }
316
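/*
 * Grow the in-memory superblock buffer to the given page order,
 * preserving its current contents and reallocating the bio used to
 * read/write it. No-op if the buffer is already big enough.
 */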
317 static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
318 {
319         struct cache_sb *new_sb;
320         struct bio *bio;
321
322         if (sb->page_order >= order && sb->sb)
323                 return 0;
324
325         new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
326         if (!new_sb)
327                 return -ENOMEM;
328
329         bio = (dynamic_fault("bcache:add:super_realloc")
330                ? NULL
331                : bio_kmalloc(GFP_KERNEL, 1 << order));
332         if (!bio) {
333                 free_pages((unsigned long) new_sb, order);
334                 return -ENOMEM;
335         }
336
337         if (sb->sb)
338                 memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
339
340         free_pages((unsigned long) sb->sb, sb->page_order);
341         sb->sb = new_sb;
342
343         if (sb->bio)
344                 bio_put(sb->bio);
345         sb->bio = bio;
346
347         sb->page_order = order;
348
349         return 0;
350 }
351
352 int bch_super_realloc(struct bcache_superblock *sb, unsigned u64s)
353 {
354         struct cache_member *mi = sb->sb->members + sb->sb->nr_this_dev;
355         char buf[BDEVNAME_SIZE];
356         size_t bytes = __set_bytes((struct cache_sb *) NULL, u64s);
357         u64 want = bytes + (SB_SECTOR << 9);
358
359         u64 first_bucket_offset = (u64) le16_to_cpu(mi->first_bucket) *
360                 ((u64) le16_to_cpu(mi->bucket_size) << 9);
361
362         if (want > first_bucket_offset) {
363                 pr_err("%s: superblock too big: want %llu but have %llu",
364                        bdevname(sb->bdev, buf), want, first_bucket_offset);
365                 return -ENOSPC;
366         }
367
368         return __bch_super_realloc(sb, get_order(bytes));
369 }
370
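/*
 * Read the superblock at SB_SECTOR, retrying with a larger buffer once
 * the on-disk u64s count is known, then verify the magic, offset and
 * checksum. Returns an error string, or NULL on success.
 */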
371 static const char *read_super(struct bcache_superblock *sb,
372                               const char *path)
373 {
374         const char *err;
375         unsigned order = 0;
376
377         lockdep_assert_held(&bch_register_lock);
378
379         memset(sb, 0, sizeof(*sb));
380
381         err = bch_blkdev_open(path, &sb, &sb->bdev);
382         if (err)
383                 return err;
384 retry:
385         err = "cannot allocate memory";
386         if (__bch_super_realloc(sb, order))
387                 goto err;
388
389         err = "dynamic fault";
390         if (cache_set_init_fault("read_super"))
391                 goto err;
392
393         bio_reset(sb->bio);
394         sb->bio->bi_bdev = sb->bdev;
395         sb->bio->bi_iter.bi_sector = SB_SECTOR;
396         sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
397         bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
398         bch_bio_map(sb->bio, sb->sb);
399
400         err = "IO error";
401         if (submit_bio_wait(sb->bio))
402                 goto err;
403
404         err = "Not a bcache superblock";
405         if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
406                 goto err;
407
408         err = "Superblock has incorrect offset";
409         if (le64_to_cpu(sb->sb->offset) != SB_SECTOR)
410                 goto err;
411
412         pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
413                  le64_to_cpu(sb->sb->version),
414                  le64_to_cpu(sb->sb->flags),
415                  le64_to_cpu(sb->sb->seq),
416                  le16_to_cpu(sb->sb->u64s));
417
418         err = "Superblock block size smaller than device block size";
419         if (le16_to_cpu(sb->sb->block_size) << 9 <
420             bdev_logical_block_size(sb->bdev))
421                 goto err;
422
423         order = get_order(__set_bytes(sb->sb, le16_to_cpu(sb->sb->u64s)));
424         if (order > sb->page_order)
425                 goto retry;
426
427         err = "bad checksum reading superblock";
428         if (le64_to_cpu(sb->sb->csum) !=
429             __csum_set(sb->sb, le16_to_cpu(sb->sb->u64s),
430                        le64_to_cpu(sb->sb->version) <
431                        BCACHE_SB_VERSION_CDEV_V3
432                        ? BCH_CSUM_CRC64
433                        : CACHE_SB_CSUM_TYPE(sb->sb)))
434                 goto err;
435
436         return NULL;
437 err:
438         free_super(sb);
439         return err;
440 }
441
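/*
 * Submit an (already checksummed) superblock write to a single device;
 * the caller sets up bi_end_io/bi_private on the superblock bio
 * beforehand.
 */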
442 void __write_super(struct cache_set *c, struct bcache_superblock *disk_sb)
443 {
444         struct cache_sb *sb = disk_sb->sb;
445         struct bio *bio = disk_sb->bio;
446
447         bio->bi_bdev            = disk_sb->bdev;
448         bio->bi_iter.bi_sector  = SB_SECTOR;
449         bio->bi_iter.bi_size    =
450                 roundup(__set_bytes(sb, le16_to_cpu(sb->u64s)),
451                         bdev_logical_block_size(disk_sb->bdev));
452         bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
453         bch_bio_map(bio, sb);
454
455         pr_debug("ver %llu, flags %llu, seq %llu",
456                  le64_to_cpu(sb->version),
457                  le64_to_cpu(sb->flags),
458                  le64_to_cpu(sb->seq));
459
460         bch_generic_make_request(bio, c);
461 }
462
463 static void write_super_endio(struct bio *bio)
464 {
465         struct cache *ca = bio->bi_private;
466
467         /* XXX: return errors directly */
468
469         cache_fatal_io_err_on(bio->bi_error, ca, "superblock write");
470
471         bch_account_io_completion(ca);
472
473         closure_put(&ca->set->sb_write);
474         percpu_ref_put(&ca->ref);
475 }
476
477 static void bcache_write_super_unlock(struct closure *cl)
478 {
479         struct cache_set *c = container_of(cl, struct cache_set, sb_write);
480
481         up(&c->sb_write_mutex);
482 }
483
484 /* Update cached mi: */
485 static int cache_set_mi_update(struct cache_set *c,
486                                struct cache_member *mi,
487                                unsigned nr_in_set)
488 {
489         struct cache_member_rcu *new, *old;
490         struct cache *ca;
491         unsigned i;
492
493         mutex_lock(&c->mi_lock);
494
495         new = kzalloc(sizeof(struct cache_member_rcu) +
496                       sizeof(struct cache_member_cpu) * nr_in_set,
497                       GFP_KERNEL);
498         if (!new) {
499                 mutex_unlock(&c->mi_lock);
500                 return -ENOMEM;
501         }
502
503         new->nr_in_set = nr_in_set;
504
505         for (i = 0; i < nr_in_set; i++)
506                 new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
507
508         rcu_read_lock();
509         for_each_cache(ca, c, i)
510                 ca->mi = new->m[i];
511         rcu_read_unlock();
512
513         old = rcu_dereference_protected(c->members,
514                                 lockdep_is_held(&c->mi_lock));
515
516         rcu_assign_pointer(c->members, new);
517         if (old)
518                 kfree_rcu(old, rcu);
519
520         mutex_unlock(&c->mi_lock);
521         return 0;
522 }
523
524 /* doesn't copy member info */
525 static void __copy_super(struct cache_sb *dst, struct cache_sb *src)
526 {
527         dst->version            = src->version;
528         dst->seq                = src->seq;
529         dst->user_uuid          = src->user_uuid;
530         dst->set_uuid           = src->set_uuid;
531         memcpy(dst->label, src->label, SB_LABEL_SIZE);
532         dst->flags              = src->flags;
533         dst->flags2             = src->flags2;
534         dst->nr_in_set          = src->nr_in_set;
535         dst->block_size         = src->block_size;
536 }
537
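/*
 * Initialize the cache_set's in-memory superblock and member info from
 * a device superblock read off disk:
 */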
538 static int cache_sb_to_cache_set(struct cache_set *c, struct cache_sb *src)
539 {
540         struct cache_member *new;
541
542         lockdep_assert_held(&bch_register_lock);
543
544         new = kzalloc(sizeof(struct cache_member) * src->nr_in_set,
545                       GFP_KERNEL);
546         if (!new)
547                 return -ENOMEM;
548
549         memcpy(new, src->members,
550                src->nr_in_set * sizeof(struct cache_member));
551
552         if (cache_set_mi_update(c, new, src->nr_in_set)) {
553                 kfree(new);
554                 return -ENOMEM;
555         }
556
557         kfree(c->disk_mi);
558         c->disk_mi = new;
559
560         __copy_super(&c->disk_sb, src);
561
562         c->sb.block_size        = le16_to_cpu(src->block_size);
563         c->sb.btree_node_size   = CACHE_SET_BTREE_NODE_SIZE(src);
564         c->sb.nr_in_set         = src->nr_in_set;
565         c->sb.clean             = CACHE_SET_CLEAN(src);
566         c->sb.meta_replicas_have= CACHE_SET_META_REPLICAS_HAVE(src);
567         c->sb.data_replicas_have= CACHE_SET_DATA_REPLICAS_HAVE(src);
568         c->sb.str_hash_type     = CACHE_SET_STR_HASH_TYPE(src);
569
570         return 0;
571 }
572
573 static int cache_sb_from_cache_set(struct cache_set *c, struct cache *ca)
574 {
575         struct cache_sb *src = &c->disk_sb, *dst = ca->disk_sb.sb;
576
577         if (src->nr_in_set != dst->nr_in_set) {
578                 /*
579                  * We have to preserve the list of journal buckets on the
580                  * cache's superblock:
581                  */
582                 unsigned old_offset = bch_journal_buckets_offset(dst);
583                 unsigned u64s = bch_journal_buckets_offset(src)
584                         + bch_nr_journal_buckets(dst);
585                 int ret = bch_super_realloc(&ca->disk_sb, u64s);
586
587                 if (ret)
588                         return ret;
589
590                 dst->nr_in_set  = src->nr_in_set;
591                 dst->u64s       = cpu_to_le16(u64s);
592
593                 memmove(dst->_data + bch_journal_buckets_offset(dst),
594                         dst->_data + old_offset,
595                         bch_nr_journal_buckets(dst) * sizeof(u64));
596         }
597
598         memcpy(dst->_data,
599                c->disk_mi,
600                src->nr_in_set * sizeof(struct cache_member));
601
602         __copy_super(dst, src);
603
604         return 0;
605 }
606
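/*
 * Write the superblock out to every member device: bump the sequence
 * number, regenerate each device's superblock from the cache_set's
 * master copy, checksum it and submit the writes under the sb_write
 * closure. The caller must hold sb_write_mutex; it's released when the
 * writes complete.
 */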
607 static void __bcache_write_super(struct cache_set *c)
608 {
609         struct closure *cl = &c->sb_write;
610         struct cache *ca;
611         unsigned i;
612
613         cache_set_mi_update(c, c->disk_mi, c->sb.nr_in_set);
614
615         closure_init(cl, &c->cl);
616
617         le64_add_cpu(&c->disk_sb.seq, 1);
618
619         for_each_cache(ca, c, i) {
620                 struct cache_sb *sb = ca->disk_sb.sb;
621                 struct bio *bio = ca->disk_sb.bio;
622
623                 cache_sb_from_cache_set(c, ca);
624
625                 SET_CACHE_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
626                 sb->csum = cpu_to_le64(__csum_set(sb,
627                                                   le16_to_cpu(sb->u64s),
628                                                   CACHE_SB_CSUM_TYPE(sb)));
629
630                 bio_reset(bio);
631                 bio->bi_bdev    = ca->disk_sb.bdev;
632                 bio->bi_end_io  = write_super_endio;
633                 bio->bi_private = ca;
634
635                 closure_get(cl);
636                 percpu_ref_get(&ca->ref);
637                 __write_super(c, &ca->disk_sb);
638         }
639
640         closure_return_with_destructor(cl, bcache_write_super_unlock);
641 }
642
643 void bcache_write_super(struct cache_set *c)
644 {
645         down(&c->sb_write_mutex);
646         __bcache_write_super(c);
647 }
648
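/*
 * Slowpath for marking devices as containing data or metadata: if an
 * extent has a dirty pointer to a device not yet flagged HAS_DATA /
 * HAS_METADATA, set the flag in the superblock and write the
 * superblock out.
 */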
649 void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
650                                    bool meta)
651 {
652         struct cache_member *mi;
653         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
654         const struct bch_extent_ptr *ptr;
655
656         if (!CACHE_SET_SYNC(&c->disk_sb))
657                 return;
658
659         down(&c->sb_write_mutex);
660
661         /* recheck, might have raced */
662         if (bch_check_super_marked(c, k, meta)) {
663                 up(&c->sb_write_mutex);
664                 return;
665         }
666
667         mi = c->disk_mi;
668
669         extent_for_each_ptr(e, ptr)
670                 if (bch_extent_ptr_is_dirty(c, e, ptr))
671                         (meta
672                          ? SET_CACHE_HAS_METADATA
673                          : SET_CACHE_HAS_DATA)(mi + ptr->dev, true);
674
675         __bcache_write_super(c);
676 }
677
678 /* Cache set RO/RW: */
679
680 /*
681  * For startup/shutdown of RW stuff, the dependencies are:
682  *
683  * - foreground writes depend on copygc and tiering (to free up space)
684  *
685  * - copygc and tiering depend on mark and sweep gc (they actually probably
686  *   don't because they either reserve ahead of time or don't block if
687  *   allocations fail, but allocations can require mark and sweep gc to run
688  *   because of generation number wraparound)
689  *
690  * - all of the above depends on the allocator threads
691  *
692  * - allocator depends on the journal (when it rewrites prios and gens)
693  */
694
695 static void __bch_cache_set_read_only(struct cache_set *c)
696 {
697         struct cache *ca;
698         unsigned i;
699
700         c->tiering_pd.rate.rate = UINT_MAX;
701         bch_ratelimit_reset(&c->tiering_pd.rate);
702         bch_tiering_read_stop(c);
703
704         for_each_cache(ca, c, i)
705                 bch_moving_gc_stop(ca);
706
707         bch_gc_thread_stop(c);
708
709         bch_btree_flush(c);
710
711         for_each_cache(ca, c, i)
712                 bch_cache_allocator_stop(ca);
713
714         /*
715          * Write a journal entry after flushing the btree, so we don't end up
716          * replaying everything we just flushed:
717          */
718         if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
719                 int ret;
720
721                 bch_journal_flush_async(&c->journal, NULL);
722                 ret = bch_journal_meta(&c->journal);
723                 BUG_ON(ret && !bch_journal_error(&c->journal));
724         }
725
726         cancel_delayed_work_sync(&c->journal.write_work);
727         cancel_delayed_work_sync(&c->journal.reclaim_work);
728 }
729
730 static void bch_writes_disabled(struct percpu_ref *writes)
731 {
732         struct cache_set *c = container_of(writes, struct cache_set, writes);
733
734         set_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
735         wake_up(&bch_read_only_wait);
736 }
737
738 static void bch_cache_set_read_only_work(struct work_struct *work)
739 {
740         struct cache_set *c =
741                 container_of(work, struct cache_set, read_only_work);
742
743         percpu_ref_put(&c->writes);
744
745         del_timer(&c->foreground_write_wakeup);
746         cancel_delayed_work(&c->pd_controllers_update);
747
748         c->foreground_write_pd.rate.rate = UINT_MAX;
749         bch_wake_delayed_writes((unsigned long) c);
750
751         if (!test_bit(CACHE_SET_EMERGENCY_RO, &c->flags)) {
752                 /*
753                  * If we're not doing an emergency shutdown, we want to wait on
754                  * outstanding writes to complete so they don't see spurious
755                  * errors due to shutting down the allocator:
756                  */
757                 wait_event(bch_read_only_wait,
758                            test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
759
760                 __bch_cache_set_read_only(c);
761
762                 if (!bch_journal_error(&c->journal) &&
763                     !test_bit(CACHE_SET_ERROR, &c->flags)) {
764                         SET_CACHE_SET_CLEAN(&c->disk_sb, true);
765                         bcache_write_super(c);
766                 }
767         } else {
768                 /*
769                  * If we are doing an emergency shutdown, outstanding writes may
770                  * hang until we shut down the allocator, so we don't want to
771                  * wait on them before shutting everything down - but we do need
772                  * to wait on them before returning and signalling that going RO
773                  * is complete:
774                  */
775                 __bch_cache_set_read_only(c);
776
777                 wait_event(bch_read_only_wait,
778                            test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
779         }
780
781         bch_notify_cache_set_read_only(c);
782         trace_bcache_cache_set_read_only_done(c);
783
784         set_bit(CACHE_SET_RO_COMPLETE, &c->flags);
785         wake_up(&bch_read_only_wait);
786 }
787
788 bool bch_cache_set_read_only(struct cache_set *c)
789 {
790         if (test_and_set_bit(CACHE_SET_RO, &c->flags))
791                 return false;
792
793         trace_bcache_cache_set_read_only(c);
794
795         percpu_ref_get(&c->writes);
796
797         /*
798          * Block new foreground-end write operations from starting - any new
799          * writes will return -EROFS:
800          *
801          * (This really blocks new _allocations_; writes to previously
802          * allocated space can still happen until the allocator is stopped
803          * in bch_cache_allocator_stop()).
804          */
805         percpu_ref_kill(&c->writes);
806
807         queue_work(system_freezable_wq, &c->read_only_work);
808         return true;
809 }
810
811 bool bch_cache_set_emergency_read_only(struct cache_set *c)
812 {
813         bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
814
815         bch_cache_set_read_only(c);
816         bch_journal_halt(&c->journal);
817
818         wake_up(&bch_read_only_wait);
819         return ret;
820 }
821
822 void bch_cache_set_read_only_sync(struct cache_set *c)
823 {
824         /* so we don't race with bch_cache_set_read_write() */
825         lockdep_assert_held(&bch_register_lock);
826
827         bch_cache_set_read_only(c);
828
829         wait_event(bch_read_only_wait,
830                    test_bit(CACHE_SET_RO_COMPLETE, &c->flags) &&
831                    test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
832 }
833
834 static const char *__bch_cache_set_read_write(struct cache_set *c)
835 {
836         struct cache *ca;
837         const char *err;
838         unsigned i;
839
840         lockdep_assert_held(&bch_register_lock);
841
842         err = "error starting allocator thread";
843         for_each_cache(ca, c, i)
844                 if (ca->mi.state == CACHE_ACTIVE &&
845                     bch_cache_allocator_start(ca)) {
846                         percpu_ref_put(&ca->ref);
847                         goto err;
848                 }
849
850         err = "error starting btree GC thread";
851         if (bch_gc_thread_start(c))
852                 goto err;
853
854         for_each_cache(ca, c, i) {
855                 if (ca->mi.state != CACHE_ACTIVE)
856                         continue;
857
858                 err = "error starting moving GC thread";
859                 if (bch_moving_gc_thread_start(ca)) {
860                         percpu_ref_put(&ca->ref);
861                         goto err;
862                 }
863         }
864
865         err = "error starting tiering thread";
866         if (bch_tiering_read_start(c))
867                 goto err;
868
869         schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
870
871         return NULL;
872 err:
873         __bch_cache_set_read_only(c);
874         return err;
875 }
876
877 const char *bch_cache_set_read_write(struct cache_set *c)
878 {
879         const char *err;
880
881         lockdep_assert_held(&bch_register_lock);
882
883         if (!test_bit(CACHE_SET_RO_COMPLETE, &c->flags))
884                 return NULL;
885
886         err = __bch_cache_set_read_write(c);
887         if (err)
888                 return err;
889
890         percpu_ref_reinit(&c->writes);
891
892         clear_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
893         clear_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
894         clear_bit(CACHE_SET_RO_COMPLETE, &c->flags);
895         clear_bit(CACHE_SET_RO, &c->flags);
896         return NULL;
897 }
898
899 /* Cache set startup/shutdown: */
900
901 static void cache_set_free(struct cache_set *c)
902 {
903         del_timer_sync(&c->foreground_write_wakeup);
904         cancel_delayed_work_sync(&c->pd_controllers_update);
905         cancel_work_sync(&c->read_only_work);
906         cancel_work_sync(&c->bio_submit_work);
907         cancel_work_sync(&c->read_retry_work);
908
909         bch_btree_cache_free(c);
910         bch_journal_free(&c->journal);
911         bch_io_clock_exit(&c->io_clock[WRITE]);
912         bch_io_clock_exit(&c->io_clock[READ]);
913         bch_compress_free(c);
914         bdi_destroy(&c->bdi);
915         lg_lock_free(&c->bucket_stats_lock);
916         free_percpu(c->bucket_stats_percpu);
917         mempool_exit(&c->btree_bounce_pool);
918         mempool_exit(&c->bio_bounce_pages);
919         bioset_exit(&c->bio_write);
920         bioset_exit(&c->bio_read_split);
921         bioset_exit(&c->bio_read);
922         bioset_exit(&c->btree_read_bio);
923         mempool_exit(&c->btree_interior_update_pool);
924         mempool_exit(&c->btree_reserve_pool);
925         mempool_exit(&c->fill_iter);
926         mempool_exit(&c->search);
927         percpu_ref_exit(&c->writes);
928
929         if (c->copygc_wq)
930                 destroy_workqueue(c->copygc_wq);
931         if (c->wq)
932                 destroy_workqueue(c->wq);
933
934         kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
935         kfree(c->disk_mi);
936         kfree(c);
937         module_put(THIS_MODULE);
938 }
939
940 /*
941  * should be __cache_set_stop4 - block devices are closed, now we can finally
942  * free it
943  */
944 void bch_cache_set_release(struct kobject *kobj)
945 {
946         struct cache_set *c = container_of(kobj, struct cache_set, kobj);
947         struct completion *stop_completion = c->stop_completion;
948
949         bch_notify_cache_set_stopped(c);
950         bch_info(c, "stopped");
951
952         cache_set_free(c);
953
954         if (stop_completion)
955                 complete(stop_completion);
956 }
957
958 /*
959  * All activity on the cache_set should have stopped now - close devices:
960  */
961 static void __cache_set_stop3(struct closure *cl)
962 {
963         struct cache_set *c = container_of(cl, struct cache_set, cl);
964         struct cache *ca;
965         unsigned i;
966
967         mutex_lock(&bch_register_lock);
968         for_each_cache(ca, c, i)
969                 bch_cache_stop(ca);
970         mutex_unlock(&bch_register_lock);
971
972         mutex_lock(&bch_register_lock);
973         list_del(&c->list);
974         if (c->minor >= 0)
975                 idr_remove(&bch_chardev_minor, c->minor);
976         mutex_unlock(&bch_register_lock);
977
978         closure_debug_destroy(&c->cl);
979         kobject_put(&c->kobj);
980 }
981
982 /*
983  * Openers (i.e. block devices) should have exited; shut down all userspace
984  * interfaces and wait for &c->cl to hit 0
985  */
986 static void __cache_set_stop2(struct closure *cl)
987 {
988         struct cache_set *c = container_of(cl, struct cache_set, caching);
989
990         bch_debug_exit_cache_set(c);
991
992         if (!IS_ERR_OR_NULL(c->chardev))
993                 device_unregister(c->chardev);
994
995         if (c->kobj.state_in_sysfs)
996                 kobject_del(&c->kobj);
997
998         bch_cache_accounting_destroy(&c->accounting);
999
1000         kobject_put(&c->time_stats);
1001         kobject_put(&c->opts_dir);
1002         kobject_put(&c->internal);
1003
1004         mutex_lock(&bch_register_lock);
1005         bch_cache_set_read_only_sync(c);
1006         mutex_unlock(&bch_register_lock);
1007
1008         closure_return(cl);
1009 }
1010
1011 /*
1012  * First phase of the shutdown process that's kicked off by cache_set_stop(); we
1013  * haven't waited for anything to stop yet, we're just punting to process
1014  * context to shut down block devices:
1015  */
1016 static void __cache_set_stop1(struct closure *cl)
1017 {
1018         struct cache_set *c = container_of(cl, struct cache_set, caching);
1019
1020         bch_blockdevs_stop(c);
1021
1022         continue_at(cl, __cache_set_stop2, system_wq);
1023 }
1024
1025 void bch_cache_set_stop(struct cache_set *c)
1026 {
1027         if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1028                 closure_queue(&c->caching);
1029 }
1030
1031 void bch_cache_set_unregister(struct cache_set *c)
1032 {
1033         if (!test_and_set_bit(CACHE_SET_UNREGISTERING, &c->flags))
1034                 bch_cache_set_stop(c);
1035 }
1036
1037 static unsigned cache_set_nr_devices(struct cache_set *c)
1038 {
1039         unsigned i, nr = 0;
1040         struct cache_member *mi = c->disk_mi;
1041
1042         lockdep_assert_held(&bch_register_lock);
1043
1044         for (i = 0; i < c->disk_sb.nr_in_set; i++)
1045                 if (!bch_is_zero(mi[i].uuid.b, sizeof(uuid_le)))
1046                         nr++;
1047
1048         return nr;
1049 }
1050
1051 static unsigned cache_set_nr_online_devices(struct cache_set *c)
1052 {
1053         unsigned i, nr = 0;
1054
1055         for (i = 0; i < c->sb.nr_in_set; i++)
1056                 if (c->cache[i])
1057                         nr++;
1058
1059         return nr;
1060 }
1061
1062 #define alloc_bucket_pages(gfp, ca)                     \
1063         ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
1064
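/*
 * Allocate and initialize a cache_set: locks, lists, default tunables,
 * workqueues, mempools/biosets, the journal and the btree cache. On
 * any failure, everything allocated so far is torn down via
 * cache_set_free().
 */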
1065 static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
1066                                              struct cache_set_opts opts)
1067 {
1068         struct cache_set *c;
1069         unsigned iter_size, journal_entry_bytes;
1070
1071         c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1072         if (!c)
1073                 return NULL;
1074
1075         __module_get(THIS_MODULE);
1076
1077         c->minor                = -1;
1078
1079         sema_init(&c->sb_write_mutex, 1);
1080         INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
1081         mutex_init(&c->btree_cache_lock);
1082         mutex_init(&c->bucket_lock);
1083         mutex_init(&c->btree_root_lock);
1084         INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work);
1085         mutex_init(&c->mi_lock);
1086
1087         init_rwsem(&c->gc_lock);
1088
1089 #define BCH_TIME_STAT(name, frequency_units, duration_units)            \
1090         spin_lock_init(&c->name##_time.lock);
1091         BCH_TIME_STATS()
1092 #undef BCH_TIME_STAT
1093
1094         bch_open_buckets_init(c);
1095         bch_tiering_init_cache_set(c);
1096
1097         INIT_LIST_HEAD(&c->list);
1098         INIT_LIST_HEAD(&c->cached_devs);
1099         INIT_LIST_HEAD(&c->btree_cache);
1100         INIT_LIST_HEAD(&c->btree_cache_freeable);
1101         INIT_LIST_HEAD(&c->btree_cache_freed);
1102
1103         INIT_LIST_HEAD(&c->btree_interior_update_list);
1104         mutex_init(&c->btree_reserve_cache_lock);
1105         mutex_init(&c->btree_interior_update_lock);
1106
1107         mutex_init(&c->bio_bounce_pages_lock);
1108         INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
1109         spin_lock_init(&c->bio_submit_lock);
1110         bio_list_init(&c->read_retry_list);
1111         spin_lock_init(&c->read_retry_lock);
1112         INIT_WORK(&c->read_retry_work, bch_read_retry_work);
1113         mutex_init(&c->zlib_workspace_lock);
1114
1115         seqcount_init(&c->gc_pos_lock);
1116
1117         c->prio_clock[READ].hand = 1;
1118         c->prio_clock[READ].min_prio = 0;
1119         c->prio_clock[WRITE].hand = 1;
1120         c->prio_clock[WRITE].min_prio = 0;
1121
1122         c->congested_read_threshold_us  = 2000;
1123         c->congested_write_threshold_us = 20000;
1124         c->error_limit  = 16 << IO_ERROR_SHIFT;
1125         init_waitqueue_head(&c->writeback_wait);
1126
1127         c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
1128
1129         c->copy_gc_enabled = 1;
1130         c->tiering_enabled = 1;
1131         c->tiering_percent = 10;
1132
1133         c->foreground_target_percent = 20;
1134
1135         c->journal.write_time   = &c->journal_write_time;
1136         c->journal.delay_time   = &c->journal_delay_time;
1137         c->journal.blocked_time = &c->journal_blocked_time;
1138         c->journal.flush_seq_time = &c->journal_flush_seq_time;
1139
1140         mutex_init(&c->uevent_lock);
1141
1142         if (cache_sb_to_cache_set(c, sb))
1143                 goto err;
1144
1145         scnprintf(c->name, sizeof(c->name), "%pU", &c->disk_sb.user_uuid);
1146
1147         c->opts = cache_superblock_opts(sb);
1148         cache_set_opts_apply(&c->opts, opts);
1149
1150         c->block_bits           = ilog2(c->sb.block_size);
1151
1152         if (cache_set_init_fault("cache_set_alloc"))
1153                 goto err;
1154
1155         iter_size = (btree_blocks(c) + 1) * 2 *
1156                 sizeof(struct btree_node_iter_set);
1157
1158         journal_entry_bytes = 512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb);
1159
1160         if (!(c->wq = alloc_workqueue("bcache",
1161                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
1162             !(c->copygc_wq = alloc_workqueue("bcache_copygc",
1163                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
1164             percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
1165             mempool_init_slab_pool(&c->search, 1, bch_search_cache) ||
1166             mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
1167                                       sizeof(struct btree_reserve)) ||
1168             mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
1169                                       sizeof(struct btree_interior_update)) ||
1170             mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
1171             bioset_init(&c->btree_read_bio, 1, 0) ||
1172             bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
1173             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
1174             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
1175             mempool_init_page_pool(&c->bio_bounce_pages,
1176                                    max_t(unsigned,
1177                                          c->sb.btree_node_size,
1178                                          CRC32_EXTENT_SIZE_MAX) /
1179                                    PAGE_SECTORS, 0) ||
1180             !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
1181             lg_lock_init(&c->bucket_stats_lock) ||
1182             mempool_init_page_pool(&c->btree_bounce_pool, 1,
1183                                    ilog2(btree_pages(c))) ||
1184             bdi_setup_and_register(&c->bdi, "bcache") ||
1185             bch_io_clock_init(&c->io_clock[READ]) ||
1186             bch_io_clock_init(&c->io_clock[WRITE]) ||
1187             bch_journal_alloc(&c->journal, journal_entry_bytes) ||
1188             bch_btree_cache_alloc(c) ||
1189             bch_compress_init(c))
1190                 goto err;
1191
1192         c->bdi.ra_pages         = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
1193         c->bdi.congested_fn     = bch_congested_fn;
1194         c->bdi.congested_data   = c;
1195
1196         /*
1197          * Now that all allocations have succeeded, init various refcounty
1198          * things that let us shutdown:
1199          */
1200         closure_init(&c->cl, NULL);
1201
1202         c->kobj.kset = bcache_kset;
1203         kobject_init(&c->kobj, &bch_cache_set_ktype);
1204         kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1205         kobject_init(&c->opts_dir, &bch_cache_set_opts_dir_ktype);
1206         kobject_init(&c->time_stats, &bch_cache_set_time_stats_ktype);
1207
1208         bch_cache_accounting_init(&c->accounting, &c->cl);
1209
1210         closure_init(&c->caching, &c->cl);
1211         set_closure_fn(&c->caching, __cache_set_stop1, system_wq);
1212
1213         continue_at_noreturn(&c->cl, __cache_set_stop3, system_wq);
1214         return c;
1215 err:
1216         cache_set_free(c);
1217         return NULL;
1218 }
1219
1220 static int bch_cache_set_online(struct cache_set *c)
1221 {
1222         struct cache *ca;
1223         unsigned i;
1224
1225         lockdep_assert_held(&bch_register_lock);
1226
1227         if (c->kobj.state_in_sysfs)
1228                 return 0;
1229
1230         c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
1231         if (c->minor < 0)
1232                 return c->minor;
1233
1234         c->chardev = device_create(bch_chardev_class, NULL,
1235                                    MKDEV(bch_chardev_major, c->minor), NULL,
1236                                    "bcache%u-ctl", c->minor);
1237         if (IS_ERR(c->chardev))
1238                 return PTR_ERR(c->chardev);
1239
1240         if (kobject_add(&c->kobj, NULL, "%pU", c->disk_sb.user_uuid.b) ||
1241             kobject_add(&c->internal, &c->kobj, "internal") ||
1242             kobject_add(&c->opts_dir, &c->kobj, "options") ||
1243             kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
1244             bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1245                 return -1;
1246
1247         for_each_cache(ca, c, i)
1248                 if (bch_cache_online(ca)) {
1249                         percpu_ref_put(&ca->ref);
1250                         return -1;
1251                 }
1252
1253         list_add(&c->list, &bch_cache_sets);
1254         return 0;
1255 }
1256
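/*
 * Bring a cache set up. For an existing filesystem (CACHE_SET_SYNC)
 * this reads the journal and prios, reads the btree roots, runs the
 * initial mark and sweep gc, replays the journal and optionally runs
 * fsck; for a new filesystem it allocates journal buckets, fresh btree
 * roots and the root inode. Either way it then goes read-write (unless
 * opts.read_only), writes out the superblock and registers the sysfs
 * and chardev interfaces.
 */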
1257 static const char *run_cache_set(struct cache_set *c)
1258 {
1259         const char *err = "cannot allocate memory";
1260         struct cache *ca;
1261         unsigned i, id;
1262         time64_t now;
1263         LIST_HEAD(journal);
1264         struct jset *j;
1265         int ret = -EINVAL;
1266
1267         lockdep_assert_held(&bch_register_lock);
1268         BUG_ON(test_bit(CACHE_SET_RUNNING, &c->flags));
1269
1270         /* We don't want bch_fatal_error() to free underneath us */
1271         closure_get(&c->caching);
1272
1273         /*
1274          * Make sure that each cache object's mi is up to date before
1275          * we start testing it.
1276          */
1277         for_each_cache(ca, c, i)
1278                 cache_sb_from_cache_set(c, ca);
1279
1280         /*
1281          * CACHE_SET_SYNC is true if the cache set has already been run
1282          * and potentially has data.
1283          * It is false if it is the first time it is run.
1284          */
1285
1286         if (CACHE_SET_SYNC(&c->disk_sb)) {
1287                 ret = bch_journal_read(c, &journal);
1288                 if (ret)
1289                         goto err;
1290
1291                 pr_debug("btree_journal_read() done");
1292
1293                 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1294
1295                 err = "error reading priorities";
1296                 for_each_cache(ca, c, i) {
1297                         ret = bch_prio_read(ca);
1298                         if (ret) {
1299                                 percpu_ref_put(&ca->ref);
1300                                 goto err;
1301                         }
1302                 }
1303
1304                 c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
1305                 c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
1306
1307                 for_each_cache(ca, c, i) {
1308                         bch_recalc_min_prio(ca, READ);
1309                         bch_recalc_min_prio(ca, WRITE);
1310                 }
1311
1312                 /*
1313                  * If bch_prio_read() fails it'll call cache_set_error and we'll
1314                  * tear everything down right away - but if we checked for the
1315                  * error sooner, we could avoid the journal replay.
1316                  */
1317
1318                 for (id = 0; id < BTREE_ID_NR; id++) {
1319                         unsigned level;
1320                         struct bkey_i *k;
1321
1322                         err = "bad btree root";
1323                         k = bch_journal_find_btree_root(c, j, id, &level);
1324                         if (!k && id == BTREE_ID_EXTENTS)
1325                                 goto err;
1326                         if (!k) {
1327                                 pr_debug("missing btree root: %d", id);
1328                                 continue;
1329                         }
1330
1331                         err = "error reading btree root";
1332                         if (bch_btree_root_read(c, id, k, level))
1333                                 goto err;
1334                 }
1335
1336                 bch_verbose(c, "starting mark and sweep:");
1337
1338                 err = "error in recovery";
1339                 if (bch_initial_gc(c, &journal))
1340                         goto err;
1341
1342                 bch_verbose(c, "mark and sweep done");
1343
1344                 /*
1345                  * bch_journal_start() can't happen sooner, or btree_gc_finish()
1346                  * will give spurious errors about oldest_gen > bucket_gen -
1347                  * this is a hack but oh well.
1348                  */
1349                 bch_journal_start(c);
1350
1351                 err = "error starting allocator thread";
1352                 for_each_cache(ca, c, i)
1353                         if (ca->mi.state == CACHE_ACTIVE &&
1354                             bch_cache_allocator_start(ca)) {
1355                                 percpu_ref_put(&ca->ref);
1356                                 goto err;
1357                         }
1358
1359                 bch_verbose(c, "starting journal replay:");
1360
1361                 err = "journal replay failed";
1362                 ret = bch_journal_replay(c, &journal);
1363                 if (ret)
1364                         goto err;
1365
1366                 bch_verbose(c, "journal replay done");
1367
1368                 /*
1369                  * Write a new journal entry _before_ we start journalling new
1370                  * data - otherwise, we could end up with btree node bsets with
1371                  * journal seqs arbitrarily far in the future vs. the most
1372                  * recently written journal entry on disk, if we crash before
1373                  * writing the next journal entry:
1374                  */
1375                 err = "error writing journal entry";
1376                 if (bch_journal_meta(&c->journal))
1377                         goto err;
1378
1379                 bch_verbose(c, "starting fs gc:");
1380                 err = "error in fs gc";
1381                 ret = bch_gc_inode_nlinks(c);
1382                 if (ret)
1383                         goto err;
1384                 bch_verbose(c, "fs gc done");
1385
1386                 if (!c->opts.nofsck) {
1387                         bch_verbose(c, "starting fsck:");
1388                         err = "error in fsck";
1389                         ret = bch_fsck(c);
1390                         if (ret)
1391                                 goto err;
1392                         bch_verbose(c, "fsck done");
1393                 }
1394         } else {
1395                 struct bkey_i_inode inode;
1396                 struct closure cl;
1397
1398                 closure_init_stack(&cl);
1399
1400                 bch_notice(c, "initializing new filesystem");
1401
1402                 err = "unable to allocate journal buckets";
1403                 for_each_cache(ca, c, i)
1404                         if (bch_cache_journal_alloc(ca)) {
1405                                 percpu_ref_put(&ca->ref);
1406                                 goto err;
1407                         }
1408
1409                 bch_initial_gc(c, NULL);
1410
1411                 /*
1412                  * journal_res_get() will crash if called before this has
1413                  * set up the journal.pin FIFO and journal.cur pointer:
1414                  */
1415                 bch_journal_start(c);
1416                 bch_journal_set_replay_done(&c->journal);
1417
1418                 err = "error starting allocator thread";
1419                 for_each_cache(ca, c, i)
1420                         if (ca->mi.state == CACHE_ACTIVE &&
1421                             bch_cache_allocator_start(ca)) {
1422                                 percpu_ref_put(&ca->ref);
1423                                 goto err;
1424                         }
1425
1426                 err = "cannot allocate new btree root";
1427                 for (id = 0; id < BTREE_ID_NR; id++)
1428                         if (bch_btree_root_alloc(c, id, &cl)) {
1429                                 closure_sync(&cl);
1430                                 goto err;
1431                         }
1432
1433                 /* Wait for new btree roots to be written: */
1434                 closure_sync(&cl);
1435
1436                 bkey_inode_init(&inode.k_i);
1437                 inode.k.p.inode = BCACHE_ROOT_INO;
1438                 inode.v.i_mode = cpu_to_le16(S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO);
1439                 inode.v.i_nlink = cpu_to_le32(2);
1440                 get_random_bytes(&inode.v.i_hash_seed, sizeof(inode.v.i_hash_seed));
1441                 SET_INODE_STR_HASH_TYPE(&inode.v, c->sb.str_hash_type);
1442
1443                 err = "error creating root directory";
1444                 if (bch_btree_insert(c, BTREE_ID_INODES, &inode.k_i,
1445                                      NULL, NULL, NULL, 0))
1446                         goto err;
1447
1448                 err = "error writing first journal entry";
1449                 if (bch_journal_meta(&c->journal))
1450                         goto err;
1451         }
1452
1453         if (c->opts.read_only) {
1454                 bch_cache_set_read_only_sync(c);
1455         } else {
1456                 err = __bch_cache_set_read_write(c);
1457                 if (err)
1458                         goto err;
1459         }
1460
1461         now = ktime_get_seconds();
1462         rcu_read_lock();
1463         for_each_cache_rcu(ca, c, i)
1464                 c->disk_mi[ca->sb.nr_this_dev].last_mount = cpu_to_le64(now);
1465         rcu_read_unlock();
1466
1467         /* Mark cache set as initialized: */
1468         SET_CACHE_SET_SYNC(&c->disk_sb, true);
1469         SET_CACHE_SET_CLEAN(&c->disk_sb, false);
1470         bcache_write_super(c);
1471
1472         err = "dynamic fault";
1473         if (cache_set_init_fault("run_cache_set"))
1474                 goto err;
1475
1476         err = "error creating kobject";
1477         if (bch_cache_set_online(c))
1478                 goto err;
1479
1480         err = "can't bring up blockdev volumes";
1481         if (bch_blockdev_volumes_start(c))
1482                 goto err;
1483
1484         bch_debug_init_cache_set(c);
1485         set_bit(CACHE_SET_RUNNING, &c->flags);
1486         bch_attach_backing_devs(c);
1487
1488         closure_put(&c->caching);
1489
1490         bch_notify_cache_set_read_write(c);
1491
1492         BUG_ON(!list_empty(&journal));
1493         return NULL;
1494 err:
1495         switch (ret) {
1496         case BCH_FSCK_ERRORS_NOT_FIXED:
1497                 bch_err(c, "filesystem contains errors: please report this to the developers");
1498                 pr_cont("mount with -o fix_errors to repair");
1499                 err = "fsck error";
1500                 break;
1501         case BCH_FSCK_REPAIR_UNIMPLEMENTED:
1502                 bch_err(c, "filesystem contains errors: please report this to the developers");
1503                 pr_cont("repair unimplemented: inform the developers so that it can be added");
1504                 err = "fsck error";
1505                 break;
1506         case BCH_FSCK_REPAIR_IMPOSSIBLE:
1507                 bch_err(c, "filesystem contains errors, but repair impossible");
1508                 err = "fsck error";
1509                 break;
1510         case BCH_FSCK_UNKNOWN_VERSION:
1511                 err = "unknown metadata version";
1512                 break;
1513         case -ENOMEM:
1514                 err = "cannot allocate memory";
1515                 break;
1516         case -EIO:
1517                 err = "IO error";
1518                 break;
1519         }
1520
1521         BUG_ON(!err);
1522
1523         bch_journal_entries_free(&journal);
1524         set_bit(CACHE_SET_ERROR, &c->flags);
1525         bch_cache_set_unregister(c);
1526         closure_put(&c->caching);
1527         return err;
1528 }
1529
1530 static const char *can_add_cache(struct cache_sb *sb,
1531                                  struct cache_set *c)
1532 {
1533         if (le16_to_cpu(sb->block_size) != c->sb.block_size)
1534                 return "mismatched block size";
1535
1536         if (le16_to_cpu(sb->members[sb->nr_this_dev].bucket_size) <
1537             CACHE_SET_BTREE_NODE_SIZE(&c->disk_sb))
1538                 return "new cache bucket_size is too small";
1539
1540         return NULL;
1541 }
1542
1543 static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
1544 {
1545         const char *err;
1546         bool match;
1547
1548         err = can_add_cache(sb, c);
1549         if (err)
1550                 return err;
1551
1552         /*
1553          * When attaching an existing device, the cache set superblock must
1554          * already contain member_info with a matching UUID
1555          */
1556         match = le64_to_cpu(sb->seq) <= le64_to_cpu(c->disk_sb.seq)
1557                 ? (sb->nr_this_dev < c->disk_sb.nr_in_set &&
1558                    !memcmp(&c->disk_mi[sb->nr_this_dev].uuid,
1559                            &sb->disk_uuid, sizeof(uuid_le)))
1560                 : (sb->nr_this_dev < sb->nr_in_set &&
1561                    !memcmp(&sb->members[sb->nr_this_dev].uuid,
1562                            &sb->disk_uuid, sizeof(uuid_le)));
1563
1564         if (!match)
1565                 return "cache sb does not match set";
1566
1567         return NULL;
1568 }
1569
1570 /* Cache device */
1571
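     /*
      * Transition a cache device to read only: stop moving GC and the
      * allocator, drop the device from the journal device group, and
      * persist the new CACHE_RO state in the superblock.  If the device
      * can't safely go away (bch_cache_may_remove()), the whole cache set
      * is forced read only first.
      */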
1572 bool bch_cache_read_only(struct cache *ca)
1573 {
1574         struct cache_set *c = ca->set;
1575         char buf[BDEVNAME_SIZE];
1576
1577         bdevname(ca->disk_sb.bdev, buf);
1578
1579         lockdep_assert_held(&bch_register_lock);
1580
1581         if (ca->mi.state != CACHE_ACTIVE)
1582                 return false;
1583
1584         if (!bch_cache_may_remove(ca)) {
1585                 bch_err(c, "required member %s going RO, forcing fs RO", buf);
1586                 bch_cache_set_read_only_sync(c);
1587         }
1588
1589         trace_bcache_cache_read_only(ca);
1590
1591         bch_moving_gc_stop(ca);
1592
1593         /*
1594          * This stops new data writes (e.g. to existing open data
1595          * buckets) and then waits for all existing writes to
1596          * complete.
1597          */
1598         bch_cache_allocator_stop(ca);
1599
1600         bch_cache_group_remove_cache(&c->journal.devs, ca);
1601
1602         /*
1603          * Device data write barrier -- no non-meta-data writes should
1604          * occur after this point.  However, writes to btree buckets,
1605          * journal buckets, and the superblock can still occur.
1606          */
1607         trace_bcache_cache_read_only_done(ca);
1608
1609         bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
1610         bch_notify_cache_read_only(ca);
1611
1612         SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
1613         bcache_write_super(c);
1614         return true;
1615 }
1616
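     /*
      * The __ variant does everything except update the superblock;
      * bch_cache_read_write() below also persists the new CACHE_ACTIVE
      * state via bcache_write_super().
      */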
1617 static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca)
1618 {
1619         lockdep_assert_held(&bch_register_lock);
1620
1621         if (ca->mi.state == CACHE_ACTIVE)
1622                 return NULL;
1623
1624         if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
1625                 return "removing";
1626
1627         trace_bcache_cache_read_write(ca);
1628
1629         if (bch_cache_allocator_start(ca))
1630                 return "error starting allocator thread";
1631
1632         if (bch_moving_gc_thread_start(ca))
1633                 return "error starting moving GC thread";
1634
1635         bch_cache_group_add_cache(&c->journal.devs, ca);
1636
1637         wake_up_process(c->tiering_read);
1638
1639         bch_notify_cache_read_write(ca);
1640         trace_bcache_cache_read_write_done(ca);
1641
1642         return NULL;
1643 }
1644
1645 const char *bch_cache_read_write(struct cache *ca)
1646 {
1647         struct cache_set *c = ca->set;
1648         const char *err;
1649
1650         err = __bch_cache_read_write(c, ca);
1651         if (err)
1652                 return err;
1653
1654         SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_ACTIVE);
1655         bcache_write_super(c);
1656
1657         return NULL;
1658 }
1659
1660 /*
1661  * bch_cache_stop has already returned, so we no longer hold the register
1662  * lock at the point this is called.
1663  */
1664
1665 void bch_cache_release(struct kobject *kobj)
1666 {
1667         struct cache *ca = container_of(kobj, struct cache, kobj);
1668
1669         percpu_ref_exit(&ca->ref);
1670         kfree(ca);
1671 }
1672
1673 static void bch_cache_free_work(struct work_struct *work)
1674 {
1675         struct cache *ca = container_of(work, struct cache, free_work);
1676         struct cache_set *c = ca->set;
1677         unsigned i;
1678
1679         cancel_work_sync(&ca->io_error_work);
1680
1681         if (c && c->kobj.state_in_sysfs) {
1682                 char buf[12];
1683
1684                 sprintf(buf, "cache%u", ca->sb.nr_this_dev);
1685                 sysfs_remove_link(&c->kobj, buf);
1686         }
1687
1688         if (ca->kobj.state_in_sysfs)
1689                 kobject_del(&ca->kobj);
1690
1691         free_super(&ca->disk_sb);
1692
1693         /*
1694          * bch_cache_stop can be called in the middle of initialization
1695          * of the struct cache object.
1696          * As such, not all the sub-structures may be initialized.
1697          * However, they were zeroed when the object was allocated.
1698          */
1699
1700         free_percpu(ca->sectors_written);
1701         bioset_exit(&ca->replica_set);
1702         free_percpu(ca->bucket_stats_percpu);
1703         kfree(ca->journal.bucket_seq);
1704         free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1705         kfree(ca->prio_buckets);
1706         kfree(ca->bio_prio);
1707         kfree(ca->journal.bio);
1708         vfree(ca->buckets);
1709         vfree(ca->oldest_gens);
1710         free_heap(&ca->heap);
1711         free_fifo(&ca->free_inc);
1712
1713         for (i = 0; i < RESERVE_NR; i++)
1714                 free_fifo(&ca->free[i]);
1715
1716         kobject_put(&ca->kobj);
1717
1718         if (c)
1719                 kobject_put(&c->kobj);
1720 }
1721
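     /*
      * Called once the last reference to ca->ref has been dropped (after
      * percpu_ref_kill() in bch_cache_free_rcu() below); the actual
      * teardown happens in bch_cache_free_work().
      */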
1722 static void bch_cache_percpu_ref_release(struct percpu_ref *ref)
1723 {
1724         struct cache *ca = container_of(ref, struct cache, ref);
1725
1726         schedule_work(&ca->free_work);
1727 }
1728
1729 static void bch_cache_free_rcu(struct rcu_head *rcu)
1730 {
1731         struct cache *ca = container_of(rcu, struct cache, free_rcu);
1732
1733         /*
1734          * This drops the ref count on ca; once the ref count hits
1735          * 0 (outstanding bios to the ca also take a ref and drop it
1736          * on completion/error), bch_cache_percpu_ref_release
1737          * is called, and that eventually results in bch_cache_free_work
1738          * being called, which in turn results in bch_cache_release being
1739          * called.
1740          *
1741          * In particular, these functions won't be called until there are no
1742          * bios outstanding (the per-cpu ref counts are all 0), so it
1743          * is safe to remove the actual sysfs device at that point,
1744          * and that can indicate success to the user.
1745          */
1746
1747         percpu_ref_kill(&ca->ref);
1748 }
1749
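     /*
      * Begin tearing the device down: unhook it from the cache set's
      * cache[] array, then free it via the RCU/percpu-ref chain described
      * above once all readers and outstanding bios are done.
      */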
1750 static void bch_cache_stop(struct cache *ca)
1751 {
1752         struct cache_set *c = ca->set;
1753
1754         lockdep_assert_held(&bch_register_lock);
1755
1756         if (c) {
1757                 BUG_ON(rcu_access_pointer(c->cache[ca->sb.nr_this_dev]) != ca);
1758                 rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], NULL);
1759         }
1760
1761         call_rcu(&ca->free_rcu, bch_cache_free_rcu);
1762 }
1763
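     /*
      * Runs from system_long_wq (queued by bch_cache_remove()): migrate
      * data and metadata off the device, then drop it from the journal,
      * the cache set and the superblock's member info.
      */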
1764 static void bch_cache_remove_work(struct work_struct *work)
1765 {
1766         struct cache *ca = container_of(work, struct cache, remove_work);
1767         struct cache_set *c = ca->set;
1768         char name[BDEVNAME_SIZE];
1769         bool force = test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
1770         unsigned dev = ca->sb.nr_this_dev;
1771
1772         bdevname(ca->disk_sb.bdev, name);
1773
1774         /*
1775          * Device should already be RO, now migrate data off:
1776          *
1777          * XXX: locking is sketchy, bch_cache_read_write() has to check
1778          * CACHE_DEV_REMOVING bit
1779          */
1780         if (!ca->mi.has_data) {
1781                 /* Nothing to do: */
1782         } else if (!bch_move_data_off_device(ca)) {
1783                 lockdep_assert_held(&bch_register_lock);
1784                 SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
1785
1786                 bcache_write_super(c);
1787         } else if (force) {
1788                 bch_flag_data_bad(ca);
1789
1790                 lockdep_assert_held(&bch_register_lock);
1791                 SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
1792
1793                 bcache_write_super(c);
1794         } else {
1795                 bch_err(c, "Remove of %s failed, unable to migrate data off",
1796                         name);
1797                 clear_bit(CACHE_DEV_REMOVING, &ca->flags);
1798                 return;
1799         }
1800
1801         /* Now metadata: */
1802
1803         if (!ca->mi.has_metadata) {
1804                 /* Nothing to do: */
1805         } else if (!bch_move_meta_data_off_device(ca)) {
1806                 lockdep_assert_held(&bch_register_lock);
1807                 SET_CACHE_HAS_METADATA(&c->disk_mi[ca->sb.nr_this_dev], false);
1808
1809                 bcache_write_super(c);
1810         } else {
1811                 bch_err(c, "Remove of %s failed, unable to migrate metadata off",
1812                         name);
1813                 clear_bit(CACHE_DEV_REMOVING, &ca->flags);
1814                 return;
1815         }
1816
1817         /*
1818          * Ok, really doing the remove:
1819          * Drop device's prio pointer before removing it from superblock:
1820          */
1821         bch_notify_cache_removed(ca);
1822
1823         spin_lock(&c->journal.lock);
1824         c->journal.prio_buckets[dev] = 0;
1825         spin_unlock(&c->journal.lock);
1826
1827         bch_journal_meta(&c->journal);
1828
1829         /*
1830          * Stop device before removing it from the cache set's list of devices -
1831          * and get our own ref on cache set since ca is going away:
1832          */
1833         closure_get(&c->cl);
1834
1835         mutex_lock(&bch_register_lock);
1836         bch_cache_stop(ca);
1837
1838         /*
1839          * RCU barrier between dropping the device from c->cache and
1840          * dropping it from the superblock's member info:
1841          */
1842         synchronize_rcu();
1843
1844         lockdep_assert_held(&bch_register_lock);
1845
1846         /*
1847          * Free this device's slot in the cache_member array - all pointers to
1848          * this device must be gone:
1849          */
1850         memset(&c->disk_mi[dev].uuid, 0, sizeof(c->disk_mi[dev].uuid));
1851
1852         bcache_write_super(c);
1853         mutex_unlock(&bch_register_lock);
1854
1855         closure_put(&c->cl);
1856 }
1857
1858 bool bch_cache_remove(struct cache *ca, bool force)
1859 {
1860         mutex_lock(&bch_register_lock);
1861
1862         if (test_bit(CACHE_DEV_REMOVING, &ca->flags)) {
                     mutex_unlock(&bch_register_lock);
1863                 return false;
             }
1864
1865         if (!bch_cache_may_remove(ca)) {
1866                 bch_err(ca->set, "Can't remove last device in tier %u",
1867                         ca->mi.tier);
1868                 bch_notify_cache_remove_failed(ca);
                     mutex_unlock(&bch_register_lock);
1869                 return false;
1870         }
1871
1872         /* First, go RO before we try to migrate data off: */
1873         bch_cache_read_only(ca);
1874
1875         if (force)
1876                 set_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
1877         set_bit(CACHE_DEV_REMOVING, &ca->flags);
1878         bch_notify_cache_removing(ca);
1879
1880         mutex_unlock(&bch_register_lock);
1881
1882         /* Migrate the data and finish removal asynchronously: */
1883
1884         queue_work(system_long_wq, &ca->remove_work);
1885         return true;
1886 }
1887
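     /*
      * Create the device's "bcache" sysfs directory under its block device
      * and cross-link it with the cache set's kobject.
      */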
1888 static int bch_cache_online(struct cache *ca)
1889 {
1890         char buf[12];
1891
1892         lockdep_assert_held(&bch_register_lock);
1893
1894         sprintf(buf, "cache%u", ca->sb.nr_this_dev);
1895
1896         if (kobject_add(&ca->kobj,
1897                         &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
1898                         "bcache") ||
1899             sysfs_create_link(&ca->kobj, &ca->set->kobj, "set") ||
1900             sysfs_create_link(&ca->set->kobj, &ca->kobj, buf))
1901                 return -1;
1902
1903         return 0;
1904 }
1905
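     /*
      * Allocate and initialize a struct cache from an already read
      * superblock and attach it to @c; on success, ownership of @sb's
      * block device is transferred to the new struct cache and the device
      * is published in c->cache[].
      */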
1906 static const char *cache_alloc(struct bcache_superblock *sb,
1907                                struct cache_set *c,
1908                                struct cache **ret)
1909 {
1910         size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
1911         size_t heap_size;
1912         unsigned i, journal_entry_pages;
1913         const char *err = "cannot allocate memory";
1914         struct cache *ca;
1915
1916         if (c->sb.nr_in_set == 1)
1917                 bdevname(sb->bdev, c->name);
1918
1919         if (cache_set_init_fault("cache_alloc"))
1920                 return err;
1921
1922         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1923         if (!ca)
1924                 return err;
1925
1926         if (percpu_ref_init(&ca->ref, bch_cache_percpu_ref_release,
1927                             0, GFP_KERNEL)) {
1928                 kfree(ca);
1929                 return err;
1930         }
1931
1932         kobject_init(&ca->kobj, &bch_cache_ktype);
1933
1934         spin_lock_init(&ca->self.lock);
1935         ca->self.nr_devices = 1;
1936         rcu_assign_pointer(ca->self.d[0].dev, ca);
1937         ca->sb.nr_this_dev = sb->sb->nr_this_dev;
1938
1939         INIT_WORK(&ca->free_work, bch_cache_free_work);
1940         INIT_WORK(&ca->remove_work, bch_cache_remove_work);
1941         spin_lock_init(&ca->freelist_lock);
1942         spin_lock_init(&ca->prio_buckets_lock);
1943         mutex_init(&ca->heap_lock);
1944         bch_moving_init_cache(ca);
1945
1946         ca->disk_sb = *sb;
1947         ca->disk_sb.bdev->bd_holder = ca;
1948         memset(sb, 0, sizeof(*sb));
1949
1950         INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
1951
1952         err = "dynamic fault";
1953         if (cache_set_init_fault("cache_alloc"))
1954                 goto err;
1955
1956         ca->mi = cache_mi_to_cpu_mi(ca->disk_sb.sb->members +
1957                                     ca->disk_sb.sb->nr_this_dev);
1958         ca->bucket_bits = ilog2(ca->mi.bucket_size);
1959
1960         /* XXX: tune these */
1961         movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
1962         reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
1963         /*
1964          * free_inc must be smaller than the copygc reserve: if it was bigger,
1965          * one copygc iteration might not make enough buckets available to fill
1966          * up free_inc and allow the allocator to make forward progress
1967          */
1968         free_inc_reserve = movinggc_reserve / 2;
1969         heap_size = movinggc_reserve * 8;
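             /*
              * Worked example (hypothetical device with ca->mi.nbuckets of
              * 1 << 20): movinggc_reserve = 8192, reserve_none = 2048,
              * free_inc_reserve = 4096, heap_size = 65536 buckets.
              */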
1970
1971         journal_entry_pages =
1972                 DIV_ROUND_UP(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
1973                              PAGE_SECTORS);
1974
1975         if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1976             !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
1977             !init_fifo(&ca->free[RESERVE_MOVINGGC],
1978                        movinggc_reserve, GFP_KERNEL) ||
1979             !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1980             !init_fifo(&ca->free_inc,   free_inc_reserve, GFP_KERNEL) ||
1981             !init_heap(&ca->heap,       heap_size, GFP_KERNEL) ||
1982             !(ca->oldest_gens   = vzalloc(sizeof(u8) *
1983                                           ca->mi.nbuckets)) ||
1984             !(ca->buckets       = vzalloc(sizeof(struct bucket) *
1985                                           ca->mi.nbuckets)) ||
1986             !(ca->prio_buckets  = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1987                                           2, GFP_KERNEL)) ||
1988             !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1989             !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
1990             !(ca->journal.bucket_seq = kcalloc(bch_nr_journal_buckets(ca->disk_sb.sb),
1991                                                sizeof(u64), GFP_KERNEL)) ||
1992             !(ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages)) ||
1993             !(ca->bio_prio = bio_kmalloc(GFP_KERNEL, bucket_pages(ca))) ||
1994             bioset_init(&ca->replica_set, 4,
1995                         offsetof(struct bch_write_bio, bio)) ||
1996             !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
1997                 goto err;
1998
1999         ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2000
2001         total_reserve = ca->free_inc.size;
2002         for (i = 0; i < RESERVE_NR; i++)
2003                 total_reserve += ca->free[i].size;
2004         pr_debug("%zu buckets reserved", total_reserve);
2005
2006         ca->copygc_write_point.group = &ca->self;
2007         ca->tiering_write_point.group = &ca->self;
2008
2009         kobject_get(&c->kobj);
2010         ca->set = c;
2011
2012         kobject_get(&ca->kobj);
2013         rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], ca);
2014
2015         if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb.seq))
2016                 cache_sb_to_cache_set(c, ca->disk_sb.sb);
2017
2018         /*
2019          * Increase journal write timeout if flushes to this device are
2020          * expensive:
2021          */
2022         if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) &&
2023             journal_flushes_device(ca))
2024                 c->journal.write_delay_ms =
2025                         max(c->journal.write_delay_ms, 1000U);
2026
2027         err = "error creating kobject";
2028         if (c->kobj.state_in_sysfs &&
2029             bch_cache_online(ca))
2030                 goto err;
2031
2032         if (ret)
2033                 *ret = ca;
2034         else
2035                 kobject_put(&ca->kobj);
2036         return NULL;
2037 err:
2038         bch_cache_stop(ca);
2039         return err;
2040 }
2041
2042 static struct cache_set *cache_set_lookup(uuid_le uuid)
2043 {
2044         struct cache_set *c;
2045
2046         lockdep_assert_held(&bch_register_lock);
2047
2048         list_for_each_entry(c, &bch_cache_sets, list)
2049                 if (!memcmp(&c->disk_sb.set_uuid, &uuid, sizeof(uuid_le)))
2050                         return c;
2051
2052         return NULL;
2053 }
2054
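     /*
      * Register a single cache device: attach it to an existing cache set
      * with a matching set UUID, or allocate a new cache set for it.  The
      * cache set is only started once all of its member devices are
      * present.
      */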
2055 static const char *register_cache(struct bcache_superblock *sb,
2056                                   struct cache_set_opts opts)
2057 {
2058         char name[BDEVNAME_SIZE];
2059         const char *err = "cannot allocate memory";
2060         struct cache_set *c;
2061
2062         err = validate_cache_super(sb);
2063         if (err)
2064                 return err;
2065
2066         bdevname(sb->bdev, name);
2067
2068         c = cache_set_lookup(sb->sb->set_uuid);
2069         if (c) {
2070                 if ((err = (can_attach_cache(sb->sb, c) ?:
2071                             cache_alloc(sb, c, NULL))))
2072                         return err;
2073
2074                 if (cache_set_nr_online_devices(c) == cache_set_nr_devices(c)) {
2075                         err = run_cache_set(c);
2076                         if (err)
2077                                 return err;
2078                 }
2079                 goto out;
2080         }
2081
2082         c = bch_cache_set_alloc(sb->sb, opts);
2083         if (!c)
2084                 return err;
2085
2086         err = cache_alloc(sb, c, NULL);
2087         if (err)
2088                 goto err_stop;
2089
2090         if (cache_set_nr_online_devices(c) == cache_set_nr_devices(c)) {
2091                 err = run_cache_set(c);
2092                 if (err)
2093                         goto err_stop;
2094         }
2095
2096         err = "error creating kobject";
2097         if (bch_cache_set_online(c))
2098                 goto err_stop;
2099 out:
2100
2101         bch_info(c, "started");
2102         return NULL;
2103 err_stop:
2104         bch_cache_set_stop(c);
2105         return err;
2106 }
2107
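     /*
      * Hot-add a device to a running cache set: read and validate its
      * superblock, find a free slot in the member info array (growing it
      * if necessary), write out the updated superblocks, and finally bring
      * the device up read-write if its member state allows it.
      */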
2108 int bch_cache_set_add_cache(struct cache_set *c, const char *path)
2109 {
2110         struct bcache_superblock sb;
2111         const char *err;
2112         struct cache *ca;
2113         struct cache_member *new_mi = NULL;
2114         struct cache_member mi;
2115         unsigned nr_this_dev, nr_in_set, u64s;
2116         int ret = -EINVAL;
2117
2118         mutex_lock(&bch_register_lock);
2119
2120         err = read_super(&sb, path);
2121         if (err)
2122                 goto err_unlock;
2123
2124         err = validate_cache_super(&sb);
2125         if (err)
2126                 goto err_unlock;
2127
2128         err = can_add_cache(sb.sb, c);
2129         if (err)
2130                 goto err_unlock;
2131
2132         /*
2133          * Preserve the old cache member information (esp. tier)
2134          * before we start bashing the disk stuff.
2135          */
2136         mi = sb.sb->members[sb.sb->nr_this_dev];
2137         mi.last_mount = cpu_to_le64(ktime_get_seconds());
2138
2139         down_read(&c->gc_lock);
2140
2141         if (dynamic_fault("bcache:add:no_slot"))
2142                 goto no_slot;
2143
2144         if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
2145                 goto no_slot;
2146
2147         for (nr_this_dev = 0; nr_this_dev < MAX_CACHES_PER_SET; nr_this_dev++)
2148                 if (nr_this_dev >= c->sb.nr_in_set ||
2149                     bch_is_zero(c->disk_mi[nr_this_dev].uuid.b,
2150                                  sizeof(uuid_le)))
2151                         goto have_slot;
2152 no_slot:
2153         up_read(&c->gc_lock);
2154
2155         err = "no slots available in superblock";
2156         ret = -ENOSPC;
2157         goto err_unlock;
2158
2159 have_slot:
2160         nr_in_set = max_t(unsigned, nr_this_dev + 1, c->sb.nr_in_set);
2161         up_read(&c->gc_lock);
2162
2163         u64s = nr_in_set * (sizeof(struct cache_member) / sizeof(u64));
2164         err = "no space in superblock for member info";
2165         if (bch_super_realloc(&sb, u64s))
2166                 goto err_unlock;
2167
2168         new_mi = dynamic_fault("bcache:add:member_info_realloc")
2169                 ? NULL
2170                 : kmalloc(sizeof(struct cache_member) * nr_in_set,
2171                           GFP_KERNEL);
2172         if (!new_mi) {
2173                 err = "cannot allocate memory";
2174                 ret = -ENOMEM;
2175                 goto err_unlock;
2176         }
2177
             /* Copy only the pre-existing entries; the new slot is filled in below: */
2178         memcpy(new_mi, c->disk_mi,
2179                sizeof(struct cache_member) * c->sb.nr_in_set);
2180         new_mi[nr_this_dev] = mi;
2181
2182         sb.sb->nr_this_dev      = nr_this_dev;
2183         sb.sb->nr_in_set        = nr_in_set;
2184         sb.sb->u64s             = cpu_to_le16(u64s);
2185         memcpy(sb.sb->members, new_mi,
2186                sizeof(struct cache_member) * nr_in_set);
2187
2188         if (cache_set_mi_update(c, new_mi, nr_in_set)) {
2189                 err = "cannot allocate memory";
2190                 ret = -ENOMEM;
2191                 goto err_unlock;
2192         }
2193
2194         /* commit new member info */
2195         swap(c->disk_mi, new_mi);
2196         kfree(new_mi);
2197         new_mi = NULL;
2198         c->disk_sb.nr_in_set = nr_in_set;
2199         c->sb.nr_in_set = nr_in_set;
2200
2201         err = cache_alloc(&sb, c, &ca);
2202         if (err)
2203                 goto err_unlock;
2204
2205         bcache_write_super(c);
2206
2207         err = "journal alloc failed";
2208         if (bch_cache_journal_alloc(ca))
2209                 goto err_put;
2210
2211         bch_notify_cache_added(ca);
2212
2213         if (ca->mi.state == CACHE_ACTIVE) {
2214                 err = __bch_cache_read_write(c, ca);
2215                 if (err)
2216                         goto err_put;
2217         }
2218
2219         kobject_put(&ca->kobj);
2220         mutex_unlock(&bch_register_lock);
2221         return 0;
2222 err_put:
2223         bch_cache_stop(ca);
2224 err_unlock:
2225         kfree(new_mi);
2226         free_super(&sb);
2227         mutex_unlock(&bch_register_lock);
2228
2229         bch_err(c, "Unable to add device: %s", err);
2230         return ret ?: -EINVAL;
2231 }
2232
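     /*
      * Register a cache set from an explicit list of devices: all the
      * superblocks are read and validated under bch_register_lock, and the
      * set is only started once every expected member device is present.
      */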
2233 const char *bch_register_cache_set(char * const *devices, unsigned nr_devices,
2234                                    struct cache_set_opts opts,
2235                                    struct cache_set **ret)
2236 {
2237         const char *err;
2238         struct cache_set *c = NULL;
2239         struct bcache_superblock *sb;
2240         uuid_le uuid;
2241         unsigned i;
2242
2243         memset(&uuid, 0, sizeof(uuid_le));
2244
2245         if (!nr_devices)
2246                 return "need at least one device";
2247
2248         if (!try_module_get(THIS_MODULE))
2249                 return "module unloading";
2250
2251         err = "cannot allocate memory";
2252         sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
2253         if (!sb)
2254                 goto err;
2255
2256         /*
2257          * read_super() needs to happen under register_lock, so that the
2258          * exclusive open is atomic with adding the new cache set to the list of
2259          * cache sets:
2260          */
2261         mutex_lock(&bch_register_lock);
2262
2263         for (i = 0; i < nr_devices; i++) {
2264                 err = read_super(&sb[i], devices[i]);
2265                 if (err)
2266                         goto err_unlock;
2267
2268                 err = "attempting to register backing device";
2269                 if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
2270                         goto err_unlock;
2271
2272                 err = validate_cache_super(&sb[i]);
2273                 if (err)
2274                         goto err_unlock;
2275         }
2276
2277         err = "cache set already registered";
2278         if (cache_set_lookup(sb->sb->set_uuid))
2279                 goto err_unlock;
2280
2281         err = "cannot allocate memory";
2282         c = bch_cache_set_alloc(sb[0].sb, opts);
2283         if (!c)
2284                 goto err_unlock;
2285
2286         for (i = 0; i < nr_devices; i++) {
2287                 err = cache_alloc(&sb[i], c, NULL);
2288                 if (err)
2289                         goto err_unlock;
2290         }
2291
2292         err = "insufficient devices";
2293         if (cache_set_nr_online_devices(c) != cache_set_nr_devices(c))
2294                 goto err_unlock;
2295
2296         err = run_cache_set(c);
2297         if (err)
2298                 goto err_unlock;
2299
2300         err = "error creating kobject";
2301         if (bch_cache_set_online(c))
2302                 goto err_unlock;
2303
2304         if (ret) {
2305                 closure_get(&c->cl);
2306                 *ret = c;
2307         }
2308
2309         mutex_unlock(&bch_register_lock);
2310
2311         err = NULL;
2312 out:
2313         kfree(sb);
2314         module_put(THIS_MODULE);
2315         return err;
2316 err_unlock:
2317         if (c)
2318                 bch_cache_set_stop(c);
2319         mutex_unlock(&bch_register_lock);
2320 err:
2321         for (i = 0; sb && i < nr_devices; i++)
2322                 free_super(&sb[i]);
2323         goto out;
2324 }
2325
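     /*
      * Register a single device given its path, dispatching on the
      * superblock version: backing devices go to
      * bch_backing_dev_register(), cache devices to register_cache().
      */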
2326 const char *bch_register_one(const char *path)
2327 {
2328         struct bcache_superblock sb;
2329         const char *err;
2330
2331         mutex_lock(&bch_register_lock);
2332
2333         err = read_super(&sb, path);
2334         if (err)
2335                 goto err;
2336
2337         if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
2338                 err = bch_backing_dev_register(&sb);
2339         else
2340                 err = register_cache(&sb, cache_set_opts_empty());
2341
2342         free_super(&sb);
2343 err:
2344         mutex_unlock(&bch_register_lock);
2345         return err;
2346 }
2347
2348 /* Global interfaces/init */
2349
2350 #define kobj_attribute_write(n, fn)                                     \
2351         static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
2352
2353 #define kobj_attribute_rw(n, show, store)                               \
2354         static struct kobj_attribute ksysfs_##n =                       \
2355                 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
2356
2357 static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
2358                                const char *, size_t);
2359
2360 kobj_attribute_write(register,          register_bcache);
2361 kobj_attribute_write(register_quiet,    register_bcache);
2362
2363 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2364                                const char *buffer, size_t size)
2365 {
2366         ssize_t ret = -EINVAL;
2367         const char *err = "cannot allocate memory";
2368         char *path = NULL;
2369
2370         if (!try_module_get(THIS_MODULE))
2371                 return -EBUSY;
2372
2373         if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
2374                 goto err;
2375
2376         err = bch_register_one(strim(path));
2377         if (err)
2378                 goto err;
2379
2380         ret = size;
2381 out:
2382         kfree(path);
2383         module_put(THIS_MODULE);
2384         return ret;
2385 err:
2386         pr_err("error opening %s: %s", path, err);
2387         goto out;
2388 }
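
     /*
      * Example (from userspace, assuming sysfs is mounted at /sys and a
      * hypothetical device /dev/sdb):
      *
      *   echo /dev/sdb > /sys/fs/bcache/register
      *
      * ends up in bch_register_one() above; register_quiet is handled by
      * the same store function.
      */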
2389
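     /*
      * On shutdown, halt or power off, force every cache set read only so
      * outstanding writes are flushed before the underlying block devices
      * go away.
      */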
2390 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2391 {
2392         if (code == SYS_DOWN ||
2393             code == SYS_HALT ||
2394             code == SYS_POWER_OFF) {
2395                 struct cache_set *c;
2396
2397                 mutex_lock(&bch_register_lock);
2398
2399                 if (!list_empty(&bch_cache_sets))
2400                         pr_info("Setting all devices read only:");
2401
2402                 list_for_each_entry(c, &bch_cache_sets, list)
2403                         bch_cache_set_read_only(c);
2404
2405                 list_for_each_entry(c, &bch_cache_sets, list)
2406                         bch_cache_set_read_only_sync(c);
2407
2408                 mutex_unlock(&bch_register_lock);
2409         }
2410
2411         return NOTIFY_DONE;
2412 }
2413
2414 static struct notifier_block reboot = {
2415         .notifier_call  = bcache_reboot,
2416         .priority       = INT_MAX, /* before any real devices */
2417 };
2418
2419 static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
2420                            const char *buffer, size_t size)
2421 {
2422         bcache_reboot(NULL, SYS_DOWN, NULL);
2423         return size;
2424 }
2425
2426 kobj_attribute_write(reboot,            reboot_test);
2427
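     /*
      * Tear down in roughly the reverse order of bcache_init(); this is
      * also used as bcache_init()'s error path, so each step must cope
      * with partially completed initialization.
      */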
2428 static void bcache_exit(void)
2429 {
2430         bch_debug_exit();
2431         bch_fs_exit();
2432         bch_blockdev_exit();
2433         if (bcache_kset)
2434                 kset_unregister(bcache_kset);
2435         if (bcache_io_wq)
2436                 destroy_workqueue(bcache_io_wq);
2437         if (!IS_ERR_OR_NULL(bch_chardev_class)) {
2438                 device_destroy(bch_chardev_class,
2439                                MKDEV(bch_chardev_major, 255));
2440                 class_destroy(bch_chardev_class);
2441         }
2442         if (bch_chardev_major > 0)
2443                 unregister_chrdev(bch_chardev_major, "bcache");
2444         if (!IS_ERR_OR_NULL(bch_sha1))
2445                 crypto_free_shash(bch_sha1);
2446         unregister_reboot_notifier(&reboot);
2447 }
2448
2449 static int __init bcache_init(void)
2450 {
2451         static const struct attribute *files[] = {
2452                 &ksysfs_register.attr,
2453                 &ksysfs_register_quiet.attr,
2454                 &ksysfs_reboot.attr,
2455                 NULL
2456         };
2457
2458         mutex_init(&bch_register_lock);
2459         register_reboot_notifier(&reboot);
2460         closure_debug_init();
2461         bkey_pack_test();
2462
2463         bch_sha1 = crypto_alloc_shash("sha1", 0, 0);
2464         if (IS_ERR(bch_sha1))
2465                 goto err;
2466
2467         bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops);
2468         if (bch_chardev_major < 0)
2469                 goto err;
2470
2471         bch_chardev_class = class_create(THIS_MODULE, "bcache");
2472         if (IS_ERR(bch_chardev_class))
2473                 goto err;
2474
2475         bch_chardev = device_create(bch_chardev_class, NULL,
2476                                     MKDEV(bch_chardev_major, 255),
2477                                     NULL, "bcache-ctl");
2478         if (IS_ERR(bch_chardev))
2479                 goto err;
2480
2481         if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
2482             !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
2483             sysfs_create_files(&bcache_kset->kobj, files) ||
2484             bch_blockdev_init() ||
2485             bch_fs_init() ||
2486             bch_debug_init())
2487                 goto err;
2488
2489         return 0;
2490 err:
2491         bcache_exit();
2492         return -ENOMEM;
2493 }
2494
2495 #define BCH_DEBUG_PARAM(name, description)                      \
2496         bool bch_##name;                                        \
2497         module_param_named(name, bch_##name, bool, 0644);       \
2498         MODULE_PARM_DESC(name, description);
2499 BCH_DEBUG_PARAMS()
2500 #undef BCH_DEBUG_PARAM
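     /*
      * BCH_DEBUG_PARAMS() is an X-macro list; for a hypothetical entry
      * BCH_DEBUG_PARAM(foo, "enable foo checks"), the block above expands
      * to a module parameter "foo" backed by a global bool bch_foo.
      */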
2501
2502 module_exit(bcache_exit);
2503 module_init(bcache_init);